From 3fbd25602b4914867023c8d7a3f192e768aef29d Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Tue, 6 Nov 2007 16:11:33 -0600 Subject: [PATCH] new batch job to stage a file of MARC biblios for import. As part of this, modified two routines in C4::ImportBatch to support a callback for monitoring the progress of import processing. Signed-off-by: Chris Cormack Signed-off-by: Joshua Ferraro --- C4/ImportBatch.pm | 51 +++++++++++++-- misc/stage_biblios_file.pl | 128 +++++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 4 deletions(-) create mode 100755 misc/stage_biblios_file.pl diff --git a/C4/ImportBatch.pm b/C4/ImportBatch.pm index 59c328494d..e17bc6558e 100644 --- a/C4/ImportBatch.pm +++ b/C4/ImportBatch.pm @@ -231,15 +231,33 @@ sub ModBiblioInBatch { ($batch_id, $num_records, $num_items, @invalid_records) = BatchStageMarcRecords($marc_flavor, $marc_records, $file_name, $comments, $branch_code, $parse_items, - $leave_as_staging); + $leave_as_staging, + $progress_interval, $progress_callback); =back =cut sub BatchStageMarcRecords { - my ($marc_flavor, $marc_records, $file_name, $comments, $branch_code, $parse_items, $leave_as_staging) = @_; - + my $marc_flavor = shift; + my $marc_records = shift; + my $file_name = shift; + my $comments = shift; + my $branch_code = shift; + my $parse_items = shift; + my $leave_as_staging = shift; + + # optional callback to monitor status + # of job + my $progress_interval = 0; + my $progress_callback = undef; + if ($#_ == 1) { + $progress_interval = shift; + $progress_callback = shift; + $progress_interval = 0 unless $progress_interval =~ /^\d+$/ and $progress_interval > 0; + $progress_interval = 0 unless 'CODE' eq ref $progress_callback; + } + my $batch_id = AddImportBatch('create_new', 'staging', 'batch', $file_name, $comments); my @invalid_records = (); my $num_valid = 0; @@ -248,6 +266,9 @@ sub BatchStageMarcRecords { my $rec_num = 0; foreach my $marc_blob (split(/\x1D/, $marc_records)) { $rec_num++; + if 
($progress_interval and (0 == ($rec_num % $progress_interval))) { + &$progress_callback($rec_num); + } my $marc_record = FixEncoding($marc_blob, "\x1D"); my $import_record_id; if (scalar($marc_record->fields()) == 0) { @@ -314,7 +335,7 @@ sub AddItemsToImportBiblio { =over 4 -my $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher, $max_matches); +my $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher, $max_matches, $progress_interval, $progress_callback); =back @@ -326,6 +347,12 @@ of each record to "no_match" or "auto_match" as appropriate. The $max_matches parameter is optional; if it is not supplied, it defaults to 10. +The $progress_interval and $progress_callback parameters are +optional; if both are supplied, the sub referred to by +$progress_callback will be invoked every $progress_interval +records, with the number of records processed so far as +its only argument. + =cut sub BatchFindBibDuplicates { @@ -333,6 +360,17 @@ sub BatchFindBibDuplicates { my $matcher = shift; my $max_matches = @_ ? 
shift : 10; + # optional callback to monitor status + # of job + my $progress_interval = 0; + my $progress_callback = undef; + if ($#_ == 1) { + $progress_interval = shift; + $progress_callback = shift; + $progress_interval = 0 unless $progress_interval =~ /^\d+$/ and $progress_interval > 0; + $progress_interval = 0 unless 'CODE' eq ref $progress_callback; + } + my $dbh = C4::Context->dbh; my $old_overlay_action = GetImportBatchOverlayAction($batch_id); if ($old_overlay_action eq "create_new") { @@ -345,7 +383,12 @@ sub BatchFindBibDuplicates { WHERE import_batch_id = ?"); $sth->execute($batch_id); my $num_with_matches = 0; + my $rec_num = 0; while (my $rowref = $sth->fetchrow_hashref) { + $rec_num++; + if ($progress_interval and (0 == ($rec_num % $progress_interval))) { + &$progress_callback($rec_num); + } my $marc_record = MARC::Record->new_from_usmarc($rowref->{'marc'}); my @matches = $matcher->get_matches($marc_record, $max_matches); if (scalar(@matches) > 0) { diff --git a/misc/stage_biblios_file.pl b/misc/stage_biblios_file.pl new file mode 100755 index 0000000000..0c70139545 --- /dev/null +++ b/misc/stage_biblios_file.pl @@ -0,0 +1,128 @@ +#!/usr/bin/perl + +use strict; + +use C4::Context; +use C4::ImportBatch; +use C4::Matcher; +use Getopt::Long; + +$| = 1; + +# command-line parameters +my $match_bibs = 0; +my $add_items = 0; +my $input_file = ""; +my $batch_comment = ""; +my $want_help = 0; + +my $result = GetOptions( + 'file:s' => \$input_file, + 'match-bibs' => \$match_bibs, + 'add-items' => \$add_items, + 'comment:s' => \$batch_comment, + 'h|help' => \$want_help +); + +if (not $result or $input_file eq "" or $want_help) { + print_usage(); + exit 0; +} + +unless (-r $input_file) { + die "$0: cannot open input file $input_file: $!\n"; +} + +process_batch($input_file, $match_bibs, $add_items, $batch_comment); + +exit 0; + +sub process_batch { + my ($input_file, $match_bibs, $add_items, $batch_comment) = @_; + + open IN, "<$input_file" or die "$0: cannot 
open input file $input_file: $!\n"; + my $marc_records = ""; + $/ = "\035"; + my $num_input_records = 0; + while (<IN>) { + $marc_records .= $_; # FIXME - this sort of string concatenation + # is probably rather inefficient + $num_input_records++; + } + close IN; + + my $marc_flavor = C4::Context->preference('marcflavour'); + + print "... staging MARC records -- please wait\n"; + my ($batch_id, $num_valid, $num_items, @import_errors) = + BatchStageMarcRecords($marc_flavor, $marc_records, $input_file, $batch_comment, '', $add_items, 0, + 100, \&print_progress); + print "... finished staging MARC records\n"; + + my $num_with_matches = 0; + if ($match_bibs) { + my $matcher = C4::Matcher->new('biblio'); + $matcher->add_matchpoint("020", "a", '', 'isbn', 1000); + print "... looking for matches with records already in database\n"; + $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher, 10, 100, \&print_progress); + print "... finished looking for matches\n"; + } + + my $num_invalid_bibs = scalar(@import_errors); + print <<_SUMMARY_; + +MARC record staging report +------------------------------------ +Input file: $input_file +Number of input bibs: $num_input_records +Number of valid bibs: $num_valid +Number of invalid bibs: $num_invalid_bibs +_SUMMARY_ + if ($match_bibs) { + print "Number of bibs matched: $num_with_matches\n"; + } else { + print "Incoming bibs not matched against existing bibs (--match-bibs option not supplied)\n"; + } + if ($add_items) { + print "Number of items parsed: $num_items\n"; + } else { + print "No items parsed (--add-items option not supplied)\n"; + } + + print "\n"; + print "Batch number assigned: $batch_id\n"; + print "\n"; +} + +sub print_progress { + my $recs = shift; + print "... processed $recs records\n"; +} + +sub print_usage { + print <<_USAGE_; +$0: stage MARC bib file into reservoir. + +Use this batch job to load a file of MARC bibliographic records +(with optional item information) into the Koha reservoir. 
+ +After running this program to stage your file, you can use +either the batch job commit_biblios_file.pl or the Koha +Tools option "Manage Staged MARC Records" to load the +records into the main Koha database. + +Parameters: + --file name of input MARC bib file + --match-bibs use this option to match bibs + in the file with bibs already in + the database for future overlay. + --add-items use this option to specify that + item data is embedded in the MARC + bibs and should be parsed. + --comment optional comment to describe + the record batch; if the comment + has spaces in it, surround the + comment with quotation marks. + --help or -h show this message. +_USAGE_ +} -- 2.39.2