new batch job to stage a file of MARC biblios for import.

As part of this, modified two routines in C4::ImportBatch to support a callback for monitoring the progress of import processing. Signed-off-by: Chris Cormack <crc@liblime.com> Signed-off-by: Joshua Ferraro <jmf@liblime.com>
2007-11-06 16:11:33 -06:00 · 2007-11-06 16:11:33 -06:00 · 3fbd25602b
commit 3fbd25602b
parent 979282933f
2 changed files with 175 additions and 4 deletions
--- a/C4/ImportBatch.pm
+++ b/C4/ImportBatch.pm
@ -231,15 +231,33 @@ sub ModBiblioInBatch {
 ($batch_id, $num_records, $num_items, @invalid_records) = 
    BatchStageMarcRecords($marc_flavor, $marc_records, $file_name, 
                          $comments, $branch_code, $parse_items,
-                          $leave_as_staging);
+                          $leave_as_staging, 
+                          $progress_interval, $progress_callback);

 =back

 =cut

 sub  BatchStageMarcRecords {
-    my ($marc_flavor, $marc_records, $file_name, $comments, $branch_code, $parse_items, $leave_as_staging) = @_;
-
+    my $marc_flavor = shift;
+    my $marc_records = shift;
+    my $file_name = shift;
+    my $comments = shift;
+    my $branch_code = shift;
+    my $parse_items = shift;
+    my $leave_as_staging = shift;
+   
+    # optional callback to monitor status 
+    # of job
+    my $progress_interval = 0;
+    my $progress_callback = undef;
+    if ($#_ == 1) {
+        $progress_interval = shift;
+        $progress_callback = shift;
+        $progress_interval = 0 unless $progress_interval =~ /^\d+$/ and $progress_interval > 0;
+        $progress_interval = 0 unless 'CODE' eq ref $progress_callback;
+    } 
+    
    my $batch_id = AddImportBatch('create_new', 'staging', 'batch', $file_name, $comments);
    my @invalid_records = ();
    my $num_valid = 0;
@ -248,6 +266,9 @@ sub  BatchStageMarcRecords {
    my $rec_num = 0;
    foreach my $marc_blob (split(/\x1D/, $marc_records)) {
        $rec_num++;
+        if ($progress_interval and (0 == ($rec_num % $progress_interval))) {
+            &$progress_callback($rec_num);
+        }
        my $marc_record = FixEncoding($marc_blob, "\x1D");
        my $import_record_id;
        if (scalar($marc_record->fields()) == 0) {
@ -314,7 +335,7 @@ sub AddItemsToImportBiblio {

 =over 4

-my $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher, $max_matches);
+my $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher, $max_matches, $progress_interval, $progress_callback);

 =back

@ -326,6 +347,12 @@ of each record to "no_match" or "auto_match" as appropriate.
 The $max_matches parameter is optional; if it is not supplied,
 it defaults to 10.

+The $progress_interval and $progress_callback parameters are
+optional; if both are supplied, the sub referred to by
+$progress_callback will be invoked once every $progress_interval
+records, with the number of records processed so far as its
+only argument.
+
 =cut

 sub BatchFindBibDuplicates {
@ -333,6 +360,17 @@ sub BatchFindBibDuplicates {
    my $matcher = shift;
    my $max_matches = @_ ? shift : 10;

+    # optional callback to monitor status 
+    # of job
+    my $progress_interval = 0;
+    my $progress_callback = undef;
+    if ($#_ == 1) {
+        $progress_interval = shift;
+        $progress_callback = shift;
+        $progress_interval = 0 unless $progress_interval =~ /^\d+$/ and $progress_interval > 0;
+        $progress_interval = 0 unless 'CODE' eq ref $progress_callback;
+    }
+
    my $dbh = C4::Context->dbh;
    my $old_overlay_action = GetImportBatchOverlayAction($batch_id);
    if ($old_overlay_action eq "create_new") {
@ -345,7 +383,12 @@ sub BatchFindBibDuplicates {
                             WHERE import_batch_id = ?");
    $sth->execute($batch_id);
    my $num_with_matches = 0;
+    my $rec_num = 0;
    while (my $rowref = $sth->fetchrow_hashref) {
+        $rec_num++;
+        if ($progress_interval and (0 == ($rec_num % $progress_interval))) {
+            &$progress_callback($rec_num);
+        }
        my $marc_record = MARC::Record->new_from_usmarc($rowref->{'marc'});
        my @matches = $matcher->get_matches($marc_record, $max_matches);
        if (scalar(@matches) > 0) {
--- a/misc/stage_biblios_file.pl
+++ b/misc/stage_biblios_file.pl
@ -0,0 +1,128 @@
+#!/usr/bin/perl
+
+use strict;
+
+use C4::Context;
+use C4::ImportBatch;
+use C4::Matcher;
+use Getopt::Long;
+
+# Turn on autoflush for STDOUT so that progress messages show up as soon
+# as they are printed rather than when the buffer fills.
+$| = 1;
+
+# command-line parameters
+my $match_bibs = 0;
+my $add_items = 0;
+my $input_file = "";
+my $batch_comment = "";
+my $want_help = 0;
+
+my $result = GetOptions(
+    'file:s'        => \$input_file,
+    'match-bibs'    => \$match_bibs,
+    'add-items'     => \$add_items,
+    'comment:s'     => \$batch_comment,
+    'h|help'        => \$want_help
+);
+
+# An explicit request for help is not an error: show usage, exit cleanly.
+if ($want_help) {
+    print_usage();
+    exit 0;
+}
+
+# Unparseable options or a missing --file value are usage errors.  Show
+# the help text, but exit non-zero so that calling scripts and cron jobs
+# can tell that nothing was staged.
+if (not $result or $input_file eq "") {
+    print_usage();
+    exit 1;
+}
+
+# Fail early, before touching the database, if the file cannot be read.
+unless (-r $input_file) {
+    die "$0: cannot open input file $input_file: $!\n";
+}
+
+process_batch($input_file, $match_bibs, $add_items, $batch_comment);
+
+exit 0;
+
+# process_batch($input_file, $match_bibs, $add_items, $batch_comment)
+#
+# Read every record from $input_file, stage the records as a new import
+# batch via BatchStageMarcRecords, optionally look for matching records
+# via BatchFindBibDuplicates, and print a summary report to STDOUT.
+#
+#   $input_file    - path of the MARC file to read
+#   $match_bibs    - if true, run duplicate matching on the staged batch
+#   $add_items     - passed to BatchStageMarcRecords as its $parse_items
+#                    argument
+#   $batch_comment - comment stored with the batch
+#
+# Dies if the input file cannot be opened.
+sub process_batch {
+    my ($input_file, $match_bibs, $add_items, $batch_comment) = @_;
+
+    my $marc_records = "";
+    my $num_input_records = 0;
+    {
+        # Three-argument open with a lexical handle: the file name is never
+        # interpreted as part of the open mode, and the handle cannot
+        # collide with a bareword handle used elsewhere.
+        open my $in_fh, '<', $input_file
+            or die "$0: cannot open input file $input_file: $!\n";
+
+        # Read one record per iteration by using the record terminator
+        # (\035, i.e. \x1D) as the input separator.  The change is
+        # localized to this block so that reads and chomps performed
+        # later (e.g. inside the C4 modules) see the default separator.
+        local $/ = "\035";
+        while (my $record = <$in_fh>) {
+            $marc_records .= $record; # FIXME - this sort of string concatenation
+                                      # is probably rather inefficient
+            $num_input_records++;
+        }
+        close $in_fh;
+    }
+
+    my $marc_flavor = C4::Context->preference('marcflavour');
+
+    # Stage the records; print_progress is called every 100 records.
+    # The empty string is the branch code and the 0 is $leave_as_staging.
+    print "... staging MARC records -- please wait\n";
+    my ($batch_id, $num_valid, $num_items, @import_errors) = 
+        BatchStageMarcRecords($marc_flavor, $marc_records, $input_file, $batch_comment, '', $add_items, 0,
+                              100, \&print_progress);
+    print "... finished staging MARC records\n";
+
+    my $num_with_matches = 0;
+    if ($match_bibs) {
+        # NOTE(review): the matcher is hard-coded to a single matchpoint on
+        # 020$a / 'isbn' -- confirm the meaning of the remaining arguments
+        # against C4::Matcher.
+        my $matcher = C4::Matcher->new('biblio');
+        $matcher->add_matchpoint("020", "a", '', 'isbn', 1000);
+        print "... looking for matches with records already in database\n";
+        # 10 is $max_matches; progress is again reported every 100 records.
+        $num_with_matches = BatchFindBibDuplicates($batch_id, $matcher, 10, 100, \&print_progress);
+        print "... finished looking for matches\n";
+    }
+
+    my $num_invalid_bibs = scalar(@import_errors);
+    print <<_SUMMARY_;
+
+MARC record staging report
+------------------------------------
+Input file:              $input_file
+Number of input bibs:    $num_input_records
+Number of valid bibs:    $num_valid
+Number of invalid bibs:  $num_invalid_bibs
+_SUMMARY_
+    if ($match_bibs) {
+        print "Number of bibs matched:  $num_with_matches\n";
+    } else {
+        print "Incoming bibs not matched against existing bibs (--match-bibs option not supplied)\n";
+    }
+    if ($add_items) {
+        print "Number of items parsed:  $num_items\n";
+    } else {
+        print "No items parsed (--add-items option not supplied)\n";
+    }
+
+    print "\n";
+    print "Batch number assigned:  $batch_id\n";
+    print "\n";
+}
+
+# Progress callback handed to the batch routines: prints the running
+# count of processed records (its only argument) to STDOUT.
+sub print_progress {
+    my ($processed) = @_;
+    print "... processed $processed records\n";
+}
+
+# Print the command-line help text for this script to STDOUT.
+# The script name ($0) is interpolated into the first line.
+sub print_usage {
+    my $usage_text = <<_USAGE_;
+$0: stage MARC bib file into reservoir.
+
+Use this batch job to load a file of MARC bibliographic records
+(with optional item information) into the Koha reservoir.
+
+After running this program to stage your file, you can use
+either the batch job commit_biblios_file.pl or the Koha
+Tools option "Manage Staged MARC Records" to load the
+records into the main Koha database.
+
+Parameters:
+    --file <file_name>      name of input MARC bib file
+    --match-bibs            use this option to match bibs
+                            in the file with bibs already in 
+                            the database for future overlay.
+    --add-items             use this option to specify that
+                            item data is embedded in the MARC
+                            bibs and should be parsed.
+    --comment <comment>     optional comment to describe
+                            the record batch; if the comment
+                            has spaces in it, surround the
+                            comment with quotation marks.
+    --help or -h            show this message.
+_USAGE_
+    print $usage_text;
+}