From da51de184c1179fd725f0e341c9186b7f5cd585e Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Sat, 6 Jun 2009 16:45:54 -0500 Subject: [PATCH] bug 2926: fix staging import hang Fixes a hang of the staging import tool when it attempts to process a MARC21 record that claims that it's UTF-8 when it is not. The staging import will now attempt to fix the character encoding of such records. Also added a FIXME to bulkmarcimport.pl, which because of its use of MARC::Batch will skip over such records - better than the original hang of the staging import, but worse than the staging import's new ability to fix such records. Signed-off-by: Galen Charlton --- C4/Charset.pm | 18 +++++++++++++++++- misc/migration_tools/bulkmarcimport.pl | 6 ++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/C4/Charset.pm b/C4/Charset.pm index 001cf01001..5c5e7ce2dd 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -153,7 +153,23 @@ sub MarcToUTF8Record { $marc =~ s/^\s+//; $marc =~ s/\s+$//; $marc_blob_is_utf8 = IsStringUTF8ish($marc); - $marc_record = MARC::Record->new_from_usmarc($marc); + eval { + $marc_record = MARC::Record->new_from_usmarc($marc); + }; + if ($@) { + # if we fail the first time, one likely problem + # is that we have a MARC21 record that says that it's + # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters. + # We'll try parsing it again. + substr($marc, 9, 1) = ' '; + eval { + $marc_record = MARC::Record->new_from_usmarc($marc); + }; + if ($@) { + # it's hopeless; return an empty MARC::Record + return MARC::Record->new(), 'failed', ['could not parse MARC blob']; + } + } } # If we do not know the source encoding, try some guesses diff --git a/misc/migration_tools/bulkmarcimport.pl b/misc/migration_tools/bulkmarcimport.pl index 5fc8e4c807..2a48540130 100755 --- a/misc/migration_tools/bulkmarcimport.pl +++ b/misc/migration_tools/bulkmarcimport.pl @@ -164,6 +164,12 @@ RECORD: while ( ) { eval { $record = $batch->next() }; if ( $@ ) { print "Bad MARC record: skipped\n"; + # FIXME - because MARC::Batch->next() combines grabbing the next + # blob and parsing it into one operation, a correctable condition + # such as a MARC-8 record claiming that it's UTF-8 can't be recovered + # from because we don't have access to the original blob. Note + # that the staging import can deal with this condition (via + # C4::Charset::MarcToUTF8Record) because it doesn't use MARC::Batch. next; } last unless ( $record ); -- 2.39.5