From b549d7e1f1b7d518e16fa48af7360a38e8233fec Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Fri, 8 Feb 2008 16:35:18 -0600 Subject: [PATCH] added StripNonXmlChars to C4::Charset Added invocations of StripNonXmlChars to uses of new_from_xml() that involve records saved to Koha fields via MARC::Record->as_xml(); for batch jobs that work on MARC XML files coming from external sources, StripNonXmlChars should not necessarily be used, as it may be better to reject a file or record if it contains that kind of encoding error. Signed-off-by: Joshua Ferraro --- C4/AuthoritiesMarc.pm | 4 +++- C4/Biblio.pm | 10 +++------- C4/Charset.pm | 34 ++++++++++++++++++++++++++++++++++ C4/ImportBatch.pm | 6 +++--- C4/Items.pm | 3 ++- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/C4/AuthoritiesMarc.pm b/C4/AuthoritiesMarc.pm index 40a943ed10..c99d332c56 100644 --- a/C4/AuthoritiesMarc.pm +++ b/C4/AuthoritiesMarc.pm @@ -24,6 +24,7 @@ use C4::Biblio; use C4::Search; use C4::AuthoritiesMarc::MARC21; use C4::AuthoritiesMarc::UNIMARC; +use C4::Charset; use vars qw($VERSION @ISA @EXPORT); @@ -675,7 +676,8 @@ sub GetAuthority { my $sth=$dbh->prepare("select authtypecode, marcxml from auth_header where authid=?"); $sth->execute($authid); my ($authtypecode, $marcxml) = $sth->fetchrow; - my $record=MARC::Record->new_from_xml($marcxml,'UTF-8',(C4::Context->preference("marcflavour") eq "UNIMARC"?"UNIMARCAUTH":C4::Context->preference("marcflavour"))); + my $record=MARC::Record->new_from_xml(StripNonXmlChars($marcxml),'UTF-8', + (C4::Context->preference("marcflavour") eq "UNIMARC"?"UNIMARCAUTH":C4::Context->preference("marcflavour"))); $record->encoding('UTF-8'); if (C4::Context->preference("marcflavour") eq "MARC21") { my ($auth_type_tag, $auth_type_subfield) = get_auth_type_location($authtypecode); diff --git a/C4/Biblio.pm b/C4/Biblio.pm index 1243140c7b..ef76847784 100755 --- a/C4/Biblio.pm +++ b/C4/Biblio.pm @@ -30,6 +30,7 @@ use C4::Branch; use C4::Dates qw/format_date/; use C4::Log; # logaction use C4::ClassSource; +use C4::Charset; use vars qw($VERSION @ISA @EXPORT); @@ -821,14 +822,9 @@ sub GetMarcBiblio { my $sth = $dbh->prepare("SELECT marcxml FROM biblioitems WHERE biblionumber=? "); $sth->execute($biblionumber); - my ($marcxml) = $sth->fetchrow; + my $row = $sth->fetchrow_hashref; + my $marcxml = StripNonXmlChars($row->{'marcxml'}); MARC::File::XML->default_record_format(C4::Context->preference('marcflavour')); - $marcxml =~ s/\x1e//g; - $marcxml =~ s/\x1f//g; - $marcxml =~ s/\x1d//g; - $marcxml =~ s/\x0f//g; - $marcxml =~ s/\x0c//g; -# warn $marcxml; my $record = MARC::Record->new(); if ($marcxml) { $record = eval {MARC::Record::new_from_xml( $marcxml, "utf8", C4::Context->preference('marcflavour'))}; diff --git a/C4/Charset.pm b/C4/Charset.pm index 54a1322ca0..2e2e83153a 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -32,6 +32,7 @@ BEGIN { IsStringUTF8ish MarcToUTF8Record SetMarcUnicodeFlag + StripNonXmlChars ); } @@ -243,7 +244,40 @@ sub SetMarcUnicodeFlag { } } +=head2 StripNonXmlChars +=over 4 + +my $new_str = StripNonXmlChars($old_str); + +=back + +Given a string, return a copy with the +characters that are illegal in XML +removed. + +This function exists to work around a problem +that can occur with badly-encoded MARC records. +Specifically, if a UTF-8 MARC record also +has excape (\x1b) characters, MARC::File::XML +will let the escape characters pass through +when as_xml() or as_xml_record() is called. The +problem is that the escape character is not +legal in well-formed XML documents, so when +MARC::File::XML attempts to parse such a record, +the XML parser will fail. + +Stripping such characters will allow a +MARC::Record->new_from_xml() +to work, at the possible risk of some data loss. + +=cut + +sub StripNonXmlChars { + my $str = shift; + $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g; + return $str; +} =head1 INTERNAL FUNCTIONS diff --git a/C4/ImportBatch.pm b/C4/ImportBatch.pm index 942a402607..0c93ccd0f1 100644 --- a/C4/ImportBatch.pm +++ b/C4/ImportBatch.pm @@ -487,7 +487,7 @@ sub BatchCommitBibRecords { # remove item fields so that they don't get # added again if record is reverted - my $old_marc = MARC::Record->new_from_xml($oldxml, 'UTF-8', $rowref->{'encoding'}); + my $old_marc = MARC::Record->new_from_xml(StripNonXmlChars($oldxml), 'UTF-8', $rowref->{'encoding'}); foreach my $item_field ($old_marc->field($item_tag)) { $old_marc->delete_field($item_field); } @@ -536,7 +536,7 @@ sub BatchCommitItems { $sth->bind_param(1, $import_record_id); $sth->execute(); while (my $row = $sth->fetchrow_hashref()) { - my $item_marc = MARC::Record->new_from_xml($row->{'marcxml'}, 'UTF-8', $row->{'encoding'}); + my $item_marc = MARC::Record->new_from_xml(StripNonXmlChars($row->{'marcxml'}), 'UTF-8', $row->{'encoding'}); # FIXME - duplicate barcode check needs to become part of AddItemFromMarc() my $item = TransformMarcToKoha($dbh, $item_marc); my $duplicate_barcode = exists($item->{'barcode'}) && GetItemnumberFromBarcode($item->{'barcode'}); @@ -606,7 +606,7 @@ sub BatchRevertBibRecords { } } else { $num_reverted++; - my $old_record = MARC::Record->new_from_xml($rowref->{'marcxml_old'}, 'UTF-8', $rowref->{'encoding'}); + my $old_record = MARC::Record->new_from_xml(StripNonXmlChars($rowref->{'marcxml_old'}), 'UTF-8', $rowref->{'encoding'}); my $biblionumber = $rowref->{'matched_biblionumber'}; my ($count, $oldbiblio) = GetBiblio($biblionumber); $num_items_deleted += BatchRevertItems($rowref->{'import_record_id'}, $rowref->{'matched_biblionumber'}); diff --git a/C4/Items.pm b/C4/Items.pm index b4b11f97b2..fef96cb90b 100644 --- a/C4/Items.pm +++ b/C4/Items.pm @@ -28,6 +28,7 @@ use C4::ClassSource; use C4::Log; use C4::Branch; require C4::Reserves; +use C4::Charset; use vars qw($VERSION @ISA @EXPORT); @@ -2005,7 +2006,7 @@ sub _parse_unlinked_item_subfields_from_xml { my $xml = shift; return unless defined $xml and $xml ne ""; - my $marc = MARC::Record->new_from_xml($xml, 'UTF-8', C4::Context->preference("marcflavour")); + my $marc = MARC::Record->new_from_xml(StripNonXmlChars($xml), 'UTF-8', C4::Context->preference("marcflavour")); my $unlinked_subfields = []; my @fields = $marc->fields(); if ($#fields > -1) { -- 2.20.1