From 34dbeb9c7c8c0e373b1d164bf838f671b1c93c13 Mon Sep 17 00:00:00 2001 From: David Cook Date: Thu, 17 Aug 2023 04:28:29 +0000 Subject: [PATCH] Bug 34549: Strip non-XML chars during TransformHtmlToMarc This patch strips non-XML characters from inputs during TransformHtmlToMarc. To test: 0. Apply patch 1. koha-plack --restart kohadev 2. Go to http://localhost:8081/cgi-bin/koha/cataloguing/addbiblio.pl 3. Fill out record and use the text from "Text file containing control characters" as the title 4. Click Save 5. Note that your record displays without any warnings like the following: Error: invalid data, cannot decode metadata object parser error : PCDATA invalid Char value 27 Signed-off-by: David Nind Signed-off-by: Marcel de Rooy [EDIT] Squashed the tidy patch. Still needed a few spaces to satisfy qa tools. Signed-off-by: Tomas Cohen Arazi (cherry picked from commit 3e1d32f9caaab56acd8f4b338a859eb599955634) Signed-off-by: Fridolin Somers (cherry picked from commit 871d6eaa3fbb5eba14e17c3ebc6a46db708e5483) Signed-off-by: Matt Blenkinsop --- C4/Biblio.pm | 3 +++ t/db_dependent/Biblio/TransformHtmlToMarc.t | 22 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/C4/Biblio.pm b/C4/Biblio.pm index 03f0ff1d1f..491f77e951 100644 --- a/C4/Biblio.pm +++ b/C4/Biblio.pm @@ -93,6 +93,7 @@ use C4::Charset qw( nsb_clean SetMarcUnicodeFlag SetUTF8Flag + StripNonXmlChars ); use C4::Languages; use C4::Linker; @@ -2231,6 +2232,7 @@ sub TransformHtmlToMarc { ; # between 001 and 009 (included) } elsif ( $fval ne '' ) { + $fval = StripNonXmlChars($fval); #Strip out any non-XML characters like control characters $newfield = MARC::Field->new( $tag, $fval, ); } @@ -2252,6 +2254,7 @@ sub TransformHtmlToMarc { $newfield->add_subfields( $fkey => $fval); } elsif($fval ne '') { + $fval = StripNonXmlChars($fval); #Strip out any non-XML characters like control characters $newfield = MARC::Field->new( $tag, $ind1, $ind2, $fkey => $fval ); } $j += 2; diff --git a/t/db_dependent/Biblio/TransformHtmlToMarc.t b/t/db_dependent/Biblio/TransformHtmlToMarc.t index 708a764ff4..0d54748031 100755 --- a/t/db_dependent/Biblio/TransformHtmlToMarc.t +++ b/t/db_dependent/Biblio/TransformHtmlToMarc.t @@ -22,7 +22,7 @@ Koha::Caches->get_instance->clear_from_cache( "MarcSubfieldStructure-" ); ( $biblionumbertagfield, $biblionumbertagsubfield ) = C4::Biblio::GetMarcFromKohaField( "biblio.biblionumber" ); subtest 'Biblio record' => sub { - plan tests => 17; + plan tests => 20; my $leader = '00203nam a2200097 4500'; my $input = CGI->new; $input->param( -name => 'biblionumber', -value => '42' ); @@ -73,10 +73,16 @@ subtest 'Biblio record' => sub { $input->param( -name => "tag_490_code_b_1123", -value => 'b' ); $input->param( -name => "tag_490_subfield_b_1123", -value => '490b' ); + # A field (900) after 490 + $input->param( -name => "tag_900_indicator1_1123", -value => "" ); + $input->param( -name => "tag_900_indicator2_1123", -value => "" ); + $input->param( -name => "tag_900_code_a_1123", -value => 'a' ); + $input->param( -name => "tag_900_subfield_a_1123", -value => "This string has bad \x{1B}characters in it" ); + my $record = C4::Biblio::TransformHtmlToMarc($input, 1); my @all_fields = $record->fields; - is( @all_fields, 7, 'The record should have been created with 7 fields' ); + is( @all_fields, 8, 'The record should have been created with 8 fields' ); # biblionumber + 2x010 + 100 + 200 + 390 + 490 my @fields_010 = $record->field('010'); is( @fields_010, 2, 'The record should have been created with 2 010' ); @@ -92,6 +98,13 @@ subtest 'Biblio record' => sub { is( @subfields_200_a, 2, 'The record should have been created with 2 200$a' ); is_deeply( \@subfields_200_a, [ 'first title', 'second title' ], 'The 2 titles should have been kept in the correct order' ); + my @fields_900 = $record->field('900'); + is( @fields_900, 1, 'The record should have been created with 1 900' ); + is_deeply( + $fields_900[0]->subfields(), [ 'a', 'This string has bad characters in it' ], + 'Field 900 had its non-XML characters stripped' + ); + my @subfields_biblionumber = $record->subfield( $biblionumbertagfield, $biblionumbertagsubfield ); is( @subfields_biblionumber, 1, 'The record should contain only one biblionumber field' ); @@ -105,6 +118,11 @@ subtest 'Biblio record' => sub { is( $all_fields[4]->tag, '390', 'Fifth field is 390' ); is( $all_fields[5]->subfield('a'), 42, 'Sixth field contains bibnumber' ); is( $all_fields[6]->tag, '490', 'Last field is 490' ); + + my $new_record = eval { MARC::Record::new_from_xml( $record->as_xml(), 'UTF-8' ); }; + my $record_error = $@; + ok( !$record_error, 'No errors parsing MARCXML generated by TransformHtmlToMarc' ); + }; subtest 'Add authority record' => sub { -- 2.39.5