Bug 34549: Strip non-XML chars during TransformHtmlToMarc

This patch strips non-XML characters from inputs during
TransformHtmlToMarc.

To test:
0. Apply patch
1. koha-plack --restart kohadev
2. Go to http://localhost:8081/cgi-bin/koha/cataloguing/addbiblio.pl
3. Fill out record and use the text from "Text file containing control characters"
as the title
4. Click Save
5. Note that your record displays without any warnings like the following:
Error: invalid data, cannot decode metadata object
parser error : PCDATA invalid Char value 27

Signed-off-by: David Nind <david@davidnind.com>

Signed-off-by: Marcel de Rooy <m.de.rooy@rijksmuseum.nl>
[EDIT] Squashed the tidy patch. Still needed a few spaces to satisfy qa tools.
Signed-off-by: Tomas Cohen Arazi <tomascohen@theke.io>
(cherry picked from commit 3e1d32f9ca)
Signed-off-by: Fridolin Somers <fridolin.somers@biblibre.com>
(cherry picked from commit 871d6eaa3f)
Signed-off-by: Matt Blenkinsop <matt.blenkinsop@ptfs-europe.com>
This commit is contained in:
David Cook 2023-08-17 04:28:29 +00:00 committed by Matt Blenkinsop
parent 42f123dae1
commit 34dbeb9c7c
2 changed files with 23 additions and 2 deletions

View file

@ -93,6 +93,7 @@ use C4::Charset qw(
nsb_clean
SetMarcUnicodeFlag
SetUTF8Flag
StripNonXmlChars
);
use C4::Languages;
use C4::Linker;
@ -2231,6 +2232,7 @@ sub TransformHtmlToMarc {
;
# between 001 and 009 (included)
} elsif ( $fval ne '' ) {
$fval = StripNonXmlChars($fval); #Strip out any non-XML characters like control characters
$newfield = MARC::Field->new( $tag, $fval, );
}
@ -2252,6 +2254,7 @@ sub TransformHtmlToMarc {
$newfield->add_subfields( $fkey => $fval);
}
elsif($fval ne '') {
$fval = StripNonXmlChars($fval); #Strip out any non-XML characters like control characters
$newfield = MARC::Field->new( $tag, $ind1, $ind2, $fkey => $fval );
}
$j += 2;

View file

@ -22,7 +22,7 @@ Koha::Caches->get_instance->clear_from_cache( "MarcSubfieldStructure-" );
( $biblionumbertagfield, $biblionumbertagsubfield ) = C4::Biblio::GetMarcFromKohaField( "biblio.biblionumber" );
subtest 'Biblio record' => sub {
plan tests => 17;
plan tests => 20;
my $leader = '00203nam a2200097 4500';
my $input = CGI->new;
$input->param( -name => 'biblionumber', -value => '42' );
@ -73,10 +73,16 @@ subtest 'Biblio record' => sub {
$input->param( -name => "tag_490_code_b_1123", -value => 'b' );
$input->param( -name => "tag_490_subfield_b_1123", -value => '490b' );
# A field (900) after 490
$input->param( -name => "tag_900_indicator1_1123", -value => "" );
$input->param( -name => "tag_900_indicator2_1123", -value => "" );
$input->param( -name => "tag_900_code_a_1123", -value => 'a' );
$input->param( -name => "tag_900_subfield_a_1123", -value => "This string has bad \x{1B}characters in it" );
my $record = C4::Biblio::TransformHtmlToMarc($input, 1);
my @all_fields = $record->fields;
is( @all_fields, 7, 'The record should have been created with 7 fields' );
is( @all_fields, 8, 'The record should have been created with 8 fields' );
# biblionumber + 2x010 + 100 + 200 + 390 + 490
my @fields_010 = $record->field('010');
is( @fields_010, 2, 'The record should have been created with 2 010' );
@ -92,6 +98,13 @@ subtest 'Biblio record' => sub {
is( @subfields_200_a, 2, 'The record should have been created with 2 200$a' );
is_deeply( \@subfields_200_a, [ 'first title', 'second title' ], 'The 2 titles should have been kept in the correct order' );
my @fields_900 = $record->field('900');
is( @fields_900, 1, 'The record should have been created with 1 900' );
is_deeply(
$fields_900[0]->subfields(), [ 'a', 'This string has bad characters in it' ],
'Field 900 had its non-XML characters stripped'
);
my @subfields_biblionumber = $record->subfield( $biblionumbertagfield, $biblionumbertagsubfield );
is( @subfields_biblionumber, 1, 'The record should contain only one biblionumber field' );
@ -105,6 +118,11 @@ subtest 'Biblio record' => sub {
is( $all_fields[4]->tag, '390', 'Fifth field is 390' );
is( $all_fields[5]->subfield('a'), 42, 'Sixth field contains bibnumber' );
is( $all_fields[6]->tag, '490', 'Last field is 490' );
my $new_record = eval { MARC::Record::new_from_xml( $record->as_xml(), 'UTF-8' ); };
my $record_error = $@;
ok( !$record_error, 'No errors parsing MARCXML generated by TransformHtmlToMarc' );
};
subtest 'Add authority record' => sub {