From 01d78e1ec71c1c227738215c5b5d4aaef847daaf Mon Sep 17 00:00:00 2001 From: Julian Maurice Date: Wed, 27 Oct 2021 12:00:23 +0200 Subject: [PATCH] Bug 29333: Fix encoding of imported UNIMARC authorities MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit MARC::Record and MARC::File::* modules sometimes use the position 09 of the leader to detect encoding. A blank character means 'MARC-8' while an 'a' means 'UTF-8'. In a UNIMARC authority this position is used to store the authority type (see https://www.transition-bibliographique.fr/wp-content/uploads/2021/02/AIntroLabel-2004.pdf [FR]). In this case, 'a' means 'Personal Name'. The result is that the import will succeed for a Personal Name authority, but it will fail for all other authority types. Steps to reproduce: 0. Be sure to have a Koha UNIMARC instance. 1. Download the MARCXML for "Honoré de Balzac" curl -o balzac.marcxml https://www.idref.fr/02670305X.xml 2. Verify that it's encoded in UTF-8 file balzac.marcxml (should output "balzac.marcxml: XML 1.0 document, UTF-8 Unicode text") 3. Go to Tools » Stage MARC for import and import balzac.marcxml with the following settings: Record type: Authority Character encoding: UTF-8 Format: MARCXML Do not touch the other settings 4. Once imported, go to the staged MARC management tool and find your batch. Click on the authority title "Balzac Honoré de 1799-1850" to show the MARC inside a modal window. There should be no encoding issue. 5. Write down the imported record id (the number in column '#') and go to the MARC authority editor. Replace all URL parameters by 'breedingid=THE_ID_YOU_WROTE_DOWN' The URL should look like this: /cgi-bin/koha/authorities/authorities.pl?breedingid=198 You should see no encoding issues. Do not save the record. 6. Import the batch into the catalog. Verify that the authority record has no encoding issue. 7. 
Now download the MARCXML for "Athènes (Grèce)" curl -o athènes.marcxml https://www.idref.fr/027290530.xml 8. Repeat steps 2 to 6 using athènes.marcxml file. At steps 4 and 5 you should see encoding issues and that the position 9 of the leader was rewritten from 'c' to 'a'. Strangely, importing this batch fixes the encoding issue, but we still lose the information in position 09 of the leader. This patch makes use of the MARCXML representation of the record instead of the ISO2709 representation, because, unlike MARC::Record::new_from_usmarc, MARC::Record::new_from_xml allows us to pass the encoding and the format directly, which prevents data from being double encoded when position 09 of the leader is different from 'a'. Test plan: - Follow the "steps to reproduce" above and verify that you have no encoding issues. Signed-off-by: David Nind Signed-off-by: Martin Renvoize Signed-off-by: Tomas Cohen Arazi --- C4/ImportBatch.pm | 31 -------- Koha/Import/Record.pm | 35 +++++++++ authorities/authorities.pl | 38 ++++------ catalogue/showmarc.pl | 21 ++++-- t/db_dependent/ImportBatch.t | 136 +++++++++++++++++++++++++---------- tools/showdiffmarc.pl | 6 +- 6 files changed, 164 insertions(+), 103 deletions(-) diff --git a/C4/ImportBatch.pm b/C4/ImportBatch.pm index cb0b6430d7..25b44d614b 100644 --- a/C4/ImportBatch.pm +++ b/C4/ImportBatch.pm @@ -48,8 +48,6 @@ BEGIN { GetZ3950BatchId GetWebserviceBatchId GetImportRecordMarc - GetImportRecordMarcXML - GetRecordFromImportBiblio AddImportBatch GetImportBatch AddAuthToBatch @@ -193,17 +191,6 @@ sub GetImportRecordMarc { return $marc, $encoding; } -sub GetRecordFromImportBiblio { - my ( $import_record_id, $embed_items ) = @_; - - my ($marc) = GetImportRecordMarc($import_record_id); - my $record = MARC::Record->new_from_usmarc($marc); - - EmbedItemsInImportBiblio( $record, $import_record_id ) if $embed_items; - - return $record; -} - sub EmbedItemsInImportBiblio { my ( $record, $import_record_id ) = @_; my ( $itemtag, $itemsubfield ) = 
GetMarcFromKohaField( "items.itemnumber" ); @@ -222,24 +209,6 @@ sub EmbedItemsInImportBiblio { return $record; } -=head2 GetImportRecordMarcXML - - my $marcxml = GetImportRecordMarcXML($import_record_id); - -=cut - -sub GetImportRecordMarcXML { - my ($import_record_id) = @_; - - my $dbh = C4::Context->dbh; - my $sth = $dbh->prepare("SELECT marcxml FROM import_records WHERE import_record_id = ?"); - $sth->execute($import_record_id); - my ($marcxml) = $sth->fetchrow(); - $sth->finish(); - return $marcxml; - -} - =head2 AddImportBatch my $batch_id = AddImportBatch($params_hash); diff --git a/Koha/Import/Record.pm b/Koha/Import/Record.pm index efd017152e..977139320e 100644 --- a/Koha/Import/Record.pm +++ b/Koha/Import/Record.pm @@ -18,7 +18,9 @@ package Koha::Import::Record; use Modern::Perl; use Carp; +use MARC::Record; +use C4::Context; use Koha::Database; use base qw(Koha::Object); @@ -29,6 +31,39 @@ Koha::Import::Record - Koha Import Record Object class =head1 API +=head2 Public methods + +=head3 get_marc_record + +Returns a MARC::Record object + + my $marc_record = $import_record->get_marc_record({ embed_items => $embed_items }) + +If $embed_items is true then items from import_items are embedded into the +MARC::Record returned + +=cut + +sub get_marc_record { + my ($self, $args) = @_; + + my $marcflavour = C4::Context->preference('marcflavour'); + + my $format = $marcflavour eq 'UNIMARC' ? 
'UNIMARC' : 'USMARC'; + if ($marcflavour eq 'UNIMARC' && $self->record_type eq 'auth') { + $format = 'UNIMARCAUTH'; + } + + my $record = MARC::Record->new_from_xml($self->marcxml, $self->encoding, $format); + + if ($self->record_type eq 'biblio' && $args->{embed_items}) { + require C4::ImportBatch; + C4::ImportBatch::EmbedItemsInImportBiblio($record, $self->id); + } + + return $record; +} + =head2 Internal methods =head3 _type diff --git a/authorities/authorities.pl b/authorities/authorities.pl index e4063d9d94..b465d4bcc0 100755 --- a/authorities/authorities.pl +++ b/authorities/authorities.pl @@ -24,13 +24,13 @@ use CGI qw ( -utf8 ); use C4::Auth qw( get_template_and_user ); use C4::Output qw( output_html_with_http_headers ); use C4::AuthoritiesMarc qw( AddAuthority ModAuthority GetAuthority GetTagsLabels GetAuthMARCFromKohaField FindDuplicateAuthority ); -use C4::ImportBatch qw( GetImportRecordMarc ); use C4::Context; use Date::Calc qw( Today ); use MARC::File::USMARC; use MARC::File::XML; use C4::Biblio qw( TransformHtmlToMarc ); use Koha::Authority::Types; +use Koha::Import::Records; use Koha::ItemTypes; use vars qw( $tagslib); use vars qw( $authorised_values_sth); @@ -48,21 +48,6 @@ builds list, depending on authorised value... 
=cut -sub MARCfindbreeding_auth { - my ( $id ) = @_; - my ($marc, $encoding) = GetImportRecordMarc($id); - if ($marc) { - my $record = MARC::Record->new_from_usmarc($marc); - if ( !defined(ref($record)) ) { - return -1; - } else { - return $record, $encoding; - } - } else { - return -1; - } -} - sub build_authorized_values_list { my ( $tag, $subfield, $value, $dbh, $authorised_values_sth,$index_tag,$index_subfield ) = @_; @@ -329,7 +314,7 @@ sub GetMandatoryFieldZ3950 { } sub build_tabs { - my ( $template, $record, $dbh, $encoding,$input ) = @_; + my ( $template, $record, $dbh, $input ) = @_; # fill arrays my @loop_data = (); @@ -365,7 +350,7 @@ sub build_tabs { # if MARC::Record is not empty =>use it as master loop, then add missing subfields that should be in the tab. # if MARC::Record is empty => use tab as master loop. - if ( $record != -1 && ( $record->field($tag) || $tag eq '000' ) ) { + if ( $record && ( $record->field($tag) || $tag eq '000' ) ) { my @fields; if ( $tag ne '000' ) { @fields = $record->field($tag); @@ -571,13 +556,14 @@ $template->param(nonav => $nonav,index=>$myindex,authtypecode=>$authtypecode,b $tagslib = GetTagsLabels(1,$authtypecode); $mandatory_z3950 = GetMandatoryFieldZ3950($authtypecode); -my $record=-1; -my $encoding=""; -if (($authid) && !($breedingid)){ - $record = GetAuthority($authid); -} +my $record; if ($breedingid) { - ( $record, $encoding ) = MARCfindbreeding_auth( $breedingid ); + my $import_record = Koha::Import::Records->find($breedingid); + if ($import_record) { + $record = $import_record->get_marc_record(); + } +} elsif ($authid) { + $record = GetAuthority($authid); } my ($oldauthnumtagfield,$oldauthnumtagsubfield); @@ -619,7 +605,7 @@ if ($op eq "add") { exit; } else { # it may be a duplicate, warn the user and do nothing - build_tabs($template, $record, $dbh, $encoding,$input); + build_tabs($template, $record, $dbh, $input); build_hidden_data; $template->param(authid =>$authid, duplicateauthid => $duplicateauthid, @@ 
-640,7 +626,7 @@ if ($op eq "duplicate") { $authid = ""; } - build_tabs ($template, $record, $dbh,$encoding,$input); + build_tabs ($template, $record, $dbh, $input); build_hidden_data; $template->param(oldauthtypetagfield=>$oldauthtypetagfield, oldauthtypetagsubfield=>$oldauthtypetagsubfield, oldauthnumtagfield=>$oldauthnumtagfield, oldauthnumtagsubfield=>$oldauthnumtagsubfield, diff --git a/catalogue/showmarc.pl b/catalogue/showmarc.pl index 8292bfe746..8a9f0e9bad 100755 --- a/catalogue/showmarc.pl +++ b/catalogue/showmarc.pl @@ -31,9 +31,10 @@ use C4::Context; use C4::Output qw( output_html_with_http_headers ); use C4::Auth qw( get_template_and_user ); use C4::Biblio qw( GetMarcBiblio GetXmlBiblio ); -use C4::ImportBatch qw( GetRecordFromImportBiblio ); use C4::XSLT; +use Koha::Import::Records; + my $input= CGI->new; my ( $template, $loggedinuser, $cookie ) = get_template_and_user( { @@ -48,9 +49,20 @@ my $biblionumber= $input->param('id'); my $importid= $input->param('importid'); my $view= $input->param('viewas')||''; +my $marcflavour = C4::Context->preference('marcflavour'); + my $record; +my $record_type = 'biblio'; +my $format = $marcflavour eq 'UNIMARC' ? 'UNIMARC' : 'USMARC'; if ($importid) { - $record = C4::ImportBatch::GetRecordFromImportBiblio( $importid, 'embed_items' ); + my $import_record = Koha::Import::Records->find($importid); + if ($import_record) { + if ($marcflavour eq 'UNIMARC' && $import_record->record_type eq 'auth') { + $format = 'UNIMARCAUTH'; + } + + $record = $import_record->get_marc_record({ embed_items => 1 }); + } } else { $record =GetMarcBiblio({ biblionumber => $biblionumber }); @@ -61,11 +73,10 @@ if(!ref $record) { } if($view eq 'card' || $view eq 'html') { - my $xml = $importid ? $record->as_xml(): GetXmlBiblio($biblionumber); + my $xml = $importid ? $record->as_xml($format): GetXmlBiblio($biblionumber); my $xsl; if ( $view eq 'card' ){ - $xsl = C4::Context->preference('marcflavour') eq 'UNIMARC' - ? 
'UNIMARC_compact.xsl' : 'compact.xsl'; + $xsl = $marcflavour eq 'UNIMARC' ? 'UNIMARC_compact.xsl' : 'compact.xsl'; } else { $xsl = 'plainMARC.xsl'; diff --git a/t/db_dependent/ImportBatch.t b/t/db_dependent/ImportBatch.t index 9a4ea5f8ce..0442d4d034 100755 --- a/t/db_dependent/ImportBatch.t +++ b/t/db_dependent/ImportBatch.t @@ -10,6 +10,7 @@ use t::lib::Mocks; use t::lib::TestBuilder; use Koha::Database; +use Koha::Import::Records; BEGIN { # Mock pluginsdir before loading Plugins module @@ -17,7 +18,7 @@ BEGIN { t::lib::Mocks::mock_config( 'pluginsdir', $path ); use_ok('Koha::Plugins'); - use_ok('C4::ImportBatch', qw( AddImportBatch GetImportBatch AddBiblioToBatch AddItemsToImportBiblio GetRecordFromImportBiblio SetMatchedBiblionumber GetImportBiblios GetItemNumbersFromImportBatch CleanBatch DeleteBatch RecordsFromMarcPlugin )); + use_ok('C4::ImportBatch', qw( AddImportBatch GetImportBatch AddBiblioToBatch AddItemsToImportBiblio SetMatchedBiblionumber GetImportBiblios GetItemNumbersFromImportBatch CleanBatch DeleteBatch RecordsFromMarcPlugin )); } # Start transaction @@ -93,47 +94,83 @@ my $original_record = MARC::Record->new; $record->leader('03174nam a2200445 a 4500'); $original_record->leader('03174nam a2200445 a 4500'); my ($item_tag, $item_subfield) = C4::Biblio::GetMarcFromKohaField( 'items.itemnumber' ); -my @fields = ( - MARC::Field->new( - 100, '1', ' ', - a => 'Knuth, Donald Ervin', - d => '1938', - ), - MARC::Field->new( - 245, '1', '4', - a => 'The art of computer programming', - c => 'Donald E. 
Knuth.', - ), - MARC::Field->new( - 650, ' ', '0', - a => 'Computer programming.', - 9 => '462', - ), - MARC::Field->new( - $item_tag, ' ', ' ', - e => 'my edition ❤', - i => 'my item part', - ), - MARC::Field->new( - $item_tag, ' ', ' ', - e => 'my edition 2', - i => 'my item part 2', - ), -); +my @fields; +if (C4::Context->preference('marcflavour') eq 'UNIMARC') { + @fields = ( + MARC::Field->new( + 100, ' ', ' ', + a => '20220520d u||y0frey50 ba', + ), + MARC::Field->new( + 700, ' ', ' ', + a => 'Knuth, Donald Ervin', + f => '1938', + ), + MARC::Field->new( + 200, ' ', ' ', + a => 'The art of computer programming', + f => 'Donald E. Knuth.', + ), + MARC::Field->new( + 650, ' ', '0', + a => 'Computer programming.', + 9 => '462', + ), + MARC::Field->new( + $item_tag, ' ', ' ', + e => 'my edition ❤', + i => 'my item part', + ), + MARC::Field->new( + $item_tag, ' ', ' ', + e => 'my edition 2', + i => 'my item part 2', + ), + ); +} else { + @fields = ( + MARC::Field->new( + 100, '1', ' ', + a => 'Knuth, Donald Ervin', + d => '1938', + ), + MARC::Field->new( + 245, '1', '4', + a => 'The art of computer programming', + c => 'Donald E. 
Knuth.', + ), + MARC::Field->new( + 650, ' ', '0', + a => 'Computer programming.', + 9 => '462', + ), + MARC::Field->new( + $item_tag, ' ', ' ', + e => 'my edition ❤', + i => 'my item part', + ), + MARC::Field->new( + $item_tag, ' ', ' ', + e => 'my edition 2', + i => 'my item part 2', + ), + ); +} $record->append_fields(@fields); $original_record->append_fields(@fields); my $import_record_id = AddBiblioToBatch( $id_import_batch1, 0, $record, 'utf8', int(rand(99999)), 0 ); AddItemsToImportBiblio( $id_import_batch1, $import_record_id, $record, 0 ); -my $record_from_import_biblio_with_items = C4::ImportBatch::GetRecordFromImportBiblio( $import_record_id, 'embed_items' ); +my $import_record = Koha::Import::Records->find($import_record_id); +my $record_from_import_biblio_with_items = $import_record->get_marc_record({ embed_items => 1 }); $original_record->leader($record_from_import_biblio_with_items->leader()); -is_deeply( $record_from_import_biblio_with_items, $original_record, 'GetRecordFromImportBiblio should return the record with items if specified' ); +is_deeply( $record_from_import_biblio_with_items, $original_record, 'Koha::Import::Record::get_marc_record should return the record with items if specified' ); my $utf8_field = $record_from_import_biblio_with_items->subfield($item_tag, 'e'); is($utf8_field, 'my edition ❤'); $original_record->delete_fields($original_record->field($item_tag)); #Remove items fields -my $record_from_import_biblio_without_items = C4::ImportBatch::GetRecordFromImportBiblio( $import_record_id ); +my $record_from_import_biblio_without_items = $import_record->get_marc_record(); $original_record->leader($record_from_import_biblio_without_items->leader()); -is_deeply( $record_from_import_biblio_without_items, $original_record, 'GetRecordFromImportBiblio should return the record without items by default' ); +is_deeply( $record_from_import_biblio_without_items, $original_record, 'Koha::Import::Record::get_marc_record should return the record 
without items by default' ); my $another_biblio = $builder->build_sample_biblio; C4::ImportBatch::SetMatchedBiblionumber( $import_record_id, $another_biblio->biblionumber ); @@ -175,7 +212,7 @@ my $id_import_batch3 = C4::ImportBatch::AddImportBatch($sample_import_batch3); # Test CleanBatch C4::ImportBatch::CleanBatch( $id_import_batch3 ); -my $import_record = get_import_record( $id_import_batch3 ); +$import_record = get_import_record( $id_import_batch3 ); is( $import_record, "0E0", "Batch 3 has been cleaned" ); # Test DeleteBatch @@ -221,7 +258,20 @@ subtest "RecordsFromMarcPlugin" => sub { # Create a test file my ( $fh, $name ) = tempfile(); - print $fh q| + if (C4::Context->preference('marcflavour') eq 'UNIMARC') { + print $fh q{ +003 = NLAmRIJ +100,a = 20220520d u||y0frey50 ba +700,a = Author +200,ind2 = 0 +200,a = Silence in the library +500 , a= Some note + +700,a = Another +245,a = Noise in the library}; + close $fh; + } else { + print $fh q| 003 = NLAmRIJ 100,a = Author 245,ind2 = 0 @@ -230,7 +280,8 @@ subtest "RecordsFromMarcPlugin" => sub { 100,a = Another 245,a = Noise in the library|; - close $fh; + close $fh; + } t::lib::Mocks::mock_config( 'enable_plugins', 1 ); @@ -241,10 +292,17 @@ subtest "RecordsFromMarcPlugin" => sub { my $records = C4::ImportBatch::RecordsFromMarcPlugin( $name, ref $plugin, 'UTF-8' ); is( @$records, 2, 'Two results returned' ); is( ref $records->[0], 'MARC::Record', 'Returned MARC::Record object' ); - is( $records->[0]->subfield('245', 'a'), 'Silence in the library', - 'Checked one field in first record' ); - is( $records->[1]->subfield('100', 'a'), 'Another', - 'Checked one field in second record' ); + if (C4::Context->preference('marcflavour') eq 'UNIMARC') { + is( $records->[0]->subfield('200', 'a'), 'Silence in the library', + 'Checked one field in first record' ); + is( $records->[1]->subfield('700', 'a'), 'Another', + 'Checked one field in second record' ); + } else { + is( $records->[0]->subfield('245', 'a'), 'Silence in 
the library', + 'Checked one field in first record' ); + is( $records->[1]->subfield('100', 'a'), 'Another', + 'Checked one field in second record' ); + } }; subtest "_get_commit_action" => sub { diff --git a/tools/showdiffmarc.pl b/tools/showdiffmarc.pl index 6864d7219d..4961369d2d 100755 --- a/tools/showdiffmarc.pl +++ b/tools/showdiffmarc.pl @@ -30,10 +30,11 @@ use C4::Output qw( output_html_with_http_headers ); use C4::Auth qw( get_template_and_user ); use C4::Biblio qw( GetMarcBiblio ); use C4::Auth qw( get_template_and_user ); -use C4::ImportBatch qw( GetRecordFromImportBiblio GetImportBiblios ); +use C4::ImportBatch qw( GetImportBiblios ); use C4::AuthoritiesMarc qw( GetAuthority ); use Koha::Biblios; +use Koha::Import::Records; # Input params my $input = CGI->new; @@ -79,7 +80,8 @@ if( $record ) { } if( $importid ) { - $recordImportid = C4::ImportBatch::GetRecordFromImportBiblio( $importid, 'embed_items' ); + my $import_record = Koha::Import::Records->find($importid); + my $recordImportid = $import_record->get_marc_record({ embed_items => 1 }); $formatted2 = $recordImportid->as_formatted; my $biblio = GetImportBiblios($importid); $importTitle = $biblio->[0]->{'title'}; -- 2.39.5