From 9d1e7f43e15b869afc3fccd80c1545170cc84ea0 Mon Sep 17 00:00:00 2001 From: Paul Poulain Date: Thu, 31 Dec 2009 09:07:54 +0100 Subject: [PATCH] (bug #4020) XSLT unimarc display When using the XSLT display with UNIMARC, as_xml fails on some subfields when the data is true utf8, because marcflavour is not taken into account when encoding the data. Moreover, because transformMARCXML4XSLT edits some values in the MARC record and the Perl UTF8 flag is not handled by MARC::File::USMARC, the data ends up double-encoded. This patch fixes both issues. This patch - adds two functions in C4/Charset.pm NormalizeString (uses Unicode::Normalize) SetUTF8Flag (This function in my opinion belongs to MARC::Record, or at least MARC::File::USMARC) - edits C4::XSLT in order to cope with the correct marcflavour - edits C4::Search searchResults to use SetUTF8Flag --- C4/Charset.pm | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++ C4/Search.pm | 5 ++++ C4/XSLT.pm | 5 ++-- 3 files changed, 90 insertions(+), 2 deletions(-) diff --git a/C4/Charset.pm b/C4/Charset.pm index e1b6c96efa..e39637acf3 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -23,6 +23,7 @@ use warnings; use MARC::Charset qw/marc8_to_utf8/; use Text::Iconv; use C4::Debug; +use Unicode::Normalize; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); @@ -34,6 +35,7 @@ BEGIN { @EXPORT = qw( IsStringUTF8ish MarcToUTF8Record + SetUTF8Flag SetMarcUnicodeFlag StripNonXmlChars ); @@ -111,6 +113,86 @@ sub IsStringUTF8ish { return utf8::decode($str); } +=head2 SetUTF8Flag + +=over 4 + +SetUTF8Flag($marc_record); + +=back + +This function sets the Perl UTF8 flag on the record's data, modifying the record in place. +It is required when using new_from_usmarc +since MARC::File::USMARC does not handle the Perl UTF8 flag. +When editing fields and subfields of Unicode MARC records, you +would end up with double-encoded data without this function. + +FIXME +In my opinion, this function belongs to MARC::Record and not +to this package. 
+But since it handles charset, and MARC::Record, it finds its way in that package + +=cut + +sub SetUTF8Flag{ + my ($record)=@_; + return unless ($record && $record->fields()); + foreach my $field ($record->fields()){ + if ($field->tag()>=10){ + my @subfields; + foreach my $subfield ($field->subfields()){ + push @subfields,($$subfield[0],NormalizeString($$subfield[1])); + } + my $newfield=MARC::Field->new( + $field->tag(), + $field->indicator(1), + $field->indicator(2), + @subfields + ); + $field->replace_with($newfield); + } + } +} + +=head2 NormalizeString + +=over 4 + + my $normalized_string=NormalizeString($string); + +=back + Given + a string + nfc : despite its name, if true the string is normalized to NFD; otherwise (the default) to NFC + transform : if true, punctuation signs are replaced by spaces, isolated one-character words are removed and trailing whitespace is trimmed + Sets the Perl UTF8 flag on the data if need be + and applies cleaning if required + + Returns a utf8 string normalized to NFC (NFD when nfc is true) + + Sample code : + my $string=NormalizeString ("l'ornithoptère"); + #results in l'ornithoptère in NFC form with the UTF8 flag set +=cut + +sub NormalizeString{ + my ($string,$nfc,$transform)=@_; + utf8::decode($string) unless (utf8::is_utf8($string)); + if ($nfc){ + $string= NFD($string); + } + else { + $string=NFC($string); + } + if ($transform){ + $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g; + #removing one letter words "d'" "l'" was changed into "d " "l " + $string=~s/\b\S\b//g; + $string=~s/\s+$//g; + } + return $string; +} + =head2 MarcToUTF8Record =over 4 diff --git a/C4/Search.pm b/C4/Search.pm index 47ac7a36d0..e403672800 100644 --- a/C4/Search.pm +++ b/C4/Search.pm @@ -448,6 +448,7 @@ sub getRecords { # not an index scan else { $record = $results[ $i - 1 ]->record($j)->raw(); + warn $results[$i-1]->record($j)->render() ; # warn "RECORD $j:".$record; $results_hash->{'RECORDS'}[$j] = $record; @@ -1647,6 +1648,10 @@ sub searchResults { } # XSLT processing of some stuff + my $debug=1; + use C4::Charset; + SetUTF8Flag($marcrecord); + $debug && warn 
$marcrecord->as_formatted; if (C4::Context->preference("XSLTResultsDisplay") && !$scan) { $oldbiblio->{XSLTResultsRecord} = XSLTParse4Display( $oldbiblio->{biblionumber}, $marcrecord, 'Results' ); diff --git a/C4/XSLT.pm b/C4/XSLT.pm index 8a67e04090..1c7184a11d 100644 --- a/C4/XSLT.pm +++ b/C4/XSLT.pm @@ -124,8 +124,9 @@ sub XSLTParse4Display { my $record = transformMARCXML4XSLT($biblionumber, $orig_record); #return $record->as_formatted(); my $itemsxml = buildKohaItemsNamespace($biblionumber); - my $xmlrecord = $record->as_xml(); + my $xmlrecord = $record->as_xml(C4::Context->preference('marcflavour')); my $sysxml = "\n"; + warn $xmlrecord; foreach my $syspref ( qw/OPACURLOpenInNewWindow DisplayOPACiconsXSLT URLLinkText/ ) { $sysxml .= "" . C4::Context->preference( $syspref ) . @@ -137,7 +138,7 @@ sub XSLTParse4Display { my $parser = XML::LibXML->new(); # don't die when you find &, >, etc - $parser->recover_silently(1); + $parser->recover_silently(0); my $source = $parser->parse_string($xmlrecord); unless ( $stylesheet ) { my $xslt = XML::LibXSLT->new(); -- 2.39.5