From d5f5d844c5ed97fb94adabb1c94b93b0ad5dae23 Mon Sep 17 00:00:00 2001 From: Henri-Damien LAURENT Date: Wed, 27 Jan 2010 17:51:08 +0100 Subject: [PATCH] (bug #4020) XSLT unimarc display When using XSLT display with UNIMARC, marcflavour is not taken into account when encoding data, so when the data is true UTF-8, as_xml fails on some subfields. Moreover, because transformMARCXML4XSLT edits some values in the MARC record and the Perl UTF8 flag is not handled by MARC::File::USMARC, it ends up double-encoding the data. Sending a patch to fix both issues. This patch: - adds two functions in C4/Charset.pm: NormalizeString (uses Unicode::Normalize) and SetUTF8Flag (this function, in my opinion, belongs to MARC::Record, or at least MARC::File::USMARC) - edits C4::XSLT in order to cope with the correct marcflavour - edits C4::Search searchResults to use SetUTF8Flag Conflicts solved: C4/Charset.pm C4/Search.pm C4/XSLT.pm --- C4/Charset.pm | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++ C4/Search.pm | 4 ++- C4/XSLT.pm | 18 +++++++++-- 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/C4/Charset.pm b/C4/Charset.pm index b65627708d..839a9204de 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -20,6 +20,8 @@ package C4::Charset; use strict; use MARC::Charset qw/marc8_to_utf8/; use Text::Iconv; +use C4::Debug; +use Unicode::Normalize; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); @@ -31,6 +33,7 @@ BEGIN { @EXPORT = qw( IsStringUTF8ish MarcToUTF8Record + SetUTF8Flag SetMarcUnicodeFlag StripNonXmlChars ); @@ -108,6 +111,86 @@ sub IsStringUTF8ish { return utf8::decode($str); } +=head2 SetUTF8Flag + +=over 4 + +my $marc_record = SetUTF8Flag($marc_record); + +=back + +This function sets the PERL UTF8 flag for data. +It is required when using new_from_usmarc +since MARC::File::USMARC does not handle PERL UTF8 setting. +When editing unicode marc records fields and subfields, you +would end up in double encoding without using this function. 
+ +FIXME +In my opinion, this function belongs to MARC::Record and not +to this package. +But since it handles charset, and MARC::Record, it finds its way in that package + +=cut + +sub SetUTF8Flag{ + my ($record)=@_; + return unless ($record && $record->fields()); + foreach my $field ($record->fields()){ + if ($field->tag()>=10){ + my @subfields; + foreach my $subfield ($field->subfields()){ + push @subfields,($$subfield[0],NormalizeString($$subfield[1])); + } + my $newfield=MARC::Field->new( + $field->tag(), + $field->indicator(1), + $field->indicator(2), + @subfields + ); + $field->replace_with($newfield); + } + } +} + +=head2 NormalizeString + +=over 4 + + my $normalized_string=NormalizeString($string); + +=back + Given + a string + nfc : If you want to set NFC and not NFD + transform : If you expect all the signs to be removed + Sets the PERL UTF8 Flag on your initial data if need be + and applies cleaning if required + + Returns a utf8 NFD normalized string + + Sample code : + my $string=NormalizeString ("l'ornithoptère"); + #results into ornithoptère in NFD form and sets UTF8 Flag +=cut + +sub NormalizeString{ + my ($string,$nfc,$transform)=@_; + utf8::decode($string) unless (utf8::is_utf8($string)); + if ($nfc){ + $string= NFD($string); + } + else { + $string=NFC($string); + } + if ($transform){ + $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g; + #removing one letter words "d'" "l'" was changed into "d " "l " + $string=~s/\b\S\b//g; + $string=~s/\s+$//g; + } + return $string; +} + =head2 MarcToUTF8Record =over 4 diff --git a/C4/Search.pm b/C4/Search.pm index 39a111f4a5..3f4ee1563c 100644 --- a/C4/Search.pm +++ b/C4/Search.pm @@ -30,6 +30,7 @@ use C4::Branch; use C4::Debug; use YAML; use URI::Escape; +use C4::Charset; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $DEBUG); @@ -446,6 +447,7 @@ sub getRecords { # not an index scan else { $record = $results[ $i - 1 ]->record($j)->raw(); + warn 
$results[$i-1]->record($j)->render() ; # warn "RECORD $j:".$record; $results_hash->{'RECORDS'}[$j] = $record; @@ -1405,6 +1407,7 @@ sub searchResults { # loop through all of the records we've retrieved for ( my $i = $offset ; $i <= $times - 1 ; $i++ ) { my $marcrecord = MARC::File::USMARC::decode( $marcresults[$i] ); + SetUTF8Flag($marcrecord); my $biblionumber; if(not $scan){ @@ -1672,7 +1675,6 @@ sub searchResults { } # XSLT processing of some stuff - # FIXME : This needs some work in order to be more flexible : Can not use a result list for intranet different from OPAC if (C4::Context->preference("XSLTResultsDisplay") && !$scan) { $oldbiblio->{XSLTResultsRecord} = XSLTParse4Display( $oldbiblio->{biblionumber}, $marcrecord, C4::Context->preference("XSLTResultsDisplay") ); diff --git a/C4/XSLT.pm b/C4/XSLT.pm index cd53e313f1..780789d79d 100644 --- a/C4/XSLT.pm +++ b/C4/XSLT.pm @@ -138,11 +138,23 @@ sub XSLTParse4Display { my $record = transformMARCXML4XSLT($biblionumber, $orig_record); #return $record->as_formatted(); my $itemsxml = buildKohaItemsNamespace($biblionumber); - my $xmlrecord = $record->as_xml(); - $xmlrecord =~ s/\<\/record\>/$itemsxml\<\/record\>/; + my $xmlrecord = $record->as_xml(C4::Context->preference('marcflavour')); + my $sysxml = ""; + warn $xmlrecord; + foreach my $syspref ( qw/OPACURLOpenInNewWindow DisplayOPACiconsXSLT URLLinkText/ ) { + if (C4::Context->preference( $syspref ) ){ + $sysxml .= "" . + C4::Context->preference( $syspref ) . + "\n"; + } + } + $sysxml = "\n".$sysxml."\n" if length($sysxml); + $xmlrecord =~ s/\<\/record\>/$itemsxml$sysxml\<\/record\>/; + $xmlrecord =~ s/\& /\&\; /; + my $parser = XML::LibXML->new(); # don't die when you find &, >, etc - $parser->recover_silently(1); + $parser->recover_silently(0); my $source = $parser->parse_string($xmlrecord); unless ( $stylesheet->{$xslfilename} ) { my $xslt = XML::LibXSLT->new(); -- 2.39.5