From cfea1725448518f12056bff63ec5624ab82d5afb Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Mon, 31 Mar 2008 11:57:18 -0500 Subject: [PATCH] work around issue in MARC::Charset Because of a bug in MARC::Charset 0.98, if a string to convert from MARC-8 to UTF-8 has (a) one or more diacritics that (b) are only in character positions 128 to 255 inclusive, the resulting converted string is not in UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). As a result, when such a record is converted to XML using ->as_xml_record(), the resulting XML can be truncated at the offending character. An example of such a record is one that has a price in British pounds in the 260$c but no other diacritics. Signed-off-by: Joshua Ferraro --- C4/Charset.pm | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/C4/Charset.pm b/C4/Charset.pm index 2e2e83153a..a676b7ccf3 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -419,6 +419,16 @@ sub _marc_marc8_to_utf8 { my @converted_subfields; foreach my $subfield ($field->subfields()) { my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]); + unless (IsStringUTF8ish($utf8sf)) { + # Because of a bug in MARC::Charset 0.98, if the string + # has (a) one or more diacritics that (b) are only in character positions + # 128 to 255 inclusive, the resulting converted string is not in + # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that + # occurs, upgrade the string in place. Moral of the story seems to be + # that pack("U", ...) is better than chr(...) if you need to guarantee + # that the resulting string is UTF-8. + utf8::upgrade($utf8sf); + } push @converted_subfields, $subfield->[0], $utf8sf; } -- 2.39.5