C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Context;
  26 use C4::Debug;
  27 use Unicode::Normalize;
  28
  29 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  30
  31 BEGIN {
  32     # set the version for version checking
  33     $VERSION = 3.07.00.049;
  34     require Exporter;
  35     @ISA    = qw(Exporter);
  36     @EXPORT = qw(
  37         NormalizeString
  38         IsStringUTF8ish
  39         MarcToUTF8Record
  40         SetUTF8Flag
  41         SetMarcUnicodeFlag
  42         StripNonXmlChars
  43         nsb_clean
  44     );
  45 }
  46
  47 =head1 NAME
  48
  49 C4::Charset - utilities for handling character set conversions.
  50
  51 =head1 SYNOPSIS
  52
  53   use C4::Charset;
  54
  55 =head1 DESCRIPTION
  56
  57 This module contains routines for dealing with character set
  58 conversions, particularly for MARC records.
  59
  60 A variety of character encodings are in use by various MARC
  61 standards, and even more character encodings are used by
  62 non-standard MARC records.  The various MARC formats generally
  63 do not do a good job of advertising a given record's character
  64 encoding, and even when a record does advertise its encoding,
  65 e.g., via the Leader/09, experience has shown that one cannot
  66 trust it.
  67
  68 Ultimately, all MARC records are stored in Koha in UTF-8 and
  69 must be converted from whatever the source character encoding is.
  70 The goal of this module is to ensure that these conversions
  71 take place accurately.  When a character conversion cannot take
  72 place, or at least not accurately, the module was provide
  73 enough information to allow user-facing code to inform the user
  74 on how to deal with the situation.
  75
  76 =cut
  77
  78 =head1 FUNCTIONS
  79
  80 =head2 IsStringUTF8ish
  81
  82   my $is_utf8 = IsStringUTF8ish($str);
  83
  84 Determines if C<$str> is valid UTF-8.  This can mean
  85 one of two things:
  86
  87 =over
  88
  89 =item *
  90
  91 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  92
  93 =item *
  94
  95 The Perl UTF-8 flag is B<not> set, but the octets contain
  96 valid UTF-8.
  97
  98 =back
  99
 100 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 101 because in one could be presented with a MARC blob that is
 102 not actually in UTF-8 but whose sequence of octets appears to be
 103 valid UTF-8.  The rest of the MARC character conversion functions
 104 will assume that this situation occur does not very often.
 105
 106 =cut
 107
 108 sub IsStringUTF8ish {
 109     my $str = shift;
 110
 111     return 1 if utf8::is_utf8($str);
 112     return utf8::decode($str);
 113 }
 114
 115 =head2 SetUTF8Flag
 116
 117   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 118
 119 This function sets the PERL UTF8 flag for data.
 120 It is required when using new_from_usmarc
 121 since MARC::File::USMARC does not handle PERL UTF8 setting.
 122 When editing unicode marc records fields and subfields, you
 123 would end up in double encoding without using this function.
 124
 125 If $nfd is set, string normalization will use NFD instead of NFC
 126
 127 FIXME
 128 In my opinion, this function belongs to MARC::Record and not
 129 to this package.
 130 But since it handles charset, and MARC::Record, it finds its way in that package
 131
 132 =cut
 133
 134 sub SetUTF8Flag{
 135     my ($record, $nfd)=@_;
 136     return unless ($record && $record->fields());
 137     foreach my $field ($record->fields()){
 138         if ($field->tag()>=10){
 139             my @subfields;
 140             foreach my $subfield ($field->subfields()){
 141                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 142             }
 143             eval {
 144                 my $newfield=MARC::Field->new(
 145                             $field->tag(),
 146                             $field->indicator(1),
 147                             $field->indicator(2),
 148                             @subfields
 149                         );
 150                 $field->replace_with($newfield);
 151             };
 152             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 153         }
 154     }
 155 }
 156
 157 =head2 NormalizeString
 158
 159     my $normalized_string=NormalizeString($string,$nfd,$transform);
 160
 161 Given a string
 162 nfd : If you want to set NFD and not NFC
 163 transform : If you expect all the signs to be removed
 164
 165 Sets the PERL UTF8 Flag on your initial data if need be
 166 and applies cleaning if required
 167
 168 Returns a utf8 NFC normalized string
 169
 170 Sample code :
 171    my $string=NormalizeString ("l'ornithoptère");
 172    #results into ornithoptère in NFC form and sets UTF8 Flag
 173
 174 =cut
 175
 176
 177 sub NormalizeString{
 178         my ($string,$nfd,$transform)=@_;
 179         utf8::decode($string) unless (utf8::is_utf8($string));
 180         if ($nfd){
 181                 $string= NFD($string);
 182         }
 183         else {
 184                 $string=NFC($string);
 185         }
 186         if ($transform){
 187     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 188         #removing one letter words "d'" "l'"  was changed into "d " "l "
 189     $string=~s/\b\S\b//g;
 190     $string=~s/\s+$//g;
 191         }
 192     return $string;
 193 }
 194
 195 =head2 MarcToUTF8Record
 196
 197   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 198                                         $marc_flavour, [, $source_encoding]);
 199
 200 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 201 optional source encoding, return a C<MARC::Record> that is
 202 converted to UTF-8.
 203
 204 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 205 is not guaranteed to have been converted correctly.  Specifically,
 206 if C<$converted_from> is 'failed', the MARC record returned failed
 207 character conversion and had each of its non-ASCII octets changed
 208 to the Unicode replacement character.
 209
 210 If the source encoding was not specified, this routine will
 211 try to guess it; the character encoding used for a successful
 212 conversion is returned in C<$converted_from>.
 213
 214 =cut
 215
 216 sub MarcToUTF8Record {
 217     my $marc = shift;
 218     my $marc_flavour = shift;
 219     my $source_encoding = shift;
 220     my $marc_record;
 221     my $marc_blob_is_utf8 = 0;
 222     if (ref($marc) eq 'MARC::Record') {
 223         my $marc_blob = $marc->as_usmarc();
 224         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 225         $marc_record = $marc;
 226     } else {
 227         # dealing with a MARC blob
 228
 229         # remove any ersatz whitespace from the beginning and
 230         # end of the MARC blob -- these can creep into MARC
 231         # files produced by several sources -- caller really
 232         # should be doing this, however
 233         $marc =~ s/^\s+//;
 234         $marc =~ s/\s+$//;
 235         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 236         eval {
 237             $marc_record = MARC::Record->new_from_usmarc($marc);
 238         };
 239         if ($@) {
 240             # if we fail the first time, one likely problem
 241             # is that we have a MARC21 record that says that it's
 242             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 243             # We'll try parsing it again.
 244             substr($marc, 9, 1) = ' ';
 245             eval {
 246                 $marc_record = MARC::Record->new_from_usmarc($marc);
 247             };
 248             if ($@) {
 249                 # it's hopeless; return an empty MARC::Record
 250                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 251             }
 252         }
 253     }
 254
 255     # If we do not know the source encoding, try some guesses
 256     # as follows:
 257     #   1. Record is UTF-8 already.
 258     #   2. If MARC flavor is MARC21 or NORMARC, then
 259     #      a. record is MARC-8
 260     #      b. record is ISO-8859-1
 261     #   3. If MARC flavor is UNIMARC, then
 262     if (not defined $source_encoding) {
 263         if ($marc_blob_is_utf8) {
 264             # note that for MARC21/NORMARC we are not bothering to check
 265             # if the Leader/09 is set to 'a' or not -- because
 266             # of problems with various ILSs (including Koha in the
 267             # past, alas), this just is not trustworthy.
 268             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 269             return $marc_record, 'UTF-8', [];
 270         } else {
 271             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 272                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 273             } elsif ($marc_flavour =~/UNIMARC/) {
 274                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 275             } else {
 276                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 277             }
 278         }
 279     } else {
 280         # caller knows the character encoding
 281         my $original_marc_record = $marc_record->clone();
 282         my @errors;
 283         if ($source_encoding =~ /utf-?8/i) {
 284             if ($marc_blob_is_utf8) {
 285                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 286                 return $marc_record, 'UTF-8', [];
 287             } else {
 288                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 289             }
 290         } elsif ($source_encoding =~ /marc-?8/i) {
 291             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 292         } elsif ($source_encoding =~ /5426/) {
 293             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 294         } else {
 295             # assume any other character encoding is for Text::Iconv
 296             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 297         }
 298
 299         if (@errors) {
 300             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 301             return $original_marc_record, 'failed', \@errors;
 302         } else {
 303             return $marc_record, $source_encoding, [];
 304         }
 305     }
 306
 307 }
 308
 309 =head2 SetMarcUnicodeFlag
 310
 311   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 312
 313 Set both the internal MARC::Record encoding flag
 314 and the appropriate Leader/09 (MARC21) or
 315 100/26-29 (UNIMARC) to indicate that the record
 316 is in UTF-8.  Note that this does B<not> do
 317 any actual character conversion.
 318
 319 =cut
 320
 321 sub SetMarcUnicodeFlag {
 322     my $marc_record = shift;
 323     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 324
 325     $marc_record->encoding('UTF-8');
 326     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 327         my $leader = $marc_record->leader();
 328         substr($leader, 9, 1) = 'a';
 329         $marc_record->leader($leader);
 330     } elsif ($marc_flavour =~/UNIMARC/) {
 331         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 332         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 333         my $string;
 334                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 335                 $string=$marc_record->subfield( 100, "a" );
 336         if (defined $string && length($string)==$subflength) {
 337                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 338         }
 339         else {
 340             $string = POSIX::strftime( "%Y%m%d", localtime );
 341             $string =~ s/\-//g;
 342             $string = sprintf( "%-*s", $subflength, $string );
 343             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 344         }
 345         substr( $string, $encodingposition, 3, "y50" );
 346         if ( $marc_record->subfield( 100, "a" ) ) {
 347                         $marc_record->field('100')->update(a=>$string);
 348                 }
 349                 else {
 350             $marc_record->insert_grouped_field(
 351                 MARC::Field->new( 100, '', '', "a" => $string ) );
 352         }
 353                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 354     } else {
 355         warn "Unrecognized marcflavour: $marc_flavour";
 356     }
 357 }
 358
 359 =head2 StripNonXmlChars
 360
 361   my $new_str = StripNonXmlChars($old_str);
 362
 363 Given a string, return a copy with the
 364 characters that are illegal in XML
 365 removed.
 366
 367 This function exists to work around a problem
 368 that can occur with badly-encoded MARC records.
 369 Specifically, if a UTF-8 MARC record also
 370 has excape (\x1b) characters, MARC::File::XML
 371 will let the escape characters pass through
 372 when as_xml() or as_xml_record() is called.  The
 373 problem is that the escape character is not
 374 legal in well-formed XML documents, so when
 375 MARC::File::XML attempts to parse such a record,
 376 the XML parser will fail.
 377
 378 Stripping such characters will allow a
 379 MARC::Record->new_from_xml()
 380 to work, at the possible risk of some data loss.
 381
 382 =cut
 383
 384 sub StripNonXmlChars {
 385     my $str = shift;
 386     if (!defined($str) || $str eq ""){
 387         return "";
 388     }
 389     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 390     return $str;
 391 }
 392
 393
 394
 395 =head2 nsb_clean
 396
 397 =over 4
 398
 399 nsb_clean($string);
 400
 401 =back
 402
 403 Removes Non Sorting Block characters
 404
 405 =cut
 406 sub nsb_clean {
 407     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 408     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 409     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 410     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 411     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 412
 413     # handles non sorting blocks
 414     my ($string) = @_ ;
 415     $_ = $string ;
 416     s/$NSB//g ;
 417     s/$NSE//g ;
 418     s/$NSB2//g ;
 419     s/$NSE2//g ;
 420     s/$C2//g ;
 421     $string = $_ ;
 422
 423     return($string) ;
 424 }
 425
 426
 427 =head1 INTERNAL FUNCTIONS
 428
 429 =head2 _default_marc21_charconv_to_utf8
 430
 431   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 432
 433 Converts a C<MARC::Record> of unknown character set to UTF-8,
 434 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 435 to UTF-8, then a default conversion that replaces each non-ASCII
 436 character with the replacement character.
 437
 438 The C<$guessed_charset> return value contains the character set
 439 that resulted in a conversion to valid UTF-8; note that
 440 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 441 this is 'failed'.
 442
 443 =cut
 444
 445 sub _default_marc21_charconv_to_utf8 {
 446     my $marc_record = shift;
 447     my $marc_flavour = shift;
 448
 449     my $trial_marc8 = $marc_record->clone();
 450     my @all_errors = ();
 451     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 452     unless (@errors) {
 453         return $trial_marc8, 'MARC-8', [];
 454     }
 455     push @all_errors, @errors;
 456
 457     my $trial_8859_1 = $marc_record->clone();
 458     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 459     unless (@errors) {
 460         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 461                                                 # instead if we wanted to report details
 462                                                 # of the failed attempt at MARC-8 => UTF-8
 463     }
 464     push @all_errors, @errors;
 465
 466     my $default_converted = $marc_record->clone();
 467     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 468     return $default_converted, 'failed', \@all_errors;
 469 }
 470
 471 =head2 _default_unimarc_charconv_to_utf8
 472
 473   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 474
 475 Converts a C<MARC::Record> of unknown character set to UTF-8,
 476 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 477 to UTF-8, then a default conversion that replaces each non-ASCII
 478 character with the replacement character.
 479
 480 The C<$guessed_charset> return value contains the character set
 481 that resulted in a conversion to valid UTF-8; note that
 482 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 483 this is 'failed'.
 484
 485 =cut
 486
 487 sub _default_unimarc_charconv_to_utf8 {
 488     my $marc_record = shift;
 489     my $marc_flavour = shift;
 490
 491     my $trial_marc8 = $marc_record->clone();
 492     my @all_errors = ();
 493     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 494     unless (@errors) {
 495         return $trial_marc8, 'iso-5426';
 496     }
 497     push @all_errors, @errors;
 498
 499     my $trial_8859_1 = $marc_record->clone();
 500     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 501     unless (@errors) {
 502         return $trial_8859_1, 'iso-8859-1';
 503     }
 504     push @all_errors, @errors;
 505
 506     my $default_converted = $marc_record->clone();
 507     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 508     return $default_converted, 'failed', \@all_errors;
 509 }
 510
 511 =head2 _marc_marc8_to_utf8
 512
 513   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 514
 515 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 516 If the conversion fails for some reason, an
 517 appropriate messages will be placed in the returned
 518 C<@errors> array.
 519
 520 =cut
 521
 522 sub _marc_marc8_to_utf8 {
 523     my $marc_record = shift;
 524     my $marc_flavour = shift;
 525
 526     my $prev_ignore = MARC::Charset->ignore_errors();
 527     MARC::Charset->ignore_errors(1);
 528
 529     # trap warnings raised by MARC::Charset
 530     my @errors = ();
 531     local $SIG{__WARN__} = sub {
 532         my $msg = $_[0];
 533         if ($msg =~ /MARC.Charset/) {
 534             # FIXME - purpose of this regexp is to strip out the
 535             # line reference to MARC/Charset.pm, but as it
 536             # exists probably won't work quite on Windows --
 537             # some sort of minimal-bunch back-tracking RE
 538             # would be helpful here
 539             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 540             push @errors, $msg;
 541         } else {
 542             # if warning doesn't come from MARC::Charset, just
 543             # pass it on
 544             warn $msg;
 545         }
 546     };
 547
 548     foreach my $field ($marc_record->fields()) {
 549         if ($field->is_control_field()) {
 550             ; # do nothing -- control fields should not contain non-ASCII characters
 551         } else {
 552             my @converted_subfields;
 553             foreach my $subfield ($field->subfields()) {
 554                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 555                 unless (IsStringUTF8ish($utf8sf)) {
 556                     # Because of a bug in MARC::Charset 0.98, if the string
 557                     # has (a) one or more diacritics that (b) are only in character positions
 558                     # 128 to 255 inclusive, the resulting converted string is not in
 559                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 560                     # occurs, upgrade the string in place.  Moral of the story seems to be
 561                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 562                     # that the resulting string is UTF-8.
 563                     utf8::upgrade($utf8sf);
 564                 }
 565                 push @converted_subfields, $subfield->[0], $utf8sf;
 566             }
 567
 568             $field->replace_with(MARC::Field->new(
 569                 $field->tag(), $field->indicator(1), $field->indicator(2),
 570                 @converted_subfields)
 571             );
 572         }
 573     }
 574
 575     MARC::Charset->ignore_errors($prev_ignore);
 576
 577     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 578
 579     return @errors;
 580 }
 581
 582 =head2 _marc_iso5426_to_utf8
 583
 584   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 585
 586 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 587 If the conversion fails for some reason, an
 588 appropriate messages will be placed in the returned
 589 C<@errors> array.
 590
 591 FIXME - is ISO-5426 equivalent enough to MARC-8
 592 that C<MARC::Charset> can be used instead?
 593
 594 =cut
 595
 596 sub _marc_iso5426_to_utf8 {
 597     my $marc_record = shift;
 598     my $marc_flavour = shift;
 599
 600     my @errors = ();
 601
 602     foreach my $field ($marc_record->fields()) {
 603         if ($field->is_control_field()) {
 604             ; # do nothing -- control fields should not contain non-ASCII characters
 605         } else {
 606             my @converted_subfields;
 607             foreach my $subfield ($field->subfields()) {
 608                 my $utf8sf = char_decode5426($subfield->[1]);
 609                 push @converted_subfields, $subfield->[0], $utf8sf;
 610             }
 611
 612             $field->replace_with(MARC::Field->new(
 613                 $field->tag(), $field->indicator(1), $field->indicator(2),
 614                 @converted_subfields)
 615             );
 616         }
 617     }
 618
 619     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 620
 621     return @errors;
 622 }
 623
 624 =head2 _marc_to_utf8_via_text_iconv
 625
 626   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 627
 628 Convert a C<MARC::Record> to UTF-8 in-place using the
 629 C<Text::Iconv> CPAN module.  Any source encoding accepted
 630 by the user's iconv installation should work.  If
 631 the source encoding is not recognized on the user's
 632 server or the conversion fails for some reason,
 633 appropriate messages will be placed in the returned
 634 C<@errors> array.
 635
 636 =cut
 637
 638 sub _marc_to_utf8_via_text_iconv {
 639     my $marc_record = shift;
 640     my $marc_flavour = shift;
 641     my $source_encoding = shift;
 642
 643     my @errors = ();
 644     my $decoder;
 645     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 646     if ($@) {
 647         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 648         return @errors;
 649     }
 650
 651     my $prev_raise_error = Text::Iconv->raise_error();
 652     Text::Iconv->raise_error(1);
 653
 654     foreach my $field ($marc_record->fields()) {
 655         if ($field->is_control_field()) {
 656             ; # do nothing -- control fields should not contain non-ASCII characters
 657         } else {
 658             my @converted_subfields;
 659             foreach my $subfield ($field->subfields()) {
 660                 my $converted_value;
 661                 my $conversion_ok = 1;
 662                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 663                 if ($@) {
 664                     $conversion_ok = 0;
 665                     push @errors, $@;
 666                 } elsif (not defined $converted_value) {
 667                     $conversion_ok = 0;
 668                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 669                 }
 670
 671                 if ($conversion_ok) {
 672                     push @converted_subfields, $subfield->[0], $converted_value;
 673                 } else {
 674                     $converted_value = $subfield->[1];
 675                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 676                     push @converted_subfields, $subfield->[0], $converted_value;
 677                 }
 678             }
 679
 680             $field->replace_with(MARC::Field->new(
 681                 $field->tag(), $field->indicator(1), $field->indicator(2),
 682                 @converted_subfields)
 683             );
 684         }
 685     }
 686
 687     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 688     Text::Iconv->raise_error($prev_raise_error);
 689
 690     return @errors;
 691 }
 692
 693 =head2 _marc_to_utf8_replacement_char
 694
 695   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 696
 697 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 698 unsatisfactory method of replacing all non-ASCII (e.g.,
 699 where the eight bit is set) octet with the Unicode
 700 replacement character.  This is meant as a last-ditch
 701 method, and would be best used as part of a UI that
 702 lets a cataloguer pick various character conversions
 703 until he or she finds the right one.
 704
 705 =cut
 706
 707 sub _marc_to_utf8_replacement_char {
 708     my $marc_record = shift;
 709     my $marc_flavour = shift;
 710
 711     foreach my $field ($marc_record->fields()) {
 712         if ($field->is_control_field()) {
 713             ; # do nothing -- control fields should not contain non-ASCII characters
 714         } else {
 715             my @converted_subfields;
 716             foreach my $subfield ($field->subfields()) {
 717                 my $value = $subfield->[1];
 718                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 719                 push @converted_subfields, $subfield->[0], $value;
 720             }
 721
 722             $field->replace_with(MARC::Field->new(
 723                 $field->tag(), $field->indicator(1), $field->indicator(2),
 724                 @converted_subfields)
 725             );
 726         }
 727     }
 728
 729     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 730 }
 731
 732 =head2 char_decode5426
 733
 734   my $utf8string = char_decode5426($iso_5426_string);
 735
 736 Converts a string from ISO-5426 to UTF-8.
 737
 738 =cut
 739
 740
 741 my %chars;
 742 $chars{0xb0}=0x0101;#3/0ayn[ain]
 743 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 744 #$chars{0xb2}=0x00e0;#'à';
 745 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 746 #$chars{0xb3}=0x00e7;#'ç';
 747 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 748 # $chars{0xb4}='è';
 749 $chars{0xb4}=0x00e8;
 750 $chars{0xbd}=0x02b9;
 751 $chars{0xbe}=0x02ba;
 752 # $chars{0xb5}='é';
 753 $chars{0xb5}=0x00e9;
 754 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 755 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 756 $chars{0xfa}=0x0153; #oe
 757 $chars{0xea}=0x0152; #oe
 758 $chars{0x81d1}=0x00b0;
 759
 760 ####
 761 ## combined characters iso5426
 762
 763 $chars{0xc041}=0x1ea2; # capital a with hook above
 764 $chars{0xc045}=0x1eba; # capital e with hook above
 765 $chars{0xc049}=0x1ec8; # capital i with hook above
 766 $chars{0xc04f}=0x1ece; # capital o with hook above
 767 $chars{0xc055}=0x1ee6; # capital u with hook above
 768 $chars{0xc059}=0x1ef6; # capital y with hook above
 769 $chars{0xc061}=0x1ea3; # small a with hook above
 770 $chars{0xc065}=0x1ebb; # small e with hook above
 771 $chars{0xc069}=0x1ec9; # small i with hook above
 772 $chars{0xc06f}=0x1ecf; # small o with hook above
 773 $chars{0xc075}=0x1ee7; # small u with hook above
 774 $chars{0xc079}=0x1ef7; # small y with hook above
 775
 776         # 4/1 grave accent
 777 $chars{0xc141}=0x00c0; # capital a with grave accent
 778 $chars{0xc145}=0x00c8; # capital e with grave accent
 779 $chars{0xc149}=0x00cc; # capital i with grave accent
 780 $chars{0xc14f}=0x00d2; # capital o with grave accent
 781 $chars{0xc155}=0x00d9; # capital u with grave accent
 782 $chars{0xc157}=0x1e80; # capital w with grave
 783 $chars{0xc159}=0x1ef2; # capital y with grave
 784 $chars{0xc161}=0x00e0; # small a with grave accent
 785 $chars{0xc165}=0x00e8; # small e with grave accent
 786 $chars{0xc169}=0x00ec; # small i with grave accent
 787 $chars{0xc16f}=0x00f2; # small o with grave accent
 788 $chars{0xc175}=0x00f9; # small u with grave accent
 789 $chars{0xc177}=0x1e81; # small w with grave
 790 $chars{0xc179}=0x1ef3; # small y with grave
 791         # 4/2 acute accent
 792 $chars{0xc241}=0x00c1; # capital a with acute accent
 793 $chars{0xc243}=0x0106; # capital c with acute accent
 794 $chars{0xc245}=0x00c9; # capital e with acute accent
 795 $chars{0xc247}=0x01f4; # capital g with acute
 796 $chars{0xc249}=0x00cd; # capital i with acute accent
 797 $chars{0xc24b}=0x1e30; # capital k with acute
 798 $chars{0xc24c}=0x0139; # capital l with acute accent
 799 $chars{0xc24d}=0x1e3e; # capital m with acute
 800 $chars{0xc24e}=0x0143; # capital n with acute accent
 801 $chars{0xc24f}=0x00d3; # capital o with acute accent
 802 $chars{0xc250}=0x1e54; # capital p with acute
 803 $chars{0xc252}=0x0154; # capital r with acute accent
 804 $chars{0xc253}=0x015a; # capital s with acute accent
 805 $chars{0xc255}=0x00da; # capital u with acute accent
 806 $chars{0xc257}=0x1e82; # capital w with acute
 807 $chars{0xc259}=0x00dd; # capital y with acute accent
 808 $chars{0xc25a}=0x0179; # capital z with acute accent
 809 $chars{0xc261}=0x00e1; # small a with acute accent
 810 $chars{0xc263}=0x0107; # small c with acute accent
 811 $chars{0xc265}=0x00e9; # small e with acute accent
 812 $chars{0xc267}=0x01f5; # small g with acute
 813 $chars{0xc269}=0x00ed; # small i with acute accent
 814 $chars{0xc26b}=0x1e31; # small k with acute
 815 $chars{0xc26c}=0x013a; # small l with acute accent
 816 $chars{0xc26d}=0x1e3f; # small m with acute
 817 $chars{0xc26e}=0x0144; # small n with acute accent
 818 $chars{0xc26f}=0x00f3; # small o with acute accent
 819 $chars{0xc270}=0x1e55; # small p with acute
 820 $chars{0xc272}=0x0155; # small r with acute accent
 821 $chars{0xc273}=0x015b; # small s with acute accent
 822 $chars{0xc275}=0x00fa; # small u with acute accent
 823 $chars{0xc277}=0x1e83; # small w with acute
 824 $chars{0xc279}=0x00fd; # small y with acute accent
 825 $chars{0xc27a}=0x017a; # small z with acute accent
 826 $chars{0xc2e1}=0x01fc; # capital ae with acute
 827 $chars{0xc2f1}=0x01fd; # small ae with acute
 828        # 4/3 circumflex accent
 829 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 830 $chars{0xc343}=0x0108; # capital c with circumflex
 831 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 832 $chars{0xc347}=0x011c; # capital g with circumflex
 833 $chars{0xc348}=0x0124; # capital h with circumflex
 834 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 835 $chars{0xc34a}=0x0134; # capital j with circumflex
 836 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 837 $chars{0xc353}=0x015c; # capital s with circumflex
 838 $chars{0xc355}=0x00db; # capital u with circumflex
 839 $chars{0xc357}=0x0174; # capital w with circumflex
 840 $chars{0xc359}=0x0176; # capital y with circumflex
 841 $chars{0xc35a}=0x1e90; # capital z with circumflex
 842 $chars{0xc361}=0x00e2; # small a with circumflex accent
 843 $chars{0xc363}=0x0109; # small c with circumflex
 844 $chars{0xc365}=0x00ea; # small e with circumflex accent
 845 $chars{0xc367}=0x011d; # small g with circumflex
 846 $chars{0xc368}=0x0125; # small h with circumflex
 847 $chars{0xc369}=0x00ee; # small i with circumflex accent
 848 $chars{0xc36a}=0x0135; # small j with circumflex
 849 $chars{0xc36e}=0x00f1; # small n with tilde
 850 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 851 $chars{0xc373}=0x015d; # small s with circumflex
 852 $chars{0xc375}=0x00fb; # small u with circumflex
 853 $chars{0xc377}=0x0175; # small w with circumflex
 854 $chars{0xc379}=0x0177; # small y with circumflex
 855 $chars{0xc37a}=0x1e91; # small z with circumflex
 856         # 4/4 tilde
 857 $chars{0xc441}=0x00c3; # capital a with tilde
 858 $chars{0xc445}=0x1ebc; # capital e with tilde
 859 $chars{0xc449}=0x0128; # capital i with tilde
 860 $chars{0xc44e}=0x00d1; # capital n with tilde
 861 $chars{0xc44f}=0x00d5; # capital o with tilde
 862 $chars{0xc455}=0x0168; # capital u with tilde
 863 $chars{0xc456}=0x1e7c; # capital v with tilde
 864 $chars{0xc459}=0x1ef8; # capital y with tilde
 865 $chars{0xc461}=0x00e3; # small a with tilde
 866 $chars{0xc465}=0x1ebd; # small e with tilde
 867 $chars{0xc469}=0x0129; # small i with tilde
 868 $chars{0xc46e}=0x00f1; # small n with tilde
 869 $chars{0xc46f}=0x00f5; # small o with tilde
 870 $chars{0xc475}=0x0169; # small u with tilde
 871 $chars{0xc476}=0x1e7d; # small v with tilde
 872 $chars{0xc479}=0x1ef9; # small y with tilde
 873     # 4/5 macron
 874 $chars{0xc541}=0x0100; # capital a with macron
 875 $chars{0xc545}=0x0112; # capital e with macron
 876 $chars{0xc547}=0x1e20; # capital g with macron
 877 $chars{0xc549}=0x012a; # capital i with macron
 878 $chars{0xc54f}=0x014c; # capital o with macron
 879 $chars{0xc555}=0x016a; # capital u with macron
 880 $chars{0xc561}=0x0101; # small a with macron
 881 $chars{0xc565}=0x0113; # small e with macron
 882 $chars{0xc567}=0x1e21; # small g with macron
 883 $chars{0xc569}=0x012b; # small i with macron
 884 $chars{0xc56f}=0x014d; # small o with macron
 885 $chars{0xc575}=0x016b; # small u with macron
 886 $chars{0xc572}=0x0159; # small r with macron
 887 $chars{0xc5e1}=0x01e2; # capital ae with macron
 888 $chars{0xc5f1}=0x01e3; # small ae with macron
 889         # 4/6 breve
 890 $chars{0xc641}=0x0102; # capital a with breve
 891 $chars{0xc645}=0x0114; # capital e with breve
 892 $chars{0xc647}=0x011e; # capital g with breve
 893 $chars{0xc649}=0x012c; # capital i with breve
 894 $chars{0xc64f}=0x014e; # capital o with breve
 895 $chars{0xc655}=0x016c; # capital u with breve
 896 $chars{0xc661}=0x0103; # small a with breve
 897 $chars{0xc665}=0x0115; # small e with breve
 898 $chars{0xc667}=0x011f; # small g with breve
 899 $chars{0xc669}=0x012d; # small i with breve
 900 $chars{0xc66f}=0x014f; # small o with breve
 901 $chars{0xc675}=0x016d; # small u with breve
 902         # 4/7 dot above
 903 $chars{0xc7b0}=0x01e1; # Ain with dot above
 904 $chars{0xc742}=0x1e02; # capital b with dot above
 905 $chars{0xc743}=0x010a; # capital c with dot above
 906 $chars{0xc744}=0x1e0a; # capital d with dot above
 907 $chars{0xc745}=0x0116; # capital e with dot above
 908 $chars{0xc746}=0x1e1e; # capital f with dot above
 909 $chars{0xc747}=0x0120; # capital g with dot above
 910 $chars{0xc748}=0x1e22; # capital h with dot above
 911 $chars{0xc749}=0x0130; # capital i with dot above
 912 $chars{0xc74d}=0x1e40; # capital m with dot above
 913 $chars{0xc74e}=0x1e44; # capital n with dot above
 914 $chars{0xc750}=0x1e56; # capital p with dot above
 915 $chars{0xc752}=0x1e58; # capital r with dot above
 916 $chars{0xc753}=0x1e60; # capital s with dot above
 917 $chars{0xc754}=0x1e6a; # capital t with dot above
 918 $chars{0xc757}=0x1e86; # capital w with dot above
 919 $chars{0xc758}=0x1e8a; # capital x with dot above
 920 $chars{0xc759}=0x1e8e; # capital y with dot above
 921 $chars{0xc75a}=0x017b; # capital z with dot above
 922 $chars{0xc761}=0x0227; # small b with dot above
 923 $chars{0xc762}=0x1e03; # small b with dot above
 924 $chars{0xc763}=0x010b; # small c with dot above
 925 $chars{0xc764}=0x1e0b; # small d with dot above
 926 $chars{0xc765}=0x0117; # small e with dot above
 927 $chars{0xc766}=0x1e1f; # small f with dot above
 928 $chars{0xc767}=0x0121; # small g with dot above
 929 $chars{0xc768}=0x1e23; # small h with dot above
 930 $chars{0xc76d}=0x1e41; # small m with dot above
 931 $chars{0xc76e}=0x1e45; # small n with dot above
 932 $chars{0xc770}=0x1e57; # small p with dot above
 933 $chars{0xc772}=0x1e59; # small r with dot above
 934 $chars{0xc773}=0x1e61; # small s with dot above
 935 $chars{0xc774}=0x1e6b; # small t with dot above
 936 $chars{0xc777}=0x1e87; # small w with dot above
 937 $chars{0xc778}=0x1e8b; # small x with dot above
 938 $chars{0xc779}=0x1e8f; # small y with dot above
 939 $chars{0xc77a}=0x017c; # small z with dot above
 940         # 4/8 trema, diaresis
 941 $chars{0xc820}=0x00a8; # diaeresis
 942 $chars{0xc841}=0x00c4; # capital a with diaeresis
 943 $chars{0xc845}=0x00cb; # capital e with diaeresis
 944 $chars{0xc848}=0x1e26; # capital h with diaeresis
 945 $chars{0xc849}=0x00cf; # capital i with diaeresis
 946 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 947 $chars{0xc855}=0x00dc; # capital u with diaeresis
 948 $chars{0xc857}=0x1e84; # capital w with diaeresis
 949 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 950 $chars{0xc859}=0x0178; # capital y with diaeresis
 951 $chars{0xc861}=0x00e4; # small a with diaeresis
 952 $chars{0xc865}=0x00eb; # small e with diaeresis
 953 $chars{0xc868}=0x1e27; # small h with diaeresis
 954 $chars{0xc869}=0x00ef; # small i with diaeresis
 955 $chars{0xc86f}=0x00f6; # small o with diaeresis
 956 $chars{0xc874}=0x1e97; # small t with diaeresis
 957 $chars{0xc875}=0x00fc; # small u with diaeresis
 958 $chars{0xc877}=0x1e85; # small w with diaeresis
 959 $chars{0xc878}=0x1e8d; # small x with diaeresis
 960 $chars{0xc879}=0x00ff; # small y with diaeresis
 961         # 4/9 umlaut
 962 $chars{0xc920}=0x00a8; # [diaeresis]
 963 $chars{0xc961}=0x00e4; # a with umlaut
 964 $chars{0xc965}=0x00eb; # e with umlaut
 965 $chars{0xc969}=0x00ef; # i with umlaut
 966 $chars{0xc96f}=0x00f6; # o with umlaut
 967 $chars{0xc975}=0x00fc; # u with umlaut
 968         # 4/10 circle above
 969 $chars{0xca41}=0x00c5; # capital a with ring above
 970 $chars{0xcaad}=0x016e; # capital u with ring above
 971 $chars{0xca61}=0x00e5; # small a with ring above
 972 $chars{0xca75}=0x016f; # small u with ring above
 973 $chars{0xca77}=0x1e98; # small w with ring above
 974 $chars{0xca79}=0x1e99; # small y with ring above
 975         # 4/11 high comma off centre
 976         # 4/12 inverted high comma centred
 977         # 4/13 double acute accent
 978 $chars{0xcd4f}=0x0150; # capital o with double acute
 979 $chars{0xcd55}=0x0170; # capital u with double acute
 980 $chars{0xcd6f}=0x0151; # small o with double acute
 981 $chars{0xcd75}=0x0171; # small u with double acute
 982         # 4/14 horn
 983 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 984 $chars{0xce55}=0x01af; # latin capital letter u with horn
 985 $chars{0xce74}=0x01a1; # latin small letter o with horn
 986 $chars{0xce75}=0x01b0; # latin small letter u with horn
 987         # 4/15 caron (hacek
 988 $chars{0xcf41}=0x01cd; # capital a with caron
 989 $chars{0xcf43}=0x010c; # capital c with caron
 990 $chars{0xcf44}=0x010e; # capital d with caron
 991 $chars{0xcf45}=0x011a; # capital e with caron
 992 $chars{0xcf47}=0x01e6; # capital g with caron
 993 $chars{0xcf49}=0x01cf; # capital i with caron
 994 $chars{0xcf4b}=0x01e8; # capital k with caron
 995 $chars{0xcf4c}=0x013d; # capital l with caron
 996 $chars{0xcf4e}=0x0147; # capital n with caron
 997 $chars{0xcf4f}=0x01d1; # capital o with caron
 998 $chars{0xcf52}=0x0158; # capital r with caron
 999 $chars{0xcf53}=0x0160; # capital s with caron
1000 $chars{0xcf54}=0x0164; # capital t with caron
1001 $chars{0xcf55}=0x01d3; # capital u with caron
1002 $chars{0xcf5a}=0x017d; # capital z with caron
1003 $chars{0xcf61}=0x01ce; # small a with caron
1004 $chars{0xcf63}=0x010d; # small c with caron
1005 $chars{0xcf64}=0x010f; # small d with caron
1006 $chars{0xcf65}=0x011b; # small e with caron
1007 $chars{0xcf67}=0x01e7; # small g with caron
1008 $chars{0xcf69}=0x01d0; # small i with caron
1009 $chars{0xcf6a}=0x01f0; # small j with caron
1010 $chars{0xcf6b}=0x01e9; # small k with caron
1011 $chars{0xcf6c}=0x013e; # small l with caron
1012 $chars{0xcf6e}=0x0148; # small n with caron
1013 $chars{0xcf6f}=0x01d2; # small o with caron
1014 $chars{0xcf72}=0x0159; # small r with caron
1015 $chars{0xcf73}=0x0161; # small s with caron
1016 $chars{0xcf74}=0x0165; # small t with caron
1017 $chars{0xcf75}=0x01d4; # small u with caron
1018 $chars{0xcf7a}=0x017e; # small z with caron
1019         # 5/0 cedilla
1020 $chars{0xd020}=0x00b8; # cedilla
1021 $chars{0xd043}=0x00c7; # capital c with cedilla
1022 $chars{0xd044}=0x1e10; # capital d with cedilla
1023 $chars{0xd047}=0x0122; # capital g with cedilla
1024 $chars{0xd048}=0x1e28; # capital h with cedilla
1025 $chars{0xd04b}=0x0136; # capital k with cedilla
1026 $chars{0xd04c}=0x013b; # capital l with cedilla
1027 $chars{0xd04e}=0x0145; # capital n with cedilla
1028 $chars{0xd052}=0x0156; # capital r with cedilla
1029 $chars{0xd053}=0x015e; # capital s with cedilla
1030 $chars{0xd054}=0x0162; # capital t with cedilla
1031 $chars{0xd063}=0x00e7; # small c with cedilla
1032 $chars{0xd064}=0x1e11; # small d with cedilla
1033 $chars{0xd065}=0x0119; # small e with cedilla
1034 $chars{0xd067}=0x0123; # small g with cedilla
1035 $chars{0xd068}=0x1e29; # small h with cedilla
1036 $chars{0xd06b}=0x0137; # small k with cedilla
1037 $chars{0xd06c}=0x013c; # small l with cedilla
1038 $chars{0xd06e}=0x0146; # small n with cedilla
1039 $chars{0xd072}=0x0157; # small r with cedilla
1040 $chars{0xd073}=0x015f; # small s with cedilla
1041 $chars{0xd074}=0x0163; # small t with cedilla
1042         # 5/1 rude
1043         # 5/2 hook to left
1044         # 5/3 ogonek (hook to right
1045 $chars{0xd320}=0x02db; # ogonek
1046 $chars{0xd341}=0x0104; # capital a with ogonek
1047 $chars{0xd345}=0x0118; # capital e with ogonek
1048 $chars{0xd349}=0x012e; # capital i with ogonek
1049 $chars{0xd34f}=0x01ea; # capital o with ogonek
1050 $chars{0xd355}=0x0172; # capital u with ogonek
1051 $chars{0xd361}=0x0105; # small a with ogonek
1052 $chars{0xd365}=0x0119; # small e with ogonek
1053 $chars{0xd369}=0x012f; # small i with ogonek
1054 $chars{0xd36f}=0x01eb; # small o with ogonek
1055 $chars{0xd375}=0x0173; # small u with ogonek
1056         # 5/4 circle below
1057 $chars{0xd441}=0x1e00; # capital a with ring below
1058 $chars{0xd461}=0x1e01; # small a with ring below
1059         # 5/5 half circle below
1060 $chars{0xf948}=0x1e2a; # capital h with breve below
1061 $chars{0xf968}=0x1e2b; # small h with breve below
1062         # 5/6 dot below
1063 $chars{0xd641}=0x1ea0; # capital a with dot below
1064 $chars{0xd642}=0x1e04; # capital b with dot below
1065 $chars{0xd644}=0x1e0c; # capital d with dot below
1066 $chars{0xd645}=0x1eb8; # capital e with dot below
1067 $chars{0xd648}=0x1e24; # capital h with dot below
1068 $chars{0xd649}=0x1eca; # capital i with dot below
1069 $chars{0xd64b}=0x1e32; # capital k with dot below
1070 $chars{0xd64c}=0x1e36; # capital l with dot below
1071 $chars{0xd64d}=0x1e42; # capital m with dot below
1072 $chars{0xd64e}=0x1e46; # capital n with dot below
1073 $chars{0xd64f}=0x1ecc; # capital o with dot below
1074 $chars{0xd652}=0x1e5a; # capital r with dot below
1075 $chars{0xd653}=0x1e62; # capital s with dot below
1076 $chars{0xd654}=0x1e6c; # capital t with dot below
1077 $chars{0xd655}=0x1ee4; # capital u with dot below
1078 $chars{0xd656}=0x1e7e; # capital v with dot below
1079 $chars{0xd657}=0x1e88; # capital w with dot below
1080 $chars{0xd659}=0x1ef4; # capital y with dot below
1081 $chars{0xd65a}=0x1e92; # capital z with dot below
1082 $chars{0xd661}=0x1ea1; # small a with dot below
1083 $chars{0xd662}=0x1e05; # small b with dot below
1084 $chars{0xd664}=0x1e0d; # small d with dot below
1085 $chars{0xd665}=0x1eb9; # small e with dot below
1086 $chars{0xd668}=0x1e25; # small h with dot below
1087 $chars{0xd669}=0x1ecb; # small i with dot below
1088 $chars{0xd66b}=0x1e33; # small k with dot below
1089 $chars{0xd66c}=0x1e37; # small l with dot below
1090 $chars{0xd66d}=0x1e43; # small m with dot below
1091 $chars{0xd66e}=0x1e47; # small n with dot below
1092 $chars{0xd66f}=0x1ecd; # small o with dot below
1093 $chars{0xd672}=0x1e5b; # small r with dot below
1094 $chars{0xd673}=0x1e63; # small s with dot below
1095 $chars{0xd674}=0x1e6d; # small t with dot below
1096 $chars{0xd675}=0x1ee5; # small u with dot below
1097 $chars{0xd676}=0x1e7f; # small v with dot below
1098 $chars{0xd677}=0x1e89; # small w with dot below
1099 $chars{0xd679}=0x1ef5; # small y with dot below
1100 $chars{0xd67a}=0x1e93; # small z with dot below
1101         # 5/7 double dot below
1102 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1103 $chars{0xd775}=0x1e73; # small u with diaeresis below
1104         # 5/8 underline
1105 $chars{0xd820}=0x005f; # underline
1106         # 5/9 double underline
1107 $chars{0xd920}=0x2017; # double underline
1108         # 5/10 small low vertical bar
1109 $chars{0xda20}=0x02cc; #
1110         # 5/11 circumflex below
1111         # 5/12 (this position shall not be used)
1112         # 5/13 left half of ligature sign and of double tilde
1113         # 5/14 right half of ligature sign
1114         # 5/15 right half of double tilde
1115 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1116
1117 sub char_decode5426 {
1118     my ( $string) = @_;
1119     my $result;
1120
1121     my @data = unpack("C*", $string);
1122     my @characters;
1123     my $length=scalar(@data);
1124     for (my $i = 0; $i < scalar(@data); $i++) {
1125       my $char= $data[$i];
1126       if ($char >= 0x00 && $char <= 0x7F){
1127         #IsAscii
1128
1129           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1130       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1131         #Combined Char
1132         my $convchar ;
1133         if ($chars{$char*256+$data[$i+1]}) {
1134           $convchar= $chars{$char * 256 + $data[$i+1]};
1135           $i++;
1136 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1137         } elsif ($chars{$char})  {
1138           $convchar= $chars{$char};
1139 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1140         }else {
1141           $convchar=$char;
1142         }
1143         push @characters,$convchar;
1144       } else {
1145         my $convchar;
1146         if ($chars{$char})  {
1147           $convchar= $chars{$char};
1148 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1149         }else {
1150 #            printf "char %x $char\n",$char;
1151           $convchar=$char;
1152         }
1153         push @characters,$convchar;
1154       }
1155     }
1156     $result=pack "U*",@characters;
1157 #     $result=~s/\x01//;
1158 #     $result=~s/\x00//;
1159      $result=~s/\x0f//;
1160      $result=~s/\x1b.//;
1161      $result=~s/\x0e//;
1162      $result=~s/\x1b\x5b//;
1163 #   map{printf "%x",$_} @characters;
1164 #   printf "\n";
1165   return $result;
1166 }
1167
1168 1;
1169
1170
1171 =head1 AUTHOR
1172
1173 Koha Development Team <http://koha-community.org/>
1174
1175 Galen Charlton <galen.charlton@liblime.com>
1176
1177 =cut