C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.07.00.049;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         NormalizeString
  37         IsStringUTF8ish
  38         MarcToUTF8Record
  39         SetUTF8Flag
  40         SetMarcUnicodeFlag
  41         StripNonXmlChars
  42         nsb_clean
  43     );
  44 }
  45
  46 =head1 NAME
  47
  48 C4::Charset - utilities for handling character set conversions.
  49
  50 =head1 SYNOPSIS
  51
  52   use C4::Charset;
  53
  54 =head1 DESCRIPTION
  55
  56 This module contains routines for dealing with character set
  57 conversions, particularly for MARC records.
  58
  59 A variety of character encodings are in use by various MARC
  60 standards, and even more character encodings are used by
  61 non-standard MARC records.  The various MARC formats generally
  62 do not do a good job of advertising a given record's character
  63 encoding, and even when a record does advertise its encoding,
  64 e.g., via the Leader/09, experience has shown that one cannot
  65 trust it.
  66
  67 Ultimately, all MARC records are stored in Koha in UTF-8 and
  68 must be converted from whatever the source character encoding is.
  69 The goal of this module is to ensure that these conversions
  70 take place accurately.  When a character conversion cannot take
  71 place, or at least not accurately, the module was provide
  72 enough information to allow user-facing code to inform the user
  73 on how to deal with the situation.
  74
  75 =cut
  76
  77 =head1 FUNCTIONS
  78
  79 =head2 IsStringUTF8ish
  80
  81   my $is_utf8 = IsStringUTF8ish($str);
  82
  83 Determines if C<$str> is valid UTF-8.  This can mean
  84 one of two things:
  85
  86 =over
  87
  88 =item *
  89
  90 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  91
  92 =item *
  93
  94 The Perl UTF-8 flag is B<not> set, but the octets contain
  95 valid UTF-8.
  96
  97 =back
  98
  99 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 100 because in one could be presented with a MARC blob that is
 101 not actually in UTF-8 but whose sequence of octets appears to be
 102 valid UTF-8.  The rest of the MARC character conversion functions
 103 will assume that this situation occur does not very often.
 104
 105 =cut
 106
 107 sub IsStringUTF8ish {
 108     my $str = shift;
 109
 110     return 1 if utf8::is_utf8($str);
 111     return utf8::decode($str);
 112 }
 113
 114 =head2 SetUTF8Flag
 115
 116   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 117
 118 This function sets the PERL UTF8 flag for data.
 119 It is required when using new_from_usmarc
 120 since MARC::File::USMARC does not handle PERL UTF8 setting.
 121 When editing unicode marc records fields and subfields, you
 122 would end up in double encoding without using this function.
 123
 124 If $nfd is set, string normalization will use NFD instead of NFC
 125
 126 FIXME
 127 In my opinion, this function belongs to MARC::Record and not
 128 to this package.
 129 But since it handles charset, and MARC::Record, it finds its way in that package
 130
 131 =cut
 132
 133 sub SetUTF8Flag{
 134         my ($record, $nfd)=@_;
 135         return unless ($record && $record->fields());
 136         foreach my $field ($record->fields()){
 137                 if ($field->tag()>=10){
 138                         my @subfields;
 139                         foreach my $subfield ($field->subfields()){
 140                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 141                         }
 142                         my $newfield=MARC::Field->new(
 143                                                         $field->tag(),
 144                                                         $field->indicator(1),
 145                                                         $field->indicator(2),
 146                                                         @subfields
 147                                                 );
 148                         $field->replace_with($newfield);
 149                 }
 150         }
 151 }
 152
 153 =head2 NormalizeString
 154
 155     my $normalized_string=NormalizeString($string,$nfd,$transform);
 156
 157 Given a string
 158 nfd : If you want to set NFD and not NFC
 159 transform : If you expect all the signs to be removed
 160
 161 Sets the PERL UTF8 Flag on your initial data if need be
 162 and applies cleaning if required
 163
 164 Returns a utf8 NFC normalized string
 165
 166 Sample code :
 167    my $string=NormalizeString ("l'ornithoptère");
 168    #results into ornithoptère in NFC form and sets UTF8 Flag
 169
 170 =cut
 171
 172
 173 sub NormalizeString{
 174         my ($string,$nfd,$transform)=@_;
 175         utf8::decode($string) unless (utf8::is_utf8($string));
 176         if ($nfd){
 177                 $string= NFD($string);
 178         }
 179         else {
 180                 $string=NFC($string);
 181         }
 182         if ($transform){
 183     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 184         #removing one letter words "d'" "l'"  was changed into "d " "l "
 185     $string=~s/\b\S\b//g;
 186     $string=~s/\s+$//g;
 187         }
 188     return $string;
 189 }
 190
 191 =head2 MarcToUTF8Record
 192
 193   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 194                                         $marc_flavour, [, $source_encoding]);
 195
 196 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 197 optional source encoding, return a C<MARC::Record> that is
 198 converted to UTF-8.
 199
 200 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 201 is not guaranteed to have been converted correctly.  Specifically,
 202 if C<$converted_from> is 'failed', the MARC record returned failed
 203 character conversion and had each of its non-ASCII octets changed
 204 to the Unicode replacement character.
 205
 206 If the source encoding was not specified, this routine will
 207 try to guess it; the character encoding used for a successful
 208 conversion is returned in C<$converted_from>.
 209
 210 =cut
 211
 212 sub MarcToUTF8Record {
 213     my $marc = shift;
 214     my $marc_flavour = shift;
 215     my $source_encoding = shift;
 216     my $marc_record;
 217     my $marc_blob_is_utf8 = 0;
 218     if (ref($marc) eq 'MARC::Record') {
 219         my $marc_blob = $marc->as_usmarc();
 220         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 221         $marc_record = $marc;
 222     } else {
 223         # dealing with a MARC blob
 224
 225         # remove any ersatz whitespace from the beginning and
 226         # end of the MARC blob -- these can creep into MARC
 227         # files produced by several sources -- caller really
 228         # should be doing this, however
 229         $marc =~ s/^\s+//;
 230         $marc =~ s/\s+$//;
 231         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 232         eval {
 233             $marc_record = MARC::Record->new_from_usmarc($marc);
 234         };
 235         if ($@) {
 236             # if we fail the first time, one likely problem
 237             # is that we have a MARC21 record that says that it's
 238             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 239             # We'll try parsing it again.
 240             substr($marc, 9, 1) = ' ';
 241             eval {
 242                 $marc_record = MARC::Record->new_from_usmarc($marc);
 243             };
 244             if ($@) {
 245                 # it's hopeless; return an empty MARC::Record
 246                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 247             }
 248         }
 249     }
 250
 251     # If we do not know the source encoding, try some guesses
 252     # as follows:
 253     #   1. Record is UTF-8 already.
 254     #   2. If MARC flavor is MARC21 or NORMARC, then
 255     #      a. record is MARC-8
 256     #      b. record is ISO-8859-1
 257     #   3. If MARC flavor is UNIMARC, then
 258     if (not defined $source_encoding) {
 259         if ($marc_blob_is_utf8) {
 260             # note that for MARC21/NORMARC we are not bothering to check
 261             # if the Leader/09 is set to 'a' or not -- because
 262             # of problems with various ILSs (including Koha in the
 263             # past, alas), this just is not trustworthy.
 264             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 265             return $marc_record, 'UTF-8', [];
 266         } else {
 267             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 268                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 269             } elsif ($marc_flavour =~/UNIMARC/) {
 270                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 271             } else {
 272                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 273             }
 274         }
 275     } else {
 276         # caller knows the character encoding
 277         my $original_marc_record = $marc_record->clone();
 278         my @errors;
 279         if ($source_encoding =~ /utf-?8/i) {
 280             if ($marc_blob_is_utf8) {
 281                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 282                 return $marc_record, 'UTF-8', [];
 283             } else {
 284                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 285             }
 286         } elsif ($source_encoding =~ /marc-?8/i) {
 287             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 288         } elsif ($source_encoding =~ /5426/) {
 289             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 290         } else {
 291             # assume any other character encoding is for Text::Iconv
 292             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 293         }
 294
 295         if (@errors) {
 296             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 297             return $original_marc_record, 'failed', \@errors;
 298         } else {
 299             return $marc_record, $source_encoding, [];
 300         }
 301     }
 302
 303 }
 304
 305 =head2 SetMarcUnicodeFlag
 306
 307   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 308
 309 Set both the internal MARC::Record encoding flag
 310 and the appropriate Leader/09 (MARC21) or
 311 100/26-29 (UNIMARC) to indicate that the record
 312 is in UTF-8.  Note that this does B<not> do
 313 any actual character conversion.
 314
 315 =cut
 316
 317 sub SetMarcUnicodeFlag {
 318     my $marc_record = shift;
 319     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 320
 321     $marc_record->encoding('UTF-8');
 322     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 323         my $leader = $marc_record->leader();
 324         substr($leader, 9, 1) = 'a';
 325         $marc_record->leader($leader);
 326     } elsif ($marc_flavour =~/UNIMARC/) {
 327         my $string;
 328                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
 329                 $string=$marc_record->subfield( 100, "a" );
 330         if (defined $string && length($string)==$subflength) {
 331                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 332         }
 333         else {
 334             $string = POSIX::strftime( "%Y%m%d", localtime );
 335             $string =~ s/\-//g;
 336             $string = sprintf( "%-*s", $subflength, $string );
 337         }
 338         substr( $string, $encodingposition, 8, "frey50  " );
 339         if ( $marc_record->subfield( 100, "a" ) ) {
 340                         $marc_record->field('100')->update(a=>$string);
 341                 }
 342                 else {
 343             $marc_record->insert_grouped_field(
 344                 MARC::Field->new( 100, '', '', "a" => $string ) );
 345         }
 346                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
 347     } else {
 348         warn "Unrecognized marcflavour: $marc_flavour";
 349     }
 350 }
 351
 352 =head2 StripNonXmlChars
 353
 354   my $new_str = StripNonXmlChars($old_str);
 355
 356 Given a string, return a copy with the
 357 characters that are illegal in XML
 358 removed.
 359
 360 This function exists to work around a problem
 361 that can occur with badly-encoded MARC records.
 362 Specifically, if a UTF-8 MARC record also
 363 has excape (\x1b) characters, MARC::File::XML
 364 will let the escape characters pass through
 365 when as_xml() or as_xml_record() is called.  The
 366 problem is that the escape character is not
 367 legal in well-formed XML documents, so when
 368 MARC::File::XML attempts to parse such a record,
 369 the XML parser will fail.
 370
 371 Stripping such characters will allow a
 372 MARC::Record->new_from_xml()
 373 to work, at the possible risk of some data loss.
 374
 375 =cut
 376
 377 sub StripNonXmlChars {
 378     my $str = shift;
 379     if (!defined($str) || $str eq ""){
 380         return "";
 381     }
 382     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 383     return $str;
 384 }
 385
 386
 387
 388 =head2 nsb_clean
 389
 390 =over 4
 391
 392 nsb_clean($string);
 393
 394 =back
 395
 396 Removes Non Sorting Block characters
 397
 398 =cut
 399 sub nsb_clean {
 400     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 401     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 402     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 403     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 404     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 405
 406     # handles non sorting blocks
 407     my ($string) = @_ ;
 408     $_ = $string ;
 409     s/$NSB//g ;
 410     s/$NSE//g ;
 411     s/$NSB2//g ;
 412     s/$NSE2//g ;
 413     s/$C2//g ;
 414     $string = $_ ;
 415
 416     return($string) ;
 417 }
 418
 419
 420 =head1 INTERNAL FUNCTIONS
 421
 422 =head2 _default_marc21_charconv_to_utf8
 423
 424   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 425
 426 Converts a C<MARC::Record> of unknown character set to UTF-8,
 427 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 428 to UTF-8, then a default conversion that replaces each non-ASCII
 429 character with the replacement character.
 430
 431 The C<$guessed_charset> return value contains the character set
 432 that resulted in a conversion to valid UTF-8; note that
 433 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 434 this is 'failed'.
 435
 436 =cut
 437
 438 sub _default_marc21_charconv_to_utf8 {
 439     my $marc_record = shift;
 440     my $marc_flavour = shift;
 441
 442     my $trial_marc8 = $marc_record->clone();
 443     my @all_errors = ();
 444     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 445     unless (@errors) {
 446         return $trial_marc8, 'MARC-8', [];
 447     }
 448     push @all_errors, @errors;
 449
 450     my $trial_8859_1 = $marc_record->clone();
 451     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 452     unless (@errors) {
 453         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 454                                                 # instead if we wanted to report details
 455                                                 # of the failed attempt at MARC-8 => UTF-8
 456     }
 457     push @all_errors, @errors;
 458
 459     my $default_converted = $marc_record->clone();
 460     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 461     return $default_converted, 'failed', \@all_errors;
 462 }
 463
 464 =head2 _default_unimarc_charconv_to_utf8
 465
 466   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 467
 468 Converts a C<MARC::Record> of unknown character set to UTF-8,
 469 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 470 to UTF-8, then a default conversion that replaces each non-ASCII
 471 character with the replacement character.
 472
 473 The C<$guessed_charset> return value contains the character set
 474 that resulted in a conversion to valid UTF-8; note that
 475 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 476 this is 'failed'.
 477
 478 =cut
 479
 480 sub _default_unimarc_charconv_to_utf8 {
 481     my $marc_record = shift;
 482     my $marc_flavour = shift;
 483
 484     my $trial_marc8 = $marc_record->clone();
 485     my @all_errors = ();
 486     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 487     unless (@errors) {
 488         return $trial_marc8, 'iso-5426';
 489     }
 490     push @all_errors, @errors;
 491
 492     my $trial_8859_1 = $marc_record->clone();
 493     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 494     unless (@errors) {
 495         return $trial_8859_1, 'iso-8859-1';
 496     }
 497     push @all_errors, @errors;
 498
 499     my $default_converted = $marc_record->clone();
 500     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 501     return $default_converted, 'failed', \@all_errors;
 502 }
 503
 504 =head2 _marc_marc8_to_utf8
 505
 506   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 507
 508 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 509 If the conversion fails for some reason, an
 510 appropriate messages will be placed in the returned
 511 C<@errors> array.
 512
 513 =cut
 514
 515 sub _marc_marc8_to_utf8 {
 516     my $marc_record = shift;
 517     my $marc_flavour = shift;
 518
 519     my $prev_ignore = MARC::Charset->ignore_errors();
 520     MARC::Charset->ignore_errors(1);
 521
 522     # trap warnings raised by MARC::Charset
 523     my @errors = ();
 524     local $SIG{__WARN__} = sub {
 525         my $msg = $_[0];
 526         if ($msg =~ /MARC.Charset/) {
 527             # FIXME - purpose of this regexp is to strip out the
 528             # line reference to MARC/Charset.pm, but as it
 529             # exists probably won't work quite on Windows --
 530             # some sort of minimal-bunch back-tracking RE
 531             # would be helpful here
 532             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 533             push @errors, $msg;
 534         } else {
 535             # if warning doesn't come from MARC::Charset, just
 536             # pass it on
 537             warn $msg;
 538         }
 539     };
 540
 541     foreach my $field ($marc_record->fields()) {
 542         if ($field->is_control_field()) {
 543             ; # do nothing -- control fields should not contain non-ASCII characters
 544         } else {
 545             my @converted_subfields;
 546             foreach my $subfield ($field->subfields()) {
 547                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 548                 unless (IsStringUTF8ish($utf8sf)) {
 549                     # Because of a bug in MARC::Charset 0.98, if the string
 550                     # has (a) one or more diacritics that (b) are only in character positions
 551                     # 128 to 255 inclusive, the resulting converted string is not in
 552                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 553                     # occurs, upgrade the string in place.  Moral of the story seems to be
 554                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 555                     # that the resulting string is UTF-8.
 556                     utf8::upgrade($utf8sf);
 557                 }
 558                 push @converted_subfields, $subfield->[0], $utf8sf;
 559             }
 560
 561             $field->replace_with(MARC::Field->new(
 562                 $field->tag(), $field->indicator(1), $field->indicator(2),
 563                 @converted_subfields)
 564             );
 565         }
 566     }
 567
 568     MARC::Charset->ignore_errors($prev_ignore);
 569
 570     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 571
 572     return @errors;
 573 }
 574
 575 =head2 _marc_iso5426_to_utf8
 576
 577   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 578
 579 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 580 If the conversion fails for some reason, an
 581 appropriate messages will be placed in the returned
 582 C<@errors> array.
 583
 584 FIXME - is ISO-5426 equivalent enough to MARC-8
 585 that C<MARC::Charset> can be used instead?
 586
 587 =cut
 588
 589 sub _marc_iso5426_to_utf8 {
 590     my $marc_record = shift;
 591     my $marc_flavour = shift;
 592
 593     my @errors = ();
 594
 595     foreach my $field ($marc_record->fields()) {
 596         if ($field->is_control_field()) {
 597             ; # do nothing -- control fields should not contain non-ASCII characters
 598         } else {
 599             my @converted_subfields;
 600             foreach my $subfield ($field->subfields()) {
 601                 my $utf8sf = char_decode5426($subfield->[1]);
 602                 push @converted_subfields, $subfield->[0], $utf8sf;
 603             }
 604
 605             $field->replace_with(MARC::Field->new(
 606                 $field->tag(), $field->indicator(1), $field->indicator(2),
 607                 @converted_subfields)
 608             );
 609         }
 610     }
 611
 612     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 613
 614     return @errors;
 615 }
 616
 617 =head2 _marc_to_utf8_via_text_iconv
 618
 619   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 620
 621 Convert a C<MARC::Record> to UTF-8 in-place using the
 622 C<Text::Iconv> CPAN module.  Any source encoding accepted
 623 by the user's iconv installation should work.  If
 624 the source encoding is not recognized on the user's
 625 server or the conversion fails for some reason,
 626 appropriate messages will be placed in the returned
 627 C<@errors> array.
 628
 629 =cut
 630
 631 sub _marc_to_utf8_via_text_iconv {
 632     my $marc_record = shift;
 633     my $marc_flavour = shift;
 634     my $source_encoding = shift;
 635
 636     my @errors = ();
 637     my $decoder;
 638     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 639     if ($@) {
 640         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 641         return @errors;
 642     }
 643
 644     my $prev_raise_error = Text::Iconv->raise_error();
 645     Text::Iconv->raise_error(1);
 646
 647     foreach my $field ($marc_record->fields()) {
 648         if ($field->is_control_field()) {
 649             ; # do nothing -- control fields should not contain non-ASCII characters
 650         } else {
 651             my @converted_subfields;
 652             foreach my $subfield ($field->subfields()) {
 653                 my $converted_value;
 654                 my $conversion_ok = 1;
 655                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 656                 if ($@) {
 657                     $conversion_ok = 0;
 658                     push @errors, $@;
 659                 } elsif (not defined $converted_value) {
 660                     $conversion_ok = 0;
 661                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 662                 }
 663
 664                 if ($conversion_ok) {
 665                     push @converted_subfields, $subfield->[0], $converted_value;
 666                 } else {
 667                     $converted_value = $subfield->[1];
 668                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 669                     push @converted_subfields, $subfield->[0], $converted_value;
 670                 }
 671             }
 672
 673             $field->replace_with(MARC::Field->new(
 674                 $field->tag(), $field->indicator(1), $field->indicator(2),
 675                 @converted_subfields)
 676             );
 677         }
 678     }
 679
 680     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 681     Text::Iconv->raise_error($prev_raise_error);
 682
 683     return @errors;
 684 }
 685
 686 =head2 _marc_to_utf8_replacement_char
 687
 688   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 689
 690 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 691 unsatisfactory method of replacing all non-ASCII (e.g.,
 692 where the eight bit is set) octet with the Unicode
 693 replacement character.  This is meant as a last-ditch
 694 method, and would be best used as part of a UI that
 695 lets a cataloguer pick various character conversions
 696 until he or she finds the right one.
 697
 698 =cut
 699
 700 sub _marc_to_utf8_replacement_char {
 701     my $marc_record = shift;
 702     my $marc_flavour = shift;
 703
 704     foreach my $field ($marc_record->fields()) {
 705         if ($field->is_control_field()) {
 706             ; # do nothing -- control fields should not contain non-ASCII characters
 707         } else {
 708             my @converted_subfields;
 709             foreach my $subfield ($field->subfields()) {
 710                 my $value = $subfield->[1];
 711                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 712                 push @converted_subfields, $subfield->[0], $value;
 713             }
 714
 715             $field->replace_with(MARC::Field->new(
 716                 $field->tag(), $field->indicator(1), $field->indicator(2),
 717                 @converted_subfields)
 718             );
 719         }
 720     }
 721
 722     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 723 }
 724
 725 =head2 char_decode5426
 726
 727   my $utf8string = char_decode5426($iso_5426_string);
 728
 729 Converts a string from ISO-5426 to UTF-8.
 730
 731 =cut
 732
 733
 734 my %chars;
 735 $chars{0xb0}=0x0101;#3/0ayn[ain]
 736 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 737 #$chars{0xb2}=0x00e0;#'à';
 738 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 739 #$chars{0xb3}=0x00e7;#'ç';
 740 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 741 # $chars{0xb4}='è';
 742 $chars{0xb4}=0x00e8;
 743 $chars{0xbd}=0x02b9;
 744 $chars{0xbe}=0x02ba;
 745 # $chars{0xb5}='é';
 746 $chars{0xb5}=0x00e9;
 747 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 748 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 749 $chars{0xfa}=0x0153; #oe
 750 $chars{0xea}=0x0152; #oe
 751 $chars{0x81d1}=0x00b0;
 752
 753 ####
 754 ## combined characters iso5426
 755
 756 $chars{0xc041}=0x1ea2; # capital a with hook above
 757 $chars{0xc045}=0x1eba; # capital e with hook above
 758 $chars{0xc049}=0x1ec8; # capital i with hook above
 759 $chars{0xc04f}=0x1ece; # capital o with hook above
 760 $chars{0xc055}=0x1ee6; # capital u with hook above
 761 $chars{0xc059}=0x1ef6; # capital y with hook above
 762 $chars{0xc061}=0x1ea3; # small a with hook above
 763 $chars{0xc065}=0x1ebb; # small e with hook above
 764 $chars{0xc069}=0x1ec9; # small i with hook above
 765 $chars{0xc06f}=0x1ecf; # small o with hook above
 766 $chars{0xc075}=0x1ee7; # small u with hook above
 767 $chars{0xc079}=0x1ef7; # small y with hook above
 768
 769         # 4/1 grave accent
 770 $chars{0xc141}=0x00c0; # capital a with grave accent
 771 $chars{0xc145}=0x00c8; # capital e with grave accent
 772 $chars{0xc149}=0x00cc; # capital i with grave accent
 773 $chars{0xc14f}=0x00d2; # capital o with grave accent
 774 $chars{0xc155}=0x00d9; # capital u with grave accent
 775 $chars{0xc157}=0x1e80; # capital w with grave
 776 $chars{0xc159}=0x1ef2; # capital y with grave
 777 $chars{0xc161}=0x00e0; # small a with grave accent
 778 $chars{0xc165}=0x00e8; # small e with grave accent
 779 $chars{0xc169}=0x00ec; # small i with grave accent
 780 $chars{0xc16f}=0x00f2; # small o with grave accent
 781 $chars{0xc175}=0x00f9; # small u with grave accent
 782 $chars{0xc177}=0x1e81; # small w with grave
 783 $chars{0xc179}=0x1ef3; # small y with grave
 784         # 4/2 acute accent
 785 $chars{0xc241}=0x00c1; # capital a with acute accent
 786 $chars{0xc243}=0x0106; # capital c with acute accent
 787 $chars{0xc245}=0x00c9; # capital e with acute accent
 788 $chars{0xc247}=0x01f4; # capital g with acute
 789 $chars{0xc249}=0x00cd; # capital i with acute accent
 790 $chars{0xc24b}=0x1e30; # capital k with acute
 791 $chars{0xc24c}=0x0139; # capital l with acute accent
 792 $chars{0xc24d}=0x1e3e; # capital m with acute
 793 $chars{0xc24e}=0x0143; # capital n with acute accent
 794 $chars{0xc24f}=0x00d3; # capital o with acute accent
 795 $chars{0xc250}=0x1e54; # capital p with acute
 796 $chars{0xc252}=0x0154; # capital r with acute accent
 797 $chars{0xc253}=0x015a; # capital s with acute accent
 798 $chars{0xc255}=0x00da; # capital u with acute accent
 799 $chars{0xc257}=0x1e82; # capital w with acute
 800 $chars{0xc259}=0x00dd; # capital y with acute accent
 801 $chars{0xc25a}=0x0179; # capital z with acute accent
 802 $chars{0xc261}=0x00e1; # small a with acute accent
 803 $chars{0xc263}=0x0107; # small c with acute accent
 804 $chars{0xc265}=0x00e9; # small e with acute accent
 805 $chars{0xc267}=0x01f5; # small g with acute
 806 $chars{0xc269}=0x00ed; # small i with acute accent
 807 $chars{0xc26b}=0x1e31; # small k with acute
 808 $chars{0xc26c}=0x013a; # small l with acute accent
 809 $chars{0xc26d}=0x1e3f; # small m with acute
 810 $chars{0xc26e}=0x0144; # small n with acute accent
 811 $chars{0xc26f}=0x00f3; # small o with acute accent
 812 $chars{0xc270}=0x1e55; # small p with acute
 813 $chars{0xc272}=0x0155; # small r with acute accent
 814 $chars{0xc273}=0x015b; # small s with acute accent
 815 $chars{0xc275}=0x00fa; # small u with acute accent
 816 $chars{0xc277}=0x1e83; # small w with acute
 817 $chars{0xc279}=0x00fd; # small y with acute accent
 818 $chars{0xc27a}=0x017a; # small z with acute accent
 819 $chars{0xc2e1}=0x01fc; # capital ae with acute
 820 $chars{0xc2f1}=0x01fd; # small ae with acute
 821        # 4/3 circumflex accent
 822 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 823 $chars{0xc343}=0x0108; # capital c with circumflex
 824 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 825 $chars{0xc347}=0x011c; # capital g with circumflex
 826 $chars{0xc348}=0x0124; # capital h with circumflex
 827 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 828 $chars{0xc34a}=0x0134; # capital j with circumflex
 829 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 830 $chars{0xc353}=0x015c; # capital s with circumflex
 831 $chars{0xc355}=0x00db; # capital u with circumflex
 832 $chars{0xc357}=0x0174; # capital w with circumflex
 833 $chars{0xc359}=0x0176; # capital y with circumflex
 834 $chars{0xc35a}=0x1e90; # capital z with circumflex
 835 $chars{0xc361}=0x00e2; # small a with circumflex accent
 836 $chars{0xc363}=0x0109; # small c with circumflex
 837 $chars{0xc365}=0x00ea; # small e with circumflex accent
 838 $chars{0xc367}=0x011d; # small g with circumflex
 839 $chars{0xc368}=0x0125; # small h with circumflex
 840 $chars{0xc369}=0x00ee; # small i with circumflex accent
 841 $chars{0xc36a}=0x0135; # small j with circumflex
 842 $chars{0xc36e}=0x00f1; # small n with tilde
 843 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 844 $chars{0xc373}=0x015d; # small s with circumflex
 845 $chars{0xc375}=0x00fb; # small u with circumflex
 846 $chars{0xc377}=0x0175; # small w with circumflex
 847 $chars{0xc379}=0x0177; # small y with circumflex
 848 $chars{0xc37a}=0x1e91; # small z with circumflex
 849         # 4/4 tilde
 850 $chars{0xc441}=0x00c3; # capital a with tilde
 851 $chars{0xc445}=0x1ebc; # capital e with tilde
 852 $chars{0xc449}=0x0128; # capital i with tilde
 853 $chars{0xc44e}=0x00d1; # capital n with tilde
 854 $chars{0xc44f}=0x00d5; # capital o with tilde
 855 $chars{0xc455}=0x0168; # capital u with tilde
 856 $chars{0xc456}=0x1e7c; # capital v with tilde
 857 $chars{0xc459}=0x1ef8; # capital y with tilde
 858 $chars{0xc461}=0x00e3; # small a with tilde
 859 $chars{0xc465}=0x1ebd; # small e with tilde
 860 $chars{0xc469}=0x0129; # small i with tilde
 861 $chars{0xc46e}=0x00f1; # small n with tilde
 862 $chars{0xc46f}=0x00f5; # small o with tilde
 863 $chars{0xc475}=0x0169; # small u with tilde
 864 $chars{0xc476}=0x1e7d; # small v with tilde
 865 $chars{0xc479}=0x1ef9; # small y with tilde
 866     # 4/5 macron
 867 $chars{0xc541}=0x0100; # capital a with macron
 868 $chars{0xc545}=0x0112; # capital e with macron
 869 $chars{0xc547}=0x1e20; # capital g with macron
 870 $chars{0xc549}=0x012a; # capital i with macron
 871 $chars{0xc54f}=0x014c; # capital o with macron
 872 $chars{0xc555}=0x016a; # capital u with macron
 873 $chars{0xc561}=0x0101; # small a with macron
 874 $chars{0xc565}=0x0113; # small e with macron
 875 $chars{0xc567}=0x1e21; # small g with macron
 876 $chars{0xc569}=0x012b; # small i with macron
 877 $chars{0xc56f}=0x014d; # small o with macron
 878 $chars{0xc575}=0x016b; # small u with macron
 879 $chars{0xc572}=0x0159; # small r with macron
 880 $chars{0xc5e1}=0x01e2; # capital ae with macron
 881 $chars{0xc5f1}=0x01e3; # small ae with macron
 882         # 4/6 breve
 883 $chars{0xc641}=0x0102; # capital a with breve
 884 $chars{0xc645}=0x0114; # capital e with breve
 885 $chars{0xc647}=0x011e; # capital g with breve
 886 $chars{0xc649}=0x012c; # capital i with breve
 887 $chars{0xc64f}=0x014e; # capital o with breve
 888 $chars{0xc655}=0x016c; # capital u with breve
 889 $chars{0xc661}=0x0103; # small a with breve
 890 $chars{0xc665}=0x0115; # small e with breve
 891 $chars{0xc667}=0x011f; # small g with breve
 892 $chars{0xc669}=0x012d; # small i with breve
 893 $chars{0xc66f}=0x014f; # small o with breve
 894 $chars{0xc675}=0x016d; # small u with breve
 895         # 4/7 dot above
 896 $chars{0xc7b0}=0x01e1; # Ain with dot above
 897 $chars{0xc742}=0x1e02; # capital b with dot above
 898 $chars{0xc743}=0x010a; # capital c with dot above
 899 $chars{0xc744}=0x1e0a; # capital d with dot above
 900 $chars{0xc745}=0x0116; # capital e with dot above
 901 $chars{0xc746}=0x1e1e; # capital f with dot above
 902 $chars{0xc747}=0x0120; # capital g with dot above
 903 $chars{0xc748}=0x1e22; # capital h with dot above
 904 $chars{0xc749}=0x0130; # capital i with dot above
 905 $chars{0xc74d}=0x1e40; # capital m with dot above
 906 $chars{0xc74e}=0x1e44; # capital n with dot above
 907 $chars{0xc750}=0x1e56; # capital p with dot above
 908 $chars{0xc752}=0x1e58; # capital r with dot above
 909 $chars{0xc753}=0x1e60; # capital s with dot above
 910 $chars{0xc754}=0x1e6a; # capital t with dot above
 911 $chars{0xc757}=0x1e86; # capital w with dot above
 912 $chars{0xc758}=0x1e8a; # capital x with dot above
 913 $chars{0xc759}=0x1e8e; # capital y with dot above
 914 $chars{0xc75a}=0x017b; # capital z with dot above
 915 $chars{0xc761}=0x0227; # small b with dot above
 916 $chars{0xc762}=0x1e03; # small b with dot above
 917 $chars{0xc763}=0x010b; # small c with dot above
 918 $chars{0xc764}=0x1e0b; # small d with dot above
 919 $chars{0xc765}=0x0117; # small e with dot above
 920 $chars{0xc766}=0x1e1f; # small f with dot above
 921 $chars{0xc767}=0x0121; # small g with dot above
 922 $chars{0xc768}=0x1e23; # small h with dot above
 923 $chars{0xc76d}=0x1e41; # small m with dot above
 924 $chars{0xc76e}=0x1e45; # small n with dot above
 925 $chars{0xc770}=0x1e57; # small p with dot above
 926 $chars{0xc772}=0x1e59; # small r with dot above
 927 $chars{0xc773}=0x1e61; # small s with dot above
 928 $chars{0xc774}=0x1e6b; # small t with dot above
 929 $chars{0xc777}=0x1e87; # small w with dot above
 930 $chars{0xc778}=0x1e8b; # small x with dot above
 931 $chars{0xc779}=0x1e8f; # small y with dot above
 932 $chars{0xc77a}=0x017c; # small z with dot above
 933         # 4/8 trema, diaresis
 934 $chars{0xc820}=0x00a8; # diaeresis
 935 $chars{0xc841}=0x00c4; # capital a with diaeresis
 936 $chars{0xc845}=0x00cb; # capital e with diaeresis
 937 $chars{0xc848}=0x1e26; # capital h with diaeresis
 938 $chars{0xc849}=0x00cf; # capital i with diaeresis
 939 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 940 $chars{0xc855}=0x00dc; # capital u with diaeresis
 941 $chars{0xc857}=0x1e84; # capital w with diaeresis
 942 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 943 $chars{0xc859}=0x0178; # capital y with diaeresis
 944 $chars{0xc861}=0x00e4; # small a with diaeresis
 945 $chars{0xc865}=0x00eb; # small e with diaeresis
 946 $chars{0xc868}=0x1e27; # small h with diaeresis
 947 $chars{0xc869}=0x00ef; # small i with diaeresis
 948 $chars{0xc86f}=0x00f6; # small o with diaeresis
 949 $chars{0xc874}=0x1e97; # small t with diaeresis
 950 $chars{0xc875}=0x00fc; # small u with diaeresis
 951 $chars{0xc877}=0x1e85; # small w with diaeresis
 952 $chars{0xc878}=0x1e8d; # small x with diaeresis
 953 $chars{0xc879}=0x00ff; # small y with diaeresis
 954         # 4/9 umlaut
 955 $chars{0xc920}=0x00a8; # [diaeresis]
 956 $chars{0xc961}=0x00e4; # a with umlaut
 957 $chars{0xc965}=0x00eb; # e with umlaut
 958 $chars{0xc969}=0x00ef; # i with umlaut
 959 $chars{0xc96f}=0x00f6; # o with umlaut
 960 $chars{0xc975}=0x00fc; # u with umlaut
 961         # 4/10 circle above
 962 $chars{0xca41}=0x00c5; # capital a with ring above
 963 $chars{0xcaad}=0x016e; # capital u with ring above
 964 $chars{0xca61}=0x00e5; # small a with ring above
 965 $chars{0xca75}=0x016f; # small u with ring above
 966 $chars{0xca77}=0x1e98; # small w with ring above
 967 $chars{0xca79}=0x1e99; # small y with ring above
 968         # 4/11 high comma off centre
 969         # 4/12 inverted high comma centred
 970         # 4/13 double acute accent
 971 $chars{0xcd4f}=0x0150; # capital o with double acute
 972 $chars{0xcd55}=0x0170; # capital u with double acute
 973 $chars{0xcd6f}=0x0151; # small o with double acute
 974 $chars{0xcd75}=0x0171; # small u with double acute
 975         # 4/14 horn
 976 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 977 $chars{0xce55}=0x01af; # latin capital letter u with horn
 978 $chars{0xce74}=0x01a1; # latin small letter o with horn
 979 $chars{0xce75}=0x01b0; # latin small letter u with horn
 980         # 4/15 caron (hacek
 981 $chars{0xcf41}=0x01cd; # capital a with caron
 982 $chars{0xcf43}=0x010c; # capital c with caron
 983 $chars{0xcf44}=0x010e; # capital d with caron
 984 $chars{0xcf45}=0x011a; # capital e with caron
 985 $chars{0xcf47}=0x01e6; # capital g with caron
 986 $chars{0xcf49}=0x01cf; # capital i with caron
 987 $chars{0xcf4b}=0x01e8; # capital k with caron
 988 $chars{0xcf4c}=0x013d; # capital l with caron
 989 $chars{0xcf4e}=0x0147; # capital n with caron
 990 $chars{0xcf4f}=0x01d1; # capital o with caron
 991 $chars{0xcf52}=0x0158; # capital r with caron
 992 $chars{0xcf53}=0x0160; # capital s with caron
 993 $chars{0xcf54}=0x0164; # capital t with caron
 994 $chars{0xcf55}=0x01d3; # capital u with caron
 995 $chars{0xcf5a}=0x017d; # capital z with caron
 996 $chars{0xcf61}=0x01ce; # small a with caron
 997 $chars{0xcf63}=0x010d; # small c with caron
 998 $chars{0xcf64}=0x010f; # small d with caron
 999 $chars{0xcf65}=0x011b; # small e with caron
1000 $chars{0xcf67}=0x01e7; # small g with caron
1001 $chars{0xcf69}=0x01d0; # small i with caron
1002 $chars{0xcf6a}=0x01f0; # small j with caron
1003 $chars{0xcf6b}=0x01e9; # small k with caron
1004 $chars{0xcf6c}=0x013e; # small l with caron
1005 $chars{0xcf6e}=0x0148; # small n with caron
1006 $chars{0xcf6f}=0x01d2; # small o with caron
1007 $chars{0xcf72}=0x0159; # small r with caron
1008 $chars{0xcf73}=0x0161; # small s with caron
1009 $chars{0xcf74}=0x0165; # small t with caron
1010 $chars{0xcf75}=0x01d4; # small u with caron
1011 $chars{0xcf7a}=0x017e; # small z with caron
1012         # 5/0 cedilla
1013 $chars{0xd020}=0x00b8; # cedilla
1014 $chars{0xd043}=0x00c7; # capital c with cedilla
1015 $chars{0xd044}=0x1e10; # capital d with cedilla
1016 $chars{0xd047}=0x0122; # capital g with cedilla
1017 $chars{0xd048}=0x1e28; # capital h with cedilla
1018 $chars{0xd04b}=0x0136; # capital k with cedilla
1019 $chars{0xd04c}=0x013b; # capital l with cedilla
1020 $chars{0xd04e}=0x0145; # capital n with cedilla
1021 $chars{0xd052}=0x0156; # capital r with cedilla
1022 $chars{0xd053}=0x015e; # capital s with cedilla
1023 $chars{0xd054}=0x0162; # capital t with cedilla
1024 $chars{0xd063}=0x00e7; # small c with cedilla
1025 $chars{0xd064}=0x1e11; # small d with cedilla
1026 $chars{0xd065}=0x0119; # small e with cedilla
1027 $chars{0xd067}=0x0123; # small g with cedilla
1028 $chars{0xd068}=0x1e29; # small h with cedilla
1029 $chars{0xd06b}=0x0137; # small k with cedilla
1030 $chars{0xd06c}=0x013c; # small l with cedilla
1031 $chars{0xd06e}=0x0146; # small n with cedilla
1032 $chars{0xd072}=0x0157; # small r with cedilla
1033 $chars{0xd073}=0x015f; # small s with cedilla
1034 $chars{0xd074}=0x0163; # small t with cedilla
1035         # 5/1 rude
1036         # 5/2 hook to left
1037         # 5/3 ogonek (hook to right
1038 $chars{0xd320}=0x02db; # ogonek
1039 $chars{0xd341}=0x0104; # capital a with ogonek
1040 $chars{0xd345}=0x0118; # capital e with ogonek
1041 $chars{0xd349}=0x012e; # capital i with ogonek
1042 $chars{0xd34f}=0x01ea; # capital o with ogonek
1043 $chars{0xd355}=0x0172; # capital u with ogonek
1044 $chars{0xd361}=0x0105; # small a with ogonek
1045 $chars{0xd365}=0x0119; # small e with ogonek
1046 $chars{0xd369}=0x012f; # small i with ogonek
1047 $chars{0xd36f}=0x01eb; # small o with ogonek
1048 $chars{0xd375}=0x0173; # small u with ogonek
1049         # 5/4 circle below
1050 $chars{0xd441}=0x1e00; # capital a with ring below
1051 $chars{0xd461}=0x1e01; # small a with ring below
1052         # 5/5 half circle below
1053 $chars{0xf948}=0x1e2a; # capital h with breve below
1054 $chars{0xf968}=0x1e2b; # small h with breve below
1055         # 5/6 dot below
1056 $chars{0xd641}=0x1ea0; # capital a with dot below
1057 $chars{0xd642}=0x1e04; # capital b with dot below
1058 $chars{0xd644}=0x1e0c; # capital d with dot below
1059 $chars{0xd645}=0x1eb8; # capital e with dot below
1060 $chars{0xd648}=0x1e24; # capital h with dot below
1061 $chars{0xd649}=0x1eca; # capital i with dot below
1062 $chars{0xd64b}=0x1e32; # capital k with dot below
1063 $chars{0xd64c}=0x1e36; # capital l with dot below
1064 $chars{0xd64d}=0x1e42; # capital m with dot below
1065 $chars{0xd64e}=0x1e46; # capital n with dot below
1066 $chars{0xd64f}=0x1ecc; # capital o with dot below
1067 $chars{0xd652}=0x1e5a; # capital r with dot below
1068 $chars{0xd653}=0x1e62; # capital s with dot below
1069 $chars{0xd654}=0x1e6c; # capital t with dot below
1070 $chars{0xd655}=0x1ee4; # capital u with dot below
1071 $chars{0xd656}=0x1e7e; # capital v with dot below
1072 $chars{0xd657}=0x1e88; # capital w with dot below
1073 $chars{0xd659}=0x1ef4; # capital y with dot below
1074 $chars{0xd65a}=0x1e92; # capital z with dot below
1075 $chars{0xd661}=0x1ea1; # small a with dot below
1076 $chars{0xd662}=0x1e05; # small b with dot below
1077 $chars{0xd664}=0x1e0d; # small d with dot below
1078 $chars{0xd665}=0x1eb9; # small e with dot below
1079 $chars{0xd668}=0x1e25; # small h with dot below
1080 $chars{0xd669}=0x1ecb; # small i with dot below
1081 $chars{0xd66b}=0x1e33; # small k with dot below
1082 $chars{0xd66c}=0x1e37; # small l with dot below
1083 $chars{0xd66d}=0x1e43; # small m with dot below
1084 $chars{0xd66e}=0x1e47; # small n with dot below
1085 $chars{0xd66f}=0x1ecd; # small o with dot below
1086 $chars{0xd672}=0x1e5b; # small r with dot below
1087 $chars{0xd673}=0x1e63; # small s with dot below
1088 $chars{0xd674}=0x1e6d; # small t with dot below
1089 $chars{0xd675}=0x1ee5; # small u with dot below
1090 $chars{0xd676}=0x1e7f; # small v with dot below
1091 $chars{0xd677}=0x1e89; # small w with dot below
1092 $chars{0xd679}=0x1ef5; # small y with dot below
1093 $chars{0xd67a}=0x1e93; # small z with dot below
1094         # 5/7 double dot below
1095 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1096 $chars{0xd775}=0x1e73; # small u with diaeresis below
1097         # 5/8 underline
1098 $chars{0xd820}=0x005f; # underline
1099         # 5/9 double underline
1100 $chars{0xd920}=0x2017; # double underline
1101         # 5/10 small low vertical bar
1102 $chars{0xda20}=0x02cc; #
1103         # 5/11 circumflex below
1104         # 5/12 (this position shall not be used)
1105         # 5/13 left half of ligature sign and of double tilde
1106         # 5/14 right half of ligature sign
1107         # 5/15 right half of double tilde
1108 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1109
1110 sub char_decode5426 {
1111     my ( $string) = @_;
1112     my $result;
1113
1114     my @data = unpack("C*", $string);
1115     my @characters;
1116     my $length=scalar(@data);
1117     for (my $i = 0; $i < scalar(@data); $i++) {
1118       my $char= $data[$i];
1119       if ($char >= 0x00 && $char <= 0x7F){
1120         #IsAscii
1121
1122           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1123       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1124         #Combined Char
1125         my $convchar ;
1126         if ($chars{$char*256+$data[$i+1]}) {
1127           $convchar= $chars{$char * 256 + $data[$i+1]};
1128           $i++;
1129 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1130         } elsif ($chars{$char})  {
1131           $convchar= $chars{$char};
1132 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1133         }else {
1134           $convchar=$char;
1135         }
1136         push @characters,$convchar;
1137       } else {
1138         my $convchar;
1139         if ($chars{$char})  {
1140           $convchar= $chars{$char};
1141 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1142         }else {
1143 #            printf "char %x $char\n",$char;
1144           $convchar=$char;
1145         }
1146         push @characters,$convchar;
1147       }
1148     }
1149     $result=pack "U*",@characters;
1150 #     $result=~s/\x01//;
1151 #     $result=~s/\x00//;
1152      $result=~s/\x0f//;
1153      $result=~s/\x1b.//;
1154      $result=~s/\x0e//;
1155      $result=~s/\x1b\x5b//;
1156 #   map{printf "%x",$_} @characters;
1157 #   printf "\n";
1158   return $result;
1159 }
1160
1161 1;
1162
1163
1164 =head1 AUTHOR
1165
1166 Koha Development Team <http://koha-community.org/>
1167
1168 Galen Charlton <galen.charlton@liblime.com>
1169
1170 =cut