C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.01;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         NormalizeString
  37         IsStringUTF8ish
  38         MarcToUTF8Record
  39         SetUTF8Flag
  40         SetMarcUnicodeFlag
  41         StripNonXmlChars
  42     );
  43 }
  44
  45 =head1 NAME
  46
  47 C4::Charset - utilities for handling character set conversions.
  48
  49 =head1 SYNOPSIS
  50
  51   use C4::Charset;
  52
  53 =head1 DESCRIPTION
  54
  55 This module contains routines for dealing with character set
  56 conversions, particularly for MARC records.
  57
  58 A variety of character encodings are in use by various MARC
  59 standards, and even more character encodings are used by
  60 non-standard MARC records.  The various MARC formats generally
  61 do not do a good job of advertising a given record's character
  62 encoding, and even when a record does advertise its encoding,
  63 e.g., via the Leader/09, experience has shown that one cannot
  64 trust it.
  65
  66 Ultimately, all MARC records are stored in Koha in UTF-8 and
  67 must be converted from whatever the source character encoding is.
  68 The goal of this module is to ensure that these conversions
  69 take place accurately.  When a character conversion cannot take
  70 place, or at least not accurately, the module was provide
  71 enough information to allow user-facing code to inform the user
  72 on how to deal with the situation.
  73
  74 =cut
  75
  76 =head1 FUNCTIONS
  77
  78 =head2 IsStringUTF8ish
  79
  80   my $is_utf8 = IsStringUTF8ish($str);
  81
  82 Determines if C<$str> is valid UTF-8.  This can mean
  83 one of two things:
  84
  85 =over
  86
  87 =item *
  88
  89 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  90
  91 =item *
  92
  93 The Perl UTF-8 flag is B<not> set, but the octets contain
  94 valid UTF-8.
  95
  96 =back
  97
  98 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  99 because in one could be presented with a MARC blob that is
 100 not actually in UTF-8 but whose sequence of octets appears to be
 101 valid UTF-8.  The rest of the MARC character conversion functions
 102 will assume that this situation occur does not very often.
 103
 104 =cut
 105
 106 sub IsStringUTF8ish {
 107     my $str = shift;
 108
 109     return 1 if utf8::is_utf8($str);
 110     return utf8::decode($str);
 111 }
 112
 113 =head2 SetUTF8Flag
 114
 115   my $marc_record = SetUTF8Flag($marc_record);
 116
 117 This function sets the PERL UTF8 flag for data.
 118 It is required when using new_from_usmarc
 119 since MARC::File::USMARC does not handle PERL UTF8 setting.
 120 When editing unicode marc records fields and subfields, you
 121 would end up in double encoding without using this function.
 122
 123 FIXME
 124 In my opinion, this function belongs to MARC::Record and not
 125 to this package.
 126 But since it handles charset, and MARC::Record, it finds its way in that package
 127
 128 =cut
 129
 130 sub SetUTF8Flag{
 131         my ($record)=@_;
 132         return unless ($record && $record->fields());
 133         foreach my $field ($record->fields()){
 134                 if ($field->tag()>=10){
 135                         my @subfields;
 136                         foreach my $subfield ($field->subfields()){
 137                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
 138                         }
 139                         my $newfield=MARC::Field->new(
 140                                                         $field->tag(),
 141                                                         $field->indicator(1),
 142                                                         $field->indicator(2),
 143                                                         @subfields
 144                                                 );
 145                         $field->replace_with($newfield);
 146                 }
 147         }
 148 }
 149
 150 =head2 NormalizeString
 151
 152     my $normalized_string=NormalizeString($string,$nfd,$transform);
 153
 154 Given a string
 155 nfd : If you want to set NFD and not NFC
 156 transform : If you expect all the signs to be removed
 157
 158 Sets the PERL UTF8 Flag on your initial data if need be
 159 and applies cleaning if required
 160
 161 Returns a utf8 NFC normalized string
 162
 163 Sample code :
 164    my $string=NormalizeString ("l'ornithoptère");
 165    #results into ornithoptère in NFC form and sets UTF8 Flag
 166
 167 =cut
 168
 169
 170 sub NormalizeString{
 171         my ($string,$nfd,$transform)=@_;
 172         utf8::decode($string) unless (utf8::is_utf8($string));
 173         if ($nfd){
 174                 $string= NFD($string);
 175         }
 176         else {
 177                 $string=NFC($string);
 178         }
 179         if ($transform){
 180     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 181         #removing one letter words "d'" "l'"  was changed into "d " "l "
 182     $string=~s/\b\S\b//g;
 183     $string=~s/\s+$//g;
 184         }
 185     return $string;
 186 }
 187
 188 =head2 MarcToUTF8Record
 189
 190   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 191                                         $marc_flavour, [, $source_encoding]);
 192
 193 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 194 optional source encoding, return a C<MARC::Record> that is
 195 converted to UTF-8.
 196
 197 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 198 is not guaranteed to have been converted correctly.  Specifically,
 199 if C<$converted_from> is 'failed', the MARC record returned failed
 200 character conversion and had each of its non-ASCII octets changed
 201 to the Unicode replacement character.
 202
 203 If the source encoding was not specified, this routine will
 204 try to guess it; the character encoding used for a successful
 205 conversion is returned in C<$converted_from>.
 206
 207 =cut
 208
 209 sub MarcToUTF8Record {
 210     my $marc = shift;
 211     my $marc_flavour = shift;
 212     my $source_encoding = shift;
 213     my $marc_record;
 214     my $marc_blob_is_utf8 = 0;
 215     if (ref($marc) eq 'MARC::Record') {
 216         my $marc_blob = $marc->as_usmarc();
 217         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 218         $marc_record = $marc;
 219     } else {
 220         # dealing with a MARC blob
 221
 222         # remove any ersatz whitespace from the beginning and
 223         # end of the MARC blob -- these can creep into MARC
 224         # files produced by several sources -- caller really
 225         # should be doing this, however
 226         $marc =~ s/^\s+//;
 227         $marc =~ s/\s+$//;
 228         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 229         eval {
 230             $marc_record = MARC::Record->new_from_usmarc($marc);
 231         };
 232         if ($@) {
 233             # if we fail the first time, one likely problem
 234             # is that we have a MARC21 record that says that it's
 235             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 236             # We'll try parsing it again.
 237             substr($marc, 9, 1) = ' ';
 238             eval {
 239                 $marc_record = MARC::Record->new_from_usmarc($marc);
 240             };
 241             if ($@) {
 242                 # it's hopeless; return an empty MARC::Record
 243                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 244             }
 245         }
 246     }
 247
 248     # If we do not know the source encoding, try some guesses
 249     # as follows:
 250     #   1. Record is UTF-8 already.
 251     #   2. If MARC flavor is MARC21 or NORMARC, then
 252     #      a. record is MARC-8
 253     #      b. record is ISO-8859-1
 254     #   3. If MARC flavor is UNIMARC, then
 255     if (not defined $source_encoding) {
 256         if ($marc_blob_is_utf8) {
 257             # note that for MARC21/NORMARC we are not bothering to check
 258             # if the Leader/09 is set to 'a' or not -- because
 259             # of problems with various ILSs (including Koha in the
 260             # past, alas), this just is not trustworthy.
 261             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 262             return $marc_record, 'UTF-8', [];
 263         } else {
 264             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 265                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 266             } elsif ($marc_flavour =~/UNIMARC/) {
 267                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 268             } else {
 269                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 270             }
 271         }
 272     } else {
 273         # caller knows the character encoding
 274         my $original_marc_record = $marc_record->clone();
 275         my @errors;
 276         if ($source_encoding =~ /utf-?8/i) {
 277             if ($marc_blob_is_utf8) {
 278                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 279                 return $marc_record, 'UTF-8', [];
 280             } else {
 281                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 282             }
 283         } elsif ($source_encoding =~ /marc-?8/i) {
 284             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 285         } elsif ($source_encoding =~ /5426/) {
 286             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 287         } else {
 288             # assume any other character encoding is for Text::Iconv
 289             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 290         }
 291
 292         if (@errors) {
 293             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 294             return $original_marc_record, 'failed', \@errors;
 295         } else {
 296             return $marc_record, $source_encoding, [];
 297         }
 298     }
 299
 300 }
 301
 302 =head2 SetMarcUnicodeFlag
 303
 304   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 305
 306 Set both the internal MARC::Record encoding flag
 307 and the appropriate Leader/09 (MARC21) or
 308 100/26-29 (UNIMARC) to indicate that the record
 309 is in UTF-8.  Note that this does B<not> do
 310 any actual character conversion.
 311
 312 =cut
 313
 314 sub SetMarcUnicodeFlag {
 315     my $marc_record = shift;
 316     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 317
 318     $marc_record->encoding('UTF-8');
 319     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 320         my $leader = $marc_record->leader();
 321         substr($leader, 9, 1) = 'a';
 322         $marc_record->leader($leader);
 323     } elsif ($marc_flavour =~/UNIMARC/) {
 324         my $string;
 325                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
 326                 $string=$marc_record->subfield( 100, "a" );
 327         if (defined $string && length($string)==$subflength) {
 328                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 329         }
 330         else {
 331             $string = POSIX::strftime( "%Y%m%d", localtime );
 332             $string =~ s/\-//g;
 333             $string = sprintf( "%-*s", $subflength, $string );
 334         }
 335         substr( $string, $encodingposition, 8, "frey50  " );
 336         if ( $marc_record->subfield( 100, "a" ) ) {
 337                         $marc_record->field('100')->update(a=>$string);
 338                 }
 339                 else {
 340             $marc_record->insert_grouped_field(
 341                 MARC::Field->new( 100, '', '', "a" => $string ) );
 342         }
 343                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
 344     } else {
 345         warn "Unrecognized marcflavour: $marc_flavour";
 346     }
 347 }
 348
 349 =head2 StripNonXmlChars
 350
 351   my $new_str = StripNonXmlChars($old_str);
 352
 353 Given a string, return a copy with the
 354 characters that are illegal in XML
 355 removed.
 356
 357 This function exists to work around a problem
 358 that can occur with badly-encoded MARC records.
 359 Specifically, if a UTF-8 MARC record also
 360 has excape (\x1b) characters, MARC::File::XML
 361 will let the escape characters pass through
 362 when as_xml() or as_xml_record() is called.  The
 363 problem is that the escape character is not
 364 legal in well-formed XML documents, so when
 365 MARC::File::XML attempts to parse such a record,
 366 the XML parser will fail.
 367
 368 Stripping such characters will allow a
 369 MARC::Record->new_from_xml()
 370 to work, at the possible risk of some data loss.
 371
 372 =cut
 373
 374 sub StripNonXmlChars {
 375     my $str = shift;
 376     if (!defined($str) || $str eq ""){
 377         return "";
 378     }
 379     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 380     return $str;
 381 }
 382
 383 =head1 INTERNAL FUNCTIONS
 384
 385 =head2 _default_marc21_charconv_to_utf8
 386
 387   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 388
 389 Converts a C<MARC::Record> of unknown character set to UTF-8,
 390 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 391 to UTF-8, then a default conversion that replaces each non-ASCII
 392 character with the replacement character.
 393
 394 The C<$guessed_charset> return value contains the character set
 395 that resulted in a conversion to valid UTF-8; note that
 396 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 397 this is 'failed'.
 398
 399 =cut
 400
 401 sub _default_marc21_charconv_to_utf8 {
 402     my $marc_record = shift;
 403     my $marc_flavour = shift;
 404
 405     my $trial_marc8 = $marc_record->clone();
 406     my @all_errors = ();
 407     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 408     unless (@errors) {
 409         return $trial_marc8, 'MARC-8', [];
 410     }
 411     push @all_errors, @errors;
 412
 413     my $trial_8859_1 = $marc_record->clone();
 414     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 415     unless (@errors) {
 416         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 417                                                 # instead if we wanted to report details
 418                                                 # of the failed attempt at MARC-8 => UTF-8
 419     }
 420     push @all_errors, @errors;
 421
 422     my $default_converted = $marc_record->clone();
 423     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 424     return $default_converted, 'failed', \@all_errors;
 425 }
 426
 427 =head2 _default_unimarc_charconv_to_utf8
 428
 429   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 430
 431 Converts a C<MARC::Record> of unknown character set to UTF-8,
 432 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 433 to UTF-8, then a default conversion that replaces each non-ASCII
 434 character with the replacement character.
 435
 436 The C<$guessed_charset> return value contains the character set
 437 that resulted in a conversion to valid UTF-8; note that
 438 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 439 this is 'failed'.
 440
 441 =cut
 442
 443 sub _default_unimarc_charconv_to_utf8 {
 444     my $marc_record = shift;
 445     my $marc_flavour = shift;
 446
 447     my $trial_marc8 = $marc_record->clone();
 448     my @all_errors = ();
 449     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 450     unless (@errors) {
 451         return $trial_marc8, 'iso-5426';
 452     }
 453     push @all_errors, @errors;
 454
 455     my $trial_8859_1 = $marc_record->clone();
 456     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 457     unless (@errors) {
 458         return $trial_8859_1, 'iso-8859-1';
 459     }
 460     push @all_errors, @errors;
 461
 462     my $default_converted = $marc_record->clone();
 463     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 464     return $default_converted, 'failed', \@all_errors;
 465 }
 466
 467 =head2 _marc_marc8_to_utf8
 468
 469   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 470
 471 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 472 If the conversion fails for some reason, an
 473 appropriate messages will be placed in the returned
 474 C<@errors> array.
 475
 476 =cut
 477
 478 sub _marc_marc8_to_utf8 {
 479     my $marc_record = shift;
 480     my $marc_flavour = shift;
 481
 482     my $prev_ignore = MARC::Charset->ignore_errors();
 483     MARC::Charset->ignore_errors(1);
 484
 485     # trap warnings raised by MARC::Charset
 486     my @errors = ();
 487     local $SIG{__WARN__} = sub {
 488         my $msg = $_[0];
 489         if ($msg =~ /MARC.Charset/) {
 490             # FIXME - purpose of this regexp is to strip out the
 491             # line reference to MARC/Charset.pm, but as it
 492             # exists probably won't work quite on Windows --
 493             # some sort of minimal-bunch back-tracking RE
 494             # would be helpful here
 495             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 496             push @errors, $msg;
 497         } else {
 498             # if warning doesn't come from MARC::Charset, just
 499             # pass it on
 500             warn $msg;
 501         }
 502     };
 503
 504     foreach my $field ($marc_record->fields()) {
 505         if ($field->is_control_field()) {
 506             ; # do nothing -- control fields should not contain non-ASCII characters
 507         } else {
 508             my @converted_subfields;
 509             foreach my $subfield ($field->subfields()) {
 510                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 511                 unless (IsStringUTF8ish($utf8sf)) {
 512                     # Because of a bug in MARC::Charset 0.98, if the string
 513                     # has (a) one or more diacritics that (b) are only in character positions
 514                     # 128 to 255 inclusive, the resulting converted string is not in
 515                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 516                     # occurs, upgrade the string in place.  Moral of the story seems to be
 517                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 518                     # that the resulting string is UTF-8.
 519                     utf8::upgrade($utf8sf);
 520                 }
 521                 push @converted_subfields, $subfield->[0], $utf8sf;
 522             }
 523
 524             $field->replace_with(MARC::Field->new(
 525                 $field->tag(), $field->indicator(1), $field->indicator(2),
 526                 @converted_subfields)
 527             );
 528         }
 529     }
 530
 531     MARC::Charset->ignore_errors($prev_ignore);
 532
 533     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 534
 535     return @errors;
 536 }
 537
 538 =head2 _marc_iso5426_to_utf8
 539
 540   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 541
 542 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 543 If the conversion fails for some reason, an
 544 appropriate messages will be placed in the returned
 545 C<@errors> array.
 546
 547 FIXME - is ISO-5426 equivalent enough to MARC-8
 548 that C<MARC::Charset> can be used instead?
 549
 550 =cut
 551
 552 sub _marc_iso5426_to_utf8 {
 553     my $marc_record = shift;
 554     my $marc_flavour = shift;
 555
 556     my @errors = ();
 557
 558     foreach my $field ($marc_record->fields()) {
 559         if ($field->is_control_field()) {
 560             ; # do nothing -- control fields should not contain non-ASCII characters
 561         } else {
 562             my @converted_subfields;
 563             foreach my $subfield ($field->subfields()) {
 564                 my $utf8sf = char_decode5426($subfield->[1]);
 565                 push @converted_subfields, $subfield->[0], $utf8sf;
 566             }
 567
 568             $field->replace_with(MARC::Field->new(
 569                 $field->tag(), $field->indicator(1), $field->indicator(2),
 570                 @converted_subfields)
 571             );
 572         }
 573     }
 574
 575     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 576
 577     return @errors;
 578 }
 579
 580 =head2 _marc_to_utf8_via_text_iconv
 581
 582   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 583
 584 Convert a C<MARC::Record> to UTF-8 in-place using the
 585 C<Text::Iconv> CPAN module.  Any source encoding accepted
 586 by the user's iconv installation should work.  If
 587 the source encoding is not recognized on the user's
 588 server or the conversion fails for some reason,
 589 appropriate messages will be placed in the returned
 590 C<@errors> array.
 591
 592 =cut
 593
 594 sub _marc_to_utf8_via_text_iconv {
 595     my $marc_record = shift;
 596     my $marc_flavour = shift;
 597     my $source_encoding = shift;
 598
 599     my @errors = ();
 600     my $decoder;
 601     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 602     if ($@) {
 603         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 604         return @errors;
 605     }
 606
 607     my $prev_raise_error = Text::Iconv->raise_error();
 608     Text::Iconv->raise_error(1);
 609
 610     foreach my $field ($marc_record->fields()) {
 611         if ($field->is_control_field()) {
 612             ; # do nothing -- control fields should not contain non-ASCII characters
 613         } else {
 614             my @converted_subfields;
 615             foreach my $subfield ($field->subfields()) {
 616                 my $converted_value;
 617                 my $conversion_ok = 1;
 618                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 619                 if ($@) {
 620                     $conversion_ok = 0;
 621                     push @errors, $@;
 622                 } elsif (not defined $converted_value) {
 623                     $conversion_ok = 0;
 624                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 625                 }
 626
 627                 if ($conversion_ok) {
 628                     push @converted_subfields, $subfield->[0], $converted_value;
 629                 } else {
 630                     $converted_value = $subfield->[1];
 631                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 632                     push @converted_subfields, $subfield->[0], $converted_value;
 633                 }
 634             }
 635
 636             $field->replace_with(MARC::Field->new(
 637                 $field->tag(), $field->indicator(1), $field->indicator(2),
 638                 @converted_subfields)
 639             );
 640         }
 641     }
 642
 643     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 644     Text::Iconv->raise_error($prev_raise_error);
 645
 646     return @errors;
 647 }
 648
 649 =head2 _marc_to_utf8_replacement_char
 650
 651   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 652
 653 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 654 unsatisfactory method of replacing all non-ASCII (e.g.,
 655 where the eight bit is set) octet with the Unicode
 656 replacement character.  This is meant as a last-ditch
 657 method, and would be best used as part of a UI that
 658 lets a cataloguer pick various character conversions
 659 until he or she finds the right one.
 660
 661 =cut
 662
 663 sub _marc_to_utf8_replacement_char {
 664     my $marc_record = shift;
 665     my $marc_flavour = shift;
 666
 667     foreach my $field ($marc_record->fields()) {
 668         if ($field->is_control_field()) {
 669             ; # do nothing -- control fields should not contain non-ASCII characters
 670         } else {
 671             my @converted_subfields;
 672             foreach my $subfield ($field->subfields()) {
 673                 my $value = $subfield->[1];
 674                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 675                 push @converted_subfields, $subfield->[0], $value;
 676             }
 677
 678             $field->replace_with(MARC::Field->new(
 679                 $field->tag(), $field->indicator(1), $field->indicator(2),
 680                 @converted_subfields)
 681             );
 682         }
 683     }
 684
 685     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 686 }
 687
 688 =head2 char_decode5426
 689
 690   my $utf8string = char_decode5426($iso_5426_string);
 691
 692 Converts a string from ISO-5426 to UTF-8.
 693
 694 =cut
 695
 696
 697 my %chars;
 698 $chars{0xb0}=0x0101;#3/0ayn[ain]
 699 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 700 #$chars{0xb2}=0x00e0;#'à';
 701 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 702 #$chars{0xb3}=0x00e7;#'ç';
 703 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 704 # $chars{0xb4}='è';
 705 $chars{0xb4}=0x00e8;
 706 $chars{0xbd}=0x02b9;
 707 $chars{0xbe}=0x02ba;
 708 # $chars{0xb5}='é';
 709 $chars{0xb5}=0x00e9;
 710 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 711 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 712 $chars{0xfa}=0x0153; #oe
 713 $chars{0xea}=0x0152; #oe
 714 $chars{0x81d1}=0x00b0;
 715
 716 ####
 717 ## combined characters iso5426
 718
 719 $chars{0xc041}=0x1ea2; # capital a with hook above
 720 $chars{0xc045}=0x1eba; # capital e with hook above
 721 $chars{0xc049}=0x1ec8; # capital i with hook above
 722 $chars{0xc04f}=0x1ece; # capital o with hook above
 723 $chars{0xc055}=0x1ee6; # capital u with hook above
 724 $chars{0xc059}=0x1ef6; # capital y with hook above
 725 $chars{0xc061}=0x1ea3; # small a with hook above
 726 $chars{0xc065}=0x1ebb; # small e with hook above
 727 $chars{0xc069}=0x1ec9; # small i with hook above
 728 $chars{0xc06f}=0x1ecf; # small o with hook above
 729 $chars{0xc075}=0x1ee7; # small u with hook above
 730 $chars{0xc079}=0x1ef7; # small y with hook above
 731
 732         # 4/1 grave accent
 733 $chars{0xc141}=0x00c0; # capital a with grave accent
 734 $chars{0xc145}=0x00c8; # capital e with grave accent
 735 $chars{0xc149}=0x00cc; # capital i with grave accent
 736 $chars{0xc14f}=0x00d2; # capital o with grave accent
 737 $chars{0xc155}=0x00d9; # capital u with grave accent
 738 $chars{0xc157}=0x1e80; # capital w with grave
 739 $chars{0xc159}=0x1ef2; # capital y with grave
 740 $chars{0xc161}=0x00e0; # small a with grave accent
 741 $chars{0xc165}=0x00e8; # small e with grave accent
 742 $chars{0xc169}=0x00ec; # small i with grave accent
 743 $chars{0xc16f}=0x00f2; # small o with grave accent
 744 $chars{0xc175}=0x00f9; # small u with grave accent
 745 $chars{0xc177}=0x1e81; # small w with grave
 746 $chars{0xc179}=0x1ef3; # small y with grave
 747         # 4/2 acute accent
 748 $chars{0xc241}=0x00c1; # capital a with acute accent
 749 $chars{0xc243}=0x0106; # capital c with acute accent
 750 $chars{0xc245}=0x00c9; # capital e with acute accent
 751 $chars{0xc247}=0x01f4; # capital g with acute
 752 $chars{0xc249}=0x00cd; # capital i with acute accent
 753 $chars{0xc24b}=0x1e30; # capital k with acute
 754 $chars{0xc24c}=0x0139; # capital l with acute accent
 755 $chars{0xc24d}=0x1e3e; # capital m with acute
 756 $chars{0xc24e}=0x0143; # capital n with acute accent
 757 $chars{0xc24f}=0x00d3; # capital o with acute accent
 758 $chars{0xc250}=0x1e54; # capital p with acute
 759 $chars{0xc252}=0x0154; # capital r with acute accent
 760 $chars{0xc253}=0x015a; # capital s with acute accent
 761 $chars{0xc255}=0x00da; # capital u with acute accent
 762 $chars{0xc257}=0x1e82; # capital w with acute
 763 $chars{0xc259}=0x00dd; # capital y with acute accent
 764 $chars{0xc25a}=0x0179; # capital z with acute accent
 765 $chars{0xc261}=0x00e1; # small a with acute accent
 766 $chars{0xc263}=0x0107; # small c with acute accent
 767 $chars{0xc265}=0x00e9; # small e with acute accent
 768 $chars{0xc267}=0x01f5; # small g with acute
 769 $chars{0xc269}=0x00ed; # small i with acute accent
 770 $chars{0xc26b}=0x1e31; # small k with acute
 771 $chars{0xc26c}=0x013a; # small l with acute accent
 772 $chars{0xc26d}=0x1e3f; # small m with acute
 773 $chars{0xc26e}=0x0144; # small n with acute accent
 774 $chars{0xc26f}=0x00f3; # small o with acute accent
 775 $chars{0xc270}=0x1e55; # small p with acute
 776 $chars{0xc272}=0x0155; # small r with acute accent
 777 $chars{0xc273}=0x015b; # small s with acute accent
 778 $chars{0xc275}=0x00fa; # small u with acute accent
 779 $chars{0xc277}=0x1e83; # small w with acute
 780 $chars{0xc279}=0x00fd; # small y with acute accent
 781 $chars{0xc27a}=0x017a; # small z with acute accent
 782 $chars{0xc2e1}=0x01fc; # capital ae with acute
 783 $chars{0xc2f1}=0x01fd; # small ae with acute
 784        # 4/3 circumflex accent
 785 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 786 $chars{0xc343}=0x0108; # capital c with circumflex
 787 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 788 $chars{0xc347}=0x011c; # capital g with circumflex
 789 $chars{0xc348}=0x0124; # capital h with circumflex
 790 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 791 $chars{0xc34a}=0x0134; # capital j with circumflex
 792 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 793 $chars{0xc353}=0x015c; # capital s with circumflex
 794 $chars{0xc355}=0x00db; # capital u with circumflex
 795 $chars{0xc357}=0x0174; # capital w with circumflex
 796 $chars{0xc359}=0x0176; # capital y with circumflex
 797 $chars{0xc35a}=0x1e90; # capital z with circumflex
 798 $chars{0xc361}=0x00e2; # small a with circumflex accent
 799 $chars{0xc363}=0x0109; # small c with circumflex
 800 $chars{0xc365}=0x00ea; # small e with circumflex accent
 801 $chars{0xc367}=0x011d; # small g with circumflex
 802 $chars{0xc368}=0x0125; # small h with circumflex
 803 $chars{0xc369}=0x00ee; # small i with circumflex accent
 804 $chars{0xc36a}=0x0135; # small j with circumflex
 805 $chars{0xc36e}=0x00f1; # small n with tilde
 806 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 807 $chars{0xc373}=0x015d; # small s with circumflex
 808 $chars{0xc375}=0x00fb; # small u with circumflex
 809 $chars{0xc377}=0x0175; # small w with circumflex
 810 $chars{0xc379}=0x0177; # small y with circumflex
 811 $chars{0xc37a}=0x1e91; # small z with circumflex
 812         # 4/4 tilde
 813 $chars{0xc441}=0x00c3; # capital a with tilde
 814 $chars{0xc445}=0x1ebc; # capital e with tilde
 815 $chars{0xc449}=0x0128; # capital i with tilde
 816 $chars{0xc44e}=0x00d1; # capital n with tilde
 817 $chars{0xc44f}=0x00d5; # capital o with tilde
 818 $chars{0xc455}=0x0168; # capital u with tilde
 819 $chars{0xc456}=0x1e7c; # capital v with tilde
 820 $chars{0xc459}=0x1ef8; # capital y with tilde
 821 $chars{0xc461}=0x00e3; # small a with tilde
 822 $chars{0xc465}=0x1ebd; # small e with tilde
 823 $chars{0xc469}=0x0129; # small i with tilde
 824 $chars{0xc46e}=0x00f1; # small n with tilde
 825 $chars{0xc46f}=0x00f5; # small o with tilde
 826 $chars{0xc475}=0x0169; # small u with tilde
 827 $chars{0xc476}=0x1e7d; # small v with tilde
 828 $chars{0xc479}=0x1ef9; # small y with tilde
 829     # 4/5 macron
 830 $chars{0xc541}=0x0100; # capital a with macron
 831 $chars{0xc545}=0x0112; # capital e with macron
 832 $chars{0xc547}=0x1e20; # capital g with macron
 833 $chars{0xc549}=0x012a; # capital i with macron
 834 $chars{0xc54f}=0x014c; # capital o with macron
 835 $chars{0xc555}=0x016a; # capital u with macron
 836 $chars{0xc561}=0x0101; # small a with macron
 837 $chars{0xc565}=0x0113; # small e with macron
 838 $chars{0xc567}=0x1e21; # small g with macron
 839 $chars{0xc569}=0x012b; # small i with macron
 840 $chars{0xc56f}=0x014d; # small o with macron
 841 $chars{0xc575}=0x016b; # small u with macron
 842 $chars{0xc572}=0x0159; # small r with macron
 843 $chars{0xc5e1}=0x01e2; # capital ae with macron
 844 $chars{0xc5f1}=0x01e3; # small ae with macron
 845         # 4/6 breve
 846 $chars{0xc641}=0x0102; # capital a with breve
 847 $chars{0xc645}=0x0114; # capital e with breve
 848 $chars{0xc647}=0x011e; # capital g with breve
 849 $chars{0xc649}=0x012c; # capital i with breve
 850 $chars{0xc64f}=0x014e; # capital o with breve
 851 $chars{0xc655}=0x016c; # capital u with breve
 852 $chars{0xc661}=0x0103; # small a with breve
 853 $chars{0xc665}=0x0115; # small e with breve
 854 $chars{0xc667}=0x011f; # small g with breve
 855 $chars{0xc669}=0x012d; # small i with breve
 856 $chars{0xc66f}=0x014f; # small o with breve
 857 $chars{0xc675}=0x016d; # small u with breve
 858         # 4/7 dot above
 859 $chars{0xc7b0}=0x01e1; # Ain with dot above
 860 $chars{0xc742}=0x1e02; # capital b with dot above
 861 $chars{0xc743}=0x010a; # capital c with dot above
 862 $chars{0xc744}=0x1e0a; # capital d with dot above
 863 $chars{0xc745}=0x0116; # capital e with dot above
 864 $chars{0xc746}=0x1e1e; # capital f with dot above
 865 $chars{0xc747}=0x0120; # capital g with dot above
 866 $chars{0xc748}=0x1e22; # capital h with dot above
 867 $chars{0xc749}=0x0130; # capital i with dot above
 868 $chars{0xc74d}=0x1e40; # capital m with dot above
 869 $chars{0xc74e}=0x1e44; # capital n with dot above
 870 $chars{0xc750}=0x1e56; # capital p with dot above
 871 $chars{0xc752}=0x1e58; # capital r with dot above
 872 $chars{0xc753}=0x1e60; # capital s with dot above
 873 $chars{0xc754}=0x1e6a; # capital t with dot above
 874 $chars{0xc757}=0x1e86; # capital w with dot above
 875 $chars{0xc758}=0x1e8a; # capital x with dot above
 876 $chars{0xc759}=0x1e8e; # capital y with dot above
 877 $chars{0xc75a}=0x017b; # capital z with dot above
 878 $chars{0xc761}=0x0227; # small b with dot above
 879 $chars{0xc762}=0x1e03; # small b with dot above
 880 $chars{0xc763}=0x010b; # small c with dot above
 881 $chars{0xc764}=0x1e0b; # small d with dot above
 882 $chars{0xc765}=0x0117; # small e with dot above
 883 $chars{0xc766}=0x1e1f; # small f with dot above
 884 $chars{0xc767}=0x0121; # small g with dot above
 885 $chars{0xc768}=0x1e23; # small h with dot above
 886 $chars{0xc76d}=0x1e41; # small m with dot above
 887 $chars{0xc76e}=0x1e45; # small n with dot above
 888 $chars{0xc770}=0x1e57; # small p with dot above
 889 $chars{0xc772}=0x1e59; # small r with dot above
 890 $chars{0xc773}=0x1e61; # small s with dot above
 891 $chars{0xc774}=0x1e6b; # small t with dot above
 892 $chars{0xc777}=0x1e87; # small w with dot above
 893 $chars{0xc778}=0x1e8b; # small x with dot above
 894 $chars{0xc779}=0x1e8f; # small y with dot above
 895 $chars{0xc77a}=0x017c; # small z with dot above
 896         # 4/8 trema, diaresis
 897 $chars{0xc820}=0x00a8; # diaeresis
 898 $chars{0xc841}=0x00c4; # capital a with diaeresis
 899 $chars{0xc845}=0x00cb; # capital e with diaeresis
 900 $chars{0xc848}=0x1e26; # capital h with diaeresis
 901 $chars{0xc849}=0x00cf; # capital i with diaeresis
 902 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 903 $chars{0xc855}=0x00dc; # capital u with diaeresis
 904 $chars{0xc857}=0x1e84; # capital w with diaeresis
 905 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 906 $chars{0xc859}=0x0178; # capital y with diaeresis
 907 $chars{0xc861}=0x00e4; # small a with diaeresis
 908 $chars{0xc865}=0x00eb; # small e with diaeresis
 909 $chars{0xc868}=0x1e27; # small h with diaeresis
 910 $chars{0xc869}=0x00ef; # small i with diaeresis
 911 $chars{0xc86f}=0x00f6; # small o with diaeresis
 912 $chars{0xc874}=0x1e97; # small t with diaeresis
 913 $chars{0xc875}=0x00fc; # small u with diaeresis
 914 $chars{0xc877}=0x1e85; # small w with diaeresis
 915 $chars{0xc878}=0x1e8d; # small x with diaeresis
 916 $chars{0xc879}=0x00ff; # small y with diaeresis
 917         # 4/9 umlaut
 918 $chars{0xc920}=0x00a8; # [diaeresis]
 919 $chars{0xc961}=0x00e4; # a with umlaut
 920 $chars{0xc965}=0x00eb; # e with umlaut
 921 $chars{0xc969}=0x00ef; # i with umlaut
 922 $chars{0xc96f}=0x00f6; # o with umlaut
 923 $chars{0xc975}=0x00fc; # u with umlaut
 924         # 4/10 circle above
 925 $chars{0xca41}=0x00c5; # capital a with ring above
 926 $chars{0xcaad}=0x016e; # capital u with ring above
 927 $chars{0xca61}=0x00e5; # small a with ring above
 928 $chars{0xca75}=0x016f; # small u with ring above
 929 $chars{0xca77}=0x1e98; # small w with ring above
 930 $chars{0xca79}=0x1e99; # small y with ring above
 931         # 4/11 high comma off centre
 932         # 4/12 inverted high comma centred
 933         # 4/13 double acute accent
 934 $chars{0xcd4f}=0x0150; # capital o with double acute
 935 $chars{0xcd55}=0x0170; # capital u with double acute
 936 $chars{0xcd6f}=0x0151; # small o with double acute
 937 $chars{0xcd75}=0x0171; # small u with double acute
 938         # 4/14 horn
 939 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 940 $chars{0xce55}=0x01af; # latin capital letter u with horn
 941 $chars{0xce74}=0x01a1; # latin small letter o with horn
 942 $chars{0xce75}=0x01b0; # latin small letter u with horn
 943         # 4/15 caron (hacek
 944 $chars{0xcf41}=0x01cd; # capital a with caron
 945 $chars{0xcf43}=0x010c; # capital c with caron
 946 $chars{0xcf44}=0x010e; # capital d with caron
 947 $chars{0xcf45}=0x011a; # capital e with caron
 948 $chars{0xcf47}=0x01e6; # capital g with caron
 949 $chars{0xcf49}=0x01cf; # capital i with caron
 950 $chars{0xcf4b}=0x01e8; # capital k with caron
 951 $chars{0xcf4c}=0x013d; # capital l with caron
 952 $chars{0xcf4e}=0x0147; # capital n with caron
 953 $chars{0xcf4f}=0x01d1; # capital o with caron
 954 $chars{0xcf52}=0x0158; # capital r with caron
 955 $chars{0xcf53}=0x0160; # capital s with caron
 956 $chars{0xcf54}=0x0164; # capital t with caron
 957 $chars{0xcf55}=0x01d3; # capital u with caron
 958 $chars{0xcf5a}=0x017d; # capital z with caron
 959 $chars{0xcf61}=0x01ce; # small a with caron
 960 $chars{0xcf63}=0x010d; # small c with caron
 961 $chars{0xcf64}=0x010f; # small d with caron
 962 $chars{0xcf65}=0x011b; # small e with caron
 963 $chars{0xcf67}=0x01e7; # small g with caron
 964 $chars{0xcf69}=0x01d0; # small i with caron
 965 $chars{0xcf6a}=0x01f0; # small j with caron
 966 $chars{0xcf6b}=0x01e9; # small k with caron
 967 $chars{0xcf6c}=0x013e; # small l with caron
 968 $chars{0xcf6e}=0x0148; # small n with caron
 969 $chars{0xcf6f}=0x01d2; # small o with caron
 970 $chars{0xcf72}=0x0159; # small r with caron
 971 $chars{0xcf73}=0x0161; # small s with caron
 972 $chars{0xcf74}=0x0165; # small t with caron
 973 $chars{0xcf75}=0x01d4; # small u with caron
 974 $chars{0xcf7a}=0x017e; # small z with caron
 975         # 5/0 cedilla
 976 $chars{0xd020}=0x00b8; # cedilla
 977 $chars{0xd043}=0x00c7; # capital c with cedilla
 978 $chars{0xd044}=0x1e10; # capital d with cedilla
 979 $chars{0xd047}=0x0122; # capital g with cedilla
 980 $chars{0xd048}=0x1e28; # capital h with cedilla
 981 $chars{0xd04b}=0x0136; # capital k with cedilla
 982 $chars{0xd04c}=0x013b; # capital l with cedilla
 983 $chars{0xd04e}=0x0145; # capital n with cedilla
 984 $chars{0xd052}=0x0156; # capital r with cedilla
 985 $chars{0xd053}=0x015e; # capital s with cedilla
 986 $chars{0xd054}=0x0162; # capital t with cedilla
 987 $chars{0xd063}=0x00e7; # small c with cedilla
 988 $chars{0xd064}=0x1e11; # small d with cedilla
 989 $chars{0xd065}=0x0119; # small e with cedilla
 990 $chars{0xd067}=0x0123; # small g with cedilla
 991 $chars{0xd068}=0x1e29; # small h with cedilla
 992 $chars{0xd06b}=0x0137; # small k with cedilla
 993 $chars{0xd06c}=0x013c; # small l with cedilla
 994 $chars{0xd06e}=0x0146; # small n with cedilla
 995 $chars{0xd072}=0x0157; # small r with cedilla
 996 $chars{0xd073}=0x015f; # small s with cedilla
 997 $chars{0xd074}=0x0163; # small t with cedilla
 998         # 5/1 rude
 999         # 5/2 hook to left
1000         # 5/3 ogonek (hook to right
1001 $chars{0xd320}=0x02db; # ogonek
1002 $chars{0xd341}=0x0104; # capital a with ogonek
1003 $chars{0xd345}=0x0118; # capital e with ogonek
1004 $chars{0xd349}=0x012e; # capital i with ogonek
1005 $chars{0xd34f}=0x01ea; # capital o with ogonek
1006 $chars{0xd355}=0x0172; # capital u with ogonek
1007 $chars{0xd361}=0x0105; # small a with ogonek
1008 $chars{0xd365}=0x0119; # small e with ogonek
1009 $chars{0xd369}=0x012f; # small i with ogonek
1010 $chars{0xd36f}=0x01eb; # small o with ogonek
1011 $chars{0xd375}=0x0173; # small u with ogonek
1012         # 5/4 circle below
1013 $chars{0xd441}=0x1e00; # capital a with ring below
1014 $chars{0xd461}=0x1e01; # small a with ring below
1015         # 5/5 half circle below
1016 $chars{0xf948}=0x1e2a; # capital h with breve below
1017 $chars{0xf968}=0x1e2b; # small h with breve below
1018         # 5/6 dot below
1019 $chars{0xd641}=0x1ea0; # capital a with dot below
1020 $chars{0xd642}=0x1e04; # capital b with dot below
1021 $chars{0xd644}=0x1e0c; # capital d with dot below
1022 $chars{0xd645}=0x1eb8; # capital e with dot below
1023 $chars{0xd648}=0x1e24; # capital h with dot below
1024 $chars{0xd649}=0x1eca; # capital i with dot below
1025 $chars{0xd64b}=0x1e32; # capital k with dot below
1026 $chars{0xd64c}=0x1e36; # capital l with dot below
1027 $chars{0xd64d}=0x1e42; # capital m with dot below
1028 $chars{0xd64e}=0x1e46; # capital n with dot below
1029 $chars{0xd64f}=0x1ecc; # capital o with dot below
1030 $chars{0xd652}=0x1e5a; # capital r with dot below
1031 $chars{0xd653}=0x1e62; # capital s with dot below
1032 $chars{0xd654}=0x1e6c; # capital t with dot below
1033 $chars{0xd655}=0x1ee4; # capital u with dot below
1034 $chars{0xd656}=0x1e7e; # capital v with dot below
1035 $chars{0xd657}=0x1e88; # capital w with dot below
1036 $chars{0xd659}=0x1ef4; # capital y with dot below
1037 $chars{0xd65a}=0x1e92; # capital z with dot below
1038 $chars{0xd661}=0x1ea1; # small a with dot below
1039 $chars{0xd662}=0x1e05; # small b with dot below
1040 $chars{0xd664}=0x1e0d; # small d with dot below
1041 $chars{0xd665}=0x1eb9; # small e with dot below
1042 $chars{0xd668}=0x1e25; # small h with dot below
1043 $chars{0xd669}=0x1ecb; # small i with dot below
1044 $chars{0xd66b}=0x1e33; # small k with dot below
1045 $chars{0xd66c}=0x1e37; # small l with dot below
1046 $chars{0xd66d}=0x1e43; # small m with dot below
1047 $chars{0xd66e}=0x1e47; # small n with dot below
1048 $chars{0xd66f}=0x1ecd; # small o with dot below
1049 $chars{0xd672}=0x1e5b; # small r with dot below
1050 $chars{0xd673}=0x1e63; # small s with dot below
1051 $chars{0xd674}=0x1e6d; # small t with dot below
1052 $chars{0xd675}=0x1ee5; # small u with dot below
1053 $chars{0xd676}=0x1e7f; # small v with dot below
1054 $chars{0xd677}=0x1e89; # small w with dot below
1055 $chars{0xd679}=0x1ef5; # small y with dot below
1056 $chars{0xd67a}=0x1e93; # small z with dot below
1057         # 5/7 double dot below
1058 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1059 $chars{0xd775}=0x1e73; # small u with diaeresis below
1060         # 5/8 underline
1061 $chars{0xd820}=0x005f; # underline
1062         # 5/9 double underline
1063 $chars{0xd920}=0x2017; # double underline
1064         # 5/10 small low vertical bar
1065 $chars{0xda20}=0x02cc; #
1066         # 5/11 circumflex below
1067         # 5/12 (this position shall not be used)
1068         # 5/13 left half of ligature sign and of double tilde
1069         # 5/14 right half of ligature sign
1070         # 5/15 right half of double tilde
1071 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1072
1073 sub char_decode5426 {
1074     my ( $string) = @_;
1075     my $result;
1076
1077     my @data = unpack("C*", $string);
1078     my @characters;
1079     my $length=scalar(@data);
1080     for (my $i = 0; $i < scalar(@data); $i++) {
1081       my $char= $data[$i];
1082       if ($char >= 0x00 && $char <= 0x7F){
1083         #IsAscii
1084
1085           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1086       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1087         #Combined Char
1088         my $convchar ;
1089         if ($chars{$char*256+$data[$i+1]}) {
1090           $convchar= $chars{$char * 256 + $data[$i+1]};
1091           $i++;
1092 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1093         } elsif ($chars{$char})  {
1094           $convchar= $chars{$char};
1095 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1096         }else {
1097           $convchar=$char;
1098         }
1099         push @characters,$convchar;
1100       } else {
1101         my $convchar;
1102         if ($chars{$char})  {
1103           $convchar= $chars{$char};
1104 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1105         }else {
1106 #            printf "char %x $char\n",$char;
1107           $convchar=$char;
1108         }
1109         push @characters,$convchar;
1110       }
1111     }
1112     $result=pack "U*",@characters;
1113 #     $result=~s/\x01//;
1114 #     $result=~s/\x00//;
1115      $result=~s/\x0f//;
1116      $result=~s/\x1b.//;
1117      $result=~s/\x0e//;
1118      $result=~s/\x1b\x5b//;
1119 #   map{printf "%x",$_} @characters;
1120 #   printf "\n";
1121   return $result;
1122 }
1123
1124 1;
1125
1126
1127 =head1 AUTHOR
1128
1129 Koha Development Team <http://koha-community.org/>
1130
1131 Galen Charlton <galen.charlton@liblime.com>
1132
1133 =cut