C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.01;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         IsStringUTF8ish
  37         MarcToUTF8Record
  38         SetUTF8Flag
  39         SetMarcUnicodeFlag
  40         StripNonXmlChars
  41     );
  42 }
  43
  44 =head1 NAME
  45
  46 C4::Charset - utilities for handling character set conversions.
  47
  48 =head1 SYNOPSIS
  49
  50 use C4::Charset;
  51
  52 =head1 DESCRIPTION
  53
  54 This module contains routines for dealing with character set
  55 conversions, particularly for MARC records.
  56
  57 A variety of character encodings are in use by various MARC
  58 standards, and even more character encodings are used by
  59 non-standard MARC records.  The various MARC formats generally
  60 do not do a good job of advertising a given record's character
  61 encoding, and even when a record does advertise its encoding,
  62 e.g., via the Leader/09, experience has shown that one cannot
  63 trust it.
  64
  65 Ultimately, all MARC records are stored in Koha in UTF-8 and
  66 must be converted from whatever the source character encoding is.
  67 The goal of this module is to ensure that these conversions
  68 take place accurately.  When a character conversion cannot take
  69 place, or at least not accurately, the module was provide
  70 enough information to allow user-facing code to inform the user
  71 on how to deal with the situation.
  72
  73 =cut
  74
  75 =head1 FUNCTIONS
  76
  77 =head2 IsStringUTF8ish
  78
  79 =over 4
  80
  81 my $is_utf8 = IsStringUTF8ish($str);
  82
  83 =back
  84
  85 Determines if C<$str> is valid UTF-8.  This can mean
  86 one of two things:
  87
  88 =over 2
  89
  90 =item *
  91
  92 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  93
  94 =item *
  95
  96 The Perl UTF-8 flag is B<not> set, but the octets contain
  97 valid UTF-8.
  98
  99 =back
 100
 101 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 102 because in one could be presented with a MARC blob that is
 103 not actually in UTF-8 but whose sequence of octets appears to be
 104 valid UTF-8.  The rest of the MARC character conversion functions
 105 will assume that this situation occur does not very often.
 106
 107 =cut
 108
 109 sub IsStringUTF8ish {
 110     my $str = shift;
 111
 112     return 1 if utf8::is_utf8($str);
 113     return utf8::decode($str);
 114 }
 115
 116 =head2 SetUTF8Flag
 117
 118 =over 4
 119
 120 my $marc_record = SetUTF8Flag($marc_record);
 121
 122 =back
 123
 124 This function sets the PERL UTF8 flag for data.
 125 It is required when using new_from_usmarc
 126 since MARC::File::USMARC does not handle PERL UTF8 setting.
 127 When editing unicode marc records fields and subfields, you
 128 would end up in double encoding without using this function.
 129
 130 FIXME
 131 In my opinion, this function belongs to MARC::Record and not
 132 to this package.
 133 But since it handles charset, and MARC::Record, it finds its way in that package
 134
 135 =cut
 136
 137 sub SetUTF8Flag{
 138         my ($record)=@_;
 139         return unless ($record && $record->fields());
 140         foreach my $field ($record->fields()){
 141                 if ($field->tag()>=10){
 142                         my @subfields;
 143                         foreach my $subfield ($field->subfields()){
 144                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
 145                         }
 146                         my $newfield=MARC::Field->new(
 147                                                         $field->tag(),
 148                                                         $field->indicator(1),
 149                                                         $field->indicator(2),
 150                                                         @subfields
 151                                                 );
 152                         $field->replace_with($newfield);
 153                 }
 154         }
 155 }
 156
 157 =head2 NormalizeString
 158
 159 =over 4
 160
 161     my $normalized_string=NormalizeString($string);
 162
 163 =back
 164         Given
 165             a string
 166         nfc : If you want to set NFC and not NFD
 167         transform : If you expect all the signs to be removed
 168     Sets the PERL UTF8 Flag on your initial data if need be
 169     and applies cleaning if required
 170
 171         Returns a utf8 NFD normalized string
 172
 173         Sample code :
 174         my $string=NormalizeString ("l'ornithoptère");
 175     #results into ornithoptère in NFD form and sets UTF8 Flag
 176 =cut
 177
 178 sub NormalizeString{
 179         my ($string,$nfc,$transform)=@_;
 180         utf8::decode($string) unless (utf8::is_utf8($string));
 181         if ($nfc){
 182                 $string= NFD($string);
 183         }
 184         else {
 185                 $string=NFC($string);
 186         }
 187         if ($transform){
 188     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 189         #removing one letter words "d'" "l'"  was changed into "d " "l "
 190     $string=~s/\b\S\b//g;
 191     $string=~s/\s+$//g;
 192         }
 193     return $string;
 194 }
 195
 196 =head2 MarcToUTF8Record
 197
 198 =over 4
 199
 200 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 201
 202 =back
 203
 204 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 205 optional source encoding, return a C<MARC::Record> that is
 206 converted to UTF-8.
 207
 208 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 209 is not guaranteed to have been converted correctly.  Specifically,
 210 if C<$converted_from> is 'failed', the MARC record returned failed
 211 character conversion and had each of its non-ASCII octets changed
 212 to the Unicode replacement character.
 213
 214 If the source encoding was not specified, this routine will
 215 try to guess it; the character encoding used for a successful
 216 conversion is returned in C<$converted_from>.
 217
 218 =cut
 219
 220 sub MarcToUTF8Record {
 221     my $marc = shift;
 222     my $marc_flavour = shift;
 223     my $source_encoding = shift;
 224     my $marc_record;
 225     my $marc_blob_is_utf8 = 0;
 226     if (ref($marc) eq 'MARC::Record') {
 227         my $marc_blob = $marc->as_usmarc();
 228         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 229         $marc_record = $marc;
 230     } else {
 231         # dealing with a MARC blob
 232
 233         # remove any ersatz whitespace from the beginning and
 234         # end of the MARC blob -- these can creep into MARC
 235         # files produced by several sources -- caller really
 236         # should be doing this, however
 237         $marc =~ s/^\s+//;
 238         $marc =~ s/\s+$//;
 239         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 240         eval {
 241             $marc_record = MARC::Record->new_from_usmarc($marc);
 242         };
 243         if ($@) {
 244             # if we fail the first time, one likely problem
 245             # is that we have a MARC21 record that says that it's
 246             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 247             # We'll try parsing it again.
 248             substr($marc, 9, 1) = ' ';
 249             eval {
 250                 $marc_record = MARC::Record->new_from_usmarc($marc);
 251             };
 252             if ($@) {
 253                 # it's hopeless; return an empty MARC::Record
 254                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 255             }
 256         }
 257     }
 258
 259     # If we do not know the source encoding, try some guesses
 260     # as follows:
 261     #   1. Record is UTF-8 already.
 262     #   2. If MARC flavor is MARC21, then
 263     #      a. record is MARC-8
 264     #      b. record is ISO-8859-1
 265     #   3. If MARC flavor is UNIMARC, then
 266     if (not defined $source_encoding) {
 267         if ($marc_blob_is_utf8) {
 268             # note that for MARC21 we are not bothering to check
 269             # if the Leader/09 is set to 'a' or not -- because
 270             # of problems with various ILSs (including Koha in the
 271             # past, alas), this just is not trustworthy.
 272             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 273             return $marc_record, 'UTF-8', [];
 274         } else {
 275             if ($marc_flavour eq 'MARC21') {
 276                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 277             } elsif ($marc_flavour =~/UNIMARC/) {
 278                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 279             } else {
 280                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 281             }
 282         }
 283     } else {
 284         # caller knows the character encoding
 285         my $original_marc_record = $marc_record->clone();
 286         my @errors;
 287         if ($source_encoding =~ /utf-?8/i) {
 288             if ($marc_blob_is_utf8) {
 289                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 290                 return $marc_record, 'UTF-8', [];
 291             } else {
 292                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 293             }
 294         } elsif ($source_encoding =~ /marc-?8/i) {
 295             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 296         } elsif ($source_encoding =~ /5426/) {
 297             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 298         } else {
 299             # assume any other character encoding is for Text::Iconv
 300             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 301         }
 302
 303         if (@errors) {
 304             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 305             return $original_marc_record, 'failed', \@errors;
 306         } else {
 307             return $marc_record, $source_encoding, [];
 308         }
 309     }
 310
 311 }
 312
 313 =head2 SetMarcUnicodeFlag
 314
 315 =over 4
 316
 317 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 318
 319 =back
 320
 321 Set both the internal MARC::Record encoding flag
 322 and the appropriate Leader/09 (MARC21) or
 323 100/26-29 (UNIMARC) to indicate that the record
 324 is in UTF-8.  Note that this does B<not> do
 325 any actual character conversion.
 326
 327 =cut
 328
 329 sub SetMarcUnicodeFlag {
 330     my $marc_record = shift;
 331     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 332
 333     $marc_record->encoding('UTF-8');
 334     if ($marc_flavour eq 'MARC21') {
 335         my $leader = $marc_record->leader();
 336         substr($leader, 9, 1) = 'a';
 337         $marc_record->leader($leader);
 338     } elsif ($marc_flavour =~/UNIMARC/) {
 339         my $string;
 340                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
 341                 $string=$marc_record->subfield( 100, "a" );
 342         if (defined $string && length($string)==$subflength) {
 343                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 344         }
 345         else {
 346             $string = POSIX::strftime( "%Y%m%d", localtime );
 347             $string =~ s/\-//g;
 348             $string = sprintf( "%-*s", $subflength, $string );
 349         }
 350         substr( $string, $encodingposition, 8, "frey50  " );
 351         if ( $marc_record->subfield( 100, "a" ) ) {
 352                         $marc_record->field('100')->update(a=>$string);
 353                 }
 354                 else {
 355             $marc_record->insert_grouped_field(
 356                 MARC::Field->new( 100, '', '', "a" => $string ) );
 357         }
 358                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
 359     } else {
 360         warn "Unrecognized marcflavour: $marc_flavour";
 361     }
 362 }
 363
 364 =head2 StripNonXmlChars
 365
 366 =over 4
 367
 368 my $new_str = StripNonXmlChars($old_str);
 369
 370 =back
 371
 372 Given a string, return a copy with the
 373 characters that are illegal in XML
 374 removed.
 375
 376 This function exists to work around a problem
 377 that can occur with badly-encoded MARC records.
 378 Specifically, if a UTF-8 MARC record also
 379 has excape (\x1b) characters, MARC::File::XML
 380 will let the escape characters pass through
 381 when as_xml() or as_xml_record() is called.  The
 382 problem is that the escape character is not
 383 legal in well-formed XML documents, so when
 384 MARC::File::XML attempts to parse such a record,
 385 the XML parser will fail.
 386
 387 Stripping such characters will allow a
 388 MARC::Record->new_from_xml()
 389 to work, at the possible risk of some data loss.
 390
 391 =cut
 392
 393 sub StripNonXmlChars {
 394     my $str = shift;
 395     if (!defined($str) || $str eq ""){
 396         return "";
 397     }
 398     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 399     return $str;
 400 }
 401
 402 =head1 INTERNAL FUNCTIONS
 403
 404 =head2 _default_marc21_charconv_to_utf8
 405
 406 =over 4
 407
 408 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 409
 410 =back
 411
 412 Converts a C<MARC::Record> of unknown character set to UTF-8,
 413 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 414 to UTF-8, then a default conversion that replaces each non-ASCII
 415 character with the replacement character.
 416
 417 The C<$guessed_charset> return value contains the character set
 418 that resulted in a conversion to valid UTF-8; note that
 419 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 420 this is 'failed'.
 421
 422 =cut
 423
 424 sub _default_marc21_charconv_to_utf8 {
 425     my $marc_record = shift;
 426     my $marc_flavour = shift;
 427
 428     my $trial_marc8 = $marc_record->clone();
 429     my @all_errors = ();
 430     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 431     unless (@errors) {
 432         return $trial_marc8, 'MARC-8', [];
 433     }
 434     push @all_errors, @errors;
 435
 436     my $trial_8859_1 = $marc_record->clone();
 437     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 438     unless (@errors) {
 439         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 440                                                 # instead if we wanted to report details
 441                                                 # of the failed attempt at MARC-8 => UTF-8
 442     }
 443     push @all_errors, @errors;
 444
 445     my $default_converted = $marc_record->clone();
 446     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 447     return $default_converted, 'failed', \@all_errors;
 448 }
 449
 450 =head2 _default_unimarc_charconv_to_utf8
 451
 452 =over 4
 453
 454 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 455
 456 =back
 457
 458 Converts a C<MARC::Record> of unknown character set to UTF-8,
 459 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 460 to UTF-8, then a default conversion that replaces each non-ASCII
 461 character with the replacement character.
 462
 463 The C<$guessed_charset> return value contains the character set
 464 that resulted in a conversion to valid UTF-8; note that
 465 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 466 this is 'failed'.
 467
 468 =cut
 469
 470 sub _default_unimarc_charconv_to_utf8 {
 471     my $marc_record = shift;
 472     my $marc_flavour = shift;
 473
 474     my $trial_marc8 = $marc_record->clone();
 475     my @all_errors = ();
 476     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 477     unless (@errors) {
 478         return $trial_marc8, 'iso-5426';
 479     }
 480     push @all_errors, @errors;
 481
 482     my $trial_8859_1 = $marc_record->clone();
 483     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 484     unless (@errors) {
 485         return $trial_8859_1, 'iso-8859-1';
 486     }
 487     push @all_errors, @errors;
 488
 489     my $default_converted = $marc_record->clone();
 490     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 491     return $default_converted, 'failed', \@all_errors;
 492 }
 493
 494 =head2 _marc_marc8_to_utf8
 495
 496 =over 4
 497
 498 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 499
 500 =back
 501
 502 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 503 If the conversion fails for some reason, an
 504 appropriate messages will be placed in the returned
 505 C<@errors> array.
 506
 507 =cut
 508
 509 sub _marc_marc8_to_utf8 {
 510     my $marc_record = shift;
 511     my $marc_flavour = shift;
 512
 513     my $prev_ignore = MARC::Charset->ignore_errors();
 514     MARC::Charset->ignore_errors(1);
 515
 516     # trap warnings raised by MARC::Charset
 517     my @errors = ();
 518     local $SIG{__WARN__} = sub {
 519         my $msg = $_[0];
 520         if ($msg =~ /MARC.Charset/) {
 521             # FIXME - purpose of this regexp is to strip out the
 522             # line reference to MARC/Charset.pm, but as it
 523             # exists probably won't work quite on Windows --
 524             # some sort of minimal-bunch back-tracking RE
 525             # would be helpful here
 526             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 527             push @errors, $msg;
 528         } else {
 529             # if warning doesn't come from MARC::Charset, just
 530             # pass it on
 531             warn $msg;
 532         }
 533     };
 534
 535     foreach my $field ($marc_record->fields()) {
 536         if ($field->is_control_field()) {
 537             ; # do nothing -- control fields should not contain non-ASCII characters
 538         } else {
 539             my @converted_subfields;
 540             foreach my $subfield ($field->subfields()) {
 541                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 542                 unless (IsStringUTF8ish($utf8sf)) {
 543                     # Because of a bug in MARC::Charset 0.98, if the string
 544                     # has (a) one or more diacritics that (b) are only in character positions
 545                     # 128 to 255 inclusive, the resulting converted string is not in
 546                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 547                     # occurs, upgrade the string in place.  Moral of the story seems to be
 548                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 549                     # that the resulting string is UTF-8.
 550                     utf8::upgrade($utf8sf);
 551                 }
 552                 push @converted_subfields, $subfield->[0], $utf8sf;
 553             }
 554
 555             $field->replace_with(MARC::Field->new(
 556                 $field->tag(), $field->indicator(1), $field->indicator(2),
 557                 @converted_subfields)
 558             );
 559         }
 560     }
 561
 562     MARC::Charset->ignore_errors($prev_ignore);
 563
 564     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 565
 566     return @errors;
 567 }
 568
 569 =head2 _marc_iso5426_to_utf8
 570
 571 =over 4
 572
 573 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 574
 575 =back
 576
 577 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 578 If the conversion fails for some reason, an
 579 appropriate messages will be placed in the returned
 580 C<@errors> array.
 581
 582 FIXME - is ISO-5426 equivalent enough to MARC-8
 583 that C<MARC::Charset> can be used instead?
 584
 585 =cut
 586
 587 sub _marc_iso5426_to_utf8 {
 588     my $marc_record = shift;
 589     my $marc_flavour = shift;
 590
 591     my @errors = ();
 592
 593     foreach my $field ($marc_record->fields()) {
 594         if ($field->is_control_field()) {
 595             ; # do nothing -- control fields should not contain non-ASCII characters
 596         } else {
 597             my @converted_subfields;
 598             foreach my $subfield ($field->subfields()) {
 599                 my $utf8sf = char_decode5426($subfield->[1]);
 600                 push @converted_subfields, $subfield->[0], $utf8sf;
 601             }
 602
 603             $field->replace_with(MARC::Field->new(
 604                 $field->tag(), $field->indicator(1), $field->indicator(2),
 605                 @converted_subfields)
 606             );
 607         }
 608     }
 609
 610     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 611
 612     return @errors;
 613 }
 614
 615 =head2 _marc_to_utf8_via_text_iconv
 616
 617 =over 4
 618
 619 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 620
 621 =back
 622
 623 Convert a C<MARC::Record> to UTF-8 in-place using the
 624 C<Text::Iconv> CPAN module.  Any source encoding accepted
 625 by the user's iconv installation should work.  If
 626 the source encoding is not recognized on the user's
 627 server or the conversion fails for some reason,
 628 appropriate messages will be placed in the returned
 629 C<@errors> array.
 630
 631 =cut
 632
 633 sub _marc_to_utf8_via_text_iconv {
 634     my $marc_record = shift;
 635     my $marc_flavour = shift;
 636     my $source_encoding = shift;
 637
 638     my @errors = ();
 639     my $decoder;
 640     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 641     if ($@) {
 642         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 643         return @errors;
 644     }
 645
 646     my $prev_raise_error = Text::Iconv->raise_error();
 647     Text::Iconv->raise_error(1);
 648
 649     foreach my $field ($marc_record->fields()) {
 650         if ($field->is_control_field()) {
 651             ; # do nothing -- control fields should not contain non-ASCII characters
 652         } else {
 653             my @converted_subfields;
 654             foreach my $subfield ($field->subfields()) {
 655                 my $converted_value;
 656                 my $conversion_ok = 1;
 657                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 658                 if ($@) {
 659                     $conversion_ok = 0;
 660                     push @errors, $@;
 661                 } elsif (not defined $converted_value) {
 662                     $conversion_ok = 0;
 663                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 664                 }
 665
 666                 if ($conversion_ok) {
 667                     push @converted_subfields, $subfield->[0], $converted_value;
 668                 } else {
 669                     $converted_value = $subfield->[1];
 670                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 671                     push @converted_subfields, $subfield->[0], $converted_value;
 672                 }
 673             }
 674
 675             $field->replace_with(MARC::Field->new(
 676                 $field->tag(), $field->indicator(1), $field->indicator(2),
 677                 @converted_subfields)
 678             );
 679         }
 680     }
 681
 682     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 683     Text::Iconv->raise_error($prev_raise_error);
 684
 685     return @errors;
 686 }
 687
 688 =head2 _marc_to_utf8_replacement_char
 689
 690 =over 4
 691
 692 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 693
 694 =back
 695
 696 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 697 unsatisfactory method of replacing all non-ASCII (e.g.,
 698 where the eight bit is set) octet with the Unicode
 699 replacement character.  This is meant as a last-ditch
 700 method, and would be best used as part of a UI that
 701 lets a cataloguer pick various character conversions
 702 until he or she finds the right one.
 703
 704 =cut
 705
 706 sub _marc_to_utf8_replacement_char {
 707     my $marc_record = shift;
 708     my $marc_flavour = shift;
 709
 710     foreach my $field ($marc_record->fields()) {
 711         if ($field->is_control_field()) {
 712             ; # do nothing -- control fields should not contain non-ASCII characters
 713         } else {
 714             my @converted_subfields;
 715             foreach my $subfield ($field->subfields()) {
 716                 my $value = $subfield->[1];
 717                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 718                 push @converted_subfields, $subfield->[0], $value;
 719             }
 720
 721             $field->replace_with(MARC::Field->new(
 722                 $field->tag(), $field->indicator(1), $field->indicator(2),
 723                 @converted_subfields)
 724             );
 725         }
 726     }
 727
 728     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 729 }
 730
 731 =head2 char_decode5426
 732
 733 =over 4
 734
 735 my $utf8string = char_decode5426($iso_5426_string);
 736
 737 =back
 738
 739 Converts a string from ISO-5426 to UTF-8.
 740
 741 =cut
 742
 743
 744 my %chars;
 745 $chars{0xb0}=0x0101;#3/0ayn[ain]
 746 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 747 #$chars{0xb2}=0x00e0;#'à';
 748 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 749 #$chars{0xb3}=0x00e7;#'ç';
 750 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 751 # $chars{0xb4}='è';
 752 $chars{0xb4}=0x00e8;
 753 # $chars{0xb5}='é';
 754 $chars{0xb5}=0x00e9;
 755 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 756 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 757 $chars{0xfa}=0x0153;#oe
 758 $chars{0x81d1}=0x00b0;
 759
 760 ####
 761 ## combined characters iso5426
 762
 763 $chars{0xc041}=0x1ea2; # capital a with hook above
 764 $chars{0xc045}=0x1eba; # capital e with hook above
 765 $chars{0xc049}=0x1ec8; # capital i with hook above
 766 $chars{0xc04f}=0x1ece; # capital o with hook above
 767 $chars{0xc055}=0x1ee6; # capital u with hook above
 768 $chars{0xc059}=0x1ef6; # capital y with hook above
 769 $chars{0xc061}=0x1ea3; # small a with hook above
 770 $chars{0xc065}=0x1ebb; # small e with hook above
 771 $chars{0xc069}=0x1ec9; # small i with hook above
 772 $chars{0xc06f}=0x1ecf; # small o with hook above
 773 $chars{0xc075}=0x1ee7; # small u with hook above
 774 $chars{0xc079}=0x1ef7; # small y with hook above
 775
 776         # 4/1 grave accent
 777 $chars{0xc141}=0x00c0; # capital a with grave accent
 778 $chars{0xc145}=0x00c8; # capital e with grave accent
 779 $chars{0xc149}=0x00cc; # capital i with grave accent
 780 $chars{0xc14f}=0x00d2; # capital o with grave accent
 781 $chars{0xc155}=0x00d9; # capital u with grave accent
 782 $chars{0xc157}=0x1e80; # capital w with grave
 783 $chars{0xc159}=0x1ef2; # capital y with grave
 784 $chars{0xc161}=0x00e0; # small a with grave accent
 785 $chars{0xc165}=0x00e8; # small e with grave accent
 786 $chars{0xc169}=0x00ec; # small i with grave accent
 787 $chars{0xc16f}=0x00f2; # small o with grave accent
 788 $chars{0xc175}=0x00f9; # small u with grave accent
 789 $chars{0xc177}=0x1e81; # small w with grave
 790 $chars{0xc179}=0x1ef3; # small y with grave
 791         # 4/2 acute accent
 792 $chars{0xc241}=0x00c1; # capital a with acute accent
 793 $chars{0xc243}=0x0106; # capital c with acute accent
 794 $chars{0xc245}=0x00c9; # capital e with acute accent
 795 $chars{0xc247}=0x01f4; # capital g with acute
 796 $chars{0xc249}=0x00cd; # capital i with acute accent
 797 $chars{0xc24b}=0x1e30; # capital k with acute
 798 $chars{0xc24c}=0x0139; # capital l with acute accent
 799 $chars{0xc24d}=0x1e3e; # capital m with acute
 800 $chars{0xc24e}=0x0143; # capital n with acute accent
 801 $chars{0xc24f}=0x00d3; # capital o with acute accent
 802 $chars{0xc250}=0x1e54; # capital p with acute
 803 $chars{0xc252}=0x0154; # capital r with acute accent
 804 $chars{0xc253}=0x015a; # capital s with acute accent
 805 $chars{0xc255}=0x00da; # capital u with acute accent
 806 $chars{0xc257}=0x1e82; # capital w with acute
 807 $chars{0xc259}=0x00dd; # capital y with acute accent
 808 $chars{0xc25a}=0x0179; # capital z with acute accent
 809 $chars{0xc261}=0x00e1; # small a with acute accent
 810 $chars{0xc263}=0x0107; # small c with acute accent
 811 $chars{0xc265}=0x00e9; # small e with acute accent
 812 $chars{0xc267}=0x01f5; # small g with acute
 813 $chars{0xc269}=0x00ed; # small i with acute accent
 814 $chars{0xc26b}=0x1e31; # small k with acute
 815 $chars{0xc26c}=0x013a; # small l with acute accent
 816 $chars{0xc26d}=0x1e3f; # small m with acute
 817 $chars{0xc26e}=0x0144; # small n with acute accent
 818 $chars{0xc26f}=0x00f3; # small o with acute accent
 819 $chars{0xc270}=0x1e55; # small p with acute
 820 $chars{0xc272}=0x0155; # small r with acute accent
 821 $chars{0xc273}=0x015b; # small s with acute accent
 822 $chars{0xc275}=0x00fa; # small u with acute accent
 823 $chars{0xc277}=0x1e83; # small w with acute
 824 $chars{0xc279}=0x00fd; # small y with acute accent
 825 $chars{0xc27a}=0x017a; # small z with acute accent
 826 $chars{0xc2e1}=0x01fc; # capital ae with acute
 827 $chars{0xc2f1}=0x01fd; # small ae with acute
 828        # 4/3 circumflex accent
 829 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 830 $chars{0xc343}=0x0108; # capital c with circumflex
 831 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 832 $chars{0xc347}=0x011c; # capital g with circumflex
 833 $chars{0xc348}=0x0124; # capital h with circumflex
 834 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 835 $chars{0xc34a}=0x0134; # capital j with circumflex
 836 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 837 $chars{0xc353}=0x015c; # capital s with circumflex
 838 $chars{0xc355}=0x00db; # capital u with circumflex
 839 $chars{0xc357}=0x0174; # capital w with circumflex
 840 $chars{0xc359}=0x0176; # capital y with circumflex
 841 $chars{0xc35a}=0x1e90; # capital z with circumflex
 842 $chars{0xc361}=0x00e2; # small a with circumflex accent
 843 $chars{0xc363}=0x0109; # small c with circumflex
 844 $chars{0xc365}=0x00ea; # small e with circumflex accent
 845 $chars{0xc367}=0x011d; # small g with circumflex
 846 $chars{0xc368}=0x0125; # small h with circumflex
 847 $chars{0xc369}=0x00ee; # small i with circumflex accent
 848 $chars{0xc36a}=0x0135; # small j with circumflex
 849 $chars{0xc36e}=0x00f1; # small n with tilde
 850 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 851 $chars{0xc373}=0x015d; # small s with circumflex
 852 $chars{0xc375}=0x00fb; # small u with circumflex
 853 $chars{0xc377}=0x0175; # small w with circumflex
 854 $chars{0xc379}=0x0177; # small y with circumflex
 855 $chars{0xc37a}=0x1e91; # small z with circumflex
 856         # 4/4 tilde
 857 $chars{0xc441}=0x00c3; # capital a with tilde
 858 $chars{0xc445}=0x1ebc; # capital e with tilde
 859 $chars{0xc449}=0x0128; # capital i with tilde
 860 $chars{0xc44e}=0x00d1; # capital n with tilde
 861 $chars{0xc44f}=0x00d5; # capital o with tilde
 862 $chars{0xc455}=0x0168; # capital u with tilde
 863 $chars{0xc456}=0x1e7c; # capital v with tilde
 864 $chars{0xc459}=0x1ef8; # capital y with tilde
 865 $chars{0xc461}=0x00e3; # small a with tilde
 866 $chars{0xc465}=0x1ebd; # small e with tilde
 867 $chars{0xc469}=0x0129; # small i with tilde
 868 $chars{0xc46e}=0x00f1; # small n with tilde
 869 $chars{0xc46f}=0x00f5; # small o with tilde
 870 $chars{0xc475}=0x0169; # small u with tilde
 871 $chars{0xc476}=0x1e7d; # small v with tilde
 872 $chars{0xc479}=0x1ef9; # small y with tilde
 873     # 4/5 macron
 874 $chars{0xc541}=0x0100; # capital a with macron
 875 $chars{0xc545}=0x0112; # capital e with macron
 876 $chars{0xc547}=0x1e20; # capital g with macron
 877 $chars{0xc549}=0x012a; # capital i with macron
 878 $chars{0xc54f}=0x014c; # capital o with macron
 879 $chars{0xc555}=0x016a; # capital u with macron
 880 $chars{0xc561}=0x0101; # small a with macron
 881 $chars{0xc565}=0x0113; # small e with macron
 882 $chars{0xc567}=0x1e21; # small g with macron
 883 $chars{0xc569}=0x012b; # small i with macron
 884 $chars{0xc56f}=0x014d; # small o with macron
 885 $chars{0xc575}=0x016b; # small u with macron
 886 $chars{0xc572}=0x0159; # small r with macron
 887 $chars{0xc5e1}=0x01e2; # capital ae with macron
 888 $chars{0xc5f1}=0x01e3; # small ae with macron
 889         # 4/6 breve
 890 $chars{0xc641}=0x0102; # capital a with breve
 891 $chars{0xc645}=0x0114; # capital e with breve
 892 $chars{0xc647}=0x011e; # capital g with breve
 893 $chars{0xc649}=0x012c; # capital i with breve
 894 $chars{0xc64f}=0x014e; # capital o with breve
 895 $chars{0xc655}=0x016c; # capital u with breve
 896 $chars{0xc661}=0x0103; # small a with breve
 897 $chars{0xc665}=0x0115; # small e with breve
 898 $chars{0xc667}=0x011f; # small g with breve
 899 $chars{0xc669}=0x012d; # small i with breve
 900 $chars{0xc66f}=0x014f; # small o with breve
 901 $chars{0xc675}=0x016d; # small u with breve
 902         # 4/7 dot above
 903 $chars{0xc7b0}=0x01e1; # Ain with dot above
 904 $chars{0xc742}=0x1e02; # capital b with dot above
 905 $chars{0xc743}=0x010a; # capital c with dot above
 906 $chars{0xc744}=0x1e0a; # capital d with dot above
 907 $chars{0xc745}=0x0116; # capital e with dot above
 908 $chars{0xc746}=0x1e1e; # capital f with dot above
 909 $chars{0xc747}=0x0120; # capital g with dot above
 910 $chars{0xc748}=0x1e22; # capital h with dot above
 911 $chars{0xc749}=0x0130; # capital i with dot above
 912 $chars{0xc74d}=0x1e40; # capital m with dot above
 913 $chars{0xc74e}=0x1e44; # capital n with dot above
 914 $chars{0xc750}=0x1e56; # capital p with dot above
 915 $chars{0xc752}=0x1e58; # capital r with dot above
 916 $chars{0xc753}=0x1e60; # capital s with dot above
 917 $chars{0xc754}=0x1e6a; # capital t with dot above
 918 $chars{0xc757}=0x1e86; # capital w with dot above
 919 $chars{0xc758}=0x1e8a; # capital x with dot above
 920 $chars{0xc759}=0x1e8e; # capital y with dot above
 921 $chars{0xc75a}=0x017b; # capital z with dot above
 922 $chars{0xc761}=0x0227; # small b with dot above
 923 $chars{0xc762}=0x1e03; # small b with dot above
 924 $chars{0xc763}=0x010b; # small c with dot above
 925 $chars{0xc764}=0x1e0b; # small d with dot above
 926 $chars{0xc765}=0x0117; # small e with dot above
 927 $chars{0xc766}=0x1e1f; # small f with dot above
 928 $chars{0xc767}=0x0121; # small g with dot above
 929 $chars{0xc768}=0x1e23; # small h with dot above
 930 $chars{0xc76d}=0x1e41; # small m with dot above
 931 $chars{0xc76e}=0x1e45; # small n with dot above
 932 $chars{0xc770}=0x1e57; # small p with dot above
 933 $chars{0xc772}=0x1e59; # small r with dot above
 934 $chars{0xc773}=0x1e61; # small s with dot above
 935 $chars{0xc774}=0x1e6b; # small t with dot above
 936 $chars{0xc777}=0x1e87; # small w with dot above
 937 $chars{0xc778}=0x1e8b; # small x with dot above
 938 $chars{0xc779}=0x1e8f; # small y with dot above
 939 $chars{0xc77a}=0x017c; # small z with dot above
 940         # 4/8 trema, diaresis
 941 $chars{0xc820}=0x00a8; # diaeresis
 942 $chars{0xc841}=0x00c4; # capital a with diaeresis
 943 $chars{0xc845}=0x00cb; # capital e with diaeresis
 944 $chars{0xc848}=0x1e26; # capital h with diaeresis
 945 $chars{0xc849}=0x00cf; # capital i with diaeresis
 946 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 947 $chars{0xc855}=0x00dc; # capital u with diaeresis
 948 $chars{0xc857}=0x1e84; # capital w with diaeresis
 949 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 950 $chars{0xc859}=0x0178; # capital y with diaeresis
 951 $chars{0xc861}=0x00e4; # small a with diaeresis
 952 $chars{0xc865}=0x00eb; # small e with diaeresis
 953 $chars{0xc868}=0x1e27; # small h with diaeresis
 954 $chars{0xc869}=0x00ef; # small i with diaeresis
 955 $chars{0xc86f}=0x00f6; # small o with diaeresis
 956 $chars{0xc874}=0x1e97; # small t with diaeresis
 957 $chars{0xc875}=0x00fc; # small u with diaeresis
 958 $chars{0xc877}=0x1e85; # small w with diaeresis
 959 $chars{0xc878}=0x1e8d; # small x with diaeresis
 960 $chars{0xc879}=0x00ff; # small y with diaeresis
 961         # 4/9 umlaut
 962 $chars{0xc920}=0x00a8; # [diaeresis]
 963 $chars{0xc961}=0x00e4; # a with umlaut
 964 $chars{0xc965}=0x00eb; # e with umlaut
 965 $chars{0xc969}=0x00ef; # i with umlaut
 966 $chars{0xc96f}=0x00f6; # o with umlaut
 967 $chars{0xc975}=0x00fc; # u with umlaut
 968         # 4/10 circle above
 969 $chars{0xca41}=0x00c5; # capital a with ring above
 970 $chars{0xcaad}=0x016e; # capital u with ring above
 971 $chars{0xca61}=0x00e5; # small a with ring above
 972 $chars{0xca75}=0x016f; # small u with ring above
 973 $chars{0xca77}=0x1e98; # small w with ring above
 974 $chars{0xca79}=0x1e99; # small y with ring above
 975         # 4/11 high comma off centre
 976         # 4/12 inverted high comma centred
 977         # 4/13 double acute accent
 978 $chars{0xcd4f}=0x0150; # capital o with double acute
 979 $chars{0xcd55}=0x0170; # capital u with double acute
 980 $chars{0xcd6f}=0x0151; # small o with double acute
 981 $chars{0xcd75}=0x0171; # small u with double acute
 982         # 4/14 horn
 983 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 984 $chars{0xce55}=0x01af; # latin capital letter u with horn
 985 $chars{0xce74}=0x01a1; # latin small letter o with horn
 986 $chars{0xce75}=0x01b0; # latin small letter u with horn
 987         # 4/15 caron (hacek
 988 $chars{0xcf41}=0x01cd; # capital a with caron
 989 $chars{0xcf43}=0x010c; # capital c with caron
 990 $chars{0xcf44}=0x010e; # capital d with caron
 991 $chars{0xcf45}=0x011a; # capital e with caron
 992 $chars{0xcf47}=0x01e6; # capital g with caron
 993 $chars{0xcf49}=0x01cf; # capital i with caron
 994 $chars{0xcf4b}=0x01e8; # capital k with caron
 995 $chars{0xcf4c}=0x013d; # capital l with caron
 996 $chars{0xcf4e}=0x0147; # capital n with caron
 997 $chars{0xcf4f}=0x01d1; # capital o with caron
 998 $chars{0xcf52}=0x0158; # capital r with caron
 999 $chars{0xcf53}=0x0160; # capital s with caron
1000 $chars{0xcf54}=0x0164; # capital t with caron
1001 $chars{0xcf55}=0x01d3; # capital u with caron
1002 $chars{0xcf5a}=0x017d; # capital z with caron
1003 $chars{0xcf61}=0x01ce; # small a with caron
1004 $chars{0xcf63}=0x010d; # small c with caron
1005 $chars{0xcf64}=0x010f; # small d with caron
1006 $chars{0xcf65}=0x011b; # small e with caron
1007 $chars{0xcf67}=0x01e7; # small g with caron
1008 $chars{0xcf69}=0x01d0; # small i with caron
1009 $chars{0xcf6a}=0x01f0; # small j with caron
1010 $chars{0xcf6b}=0x01e9; # small k with caron
1011 $chars{0xcf6c}=0x013e; # small l with caron
1012 $chars{0xcf6e}=0x0148; # small n with caron
1013 $chars{0xcf6f}=0x01d2; # small o with caron
1014 $chars{0xcf72}=0x0159; # small r with caron
1015 $chars{0xcf73}=0x0161; # small s with caron
1016 $chars{0xcf74}=0x0165; # small t with caron
1017 $chars{0xcf75}=0x01d4; # small u with caron
1018 $chars{0xcf7a}=0x017e; # small z with caron
1019         # 5/0 cedilla
1020 $chars{0xd020}=0x00b8; # cedilla
1021 $chars{0xd043}=0x00c7; # capital c with cedilla
1022 $chars{0xd044}=0x1e10; # capital d with cedilla
1023 $chars{0xd047}=0x0122; # capital g with cedilla
1024 $chars{0xd048}=0x1e28; # capital h with cedilla
1025 $chars{0xd04b}=0x0136; # capital k with cedilla
1026 $chars{0xd04c}=0x013b; # capital l with cedilla
1027 $chars{0xd04e}=0x0145; # capital n with cedilla
1028 $chars{0xd052}=0x0156; # capital r with cedilla
1029 $chars{0xd053}=0x015e; # capital s with cedilla
1030 $chars{0xd054}=0x0162; # capital t with cedilla
1031 $chars{0xd063}=0x00e7; # small c with cedilla
1032 $chars{0xd064}=0x1e11; # small d with cedilla
1033 $chars{0xd065}=0x0119; # small e with cedilla
1034 $chars{0xd067}=0x0123; # small g with cedilla
1035 $chars{0xd068}=0x1e29; # small h with cedilla
1036 $chars{0xd06b}=0x0137; # small k with cedilla
1037 $chars{0xd06c}=0x013c; # small l with cedilla
1038 $chars{0xd06e}=0x0146; # small n with cedilla
1039 $chars{0xd072}=0x0157; # small r with cedilla
1040 $chars{0xd073}=0x015f; # small s with cedilla
1041 $chars{0xd074}=0x0163; # small t with cedilla
1042         # 5/1 rude
1043         # 5/2 hook to left
1044         # 5/3 ogonek (hook to right
1045 $chars{0xd320}=0x02db; # ogonek
1046 $chars{0xd341}=0x0104; # capital a with ogonek
1047 $chars{0xd345}=0x0118; # capital e with ogonek
1048 $chars{0xd349}=0x012e; # capital i with ogonek
1049 $chars{0xd34f}=0x01ea; # capital o with ogonek
1050 $chars{0xd355}=0x0172; # capital u with ogonek
1051 $chars{0xd361}=0x0105; # small a with ogonek
1052 $chars{0xd365}=0x0119; # small e with ogonek
1053 $chars{0xd369}=0x012f; # small i with ogonek
1054 $chars{0xd36f}=0x01eb; # small o with ogonek
1055 $chars{0xd375}=0x0173; # small u with ogonek
1056         # 5/4 circle below
1057 $chars{0xd441}=0x1e00; # capital a with ring below
1058 $chars{0xd461}=0x1e01; # small a with ring below
1059         # 5/5 half circle below
1060 $chars{0xf948}=0x1e2a; # capital h with breve below
1061 $chars{0xf968}=0x1e2b; # small h with breve below
1062         # 5/6 dot below
1063 $chars{0xd641}=0x1ea0; # capital a with dot below
1064 $chars{0xd642}=0x1e04; # capital b with dot below
1065 $chars{0xd644}=0x1e0c; # capital d with dot below
1066 $chars{0xd645}=0x1eb8; # capital e with dot below
1067 $chars{0xd648}=0x1e24; # capital h with dot below
1068 $chars{0xd649}=0x1eca; # capital i with dot below
1069 $chars{0xd64b}=0x1e32; # capital k with dot below
1070 $chars{0xd64c}=0x1e36; # capital l with dot below
1071 $chars{0xd64d}=0x1e42; # capital m with dot below
1072 $chars{0xd64e}=0x1e46; # capital n with dot below
1073 $chars{0xd64f}=0x1ecc; # capital o with dot below
1074 $chars{0xd652}=0x1e5a; # capital r with dot below
1075 $chars{0xd653}=0x1e62; # capital s with dot below
1076 $chars{0xd654}=0x1e6c; # capital t with dot below
1077 $chars{0xd655}=0x1ee4; # capital u with dot below
1078 $chars{0xd656}=0x1e7e; # capital v with dot below
1079 $chars{0xd657}=0x1e88; # capital w with dot below
1080 $chars{0xd659}=0x1ef4; # capital y with dot below
1081 $chars{0xd65a}=0x1e92; # capital z with dot below
1082 $chars{0xd661}=0x1ea1; # small a with dot below
1083 $chars{0xd662}=0x1e05; # small b with dot below
1084 $chars{0xd664}=0x1e0d; # small d with dot below
1085 $chars{0xd665}=0x1eb9; # small e with dot below
1086 $chars{0xd668}=0x1e25; # small h with dot below
1087 $chars{0xd669}=0x1ecb; # small i with dot below
1088 $chars{0xd66b}=0x1e33; # small k with dot below
1089 $chars{0xd66c}=0x1e37; # small l with dot below
1090 $chars{0xd66d}=0x1e43; # small m with dot below
1091 $chars{0xd66e}=0x1e47; # small n with dot below
1092 $chars{0xd66f}=0x1ecd; # small o with dot below
1093 $chars{0xd672}=0x1e5b; # small r with dot below
1094 $chars{0xd673}=0x1e63; # small s with dot below
1095 $chars{0xd674}=0x1e6d; # small t with dot below
1096 $chars{0xd675}=0x1ee5; # small u with dot below
1097 $chars{0xd676}=0x1e7f; # small v with dot below
1098 $chars{0xd677}=0x1e89; # small w with dot below
1099 $chars{0xd679}=0x1ef5; # small y with dot below
1100 $chars{0xd67a}=0x1e93; # small z with dot below
1101         # 5/7 double dot below
1102 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1103 $chars{0xd775}=0x1e73; # small u with diaeresis below
1104         # 5/8 underline
1105 $chars{0xd820}=0x005f; # underline
1106         # 5/9 double underline
1107 $chars{0xd920}=0x2017; # double underline
1108         # 5/10 small low vertical bar
1109 $chars{0xda20}=0x02cc; #
1110         # 5/11 circumflex below
1111         # 5/12 (this position shall not be used)
1112         # 5/13 left half of ligature sign and of double tilde
1113         # 5/14 right half of ligature sign
1114         # 5/15 right half of double tilde
1115 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1116
1117 sub char_decode5426 {
1118     my ( $string) = @_;
1119     my $result;
1120
1121     my @data = unpack("C*", $string);
1122     my @characters;
1123     my $length=scalar(@data);
1124     for (my $i = 0; $i < scalar(@data); $i++) {
1125       my $char= $data[$i];
1126       if ($char >= 0x00 && $char <= 0x7F){
1127         #IsAscii
1128
1129           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1130       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1131         #Combined Char
1132         my $convchar ;
1133         if ($chars{$char*256+$data[$i+1]}) {
1134           $convchar= $chars{$char * 256 + $data[$i+1]};
1135           $i++;
1136 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1137         } elsif ($chars{$char})  {
1138           $convchar= $chars{$char};
1139 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1140         }else {
1141           $convchar=$char;
1142         }
1143         push @characters,$convchar;
1144       } else {
1145         my $convchar;
1146         if ($chars{$char})  {
1147           $convchar= $chars{$char};
1148 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1149         }else {
1150 #            printf "char %x $char\n",$char;
1151           $convchar=$char;
1152         }
1153         push @characters,$convchar;
1154       }
1155     }
1156     $result=pack "U*",@characters;
1157 #     $result=~s/\x01//;
1158 #     $result=~s/\x00//;
1159      $result=~s/\x0f//;
1160      $result=~s/\x1b.//;
1161      $result=~s/\x0e//;
1162      $result=~s/\x1b\x5b//;
1163 #   map{printf "%x",$_} @characters;
1164 #   printf "\n";
1165   return $result;
1166 }
1167
1168 1;
1169
1170
1171 =head1 AUTHOR
1172
1173 Koha Development Team <info@koha.org>
1174
1175 Galen Charlton <galen.charlton@liblime.com>
1176
1177 =cut