C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.01;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         IsStringUTF8ish
  37         MarcToUTF8Record
  38         SetUTF8Flag
  39         SetMarcUnicodeFlag
  40         StripNonXmlChars
  41     );
  42 }
  43
  44 =head1 NAME
  45
  46 C4::Charset - utilities for handling character set conversions.
  47
  48 =head1 SYNOPSIS
  49
  50 use C4::Charset;
  51
  52 =head1 DESCRIPTION
  53
  54 This module contains routines for dealing with character set
  55 conversions, particularly for MARC records.
  56
  57 A variety of character encodings are in use by various MARC
  58 standards, and even more character encodings are used by
  59 non-standard MARC records.  The various MARC formats generally
  60 do not do a good job of advertising a given record's character
  61 encoding, and even when a record does advertise its encoding,
  62 e.g., via the Leader/09, experience has shown that one cannot
  63 trust it.
  64
  65 Ultimately, all MARC records are stored in Koha in UTF-8 and
  66 must be converted from whatever the source character encoding is.
  67 The goal of this module is to ensure that these conversions
  68 take place accurately.  When a character conversion cannot take
  69 place, or at least not accurately, the module was provide
  70 enough information to allow user-facing code to inform the user
  71 on how to deal with the situation.
  72
  73 =cut
  74
  75 =head1 FUNCTIONS
  76
  77 =head2 IsStringUTF8ish
  78
  79 =over 4
  80
  81 my $is_utf8 = IsStringUTF8ish($str);
  82
  83 =back
  84
  85 Determines if C<$str> is valid UTF-8.  This can mean
  86 one of two things:
  87
  88 =over 2
  89
  90 =item *
  91
  92 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  93
  94 =item *
  95
  96 The Perl UTF-8 flag is B<not> set, but the octets contain
  97 valid UTF-8.
  98
  99 =back
 100
 101 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 102 because in one could be presented with a MARC blob that is
 103 not actually in UTF-8 but whose sequence of octets appears to be
 104 valid UTF-8.  The rest of the MARC character conversion functions
 105 will assume that this situation occur does not very often.
 106
 107 =cut
 108
 109 sub IsStringUTF8ish {
 110     my $str = shift;
 111
 112     return 1 if utf8::is_utf8($str);
 113     return utf8::decode($str);
 114 }
 115
 116 =head2 SetUTF8Flag
 117
 118 =over 4
 119
 120 my $marc_record = SetUTF8Flag($marc_record);
 121
 122 =back
 123
 124 This function sets the PERL UTF8 flag for data.
 125 It is required when using new_from_usmarc
 126 since MARC::File::USMARC does not handle PERL UTF8 setting.
 127 When editing unicode marc records fields and subfields, you
 128 would end up in double encoding without using this function.
 129
 130 FIXME
 131 In my opinion, this function belongs to MARC::Record and not
 132 to this package.
 133 But since it handles charset, and MARC::Record, it finds its way in that package
 134
 135 =cut
 136
 137 sub SetUTF8Flag{
 138         my ($record)=@_;
 139         return unless ($record && $record->fields());
 140         foreach my $field ($record->fields()){
 141                 if ($field->tag()>=10){
 142                         my @subfields;
 143                         foreach my $subfield ($field->subfields()){
 144                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
 145                         }
 146                         my $newfield=MARC::Field->new(
 147                                                         $field->tag(),
 148                                                         $field->indicator(1),
 149                                                         $field->indicator(2),
 150                                                         @subfields
 151                                                 );
 152                         $field->replace_with($newfield);
 153                 }
 154         }
 155 }
 156
 157 =head2 NormalizeString
 158
 159 =over 4
 160
 161     my $normalized_string=NormalizeString($string);
 162
 163 =back
 164
 165         Given
 166             a string
 167         nfc : If you want to set NFC and not NFD
 168         transform : If you expect all the signs to be removed
 169     Sets the PERL UTF8 Flag on your initial data if need be
 170     and applies cleaning if required
 171
 172         Returns a utf8 NFD normalized string
 173
 174         Sample code :
 175         my $string=NormalizeString ("l'ornithoptère");
 176     #results into ornithoptère in NFD form and sets UTF8 Flag
 177 =cut
 178
 179 sub NormalizeString{
 180         my ($string,$nfc,$transform)=@_;
 181         utf8::decode($string) unless (utf8::is_utf8($string));
 182         if ($nfc){
 183                 $string= NFD($string);
 184         }
 185         else {
 186                 $string=NFC($string);
 187         }
 188         if ($transform){
 189     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 190         #removing one letter words "d'" "l'"  was changed into "d " "l "
 191     $string=~s/\b\S\b//g;
 192     $string=~s/\s+$//g;
 193         }
 194     return $string;
 195 }
 196
 197 =head2 MarcToUTF8Record
 198
 199 =over 4
 200
 201 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 202
 203 =back
 204
 205 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 206 optional source encoding, return a C<MARC::Record> that is
 207 converted to UTF-8.
 208
 209 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 210 is not guaranteed to have been converted correctly.  Specifically,
 211 if C<$converted_from> is 'failed', the MARC record returned failed
 212 character conversion and had each of its non-ASCII octets changed
 213 to the Unicode replacement character.
 214
 215 If the source encoding was not specified, this routine will
 216 try to guess it; the character encoding used for a successful
 217 conversion is returned in C<$converted_from>.
 218
 219 =cut
 220
 221 sub MarcToUTF8Record {
 222     my $marc = shift;
 223     my $marc_flavour = shift;
 224     my $source_encoding = shift;
 225     my $marc_record;
 226     my $marc_blob_is_utf8 = 0;
 227     if (ref($marc) eq 'MARC::Record') {
 228         my $marc_blob = $marc->as_usmarc();
 229         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 230         $marc_record = $marc;
 231     } else {
 232         # dealing with a MARC blob
 233
 234         # remove any ersatz whitespace from the beginning and
 235         # end of the MARC blob -- these can creep into MARC
 236         # files produced by several sources -- caller really
 237         # should be doing this, however
 238         $marc =~ s/^\s+//;
 239         $marc =~ s/\s+$//;
 240         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 241         eval {
 242             $marc_record = MARC::Record->new_from_usmarc($marc);
 243         };
 244         if ($@) {
 245             # if we fail the first time, one likely problem
 246             # is that we have a MARC21 record that says that it's
 247             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 248             # We'll try parsing it again.
 249             substr($marc, 9, 1) = ' ';
 250             eval {
 251                 $marc_record = MARC::Record->new_from_usmarc($marc);
 252             };
 253             if ($@) {
 254                 # it's hopeless; return an empty MARC::Record
 255                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 256             }
 257         }
 258     }
 259
 260     # If we do not know the source encoding, try some guesses
 261     # as follows:
 262     #   1. Record is UTF-8 already.
 263     #   2. If MARC flavor is MARC21, then
 264     #      a. record is MARC-8
 265     #      b. record is ISO-8859-1
 266     #   3. If MARC flavor is UNIMARC, then
 267     if (not defined $source_encoding) {
 268         if ($marc_blob_is_utf8) {
 269             # note that for MARC21 we are not bothering to check
 270             # if the Leader/09 is set to 'a' or not -- because
 271             # of problems with various ILSs (including Koha in the
 272             # past, alas), this just is not trustworthy.
 273             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 274             return $marc_record, 'UTF-8', [];
 275         } else {
 276             if ($marc_flavour eq 'MARC21') {
 277                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 278             } elsif ($marc_flavour =~/UNIMARC/) {
 279                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 280             } else {
 281                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 282             }
 283         }
 284     } else {
 285         # caller knows the character encoding
 286         my $original_marc_record = $marc_record->clone();
 287         my @errors;
 288         if ($source_encoding =~ /utf-?8/i) {
 289             if ($marc_blob_is_utf8) {
 290                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 291                 return $marc_record, 'UTF-8', [];
 292             } else {
 293                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 294             }
 295         } elsif ($source_encoding =~ /marc-?8/i) {
 296             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 297         } elsif ($source_encoding =~ /5426/) {
 298             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 299         } else {
 300             # assume any other character encoding is for Text::Iconv
 301             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 302         }
 303
 304         if (@errors) {
 305             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 306             return $original_marc_record, 'failed', \@errors;
 307         } else {
 308             return $marc_record, $source_encoding, [];
 309         }
 310     }
 311
 312 }
 313
 314 =head2 SetMarcUnicodeFlag
 315
 316 =over 4
 317
 318 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 319
 320 =back
 321
 322 Set both the internal MARC::Record encoding flag
 323 and the appropriate Leader/09 (MARC21) or
 324 100/26-29 (UNIMARC) to indicate that the record
 325 is in UTF-8.  Note that this does B<not> do
 326 any actual character conversion.
 327
 328 =cut
 329
 330 sub SetMarcUnicodeFlag {
 331     my $marc_record = shift;
 332     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 333
 334     $marc_record->encoding('UTF-8');
 335     if ($marc_flavour eq 'MARC21') {
 336         my $leader = $marc_record->leader();
 337         substr($leader, 9, 1) = 'a';
 338         $marc_record->leader($leader);
 339     } elsif ($marc_flavour =~/UNIMARC/) {
 340         my $string;
 341                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
 342                 $string=$marc_record->subfield( 100, "a" );
 343         if (defined $string && length($string)==$subflength) {
 344                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 345         }
 346         else {
 347             $string = POSIX::strftime( "%Y%m%d", localtime );
 348             $string =~ s/\-//g;
 349             $string = sprintf( "%-*s", $subflength, $string );
 350         }
 351         substr( $string, $encodingposition, 8, "frey50  " );
 352         if ( $marc_record->subfield( 100, "a" ) ) {
 353                         $marc_record->field('100')->update(a=>$string);
 354                 }
 355                 else {
 356             $marc_record->insert_grouped_field(
 357                 MARC::Field->new( 100, '', '', "a" => $string ) );
 358         }
 359                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
 360     } else {
 361         warn "Unrecognized marcflavour: $marc_flavour";
 362     }
 363 }
 364
 365 =head2 StripNonXmlChars
 366
 367 =over 4
 368
 369 my $new_str = StripNonXmlChars($old_str);
 370
 371 =back
 372
 373 Given a string, return a copy with the
 374 characters that are illegal in XML
 375 removed.
 376
 377 This function exists to work around a problem
 378 that can occur with badly-encoded MARC records.
 379 Specifically, if a UTF-8 MARC record also
 380 has excape (\x1b) characters, MARC::File::XML
 381 will let the escape characters pass through
 382 when as_xml() or as_xml_record() is called.  The
 383 problem is that the escape character is not
 384 legal in well-formed XML documents, so when
 385 MARC::File::XML attempts to parse such a record,
 386 the XML parser will fail.
 387
 388 Stripping such characters will allow a
 389 MARC::Record->new_from_xml()
 390 to work, at the possible risk of some data loss.
 391
 392 =cut
 393
 394 sub StripNonXmlChars {
 395     my $str = shift;
 396     if (!defined($str) || $str eq ""){
 397         return "";
 398     }
 399     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 400     return $str;
 401 }
 402
 403 =head1 INTERNAL FUNCTIONS
 404
 405 =head2 _default_marc21_charconv_to_utf8
 406
 407 =over 4
 408
 409 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 410
 411 =back
 412
 413 Converts a C<MARC::Record> of unknown character set to UTF-8,
 414 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 415 to UTF-8, then a default conversion that replaces each non-ASCII
 416 character with the replacement character.
 417
 418 The C<$guessed_charset> return value contains the character set
 419 that resulted in a conversion to valid UTF-8; note that
 420 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 421 this is 'failed'.
 422
 423 =cut
 424
 425 sub _default_marc21_charconv_to_utf8 {
 426     my $marc_record = shift;
 427     my $marc_flavour = shift;
 428
 429     my $trial_marc8 = $marc_record->clone();
 430     my @all_errors = ();
 431     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 432     unless (@errors) {
 433         return $trial_marc8, 'MARC-8', [];
 434     }
 435     push @all_errors, @errors;
 436
 437     my $trial_8859_1 = $marc_record->clone();
 438     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 439     unless (@errors) {
 440         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 441                                                 # instead if we wanted to report details
 442                                                 # of the failed attempt at MARC-8 => UTF-8
 443     }
 444     push @all_errors, @errors;
 445
 446     my $default_converted = $marc_record->clone();
 447     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 448     return $default_converted, 'failed', \@all_errors;
 449 }
 450
 451 =head2 _default_unimarc_charconv_to_utf8
 452
 453 =over 4
 454
 455 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 456
 457 =back
 458
 459 Converts a C<MARC::Record> of unknown character set to UTF-8,
 460 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 461 to UTF-8, then a default conversion that replaces each non-ASCII
 462 character with the replacement character.
 463
 464 The C<$guessed_charset> return value contains the character set
 465 that resulted in a conversion to valid UTF-8; note that
 466 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 467 this is 'failed'.
 468
 469 =cut
 470
 471 sub _default_unimarc_charconv_to_utf8 {
 472     my $marc_record = shift;
 473     my $marc_flavour = shift;
 474
 475     my $trial_marc8 = $marc_record->clone();
 476     my @all_errors = ();
 477     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 478     unless (@errors) {
 479         return $trial_marc8, 'iso-5426';
 480     }
 481     push @all_errors, @errors;
 482
 483     my $trial_8859_1 = $marc_record->clone();
 484     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 485     unless (@errors) {
 486         return $trial_8859_1, 'iso-8859-1';
 487     }
 488     push @all_errors, @errors;
 489
 490     my $default_converted = $marc_record->clone();
 491     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 492     return $default_converted, 'failed', \@all_errors;
 493 }
 494
 495 =head2 _marc_marc8_to_utf8
 496
 497 =over 4
 498
 499 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 500
 501 =back
 502
 503 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 504 If the conversion fails for some reason, an
 505 appropriate messages will be placed in the returned
 506 C<@errors> array.
 507
 508 =cut
 509
 510 sub _marc_marc8_to_utf8 {
 511     my $marc_record = shift;
 512     my $marc_flavour = shift;
 513
 514     my $prev_ignore = MARC::Charset->ignore_errors();
 515     MARC::Charset->ignore_errors(1);
 516
 517     # trap warnings raised by MARC::Charset
 518     my @errors = ();
 519     local $SIG{__WARN__} = sub {
 520         my $msg = $_[0];
 521         if ($msg =~ /MARC.Charset/) {
 522             # FIXME - purpose of this regexp is to strip out the
 523             # line reference to MARC/Charset.pm, but as it
 524             # exists probably won't work quite on Windows --
 525             # some sort of minimal-bunch back-tracking RE
 526             # would be helpful here
 527             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 528             push @errors, $msg;
 529         } else {
 530             # if warning doesn't come from MARC::Charset, just
 531             # pass it on
 532             warn $msg;
 533         }
 534     };
 535
 536     foreach my $field ($marc_record->fields()) {
 537         if ($field->is_control_field()) {
 538             ; # do nothing -- control fields should not contain non-ASCII characters
 539         } else {
 540             my @converted_subfields;
 541             foreach my $subfield ($field->subfields()) {
 542                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 543                 unless (IsStringUTF8ish($utf8sf)) {
 544                     # Because of a bug in MARC::Charset 0.98, if the string
 545                     # has (a) one or more diacritics that (b) are only in character positions
 546                     # 128 to 255 inclusive, the resulting converted string is not in
 547                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 548                     # occurs, upgrade the string in place.  Moral of the story seems to be
 549                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 550                     # that the resulting string is UTF-8.
 551                     utf8::upgrade($utf8sf);
 552                 }
 553                 push @converted_subfields, $subfield->[0], $utf8sf;
 554             }
 555
 556             $field->replace_with(MARC::Field->new(
 557                 $field->tag(), $field->indicator(1), $field->indicator(2),
 558                 @converted_subfields)
 559             );
 560         }
 561     }
 562
 563     MARC::Charset->ignore_errors($prev_ignore);
 564
 565     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 566
 567     return @errors;
 568 }
 569
 570 =head2 _marc_iso5426_to_utf8
 571
 572 =over 4
 573
 574 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 575
 576 =back
 577
 578 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 579 If the conversion fails for some reason, an
 580 appropriate messages will be placed in the returned
 581 C<@errors> array.
 582
 583 FIXME - is ISO-5426 equivalent enough to MARC-8
 584 that C<MARC::Charset> can be used instead?
 585
 586 =cut
 587
 588 sub _marc_iso5426_to_utf8 {
 589     my $marc_record = shift;
 590     my $marc_flavour = shift;
 591
 592     my @errors = ();
 593
 594     foreach my $field ($marc_record->fields()) {
 595         if ($field->is_control_field()) {
 596             ; # do nothing -- control fields should not contain non-ASCII characters
 597         } else {
 598             my @converted_subfields;
 599             foreach my $subfield ($field->subfields()) {
 600                 my $utf8sf = char_decode5426($subfield->[1]);
 601                 push @converted_subfields, $subfield->[0], $utf8sf;
 602             }
 603
 604             $field->replace_with(MARC::Field->new(
 605                 $field->tag(), $field->indicator(1), $field->indicator(2),
 606                 @converted_subfields)
 607             );
 608         }
 609     }
 610
 611     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 612
 613     return @errors;
 614 }
 615
 616 =head2 _marc_to_utf8_via_text_iconv
 617
 618 =over 4
 619
 620 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 621
 622 =back
 623
 624 Convert a C<MARC::Record> to UTF-8 in-place using the
 625 C<Text::Iconv> CPAN module.  Any source encoding accepted
 626 by the user's iconv installation should work.  If
 627 the source encoding is not recognized on the user's
 628 server or the conversion fails for some reason,
 629 appropriate messages will be placed in the returned
 630 C<@errors> array.
 631
 632 =cut
 633
 634 sub _marc_to_utf8_via_text_iconv {
 635     my $marc_record = shift;
 636     my $marc_flavour = shift;
 637     my $source_encoding = shift;
 638
 639     my @errors = ();
 640     my $decoder;
 641     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 642     if ($@) {
 643         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 644         return @errors;
 645     }
 646
 647     my $prev_raise_error = Text::Iconv->raise_error();
 648     Text::Iconv->raise_error(1);
 649
 650     foreach my $field ($marc_record->fields()) {
 651         if ($field->is_control_field()) {
 652             ; # do nothing -- control fields should not contain non-ASCII characters
 653         } else {
 654             my @converted_subfields;
 655             foreach my $subfield ($field->subfields()) {
 656                 my $converted_value;
 657                 my $conversion_ok = 1;
 658                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 659                 if ($@) {
 660                     $conversion_ok = 0;
 661                     push @errors, $@;
 662                 } elsif (not defined $converted_value) {
 663                     $conversion_ok = 0;
 664                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 665                 }
 666
 667                 if ($conversion_ok) {
 668                     push @converted_subfields, $subfield->[0], $converted_value;
 669                 } else {
 670                     $converted_value = $subfield->[1];
 671                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 672                     push @converted_subfields, $subfield->[0], $converted_value;
 673                 }
 674             }
 675
 676             $field->replace_with(MARC::Field->new(
 677                 $field->tag(), $field->indicator(1), $field->indicator(2),
 678                 @converted_subfields)
 679             );
 680         }
 681     }
 682
 683     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 684     Text::Iconv->raise_error($prev_raise_error);
 685
 686     return @errors;
 687 }
 688
 689 =head2 _marc_to_utf8_replacement_char
 690
 691 =over 4
 692
 693 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 694
 695 =back
 696
 697 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 698 unsatisfactory method of replacing all non-ASCII (e.g.,
 699 where the eight bit is set) octet with the Unicode
 700 replacement character.  This is meant as a last-ditch
 701 method, and would be best used as part of a UI that
 702 lets a cataloguer pick various character conversions
 703 until he or she finds the right one.
 704
 705 =cut
 706
 707 sub _marc_to_utf8_replacement_char {
 708     my $marc_record = shift;
 709     my $marc_flavour = shift;
 710
 711     foreach my $field ($marc_record->fields()) {
 712         if ($field->is_control_field()) {
 713             ; # do nothing -- control fields should not contain non-ASCII characters
 714         } else {
 715             my @converted_subfields;
 716             foreach my $subfield ($field->subfields()) {
 717                 my $value = $subfield->[1];
 718                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 719                 push @converted_subfields, $subfield->[0], $value;
 720             }
 721
 722             $field->replace_with(MARC::Field->new(
 723                 $field->tag(), $field->indicator(1), $field->indicator(2),
 724                 @converted_subfields)
 725             );
 726         }
 727     }
 728
 729     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 730 }
 731
 732 =head2 char_decode5426
 733
 734 =over 4
 735
 736 my $utf8string = char_decode5426($iso_5426_string);
 737
 738 =back
 739
 740 Converts a string from ISO-5426 to UTF-8.
 741
 742 =cut
 743
 744
 745 my %chars;
 746 $chars{0xb0}=0x0101;#3/0ayn[ain]
 747 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 748 #$chars{0xb2}=0x00e0;#'à';
 749 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 750 #$chars{0xb3}=0x00e7;#'ç';
 751 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 752 # $chars{0xb4}='è';
 753 $chars{0xb4}=0x00e8;
 754 # $chars{0xb5}='é';
 755 $chars{0xb5}=0x00e9;
 756 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 757 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 758 $chars{0xfa}=0x0153;#oe
 759 $chars{0x81d1}=0x00b0;
 760
 761 ####
 762 ## combined characters iso5426
 763
 764 $chars{0xc041}=0x1ea2; # capital a with hook above
 765 $chars{0xc045}=0x1eba; # capital e with hook above
 766 $chars{0xc049}=0x1ec8; # capital i with hook above
 767 $chars{0xc04f}=0x1ece; # capital o with hook above
 768 $chars{0xc055}=0x1ee6; # capital u with hook above
 769 $chars{0xc059}=0x1ef6; # capital y with hook above
 770 $chars{0xc061}=0x1ea3; # small a with hook above
 771 $chars{0xc065}=0x1ebb; # small e with hook above
 772 $chars{0xc069}=0x1ec9; # small i with hook above
 773 $chars{0xc06f}=0x1ecf; # small o with hook above
 774 $chars{0xc075}=0x1ee7; # small u with hook above
 775 $chars{0xc079}=0x1ef7; # small y with hook above
 776
 777         # 4/1 grave accent
 778 $chars{0xc141}=0x00c0; # capital a with grave accent
 779 $chars{0xc145}=0x00c8; # capital e with grave accent
 780 $chars{0xc149}=0x00cc; # capital i with grave accent
 781 $chars{0xc14f}=0x00d2; # capital o with grave accent
 782 $chars{0xc155}=0x00d9; # capital u with grave accent
 783 $chars{0xc157}=0x1e80; # capital w with grave
 784 $chars{0xc159}=0x1ef2; # capital y with grave
 785 $chars{0xc161}=0x00e0; # small a with grave accent
 786 $chars{0xc165}=0x00e8; # small e with grave accent
 787 $chars{0xc169}=0x00ec; # small i with grave accent
 788 $chars{0xc16f}=0x00f2; # small o with grave accent
 789 $chars{0xc175}=0x00f9; # small u with grave accent
 790 $chars{0xc177}=0x1e81; # small w with grave
 791 $chars{0xc179}=0x1ef3; # small y with grave
 792         # 4/2 acute accent
 793 $chars{0xc241}=0x00c1; # capital a with acute accent
 794 $chars{0xc243}=0x0106; # capital c with acute accent
 795 $chars{0xc245}=0x00c9; # capital e with acute accent
 796 $chars{0xc247}=0x01f4; # capital g with acute
 797 $chars{0xc249}=0x00cd; # capital i with acute accent
 798 $chars{0xc24b}=0x1e30; # capital k with acute
 799 $chars{0xc24c}=0x0139; # capital l with acute accent
 800 $chars{0xc24d}=0x1e3e; # capital m with acute
 801 $chars{0xc24e}=0x0143; # capital n with acute accent
 802 $chars{0xc24f}=0x00d3; # capital o with acute accent
 803 $chars{0xc250}=0x1e54; # capital p with acute
 804 $chars{0xc252}=0x0154; # capital r with acute accent
 805 $chars{0xc253}=0x015a; # capital s with acute accent
 806 $chars{0xc255}=0x00da; # capital u with acute accent
 807 $chars{0xc257}=0x1e82; # capital w with acute
 808 $chars{0xc259}=0x00dd; # capital y with acute accent
 809 $chars{0xc25a}=0x0179; # capital z with acute accent
 810 $chars{0xc261}=0x00e1; # small a with acute accent
 811 $chars{0xc263}=0x0107; # small c with acute accent
 812 $chars{0xc265}=0x00e9; # small e with acute accent
 813 $chars{0xc267}=0x01f5; # small g with acute
 814 $chars{0xc269}=0x00ed; # small i with acute accent
 815 $chars{0xc26b}=0x1e31; # small k with acute
 816 $chars{0xc26c}=0x013a; # small l with acute accent
 817 $chars{0xc26d}=0x1e3f; # small m with acute
 818 $chars{0xc26e}=0x0144; # small n with acute accent
 819 $chars{0xc26f}=0x00f3; # small o with acute accent
 820 $chars{0xc270}=0x1e55; # small p with acute
 821 $chars{0xc272}=0x0155; # small r with acute accent
 822 $chars{0xc273}=0x015b; # small s with acute accent
 823 $chars{0xc275}=0x00fa; # small u with acute accent
 824 $chars{0xc277}=0x1e83; # small w with acute
 825 $chars{0xc279}=0x00fd; # small y with acute accent
 826 $chars{0xc27a}=0x017a; # small z with acute accent
 827 $chars{0xc2e1}=0x01fc; # capital ae with acute
 828 $chars{0xc2f1}=0x01fd; # small ae with acute
 829        # 4/3 circumflex accent
 830 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 831 $chars{0xc343}=0x0108; # capital c with circumflex
 832 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 833 $chars{0xc347}=0x011c; # capital g with circumflex
 834 $chars{0xc348}=0x0124; # capital h with circumflex
 835 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 836 $chars{0xc34a}=0x0134; # capital j with circumflex
 837 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 838 $chars{0xc353}=0x015c; # capital s with circumflex
 839 $chars{0xc355}=0x00db; # capital u with circumflex
 840 $chars{0xc357}=0x0174; # capital w with circumflex
 841 $chars{0xc359}=0x0176; # capital y with circumflex
 842 $chars{0xc35a}=0x1e90; # capital z with circumflex
 843 $chars{0xc361}=0x00e2; # small a with circumflex accent
 844 $chars{0xc363}=0x0109; # small c with circumflex
 845 $chars{0xc365}=0x00ea; # small e with circumflex accent
 846 $chars{0xc367}=0x011d; # small g with circumflex
 847 $chars{0xc368}=0x0125; # small h with circumflex
 848 $chars{0xc369}=0x00ee; # small i with circumflex accent
 849 $chars{0xc36a}=0x0135; # small j with circumflex
 850 $chars{0xc36e}=0x00f1; # small n with tilde
 851 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 852 $chars{0xc373}=0x015d; # small s with circumflex
 853 $chars{0xc375}=0x00fb; # small u with circumflex
 854 $chars{0xc377}=0x0175; # small w with circumflex
 855 $chars{0xc379}=0x0177; # small y with circumflex
 856 $chars{0xc37a}=0x1e91; # small z with circumflex
 857         # 4/4 tilde
 858 $chars{0xc441}=0x00c3; # capital a with tilde
 859 $chars{0xc445}=0x1ebc; # capital e with tilde
 860 $chars{0xc449}=0x0128; # capital i with tilde
 861 $chars{0xc44e}=0x00d1; # capital n with tilde
 862 $chars{0xc44f}=0x00d5; # capital o with tilde
 863 $chars{0xc455}=0x0168; # capital u with tilde
 864 $chars{0xc456}=0x1e7c; # capital v with tilde
 865 $chars{0xc459}=0x1ef8; # capital y with tilde
 866 $chars{0xc461}=0x00e3; # small a with tilde
 867 $chars{0xc465}=0x1ebd; # small e with tilde
 868 $chars{0xc469}=0x0129; # small i with tilde
 869 $chars{0xc46e}=0x00f1; # small n with tilde
 870 $chars{0xc46f}=0x00f5; # small o with tilde
 871 $chars{0xc475}=0x0169; # small u with tilde
 872 $chars{0xc476}=0x1e7d; # small v with tilde
 873 $chars{0xc479}=0x1ef9; # small y with tilde
 874     # 4/5 macron
 875 $chars{0xc541}=0x0100; # capital a with macron
 876 $chars{0xc545}=0x0112; # capital e with macron
 877 $chars{0xc547}=0x1e20; # capital g with macron
 878 $chars{0xc549}=0x012a; # capital i with macron
 879 $chars{0xc54f}=0x014c; # capital o with macron
 880 $chars{0xc555}=0x016a; # capital u with macron
 881 $chars{0xc561}=0x0101; # small a with macron
 882 $chars{0xc565}=0x0113; # small e with macron
 883 $chars{0xc567}=0x1e21; # small g with macron
 884 $chars{0xc569}=0x012b; # small i with macron
 885 $chars{0xc56f}=0x014d; # small o with macron
 886 $chars{0xc575}=0x016b; # small u with macron
 887 $chars{0xc572}=0x0159; # small r with macron
 888 $chars{0xc5e1}=0x01e2; # capital ae with macron
 889 $chars{0xc5f1}=0x01e3; # small ae with macron
 890         # 4/6 breve
 891 $chars{0xc641}=0x0102; # capital a with breve
 892 $chars{0xc645}=0x0114; # capital e with breve
 893 $chars{0xc647}=0x011e; # capital g with breve
 894 $chars{0xc649}=0x012c; # capital i with breve
 895 $chars{0xc64f}=0x014e; # capital o with breve
 896 $chars{0xc655}=0x016c; # capital u with breve
 897 $chars{0xc661}=0x0103; # small a with breve
 898 $chars{0xc665}=0x0115; # small e with breve
 899 $chars{0xc667}=0x011f; # small g with breve
 900 $chars{0xc669}=0x012d; # small i with breve
 901 $chars{0xc66f}=0x014f; # small o with breve
 902 $chars{0xc675}=0x016d; # small u with breve
 903         # 4/7 dot above
 904 $chars{0xc7b0}=0x01e1; # Ain with dot above
 905 $chars{0xc742}=0x1e02; # capital b with dot above
 906 $chars{0xc743}=0x010a; # capital c with dot above
 907 $chars{0xc744}=0x1e0a; # capital d with dot above
 908 $chars{0xc745}=0x0116; # capital e with dot above
 909 $chars{0xc746}=0x1e1e; # capital f with dot above
 910 $chars{0xc747}=0x0120; # capital g with dot above
 911 $chars{0xc748}=0x1e22; # capital h with dot above
 912 $chars{0xc749}=0x0130; # capital i with dot above
 913 $chars{0xc74d}=0x1e40; # capital m with dot above
 914 $chars{0xc74e}=0x1e44; # capital n with dot above
 915 $chars{0xc750}=0x1e56; # capital p with dot above
 916 $chars{0xc752}=0x1e58; # capital r with dot above
 917 $chars{0xc753}=0x1e60; # capital s with dot above
 918 $chars{0xc754}=0x1e6a; # capital t with dot above
 919 $chars{0xc757}=0x1e86; # capital w with dot above
 920 $chars{0xc758}=0x1e8a; # capital x with dot above
 921 $chars{0xc759}=0x1e8e; # capital y with dot above
 922 $chars{0xc75a}=0x017b; # capital z with dot above
 923 $chars{0xc761}=0x0227; # small b with dot above
 924 $chars{0xc762}=0x1e03; # small b with dot above
 925 $chars{0xc763}=0x010b; # small c with dot above
 926 $chars{0xc764}=0x1e0b; # small d with dot above
 927 $chars{0xc765}=0x0117; # small e with dot above
 928 $chars{0xc766}=0x1e1f; # small f with dot above
 929 $chars{0xc767}=0x0121; # small g with dot above
 930 $chars{0xc768}=0x1e23; # small h with dot above
 931 $chars{0xc76d}=0x1e41; # small m with dot above
 932 $chars{0xc76e}=0x1e45; # small n with dot above
 933 $chars{0xc770}=0x1e57; # small p with dot above
 934 $chars{0xc772}=0x1e59; # small r with dot above
 935 $chars{0xc773}=0x1e61; # small s with dot above
 936 $chars{0xc774}=0x1e6b; # small t with dot above
 937 $chars{0xc777}=0x1e87; # small w with dot above
 938 $chars{0xc778}=0x1e8b; # small x with dot above
 939 $chars{0xc779}=0x1e8f; # small y with dot above
 940 $chars{0xc77a}=0x017c; # small z with dot above
 941         # 4/8 trema, diaresis
 942 $chars{0xc820}=0x00a8; # diaeresis
 943 $chars{0xc841}=0x00c4; # capital a with diaeresis
 944 $chars{0xc845}=0x00cb; # capital e with diaeresis
 945 $chars{0xc848}=0x1e26; # capital h with diaeresis
 946 $chars{0xc849}=0x00cf; # capital i with diaeresis
 947 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 948 $chars{0xc855}=0x00dc; # capital u with diaeresis
 949 $chars{0xc857}=0x1e84; # capital w with diaeresis
 950 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 951 $chars{0xc859}=0x0178; # capital y with diaeresis
 952 $chars{0xc861}=0x00e4; # small a with diaeresis
 953 $chars{0xc865}=0x00eb; # small e with diaeresis
 954 $chars{0xc868}=0x1e27; # small h with diaeresis
 955 $chars{0xc869}=0x00ef; # small i with diaeresis
 956 $chars{0xc86f}=0x00f6; # small o with diaeresis
 957 $chars{0xc874}=0x1e97; # small t with diaeresis
 958 $chars{0xc875}=0x00fc; # small u with diaeresis
 959 $chars{0xc877}=0x1e85; # small w with diaeresis
 960 $chars{0xc878}=0x1e8d; # small x with diaeresis
 961 $chars{0xc879}=0x00ff; # small y with diaeresis
 962         # 4/9 umlaut
 963 $chars{0xc920}=0x00a8; # [diaeresis]
 964 $chars{0xc961}=0x00e4; # a with umlaut
 965 $chars{0xc965}=0x00eb; # e with umlaut
 966 $chars{0xc969}=0x00ef; # i with umlaut
 967 $chars{0xc96f}=0x00f6; # o with umlaut
 968 $chars{0xc975}=0x00fc; # u with umlaut
 969         # 4/10 circle above
 970 $chars{0xca41}=0x00c5; # capital a with ring above
 971 $chars{0xcaad}=0x016e; # capital u with ring above
 972 $chars{0xca61}=0x00e5; # small a with ring above
 973 $chars{0xca75}=0x016f; # small u with ring above
 974 $chars{0xca77}=0x1e98; # small w with ring above
 975 $chars{0xca79}=0x1e99; # small y with ring above
 976         # 4/11 high comma off centre
 977         # 4/12 inverted high comma centred
 978         # 4/13 double acute accent
 979 $chars{0xcd4f}=0x0150; # capital o with double acute
 980 $chars{0xcd55}=0x0170; # capital u with double acute
 981 $chars{0xcd6f}=0x0151; # small o with double acute
 982 $chars{0xcd75}=0x0171; # small u with double acute
 983         # 4/14 horn
 984 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 985 $chars{0xce55}=0x01af; # latin capital letter u with horn
 986 $chars{0xce74}=0x01a1; # latin small letter o with horn
 987 $chars{0xce75}=0x01b0; # latin small letter u with horn
 988         # 4/15 caron (hacek
 989 $chars{0xcf41}=0x01cd; # capital a with caron
 990 $chars{0xcf43}=0x010c; # capital c with caron
 991 $chars{0xcf44}=0x010e; # capital d with caron
 992 $chars{0xcf45}=0x011a; # capital e with caron
 993 $chars{0xcf47}=0x01e6; # capital g with caron
 994 $chars{0xcf49}=0x01cf; # capital i with caron
 995 $chars{0xcf4b}=0x01e8; # capital k with caron
 996 $chars{0xcf4c}=0x013d; # capital l with caron
 997 $chars{0xcf4e}=0x0147; # capital n with caron
 998 $chars{0xcf4f}=0x01d1; # capital o with caron
 999 $chars{0xcf52}=0x0158; # capital r with caron
1000 $chars{0xcf53}=0x0160; # capital s with caron
1001 $chars{0xcf54}=0x0164; # capital t with caron
1002 $chars{0xcf55}=0x01d3; # capital u with caron
1003 $chars{0xcf5a}=0x017d; # capital z with caron
1004 $chars{0xcf61}=0x01ce; # small a with caron
1005 $chars{0xcf63}=0x010d; # small c with caron
1006 $chars{0xcf64}=0x010f; # small d with caron
1007 $chars{0xcf65}=0x011b; # small e with caron
1008 $chars{0xcf67}=0x01e7; # small g with caron
1009 $chars{0xcf69}=0x01d0; # small i with caron
1010 $chars{0xcf6a}=0x01f0; # small j with caron
1011 $chars{0xcf6b}=0x01e9; # small k with caron
1012 $chars{0xcf6c}=0x013e; # small l with caron
1013 $chars{0xcf6e}=0x0148; # small n with caron
1014 $chars{0xcf6f}=0x01d2; # small o with caron
1015 $chars{0xcf72}=0x0159; # small r with caron
1016 $chars{0xcf73}=0x0161; # small s with caron
1017 $chars{0xcf74}=0x0165; # small t with caron
1018 $chars{0xcf75}=0x01d4; # small u with caron
1019 $chars{0xcf7a}=0x017e; # small z with caron
1020         # 5/0 cedilla
1021 $chars{0xd020}=0x00b8; # cedilla
1022 $chars{0xd043}=0x00c7; # capital c with cedilla
1023 $chars{0xd044}=0x1e10; # capital d with cedilla
1024 $chars{0xd047}=0x0122; # capital g with cedilla
1025 $chars{0xd048}=0x1e28; # capital h with cedilla
1026 $chars{0xd04b}=0x0136; # capital k with cedilla
1027 $chars{0xd04c}=0x013b; # capital l with cedilla
1028 $chars{0xd04e}=0x0145; # capital n with cedilla
1029 $chars{0xd052}=0x0156; # capital r with cedilla
1030 $chars{0xd053}=0x015e; # capital s with cedilla
1031 $chars{0xd054}=0x0162; # capital t with cedilla
1032 $chars{0xd063}=0x00e7; # small c with cedilla
1033 $chars{0xd064}=0x1e11; # small d with cedilla
1034 $chars{0xd065}=0x0119; # small e with cedilla
1035 $chars{0xd067}=0x0123; # small g with cedilla
1036 $chars{0xd068}=0x1e29; # small h with cedilla
1037 $chars{0xd06b}=0x0137; # small k with cedilla
1038 $chars{0xd06c}=0x013c; # small l with cedilla
1039 $chars{0xd06e}=0x0146; # small n with cedilla
1040 $chars{0xd072}=0x0157; # small r with cedilla
1041 $chars{0xd073}=0x015f; # small s with cedilla
1042 $chars{0xd074}=0x0163; # small t with cedilla
1043         # 5/1 rude
1044         # 5/2 hook to left
1045         # 5/3 ogonek (hook to right
1046 $chars{0xd320}=0x02db; # ogonek
1047 $chars{0xd341}=0x0104; # capital a with ogonek
1048 $chars{0xd345}=0x0118; # capital e with ogonek
1049 $chars{0xd349}=0x012e; # capital i with ogonek
1050 $chars{0xd34f}=0x01ea; # capital o with ogonek
1051 $chars{0xd355}=0x0172; # capital u with ogonek
1052 $chars{0xd361}=0x0105; # small a with ogonek
1053 $chars{0xd365}=0x0119; # small e with ogonek
1054 $chars{0xd369}=0x012f; # small i with ogonek
1055 $chars{0xd36f}=0x01eb; # small o with ogonek
1056 $chars{0xd375}=0x0173; # small u with ogonek
1057         # 5/4 circle below
1058 $chars{0xd441}=0x1e00; # capital a with ring below
1059 $chars{0xd461}=0x1e01; # small a with ring below
1060         # 5/5 half circle below
1061 $chars{0xf948}=0x1e2a; # capital h with breve below
1062 $chars{0xf968}=0x1e2b; # small h with breve below
1063         # 5/6 dot below
1064 $chars{0xd641}=0x1ea0; # capital a with dot below
1065 $chars{0xd642}=0x1e04; # capital b with dot below
1066 $chars{0xd644}=0x1e0c; # capital d with dot below
1067 $chars{0xd645}=0x1eb8; # capital e with dot below
1068 $chars{0xd648}=0x1e24; # capital h with dot below
1069 $chars{0xd649}=0x1eca; # capital i with dot below
1070 $chars{0xd64b}=0x1e32; # capital k with dot below
1071 $chars{0xd64c}=0x1e36; # capital l with dot below
1072 $chars{0xd64d}=0x1e42; # capital m with dot below
1073 $chars{0xd64e}=0x1e46; # capital n with dot below
1074 $chars{0xd64f}=0x1ecc; # capital o with dot below
1075 $chars{0xd652}=0x1e5a; # capital r with dot below
1076 $chars{0xd653}=0x1e62; # capital s with dot below
1077 $chars{0xd654}=0x1e6c; # capital t with dot below
1078 $chars{0xd655}=0x1ee4; # capital u with dot below
1079 $chars{0xd656}=0x1e7e; # capital v with dot below
1080 $chars{0xd657}=0x1e88; # capital w with dot below
1081 $chars{0xd659}=0x1ef4; # capital y with dot below
1082 $chars{0xd65a}=0x1e92; # capital z with dot below
1083 $chars{0xd661}=0x1ea1; # small a with dot below
1084 $chars{0xd662}=0x1e05; # small b with dot below
1085 $chars{0xd664}=0x1e0d; # small d with dot below
1086 $chars{0xd665}=0x1eb9; # small e with dot below
1087 $chars{0xd668}=0x1e25; # small h with dot below
1088 $chars{0xd669}=0x1ecb; # small i with dot below
1089 $chars{0xd66b}=0x1e33; # small k with dot below
1090 $chars{0xd66c}=0x1e37; # small l with dot below
1091 $chars{0xd66d}=0x1e43; # small m with dot below
1092 $chars{0xd66e}=0x1e47; # small n with dot below
1093 $chars{0xd66f}=0x1ecd; # small o with dot below
1094 $chars{0xd672}=0x1e5b; # small r with dot below
1095 $chars{0xd673}=0x1e63; # small s with dot below
1096 $chars{0xd674}=0x1e6d; # small t with dot below
1097 $chars{0xd675}=0x1ee5; # small u with dot below
1098 $chars{0xd676}=0x1e7f; # small v with dot below
1099 $chars{0xd677}=0x1e89; # small w with dot below
1100 $chars{0xd679}=0x1ef5; # small y with dot below
1101 $chars{0xd67a}=0x1e93; # small z with dot below
1102         # 5/7 double dot below
1103 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1104 $chars{0xd775}=0x1e73; # small u with diaeresis below
1105         # 5/8 underline
1106 $chars{0xd820}=0x005f; # underline
1107         # 5/9 double underline
1108 $chars{0xd920}=0x2017; # double underline
1109         # 5/10 small low vertical bar
1110 $chars{0xda20}=0x02cc; #
1111         # 5/11 circumflex below
1112         # 5/12 (this position shall not be used)
1113         # 5/13 left half of ligature sign and of double tilde
1114         # 5/14 right half of ligature sign
1115         # 5/15 right half of double tilde
1116 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1117
1118 sub char_decode5426 {
1119     my ( $string) = @_;
1120     my $result;
1121
1122     my @data = unpack("C*", $string);
1123     my @characters;
1124     my $length=scalar(@data);
1125     for (my $i = 0; $i < scalar(@data); $i++) {
1126       my $char= $data[$i];
1127       if ($char >= 0x00 && $char <= 0x7F){
1128         #IsAscii
1129
1130           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1131       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1132         #Combined Char
1133         my $convchar ;
1134         if ($chars{$char*256+$data[$i+1]}) {
1135           $convchar= $chars{$char * 256 + $data[$i+1]};
1136           $i++;
1137 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1138         } elsif ($chars{$char})  {
1139           $convchar= $chars{$char};
1140 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1141         }else {
1142           $convchar=$char;
1143         }
1144         push @characters,$convchar;
1145       } else {
1146         my $convchar;
1147         if ($chars{$char})  {
1148           $convchar= $chars{$char};
1149 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1150         }else {
1151 #            printf "char %x $char\n",$char;
1152           $convchar=$char;
1153         }
1154         push @characters,$convchar;
1155       }
1156     }
1157     $result=pack "U*",@characters;
1158 #     $result=~s/\x01//;
1159 #     $result=~s/\x00//;
1160      $result=~s/\x0f//;
1161      $result=~s/\x1b.//;
1162      $result=~s/\x0e//;
1163      $result=~s/\x1b\x5b//;
1164 #   map{printf "%x",$_} @characters;
1165 #   printf "\n";
1166   return $result;
1167 }
1168
1169 1;
1170
1171
1172 =head1 AUTHOR
1173
1174 Koha Development Team <info@koha.org>
1175
1176 Galen Charlton <galen.charlton@liblime.com>
1177
1178 =cut