C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.07.00.049;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         NormalizeString
  37         IsStringUTF8ish
  38         MarcToUTF8Record
  39         SetUTF8Flag
  40         SetMarcUnicodeFlag
  41         StripNonXmlChars
  42         nsb_clean
  43     );
  44 }
  45
  46 =head1 NAME
  47
  48 C4::Charset - utilities for handling character set conversions.
  49
  50 =head1 SYNOPSIS
  51
  52   use C4::Charset;
  53
  54 =head1 DESCRIPTION
  55
  56 This module contains routines for dealing with character set
  57 conversions, particularly for MARC records.
  58
  59 A variety of character encodings are in use by various MARC
  60 standards, and even more character encodings are used by
  61 non-standard MARC records.  The various MARC formats generally
  62 do not do a good job of advertising a given record's character
  63 encoding, and even when a record does advertise its encoding,
  64 e.g., via the Leader/09, experience has shown that one cannot
  65 trust it.
  66
  67 Ultimately, all MARC records are stored in Koha in UTF-8 and
  68 must be converted from whatever the source character encoding is.
  69 The goal of this module is to ensure that these conversions
  70 take place accurately.  When a character conversion cannot take
  71 place, or at least not accurately, the module was provide
  72 enough information to allow user-facing code to inform the user
  73 on how to deal with the situation.
  74
  75 =cut
  76
  77 =head1 FUNCTIONS
  78
  79 =head2 IsStringUTF8ish
  80
  81   my $is_utf8 = IsStringUTF8ish($str);
  82
  83 Determines if C<$str> is valid UTF-8.  This can mean
  84 one of two things:
  85
  86 =over
  87
  88 =item *
  89
  90 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  91
  92 =item *
  93
  94 The Perl UTF-8 flag is B<not> set, but the octets contain
  95 valid UTF-8.
  96
  97 =back
  98
  99 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 100 because in one could be presented with a MARC blob that is
 101 not actually in UTF-8 but whose sequence of octets appears to be
 102 valid UTF-8.  The rest of the MARC character conversion functions
 103 will assume that this situation occur does not very often.
 104
 105 =cut
 106
 107 sub IsStringUTF8ish {
 108     my $str = shift;
 109
 110     return 1 if utf8::is_utf8($str);
 111     return utf8::decode($str);
 112 }
 113
 114 =head2 SetUTF8Flag
 115
 116   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 117
 118 This function sets the PERL UTF8 flag for data.
 119 It is required when using new_from_usmarc
 120 since MARC::File::USMARC does not handle PERL UTF8 setting.
 121 When editing unicode marc records fields and subfields, you
 122 would end up in double encoding without using this function.
 123
 124 If $nfd is set, string normalization will use NFD instead of NFC
 125
 126 FIXME
 127 In my opinion, this function belongs to MARC::Record and not
 128 to this package.
 129 But since it handles charset, and MARC::Record, it finds its way in that package
 130
 131 =cut
 132
 133 sub SetUTF8Flag{
 134         my ($record, $nfd)=@_;
 135         return unless ($record && $record->fields());
 136         foreach my $field ($record->fields()){
 137                 if ($field->tag()>=10){
 138                         my @subfields;
 139                         foreach my $subfield ($field->subfields()){
 140                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 141                         }
 142                         my $newfield=MARC::Field->new(
 143                                                         $field->tag(),
 144                                                         $field->indicator(1),
 145                                                         $field->indicator(2),
 146                                                         @subfields
 147                                                 );
 148                         $field->replace_with($newfield);
 149                 }
 150         }
 151 }
 152
 153 =head2 NormalizeString
 154
 155     my $normalized_string=NormalizeString($string,$nfd,$transform);
 156
 157 Given a string
 158 nfd : If you want to set NFD and not NFC
 159 transform : If you expect all the signs to be removed
 160
 161 Sets the PERL UTF8 Flag on your initial data if need be
 162 and applies cleaning if required
 163
 164 Returns a utf8 NFC normalized string
 165
 166 Sample code :
 167    my $string=NormalizeString ("l'ornithoptère");
 168    #results into ornithoptère in NFC form and sets UTF8 Flag
 169
 170 =cut
 171
 172
 173 sub NormalizeString{
 174         my ($string,$nfd,$transform)=@_;
 175         utf8::decode($string) unless (utf8::is_utf8($string));
 176         if ($nfd){
 177                 $string= NFD($string);
 178         }
 179         else {
 180                 $string=NFC($string);
 181         }
 182         if ($transform){
 183     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 184         #removing one letter words "d'" "l'"  was changed into "d " "l "
 185     $string=~s/\b\S\b//g;
 186     $string=~s/\s+$//g;
 187         }
 188     return $string;
 189 }
 190
 191 =head2 MarcToUTF8Record
 192
 193   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 194                                         $marc_flavour, [, $source_encoding]);
 195
 196 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 197 optional source encoding, return a C<MARC::Record> that is
 198 converted to UTF-8.
 199
 200 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 201 is not guaranteed to have been converted correctly.  Specifically,
 202 if C<$converted_from> is 'failed', the MARC record returned failed
 203 character conversion and had each of its non-ASCII octets changed
 204 to the Unicode replacement character.
 205
 206 If the source encoding was not specified, this routine will
 207 try to guess it; the character encoding used for a successful
 208 conversion is returned in C<$converted_from>.
 209
 210 =cut
 211
 212 sub MarcToUTF8Record {
 213     my $marc = shift;
 214     my $marc_flavour = shift;
 215     my $source_encoding = shift;
 216     my $marc_record;
 217     my $marc_blob_is_utf8 = 0;
 218     if (ref($marc) eq 'MARC::Record') {
 219         my $marc_blob = $marc->as_usmarc();
 220         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 221         $marc_record = $marc;
 222     } else {
 223         # dealing with a MARC blob
 224
 225         # remove any ersatz whitespace from the beginning and
 226         # end of the MARC blob -- these can creep into MARC
 227         # files produced by several sources -- caller really
 228         # should be doing this, however
 229         $marc =~ s/^\s+//;
 230         $marc =~ s/\s+$//;
 231         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 232         eval {
 233             $marc_record = MARC::Record->new_from_usmarc($marc);
 234         };
 235         if ($@) {
 236             # if we fail the first time, one likely problem
 237             # is that we have a MARC21 record that says that it's
 238             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 239             # We'll try parsing it again.
 240             substr($marc, 9, 1) = ' ';
 241             eval {
 242                 $marc_record = MARC::Record->new_from_usmarc($marc);
 243             };
 244             if ($@) {
 245                 # it's hopeless; return an empty MARC::Record
 246                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 247             }
 248         }
 249     }
 250
 251     # If we do not know the source encoding, try some guesses
 252     # as follows:
 253     #   1. Record is UTF-8 already.
 254     #   2. If MARC flavor is MARC21 or NORMARC, then
 255     #      a. record is MARC-8
 256     #      b. record is ISO-8859-1
 257     #   3. If MARC flavor is UNIMARC, then
 258     if (not defined $source_encoding) {
 259         if ($marc_blob_is_utf8) {
 260             # note that for MARC21/NORMARC we are not bothering to check
 261             # if the Leader/09 is set to 'a' or not -- because
 262             # of problems with various ILSs (including Koha in the
 263             # past, alas), this just is not trustworthy.
 264             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 265             return $marc_record, 'UTF-8', [];
 266         } else {
 267             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 268                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 269             } elsif ($marc_flavour =~/UNIMARC/) {
 270                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 271             } else {
 272                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 273             }
 274         }
 275     } else {
 276         # caller knows the character encoding
 277         my $original_marc_record = $marc_record->clone();
 278         my @errors;
 279         if ($source_encoding =~ /utf-?8/i) {
 280             if ($marc_blob_is_utf8) {
 281                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 282                 return $marc_record, 'UTF-8', [];
 283             } else {
 284                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 285             }
 286         } elsif ($source_encoding =~ /marc-?8/i) {
 287             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 288         } elsif ($source_encoding =~ /5426/) {
 289             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 290         } else {
 291             # assume any other character encoding is for Text::Iconv
 292             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 293         }
 294
 295         if (@errors) {
 296             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 297             return $original_marc_record, 'failed', \@errors;
 298         } else {
 299             return $marc_record, $source_encoding, [];
 300         }
 301     }
 302
 303 }
 304
 305 =head2 SetMarcUnicodeFlag
 306
 307   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 308
 309 Set both the internal MARC::Record encoding flag
 310 and the appropriate Leader/09 (MARC21) or
 311 100/26-29 (UNIMARC) to indicate that the record
 312 is in UTF-8.  Note that this does B<not> do
 313 any actual character conversion.
 314
 315 =cut
 316
 317 sub SetMarcUnicodeFlag {
 318     my $marc_record = shift;
 319     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 320
 321     $marc_record->encoding('UTF-8');
 322     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 323         my $leader = $marc_record->leader();
 324         substr($leader, 9, 1) = 'a';
 325         $marc_record->leader($leader);
 326     } elsif ($marc_flavour =~/UNIMARC/) {
 327         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 328         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 329         my $string;
 330                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 331                 $string=$marc_record->subfield( 100, "a" );
 332         if (defined $string && length($string)==$subflength) {
 333                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 334         }
 335         else {
 336             $string = POSIX::strftime( "%Y%m%d", localtime );
 337             $string =~ s/\-//g;
 338             $string = sprintf( "%-*s", $subflength, $string );
 339             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 340         }
 341         substr( $string, $encodingposition, 3, "y50" );
 342         if ( $marc_record->subfield( 100, "a" ) ) {
 343                         $marc_record->field('100')->update(a=>$string);
 344                 }
 345                 else {
 346             $marc_record->insert_grouped_field(
 347                 MARC::Field->new( 100, '', '', "a" => $string ) );
 348         }
 349                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 350     } else {
 351         warn "Unrecognized marcflavour: $marc_flavour";
 352     }
 353 }
 354
 355 =head2 StripNonXmlChars
 356
 357   my $new_str = StripNonXmlChars($old_str);
 358
 359 Given a string, return a copy with the
 360 characters that are illegal in XML
 361 removed.
 362
 363 This function exists to work around a problem
 364 that can occur with badly-encoded MARC records.
 365 Specifically, if a UTF-8 MARC record also
 366 has excape (\x1b) characters, MARC::File::XML
 367 will let the escape characters pass through
 368 when as_xml() or as_xml_record() is called.  The
 369 problem is that the escape character is not
 370 legal in well-formed XML documents, so when
 371 MARC::File::XML attempts to parse such a record,
 372 the XML parser will fail.
 373
 374 Stripping such characters will allow a
 375 MARC::Record->new_from_xml()
 376 to work, at the possible risk of some data loss.
 377
 378 =cut
 379
 380 sub StripNonXmlChars {
 381     my $str = shift;
 382     if (!defined($str) || $str eq ""){
 383         return "";
 384     }
 385     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 386     return $str;
 387 }
 388
 389
 390
 391 =head2 nsb_clean
 392
 393 =over 4
 394
 395 nsb_clean($string);
 396
 397 =back
 398
 399 Removes Non Sorting Block characters
 400
 401 =cut
 402 sub nsb_clean {
 403     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 404     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 405     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 406     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 407     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 408
 409     # handles non sorting blocks
 410     my ($string) = @_ ;
 411     $_ = $string ;
 412     s/$NSB//g ;
 413     s/$NSE//g ;
 414     s/$NSB2//g ;
 415     s/$NSE2//g ;
 416     s/$C2//g ;
 417     $string = $_ ;
 418
 419     return($string) ;
 420 }
 421
 422
 423 =head1 INTERNAL FUNCTIONS
 424
 425 =head2 _default_marc21_charconv_to_utf8
 426
 427   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 428
 429 Converts a C<MARC::Record> of unknown character set to UTF-8,
 430 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 431 to UTF-8, then a default conversion that replaces each non-ASCII
 432 character with the replacement character.
 433
 434 The C<$guessed_charset> return value contains the character set
 435 that resulted in a conversion to valid UTF-8; note that
 436 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 437 this is 'failed'.
 438
 439 =cut
 440
 441 sub _default_marc21_charconv_to_utf8 {
 442     my $marc_record = shift;
 443     my $marc_flavour = shift;
 444
 445     my $trial_marc8 = $marc_record->clone();
 446     my @all_errors = ();
 447     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 448     unless (@errors) {
 449         return $trial_marc8, 'MARC-8', [];
 450     }
 451     push @all_errors, @errors;
 452
 453     my $trial_8859_1 = $marc_record->clone();
 454     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 455     unless (@errors) {
 456         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 457                                                 # instead if we wanted to report details
 458                                                 # of the failed attempt at MARC-8 => UTF-8
 459     }
 460     push @all_errors, @errors;
 461
 462     my $default_converted = $marc_record->clone();
 463     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 464     return $default_converted, 'failed', \@all_errors;
 465 }
 466
 467 =head2 _default_unimarc_charconv_to_utf8
 468
 469   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 470
 471 Converts a C<MARC::Record> of unknown character set to UTF-8,
 472 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 473 to UTF-8, then a default conversion that replaces each non-ASCII
 474 character with the replacement character.
 475
 476 The C<$guessed_charset> return value contains the character set
 477 that resulted in a conversion to valid UTF-8; note that
 478 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 479 this is 'failed'.
 480
 481 =cut
 482
 483 sub _default_unimarc_charconv_to_utf8 {
 484     my $marc_record = shift;
 485     my $marc_flavour = shift;
 486
 487     my $trial_marc8 = $marc_record->clone();
 488     my @all_errors = ();
 489     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 490     unless (@errors) {
 491         return $trial_marc8, 'iso-5426';
 492     }
 493     push @all_errors, @errors;
 494
 495     my $trial_8859_1 = $marc_record->clone();
 496     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 497     unless (@errors) {
 498         return $trial_8859_1, 'iso-8859-1';
 499     }
 500     push @all_errors, @errors;
 501
 502     my $default_converted = $marc_record->clone();
 503     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 504     return $default_converted, 'failed', \@all_errors;
 505 }
 506
 507 =head2 _marc_marc8_to_utf8
 508
 509   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 510
 511 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 512 If the conversion fails for some reason, an
 513 appropriate messages will be placed in the returned
 514 C<@errors> array.
 515
 516 =cut
 517
 518 sub _marc_marc8_to_utf8 {
 519     my $marc_record = shift;
 520     my $marc_flavour = shift;
 521
 522     my $prev_ignore = MARC::Charset->ignore_errors();
 523     MARC::Charset->ignore_errors(1);
 524
 525     # trap warnings raised by MARC::Charset
 526     my @errors = ();
 527     local $SIG{__WARN__} = sub {
 528         my $msg = $_[0];
 529         if ($msg =~ /MARC.Charset/) {
 530             # FIXME - purpose of this regexp is to strip out the
 531             # line reference to MARC/Charset.pm, but as it
 532             # exists probably won't work quite on Windows --
 533             # some sort of minimal-bunch back-tracking RE
 534             # would be helpful here
 535             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 536             push @errors, $msg;
 537         } else {
 538             # if warning doesn't come from MARC::Charset, just
 539             # pass it on
 540             warn $msg;
 541         }
 542     };
 543
 544     foreach my $field ($marc_record->fields()) {
 545         if ($field->is_control_field()) {
 546             ; # do nothing -- control fields should not contain non-ASCII characters
 547         } else {
 548             my @converted_subfields;
 549             foreach my $subfield ($field->subfields()) {
 550                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 551                 unless (IsStringUTF8ish($utf8sf)) {
 552                     # Because of a bug in MARC::Charset 0.98, if the string
 553                     # has (a) one or more diacritics that (b) are only in character positions
 554                     # 128 to 255 inclusive, the resulting converted string is not in
 555                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 556                     # occurs, upgrade the string in place.  Moral of the story seems to be
 557                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 558                     # that the resulting string is UTF-8.
 559                     utf8::upgrade($utf8sf);
 560                 }
 561                 push @converted_subfields, $subfield->[0], $utf8sf;
 562             }
 563
 564             $field->replace_with(MARC::Field->new(
 565                 $field->tag(), $field->indicator(1), $field->indicator(2),
 566                 @converted_subfields)
 567             );
 568         }
 569     }
 570
 571     MARC::Charset->ignore_errors($prev_ignore);
 572
 573     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 574
 575     return @errors;
 576 }
 577
 578 =head2 _marc_iso5426_to_utf8
 579
 580   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 581
 582 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 583 If the conversion fails for some reason, an
 584 appropriate messages will be placed in the returned
 585 C<@errors> array.
 586
 587 FIXME - is ISO-5426 equivalent enough to MARC-8
 588 that C<MARC::Charset> can be used instead?
 589
 590 =cut
 591
 592 sub _marc_iso5426_to_utf8 {
 593     my $marc_record = shift;
 594     my $marc_flavour = shift;
 595
 596     my @errors = ();
 597
 598     foreach my $field ($marc_record->fields()) {
 599         if ($field->is_control_field()) {
 600             ; # do nothing -- control fields should not contain non-ASCII characters
 601         } else {
 602             my @converted_subfields;
 603             foreach my $subfield ($field->subfields()) {
 604                 my $utf8sf = char_decode5426($subfield->[1]);
 605                 push @converted_subfields, $subfield->[0], $utf8sf;
 606             }
 607
 608             $field->replace_with(MARC::Field->new(
 609                 $field->tag(), $field->indicator(1), $field->indicator(2),
 610                 @converted_subfields)
 611             );
 612         }
 613     }
 614
 615     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 616
 617     return @errors;
 618 }
 619
 620 =head2 _marc_to_utf8_via_text_iconv
 621
 622   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 623
 624 Convert a C<MARC::Record> to UTF-8 in-place using the
 625 C<Text::Iconv> CPAN module.  Any source encoding accepted
 626 by the user's iconv installation should work.  If
 627 the source encoding is not recognized on the user's
 628 server or the conversion fails for some reason,
 629 appropriate messages will be placed in the returned
 630 C<@errors> array.
 631
 632 =cut
 633
 634 sub _marc_to_utf8_via_text_iconv {
 635     my $marc_record = shift;
 636     my $marc_flavour = shift;
 637     my $source_encoding = shift;
 638
 639     my @errors = ();
 640     my $decoder;
 641     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 642     if ($@) {
 643         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 644         return @errors;
 645     }
 646
 647     my $prev_raise_error = Text::Iconv->raise_error();
 648     Text::Iconv->raise_error(1);
 649
 650     foreach my $field ($marc_record->fields()) {
 651         if ($field->is_control_field()) {
 652             ; # do nothing -- control fields should not contain non-ASCII characters
 653         } else {
 654             my @converted_subfields;
 655             foreach my $subfield ($field->subfields()) {
 656                 my $converted_value;
 657                 my $conversion_ok = 1;
 658                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 659                 if ($@) {
 660                     $conversion_ok = 0;
 661                     push @errors, $@;
 662                 } elsif (not defined $converted_value) {
 663                     $conversion_ok = 0;
 664                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 665                 }
 666
 667                 if ($conversion_ok) {
 668                     push @converted_subfields, $subfield->[0], $converted_value;
 669                 } else {
 670                     $converted_value = $subfield->[1];
 671                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 672                     push @converted_subfields, $subfield->[0], $converted_value;
 673                 }
 674             }
 675
 676             $field->replace_with(MARC::Field->new(
 677                 $field->tag(), $field->indicator(1), $field->indicator(2),
 678                 @converted_subfields)
 679             );
 680         }
 681     }
 682
 683     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 684     Text::Iconv->raise_error($prev_raise_error);
 685
 686     return @errors;
 687 }
 688
 689 =head2 _marc_to_utf8_replacement_char
 690
 691   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 692
 693 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 694 unsatisfactory method of replacing all non-ASCII (e.g.,
 695 where the eight bit is set) octet with the Unicode
 696 replacement character.  This is meant as a last-ditch
 697 method, and would be best used as part of a UI that
 698 lets a cataloguer pick various character conversions
 699 until he or she finds the right one.
 700
 701 =cut
 702
 703 sub _marc_to_utf8_replacement_char {
 704     my $marc_record = shift;
 705     my $marc_flavour = shift;
 706
 707     foreach my $field ($marc_record->fields()) {
 708         if ($field->is_control_field()) {
 709             ; # do nothing -- control fields should not contain non-ASCII characters
 710         } else {
 711             my @converted_subfields;
 712             foreach my $subfield ($field->subfields()) {
 713                 my $value = $subfield->[1];
 714                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 715                 push @converted_subfields, $subfield->[0], $value;
 716             }
 717
 718             $field->replace_with(MARC::Field->new(
 719                 $field->tag(), $field->indicator(1), $field->indicator(2),
 720                 @converted_subfields)
 721             );
 722         }
 723     }
 724
 725     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 726 }
 727
 728 =head2 char_decode5426
 729
 730   my $utf8string = char_decode5426($iso_5426_string);
 731
 732 Converts a string from ISO-5426 to UTF-8.
 733
 734 =cut
 735
 736
 737 my %chars;
 738 $chars{0xb0}=0x0101;#3/0ayn[ain]
 739 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 740 #$chars{0xb2}=0x00e0;#'à';
 741 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 742 #$chars{0xb3}=0x00e7;#'ç';
 743 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 744 # $chars{0xb4}='è';
 745 $chars{0xb4}=0x00e8;
 746 $chars{0xbd}=0x02b9;
 747 $chars{0xbe}=0x02ba;
 748 # $chars{0xb5}='é';
 749 $chars{0xb5}=0x00e9;
 750 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 751 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 752 $chars{0xfa}=0x0153; #oe
 753 $chars{0xea}=0x0152; #oe
 754 $chars{0x81d1}=0x00b0;
 755
 756 ####
 757 ## combined characters iso5426
 758
 759 $chars{0xc041}=0x1ea2; # capital a with hook above
 760 $chars{0xc045}=0x1eba; # capital e with hook above
 761 $chars{0xc049}=0x1ec8; # capital i with hook above
 762 $chars{0xc04f}=0x1ece; # capital o with hook above
 763 $chars{0xc055}=0x1ee6; # capital u with hook above
 764 $chars{0xc059}=0x1ef6; # capital y with hook above
 765 $chars{0xc061}=0x1ea3; # small a with hook above
 766 $chars{0xc065}=0x1ebb; # small e with hook above
 767 $chars{0xc069}=0x1ec9; # small i with hook above
 768 $chars{0xc06f}=0x1ecf; # small o with hook above
 769 $chars{0xc075}=0x1ee7; # small u with hook above
 770 $chars{0xc079}=0x1ef7; # small y with hook above
 771
 772         # 4/1 grave accent
 773 $chars{0xc141}=0x00c0; # capital a with grave accent
 774 $chars{0xc145}=0x00c8; # capital e with grave accent
 775 $chars{0xc149}=0x00cc; # capital i with grave accent
 776 $chars{0xc14f}=0x00d2; # capital o with grave accent
 777 $chars{0xc155}=0x00d9; # capital u with grave accent
 778 $chars{0xc157}=0x1e80; # capital w with grave
 779 $chars{0xc159}=0x1ef2; # capital y with grave
 780 $chars{0xc161}=0x00e0; # small a with grave accent
 781 $chars{0xc165}=0x00e8; # small e with grave accent
 782 $chars{0xc169}=0x00ec; # small i with grave accent
 783 $chars{0xc16f}=0x00f2; # small o with grave accent
 784 $chars{0xc175}=0x00f9; # small u with grave accent
 785 $chars{0xc177}=0x1e81; # small w with grave
 786 $chars{0xc179}=0x1ef3; # small y with grave
 787         # 4/2 acute accent
 788 $chars{0xc241}=0x00c1; # capital a with acute accent
 789 $chars{0xc243}=0x0106; # capital c with acute accent
 790 $chars{0xc245}=0x00c9; # capital e with acute accent
 791 $chars{0xc247}=0x01f4; # capital g with acute
 792 $chars{0xc249}=0x00cd; # capital i with acute accent
 793 $chars{0xc24b}=0x1e30; # capital k with acute
 794 $chars{0xc24c}=0x0139; # capital l with acute accent
 795 $chars{0xc24d}=0x1e3e; # capital m with acute
 796 $chars{0xc24e}=0x0143; # capital n with acute accent
 797 $chars{0xc24f}=0x00d3; # capital o with acute accent
 798 $chars{0xc250}=0x1e54; # capital p with acute
 799 $chars{0xc252}=0x0154; # capital r with acute accent
 800 $chars{0xc253}=0x015a; # capital s with acute accent
 801 $chars{0xc255}=0x00da; # capital u with acute accent
 802 $chars{0xc257}=0x1e82; # capital w with acute
 803 $chars{0xc259}=0x00dd; # capital y with acute accent
 804 $chars{0xc25a}=0x0179; # capital z with acute accent
 805 $chars{0xc261}=0x00e1; # small a with acute accent
 806 $chars{0xc263}=0x0107; # small c with acute accent
 807 $chars{0xc265}=0x00e9; # small e with acute accent
 808 $chars{0xc267}=0x01f5; # small g with acute
 809 $chars{0xc269}=0x00ed; # small i with acute accent
 810 $chars{0xc26b}=0x1e31; # small k with acute
 811 $chars{0xc26c}=0x013a; # small l with acute accent
 812 $chars{0xc26d}=0x1e3f; # small m with acute
 813 $chars{0xc26e}=0x0144; # small n with acute accent
 814 $chars{0xc26f}=0x00f3; # small o with acute accent
 815 $chars{0xc270}=0x1e55; # small p with acute
 816 $chars{0xc272}=0x0155; # small r with acute accent
 817 $chars{0xc273}=0x015b; # small s with acute accent
 818 $chars{0xc275}=0x00fa; # small u with acute accent
 819 $chars{0xc277}=0x1e83; # small w with acute
 820 $chars{0xc279}=0x00fd; # small y with acute accent
 821 $chars{0xc27a}=0x017a; # small z with acute accent
 822 $chars{0xc2e1}=0x01fc; # capital ae with acute
 823 $chars{0xc2f1}=0x01fd; # small ae with acute
 824        # 4/3 circumflex accent
 825 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 826 $chars{0xc343}=0x0108; # capital c with circumflex
 827 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 828 $chars{0xc347}=0x011c; # capital g with circumflex
 829 $chars{0xc348}=0x0124; # capital h with circumflex
 830 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 831 $chars{0xc34a}=0x0134; # capital j with circumflex
 832 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 833 $chars{0xc353}=0x015c; # capital s with circumflex
 834 $chars{0xc355}=0x00db; # capital u with circumflex
 835 $chars{0xc357}=0x0174; # capital w with circumflex
 836 $chars{0xc359}=0x0176; # capital y with circumflex
 837 $chars{0xc35a}=0x1e90; # capital z with circumflex
 838 $chars{0xc361}=0x00e2; # small a with circumflex accent
 839 $chars{0xc363}=0x0109; # small c with circumflex
 840 $chars{0xc365}=0x00ea; # small e with circumflex accent
 841 $chars{0xc367}=0x011d; # small g with circumflex
 842 $chars{0xc368}=0x0125; # small h with circumflex
 843 $chars{0xc369}=0x00ee; # small i with circumflex accent
 844 $chars{0xc36a}=0x0135; # small j with circumflex
 845 $chars{0xc36e}=0x00f1; # small n with tilde
 846 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 847 $chars{0xc373}=0x015d; # small s with circumflex
 848 $chars{0xc375}=0x00fb; # small u with circumflex
 849 $chars{0xc377}=0x0175; # small w with circumflex
 850 $chars{0xc379}=0x0177; # small y with circumflex
 851 $chars{0xc37a}=0x1e91; # small z with circumflex
 852         # 4/4 tilde
 853 $chars{0xc441}=0x00c3; # capital a with tilde
 854 $chars{0xc445}=0x1ebc; # capital e with tilde
 855 $chars{0xc449}=0x0128; # capital i with tilde
 856 $chars{0xc44e}=0x00d1; # capital n with tilde
 857 $chars{0xc44f}=0x00d5; # capital o with tilde
 858 $chars{0xc455}=0x0168; # capital u with tilde
 859 $chars{0xc456}=0x1e7c; # capital v with tilde
 860 $chars{0xc459}=0x1ef8; # capital y with tilde
 861 $chars{0xc461}=0x00e3; # small a with tilde
 862 $chars{0xc465}=0x1ebd; # small e with tilde
 863 $chars{0xc469}=0x0129; # small i with tilde
 864 $chars{0xc46e}=0x00f1; # small n with tilde
 865 $chars{0xc46f}=0x00f5; # small o with tilde
 866 $chars{0xc475}=0x0169; # small u with tilde
 867 $chars{0xc476}=0x1e7d; # small v with tilde
 868 $chars{0xc479}=0x1ef9; # small y with tilde
 869     # 4/5 macron
 870 $chars{0xc541}=0x0100; # capital a with macron
 871 $chars{0xc545}=0x0112; # capital e with macron
 872 $chars{0xc547}=0x1e20; # capital g with macron
 873 $chars{0xc549}=0x012a; # capital i with macron
 874 $chars{0xc54f}=0x014c; # capital o with macron
 875 $chars{0xc555}=0x016a; # capital u with macron
 876 $chars{0xc561}=0x0101; # small a with macron
 877 $chars{0xc565}=0x0113; # small e with macron
 878 $chars{0xc567}=0x1e21; # small g with macron
 879 $chars{0xc569}=0x012b; # small i with macron
 880 $chars{0xc56f}=0x014d; # small o with macron
 881 $chars{0xc575}=0x016b; # small u with macron
 882 $chars{0xc572}=0x0159; # small r with macron
 883 $chars{0xc5e1}=0x01e2; # capital ae with macron
 884 $chars{0xc5f1}=0x01e3; # small ae with macron
 885         # 4/6 breve
 886 $chars{0xc641}=0x0102; # capital a with breve
 887 $chars{0xc645}=0x0114; # capital e with breve
 888 $chars{0xc647}=0x011e; # capital g with breve
 889 $chars{0xc649}=0x012c; # capital i with breve
 890 $chars{0xc64f}=0x014e; # capital o with breve
 891 $chars{0xc655}=0x016c; # capital u with breve
 892 $chars{0xc661}=0x0103; # small a with breve
 893 $chars{0xc665}=0x0115; # small e with breve
 894 $chars{0xc667}=0x011f; # small g with breve
 895 $chars{0xc669}=0x012d; # small i with breve
 896 $chars{0xc66f}=0x014f; # small o with breve
 897 $chars{0xc675}=0x016d; # small u with breve
 898         # 4/7 dot above
 899 $chars{0xc7b0}=0x01e1; # Ain with dot above
 900 $chars{0xc742}=0x1e02; # capital b with dot above
 901 $chars{0xc743}=0x010a; # capital c with dot above
 902 $chars{0xc744}=0x1e0a; # capital d with dot above
 903 $chars{0xc745}=0x0116; # capital e with dot above
 904 $chars{0xc746}=0x1e1e; # capital f with dot above
 905 $chars{0xc747}=0x0120; # capital g with dot above
 906 $chars{0xc748}=0x1e22; # capital h with dot above
 907 $chars{0xc749}=0x0130; # capital i with dot above
 908 $chars{0xc74d}=0x1e40; # capital m with dot above
 909 $chars{0xc74e}=0x1e44; # capital n with dot above
 910 $chars{0xc750}=0x1e56; # capital p with dot above
 911 $chars{0xc752}=0x1e58; # capital r with dot above
 912 $chars{0xc753}=0x1e60; # capital s with dot above
 913 $chars{0xc754}=0x1e6a; # capital t with dot above
 914 $chars{0xc757}=0x1e86; # capital w with dot above
 915 $chars{0xc758}=0x1e8a; # capital x with dot above
 916 $chars{0xc759}=0x1e8e; # capital y with dot above
 917 $chars{0xc75a}=0x017b; # capital z with dot above
 918 $chars{0xc761}=0x0227; # small b with dot above
 919 $chars{0xc762}=0x1e03; # small b with dot above
 920 $chars{0xc763}=0x010b; # small c with dot above
 921 $chars{0xc764}=0x1e0b; # small d with dot above
 922 $chars{0xc765}=0x0117; # small e with dot above
 923 $chars{0xc766}=0x1e1f; # small f with dot above
 924 $chars{0xc767}=0x0121; # small g with dot above
 925 $chars{0xc768}=0x1e23; # small h with dot above
 926 $chars{0xc76d}=0x1e41; # small m with dot above
 927 $chars{0xc76e}=0x1e45; # small n with dot above
 928 $chars{0xc770}=0x1e57; # small p with dot above
 929 $chars{0xc772}=0x1e59; # small r with dot above
 930 $chars{0xc773}=0x1e61; # small s with dot above
 931 $chars{0xc774}=0x1e6b; # small t with dot above
 932 $chars{0xc777}=0x1e87; # small w with dot above
 933 $chars{0xc778}=0x1e8b; # small x with dot above
 934 $chars{0xc779}=0x1e8f; # small y with dot above
 935 $chars{0xc77a}=0x017c; # small z with dot above
 936         # 4/8 trema, diaresis
 937 $chars{0xc820}=0x00a8; # diaeresis
 938 $chars{0xc841}=0x00c4; # capital a with diaeresis
 939 $chars{0xc845}=0x00cb; # capital e with diaeresis
 940 $chars{0xc848}=0x1e26; # capital h with diaeresis
 941 $chars{0xc849}=0x00cf; # capital i with diaeresis
 942 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 943 $chars{0xc855}=0x00dc; # capital u with diaeresis
 944 $chars{0xc857}=0x1e84; # capital w with diaeresis
 945 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 946 $chars{0xc859}=0x0178; # capital y with diaeresis
 947 $chars{0xc861}=0x00e4; # small a with diaeresis
 948 $chars{0xc865}=0x00eb; # small e with diaeresis
 949 $chars{0xc868}=0x1e27; # small h with diaeresis
 950 $chars{0xc869}=0x00ef; # small i with diaeresis
 951 $chars{0xc86f}=0x00f6; # small o with diaeresis
 952 $chars{0xc874}=0x1e97; # small t with diaeresis
 953 $chars{0xc875}=0x00fc; # small u with diaeresis
 954 $chars{0xc877}=0x1e85; # small w with diaeresis
 955 $chars{0xc878}=0x1e8d; # small x with diaeresis
 956 $chars{0xc879}=0x00ff; # small y with diaeresis
 957         # 4/9 umlaut
 958 $chars{0xc920}=0x00a8; # [diaeresis]
 959 $chars{0xc961}=0x00e4; # a with umlaut
 960 $chars{0xc965}=0x00eb; # e with umlaut
 961 $chars{0xc969}=0x00ef; # i with umlaut
 962 $chars{0xc96f}=0x00f6; # o with umlaut
 963 $chars{0xc975}=0x00fc; # u with umlaut
 964         # 4/10 circle above
 965 $chars{0xca41}=0x00c5; # capital a with ring above
 966 $chars{0xcaad}=0x016e; # capital u with ring above
 967 $chars{0xca61}=0x00e5; # small a with ring above
 968 $chars{0xca75}=0x016f; # small u with ring above
 969 $chars{0xca77}=0x1e98; # small w with ring above
 970 $chars{0xca79}=0x1e99; # small y with ring above
 971         # 4/11 high comma off centre
 972         # 4/12 inverted high comma centred
 973         # 4/13 double acute accent
 974 $chars{0xcd4f}=0x0150; # capital o with double acute
 975 $chars{0xcd55}=0x0170; # capital u with double acute
 976 $chars{0xcd6f}=0x0151; # small o with double acute
 977 $chars{0xcd75}=0x0171; # small u with double acute
 978         # 4/14 horn
 979 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 980 $chars{0xce55}=0x01af; # latin capital letter u with horn
 981 $chars{0xce74}=0x01a1; # latin small letter o with horn
 982 $chars{0xce75}=0x01b0; # latin small letter u with horn
 983         # 4/15 caron (hacek
 984 $chars{0xcf41}=0x01cd; # capital a with caron
 985 $chars{0xcf43}=0x010c; # capital c with caron
 986 $chars{0xcf44}=0x010e; # capital d with caron
 987 $chars{0xcf45}=0x011a; # capital e with caron
 988 $chars{0xcf47}=0x01e6; # capital g with caron
 989 $chars{0xcf49}=0x01cf; # capital i with caron
 990 $chars{0xcf4b}=0x01e8; # capital k with caron
 991 $chars{0xcf4c}=0x013d; # capital l with caron
 992 $chars{0xcf4e}=0x0147; # capital n with caron
 993 $chars{0xcf4f}=0x01d1; # capital o with caron
 994 $chars{0xcf52}=0x0158; # capital r with caron
 995 $chars{0xcf53}=0x0160; # capital s with caron
 996 $chars{0xcf54}=0x0164; # capital t with caron
 997 $chars{0xcf55}=0x01d3; # capital u with caron
 998 $chars{0xcf5a}=0x017d; # capital z with caron
 999 $chars{0xcf61}=0x01ce; # small a with caron
1000 $chars{0xcf63}=0x010d; # small c with caron
1001 $chars{0xcf64}=0x010f; # small d with caron
1002 $chars{0xcf65}=0x011b; # small e with caron
1003 $chars{0xcf67}=0x01e7; # small g with caron
1004 $chars{0xcf69}=0x01d0; # small i with caron
1005 $chars{0xcf6a}=0x01f0; # small j with caron
1006 $chars{0xcf6b}=0x01e9; # small k with caron
1007 $chars{0xcf6c}=0x013e; # small l with caron
1008 $chars{0xcf6e}=0x0148; # small n with caron
1009 $chars{0xcf6f}=0x01d2; # small o with caron
1010 $chars{0xcf72}=0x0159; # small r with caron
1011 $chars{0xcf73}=0x0161; # small s with caron
1012 $chars{0xcf74}=0x0165; # small t with caron
1013 $chars{0xcf75}=0x01d4; # small u with caron
1014 $chars{0xcf7a}=0x017e; # small z with caron
1015         # 5/0 cedilla
1016 $chars{0xd020}=0x00b8; # cedilla
1017 $chars{0xd043}=0x00c7; # capital c with cedilla
1018 $chars{0xd044}=0x1e10; # capital d with cedilla
1019 $chars{0xd047}=0x0122; # capital g with cedilla
1020 $chars{0xd048}=0x1e28; # capital h with cedilla
1021 $chars{0xd04b}=0x0136; # capital k with cedilla
1022 $chars{0xd04c}=0x013b; # capital l with cedilla
1023 $chars{0xd04e}=0x0145; # capital n with cedilla
1024 $chars{0xd052}=0x0156; # capital r with cedilla
1025 $chars{0xd053}=0x015e; # capital s with cedilla
1026 $chars{0xd054}=0x0162; # capital t with cedilla
1027 $chars{0xd063}=0x00e7; # small c with cedilla
1028 $chars{0xd064}=0x1e11; # small d with cedilla
1029 $chars{0xd065}=0x0119; # small e with cedilla
1030 $chars{0xd067}=0x0123; # small g with cedilla
1031 $chars{0xd068}=0x1e29; # small h with cedilla
1032 $chars{0xd06b}=0x0137; # small k with cedilla
1033 $chars{0xd06c}=0x013c; # small l with cedilla
1034 $chars{0xd06e}=0x0146; # small n with cedilla
1035 $chars{0xd072}=0x0157; # small r with cedilla
1036 $chars{0xd073}=0x015f; # small s with cedilla
1037 $chars{0xd074}=0x0163; # small t with cedilla
1038         # 5/1 rude
1039         # 5/2 hook to left
1040         # 5/3 ogonek (hook to right
1041 $chars{0xd320}=0x02db; # ogonek
1042 $chars{0xd341}=0x0104; # capital a with ogonek
1043 $chars{0xd345}=0x0118; # capital e with ogonek
1044 $chars{0xd349}=0x012e; # capital i with ogonek
1045 $chars{0xd34f}=0x01ea; # capital o with ogonek
1046 $chars{0xd355}=0x0172; # capital u with ogonek
1047 $chars{0xd361}=0x0105; # small a with ogonek
1048 $chars{0xd365}=0x0119; # small e with ogonek
1049 $chars{0xd369}=0x012f; # small i with ogonek
1050 $chars{0xd36f}=0x01eb; # small o with ogonek
1051 $chars{0xd375}=0x0173; # small u with ogonek
1052         # 5/4 circle below
1053 $chars{0xd441}=0x1e00; # capital a with ring below
1054 $chars{0xd461}=0x1e01; # small a with ring below
1055         # 5/5 half circle below
1056 $chars{0xf948}=0x1e2a; # capital h with breve below
1057 $chars{0xf968}=0x1e2b; # small h with breve below
1058         # 5/6 dot below
1059 $chars{0xd641}=0x1ea0; # capital a with dot below
1060 $chars{0xd642}=0x1e04; # capital b with dot below
1061 $chars{0xd644}=0x1e0c; # capital d with dot below
1062 $chars{0xd645}=0x1eb8; # capital e with dot below
1063 $chars{0xd648}=0x1e24; # capital h with dot below
1064 $chars{0xd649}=0x1eca; # capital i with dot below
1065 $chars{0xd64b}=0x1e32; # capital k with dot below
1066 $chars{0xd64c}=0x1e36; # capital l with dot below
1067 $chars{0xd64d}=0x1e42; # capital m with dot below
1068 $chars{0xd64e}=0x1e46; # capital n with dot below
1069 $chars{0xd64f}=0x1ecc; # capital o with dot below
1070 $chars{0xd652}=0x1e5a; # capital r with dot below
1071 $chars{0xd653}=0x1e62; # capital s with dot below
1072 $chars{0xd654}=0x1e6c; # capital t with dot below
1073 $chars{0xd655}=0x1ee4; # capital u with dot below
1074 $chars{0xd656}=0x1e7e; # capital v with dot below
1075 $chars{0xd657}=0x1e88; # capital w with dot below
1076 $chars{0xd659}=0x1ef4; # capital y with dot below
1077 $chars{0xd65a}=0x1e92; # capital z with dot below
1078 $chars{0xd661}=0x1ea1; # small a with dot below
1079 $chars{0xd662}=0x1e05; # small b with dot below
1080 $chars{0xd664}=0x1e0d; # small d with dot below
1081 $chars{0xd665}=0x1eb9; # small e with dot below
1082 $chars{0xd668}=0x1e25; # small h with dot below
1083 $chars{0xd669}=0x1ecb; # small i with dot below
1084 $chars{0xd66b}=0x1e33; # small k with dot below
1085 $chars{0xd66c}=0x1e37; # small l with dot below
1086 $chars{0xd66d}=0x1e43; # small m with dot below
1087 $chars{0xd66e}=0x1e47; # small n with dot below
1088 $chars{0xd66f}=0x1ecd; # small o with dot below
1089 $chars{0xd672}=0x1e5b; # small r with dot below
1090 $chars{0xd673}=0x1e63; # small s with dot below
1091 $chars{0xd674}=0x1e6d; # small t with dot below
1092 $chars{0xd675}=0x1ee5; # small u with dot below
1093 $chars{0xd676}=0x1e7f; # small v with dot below
1094 $chars{0xd677}=0x1e89; # small w with dot below
1095 $chars{0xd679}=0x1ef5; # small y with dot below
1096 $chars{0xd67a}=0x1e93; # small z with dot below
1097         # 5/7 double dot below
1098 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1099 $chars{0xd775}=0x1e73; # small u with diaeresis below
1100         # 5/8 underline
1101 $chars{0xd820}=0x005f; # underline
1102         # 5/9 double underline
1103 $chars{0xd920}=0x2017; # double underline
1104         # 5/10 small low vertical bar
1105 $chars{0xda20}=0x02cc; #
1106         # 5/11 circumflex below
1107         # 5/12 (this position shall not be used)
1108         # 5/13 left half of ligature sign and of double tilde
1109         # 5/14 right half of ligature sign
1110         # 5/15 right half of double tilde
1111 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1112
1113 sub char_decode5426 {
1114     my ( $string) = @_;
1115     my $result;
1116
1117     my @data = unpack("C*", $string);
1118     my @characters;
1119     my $length=scalar(@data);
1120     for (my $i = 0; $i < scalar(@data); $i++) {
1121       my $char= $data[$i];
1122       if ($char >= 0x00 && $char <= 0x7F){
1123         #IsAscii
1124
1125           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1126       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1127         #Combined Char
1128         my $convchar ;
1129         if ($chars{$char*256+$data[$i+1]}) {
1130           $convchar= $chars{$char * 256 + $data[$i+1]};
1131           $i++;
1132 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1133         } elsif ($chars{$char})  {
1134           $convchar= $chars{$char};
1135 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1136         }else {
1137           $convchar=$char;
1138         }
1139         push @characters,$convchar;
1140       } else {
1141         my $convchar;
1142         if ($chars{$char})  {
1143           $convchar= $chars{$char};
1144 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1145         }else {
1146 #            printf "char %x $char\n",$char;
1147           $convchar=$char;
1148         }
1149         push @characters,$convchar;
1150       }
1151     }
1152     $result=pack "U*",@characters;
1153 #     $result=~s/\x01//;
1154 #     $result=~s/\x00//;
1155      $result=~s/\x0f//;
1156      $result=~s/\x1b.//;
1157      $result=~s/\x0e//;
1158      $result=~s/\x1b\x5b//;
1159 #   map{printf "%x",$_} @characters;
1160 #   printf "\n";
1161   return $result;
1162 }
1163
1164 1;
1165
1166
1167 =head1 AUTHOR
1168
1169 Koha Development Team <http://koha-community.org/>
1170
1171 Galen Charlton <galen.charlton@liblime.com>
1172
1173 =cut