C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use MARC::Charset qw/marc8_to_utf8/;
  22 use Text::Iconv;
  23
  24 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  25
  26 BEGIN {
  27     # set the version for version checking
  28     $VERSION = 3.01;
  29     require Exporter;
  30     @ISA    = qw(Exporter);
  31     @EXPORT = qw(
  32         IsStringUTF8ish
  33         MarcToUTF8Record
  34         SetMarcUnicodeFlag
  35         StripNonXmlChars
  36     );
  37 }
  38
  39 =head1 NAME
  40
  41 C4::Charset - utilities for handling character set conversions.
  42
  43 =head1 SYNOPSIS
  44
  45 use C4::Charset;
  46
  47 =head1 DESCRIPTION
  48
  49 This module contains routines for dealing with character set
  50 conversions, particularly for MARC records.
  51
  52 A variety of character encodings are in use by various MARC
  53 standards, and even more character encodings are used by
  54 non-standard MARC records.  The various MARC formats generally
  55 do not do a good job of advertising a given record's character
  56 encoding, and even when a record does advertise its encoding,
  57 e.g., via the Leader/09, experience has shown that one cannot
  58 trust it.
  59
  60 Ultimately, all MARC records are stored in Koha in UTF-8 and
  61 must be converted from whatever the source character encoding is.
  62 The goal of this module is to ensure that these conversions
  63 take place accurately.  When a character conversion cannot take
  64 place, or at least not accurately, the module was provide
  65 enough information to allow user-facing code to inform the user
  66 on how to deal with the situation.
  67
  68 =cut
  69
  70 =head1 FUNCTIONS
  71
  72 =head2 IsStringUTF8ish
  73
  74 =over 4
  75
  76 my $is_utf8 = IsStringUTF8ish($str);
  77
  78 =back
  79
  80 Determines if C<$str> is valid UTF-8.  This can mean
  81 one of two things:
  82
  83 =over 2
  84
  85 =item *
  86
  87 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  88
  89 =item *
  90
  91 The Perl UTF-8 flag is B<not> set, but the octets contain
  92 valid UTF-8.
  93
  94 =back
  95
  96 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  97 because in one could be presented with a MARC blob that is
  98 not actually in UTF-8 but whose sequence of octets appears to be
  99 valid UTF-8.  The rest of the MARC character conversion functions
 100 will assume that this situation occur does not very often.
 101
 102 =cut
 103
 104 sub IsStringUTF8ish {
 105     my $str = shift;
 106
 107     return 1 if utf8::is_utf8($str);
 108     return utf8::decode($str);
 109 }
 110
 111 =head2 MarcToUTF8Record
 112
 113 =over 4
 114
 115 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 116
 117 =back
 118
 119 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 120 optional source encoding, return a C<MARC::Record> that is
 121 converted to UTF-8.
 122
 123 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 124 is not guaranteed to have been converted correctly.  Specifically,
 125 if C<$converted_from> is 'failed', the MARC record returned failed
 126 character conversion and had each of its non-ASCII octets changed
 127 to the Unicode replacement character.
 128
 129 If the source encoding was not specified, this routine will
 130 try to guess it; the character encoding used for a successful
 131 conversion is returned in C<$converted_from>.
 132
 133 =cut
 134
 135 sub MarcToUTF8Record {
 136     my $marc = shift;
 137     my $marc_flavour = shift;
 138     my $source_encoding = shift;
 139
 140     my $marc_record;
 141     my $marc_blob_is_utf8 = 0;
 142     if (ref($marc) eq 'MARC::Record') {
 143         my $marc_blob = $marc->as_usmarc();
 144         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 145         $marc_record = $marc;
 146     } else {
 147         # dealing with a MARC blob
 148
 149         # remove any ersatz whitespace from the beginning and
 150         # end of the MARC blob -- these can creep into MARC
 151         # files produced by several sources -- caller really
 152         # should be doing this, however
 153         $marc =~ s/^\s+//;
 154         $marc =~ s/\s+$//;
 155         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 156         $marc_record = MARC::Record->new_from_usmarc($marc);
 157     }
 158
 159     # If we do not know the source encoding, try some guesses
 160     # as follows:
 161     #   1. Record is UTF-8 already.
 162     #   2. If MARC flavor is MARC21, then
 163     #      a. record is MARC-8
 164     #      b. record is ISO-8859-1
 165     #   3. If MARC flavor is UNIMARC, then
 166     if (not defined $source_encoding) {
 167         if ($marc_blob_is_utf8) {
 168             # note that for MARC21 we are not bothering to check
 169             # if the Leader/09 is set to 'a' or not -- because
 170             # of problems with various ILSs (including Koha in the
 171             # past, alas), this just is not trustworthy.
 172             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 173             return $marc_record, 'UTF-8', [];
 174         } else {
 175             if ($marc_flavour eq 'MARC21') {
 176                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 177             } elsif ($marc_flavour eq 'UNIMARC') {
 178                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 179             } else {
 180                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 181             }
 182         }
 183     } else {
 184         # caller knows the character encoding
 185         my $original_marc_record = $marc_record->clone();
 186         my @errors;
 187         if ($source_encoding =~ /utf-?8/i) {
 188             if ($marc_blob_is_utf8) {
 189                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 190                 return $marc_record, 'UTF-8', [];
 191             } else {
 192                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 193             }
 194         } elsif ($source_encoding =~ /marc-?8/i) {
 195             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 196         } elsif ($source_encoding =~ /5426/) {
 197             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 198         } else {
 199             # assume any other character encoding is for Text::Iconv
 200             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
 201         }
 202
 203         if (@errors) {
 204             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 205             return $original_marc_record, 'failed', \@errors;
 206         } else {
 207             return $marc_record, $source_encoding, [];
 208         }
 209     }
 210
 211 }
 212
 213 =head2 SetMarcUnicodeFlag
 214
 215 =over 4
 216
 217 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 218
 219 =back
 220
 221 Set both the internal MARC::Record encoding flag
 222 and the appropriate Leader/09 (MARC21) or
 223 100/26-29 (UNIMARC) to indicate that the record
 224 is in UTF-8.  Note that this does B<not> do
 225 any actual character conversion.
 226
 227 =cut
 228
 229 sub SetMarcUnicodeFlag {
 230     my $marc_record = shift;
 231     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 232
 233     $marc_record->encoding('UTF-8');
 234     if ($marc_flavour eq 'MARC21') {
 235         my $leader = $marc_record->leader();
 236         substr($leader, 9, 1) = 'a';
 237         $marc_record->leader($leader);
 238     } elsif ($marc_flavour eq "UNIMARC") {
 239         if (my $field = $marc_record->field('100')) {
 240             my $sfa = $field->subfield('a');
 241
 242             my $subflength = 36;
 243             # fix the length of the field
 244             $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
 245             $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
 246
 247             substr($sfa, 26, 4) = '50  ';
 248             $field->update('a' => $sfa);
 249         }
 250     } else {
 251         warn "Unrecognized marcflavour: $marc_flavour";
 252     }
 253 }
 254
 255 =head2 StripNonXmlChars
 256
 257 =over 4
 258
 259 my $new_str = StripNonXmlChars($old_str);
 260
 261 =back
 262
 263 Given a string, return a copy with the
 264 characters that are illegal in XML
 265 removed.
 266
 267 This function exists to work around a problem
 268 that can occur with badly-encoded MARC records.
 269 Specifically, if a UTF-8 MARC record also
 270 has excape (\x1b) characters, MARC::File::XML
 271 will let the escape characters pass through
 272 when as_xml() or as_xml_record() is called.  The
 273 problem is that the escape character is not
 274 legal in well-formed XML documents, so when
 275 MARC::File::XML attempts to parse such a record,
 276 the XML parser will fail.
 277
 278 Stripping such characters will allow a
 279 MARC::Record->new_from_xml()
 280 to work, at the possible risk of some data loss.
 281
 282 =cut
 283
 284 sub StripNonXmlChars {
 285     my $str = shift;
 286     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 287     return $str;
 288 }
 289
 290 =head1 INTERNAL FUNCTIONS
 291
 292 =head2 _default_marc21_charconv_to_utf8
 293
 294 =over 4
 295
 296 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 297
 298 =back
 299
 300 Converts a C<MARC::Record> of unknown character set to UTF-8,
 301 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 302 to UTF-8, then a default conversion that replaces each non-ASCII
 303 character with the replacement character.
 304
 305 The C<$guessed_charset> return value contains the character set
 306 that resulted in a conversion to valid UTF-8; note that
 307 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 308 this is 'failed'.
 309
 310 =cut
 311
 312 sub _default_marc21_charconv_to_utf8 {
 313     my $marc_record = shift;
 314     my $marc_flavour = shift;
 315
 316     my $trial_marc8 = $marc_record->clone();
 317     my @all_errors = ();
 318     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 319     unless (@errors) {
 320         return $trial_marc8, 'MARC-8', [];
 321     }
 322     push @all_errors, @errors;
 323
 324     my $trial_8859_1 = $marc_record->clone();
 325     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 326     unless (@errors) {
 327         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 328                                                 # instead if we wanted to report details
 329                                                 # of the failed attempt at MARC-8 => UTF-8
 330     }
 331     push @all_errors, @errors;
 332
 333     my $default_converted = $marc_record->clone();
 334     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 335     return $default_converted, 'failed', \@all_errors;
 336 }
 337
 338 =head2 _default_unimarc_charconv_to_utf8
 339
 340 =over 4
 341
 342 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 343
 344 =back
 345
 346 Converts a C<MARC::Record> of unknown character set to UTF-8,
 347 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 348 to UTF-8, then a default conversion that replaces each non-ASCII
 349 character with the replacement character.
 350
 351 The C<$guessed_charset> return value contains the character set
 352 that resulted in a conversion to valid UTF-8; note that
 353 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 354 this is 'failed'.
 355
 356 =cut
 357
 358 sub _default_unimarc_charconv_to_utf8 {
 359     my $marc_record = shift;
 360     my $marc_flavour = shift;
 361
 362     my $trial_marc8 = $marc_record->clone();
 363     my @all_errors = ();
 364     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 365     unless (@errors) {
 366         return $trial_marc8, 'iso-5426';
 367     }
 368     push @all_errors, @errors;
 369
 370     my $trial_8859_1 = $marc_record->clone();
 371     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 372     unless (@errors) {
 373         return $trial_8859_1, 'iso-8859-1';
 374     }
 375     push @all_errors, @errors;
 376
 377     my $default_converted = $marc_record->clone();
 378     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 379     return $default_converted, 'failed', \@all_errors;
 380 }
 381
 382 =head2 _marc_marc8_to_utf8
 383
 384 =over 4
 385
 386 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 387
 388 =back
 389
 390 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 391 If the conversion fails for some reason, an
 392 appropriate messages will be placed in the returned
 393 C<@errors> array.
 394
 395 =cut
 396
 397 sub _marc_marc8_to_utf8 {
 398     my $marc_record = shift;
 399     my $marc_flavour = shift;
 400
 401     my $prev_ignore = MARC::Charset->ignore_errors();
 402     MARC::Charset->ignore_errors(1);
 403
 404     # trap warnings raised by MARC::Charset
 405     my @errors = ();
 406     local $SIG{__WARN__} = sub {
 407         my $msg = $_[0];
 408         if ($msg =~ /MARC.Charset/) {
 409             # FIXME - purpose of this regexp is to strip out the
 410             # line reference to MARC/Charset.pm, but as it
 411             # exists probably won't work quite on Windows --
 412             # some sort of minimal-bunch back-tracking RE
 413             # would be helpful here
 414             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 415             push @errors, $msg;
 416         } else {
 417             # if warning doesn't come from MARC::Charset, just
 418             # pass it on
 419             warn $msg;
 420         }
 421     };
 422
 423     foreach my $field ($marc_record->fields()) {
 424         if ($field->is_control_field()) {
 425             ; # do nothing -- control fields should not contain non-ASCII characters
 426         } else {
 427             my @converted_subfields;
 428             foreach my $subfield ($field->subfields()) {
 429                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 430                 unless (IsStringUTF8ish($utf8sf)) {
 431                     # Because of a bug in MARC::Charset 0.98, if the string
 432                     # has (a) one or more diacritics that (b) are only in character positions
 433                     # 128 to 255 inclusive, the resulting converted string is not in
 434                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 435                     # occurs, upgrade the string in place.  Moral of the story seems to be
 436                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 437                     # that the resulting string is UTF-8.
 438                     utf8::upgrade($utf8sf);
 439                 }
 440                 push @converted_subfields, $subfield->[0], $utf8sf;
 441             }
 442
 443             $field->replace_with(MARC::Field->new(
 444                 $field->tag(), $field->indicator(1), $field->indicator(2),
 445                 @converted_subfields)
 446             );
 447         }
 448     }
 449
 450     MARC::Charset->ignore_errors($prev_ignore);
 451
 452     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 453
 454     return @errors;
 455 }
 456
 457 =head2 _marc_iso5426_to_utf8
 458
 459 =over 4
 460
 461 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 462
 463 =back
 464
 465 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 466 If the conversion fails for some reason, an
 467 appropriate messages will be placed in the returned
 468 C<@errors> array.
 469
 470 FIXME - is ISO-5426 equivalent enough to MARC-8
 471 that C<MARC::Charset> can be used instead?
 472
 473 =cut
 474
 475 sub _marc_iso5426_to_utf8 {
 476     my $marc_record = shift;
 477     my $marc_flavour = shift;
 478
 479     my @errors = ();
 480
 481     foreach my $field ($marc_record->fields()) {
 482         if ($field->is_control_field()) {
 483             ; # do nothing -- control fields should not contain non-ASCII characters
 484         } else {
 485             my @converted_subfields;
 486             foreach my $subfield ($field->subfields()) {
 487                 my $utf8sf = char_decode5426($subfield->[1]);
 488                 push @converted_subfields, $subfield->[0], $utf8sf;
 489             }
 490
 491             $field->replace_with(MARC::Field->new(
 492                 $field->tag(), $field->indicator(1), $field->indicator(2),
 493                 @converted_subfields)
 494             );
 495         }
 496     }
 497
 498     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 499
 500     return @errors;
 501 }
 502
 503 =head2 _marc_to_utf8_via_text_iconv
 504
 505 =over 4
 506
 507 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 508
 509 =back
 510
 511 Convert a C<MARC::Record> to UTF-8 in-place using the
 512 C<Text::Iconv> CPAN module.  Any source encoding accepted
 513 by the user's iconv installation should work.  If
 514 the source encoding is not recognized on the user's
 515 server or the conversion fails for some reason,
 516 appropriate messages will be placed in the returned
 517 C<@errors> array.
 518
 519 =cut
 520
 521 sub _marc_to_utf8_via_text_iconv {
 522     my $marc_record = shift;
 523     my $marc_flavour = shift;
 524     my $source_encoding = shift;
 525
 526     my @errors = ();
 527     my $decoder;
 528     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 529     if ($@) {
 530         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 531         return @errors;
 532     }
 533
 534     my $prev_raise_error = Text::Iconv->raise_error();
 535     Text::Iconv->raise_error(1);
 536
 537     foreach my $field ($marc_record->fields()) {
 538         if ($field->is_control_field()) {
 539             ; # do nothing -- control fields should not contain non-ASCII characters
 540         } else {
 541             my @converted_subfields;
 542             foreach my $subfield ($field->subfields()) {
 543                 my $converted_value;
 544                 my $conversion_ok = 1;
 545                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 546                 if ($@) {
 547                     $conversion_ok = 0;
 548                     push @errors, $@;
 549                 } elsif (not defined $converted_value) {
 550                     $conversion_ok = 0;
 551                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 552                 }
 553
 554                 if ($conversion_ok) {
 555                     push @converted_subfields, $subfield->[0], $converted_value;
 556                 } else {
 557                     $converted_value = $subfield->[1];
 558                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 559                     push @converted_subfields, $subfield->[0], $converted_value;
 560                 }
 561             }
 562
 563             $field->replace_with(MARC::Field->new(
 564                 $field->tag(), $field->indicator(1), $field->indicator(2),
 565                 @converted_subfields)
 566             );
 567         }
 568     }
 569
 570     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 571     Text::Iconv->raise_error($prev_raise_error);
 572
 573     return @errors;
 574 }
 575
 576 =head2 _marc_to_utf8_replacement_char
 577
 578 =over 4
 579
 580 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 581
 582 =back
 583
 584 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 585 unsatisfactory method of replacing all non-ASCII (e.g.,
 586 where the eight bit is set) octet with the Unicode
 587 replacement character.  This is meant as a last-ditch
 588 method, and would be best used as part of a UI that
 589 lets a cataloguer pick various character conversions
 590 until he or she finds the right one.
 591
 592 =cut
 593
 594 sub _marc_to_utf8_replacement_char {
 595     my $marc_record = shift;
 596     my $marc_flavour = shift;
 597
 598     foreach my $field ($marc_record->fields()) {
 599         if ($field->is_control_field()) {
 600             ; # do nothing -- control fields should not contain non-ASCII characters
 601         } else {
 602             my @converted_subfields;
 603             foreach my $subfield ($field->subfields()) {
 604                 my $value = $subfield->[1];
 605                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 606                 push @converted_subfields, $subfield->[0], $value;
 607             }
 608
 609             $field->replace_with(MARC::Field->new(
 610                 $field->tag(), $field->indicator(1), $field->indicator(2),
 611                 @converted_subfields)
 612             );
 613         }
 614     }
 615
 616     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 617 }
 618
 619 =head2 char_decode5426
 620
 621 =over 4
 622
 623 my $utf8string = char_decode5426($iso_5426_string);
 624
 625 =back
 626
 627 Converts a string from ISO-5426 to UTF-8.
 628
 629 =cut
 630
 631
 632 my %chars;
 633 $chars{0xb0}=0x0101;#3/0ayn[ain]
 634 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 635 #$chars{0xb2}=0x00e0;#'à';
 636 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 637 #$chars{0xb3}=0x00e7;#'ç';
 638 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 639 # $chars{0xb4}='è';
 640 $chars{0xb4}=0x00e8;
 641 # $chars{0xb5}='é';
 642 $chars{0xb5}=0x00e9;
 643 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 644 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 645 $chars{0xfa}=0x0153;#oe
 646 $chars{0x81d1}=0x00b0;
 647
 648 ####
 649 ## combined characters iso5426
 650
 651 $chars{0xc041}=0x1ea2; # capital a with hook above
 652 $chars{0xc045}=0x1eba; # capital e with hook above
 653 $chars{0xc049}=0x1ec8; # capital i with hook above
 654 $chars{0xc04f}=0x1ece; # capital o with hook above
 655 $chars{0xc055}=0x1ee6; # capital u with hook above
 656 $chars{0xc059}=0x1ef6; # capital y with hook above
 657 $chars{0xc061}=0x1ea3; # small a with hook above
 658 $chars{0xc065}=0x1ebb; # small e with hook above
 659 $chars{0xc069}=0x1ec9; # small i with hook above
 660 $chars{0xc06f}=0x1ecf; # small o with hook above
 661 $chars{0xc075}=0x1ee7; # small u with hook above
 662 $chars{0xc079}=0x1ef7; # small y with hook above
 663
 664         # 4/1 grave accent
 665 $chars{0xc141}=0x00c0; # capital a with grave accent
 666 $chars{0xc145}=0x00c8; # capital e with grave accent
 667 $chars{0xc149}=0x00cc; # capital i with grave accent
 668 $chars{0xc14f}=0x00d2; # capital o with grave accent
 669 $chars{0xc155}=0x00d9; # capital u with grave accent
 670 $chars{0xc157}=0x1e80; # capital w with grave
 671 $chars{0xc159}=0x1ef2; # capital y with grave
 672 $chars{0xc161}=0x00e0; # small a with grave accent
 673 $chars{0xc165}=0x00e8; # small e with grave accent
 674 $chars{0xc169}=0x00ec; # small i with grave accent
 675 $chars{0xc16f}=0x00f2; # small o with grave accent
 676 $chars{0xc175}=0x00f9; # small u with grave accent
 677 $chars{0xc177}=0x1e81; # small w with grave
 678 $chars{0xc179}=0x1ef3; # small y with grave
 679         # 4/2 acute accent
 680 $chars{0xc241}=0x00c1; # capital a with acute accent
 681 $chars{0xc243}=0x0106; # capital c with acute accent
 682 $chars{0xc245}=0x00c9; # capital e with acute accent
 683 $chars{0xc247}=0x01f4; # capital g with acute
 684 $chars{0xc249}=0x00cd; # capital i with acute accent
 685 $chars{0xc24b}=0x1e30; # capital k with acute
 686 $chars{0xc24c}=0x0139; # capital l with acute accent
 687 $chars{0xc24d}=0x1e3e; # capital m with acute
 688 $chars{0xc24e}=0x0143; # capital n with acute accent
 689 $chars{0xc24f}=0x00d3; # capital o with acute accent
 690 $chars{0xc250}=0x1e54; # capital p with acute
 691 $chars{0xc252}=0x0154; # capital r with acute accent
 692 $chars{0xc253}=0x015a; # capital s with acute accent
 693 $chars{0xc255}=0x00da; # capital u with acute accent
 694 $chars{0xc257}=0x1e82; # capital w with acute
 695 $chars{0xc259}=0x00dd; # capital y with acute accent
 696 $chars{0xc25a}=0x0179; # capital z with acute accent
 697 $chars{0xc261}=0x00e1; # small a with acute accent
 698 $chars{0xc263}=0x0107; # small c with acute accent
 699 $chars{0xc265}=0x00e9; # small e with acute accent
 700 $chars{0xc267}=0x01f5; # small g with acute
 701 $chars{0xc269}=0x00ed; # small i with acute accent
 702 $chars{0xc26b}=0x1e31; # small k with acute
 703 $chars{0xc26c}=0x013a; # small l with acute accent
 704 $chars{0xc26d}=0x1e3f; # small m with acute
 705 $chars{0xc26e}=0x0144; # small n with acute accent
 706 $chars{0xc26f}=0x00f3; # small o with acute accent
 707 $chars{0xc270}=0x1e55; # small p with acute
 708 $chars{0xc272}=0x0155; # small r with acute accent
 709 $chars{0xc273}=0x015b; # small s with acute accent
 710 $chars{0xc275}=0x00fa; # small u with acute accent
 711 $chars{0xc277}=0x1e83; # small w with acute
 712 $chars{0xc279}=0x00fd; # small y with acute accent
 713 $chars{0xc27a}=0x017a; # small z with acute accent
 714 $chars{0xc2e1}=0x01fc; # capital ae with acute
 715 $chars{0xc2f1}=0x01fd; # small ae with acute
 716        # 4/3 circumflex accent
 717 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 718 $chars{0xc343}=0x0108; # capital c with circumflex
 719 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 720 $chars{0xc347}=0x011c; # capital g with circumflex
 721 $chars{0xc348}=0x0124; # capital h with circumflex
 722 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 723 $chars{0xc34a}=0x0134; # capital j with circumflex
 724 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 725 $chars{0xc353}=0x015c; # capital s with circumflex
 726 $chars{0xc355}=0x00db; # capital u with circumflex
 727 $chars{0xc357}=0x0174; # capital w with circumflex
 728 $chars{0xc359}=0x0176; # capital y with circumflex
 729 $chars{0xc35a}=0x1e90; # capital z with circumflex
 730 $chars{0xc361}=0x00e2; # small a with circumflex accent
 731 $chars{0xc363}=0x0109; # small c with circumflex
 732 $chars{0xc365}=0x00ea; # small e with circumflex accent
 733 $chars{0xc367}=0x011d; # small g with circumflex
 734 $chars{0xc368}=0x0125; # small h with circumflex
 735 $chars{0xc369}=0x00ee; # small i with circumflex accent
 736 $chars{0xc36a}=0x0135; # small j with circumflex
 737 $chars{0xc36e}=0x00f1; # small n with tilde
 738 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 739 $chars{0xc373}=0x015d; # small s with circumflex
 740 $chars{0xc375}=0x00fb; # small u with circumflex
 741 $chars{0xc377}=0x0175; # small w with circumflex
 742 $chars{0xc379}=0x0177; # small y with circumflex
 743 $chars{0xc37a}=0x1e91; # small z with circumflex
 744         # 4/4 tilde
 745 $chars{0xc441}=0x00c3; # capital a with tilde
 746 $chars{0xc445}=0x1ebc; # capital e with tilde
 747 $chars{0xc449}=0x0128; # capital i with tilde
 748 $chars{0xc44e}=0x00d1; # capital n with tilde
 749 $chars{0xc44f}=0x00d5; # capital o with tilde
 750 $chars{0xc455}=0x0168; # capital u with tilde
 751 $chars{0xc456}=0x1e7c; # capital v with tilde
 752 $chars{0xc459}=0x1ef8; # capital y with tilde
 753 $chars{0xc461}=0x00e3; # small a with tilde
 754 $chars{0xc465}=0x1ebd; # small e with tilde
 755 $chars{0xc469}=0x0129; # small i with tilde
 756 $chars{0xc46e}=0x00f1; # small n with tilde
 757 $chars{0xc46f}=0x00f5; # small o with tilde
 758 $chars{0xc475}=0x0169; # small u with tilde
 759 $chars{0xc476}=0x1e7d; # small v with tilde
 760 $chars{0xc479}=0x1ef9; # small y with tilde
 761     # 4/5 macron
 762 $chars{0xc541}=0x0100; # capital a with macron
 763 $chars{0xc545}=0x0112; # capital e with macron
 764 $chars{0xc547}=0x1e20; # capital g with macron
 765 $chars{0xc549}=0x012a; # capital i with macron
 766 $chars{0xc54f}=0x014c; # capital o with macron
 767 $chars{0xc555}=0x016a; # capital u with macron
 768 $chars{0xc561}=0x0101; # small a with macron
 769 $chars{0xc565}=0x0113; # small e with macron
 770 $chars{0xc567}=0x1e21; # small g with macron
 771 $chars{0xc569}=0x012b; # small i with macron
 772 $chars{0xc56f}=0x014d; # small o with macron
 773 $chars{0xc575}=0x016b; # small u with macron
 774 $chars{0xc572}=0x0159; # small r with macron
 775 $chars{0xc5e1}=0x01e2; # capital ae with macron
 776 $chars{0xc5f1}=0x01e3; # small ae with macron
 777         # 4/6 breve
 778 $chars{0xc641}=0x0102; # capital a with breve
 779 $chars{0xc645}=0x0114; # capital e with breve
 780 $chars{0xc647}=0x011e; # capital g with breve
 781 $chars{0xc649}=0x012c; # capital i with breve
 782 $chars{0xc64f}=0x014e; # capital o with breve
 783 $chars{0xc655}=0x016c; # capital u with breve
 784 $chars{0xc661}=0x0103; # small a with breve
 785 $chars{0xc665}=0x0115; # small e with breve
 786 $chars{0xc667}=0x011f; # small g with breve
 787 $chars{0xc669}=0x012d; # small i with breve
 788 $chars{0xc66f}=0x014f; # small o with breve
 789 $chars{0xc675}=0x016d; # small u with breve
 790         # 4/7 dot above
 791 $chars{0xc7b0}=0x01e1; # Ain with dot above
 792 $chars{0xc742}=0x1e02; # capital b with dot above
 793 $chars{0xc743}=0x010a; # capital c with dot above
 794 $chars{0xc744}=0x1e0a; # capital d with dot above
 795 $chars{0xc745}=0x0116; # capital e with dot above
 796 $chars{0xc746}=0x1e1e; # capital f with dot above
 797 $chars{0xc747}=0x0120; # capital g with dot above
 798 $chars{0xc748}=0x1e22; # capital h with dot above
 799 $chars{0xc749}=0x0130; # capital i with dot above
 800 $chars{0xc74d}=0x1e40; # capital m with dot above
 801 $chars{0xc74e}=0x1e44; # capital n with dot above
 802 $chars{0xc750}=0x1e56; # capital p with dot above
 803 $chars{0xc752}=0x1e58; # capital r with dot above
 804 $chars{0xc753}=0x1e60; # capital s with dot above
 805 $chars{0xc754}=0x1e6a; # capital t with dot above
 806 $chars{0xc757}=0x1e86; # capital w with dot above
 807 $chars{0xc758}=0x1e8a; # capital x with dot above
 808 $chars{0xc759}=0x1e8e; # capital y with dot above
 809 $chars{0xc75a}=0x017b; # capital z with dot above
 810 $chars{0xc761}=0x0227; # small b with dot above
 811 $chars{0xc762}=0x1e03; # small b with dot above
 812 $chars{0xc763}=0x010b; # small c with dot above
 813 $chars{0xc764}=0x1e0b; # small d with dot above
 814 $chars{0xc765}=0x0117; # small e with dot above
 815 $chars{0xc766}=0x1e1f; # small f with dot above
 816 $chars{0xc767}=0x0121; # small g with dot above
 817 $chars{0xc768}=0x1e23; # small h with dot above
 818 $chars{0xc76d}=0x1e41; # small m with dot above
 819 $chars{0xc76e}=0x1e45; # small n with dot above
 820 $chars{0xc770}=0x1e57; # small p with dot above
 821 $chars{0xc772}=0x1e59; # small r with dot above
 822 $chars{0xc773}=0x1e61; # small s with dot above
 823 $chars{0xc774}=0x1e6b; # small t with dot above
 824 $chars{0xc777}=0x1e87; # small w with dot above
 825 $chars{0xc778}=0x1e8b; # small x with dot above
 826 $chars{0xc779}=0x1e8f; # small y with dot above
 827 $chars{0xc77a}=0x017c; # small z with dot above
 828         # 4/8 trema, diaresis
 829 $chars{0xc820}=0x00a8; # diaeresis
 830 $chars{0xc841}=0x00c4; # capital a with diaeresis
 831 $chars{0xc845}=0x00cb; # capital e with diaeresis
 832 $chars{0xc848}=0x1e26; # capital h with diaeresis
 833 $chars{0xc849}=0x00cf; # capital i with diaeresis
 834 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 835 $chars{0xc855}=0x00dc; # capital u with diaeresis
 836 $chars{0xc857}=0x1e84; # capital w with diaeresis
 837 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 838 $chars{0xc859}=0x0178; # capital y with diaeresis
 839 $chars{0xc861}=0x00e4; # small a with diaeresis
 840 $chars{0xc865}=0x00eb; # small e with diaeresis
 841 $chars{0xc868}=0x1e27; # small h with diaeresis
 842 $chars{0xc869}=0x00ef; # small i with diaeresis
 843 $chars{0xc86f}=0x00f6; # small o with diaeresis
 844 $chars{0xc874}=0x1e97; # small t with diaeresis
 845 $chars{0xc875}=0x00fc; # small u with diaeresis
 846 $chars{0xc877}=0x1e85; # small w with diaeresis
 847 $chars{0xc878}=0x1e8d; # small x with diaeresis
 848 $chars{0xc879}=0x00ff; # small y with diaeresis
 849         # 4/9 umlaut
 850 $chars{0xc920}=0x00a8; # [diaeresis]
 851 $chars{0xc961}=0x00e4; # a with umlaut
 852 $chars{0xc965}=0x00eb; # e with umlaut
 853 $chars{0xc969}=0x00ef; # i with umlaut
 854 $chars{0xc96f}=0x00f6; # o with umlaut
 855 $chars{0xc975}=0x00fc; # u with umlaut
 856         # 4/10 circle above
 857 $chars{0xca41}=0x00c5; # capital a with ring above
 858 $chars{0xcaad}=0x016e; # capital u with ring above
 859 $chars{0xca61}=0x00e5; # small a with ring above
 860 $chars{0xca75}=0x016f; # small u with ring above
 861 $chars{0xca77}=0x1e98; # small w with ring above
 862 $chars{0xca79}=0x1e99; # small y with ring above
 863         # 4/11 high comma off centre
 864         # 4/12 inverted high comma centred
 865         # 4/13 double acute accent
 866 $chars{0xcd4f}=0x0150; # capital o with double acute
 867 $chars{0xcd55}=0x0170; # capital u with double acute
 868 $chars{0xcd6f}=0x0151; # small o with double acute
 869 $chars{0xcd75}=0x0171; # small u with double acute
 870         # 4/14 horn
 871 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 872 $chars{0xce55}=0x01af; # latin capital letter u with horn
 873 $chars{0xce74}=0x01a1; # latin small letter o with horn
 874 $chars{0xce75}=0x01b0; # latin small letter u with horn
 875         # 4/15 caron (hacek
 876 $chars{0xcf41}=0x01cd; # capital a with caron
 877 $chars{0xcf43}=0x010c; # capital c with caron
 878 $chars{0xcf44}=0x010e; # capital d with caron
 879 $chars{0xcf45}=0x011a; # capital e with caron
 880 $chars{0xcf47}=0x01e6; # capital g with caron
 881 $chars{0xcf49}=0x01cf; # capital i with caron
 882 $chars{0xcf4b}=0x01e8; # capital k with caron
 883 $chars{0xcf4c}=0x013d; # capital l with caron
 884 $chars{0xcf4e}=0x0147; # capital n with caron
 885 $chars{0xcf4f}=0x01d1; # capital o with caron
 886 $chars{0xcf52}=0x0158; # capital r with caron
 887 $chars{0xcf53}=0x0160; # capital s with caron
 888 $chars{0xcf54}=0x0164; # capital t with caron
 889 $chars{0xcf55}=0x01d3; # capital u with caron
 890 $chars{0xcf5a}=0x017d; # capital z with caron
 891 $chars{0xcf61}=0x01ce; # small a with caron
 892 $chars{0xcf63}=0x010d; # small c with caron
 893 $chars{0xcf64}=0x010f; # small d with caron
 894 $chars{0xcf65}=0x011b; # small e with caron
 895 $chars{0xcf67}=0x01e7; # small g with caron
 896 $chars{0xcf69}=0x01d0; # small i with caron
 897 $chars{0xcf6a}=0x01f0; # small j with caron
 898 $chars{0xcf6b}=0x01e9; # small k with caron
 899 $chars{0xcf6c}=0x013e; # small l with caron
 900 $chars{0xcf6e}=0x0148; # small n with caron
 901 $chars{0xcf6f}=0x01d2; # small o with caron
 902 $chars{0xcf72}=0x0159; # small r with caron
 903 $chars{0xcf73}=0x0161; # small s with caron
 904 $chars{0xcf74}=0x0165; # small t with caron
 905 $chars{0xcf75}=0x01d4; # small u with caron
 906 $chars{0xcf7a}=0x017e; # small z with caron
 907         # 5/0 cedilla
 908 $chars{0xd020}=0x00b8; # cedilla
 909 $chars{0xd043}=0x00c7; # capital c with cedilla
 910 $chars{0xd044}=0x1e10; # capital d with cedilla
 911 $chars{0xd047}=0x0122; # capital g with cedilla
 912 $chars{0xd048}=0x1e28; # capital h with cedilla
 913 $chars{0xd04b}=0x0136; # capital k with cedilla
 914 $chars{0xd04c}=0x013b; # capital l with cedilla
 915 $chars{0xd04e}=0x0145; # capital n with cedilla
 916 $chars{0xd052}=0x0156; # capital r with cedilla
 917 $chars{0xd053}=0x015e; # capital s with cedilla
 918 $chars{0xd054}=0x0162; # capital t with cedilla
 919 $chars{0xd063}=0x00e7; # small c with cedilla
 920 $chars{0xd064}=0x1e11; # small d with cedilla
 921 $chars{0xd065}=0x0119; # small e with cedilla
 922 $chars{0xd067}=0x0123; # small g with cedilla
 923 $chars{0xd068}=0x1e29; # small h with cedilla
 924 $chars{0xd06b}=0x0137; # small k with cedilla
 925 $chars{0xd06c}=0x013c; # small l with cedilla
 926 $chars{0xd06e}=0x0146; # small n with cedilla
 927 $chars{0xd072}=0x0157; # small r with cedilla
 928 $chars{0xd073}=0x015f; # small s with cedilla
 929 $chars{0xd074}=0x0163; # small t with cedilla
 930         # 5/1 rude
 931         # 5/2 hook to left
 932         # 5/3 ogonek (hook to right
 933 $chars{0xd320}=0x02db; # ogonek
 934 $chars{0xd341}=0x0104; # capital a with ogonek
 935 $chars{0xd345}=0x0118; # capital e with ogonek
 936 $chars{0xd349}=0x012e; # capital i with ogonek
 937 $chars{0xd34f}=0x01ea; # capital o with ogonek
 938 $chars{0xd355}=0x0172; # capital u with ogonek
 939 $chars{0xd361}=0x0105; # small a with ogonek
 940 $chars{0xd365}=0x0119; # small e with ogonek
 941 $chars{0xd369}=0x012f; # small i with ogonek
 942 $chars{0xd36f}=0x01eb; # small o with ogonek
 943 $chars{0xd375}=0x0173; # small u with ogonek
 944         # 5/4 circle below
 945 $chars{0xd441}=0x1e00; # capital a with ring below
 946 $chars{0xd461}=0x1e01; # small a with ring below
 947         # 5/5 half circle below
 948 $chars{0xf948}=0x1e2a; # capital h with breve below
 949 $chars{0xf968}=0x1e2b; # small h with breve below
 950         # 5/6 dot below
 951 $chars{0xd641}=0x1ea0; # capital a with dot below
 952 $chars{0xd642}=0x1e04; # capital b with dot below
 953 $chars{0xd644}=0x1e0c; # capital d with dot below
 954 $chars{0xd645}=0x1eb8; # capital e with dot below
 955 $chars{0xd648}=0x1e24; # capital h with dot below
 956 $chars{0xd649}=0x1eca; # capital i with dot below
 957 $chars{0xd64b}=0x1e32; # capital k with dot below
 958 $chars{0xd64c}=0x1e36; # capital l with dot below
 959 $chars{0xd64d}=0x1e42; # capital m with dot below
 960 $chars{0xd64e}=0x1e46; # capital n with dot below
 961 $chars{0xd64f}=0x1ecc; # capital o with dot below
 962 $chars{0xd652}=0x1e5a; # capital r with dot below
 963 $chars{0xd653}=0x1e62; # capital s with dot below
 964 $chars{0xd654}=0x1e6c; # capital t with dot below
 965 $chars{0xd655}=0x1ee4; # capital u with dot below
 966 $chars{0xd656}=0x1e7e; # capital v with dot below
 967 $chars{0xd657}=0x1e88; # capital w with dot below
 968 $chars{0xd659}=0x1ef4; # capital y with dot below
 969 $chars{0xd65a}=0x1e92; # capital z with dot below
 970 $chars{0xd661}=0x1ea1; # small a with dot below
 971 $chars{0xd662}=0x1e05; # small b with dot below
 972 $chars{0xd664}=0x1e0d; # small d with dot below
 973 $chars{0xd665}=0x1eb9; # small e with dot below
 974 $chars{0xd668}=0x1e25; # small h with dot below
 975 $chars{0xd669}=0x1ecb; # small i with dot below
 976 $chars{0xd66b}=0x1e33; # small k with dot below
 977 $chars{0xd66c}=0x1e37; # small l with dot below
 978 $chars{0xd66d}=0x1e43; # small m with dot below
 979 $chars{0xd66e}=0x1e47; # small n with dot below
 980 $chars{0xd66f}=0x1ecd; # small o with dot below
 981 $chars{0xd672}=0x1e5b; # small r with dot below
 982 $chars{0xd673}=0x1e63; # small s with dot below
 983 $chars{0xd674}=0x1e6d; # small t with dot below
 984 $chars{0xd675}=0x1ee5; # small u with dot below
 985 $chars{0xd676}=0x1e7f; # small v with dot below
 986 $chars{0xd677}=0x1e89; # small w with dot below
 987 $chars{0xd679}=0x1ef5; # small y with dot below
 988 $chars{0xd67a}=0x1e93; # small z with dot below
 989         # 5/7 double dot below
 990 $chars{0xd755}=0x1e72; # capital u with diaeresis below
 991 $chars{0xd775}=0x1e73; # small u with diaeresis below
 992         # 5/8 underline
 993 $chars{0xd820}=0x005f; # underline
 994         # 5/9 double underline
 995 $chars{0xd920}=0x2017; # double underline
 996         # 5/10 small low vertical bar
 997 $chars{0xda20}=0x02cc; #
 998         # 5/11 circumflex below
 999         # 5/12 (this position shall not be used)
1000         # 5/13 left half of ligature sign and of double tilde
1001         # 5/14 right half of ligature sign
1002         # 5/15 right half of double tilde
1003 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1004
1005 sub char_decode5426 {
1006     my ( $string) = @_;
1007     my $result;
1008
1009     my @data = unpack("C*", $string);
1010     my @characters;
1011     my $length=scalar(@data);
1012     for (my $i = 0; $i < scalar(@data); $i++) {
1013       my $char= $data[$i];
1014       if ($char >= 0x00 && $char <= 0x7F){
1015         #IsAscii
1016
1017           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1018       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1019         #Combined Char
1020         my $convchar ;
1021         if ($chars{$char*256+$data[$i+1]}) {
1022           $convchar= $chars{$char * 256 + $data[$i+1]};
1023           $i++;
1024 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1025         } elsif ($chars{$char})  {
1026           $convchar= $chars{$char};
1027 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1028         }else {
1029           $convchar=$char;
1030         }
1031         push @characters,$convchar;
1032       } else {
1033         my $convchar;
1034         if ($chars{$char})  {
1035           $convchar= $chars{$char};
1036 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1037         }else {
1038 #            printf "char %x $char\n",$char;
1039           $convchar=$char;
1040         }
1041         push @characters,$convchar;
1042       }
1043     }
1044     $result=pack "U*",@characters;
1045 #     $result=~s/\x01//;
1046 #     $result=~s/\x00//;
1047      $result=~s/\x0f//;
1048      $result=~s/\x1b.//;
1049      $result=~s/\x0e//;
1050      $result=~s/\x1b\x5b//;
1051 #   map{printf "%x",$_} @characters;
1052 #   printf "\n";
1053   return $result;
1054 }
1055
1056 1;
1057
1058
1059 =head1 AUTHOR
1060
1061 Koha Development Team <info@koha.org>
1062
1063 Galen Charlton <galen.charlton@liblime.com>
1064
1065 =cut