C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use MARC::Charset qw/marc8_to_utf8/;
  22 use Text::Iconv;
  23
  24 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  25
  26 BEGIN {
  27     # set the version for version checking
  28     $VERSION = 3.00;
  29     require Exporter;
  30     @ISA    = qw(Exporter);
  31     @EXPORT = qw(
  32         IsStringUTF8ish
  33         MarcToUTF8Record
  34         SetMarcUnicodeFlag
  35         StripNonXmlChars
  36     );
  37 }
  38
  39 =head1 NAME
  40
  41 C4::Charset - utilities for handling character set conversions.
  42
  43 =head1 SYNOPSIS
  44
  45 use C4::Charset;
  46
  47 =head1 DESCRIPTION
  48
  49 This module contains routines for dealing with character set
  50 conversions, particularly for MARC records.
  51
  52 A variety of character encodings are in use by various MARC
  53 standards, and even more character encodings are used by
  54 non-standard MARC records.  The various MARC formats generally
  55 do not do a good job of advertising a given record's character
  56 encoding, and even when a record does advertise its encoding,
  57 e.g., via the Leader/09, experience has shown that one cannot
  58 trust it.
  59
  60 Ultimately, all MARC records are stored in Koha in UTF-8 and
  61 must be converted from whatever the source character encoding is.
  62 The goal of this module is to ensure that these conversions
  63 take place accurately.  When a character conversion cannot take
  64 place, or at least not accurately, the module was provide
  65 enough information to allow user-facing code to inform the user
  66 on how to deal with the situation.
  67
  68 =cut
  69
  70 =head1 FUNCTIONS
  71
  72 =head2 IsStringUTF8ish
  73
  74 =over 4
  75
  76 my $is_utf8 = IsStringUTF8ish($str);
  77
  78 =back
  79
  80 Determines if C<$str> is valid UTF-8.  This can mean
  81 one of two things:
  82
  83 =over 2
  84
  85 =item *
  86
  87 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  88
  89 =item *
  90
  91 The Perl UTF-8 flag is B<not> set, but the octets contain
  92 valid UTF-8.
  93
  94 =back
  95
  96 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  97 because in one could be presented with a MARC blob that is
  98 not actually in UTF-8 but whose sequence of octets appears to be
  99 valid UTF-8.  The rest of the MARC character conversion functions
 100 will assume that this situation occur does not very often.
 101
 102 =cut
 103
 104 sub IsStringUTF8ish {
 105     my $str = shift;
 106
 107     return 1 if utf8::is_utf8($str);
 108     return utf8::decode($str);
 109 }
 110
 111 =head2 MarcToUTF8Record
 112
 113 =over 4
 114
 115 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 116
 117 =back
 118
 119 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 120 optional source encoding, return a C<MARC::Record> that is
 121 converted to UTF-8.
 122
 123 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 124 is not guaranteed to have been converted correctly.  Specifically,
 125 if C<$converted_from> is 'failed', the MARC record returned failed
 126 character conversion and had each of its non-ASCII octets changed
 127 to the Unicode replacement character.
 128
 129 If the source encoding was not specified, this routine will
 130 try to guess it; the character encoding used for a successful
 131 conversion is returned in C<$converted_from>.
 132
 133 =cut
 134
 135 sub MarcToUTF8Record {
 136     my $marc = shift;
 137     my $marc_flavour = shift;
 138     my $source_encoding = shift;
 139
 140     my $marc_record;
 141     my $marc_blob_is_utf8 = 0;
 142     if (ref($marc) eq 'MARC::Record') {
 143         my $marc_blob = $marc->as_usmarc();
 144         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 145         $marc_record = $marc;
 146     } else {
 147         # dealing with a MARC blob
 148
 149         # remove any ersatz whitespace from the beginning and
 150         # end of the MARC blob -- these can creep into MARC
 151         # files produced by several sources -- caller really
 152         # should be doing this, however
 153         $marc =~ s/^\s+//;
 154         $marc =~ s/\s+$//;
 155         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 156         $marc_record = MARC::Record->new_from_usmarc($marc);
 157     }
 158
 159     # If we do not know the source encoding, try some guesses
 160     # as follows:
 161     #   1. Record is UTF-8 already.
 162     #   2. If MARC flavor is MARC21, then
 163     #      a. record is MARC-8
 164     #      b. record is ISO-8859-1
 165     #   3. If MARC flavor is UNIMARC, then
 166     if (not defined $source_encoding) {
 167         if ($marc_blob_is_utf8) {
 168             # note that for MARC21 we are not bothering to check
 169             # if the Leader/09 is set to 'a' or not -- because
 170             # of problems with various ILSs (including Koha in the
 171             # past, alas), this just is not trustworthy.
 172             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 173             return $marc_record, 'UTF-8', [];
 174         } else {
 175             if ($marc_flavour eq 'MARC21') {
 176                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 177             } elsif ($marc_flavour eq 'UNIMARC') {
 178                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 179             } else {
 180                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 181             }
 182         }
 183     } else {
 184         # caller knows the character encoding
 185         my $original_marc_record = $marc_record->clone();
 186         my @errors;
 187         if ($source_encoding =~ /utf-?8/i) {
 188             if ($marc_blob_is_utf8) {
 189                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 190                 return $marc_record, 'UTF-8', [];
 191             } else {
 192                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 193             }
 194         } elsif ($source_encoding =~ /marc-?8/i) {
 195             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 196         } elsif ($source_encoding =~ /5426/) {
 197             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 198         } else {
 199             # assume any other character encoding is for Text::Iconv
 200             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
 201         }
 202
 203         if (@errors) {
 204             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 205             return $original_marc_record, 'failed', \@errors;
 206         } else {
 207             return $marc_record, $source_encoding, [];
 208         }
 209     }
 210
 211 }
 212
 213 =head2 SetMarcUnicodeFlag
 214
 215 =over 4
 216
 217 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 218
 219 =back
 220
 221 Set both the internal MARC::Record encoding flag
 222 and the appropriate Leader/09 (MARC21) or
 223 100/26-29 (UNIMARC) to indicate that the record
 224 is in UTF-8.  Note that this does B<not> do
 225 any actual character conversion.
 226
 227 =cut
 228
 229 sub SetMarcUnicodeFlag {
 230     my $marc_record = shift;
 231     my $marc_flavour = shift;
 232
 233     $marc_record->encoding('UTF-8');
 234     if ($marc_flavour eq 'MARC21') {
 235         my $leader = $marc_record->leader();
 236         substr($leader, 9, 1) = 'a';
 237         $marc_record->leader($leader);
 238     } elsif ($marc_flavour eq "UNIMARC") {
 239         if (my $field = $marc_record->field('100')) {
 240             my $sfa = $field->subfield('a');
 241             substr($sfa, 26, 4) = '5050';
 242             $field->update('a' => $sfa);
 243         }
 244     }
 245 }
 246
 247 =head2 StripNonXmlChars
 248
 249 =over 4
 250
 251 my $new_str = StripNonXmlChars($old_str);
 252
 253 =back
 254
 255 Given a string, return a copy with the
 256 characters that are illegal in XML
 257 removed.
 258
 259 This function exists to work around a problem
 260 that can occur with badly-encoded MARC records.
 261 Specifically, if a UTF-8 MARC record also
 262 has excape (\x1b) characters, MARC::File::XML
 263 will let the escape characters pass through
 264 when as_xml() or as_xml_record() is called.  The
 265 problem is that the escape character is not
 266 legal in well-formed XML documents, so when
 267 MARC::File::XML attempts to parse such a record,
 268 the XML parser will fail.
 269
 270 Stripping such characters will allow a
 271 MARC::Record->new_from_xml()
 272 to work, at the possible risk of some data loss.
 273
 274 =cut
 275
 276 sub StripNonXmlChars {
 277     my $str = shift;
 278     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 279     return $str;
 280 }
 281
 282 =head1 INTERNAL FUNCTIONS
 283
 284 =head2 _default_marc21_charconv_to_utf8
 285
 286 =over 4
 287
 288 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 289
 290 =back
 291
 292 Converts a C<MARC::Record> of unknown character set to UTF-8,
 293 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 294 to UTF-8, then a default conversion that replaces each non-ASCII
 295 character with the replacement character.
 296
 297 The C<$guessed_charset> return value contains the character set
 298 that resulted in a conversion to valid UTF-8; note that
 299 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 300 this is 'failed'.
 301
 302 =cut
 303
 304 sub _default_marc21_charconv_to_utf8 {
 305     my $marc_record = shift;
 306     my $marc_flavour = shift;
 307
 308     my $trial_marc8 = $marc_record->clone();
 309     my @all_errors = ();
 310     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 311     unless (@errors) {
 312         return $trial_marc8, 'MARC-8', [];
 313     }
 314     push @all_errors, @errors;
 315
 316     my $trial_8859_1 = $marc_record->clone();
 317     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 318     unless (@errors) {
 319         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 320                                                 # instead if we wanted to report details
 321                                                 # of the failed attempt at MARC-8 => UTF-8
 322     }
 323     push @all_errors, @errors;
 324
 325     my $default_converted = $marc_record->clone();
 326     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 327     return $default_converted, 'failed', \@all_errors;
 328 }
 329
 330 =head2 _default_unimarc_charconv_to_utf8
 331
 332 =over 4
 333
 334 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 335
 336 =back
 337
 338 Converts a C<MARC::Record> of unknown character set to UTF-8,
 339 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 340 to UTF-8, then a default conversion that replaces each non-ASCII
 341 character with the replacement character.
 342
 343 The C<$guessed_charset> return value contains the character set
 344 that resulted in a conversion to valid UTF-8; note that
 345 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 346 this is 'failed'.
 347
 348 =cut
 349
 350 sub _default_unimarc_charconv_to_utf8 {
 351     my $marc_record = shift;
 352     my $marc_flavour = shift;
 353
 354     my $trial_marc8 = $marc_record->clone();
 355     my @all_errors = ();
 356     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 357     unless (@errors) {
 358         return $trial_marc8, 'iso-5426';
 359     }
 360     push @all_errors, @errors;
 361
 362     my $trial_8859_1 = $marc_record->clone();
 363     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 364     unless (@errors) {
 365         return $trial_8859_1, 'iso-8859-1';
 366     }
 367     push @all_errors, @errors;
 368
 369     my $default_converted = $marc_record->clone();
 370     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 371     return $default_converted, 'failed', \@all_errors;
 372 }
 373
 374 =head2 _marc_marc8_to_utf8
 375
 376 =over 4
 377
 378 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 379
 380 =back
 381
 382 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 383 If the conversion fails for some reason, an
 384 appropriate messages will be placed in the returned
 385 C<@errors> array.
 386
 387 =cut
 388
 389 sub _marc_marc8_to_utf8 {
 390     my $marc_record = shift;
 391     my $marc_flavour = shift;
 392
 393     my $prev_ignore = MARC::Charset->ignore_errors();
 394     MARC::Charset->ignore_errors(1);
 395
 396     # trap warnings raised by MARC::Charset
 397     my @errors = ();
 398     local $SIG{__WARN__} = sub {
 399         my $msg = $_[0];
 400         if ($msg =~ /MARC.Charset/) {
 401             # FIXME - purpose of this regexp is to strip out the
 402             # line reference to MARC/Charset.pm, but as it
 403             # exists probably won't work quite on Windows --
 404             # some sort of minimal-bunch back-tracking RE
 405             # would be helpful here
 406             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 407             push @errors, $msg;
 408         } else {
 409             # if warning doesn't come from MARC::Charset, just
 410             # pass it on
 411             warn $msg;
 412         }
 413     };
 414
 415     foreach my $field ($marc_record->fields()) {
 416         if ($field->is_control_field()) {
 417             ; # do nothing -- control fields should not contain non-ASCII characters
 418         } else {
 419             my @converted_subfields;
 420             foreach my $subfield ($field->subfields()) {
 421                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 422                 push @converted_subfields, $subfield->[0], $utf8sf;
 423             }
 424
 425             $field->replace_with(MARC::Field->new(
 426                 $field->tag(), $field->indicator(1), $field->indicator(2),
 427                 @converted_subfields)
 428             );
 429         }
 430     }
 431
 432     MARC::Charset->ignore_errors($prev_ignore);
 433
 434     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 435
 436     return @errors;
 437 }
 438
 439 =head2 _marc_iso5426_to_utf8
 440
 441 =over 4
 442
 443 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 444
 445 =back
 446
 447 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 448 If the conversion fails for some reason, an
 449 appropriate messages will be placed in the returned
 450 C<@errors> array.
 451
 452 FIXME - is ISO-5426 equivalent enough to MARC-8
 453 that C<MARC::Charset> can be used instead?
 454
 455 =cut
 456
 457 sub _marc_iso5426_to_utf8 {
 458     my $marc_record = shift;
 459     my $marc_flavour = shift;
 460
 461     my @errors = ();
 462
 463     foreach my $field ($marc_record->fields()) {
 464         if ($field->is_control_field()) {
 465             ; # do nothing -- control fields should not contain non-ASCII characters
 466         } else {
 467             my @converted_subfields;
 468             foreach my $subfield ($field->subfields()) {
 469                 my $utf8sf = char_decode5426($subfield->[1]);
 470                 push @converted_subfields, $subfield->[0], $utf8sf;
 471             }
 472
 473             $field->replace_with(MARC::Field->new(
 474                 $field->tag(), $field->indicator(1), $field->indicator(2),
 475                 @converted_subfields)
 476             );
 477         }
 478     }
 479
 480     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 481
 482     return @errors;
 483 }
 484
 485 =head2 _marc_to_utf8_via_text_iconv
 486
 487 =over 4
 488
 489 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 490
 491 =back
 492
 493 Convert a C<MARC::Record> to UTF-8 in-place using the
 494 C<Text::Iconv> CPAN module.  Any source encoding accepted
 495 by the user's iconv installation should work.  If
 496 the source encoding is not recognized on the user's
 497 server or the conversion fails for some reason,
 498 appropriate messages will be placed in the returned
 499 C<@errors> array.
 500
 501 =cut
 502
 503 sub _marc_to_utf8_via_text_iconv {
 504     my $marc_record = shift;
 505     my $marc_flavour = shift;
 506     my $source_encoding = shift;
 507
 508     my @errors = ();
 509     my $decoder;
 510     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 511     if ($@) {
 512         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 513         return @errors;
 514     }
 515
 516     my $prev_raise_error = Text::Iconv->raise_error();
 517     Text::Iconv->raise_error(1);
 518
 519     foreach my $field ($marc_record->fields()) {
 520         if ($field->is_control_field()) {
 521             ; # do nothing -- control fields should not contain non-ASCII characters
 522         } else {
 523             my @converted_subfields;
 524             foreach my $subfield ($field->subfields()) {
 525                 my $converted_value;
 526                 my $conversion_ok = 1;
 527                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 528                 if ($@) {
 529                     $conversion_ok = 0;
 530                     push @errors, $@;
 531                 } elsif (not defined $converted_value) {
 532                     $conversion_ok = 0;
 533                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 534                 }
 535
 536                 if ($conversion_ok) {
 537                     push @converted_subfields, $subfield->[0], $converted_value;
 538                 } else {
 539                     $converted_value = $subfield->[1];
 540                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 541                     push @converted_subfields, $subfield->[0], $converted_value;
 542                 }
 543             }
 544
 545             $field->replace_with(MARC::Field->new(
 546                 $field->tag(), $field->indicator(1), $field->indicator(2),
 547                 @converted_subfields)
 548             );
 549         }
 550     }
 551
 552     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 553     Text::Iconv->raise_error($prev_raise_error);
 554
 555     return @errors;
 556 }
 557
 558 =head2 _marc_to_utf8_replacement_char
 559
 560 =over 4
 561
 562 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 563
 564 =back
 565
 566 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 567 unsatisfactory method of replacing all non-ASCII (e.g.,
 568 where the eight bit is set) octet with the Unicode
 569 replacement character.  This is meant as a last-ditch
 570 method, and would be best used as part of a UI that
 571 lets a cataloguer pick various character conversions
 572 until he or she finds the right one.
 573
 574 =cut
 575
 576 sub _marc_to_utf8_replacement_char {
 577     my $marc_record = shift;
 578     my $marc_flavour = shift;
 579
 580     foreach my $field ($marc_record->fields()) {
 581         if ($field->is_control_field()) {
 582             ; # do nothing -- control fields should not contain non-ASCII characters
 583         } else {
 584             my @converted_subfields;
 585             foreach my $subfield ($field->subfields()) {
 586                 my $value = $subfield->[1];
 587                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 588                 push @converted_subfields, $subfield->[0], $value;
 589             }
 590
 591             $field->replace_with(MARC::Field->new(
 592                 $field->tag(), $field->indicator(1), $field->indicator(2),
 593                 @converted_subfields)
 594             );
 595         }
 596     }
 597
 598     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 599 }
 600
 601 =head2 char_decode5426
 602
 603 =over 4
 604
 605 my $utf8string = char_decode5426($iso_5426_string);
 606
 607 =back
 608
 609 Converts a string from ISO-5426 to UTF-8.
 610
 611 =cut
 612
 613 sub char_decode5426 {
 614     my ( $string) = @_;
 615     my $result;
 616 my %chars;
 617 $chars{0xb0}=0x0101;#3/0ayn[ain]
 618 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 619 #$chars{0xb2}=0x00e0;#'à';
 620 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 621 #$chars{0xb3}=0x00e7;#'ç';
 622 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 623 # $chars{0xb4}='è';
 624 $chars{0xb4}=0x00e8;
 625 # $chars{0xb5}='é';
 626 $chars{0xb5}=0x00e9;
 627 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 628 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 629 $chars{0xfa}=0x0153;#oe
 630 $chars{0x81d1}=0x00b0;
 631
 632 ####
 633 ## combined characters iso5426
 634
 635 $chars{0xc041}=0x1ea2; # capital a with hook above
 636 $chars{0xc045}=0x1eba; # capital e with hook above
 637 $chars{0xc049}=0x1ec8; # capital i with hook above
 638 $chars{0xc04f}=0x1ece; # capital o with hook above
 639 $chars{0xc055}=0x1ee6; # capital u with hook above
 640 $chars{0xc059}=0x1ef6; # capital y with hook above
 641 $chars{0xc061}=0x1ea3; # small a with hook above
 642 $chars{0xc065}=0x1ebb; # small e with hook above
 643 $chars{0xc069}=0x1ec9; # small i with hook above
 644 $chars{0xc06f}=0x1ecf; # small o with hook above
 645 $chars{0xc075}=0x1ee7; # small u with hook above
 646 $chars{0xc079}=0x1ef7; # small y with hook above
 647
 648         # 4/1 grave accent
 649 $chars{0xc141}=0x00c0; # capital a with grave accent
 650 $chars{0xc145}=0x00c8; # capital e with grave accent
 651 $chars{0xc149}=0x00cc; # capital i with grave accent
 652 $chars{0xc14f}=0x00d2; # capital o with grave accent
 653 $chars{0xc155}=0x00d9; # capital u with grave accent
 654 $chars{0xc157}=0x1e80; # capital w with grave
 655 $chars{0xc159}=0x1ef2; # capital y with grave
 656 $chars{0xc161}=0x00e0; # small a with grave accent
 657 $chars{0xc165}=0x00e8; # small e with grave accent
 658 $chars{0xc169}=0x00ec; # small i with grave accent
 659 $chars{0xc16f}=0x00f2; # small o with grave accent
 660 $chars{0xc175}=0x00f9; # small u with grave accent
 661 $chars{0xc177}=0x1e81; # small w with grave
 662 $chars{0xc179}=0x1ef3; # small y with grave
 663         # 4/2 acute accent
 664 $chars{0xc241}=0x00c1; # capital a with acute accent
 665 $chars{0xc243}=0x0106; # capital c with acute accent
 666 $chars{0xc245}=0x00c9; # capital e with acute accent
 667 $chars{0xc247}=0x01f4; # capital g with acute
 668 $chars{0xc249}=0x00cd; # capital i with acute accent
 669 $chars{0xc24b}=0x1e30; # capital k with acute
 670 $chars{0xc24c}=0x0139; # capital l with acute accent
 671 $chars{0xc24d}=0x1e3e; # capital m with acute
 672 $chars{0xc24e}=0x0143; # capital n with acute accent
 673 $chars{0xc24f}=0x00d3; # capital o with acute accent
 674 $chars{0xc250}=0x1e54; # capital p with acute
 675 $chars{0xc252}=0x0154; # capital r with acute accent
 676 $chars{0xc253}=0x015a; # capital s with acute accent
 677 $chars{0xc255}=0x00da; # capital u with acute accent
 678 $chars{0xc257}=0x1e82; # capital w with acute
 679 $chars{0xc259}=0x00dd; # capital y with acute accent
 680 $chars{0xc25a}=0x0179; # capital z with acute accent
 681 $chars{0xc261}=0x00e1; # small a with acute accent
 682 $chars{0xc263}=0x0107; # small c with acute accent
 683 $chars{0xc265}=0x00e9; # small e with acute accent
 684 $chars{0xc267}=0x01f5; # small g with acute
 685 $chars{0xc269}=0x00ed; # small i with acute accent
 686 $chars{0xc26b}=0x1e31; # small k with acute
 687 $chars{0xc26c}=0x013a; # small l with acute accent
 688 $chars{0xc26d}=0x1e3f; # small m with acute
 689 $chars{0xc26e}=0x0144; # small n with acute accent
 690 $chars{0xc26f}=0x00f3; # small o with acute accent
 691 $chars{0xc270}=0x1e55; # small p with acute
 692 $chars{0xc272}=0x0155; # small r with acute accent
 693 $chars{0xc273}=0x015b; # small s with acute accent
 694 $chars{0xc275}=0x00fa; # small u with acute accent
 695 $chars{0xc277}=0x1e83; # small w with acute
 696 $chars{0xc279}=0x00fd; # small y with acute accent
 697 $chars{0xc27a}=0x017a; # small z with acute accent
 698 $chars{0xc2e1}=0x01fc; # capital ae with acute
 699 $chars{0xc2f1}=0x01fd; # small ae with acute
 700        # 4/3 circumflex accent
 701 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 702 $chars{0xc343}=0x0108; # capital c with circumflex
 703 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 704 $chars{0xc347}=0x011c; # capital g with circumflex
 705 $chars{0xc348}=0x0124; # capital h with circumflex
 706 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 707 $chars{0xc34a}=0x0134; # capital j with circumflex
 708 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 709 $chars{0xc353}=0x015c; # capital s with circumflex
 710 $chars{0xc355}=0x00db; # capital u with circumflex
 711 $chars{0xc357}=0x0174; # capital w with circumflex
 712 $chars{0xc359}=0x0176; # capital y with circumflex
 713 $chars{0xc35a}=0x1e90; # capital z with circumflex
 714 $chars{0xc361}=0x00e2; # small a with circumflex accent
 715 $chars{0xc363}=0x0109; # small c with circumflex
 716 $chars{0xc365}=0x00ea; # small e with circumflex accent
 717 $chars{0xc367}=0x011d; # small g with circumflex
 718 $chars{0xc368}=0x0125; # small h with circumflex
 719 $chars{0xc369}=0x00ee; # small i with circumflex accent
 720 $chars{0xc36a}=0x0135; # small j with circumflex
 721 $chars{0xc36e}=0x00f1; # small n with tilde
 722 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 723 $chars{0xc373}=0x015d; # small s with circumflex
 724 $chars{0xc375}=0x00fb; # small u with circumflex
 725 $chars{0xc377}=0x0175; # small w with circumflex
 726 $chars{0xc379}=0x0177; # small y with circumflex
 727 $chars{0xc37a}=0x1e91; # small z with circumflex
 728         # 4/4 tilde
 729 $chars{0xc441}=0x00c3; # capital a with tilde
 730 $chars{0xc445}=0x1ebc; # capital e with tilde
 731 $chars{0xc449}=0x0128; # capital i with tilde
 732 $chars{0xc44e}=0x00d1; # capital n with tilde
 733 $chars{0xc44f}=0x00d5; # capital o with tilde
 734 $chars{0xc455}=0x0168; # capital u with tilde
 735 $chars{0xc456}=0x1e7c; # capital v with tilde
 736 $chars{0xc459}=0x1ef8; # capital y with tilde
 737 $chars{0xc461}=0x00e3; # small a with tilde
 738 $chars{0xc465}=0x1ebd; # small e with tilde
 739 $chars{0xc469}=0x0129; # small i with tilde
 740 $chars{0xc46e}=0x00f1; # small n with tilde
 741 $chars{0xc46f}=0x00f5; # small o with tilde
 742 $chars{0xc475}=0x0169; # small u with tilde
 743 $chars{0xc476}=0x1e7d; # small v with tilde
 744 $chars{0xc479}=0x1ef9; # small y with tilde
 745     # 4/5 macron
 746 $chars{0xc541}=0x0100; # capital a with macron
 747 $chars{0xc545}=0x0112; # capital e with macron
 748 $chars{0xc547}=0x1e20; # capital g with macron
 749 $chars{0xc549}=0x012a; # capital i with macron
 750 $chars{0xc54f}=0x014c; # capital o with macron
 751 $chars{0xc555}=0x016a; # capital u with macron
 752 $chars{0xc561}=0x0101; # small a with macron
 753 $chars{0xc565}=0x0113; # small e with macron
 754 $chars{0xc567}=0x1e21; # small g with macron
 755 $chars{0xc569}=0x012b; # small i with macron
 756 $chars{0xc56f}=0x014d; # small o with macron
 757 $chars{0xc575}=0x016b; # small u with macron
 758 $chars{0xc572}=0x0159; # small r with macron
 759 $chars{0xc5e1}=0x01e2; # capital ae with macron
 760 $chars{0xc5f1}=0x01e3; # small ae with macron
 761         # 4/6 breve
 762 $chars{0xc641}=0x0102; # capital a with breve
 763 $chars{0xc645}=0x0114; # capital e with breve
 764 $chars{0xc647}=0x011e; # capital g with breve
 765 $chars{0xc649}=0x012c; # capital i with breve
 766 $chars{0xc64f}=0x014e; # capital o with breve
 767 $chars{0xc655}=0x016c; # capital u with breve
 768 $chars{0xc661}=0x0103; # small a with breve
 769 $chars{0xc665}=0x0115; # small e with breve
 770 $chars{0xc667}=0x011f; # small g with breve
 771 $chars{0xc669}=0x012d; # small i with breve
 772 $chars{0xc66f}=0x014f; # small o with breve
 773 $chars{0xc675}=0x016d; # small u with breve
 774         # 4/7 dot above
 775 $chars{0xc7b0}=0x01e1; # Ain with dot above
 776 $chars{0xc742}=0x1e02; # capital b with dot above
 777 $chars{0xc743}=0x010a; # capital c with dot above
 778 $chars{0xc744}=0x1e0a; # capital d with dot above
 779 $chars{0xc745}=0x0116; # capital e with dot above
 780 $chars{0xc746}=0x1e1e; # capital f with dot above
 781 $chars{0xc747}=0x0120; # capital g with dot above
 782 $chars{0xc748}=0x1e22; # capital h with dot above
 783 $chars{0xc749}=0x0130; # capital i with dot above
 784 $chars{0xc74d}=0x1e40; # capital m with dot above
 785 $chars{0xc74e}=0x1e44; # capital n with dot above
 786 $chars{0xc750}=0x1e56; # capital p with dot above
 787 $chars{0xc752}=0x1e58; # capital r with dot above
 788 $chars{0xc753}=0x1e60; # capital s with dot above
 789 $chars{0xc754}=0x1e6a; # capital t with dot above
 790 $chars{0xc757}=0x1e86; # capital w with dot above
 791 $chars{0xc758}=0x1e8a; # capital x with dot above
 792 $chars{0xc759}=0x1e8e; # capital y with dot above
 793 $chars{0xc75a}=0x017b; # capital z with dot above
 794 $chars{0xc761}=0x0227; # small b with dot above
 795 $chars{0xc762}=0x1e03; # small b with dot above
 796 $chars{0xc763}=0x010b; # small c with dot above
 797 $chars{0xc764}=0x1e0b; # small d with dot above
 798 $chars{0xc765}=0x0117; # small e with dot above
 799 $chars{0xc766}=0x1e1f; # small f with dot above
 800 $chars{0xc767}=0x0121; # small g with dot above
 801 $chars{0xc768}=0x1e23; # small h with dot above
 802 $chars{0xc76d}=0x1e41; # small m with dot above
 803 $chars{0xc76e}=0x1e45; # small n with dot above
 804 $chars{0xc770}=0x1e57; # small p with dot above
 805 $chars{0xc772}=0x1e59; # small r with dot above
 806 $chars{0xc773}=0x1e61; # small s with dot above
 807 $chars{0xc774}=0x1e6b; # small t with dot above
 808 $chars{0xc777}=0x1e87; # small w with dot above
 809 $chars{0xc778}=0x1e8b; # small x with dot above
 810 $chars{0xc779}=0x1e8f; # small y with dot above
 811 $chars{0xc77a}=0x017c; # small z with dot above
 812         # 4/8 trema, diaresis
 813 $chars{0xc820}=0x00a8; # diaeresis
 814 $chars{0xc841}=0x00c4; # capital a with diaeresis
 815 $chars{0xc845}=0x00cb; # capital e with diaeresis
 816 $chars{0xc848}=0x1e26; # capital h with diaeresis
 817 $chars{0xc849}=0x00cf; # capital i with diaeresis
 818 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 819 $chars{0xc855}=0x00dc; # capital u with diaeresis
 820 $chars{0xc857}=0x1e84; # capital w with diaeresis
 821 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 822 $chars{0xc859}=0x0178; # capital y with diaeresis
 823 $chars{0xc861}=0x00e4; # small a with diaeresis
 824 $chars{0xc865}=0x00eb; # small e with diaeresis
 825 $chars{0xc868}=0x1e27; # small h with diaeresis
 826 $chars{0xc869}=0x00ef; # small i with diaeresis
 827 $chars{0xc86f}=0x00f6; # small o with diaeresis
 828 $chars{0xc874}=0x1e97; # small t with diaeresis
 829 $chars{0xc875}=0x00fc; # small u with diaeresis
 830 $chars{0xc877}=0x1e85; # small w with diaeresis
 831 $chars{0xc878}=0x1e8d; # small x with diaeresis
 832 $chars{0xc879}=0x00ff; # small y with diaeresis
 833         # 4/9 umlaut
 834 $chars{0xc920}=0x00a8; # [diaeresis]
 835 $chars{0xc961}=0x00e4; # a with umlaut
 836 $chars{0xc965}=0x00eb; # e with umlaut
 837 $chars{0xc969}=0x00ef; # i with umlaut
 838 $chars{0xc96f}=0x00f6; # o with umlaut
 839 $chars{0xc975}=0x00fc; # u with umlaut
 840         # 4/10 circle above
 841 $chars{0xca41}=0x00c5; # capital a with ring above
 842 $chars{0xcaad}=0x016e; # capital u with ring above
 843 $chars{0xca61}=0x00e5; # small a with ring above
 844 $chars{0xca75}=0x016f; # small u with ring above
 845 $chars{0xca77}=0x1e98; # small w with ring above
 846 $chars{0xca79}=0x1e99; # small y with ring above
 847         # 4/11 high comma off centre
 848         # 4/12 inverted high comma centred
 849         # 4/13 double acute accent
 850 $chars{0xcd4f}=0x0150; # capital o with double acute
 851 $chars{0xcd55}=0x0170; # capital u with double acute
 852 $chars{0xcd6f}=0x0151; # small o with double acute
 853 $chars{0xcd75}=0x0171; # small u with double acute
 854         # 4/14 horn
 855 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 856 $chars{0xce55}=0x01af; # latin capital letter u with horn
 857 $chars{0xce74}=0x01a1; # latin small letter o with horn
 858 $chars{0xce75}=0x01b0; # latin small letter u with horn
 859         # 4/15 caron (hacek
 860 $chars{0xcf41}=0x01cd; # capital a with caron
 861 $chars{0xcf43}=0x010c; # capital c with caron
 862 $chars{0xcf44}=0x010e; # capital d with caron
 863 $chars{0xcf45}=0x011a; # capital e with caron
 864 $chars{0xcf47}=0x01e6; # capital g with caron
 865 $chars{0xcf49}=0x01cf; # capital i with caron
 866 $chars{0xcf4b}=0x01e8; # capital k with caron
 867 $chars{0xcf4c}=0x013d; # capital l with caron
 868 $chars{0xcf4e}=0x0147; # capital n with caron
 869 $chars{0xcf4f}=0x01d1; # capital o with caron
 870 $chars{0xcf52}=0x0158; # capital r with caron
 871 $chars{0xcf53}=0x0160; # capital s with caron
 872 $chars{0xcf54}=0x0164; # capital t with caron
 873 $chars{0xcf55}=0x01d3; # capital u with caron
 874 $chars{0xcf5a}=0x017d; # capital z with caron
 875 $chars{0xcf61}=0x01ce; # small a with caron
 876 $chars{0xcf63}=0x010d; # small c with caron
 877 $chars{0xcf64}=0x010f; # small d with caron
 878 $chars{0xcf65}=0x011b; # small e with caron
 879 $chars{0xcf67}=0x01e7; # small g with caron
 880 $chars{0xcf69}=0x01d0; # small i with caron
 881 $chars{0xcf6a}=0x01f0; # small j with caron
 882 $chars{0xcf6b}=0x01e9; # small k with caron
 883 $chars{0xcf6c}=0x013e; # small l with caron
 884 $chars{0xcf6e}=0x0148; # small n with caron
 885 $chars{0xcf6f}=0x01d2; # small o with caron
 886 $chars{0xcf72}=0x0159; # small r with caron
 887 $chars{0xcf73}=0x0161; # small s with caron
 888 $chars{0xcf74}=0x0165; # small t with caron
 889 $chars{0xcf75}=0x01d4; # small u with caron
 890 $chars{0xcf7a}=0x017e; # small z with caron
 891         # 5/0 cedilla
 892 $chars{0xd020}=0x00b8; # cedilla
 893 $chars{0xd043}=0x00c7; # capital c with cedilla
 894 $chars{0xd044}=0x1e10; # capital d with cedilla
 895 $chars{0xd047}=0x0122; # capital g with cedilla
 896 $chars{0xd048}=0x1e28; # capital h with cedilla
 897 $chars{0xd04b}=0x0136; # capital k with cedilla
 898 $chars{0xd04c}=0x013b; # capital l with cedilla
 899 $chars{0xd04e}=0x0145; # capital n with cedilla
 900 $chars{0xd052}=0x0156; # capital r with cedilla
 901 $chars{0xd053}=0x015e; # capital s with cedilla
 902 $chars{0xd054}=0x0162; # capital t with cedilla
 903 $chars{0xd063}=0x00e7; # small c with cedilla
 904 $chars{0xd064}=0x1e11; # small d with cedilla
 905 $chars{0xd065}=0x0119; # small e with cedilla
 906 $chars{0xd067}=0x0123; # small g with cedilla
 907 $chars{0xd068}=0x1e29; # small h with cedilla
 908 $chars{0xd06b}=0x0137; # small k with cedilla
 909 $chars{0xd06c}=0x013c; # small l with cedilla
 910 $chars{0xd06e}=0x0146; # small n with cedilla
 911 $chars{0xd072}=0x0157; # small r with cedilla
 912 $chars{0xd073}=0x015f; # small s with cedilla
 913 $chars{0xd074}=0x0163; # small t with cedilla
 914         # 5/1 rude
 915         # 5/2 hook to left
 916         # 5/3 ogonek (hook to right
 917 $chars{0xd320}=0x02db; # ogonek
 918 $chars{0xd341}=0x0104; # capital a with ogonek
 919 $chars{0xd345}=0x0118; # capital e with ogonek
 920 $chars{0xd349}=0x012e; # capital i with ogonek
 921 $chars{0xd34f}=0x01ea; # capital o with ogonek
 922 $chars{0xd355}=0x0172; # capital u with ogonek
 923 $chars{0xd361}=0x0105; # small a with ogonek
 924 $chars{0xd365}=0x0119; # small e with ogonek
 925 $chars{0xd369}=0x012f; # small i with ogonek
 926 $chars{0xd36f}=0x01eb; # small o with ogonek
 927 $chars{0xd375}=0x0173; # small u with ogonek
 928         # 5/4 circle below
 929 $chars{0xd441}=0x1e00; # capital a with ring below
 930 $chars{0xd461}=0x1e01; # small a with ring below
 931         # 5/5 half circle below
 932 $chars{0xf948}=0x1e2a; # capital h with breve below
 933 $chars{0xf968}=0x1e2b; # small h with breve below
 934         # 5/6 dot below
 935 $chars{0xd641}=0x1ea0; # capital a with dot below
 936 $chars{0xd642}=0x1e04; # capital b with dot below
 937 $chars{0xd644}=0x1e0c; # capital d with dot below
 938 $chars{0xd645}=0x1eb8; # capital e with dot below
 939 $chars{0xd648}=0x1e24; # capital h with dot below
 940 $chars{0xd649}=0x1eca; # capital i with dot below
 941 $chars{0xd64b}=0x1e32; # capital k with dot below
 942 $chars{0xd64c}=0x1e36; # capital l with dot below
 943 $chars{0xd64d}=0x1e42; # capital m with dot below
 944 $chars{0xd64e}=0x1e46; # capital n with dot below
 945 $chars{0xd64f}=0x1ecc; # capital o with dot below
 946 $chars{0xd652}=0x1e5a; # capital r with dot below
 947 $chars{0xd653}=0x1e62; # capital s with dot below
 948 $chars{0xd654}=0x1e6c; # capital t with dot below
 949 $chars{0xd655}=0x1ee4; # capital u with dot below
 950 $chars{0xd656}=0x1e7e; # capital v with dot below
 951 $chars{0xd657}=0x1e88; # capital w with dot below
 952 $chars{0xd659}=0x1ef4; # capital y with dot below
 953 $chars{0xd65a}=0x1e92; # capital z with dot below
 954 $chars{0xd661}=0x1ea1; # small a with dot below
 955 $chars{0xd662}=0x1e05; # small b with dot below
 956 $chars{0xd664}=0x1e0d; # small d with dot below
 957 $chars{0xd665}=0x1eb9; # small e with dot below
 958 $chars{0xd668}=0x1e25; # small h with dot below
 959 $chars{0xd669}=0x1ecb; # small i with dot below
 960 $chars{0xd66b}=0x1e33; # small k with dot below
 961 $chars{0xd66c}=0x1e37; # small l with dot below
 962 $chars{0xd66d}=0x1e43; # small m with dot below
 963 $chars{0xd66e}=0x1e47; # small n with dot below
 964 $chars{0xd66f}=0x1ecd; # small o with dot below
 965 $chars{0xd672}=0x1e5b; # small r with dot below
 966 $chars{0xd673}=0x1e63; # small s with dot below
 967 $chars{0xd674}=0x1e6d; # small t with dot below
 968 $chars{0xd675}=0x1ee5; # small u with dot below
 969 $chars{0xd676}=0x1e7f; # small v with dot below
 970 $chars{0xd677}=0x1e89; # small w with dot below
 971 $chars{0xd679}=0x1ef5; # small y with dot below
 972 $chars{0xd67a}=0x1e93; # small z with dot below
 973         # 5/7 double dot below
 974 $chars{0xd755}=0x1e72; # capital u with diaeresis below
 975 $chars{0xd775}=0x1e73; # small u with diaeresis below
 976         # 5/8 underline
 977 $chars{0xd820}=0x005f; # underline
 978         # 5/9 double underline
 979 $chars{0xd920}=0x2017; # double underline
 980         # 5/10 small low vertical bar
 981 $chars{0xda20}=0x02cc; #
 982         # 5/11 circumflex below
 983         # 5/12 (this position shall not be used)
 984         # 5/13 left half of ligature sign and of double tilde
 985         # 5/14 right half of ligature sign
 986         # 5/15 right half of double tilde
 987 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
 988     my @data = unpack("C*", $string);
 989     my @characters;
 990     my $length=scalar(@data);
 991     for (my $i = 0; $i < scalar(@data); $i++) {
 992       my $char= $data[$i];
 993       if ($char >= 0x00 && $char <= 0x7F){
 994         #IsAscii
 995
 996           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
 997       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
 998         #Combined Char
 999         my $convchar ;
1000         if ($chars{$char*256+$data[$i+1]}) {
1001           $convchar= $chars{$char * 256 + $data[$i+1]};
1002           $i++;
1003 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1004         } elsif ($chars{$char})  {
1005           $convchar= $chars{$char};
1006 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1007         }else {
1008           $convchar=$char;
1009         }
1010         push @characters,$convchar;
1011       } else {
1012         my $convchar;
1013         if ($chars{$char})  {
1014           $convchar= $chars{$char};
1015 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1016         }else {
1017 #            printf "char %x $char\n",$char;
1018           $convchar=$char;
1019         }
1020         push @characters,$convchar;
1021       }
1022     }
1023     $result=pack "U*",@characters;
1024 #     $result=~s/\x01//;
1025 #     $result=~s/\x00//;
1026      $result=~s/\x0f//;
1027      $result=~s/\x1b.//;
1028      $result=~s/\x0e//;
1029      $result=~s/\x1b\x5b//;
1030 #   map{printf "%x",$_} @characters;
1031 #   printf "\n";
1032   return $result;
1033 }
1034
1035 1;
1036
1037
1038 =head1 AUTHOR
1039
1040 Koha Development Team <info@koha.org>
1041
1042 Galen Charlton <galen.charlton@liblime.com>
1043
1044 =cut