C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use MARC::Charset qw/marc8_to_utf8/;
  22 use Text::Iconv;
  23
  24 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  25
  26 BEGIN {
  27     # set the version for version checking
  28     $VERSION = 3.00;
  29     require Exporter;
  30     @ISA    = qw(Exporter);
  31     @EXPORT = qw(
  32         IsStringUTF8ish
  33         MarcToUTF8Record
  34         SetMarcUnicodeFlag
  35     );
  36 }
  37
  38 =head1 NAME
  39
  40 C4::Charset - utilities for handling character set conversions.
  41
  42 =head1 SYNOPSIS
  43
  44 use C4::Charset;
  45
  46 =head1 DESCRIPTION
  47
  48 This module contains routines for dealing with character set
  49 conversions, particularly for MARC records.
  50
  51 A variety of character encodings are in use by various MARC
  52 standards, and even more character encodings are used by
  53 non-standard MARC records.  The various MARC formats generally
  54 do not do a good job of advertising a given record's character
  55 encoding, and even when a record does advertise its encoding,
  56 e.g., via the Leader/09, experience has shown that one cannot
  57 trust it.
  58
  59 Ultimately, all MARC records are stored in Koha in UTF-8 and
  60 must be converted from whatever the source character encoding is.
  61 The goal of this module is to ensure that these conversions
  62 take place accurately.  When a character conversion cannot take
  63 place, or at least not accurately, the module was provide
  64 enough information to allow user-facing code to inform the user
  65 on how to deal with the situation.
  66
  67 =cut
  68
  69 =head1 FUNCTIONS
  70
  71 =head2 IsStringUTF8ish
  72
  73 =over 4
  74
  75 my $is_utf8 = IsStringUTF8ish($str);
  76
  77 =back
  78
  79 Determines if C<$str> is valid UTF-8.  This can mean
  80 one of two things:
  81
  82 =over 2
  83
  84 =item *
  85
  86 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  87
  88 =item *
  89
  90 The Perl UTF-8 flag is B<not> set, but the octets contain
  91 valid UTF-8.
  92
  93 =back
  94
  95 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  96 because in one could be presented with a MARC blob that is
  97 not actually in UTF-8 but whose sequence of octets appears to be
  98 valid UTF-8.  The rest of the MARC character conversion functions
  99 will assume that this situation occur does not very often.
 100
 101 =cut
 102
 103 sub IsStringUTF8ish {
 104     my $str = shift;
 105
 106     return 1 if utf8::is_utf8($str);
 107     return utf8::decode($str);
 108 }
 109
 110 =head2 MarcToUTF8Record
 111
 112 =over 4
 113
 114 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 115
 116 =back
 117
 118 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 119 optional source encoding, return a C<MARC::Record> that is
 120 converted to UTF-8.
 121
 122 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 123 is not guaranteed to have been converted correctly.  Specifically,
 124 if C<$converted_from> is 'failed', the MARC record returned failed
 125 character conversion and had each of its non-ASCII octets changed
 126 to the Unicode replacement character.
 127
 128 If the source encoding was not specified, this routine will
 129 try to guess it; the character encoding used for a successful
 130 conversion is returned in C<$converted_from>.
 131
 132 =cut
 133
 134 sub MarcToUTF8Record {
 135     my $marc = shift;
 136     my $marc_flavour = shift;
 137     my $source_encoding = shift;
 138
 139     my $marc_record;
 140     my $marc_blob_is_utf8 = 0;
 141     if (ref($marc) eq 'MARC::Record') {
 142         my $marc_blob = $marc->as_usmarc();
 143         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 144         $marc_record = $marc;
 145     } else {
 146         # dealing with a MARC blob
 147
 148         # remove any ersatz whitespace from the beginning and
 149         # end of the MARC blob -- these can creep into MARC
 150         # files produced by several sources -- caller really
 151         # should be doing this, however
 152         $marc =~ s/^\s+//;
 153         $marc =~ s/\s+$//;
 154         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 155         $marc_record = MARC::Record->new_from_usmarc($marc);
 156     }
 157
 158     # If we do not know the source encoding, try some guesses
 159     # as follows:
 160     #   1. Record is UTF-8 already.
 161     #   2. If MARC flavor is MARC21, then
 162     #      a. record is MARC-8
 163     #      b. record is ISO-8859-1
 164     #   3. If MARC flavor is UNIMARC, then
 165     if (not defined $source_encoding) {
 166         if ($marc_blob_is_utf8) {
 167             # note that for MARC21 we are not bothering to check
 168             # if the Leader/09 is set to 'a' or not -- because
 169             # of problems with various ILSs (including Koha in the
 170             # past, alas), this just is not trustworthy.
 171             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 172             return $marc_record, 'UTF-8', [];
 173         } else {
 174             if ($marc_flavour eq 'MARC21') {
 175                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 176             } elsif ($marc_flavour eq 'UNIMARC') {
 177                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 178             } else {
 179                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 180             }
 181         }
 182     } else {
 183         # caller knows the character encoding
 184         my $original_marc_record = $marc_record->clone();
 185         my @errors;
 186         if ($source_encoding =~ /utf-?8/i) {
 187             if ($marc_blob_is_utf8) {
 188                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 189                 return $marc_record, 'UTF-8', [];
 190             } else {
 191                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 192             }
 193         } elsif ($source_encoding =~ /marc-?8/i) {
 194             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 195         } elsif ($source_encoding =~ /5426/) {
 196             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 197         } else {
 198             # assume any other character encoding is for Text::Iconv
 199             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
 200         }
 201
 202         if (@errors) {
 203             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 204             return $original_marc_record, 'failed', \@errors;
 205         } else {
 206             return $marc_record, $source_encoding, [];
 207         }
 208     }
 209
 210 }
 211
 212 =head2 SetMarcUnicodeFlag
 213
 214 =over 4
 215
 216 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 217
 218 =back
 219
 220 Set both the internal MARC::Record encoding flag
 221 and the appropriate Leader/09 (MARC21) or
 222 100/26-29 (UNIMARC) to indicate that the record
 223 is in UTF-8.  Note that this does B<not> do
 224 any actual character conversion.
 225
 226 =cut
 227
 228 sub SetMarcUnicodeFlag {
 229     my $marc_record = shift;
 230     my $marc_flavour = shift;
 231
 232     $marc_record->encoding('UTF-8');
 233     if ($marc_flavour eq 'MARC21') {
 234         my $leader = $marc_record->leader();
 235         substr($leader, 9, 1) = 'a';
 236         $marc_record->leader($leader);
 237     } elsif ($marc_flavour eq "UNIMARC") {
 238         if (my $field = $marc_record->field('100')) {
 239             my $sfa = $field->subfield('a');
 240             substr($sfa, 26, 4) = '5050';
 241             $field->update('a' => $sfa);
 242         }
 243     }
 244 }
 245
 246
 247
 248 =head1 INTERNAL FUNCTIONS
 249
 250 =head2 _default_marc21_charconv_to_utf8
 251
 252 =over 4
 253
 254 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 255
 256 =back
 257
 258 Converts a C<MARC::Record> of unknown character set to UTF-8,
 259 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 260 to UTF-8, then a default conversion that replaces each non-ASCII
 261 character with the replacement character.
 262
 263 The C<$guessed_charset> return value contains the character set
 264 that resulted in a conversion to valid UTF-8; note that
 265 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 266 this is 'failed'.
 267
 268 =cut
 269
 270 sub _default_marc21_charconv_to_utf8 {
 271     my $marc_record = shift;
 272     my $marc_flavour = shift;
 273
 274     my $trial_marc8 = $marc_record->clone();
 275     my @all_errors = ();
 276     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 277     unless (@errors) {
 278         return $trial_marc8, 'MARC-8', [];
 279     }
 280     push @all_errors, @errors;
 281
 282     my $trial_8859_1 = $marc_record->clone();
 283     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 284     unless (@errors) {
 285         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 286                                                 # instead if we wanted to report details
 287                                                 # of the failed attempt at MARC-8 => UTF-8
 288     }
 289     push @all_errors, @errors;
 290
 291     my $default_converted = $marc_record->clone();
 292     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 293     return $default_converted, 'failed', \@all_errors;
 294 }
 295
 296 =head2 _default_unimarc_charconv_to_utf8
 297
 298 =over 4
 299
 300 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 301
 302 =back
 303
 304 Converts a C<MARC::Record> of unknown character set to UTF-8,
 305 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 306 to UTF-8, then a default conversion that replaces each non-ASCII
 307 character with the replacement character.
 308
 309 The C<$guessed_charset> return value contains the character set
 310 that resulted in a conversion to valid UTF-8; note that
 311 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 312 this is 'failed'.
 313
 314 =cut
 315
 316 sub _default_unimarc_charconv_to_utf8 {
 317     my $marc_record = shift;
 318     my $marc_flavour = shift;
 319
 320     my $trial_marc8 = $marc_record->clone();
 321     my @all_errors = ();
 322     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 323     unless (@errors) {
 324         return $trial_marc8, 'iso-5426';
 325     }
 326     push @all_errors, @errors;
 327
 328     my $trial_8859_1 = $marc_record->clone();
 329     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 330     unless (@errors) {
 331         return $trial_8859_1, 'iso-8859-1';
 332     }
 333     push @all_errors, @errors;
 334
 335     my $default_converted = $marc_record->clone();
 336     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 337     return $default_converted, 'failed', \@all_errors;
 338 }
 339
 340 =head2 _marc_marc8_to_utf8
 341
 342 =over 4
 343
 344 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 345
 346 =back
 347
 348 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 349 If the conversion fails for some reason, an
 350 appropriate messages will be placed in the returned
 351 C<@errors> array.
 352
 353 =cut
 354
 355 sub _marc_marc8_to_utf8 {
 356     my $marc_record = shift;
 357     my $marc_flavour = shift;
 358
 359     my $prev_ignore = MARC::Charset->ignore_errors();
 360     MARC::Charset->ignore_errors(1);
 361
 362     # trap warnings raised by MARC::Charset
 363     my @errors = ();
 364     local $SIG{__WARN__} = sub {
 365         my $msg = $_[0];
 366         if ($msg =~ /MARC.Charset/) {
 367             # FIXME - purpose of this regexp is to strip out the
 368             # line reference to MARC/Charset.pm, but as it
 369             # exists probably won't work quite on Windows --
 370             # some sort of minimal-bunch back-tracking RE
 371             # would be helpful here
 372             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 373             push @errors, $msg;
 374         } else {
 375             # if warning doesn't come from MARC::Charset, just
 376             # pass it on
 377             warn $msg;
 378         }
 379     };
 380
 381     foreach my $field ($marc_record->fields()) {
 382         if ($field->is_control_field()) {
 383             ; # do nothing -- control fields should not contain non-ASCII characters
 384         } else {
 385             my @converted_subfields;
 386             foreach my $subfield ($field->subfields()) {
 387                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 388                 push @converted_subfields, $subfield->[0], $utf8sf;
 389             }
 390
 391             $field->replace_with(MARC::Field->new(
 392                 $field->tag(), $field->indicator(1), $field->indicator(2),
 393                 @converted_subfields)
 394             );
 395         }
 396     }
 397
 398     MARC::Charset->ignore_errors($prev_ignore);
 399
 400     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 401
 402     return @errors;
 403 }
 404
 405 =head2 _marc_iso5426_to_utf8
 406
 407 =over 4
 408
 409 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 410
 411 =back
 412
 413 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 414 If the conversion fails for some reason, an
 415 appropriate messages will be placed in the returned
 416 C<@errors> array.
 417
 418 FIXME - is ISO-5426 equivalent enough to MARC-8
 419 that C<MARC::Charset> can be used instead?
 420
 421 =cut
 422
 423 sub _marc_iso5426_to_utf8 {
 424     my $marc_record = shift;
 425     my $marc_flavour = shift;
 426
 427     my @errors = ();
 428
 429     foreach my $field ($marc_record->fields()) {
 430         if ($field->is_control_field()) {
 431             ; # do nothing -- control fields should not contain non-ASCII characters
 432         } else {
 433             my @converted_subfields;
 434             foreach my $subfield ($field->subfields()) {
 435                 my $utf8sf = char_decode5426($subfield->[1]);
 436                 push @converted_subfields, $subfield->[0], $utf8sf;
 437             }
 438
 439             $field->replace_with(MARC::Field->new(
 440                 $field->tag(), $field->indicator(1), $field->indicator(2),
 441                 @converted_subfields)
 442             );
 443         }
 444     }
 445
 446     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 447
 448     return @errors;
 449 }
 450
 451 =head2 _marc_to_utf8_via_text_iconv
 452
 453 =over 4
 454
 455 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 456
 457 =back
 458
 459 Convert a C<MARC::Record> to UTF-8 in-place using the
 460 C<Text::Iconv> CPAN module.  Any source encoding accepted
 461 by the user's iconv installation should work.  If
 462 the source encoding is not recognized on the user's
 463 server or the conversion fails for some reason,
 464 appropriate messages will be placed in the returned
 465 C<@errors> array.
 466
 467 =cut
 468
 469 sub _marc_to_utf8_via_text_iconv {
 470     my $marc_record = shift;
 471     my $marc_flavour = shift;
 472     my $source_encoding = shift;
 473
 474     my @errors = ();
 475     my $decoder;
 476     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 477     if ($@) {
 478         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 479         return @errors;
 480     }
 481
 482     my $prev_raise_error = Text::Iconv->raise_error();
 483     Text::Iconv->raise_error(1);
 484
 485     foreach my $field ($marc_record->fields()) {
 486         if ($field->is_control_field()) {
 487             ; # do nothing -- control fields should not contain non-ASCII characters
 488         } else {
 489             my @converted_subfields;
 490             foreach my $subfield ($field->subfields()) {
 491                 my $converted_value;
 492                 my $conversion_ok = 1;
 493                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 494                 if ($@) {
 495                     $conversion_ok = 0;
 496                     push @errors, $@;
 497                 } elsif (not defined $converted_value) {
 498                     $conversion_ok = 0;
 499                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 500                 }
 501
 502                 if ($conversion_ok) {
 503                     push @converted_subfields, $subfield->[0], $converted_value;
 504                 } else {
 505                     $converted_value = $subfield->[1];
 506                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 507                     push @converted_subfields, $subfield->[0], $converted_value;
 508                 }
 509             }
 510
 511             $field->replace_with(MARC::Field->new(
 512                 $field->tag(), $field->indicator(1), $field->indicator(2),
 513                 @converted_subfields)
 514             );
 515         }
 516     }
 517
 518     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 519     Text::Iconv->raise_error($prev_raise_error);
 520
 521     return @errors;
 522 }
 523
 524 =head2 _marc_to_utf8_replacement_char
 525
 526 =over 4
 527
 528 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 529
 530 =back
 531
 532 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 533 unsatisfactory method of replacing all non-ASCII (e.g.,
 534 where the eight bit is set) octet with the Unicode
 535 replacement character.  This is meant as a last-ditch
 536 method, and would be best used as part of a UI that
 537 lets a cataloguer pick various character conversions
 538 until he or she finds the right one.
 539
 540 =cut
 541
 542 sub _marc_to_utf8_replacement_char {
 543     my $marc_record = shift;
 544     my $marc_flavour = shift;
 545
 546     foreach my $field ($marc_record->fields()) {
 547         if ($field->is_control_field()) {
 548             ; # do nothing -- control fields should not contain non-ASCII characters
 549         } else {
 550             my @converted_subfields;
 551             foreach my $subfield ($field->subfields()) {
 552                 my $value = $subfield->[1];
 553                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 554                 push @converted_subfields, $subfield->[0], $value;
 555             }
 556
 557             $field->replace_with(MARC::Field->new(
 558                 $field->tag(), $field->indicator(1), $field->indicator(2),
 559                 @converted_subfields)
 560             );
 561         }
 562     }
 563
 564     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 565 }
 566
 567 =head2 char_decode5426
 568
 569 =over 4
 570
 571 my $utf8string = char_decode5426($iso_5426_string);
 572
 573 =back
 574
 575 Converts a string from ISO-5426 to UTF-8.
 576
 577 =cut
 578
 579 sub char_decode5426 {
 580     my ( $string) = @_;
 581     my $result;
 582 my %chars;
 583 $chars{0xb0}=0x0101;#3/0ayn[ain]
 584 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 585 #$chars{0xb2}=0x00e0;#'à';
 586 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 587 #$chars{0xb3}=0x00e7;#'ç';
 588 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 589 # $chars{0xb4}='è';
 590 $chars{0xb4}=0x00e8;
 591 # $chars{0xb5}='é';
 592 $chars{0xb5}=0x00e9;
 593 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 594 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 595 $chars{0xfa}=0x0153;#oe
 596 $chars{0x81d1}=0x00b0;
 597
 598 ####
 599 ## combined characters iso5426
 600
 601 $chars{0xc041}=0x1ea2; # capital a with hook above
 602 $chars{0xc045}=0x1eba; # capital e with hook above
 603 $chars{0xc049}=0x1ec8; # capital i with hook above
 604 $chars{0xc04f}=0x1ece; # capital o with hook above
 605 $chars{0xc055}=0x1ee6; # capital u with hook above
 606 $chars{0xc059}=0x1ef6; # capital y with hook above
 607 $chars{0xc061}=0x1ea3; # small a with hook above
 608 $chars{0xc065}=0x1ebb; # small e with hook above
 609 $chars{0xc069}=0x1ec9; # small i with hook above
 610 $chars{0xc06f}=0x1ecf; # small o with hook above
 611 $chars{0xc075}=0x1ee7; # small u with hook above
 612 $chars{0xc079}=0x1ef7; # small y with hook above
 613
 614         # 4/1 grave accent
 615 $chars{0xc141}=0x00c0; # capital a with grave accent
 616 $chars{0xc145}=0x00c8; # capital e with grave accent
 617 $chars{0xc149}=0x00cc; # capital i with grave accent
 618 $chars{0xc14f}=0x00d2; # capital o with grave accent
 619 $chars{0xc155}=0x00d9; # capital u with grave accent
 620 $chars{0xc157}=0x1e80; # capital w with grave
 621 $chars{0xc159}=0x1ef2; # capital y with grave
 622 $chars{0xc161}=0x00e0; # small a with grave accent
 623 $chars{0xc165}=0x00e8; # small e with grave accent
 624 $chars{0xc169}=0x00ec; # small i with grave accent
 625 $chars{0xc16f}=0x00f2; # small o with grave accent
 626 $chars{0xc175}=0x00f9; # small u with grave accent
 627 $chars{0xc177}=0x1e81; # small w with grave
 628 $chars{0xc179}=0x1ef3; # small y with grave
 629         # 4/2 acute accent
 630 $chars{0xc241}=0x00c1; # capital a with acute accent
 631 $chars{0xc243}=0x0106; # capital c with acute accent
 632 $chars{0xc245}=0x00c9; # capital e with acute accent
 633 $chars{0xc247}=0x01f4; # capital g with acute
 634 $chars{0xc249}=0x00cd; # capital i with acute accent
 635 $chars{0xc24b}=0x1e30; # capital k with acute
 636 $chars{0xc24c}=0x0139; # capital l with acute accent
 637 $chars{0xc24d}=0x1e3e; # capital m with acute
 638 $chars{0xc24e}=0x0143; # capital n with acute accent
 639 $chars{0xc24f}=0x00d3; # capital o with acute accent
 640 $chars{0xc250}=0x1e54; # capital p with acute
 641 $chars{0xc252}=0x0154; # capital r with acute accent
 642 $chars{0xc253}=0x015a; # capital s with acute accent
 643 $chars{0xc255}=0x00da; # capital u with acute accent
 644 $chars{0xc257}=0x1e82; # capital w with acute
 645 $chars{0xc259}=0x00dd; # capital y with acute accent
 646 $chars{0xc25a}=0x0179; # capital z with acute accent
 647 $chars{0xc261}=0x00e1; # small a with acute accent
 648 $chars{0xc263}=0x0107; # small c with acute accent
 649 $chars{0xc265}=0x00e9; # small e with acute accent
 650 $chars{0xc267}=0x01f5; # small g with acute
 651 $chars{0xc269}=0x00ed; # small i with acute accent
 652 $chars{0xc26b}=0x1e31; # small k with acute
 653 $chars{0xc26c}=0x013a; # small l with acute accent
 654 $chars{0xc26d}=0x1e3f; # small m with acute
 655 $chars{0xc26e}=0x0144; # small n with acute accent
 656 $chars{0xc26f}=0x00f3; # small o with acute accent
 657 $chars{0xc270}=0x1e55; # small p with acute
 658 $chars{0xc272}=0x0155; # small r with acute accent
 659 $chars{0xc273}=0x015b; # small s with acute accent
 660 $chars{0xc275}=0x00fa; # small u with acute accent
 661 $chars{0xc277}=0x1e83; # small w with acute
 662 $chars{0xc279}=0x00fd; # small y with acute accent
 663 $chars{0xc27a}=0x017a; # small z with acute accent
 664 $chars{0xc2e1}=0x01fc; # capital ae with acute
 665 $chars{0xc2f1}=0x01fd; # small ae with acute
 666        # 4/3 circumflex accent
 667 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 668 $chars{0xc343}=0x0108; # capital c with circumflex
 669 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 670 $chars{0xc347}=0x011c; # capital g with circumflex
 671 $chars{0xc348}=0x0124; # capital h with circumflex
 672 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 673 $chars{0xc34a}=0x0134; # capital j with circumflex
 674 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 675 $chars{0xc353}=0x015c; # capital s with circumflex
 676 $chars{0xc355}=0x00db; # capital u with circumflex
 677 $chars{0xc357}=0x0174; # capital w with circumflex
 678 $chars{0xc359}=0x0176; # capital y with circumflex
 679 $chars{0xc35a}=0x1e90; # capital z with circumflex
 680 $chars{0xc361}=0x00e2; # small a with circumflex accent
 681 $chars{0xc363}=0x0109; # small c with circumflex
 682 $chars{0xc365}=0x00ea; # small e with circumflex accent
 683 $chars{0xc367}=0x011d; # small g with circumflex
 684 $chars{0xc368}=0x0125; # small h with circumflex
 685 $chars{0xc369}=0x00ee; # small i with circumflex accent
 686 $chars{0xc36a}=0x0135; # small j with circumflex
 687 $chars{0xc36e}=0x00f1; # small n with tilde
 688 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 689 $chars{0xc373}=0x015d; # small s with circumflex
 690 $chars{0xc375}=0x00fb; # small u with circumflex
 691 $chars{0xc377}=0x0175; # small w with circumflex
 692 $chars{0xc379}=0x0177; # small y with circumflex
 693 $chars{0xc37a}=0x1e91; # small z with circumflex
 694         # 4/4 tilde
 695 $chars{0xc441}=0x00c3; # capital a with tilde
 696 $chars{0xc445}=0x1ebc; # capital e with tilde
 697 $chars{0xc449}=0x0128; # capital i with tilde
 698 $chars{0xc44e}=0x00d1; # capital n with tilde
 699 $chars{0xc44f}=0x00d5; # capital o with tilde
 700 $chars{0xc455}=0x0168; # capital u with tilde
 701 $chars{0xc456}=0x1e7c; # capital v with tilde
 702 $chars{0xc459}=0x1ef8; # capital y with tilde
 703 $chars{0xc461}=0x00e3; # small a with tilde
 704 $chars{0xc465}=0x1ebd; # small e with tilde
 705 $chars{0xc469}=0x0129; # small i with tilde
 706 $chars{0xc46e}=0x00f1; # small n with tilde
 707 $chars{0xc46f}=0x00f5; # small o with tilde
 708 $chars{0xc475}=0x0169; # small u with tilde
 709 $chars{0xc476}=0x1e7d; # small v with tilde
 710 $chars{0xc479}=0x1ef9; # small y with tilde
 711     # 4/5 macron
 712 $chars{0xc541}=0x0100; # capital a with macron
 713 $chars{0xc545}=0x0112; # capital e with macron
 714 $chars{0xc547}=0x1e20; # capital g with macron
 715 $chars{0xc549}=0x012a; # capital i with macron
 716 $chars{0xc54f}=0x014c; # capital o with macron
 717 $chars{0xc555}=0x016a; # capital u with macron
 718 $chars{0xc561}=0x0101; # small a with macron
 719 $chars{0xc565}=0x0113; # small e with macron
 720 $chars{0xc567}=0x1e21; # small g with macron
 721 $chars{0xc569}=0x012b; # small i with macron
 722 $chars{0xc56f}=0x014d; # small o with macron
 723 $chars{0xc575}=0x016b; # small u with macron
 724 $chars{0xc572}=0x0159; # small r with macron
 725 $chars{0xc5e1}=0x01e2; # capital ae with macron
 726 $chars{0xc5f1}=0x01e3; # small ae with macron
 727         # 4/6 breve
 728 $chars{0xc641}=0x0102; # capital a with breve
 729 $chars{0xc645}=0x0114; # capital e with breve
 730 $chars{0xc647}=0x011e; # capital g with breve
 731 $chars{0xc649}=0x012c; # capital i with breve
 732 $chars{0xc64f}=0x014e; # capital o with breve
 733 $chars{0xc655}=0x016c; # capital u with breve
 734 $chars{0xc661}=0x0103; # small a with breve
 735 $chars{0xc665}=0x0115; # small e with breve
 736 $chars{0xc667}=0x011f; # small g with breve
 737 $chars{0xc669}=0x012d; # small i with breve
 738 $chars{0xc66f}=0x014f; # small o with breve
 739 $chars{0xc675}=0x016d; # small u with breve
 740         # 4/7 dot above
 741 $chars{0xc7b0}=0x01e1; # Ain with dot above
 742 $chars{0xc742}=0x1e02; # capital b with dot above
 743 $chars{0xc743}=0x010a; # capital c with dot above
 744 $chars{0xc744}=0x1e0a; # capital d with dot above
 745 $chars{0xc745}=0x0116; # capital e with dot above
 746 $chars{0xc746}=0x1e1e; # capital f with dot above
 747 $chars{0xc747}=0x0120; # capital g with dot above
 748 $chars{0xc748}=0x1e22; # capital h with dot above
 749 $chars{0xc749}=0x0130; # capital i with dot above
 750 $chars{0xc74d}=0x1e40; # capital m with dot above
 751 $chars{0xc74e}=0x1e44; # capital n with dot above
 752 $chars{0xc750}=0x1e56; # capital p with dot above
 753 $chars{0xc752}=0x1e58; # capital r with dot above
 754 $chars{0xc753}=0x1e60; # capital s with dot above
 755 $chars{0xc754}=0x1e6a; # capital t with dot above
 756 $chars{0xc757}=0x1e86; # capital w with dot above
 757 $chars{0xc758}=0x1e8a; # capital x with dot above
 758 $chars{0xc759}=0x1e8e; # capital y with dot above
 759 $chars{0xc75a}=0x017b; # capital z with dot above
 760 $chars{0xc761}=0x0227; # small b with dot above
 761 $chars{0xc762}=0x1e03; # small b with dot above
 762 $chars{0xc763}=0x010b; # small c with dot above
 763 $chars{0xc764}=0x1e0b; # small d with dot above
 764 $chars{0xc765}=0x0117; # small e with dot above
 765 $chars{0xc766}=0x1e1f; # small f with dot above
 766 $chars{0xc767}=0x0121; # small g with dot above
 767 $chars{0xc768}=0x1e23; # small h with dot above
 768 $chars{0xc76d}=0x1e41; # small m with dot above
 769 $chars{0xc76e}=0x1e45; # small n with dot above
 770 $chars{0xc770}=0x1e57; # small p with dot above
 771 $chars{0xc772}=0x1e59; # small r with dot above
 772 $chars{0xc773}=0x1e61; # small s with dot above
 773 $chars{0xc774}=0x1e6b; # small t with dot above
 774 $chars{0xc777}=0x1e87; # small w with dot above
 775 $chars{0xc778}=0x1e8b; # small x with dot above
 776 $chars{0xc779}=0x1e8f; # small y with dot above
 777 $chars{0xc77a}=0x017c; # small z with dot above
 778         # 4/8 trema, diaresis
 779 $chars{0xc820}=0x00a8; # diaeresis
 780 $chars{0xc841}=0x00c4; # capital a with diaeresis
 781 $chars{0xc845}=0x00cb; # capital e with diaeresis
 782 $chars{0xc848}=0x1e26; # capital h with diaeresis
 783 $chars{0xc849}=0x00cf; # capital i with diaeresis
 784 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 785 $chars{0xc855}=0x00dc; # capital u with diaeresis
 786 $chars{0xc857}=0x1e84; # capital w with diaeresis
 787 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 788 $chars{0xc859}=0x0178; # capital y with diaeresis
 789 $chars{0xc861}=0x00e4; # small a with diaeresis
 790 $chars{0xc865}=0x00eb; # small e with diaeresis
 791 $chars{0xc868}=0x1e27; # small h with diaeresis
 792 $chars{0xc869}=0x00ef; # small i with diaeresis
 793 $chars{0xc86f}=0x00f6; # small o with diaeresis
 794 $chars{0xc874}=0x1e97; # small t with diaeresis
 795 $chars{0xc875}=0x00fc; # small u with diaeresis
 796 $chars{0xc877}=0x1e85; # small w with diaeresis
 797 $chars{0xc878}=0x1e8d; # small x with diaeresis
 798 $chars{0xc879}=0x00ff; # small y with diaeresis
 799         # 4/9 umlaut
 800 $chars{0xc920}=0x00a8; # [diaeresis]
 801 $chars{0xc961}=0x00e4; # a with umlaut
 802 $chars{0xc965}=0x00eb; # e with umlaut
 803 $chars{0xc969}=0x00ef; # i with umlaut
 804 $chars{0xc96f}=0x00f6; # o with umlaut
 805 $chars{0xc975}=0x00fc; # u with umlaut
 806         # 4/10 circle above
 807 $chars{0xca41}=0x00c5; # capital a with ring above
 808 $chars{0xcaad}=0x016e; # capital u with ring above
 809 $chars{0xca61}=0x00e5; # small a with ring above
 810 $chars{0xca75}=0x016f; # small u with ring above
 811 $chars{0xca77}=0x1e98; # small w with ring above
 812 $chars{0xca79}=0x1e99; # small y with ring above
 813         # 4/11 high comma off centre
 814         # 4/12 inverted high comma centred
 815         # 4/13 double acute accent
 816 $chars{0xcd4f}=0x0150; # capital o with double acute
 817 $chars{0xcd55}=0x0170; # capital u with double acute
 818 $chars{0xcd6f}=0x0151; # small o with double acute
 819 $chars{0xcd75}=0x0171; # small u with double acute
 820         # 4/14 horn
 821 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 822 $chars{0xce55}=0x01af; # latin capital letter u with horn
 823 $chars{0xce74}=0x01a1; # latin small letter o with horn
 824 $chars{0xce75}=0x01b0; # latin small letter u with horn
 825         # 4/15 caron (hacek
 826 $chars{0xcf41}=0x01cd; # capital a with caron
 827 $chars{0xcf43}=0x010c; # capital c with caron
 828 $chars{0xcf44}=0x010e; # capital d with caron
 829 $chars{0xcf45}=0x011a; # capital e with caron
 830 $chars{0xcf47}=0x01e6; # capital g with caron
 831 $chars{0xcf49}=0x01cf; # capital i with caron
 832 $chars{0xcf4b}=0x01e8; # capital k with caron
 833 $chars{0xcf4c}=0x013d; # capital l with caron
 834 $chars{0xcf4e}=0x0147; # capital n with caron
 835 $chars{0xcf4f}=0x01d1; # capital o with caron
 836 $chars{0xcf52}=0x0158; # capital r with caron
 837 $chars{0xcf53}=0x0160; # capital s with caron
 838 $chars{0xcf54}=0x0164; # capital t with caron
 839 $chars{0xcf55}=0x01d3; # capital u with caron
 840 $chars{0xcf5a}=0x017d; # capital z with caron
 841 $chars{0xcf61}=0x01ce; # small a with caron
 842 $chars{0xcf63}=0x010d; # small c with caron
 843 $chars{0xcf64}=0x010f; # small d with caron
 844 $chars{0xcf65}=0x011b; # small e with caron
 845 $chars{0xcf67}=0x01e7; # small g with caron
 846 $chars{0xcf69}=0x01d0; # small i with caron
 847 $chars{0xcf6a}=0x01f0; # small j with caron
 848 $chars{0xcf6b}=0x01e9; # small k with caron
 849 $chars{0xcf6c}=0x013e; # small l with caron
 850 $chars{0xcf6e}=0x0148; # small n with caron
 851 $chars{0xcf6f}=0x01d2; # small o with caron
 852 $chars{0xcf72}=0x0159; # small r with caron
 853 $chars{0xcf73}=0x0161; # small s with caron
 854 $chars{0xcf74}=0x0165; # small t with caron
 855 $chars{0xcf75}=0x01d4; # small u with caron
 856 $chars{0xcf7a}=0x017e; # small z with caron
 857         # 5/0 cedilla
 858 $chars{0xd020}=0x00b8; # cedilla
 859 $chars{0xd043}=0x00c7; # capital c with cedilla
 860 $chars{0xd044}=0x1e10; # capital d with cedilla
 861 $chars{0xd047}=0x0122; # capital g with cedilla
 862 $chars{0xd048}=0x1e28; # capital h with cedilla
 863 $chars{0xd04b}=0x0136; # capital k with cedilla
 864 $chars{0xd04c}=0x013b; # capital l with cedilla
 865 $chars{0xd04e}=0x0145; # capital n with cedilla
 866 $chars{0xd052}=0x0156; # capital r with cedilla
 867 $chars{0xd053}=0x015e; # capital s with cedilla
 868 $chars{0xd054}=0x0162; # capital t with cedilla
 869 $chars{0xd063}=0x00e7; # small c with cedilla
 870 $chars{0xd064}=0x1e11; # small d with cedilla
 871 $chars{0xd065}=0x0119; # small e with cedilla
 872 $chars{0xd067}=0x0123; # small g with cedilla
 873 $chars{0xd068}=0x1e29; # small h with cedilla
 874 $chars{0xd06b}=0x0137; # small k with cedilla
 875 $chars{0xd06c}=0x013c; # small l with cedilla
 876 $chars{0xd06e}=0x0146; # small n with cedilla
 877 $chars{0xd072}=0x0157; # small r with cedilla
 878 $chars{0xd073}=0x015f; # small s with cedilla
 879 $chars{0xd074}=0x0163; # small t with cedilla
 880         # 5/1 rude
 881         # 5/2 hook to left
 882         # 5/3 ogonek (hook to right
 883 $chars{0xd320}=0x02db; # ogonek
 884 $chars{0xd341}=0x0104; # capital a with ogonek
 885 $chars{0xd345}=0x0118; # capital e with ogonek
 886 $chars{0xd349}=0x012e; # capital i with ogonek
 887 $chars{0xd34f}=0x01ea; # capital o with ogonek
 888 $chars{0xd355}=0x0172; # capital u with ogonek
 889 $chars{0xd361}=0x0105; # small a with ogonek
 890 $chars{0xd365}=0x0119; # small e with ogonek
 891 $chars{0xd369}=0x012f; # small i with ogonek
 892 $chars{0xd36f}=0x01eb; # small o with ogonek
 893 $chars{0xd375}=0x0173; # small u with ogonek
 894         # 5/4 circle below
 895 $chars{0xd441}=0x1e00; # capital a with ring below
 896 $chars{0xd461}=0x1e01; # small a with ring below
 897         # 5/5 half circle below
 898 $chars{0xf948}=0x1e2a; # capital h with breve below
 899 $chars{0xf968}=0x1e2b; # small h with breve below
 900         # 5/6 dot below
 901 $chars{0xd641}=0x1ea0; # capital a with dot below
 902 $chars{0xd642}=0x1e04; # capital b with dot below
 903 $chars{0xd644}=0x1e0c; # capital d with dot below
 904 $chars{0xd645}=0x1eb8; # capital e with dot below
 905 $chars{0xd648}=0x1e24; # capital h with dot below
 906 $chars{0xd649}=0x1eca; # capital i with dot below
 907 $chars{0xd64b}=0x1e32; # capital k with dot below
 908 $chars{0xd64c}=0x1e36; # capital l with dot below
 909 $chars{0xd64d}=0x1e42; # capital m with dot below
 910 $chars{0xd64e}=0x1e46; # capital n with dot below
 911 $chars{0xd64f}=0x1ecc; # capital o with dot below
 912 $chars{0xd652}=0x1e5a; # capital r with dot below
 913 $chars{0xd653}=0x1e62; # capital s with dot below
 914 $chars{0xd654}=0x1e6c; # capital t with dot below
 915 $chars{0xd655}=0x1ee4; # capital u with dot below
 916 $chars{0xd656}=0x1e7e; # capital v with dot below
 917 $chars{0xd657}=0x1e88; # capital w with dot below
 918 $chars{0xd659}=0x1ef4; # capital y with dot below
 919 $chars{0xd65a}=0x1e92; # capital z with dot below
 920 $chars{0xd661}=0x1ea1; # small a with dot below
 921 $chars{0xd662}=0x1e05; # small b with dot below
 922 $chars{0xd664}=0x1e0d; # small d with dot below
 923 $chars{0xd665}=0x1eb9; # small e with dot below
 924 $chars{0xd668}=0x1e25; # small h with dot below
 925 $chars{0xd669}=0x1ecb; # small i with dot below
 926 $chars{0xd66b}=0x1e33; # small k with dot below
 927 $chars{0xd66c}=0x1e37; # small l with dot below
 928 $chars{0xd66d}=0x1e43; # small m with dot below
 929 $chars{0xd66e}=0x1e47; # small n with dot below
 930 $chars{0xd66f}=0x1ecd; # small o with dot below
 931 $chars{0xd672}=0x1e5b; # small r with dot below
 932 $chars{0xd673}=0x1e63; # small s with dot below
 933 $chars{0xd674}=0x1e6d; # small t with dot below
 934 $chars{0xd675}=0x1ee5; # small u with dot below
 935 $chars{0xd676}=0x1e7f; # small v with dot below
 936 $chars{0xd677}=0x1e89; # small w with dot below
 937 $chars{0xd679}=0x1ef5; # small y with dot below
 938 $chars{0xd67a}=0x1e93; # small z with dot below
 939         # 5/7 double dot below
 940 $chars{0xd755}=0x1e72; # capital u with diaeresis below
 941 $chars{0xd775}=0x1e73; # small u with diaeresis below
 942         # 5/8 underline
 943 $chars{0xd820}=0x005f; # underline
 944         # 5/9 double underline
 945 $chars{0xd920}=0x2017; # double underline
 946         # 5/10 small low vertical bar
 947 $chars{0xda20}=0x02cc; #
 948         # 5/11 circumflex below
 949         # 5/12 (this position shall not be used)
 950         # 5/13 left half of ligature sign and of double tilde
 951         # 5/14 right half of ligature sign
 952         # 5/15 right half of double tilde
 953 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
 954     my @data = unpack("C*", $string);
 955     my @characters;
 956     my $length=scalar(@data);
 957     for (my $i = 0; $i < scalar(@data); $i++) {
 958       my $char= $data[$i];
 959       if ($char >= 0x00 && $char <= 0x7F){
 960         #IsAscii
 961
 962           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
 963       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
 964         #Combined Char
 965         my $convchar ;
 966         if ($chars{$char*256+$data[$i+1]}) {
 967           $convchar= $chars{$char * 256 + $data[$i+1]};
 968           $i++;
 969 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
 970         } elsif ($chars{$char})  {
 971           $convchar= $chars{$char};
 972 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
 973         }else {
 974           $convchar=$char;
 975         }
 976         push @characters,$convchar;
 977       } else {
 978         my $convchar;
 979         if ($chars{$char})  {
 980           $convchar= $chars{$char};
 981 #            printf "char %x,  converted %x\n",$char,$chars{$char};
 982         }else {
 983 #            printf "char %x $char\n",$char;
 984           $convchar=$char;
 985         }
 986         push @characters,$convchar;
 987       }
 988     }
 989     $result=pack "U*",@characters;
 990 #     $result=~s/\x01//;
 991 #     $result=~s/\x00//;
 992      $result=~s/\x0f//;
 993      $result=~s/\x1b.//;
 994      $result=~s/\x0e//;
 995      $result=~s/\x1b\x5b//;
 996 #   map{printf "%x",$_} @characters;
 997 #   printf "\n";
 998   return $result;
 999 }
1000
1001 1;
1002
1003
1004 =head1 AUTHOR
1005
1006 Koha Development Team <info@koha.org>
1007
1008 Galen Charlton <galen.charlton@liblime.com>
1009
1010 =cut