C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use MARC::Charset qw/marc8_to_utf8/;
  22 use Text::Iconv;
  23
  24 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  25
  26 BEGIN {
  27     # set the version for version checking
  28     $VERSION = 3.00;
  29     require Exporter;
  30     @ISA    = qw(Exporter);
  31     @EXPORT = qw(
  32         IsStringUTF8ish
  33         MarcToUTF8Record
  34         SetMarcUnicodeFlag
  35     );
  36 }
  37
  38 =head1 NAME
  39
  40 C4::Charset - utilities for handling character set conversions.
  41
  42 =head1 SYNOPSIS
  43
  44 use C4::Charset;
  45
  46 =head1 DESCRIPTION
  47
  48 This module contains routines for dealing with character set
  49 conversions, particularly for MARC records.
  50
  51 A variety of character encodings are in use by various MARC
  52 standards, and even more character encodings are used by
  53 non-standard MARC records.  The various MARC formats generally
  54 do not do a good job of advertising a given record's character
  55 encoding, and even when a record does advertise its encoding,
  56 e.g., via the Leader/09, experience has shown that one cannot
  57 trust it.
  58
  59 Ultimately, all MARC records are stored in Koha in UTF-8 and
  60 must be converted from whatever the source character encoding is.
  61 The goal of this module is to ensure that these conversions
  62 take place accurately.  When a character conversion cannot take
  63 place, or at least not accurately, the module was provide
  64 enough information to allow user-facing code to inform the user
  65 on how to deal with the situation.
  66
  67 =cut
  68
  69 =head1 FUNCTIONS
  70
  71 =head2 IsStringUTF8ish
  72
  73 =over 4
  74
  75 my $is_utf8 = IsStringUTF8ish($str);
  76
  77 =back
  78
  79 Determines if C<$str> is valid UTF-8.  This can mean
  80 one of two things:
  81
  82 =over 2
  83
  84 =item *
  85
  86 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  87
  88 =item *
  89
  90 The Perl UTF-8 flag is B<not> set, but the octets contain
  91 valid UTF-8.
  92
  93 =back
  94
  95 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  96 because in one could be presented with a MARC blob that is
  97 not actually in UTF-8 but whose sequence of octets appears to be
  98 valid UTF-8.  The rest of the MARC character conversion functions
  99 will assume that this situation occur does not very often.
 100
 101 =cut
 102
 103 sub IsStringUTF8ish {
 104     my $str = shift;
 105
 106     return 1 if utf8::is_utf8($str);
 107     return utf8::decode($str);
 108 }
 109
 110 =head2 MarcToUTF8Record
 111
 112 =over 4
 113
 114 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
 115
 116 =back
 117
 118 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 119 optional source encoding, return a C<MARC::Record> that is
 120 converted to UTF-8.
 121
 122 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 123 is not guaranteed to have been converted correctly.  Specifically,
 124 if C<$converted_from> is 'failed', the MARC record returned failed
 125 character conversion and had each of its non-ASCII octets changed
 126 to the Unicode replacement character.
 127
 128 If the source encoding was not specified, this routine will
 129 try to guess it; the character encoding used for a successful
 130 conversion is returned in C<$converted_from>.
 131
 132 =cut
 133
 134 sub MarcToUTF8Record {
 135     my $marc = shift;
 136     my $marc_flavour = shift;
 137     my $source_encoding = shift;
 138
 139     my $marc_record;
 140     my $marc_blob_is_utf8 = 0;
 141     if (ref($marc) eq 'MARC::Record') {
 142         my $marc_blob = $marc->as_usmarc();
 143         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 144         $marc_record = $marc;
 145     } else {
 146         # dealing with a MARC blob
 147
 148         # remove any ersatz whitespace from the beginning and
 149         # end of the MARC blob -- these can creep into MARC
 150         # files produced by several sources -- caller really
 151         # should be doing this, however
 152         $marc =~ s/^\s+//;
 153         $marc =~ s/\s+$//;
 154         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 155         $marc_record = MARC::Record->new_from_usmarc($marc);
 156     }
 157
 158     # If we do not know the source encoding, try some guesses
 159     # as follows:
 160     #   1. Record is UTF-8 already.
 161     #   2. If MARC flavor is MARC21, then
 162     #      a. record is MARC-8
 163     #      b. record is ISO-8859-1
 164     #   3. If MARC flavor is UNIMARC, then
 165     if (not defined $source_encoding) {
 166         if ($marc_blob_is_utf8) {
 167             # note that for MARC21 we are not bothering to check
 168             # if the Leader/09 is set to 'a' or not -- because
 169             # of problems with various ILSs (including Koha in the
 170             # past, alas), this just is not trustworthy.
 171             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 172             return $marc_record, 'UTF-8', [];
 173         } else {
 174             if ($marc_flavour eq 'MARC21') {
 175                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 176             } elsif ($marc_flavour eq 'UNIMARC') {
 177                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 178             } else {
 179                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 180             }
 181         }
 182     } else {
 183         # caller knows the character encoding
 184         my $original_marc_record = $marc_record->clone();
 185         my @errors;
 186         if ($source_encoding =~ /utf-?8/i) {
 187             if ($marc_blob_is_utf8) {
 188                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 189                 return $marc_record, 'UTF-8', [];
 190             } else {
 191                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 192             }
 193         } elsif ($source_encoding =~ /marc-?8/i) {
 194             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 195         } elsif ($source_encoding =~ /5426/) {
 196         } else {
 197             # assume any other character encoding is for Text::Iconv
 198             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
 199         }
 200
 201         if (@errors) {
 202             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 203             return $original_marc_record, 'failed', \@errors;
 204         } else {
 205             return $marc_record, $source_encoding, [];
 206         }
 207     }
 208
 209 }
 210
 211 =head2 SetMarcUnicodeFlag
 212
 213 =over 4
 214
 215 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 216
 217 =back
 218
 219 Set both the internal MARC::Record encoding flag
 220 and the appropriate Leader/09 (MARC21) or
 221 100/26-29 (UNIMARC) to indicate that the record
 222 is in UTF-8.  Note that this does B<not> do
 223 any actual character conversion.
 224
 225 =cut
 226
 227 sub SetMarcUnicodeFlag {
 228     my $marc_record = shift;
 229     my $marc_flavour = shift;
 230
 231     $marc_record->encoding('UTF-8');
 232     if ($marc_flavour eq 'MARC21') {
 233         my $leader = $marc_record->leader();
 234         substr($leader, 9, 1) = 'a';
 235         $marc_record->leader($leader);
 236     } elsif ($marc_flavour eq "UNIMARC") {
 237         if (my $field = $marc_record->fields('100')) {
 238             my $sfa = $field->subfield('a');
 239             substr($sfa, 26, 4) = '5050';
 240             $field->update('a' => $sfa);
 241         }
 242     }
 243 }
 244
 245
 246
 247 =head1 INTERNAL FUNCTIONS
 248
 249 =head2 _default_marc21_charconv_to_utf8
 250
 251 =over 4
 252
 253 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 254
 255 =back
 256
 257 Converts a C<MARC::Record> of unknown character set to UTF-8,
 258 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 259 to UTF-8, then a default conversion that replaces each non-ASCII
 260 character with the replacement character.
 261
 262 The C<$guessed_charset> return value contains the character set
 263 that resulted in a conversion to valid UTF-8; note that
 264 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 265 this is 'failed'.
 266
 267 =cut
 268
 269 sub _default_marc21_charconv_to_utf8 {
 270     my $marc_record = shift;
 271     my $marc_flavour = shift;
 272
 273     my $trial_marc8 = $marc_record->clone();
 274     my @all_errors = ();
 275     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 276     unless (@errors) {
 277         return $trial_marc8, 'MARC-8', [];
 278     }
 279     push @all_errors, @errors;
 280
 281     my $trial_8859_1 = $marc_record->clone();
 282     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 283     unless (@errors) {
 284         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 285                                                 # instead if we wanted to report details
 286                                                 # of the failed attempt at MARC-8 => UTF-8
 287     }
 288     push @all_errors, @errors;
 289
 290     my $default_converted = $marc_record->clone();
 291     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 292     return $default_converted, 'failed', \@all_errors;
 293 }
 294
 295 =head2 _default_unimarc_charconv_to_utf8
 296
 297 =over 4
 298
 299 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 300
 301 =back
 302
 303 Converts a C<MARC::Record> of unknown character set to UTF-8,
 304 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 305 to UTF-8, then a default conversion that replaces each non-ASCII
 306 character with the replacement character.
 307
 308 The C<$guessed_charset> return value contains the character set
 309 that resulted in a conversion to valid UTF-8; note that
 310 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 311 this is 'failed'.
 312
 313 =cut
 314
 315 sub _default_unimarc_charconv_to_utf8 {
 316     my $marc_record = shift;
 317     my $marc_flavour = shift;
 318
 319     my $trial_marc8 = $marc_record->clone();
 320     my @all_errors = ();
 321     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 322     unless (@errors) {
 323         return $trial_marc8, 'iso-5426';
 324     }
 325     push @all_errors, @errors;
 326
 327     my $trial_8859_1 = $marc_record->clone();
 328     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 329     unless (@errors) {
 330         return $trial_8859_1, 'iso-8859-1';
 331     }
 332     push @all_errors, @errors;
 333
 334     my $default_converted = $marc_record->clone();
 335     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 336     return $default_converted, 'failed', \@all_errors;
 337 }
 338
 339 =head2 _marc_marc8_to_utf8
 340
 341 =over 4
 342
 343 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 344
 345 =back
 346
 347 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 348 If the conversion fails for some reason, an
 349 appropriate messages will be placed in the returned
 350 C<@errors> array.
 351
 352 =cut
 353
 354 sub _marc_marc8_to_utf8 {
 355     my $marc_record = shift;
 356     my $marc_flavour = shift;
 357
 358     my $prev_ignore = MARC::Charset->ignore_errors();
 359     MARC::Charset->ignore_errors(1);
 360
 361     # trap warnings raised by MARC::Charset
 362     my @errors = ();
 363     local $SIG{__WARN__} = sub {
 364         my $msg = $_[0];
 365         if ($msg =~ /MARC.Charset/) {
 366             # FIXME - purpose of this regexp is to strip out the
 367             # line reference to MARC/Charset.pm, but as it
 368             # exists probably won't work quite on Windows --
 369             # some sort of minimal-bunch back-tracking RE
 370             # would be helpful here
 371             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 372             push @errors, $msg;
 373         } else {
 374             # if warning doesn't come from MARC::Charset, just
 375             # pass it on
 376             warn $msg;
 377         }
 378     };
 379
 380     foreach my $field ($marc_record->fields()) {
 381         if ($field->is_control_field()) {
 382             ; # do nothing -- control fields should not contain non-ASCII characters
 383         } else {
 384             my @converted_subfields;
 385             foreach my $subfield ($field->subfields()) {
 386                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 387                 push @converted_subfields, $subfield->[0], $utf8sf;
 388             }
 389
 390             $field->replace_with(MARC::Field->new(
 391                 $field->tag(), $field->indicator(1), $field->indicator(2),
 392                 @converted_subfields)
 393             );
 394         }
 395     }
 396
 397     MARC::Charset->ignore_errors($prev_ignore);
 398
 399     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 400
 401     return @errors;
 402 }
 403
 404 =head2 _marc_iso5426_to_utf8
 405
 406 =over 4
 407
 408 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 409
 410 =back
 411
 412 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 413 If the conversion fails for some reason, an
 414 appropriate messages will be placed in the returned
 415 C<@errors> array.
 416
 417 FIXME - is ISO-5426 equivalent enough to MARC-8
 418 that C<MARC::Charset> can be used instead?
 419
 420 =cut
 421
 422 sub _marc_iso5426_to_utf8 {
 423     my $marc_record = shift;
 424     my $marc_flavour = shift;
 425
 426     my @errors = ();
 427
 428     foreach my $field ($marc_record->fields()) {
 429         if ($field->is_control_field()) {
 430             ; # do nothing -- control fields should not contain non-ASCII characters
 431         } else {
 432             my @converted_subfields;
 433             foreach my $subfield ($field->subfields()) {
 434                 my $utf8sf = char_decode5426($subfield->[1]);
 435                 push @converted_subfields, $subfield->[0], $utf8sf;
 436             }
 437
 438             $field->replace_with(MARC::Field->new(
 439                 $field->tag(), $field->indicator(1), $field->indicator(2),
 440                 @converted_subfields)
 441             );
 442         }
 443     }
 444
 445     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 446
 447     return @errors;
 448 }
 449
 450 =head2 _marc_to_utf8_via_text_iconv
 451
 452 =over 4
 453
 454 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 455
 456 =back
 457
 458 Convert a C<MARC::Record> to UTF-8 in-place using the
 459 C<Text::Iconv> CPAN module.  Any source encoding accepted
 460 by the user's iconv installation should work.  If
 461 the source encoding is not recognized on the user's
 462 server or the conversion fails for some reason,
 463 appropriate messages will be placed in the returned
 464 C<@errors> array.
 465
 466 =cut
 467
 468 sub _marc_to_utf8_via_text_iconv {
 469     my $marc_record = shift;
 470     my $marc_flavour = shift;
 471     my $source_encoding = shift;
 472
 473     my @errors = ();
 474     my $decoder;
 475     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 476     if ($@) {
 477         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 478         return @errors;
 479     }
 480
 481     my $prev_raise_error = Text::Iconv->raise_error();
 482     Text::Iconv->raise_error(1);
 483
 484     foreach my $field ($marc_record->fields()) {
 485         if ($field->is_control_field()) {
 486             ; # do nothing -- control fields should not contain non-ASCII characters
 487         } else {
 488             my @converted_subfields;
 489             foreach my $subfield ($field->subfields()) {
 490                 my $converted_value;
 491                 my $conversion_ok = 1;
 492                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 493                 if ($@) {
 494                     $conversion_ok = 0;
 495                     push @errors, $@;
 496                 } elsif (not defined $converted_value) {
 497                     $conversion_ok = 0;
 498                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 499                 }
 500
 501                 if ($conversion_ok) {
 502                     push @converted_subfields, $subfield->[0], $converted_value;
 503                 } else {
 504                     $converted_value = $subfield->[1];
 505                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 506                     push @converted_subfields, $subfield->[0], $converted_value;
 507                 }
 508             }
 509
 510             $field->replace_with(MARC::Field->new(
 511                 $field->tag(), $field->indicator(1), $field->indicator(2),
 512                 @converted_subfields)
 513             );
 514         }
 515     }
 516
 517     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 518     Text::Iconv->raise_error($prev_raise_error);
 519
 520     return @errors;
 521 }
 522
 523 =head2 _marc_to_utf8_replacement_char
 524
 525 =over 4
 526
 527 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 528
 529 =back
 530
 531 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 532 unsatisfactory method of replacing all non-ASCII (e.g.,
 533 where the eight bit is set) octet with the Unicode
 534 replacement character.  This is meant as a last-ditch
 535 method, and would be best used as part of a UI that
 536 lets a cataloguer pick various character conversions
 537 until he or she finds the right one.
 538
 539 =cut
 540
 541 sub _marc_to_utf8_replacement_char {
 542     my $marc_record = shift;
 543     my $marc_flavour = shift;
 544
 545     foreach my $field ($marc_record->fields()) {
 546         if ($field->is_control_field()) {
 547             ; # do nothing -- control fields should not contain non-ASCII characters
 548         } else {
 549             my @converted_subfields;
 550             foreach my $subfield ($field->subfields()) {
 551                 my $value = $subfield->[1];
 552                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 553                 push @converted_subfields, $subfield->[0], $value;
 554             }
 555
 556             $field->replace_with(MARC::Field->new(
 557                 $field->tag(), $field->indicator(1), $field->indicator(2),
 558                 @converted_subfields)
 559             );
 560         }
 561     }
 562
 563     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 564 }
 565
 566 =head2 char_decode5426
 567
 568 =over 4
 569
 570 my $utf8string = char_decode5426($iso_5426_string);
 571
 572 =back
 573
 574 Converts a string from ISO-5426 to UTF-8.
 575
 576 =cut
 577
 578 sub char_decode5426 {
 579     my ( $string) = @_;
 580     my $result;
 581 my %chars;
 582 $chars{0xb0}=0x0101;#3/0ayn[ain]
 583 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 584 #$chars{0xb2}=0x00e0;#'à';
 585 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 586 #$chars{0xb3}=0x00e7;#'ç';
 587 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 588 # $chars{0xb4}='è';
 589 $chars{0xb4}=0x00e8;
 590 # $chars{0xb5}='é';
 591 $chars{0xb5}=0x00e9;
 592 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 593 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 594 $chars{0xfa}=0x0153;#oe
 595 $chars{0x81d1}=0x00b0;
 596
 597 ####
 598 ## combined characters iso5426
 599
 600 $chars{0xc041}=0x1ea2; # capital a with hook above
 601 $chars{0xc045}=0x1eba; # capital e with hook above
 602 $chars{0xc049}=0x1ec8; # capital i with hook above
 603 $chars{0xc04f}=0x1ece; # capital o with hook above
 604 $chars{0xc055}=0x1ee6; # capital u with hook above
 605 $chars{0xc059}=0x1ef6; # capital y with hook above
 606 $chars{0xc061}=0x1ea3; # small a with hook above
 607 $chars{0xc065}=0x1ebb; # small e with hook above
 608 $chars{0xc069}=0x1ec9; # small i with hook above
 609 $chars{0xc06f}=0x1ecf; # small o with hook above
 610 $chars{0xc075}=0x1ee7; # small u with hook above
 611 $chars{0xc079}=0x1ef7; # small y with hook above
 612
 613         # 4/1 grave accent
 614 $chars{0xc141}=0x00c0; # capital a with grave accent
 615 $chars{0xc145}=0x00c8; # capital e with grave accent
 616 $chars{0xc149}=0x00cc; # capital i with grave accent
 617 $chars{0xc14f}=0x00d2; # capital o with grave accent
 618 $chars{0xc155}=0x00d9; # capital u with grave accent
 619 $chars{0xc157}=0x1e80; # capital w with grave
 620 $chars{0xc159}=0x1ef2; # capital y with grave
 621 $chars{0xc161}=0x00e0; # small a with grave accent
 622 $chars{0xc165}=0x00e8; # small e with grave accent
 623 $chars{0xc169}=0x00ec; # small i with grave accent
 624 $chars{0xc16f}=0x00f2; # small o with grave accent
 625 $chars{0xc175}=0x00f9; # small u with grave accent
 626 $chars{0xc177}=0x1e81; # small w with grave
 627 $chars{0xc179}=0x1ef3; # small y with grave
 628         # 4/2 acute accent
 629 $chars{0xc241}=0x00c1; # capital a with acute accent
 630 $chars{0xc243}=0x0106; # capital c with acute accent
 631 $chars{0xc245}=0x00c9; # capital e with acute accent
 632 $chars{0xc247}=0x01f4; # capital g with acute
 633 $chars{0xc249}=0x00cd; # capital i with acute accent
 634 $chars{0xc24b}=0x1e30; # capital k with acute
 635 $chars{0xc24c}=0x0139; # capital l with acute accent
 636 $chars{0xc24d}=0x1e3e; # capital m with acute
 637 $chars{0xc24e}=0x0143; # capital n with acute accent
 638 $chars{0xc24f}=0x00d3; # capital o with acute accent
 639 $chars{0xc250}=0x1e54; # capital p with acute
 640 $chars{0xc252}=0x0154; # capital r with acute accent
 641 $chars{0xc253}=0x015a; # capital s with acute accent
 642 $chars{0xc255}=0x00da; # capital u with acute accent
 643 $chars{0xc257}=0x1e82; # capital w with acute
 644 $chars{0xc259}=0x00dd; # capital y with acute accent
 645 $chars{0xc25a}=0x0179; # capital z with acute accent
 646 $chars{0xc261}=0x00e1; # small a with acute accent
 647 $chars{0xc263}=0x0107; # small c with acute accent
 648 $chars{0xc265}=0x00e9; # small e with acute accent
 649 $chars{0xc267}=0x01f5; # small g with acute
 650 $chars{0xc269}=0x00ed; # small i with acute accent
 651 $chars{0xc26b}=0x1e31; # small k with acute
 652 $chars{0xc26c}=0x013a; # small l with acute accent
 653 $chars{0xc26d}=0x1e3f; # small m with acute
 654 $chars{0xc26e}=0x0144; # small n with acute accent
 655 $chars{0xc26f}=0x00f3; # small o with acute accent
 656 $chars{0xc270}=0x1e55; # small p with acute
 657 $chars{0xc272}=0x0155; # small r with acute accent
 658 $chars{0xc273}=0x015b; # small s with acute accent
 659 $chars{0xc275}=0x00fa; # small u with acute accent
 660 $chars{0xc277}=0x1e83; # small w with acute
 661 $chars{0xc279}=0x00fd; # small y with acute accent
 662 $chars{0xc27a}=0x017a; # small z with acute accent
 663 $chars{0xc2e1}=0x01fc; # capital ae with acute
 664 $chars{0xc2f1}=0x01fd; # small ae with acute
 665        # 4/3 circumflex accent
 666 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 667 $chars{0xc343}=0x0108; # capital c with circumflex
 668 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 669 $chars{0xc347}=0x011c; # capital g with circumflex
 670 $chars{0xc348}=0x0124; # capital h with circumflex
 671 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 672 $chars{0xc34a}=0x0134; # capital j with circumflex
 673 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 674 $chars{0xc353}=0x015c; # capital s with circumflex
 675 $chars{0xc355}=0x00db; # capital u with circumflex
 676 $chars{0xc357}=0x0174; # capital w with circumflex
 677 $chars{0xc359}=0x0176; # capital y with circumflex
 678 $chars{0xc35a}=0x1e90; # capital z with circumflex
 679 $chars{0xc361}=0x00e2; # small a with circumflex accent
 680 $chars{0xc363}=0x0109; # small c with circumflex
 681 $chars{0xc365}=0x00ea; # small e with circumflex accent
 682 $chars{0xc367}=0x011d; # small g with circumflex
 683 $chars{0xc368}=0x0125; # small h with circumflex
 684 $chars{0xc369}=0x00ee; # small i with circumflex accent
 685 $chars{0xc36a}=0x0135; # small j with circumflex
 686 $chars{0xc36e}=0x00f1; # small n with tilde
 687 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 688 $chars{0xc373}=0x015d; # small s with circumflex
 689 $chars{0xc375}=0x00fb; # small u with circumflex
 690 $chars{0xc377}=0x0175; # small w with circumflex
 691 $chars{0xc379}=0x0177; # small y with circumflex
 692 $chars{0xc37a}=0x1e91; # small z with circumflex
 693         # 4/4 tilde
 694 $chars{0xc441}=0x00c3; # capital a with tilde
 695 $chars{0xc445}=0x1ebc; # capital e with tilde
 696 $chars{0xc449}=0x0128; # capital i with tilde
 697 $chars{0xc44e}=0x00d1; # capital n with tilde
 698 $chars{0xc44f}=0x00d5; # capital o with tilde
 699 $chars{0xc455}=0x0168; # capital u with tilde
 700 $chars{0xc456}=0x1e7c; # capital v with tilde
 701 $chars{0xc459}=0x1ef8; # capital y with tilde
 702 $chars{0xc461}=0x00e3; # small a with tilde
 703 $chars{0xc465}=0x1ebd; # small e with tilde
 704 $chars{0xc469}=0x0129; # small i with tilde
 705 $chars{0xc46e}=0x00f1; # small n with tilde
 706 $chars{0xc46f}=0x00f5; # small o with tilde
 707 $chars{0xc475}=0x0169; # small u with tilde
 708 $chars{0xc476}=0x1e7d; # small v with tilde
 709 $chars{0xc479}=0x1ef9; # small y with tilde
 710     # 4/5 macron
 711 $chars{0xc541}=0x0100; # capital a with macron
 712 $chars{0xc545}=0x0112; # capital e with macron
 713 $chars{0xc547}=0x1e20; # capital g with macron
 714 $chars{0xc549}=0x012a; # capital i with macron
 715 $chars{0xc54f}=0x014c; # capital o with macron
 716 $chars{0xc555}=0x016a; # capital u with macron
 717 $chars{0xc561}=0x0101; # small a with macron
 718 $chars{0xc565}=0x0113; # small e with macron
 719 $chars{0xc567}=0x1e21; # small g with macron
 720 $chars{0xc569}=0x012b; # small i with macron
 721 $chars{0xc56f}=0x014d; # small o with macron
 722 $chars{0xc575}=0x016b; # small u with macron
 723 $chars{0xc572}=0x0159; # small r with macron
 724 $chars{0xc5e1}=0x01e2; # capital ae with macron
 725 $chars{0xc5f1}=0x01e3; # small ae with macron
 726         # 4/6 breve
 727 $chars{0xc641}=0x0102; # capital a with breve
 728 $chars{0xc645}=0x0114; # capital e with breve
 729 $chars{0xc647}=0x011e; # capital g with breve
 730 $chars{0xc649}=0x012c; # capital i with breve
 731 $chars{0xc64f}=0x014e; # capital o with breve
 732 $chars{0xc655}=0x016c; # capital u with breve
 733 $chars{0xc661}=0x0103; # small a with breve
 734 $chars{0xc665}=0x0115; # small e with breve
 735 $chars{0xc667}=0x011f; # small g with breve
 736 $chars{0xc669}=0x012d; # small i with breve
 737 $chars{0xc66f}=0x014f; # small o with breve
 738 $chars{0xc675}=0x016d; # small u with breve
 739         # 4/7 dot above
 740 $chars{0xc7b0}=0x01e1; # Ain with dot above
 741 $chars{0xc742}=0x1e02; # capital b with dot above
 742 $chars{0xc743}=0x010a; # capital c with dot above
 743 $chars{0xc744}=0x1e0a; # capital d with dot above
 744 $chars{0xc745}=0x0116; # capital e with dot above
 745 $chars{0xc746}=0x1e1e; # capital f with dot above
 746 $chars{0xc747}=0x0120; # capital g with dot above
 747 $chars{0xc748}=0x1e22; # capital h with dot above
 748 $chars{0xc749}=0x0130; # capital i with dot above
 749 $chars{0xc74d}=0x1e40; # capital m with dot above
 750 $chars{0xc74e}=0x1e44; # capital n with dot above
 751 $chars{0xc750}=0x1e56; # capital p with dot above
 752 $chars{0xc752}=0x1e58; # capital r with dot above
 753 $chars{0xc753}=0x1e60; # capital s with dot above
 754 $chars{0xc754}=0x1e6a; # capital t with dot above
 755 $chars{0xc757}=0x1e86; # capital w with dot above
 756 $chars{0xc758}=0x1e8a; # capital x with dot above
 757 $chars{0xc759}=0x1e8e; # capital y with dot above
 758 $chars{0xc75a}=0x017b; # capital z with dot above
 759 $chars{0xc761}=0x0227; # small b with dot above
 760 $chars{0xc762}=0x1e03; # small b with dot above
 761 $chars{0xc763}=0x010b; # small c with dot above
 762 $chars{0xc764}=0x1e0b; # small d with dot above
 763 $chars{0xc765}=0x0117; # small e with dot above
 764 $chars{0xc766}=0x1e1f; # small f with dot above
 765 $chars{0xc767}=0x0121; # small g with dot above
 766 $chars{0xc768}=0x1e23; # small h with dot above
 767 $chars{0xc76d}=0x1e41; # small m with dot above
 768 $chars{0xc76e}=0x1e45; # small n with dot above
 769 $chars{0xc770}=0x1e57; # small p with dot above
 770 $chars{0xc772}=0x1e59; # small r with dot above
 771 $chars{0xc773}=0x1e61; # small s with dot above
 772 $chars{0xc774}=0x1e6b; # small t with dot above
 773 $chars{0xc777}=0x1e87; # small w with dot above
 774 $chars{0xc778}=0x1e8b; # small x with dot above
 775 $chars{0xc779}=0x1e8f; # small y with dot above
 776 $chars{0xc77a}=0x017c; # small z with dot above
 777         # 4/8 trema, diaresis
 778 $chars{0xc820}=0x00a8; # diaeresis
 779 $chars{0xc841}=0x00c4; # capital a with diaeresis
 780 $chars{0xc845}=0x00cb; # capital e with diaeresis
 781 $chars{0xc848}=0x1e26; # capital h with diaeresis
 782 $chars{0xc849}=0x00cf; # capital i with diaeresis
 783 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 784 $chars{0xc855}=0x00dc; # capital u with diaeresis
 785 $chars{0xc857}=0x1e84; # capital w with diaeresis
 786 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 787 $chars{0xc859}=0x0178; # capital y with diaeresis
 788 $chars{0xc861}=0x00e4; # small a with diaeresis
 789 $chars{0xc865}=0x00eb; # small e with diaeresis
 790 $chars{0xc868}=0x1e27; # small h with diaeresis
 791 $chars{0xc869}=0x00ef; # small i with diaeresis
 792 $chars{0xc86f}=0x00f6; # small o with diaeresis
 793 $chars{0xc874}=0x1e97; # small t with diaeresis
 794 $chars{0xc875}=0x00fc; # small u with diaeresis
 795 $chars{0xc877}=0x1e85; # small w with diaeresis
 796 $chars{0xc878}=0x1e8d; # small x with diaeresis
 797 $chars{0xc879}=0x00ff; # small y with diaeresis
 798         # 4/9 umlaut
 799 $chars{0xc920}=0x00a8; # [diaeresis]
 800 $chars{0xc961}=0x00e4; # a with umlaut
 801 $chars{0xc965}=0x00eb; # e with umlaut
 802 $chars{0xc969}=0x00ef; # i with umlaut
 803 $chars{0xc96f}=0x00f6; # o with umlaut
 804 $chars{0xc975}=0x00fc; # u with umlaut
 805         # 4/10 circle above
 806 $chars{0xca41}=0x00c5; # capital a with ring above
 807 $chars{0xcaad}=0x016e; # capital u with ring above
 808 $chars{0xca61}=0x00e5; # small a with ring above
 809 $chars{0xca75}=0x016f; # small u with ring above
 810 $chars{0xca77}=0x1e98; # small w with ring above
 811 $chars{0xca79}=0x1e99; # small y with ring above
 812         # 4/11 high comma off centre
 813         # 4/12 inverted high comma centred
 814         # 4/13 double acute accent
 815 $chars{0xcd4f}=0x0150; # capital o with double acute
 816 $chars{0xcd55}=0x0170; # capital u with double acute
 817 $chars{0xcd6f}=0x0151; # small o with double acute
 818 $chars{0xcd75}=0x0171; # small u with double acute
 819         # 4/14 horn
 820 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 821 $chars{0xce55}=0x01af; # latin capital letter u with horn
 822 $chars{0xce74}=0x01a1; # latin small letter o with horn
 823 $chars{0xce75}=0x01b0; # latin small letter u with horn
 824         # 4/15 caron (hacek
 825 $chars{0xcf41}=0x01cd; # capital a with caron
 826 $chars{0xcf43}=0x010c; # capital c with caron
 827 $chars{0xcf44}=0x010e; # capital d with caron
 828 $chars{0xcf45}=0x011a; # capital e with caron
 829 $chars{0xcf47}=0x01e6; # capital g with caron
 830 $chars{0xcf49}=0x01cf; # capital i with caron
 831 $chars{0xcf4b}=0x01e8; # capital k with caron
 832 $chars{0xcf4c}=0x013d; # capital l with caron
 833 $chars{0xcf4e}=0x0147; # capital n with caron
 834 $chars{0xcf4f}=0x01d1; # capital o with caron
 835 $chars{0xcf52}=0x0158; # capital r with caron
 836 $chars{0xcf53}=0x0160; # capital s with caron
 837 $chars{0xcf54}=0x0164; # capital t with caron
 838 $chars{0xcf55}=0x01d3; # capital u with caron
 839 $chars{0xcf5a}=0x017d; # capital z with caron
 840 $chars{0xcf61}=0x01ce; # small a with caron
 841 $chars{0xcf63}=0x010d; # small c with caron
 842 $chars{0xcf64}=0x010f; # small d with caron
 843 $chars{0xcf65}=0x011b; # small e with caron
 844 $chars{0xcf67}=0x01e7; # small g with caron
 845 $chars{0xcf69}=0x01d0; # small i with caron
 846 $chars{0xcf6a}=0x01f0; # small j with caron
 847 $chars{0xcf6b}=0x01e9; # small k with caron
 848 $chars{0xcf6c}=0x013e; # small l with caron
 849 $chars{0xcf6e}=0x0148; # small n with caron
 850 $chars{0xcf6f}=0x01d2; # small o with caron
 851 $chars{0xcf72}=0x0159; # small r with caron
 852 $chars{0xcf73}=0x0161; # small s with caron
 853 $chars{0xcf74}=0x0165; # small t with caron
 854 $chars{0xcf75}=0x01d4; # small u with caron
 855 $chars{0xcf7a}=0x017e; # small z with caron
 856         # 5/0 cedilla
 857 $chars{0xd020}=0x00b8; # cedilla
 858 $chars{0xd043}=0x00c7; # capital c with cedilla
 859 $chars{0xd044}=0x1e10; # capital d with cedilla
 860 $chars{0xd047}=0x0122; # capital g with cedilla
 861 $chars{0xd048}=0x1e28; # capital h with cedilla
 862 $chars{0xd04b}=0x0136; # capital k with cedilla
 863 $chars{0xd04c}=0x013b; # capital l with cedilla
 864 $chars{0xd04e}=0x0145; # capital n with cedilla
 865 $chars{0xd052}=0x0156; # capital r with cedilla
 866 $chars{0xd053}=0x015e; # capital s with cedilla
 867 $chars{0xd054}=0x0162; # capital t with cedilla
 868 $chars{0xd063}=0x00e7; # small c with cedilla
 869 $chars{0xd064}=0x1e11; # small d with cedilla
 870 $chars{0xd065}=0x0119; # small e with cedilla
 871 $chars{0xd067}=0x0123; # small g with cedilla
 872 $chars{0xd068}=0x1e29; # small h with cedilla
 873 $chars{0xd06b}=0x0137; # small k with cedilla
 874 $chars{0xd06c}=0x013c; # small l with cedilla
 875 $chars{0xd06e}=0x0146; # small n with cedilla
 876 $chars{0xd072}=0x0157; # small r with cedilla
 877 $chars{0xd073}=0x015f; # small s with cedilla
 878 $chars{0xd074}=0x0163; # small t with cedilla
 879         # 5/1 rude
 880         # 5/2 hook to left
 881         # 5/3 ogonek (hook to right
 882 $chars{0xd320}=0x02db; # ogonek
 883 $chars{0xd341}=0x0104; # capital a with ogonek
 884 $chars{0xd345}=0x0118; # capital e with ogonek
 885 $chars{0xd349}=0x012e; # capital i with ogonek
 886 $chars{0xd34f}=0x01ea; # capital o with ogonek
 887 $chars{0xd355}=0x0172; # capital u with ogonek
 888 $chars{0xd361}=0x0105; # small a with ogonek
 889 $chars{0xd365}=0x0119; # small e with ogonek
 890 $chars{0xd369}=0x012f; # small i with ogonek
 891 $chars{0xd36f}=0x01eb; # small o with ogonek
 892 $chars{0xd375}=0x0173; # small u with ogonek
 893         # 5/4 circle below
 894 $chars{0xd441}=0x1e00; # capital a with ring below
 895 $chars{0xd461}=0x1e01; # small a with ring below
 896         # 5/5 half circle below
 897 $chars{0xf948}=0x1e2a; # capital h with breve below
 898 $chars{0xf968}=0x1e2b; # small h with breve below
 899         # 5/6 dot below
 900 $chars{0xd641}=0x1ea0; # capital a with dot below
 901 $chars{0xd642}=0x1e04; # capital b with dot below
 902 $chars{0xd644}=0x1e0c; # capital d with dot below
 903 $chars{0xd645}=0x1eb8; # capital e with dot below
 904 $chars{0xd648}=0x1e24; # capital h with dot below
 905 $chars{0xd649}=0x1eca; # capital i with dot below
 906 $chars{0xd64b}=0x1e32; # capital k with dot below
 907 $chars{0xd64c}=0x1e36; # capital l with dot below
 908 $chars{0xd64d}=0x1e42; # capital m with dot below
 909 $chars{0xd64e}=0x1e46; # capital n with dot below
 910 $chars{0xd64f}=0x1ecc; # capital o with dot below
 911 $chars{0xd652}=0x1e5a; # capital r with dot below
 912 $chars{0xd653}=0x1e62; # capital s with dot below
 913 $chars{0xd654}=0x1e6c; # capital t with dot below
 914 $chars{0xd655}=0x1ee4; # capital u with dot below
 915 $chars{0xd656}=0x1e7e; # capital v with dot below
 916 $chars{0xd657}=0x1e88; # capital w with dot below
 917 $chars{0xd659}=0x1ef4; # capital y with dot below
 918 $chars{0xd65a}=0x1e92; # capital z with dot below
 919 $chars{0xd661}=0x1ea1; # small a with dot below
 920 $chars{0xd662}=0x1e05; # small b with dot below
 921 $chars{0xd664}=0x1e0d; # small d with dot below
 922 $chars{0xd665}=0x1eb9; # small e with dot below
 923 $chars{0xd668}=0x1e25; # small h with dot below
 924 $chars{0xd669}=0x1ecb; # small i with dot below
 925 $chars{0xd66b}=0x1e33; # small k with dot below
 926 $chars{0xd66c}=0x1e37; # small l with dot below
 927 $chars{0xd66d}=0x1e43; # small m with dot below
 928 $chars{0xd66e}=0x1e47; # small n with dot below
 929 $chars{0xd66f}=0x1ecd; # small o with dot below
 930 $chars{0xd672}=0x1e5b; # small r with dot below
 931 $chars{0xd673}=0x1e63; # small s with dot below
 932 $chars{0xd674}=0x1e6d; # small t with dot below
 933 $chars{0xd675}=0x1ee5; # small u with dot below
 934 $chars{0xd676}=0x1e7f; # small v with dot below
 935 $chars{0xd677}=0x1e89; # small w with dot below
 936 $chars{0xd679}=0x1ef5; # small y with dot below
 937 $chars{0xd67a}=0x1e93; # small z with dot below
 938         # 5/7 double dot below
 939 $chars{0xd755}=0x1e72; # capital u with diaeresis below
 940 $chars{0xd775}=0x1e73; # small u with diaeresis below
 941         # 5/8 underline
 942 $chars{0xd820}=0x005f; # underline
 943         # 5/9 double underline
 944 $chars{0xd920}=0x2017; # double underline
 945         # 5/10 small low vertical bar
 946 $chars{0xda20}=0x02cc; #
 947         # 5/11 circumflex below
 948         # 5/12 (this position shall not be used)
 949         # 5/13 left half of ligature sign and of double tilde
 950         # 5/14 right half of ligature sign
 951         # 5/15 right half of double tilde
 952 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
 953     my @data = unpack("C*", $string);
 954     my @characters;
 955     my $length=scalar(@data);
 956     for (my $i = 0; $i < scalar(@data); $i++) {
 957       my $char= $data[$i];
 958       if ($char >= 0x00 && $char <= 0x7F){
 959         #IsAscii
 960
 961           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
 962       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
 963         #Combined Char
 964         my $convchar ;
 965         if ($chars{$char*256+$data[$i+1]}) {
 966           $convchar= $chars{$char * 256 + $data[$i+1]};
 967           $i++;
 968 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
 969         } elsif ($chars{$char})  {
 970           $convchar= $chars{$char};
 971 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
 972         }else {
 973           $convchar=$char;
 974         }
 975         push @characters,$convchar;
 976       } else {
 977         my $convchar;
 978         if ($chars{$char})  {
 979           $convchar= $chars{$char};
 980 #            printf "char %x,  converted %x\n",$char,$chars{$char};
 981         }else {
 982 #            printf "char %x $char\n",$char;
 983           $convchar=$char;
 984         }
 985         push @characters,$convchar;
 986       }
 987     }
 988     $result=pack "U*",@characters;
 989 #     $result=~s/\x01//;
 990 #     $result=~s/\x00//;
 991      $result=~s/\x0f//;
 992      $result=~s/\x1b.//;
 993      $result=~s/\x0e//;
 994      $result=~s/\x1b\x5b//;
 995 #   map{printf "%x",$_} @characters;
 996 #   printf "\n";
 997   return $result;
 998 }
 999
1000 1;
1001
1002
1003 =head1 AUTHOR
1004
1005 Koha Development Team <info@koha.org>
1006
1007 Galen Charlton <galen.charlton@liblime.com>
1008
1009 =cut