C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27 use Encode qw( decode encode is_utf8 );
  28
  29 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  30
  31 BEGIN {
  32     # set the version for version checking
  33     $VERSION = 3.07.00.049;
  34     require Exporter;
  35     @ISA    = qw(Exporter);
  36     @EXPORT = qw(
  37         NormalizeString
  38         IsStringUTF8ish
  39         MarcToUTF8Record
  40         SetUTF8Flag
  41         SetMarcUnicodeFlag
  42         StripNonXmlChars
  43         nsb_clean
  44         SanitizeRecord
  45     );
  46 }
  47
  48 =encoding UTF-8
  49
  50 =head1 NAME
  51
  52 C4::Charset - utilities for handling character set conversions.
  53
  54 =head1 SYNOPSIS
  55
  56   use C4::Charset;
  57
  58 =head1 DESCRIPTION
  59
  60 This module contains routines for dealing with character set
  61 conversions, particularly for MARC records.
  62
  63 A variety of character encodings are in use by various MARC
  64 standards, and even more character encodings are used by
  65 non-standard MARC records.  The various MARC formats generally
  66 do not do a good job of advertising a given record's character
  67 encoding, and even when a record does advertise its encoding,
  68 e.g., via the Leader/09, experience has shown that one cannot
  69 trust it.
  70
  71 Ultimately, all MARC records are stored in Koha in UTF-8 and
  72 must be converted from whatever the source character encoding is.
  73 The goal of this module is to ensure that these conversions
  74 take place accurately.  When a character conversion cannot take
  75 place, or at least not accurately, the module was provide
  76 enough information to allow user-facing code to inform the user
  77 on how to deal with the situation.
  78
  79 =cut
  80
  81 =head1 FUNCTIONS
  82
  83 =head2 IsStringUTF8ish
  84
  85   my $is_utf8 = IsStringUTF8ish($str);
  86
  87 Determines if C<$str> is valid UTF-8.  This can mean
  88 one of two things:
  89
  90 =over
  91
  92 =item *
  93
  94 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  95
  96 =item *
  97
  98 The Perl UTF-8 flag is B<not> set, but the octets contain
  99 valid UTF-8.
 100
 101 =back
 102
 103 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 104 because in one could be presented with a MARC blob that is
 105 not actually in UTF-8 but whose sequence of octets appears to be
 106 valid UTF-8.  The rest of the MARC character conversion functions
 107 will assume that this situation occur does not very often.
 108
 109 =cut
 110
 111 sub IsStringUTF8ish {
 112     my $str = shift;
 113
 114     return 1 if Encode::is_utf8($str);
 115     return utf8::decode( $str );
 116 }
 117
 118 =head2 SetUTF8Flag
 119
 120   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 121
 122 This function sets the PERL UTF8 flag for data.
 123 It is required when using new_from_usmarc
 124 since MARC::File::USMARC does not handle PERL UTF8 setting.
 125 When editing unicode marc records fields and subfields, you
 126 would end up in double encoding without using this function.
 127
 128 If $nfd is set, string normalization will use NFD instead of NFC
 129
 130 FIXME
 131 In my opinion, this function belongs to MARC::Record and not
 132 to this package.
 133 But since it handles charset, and MARC::Record, it finds its way in that package
 134
 135 =cut
 136
 137 sub SetUTF8Flag{
 138     my ($record, $nfd)=@_;
 139     return unless ($record && $record->fields());
 140     foreach my $field ($record->fields()){
 141         if ($field->tag()>=10){
 142             my @subfields;
 143             foreach my $subfield ($field->subfields()){
 144                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 145             }
 146             eval {
 147                 my $newfield=MARC::Field->new(
 148                             $field->tag(),
 149                             $field->indicator(1),
 150                             $field->indicator(2),
 151                             @subfields
 152                         );
 153                 $field->replace_with($newfield);
 154             };
 155             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 156         }
 157     }
 158 }
 159
 160 =head2 NormalizeString
 161
 162     my $normalized_string=NormalizeString($string,$nfd,$transform);
 163
 164 Given a string
 165 nfd : If you want to set NFD and not NFC
 166 transform : If you expect all the signs to be removed
 167
 168 Sets the PERL UTF8 Flag on your initial data if need be
 169 and applies cleaning if required
 170
 171 Returns a utf8 NFC normalized string
 172
 173 Sample code :
 174    my $string=NormalizeString ("l'ornithoptère");
 175    #results into ornithoptère in NFC form and sets UTF8 Flag
 176
 177 =cut
 178
 179
 180 sub NormalizeString{
 181         my ($string,$nfd,$transform)=@_;
 182     return $string unless defined($string); # force scalar context return.
 183     $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
 184         if ($nfd){
 185                 $string= NFD($string);
 186         }
 187         else {
 188                 $string=NFC($string);
 189         }
 190         if ($transform){
 191     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 192         #removing one letter words "d'" "l'"  was changed into "d " "l "
 193     $string=~s/\b\S\b//g;
 194     $string=~s/\s+$//g;
 195         }
 196     return $string;
 197 }
 198
 199 =head2 MarcToUTF8Record
 200
 201   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 202                                         $marc_flavour, [, $source_encoding]);
 203
 204 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 205 optional source encoding, return a C<MARC::Record> that is
 206 converted to UTF-8.
 207
 208 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 209 is not guaranteed to have been converted correctly.  Specifically,
 210 if C<$converted_from> is 'failed', the MARC record returned failed
 211 character conversion and had each of its non-ASCII octets changed
 212 to the Unicode replacement character.
 213
 214 If the source encoding was not specified, this routine will
 215 try to guess it; the character encoding used for a successful
 216 conversion is returned in C<$converted_from>.
 217
 218 =cut
 219
 220 sub MarcToUTF8Record {
 221     my $marc = shift;
 222     my $marc_flavour = shift;
 223     my $source_encoding = shift;
 224     my $marc_record;
 225     my $marc_blob_is_utf8 = 0;
 226     if (ref($marc) eq 'MARC::Record') {
 227         my $marc_blob = $marc->as_usmarc();
 228         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 229         $marc_record = $marc;
 230     } else {
 231         # dealing with a MARC blob
 232
 233         # remove any ersatz whitespace from the beginning and
 234         # end of the MARC blob -- these can creep into MARC
 235         # files produced by several sources -- caller really
 236         # should be doing this, however
 237         $marc =~ s/^\s+//;
 238         $marc =~ s/\s+$//;
 239         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 240         eval {
 241             $marc_record = MARC::Record->new_from_usmarc($marc);
 242         };
 243         if ($@) {
 244             # if we fail the first time, one likely problem
 245             # is that we have a MARC21 record that says that it's
 246             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 247             # We'll try parsing it again.
 248             substr($marc, 9, 1) = ' ';
 249             eval {
 250                 $marc_record = MARC::Record->new_from_usmarc($marc);
 251             };
 252             if ($@) {
 253                 # it's hopeless; return an empty MARC::Record
 254                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 255             }
 256         }
 257     }
 258
 259     # If we do not know the source encoding, try some guesses
 260     # as follows:
 261     #   1. Record is UTF-8 already.
 262     #   2. If MARC flavor is MARC21 or NORMARC, then
 263     #      a. record is MARC-8
 264     #      b. record is ISO-8859-1
 265     #   3. If MARC flavor is UNIMARC, then
 266     if (not defined $source_encoding) {
 267         if ($marc_blob_is_utf8) {
 268             # note that for MARC21/NORMARC we are not bothering to check
 269             # if the Leader/09 is set to 'a' or not -- because
 270             # of problems with various ILSs (including Koha in the
 271             # past, alas), this just is not trustworthy.
 272             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 273             return $marc_record, 'UTF-8', [];
 274         } else {
 275             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 276                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 277             } elsif ($marc_flavour =~/UNIMARC/) {
 278                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 279             } else {
 280                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 281             }
 282         }
 283     } else {
 284         # caller knows the character encoding
 285         my $original_marc_record = $marc_record->clone();
 286         my @errors;
 287         if ($source_encoding =~ /utf-?8/i) {
 288             if ($marc_blob_is_utf8) {
 289                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 290                 return $marc_record, 'UTF-8', [];
 291             } else {
 292                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 293             }
 294         } elsif ($source_encoding =~ /marc-?8/i) {
 295             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 296         } elsif ($source_encoding =~ /5426/) {
 297             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 298         } else {
 299             # assume any other character encoding is for Text::Iconv
 300             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 301         }
 302
 303         if (@errors) {
 304             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 305             return $original_marc_record, 'failed', \@errors;
 306         } else {
 307             return $marc_record, $source_encoding, [];
 308         }
 309     }
 310
 311 }
 312
 313 =head2 SetMarcUnicodeFlag
 314
 315   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 316
 317 Set both the internal MARC::Record encoding flag
 318 and the appropriate Leader/09 (MARC21) or
 319 100/26-29 (UNIMARC) to indicate that the record
 320 is in UTF-8.  Note that this does B<not> do
 321 any actual character conversion.
 322
 323 =cut
 324
 325 sub SetMarcUnicodeFlag {
 326     my $marc_record = shift;
 327     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 328
 329     $marc_record->encoding('UTF-8');
 330     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 331         my $leader = $marc_record->leader();
 332         substr($leader, 9, 1) = 'a';
 333         $marc_record->leader($leader);
 334     } elsif ($marc_flavour =~/UNIMARC/) {
 335         require C4::Context;
 336         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 337         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 338         my $string;
 339                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 340                 $string=$marc_record->subfield( 100, "a" );
 341         if (defined $string && length($string)==$subflength) {
 342                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 343         }
 344         else {
 345             $string = POSIX::strftime( "%Y%m%d", localtime );
 346             $string =~ s/\-//g;
 347             $string = sprintf( "%-*s", $subflength, $string );
 348             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 349         }
 350         substr( $string, $encodingposition, 3, "y50" );
 351         if ( $marc_record->subfield( 100, "a" ) ) {
 352                         $marc_record->field('100')->update(a=>$string);
 353                 }
 354                 else {
 355             $marc_record->insert_grouped_field(
 356                 MARC::Field->new( 100, '', '', "a" => $string ) );
 357         }
 358                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 359     } else {
 360         warn "Unrecognized marcflavour: $marc_flavour";
 361     }
 362 }
 363
 364 =head2 StripNonXmlChars
 365
 366   my $new_str = StripNonXmlChars($old_str);
 367
 368 Given a string, return a copy with the
 369 characters that are illegal in XML
 370 removed.
 371
 372 This function exists to work around a problem
 373 that can occur with badly-encoded MARC records.
 374 Specifically, if a UTF-8 MARC record also
 375 has excape (\x1b) characters, MARC::File::XML
 376 will let the escape characters pass through
 377 when as_xml() or as_xml_record() is called.  The
 378 problem is that the escape character is not
 379 legal in well-formed XML documents, so when
 380 MARC::File::XML attempts to parse such a record,
 381 the XML parser will fail.
 382
 383 Stripping such characters will allow a
 384 MARC::Record->new_from_xml()
 385 to work, at the possible risk of some data loss.
 386
 387 =cut
 388
 389 sub StripNonXmlChars {
 390     my $str = shift;
 391     if (!defined($str) || $str eq ""){
 392         return "";
 393     }
 394     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 395     return $str;
 396 }
 397
 398
 399
 400 =head2 nsb_clean
 401
 402 =over 4
 403
 404 nsb_clean($string);
 405
 406 =back
 407
 408 Removes Non Sorting Block characters
 409
 410 =cut
 411 sub nsb_clean {
 412     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 413     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 414     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 415     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 416     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 417
 418     # handles non sorting blocks
 419     my ($string) = @_ ;
 420     $_ = $string ;
 421     s/($C2){0,1}($NSB|$NSB2)//g ;
 422     s/($C2){0,1}($NSE|$NSE2)//g ;
 423     $string = $_ ;
 424
 425     return($string) ;
 426 }
 427
 428
 429 =head2 SanitizeRecord
 430
 431 SanitizeRecord($marcrecord);
 432
 433 Sanitize a record
 434 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
 435 It cleans any string with '&amp;amp;...', replacing it by '&'
 436
 437 =cut
 438
 439 sub SanitizeRecord {
 440     my ( $record, $biblionumber ) = @_;
 441     my $string;
 442     my $record_modified = 0;
 443     my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
 444     my ( $url_field, $url_subfield ) =
 445       C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
 446     foreach my $field ( $record->fields() ) {
 447         if ( $field->is_control_field() ) {
 448             my $value           = $field->data();
 449             my $sanitized_value = _clean_ampersand($value);
 450             $record_modified = 1 if $sanitized_value ne $value;
 451             $field->update($sanitized_value);
 452         }
 453         else {
 454             my @subfields = $field->subfields();
 455             my @new_subfields;
 456             foreach my $subfield (@subfields) {
 457                 next
 458                   if $url_field eq $field->tag()
 459                       and $url_subfield eq $subfield->[0];
 460                 my $value           = $subfield->[1];
 461                 my $sanitized_value = _clean_ampersand($value);
 462                 push @new_subfields, $subfield->[0] => $sanitized_value;
 463                 $record_modified = 1 if $sanitized_value ne $value;
 464             }
 465             if ( scalar(@new_subfields) > 0 ) {
 466                 my $new_field = eval {
 467                     MARC::Field->new(
 468                         $field->tag(),        $field->indicator(1),
 469                         $field->indicator(2), @new_subfields
 470                     );
 471                 };
 472                 if ($@) {
 473                     warn "error : $@";
 474                 }
 475                 else {
 476                     $field->replace_with($new_field);
 477                 }
 478
 479             }
 480         }
 481     }
 482
 483     return $record, $record_modified;
 484 }
 485
 486 sub _clean_ampersand {
 487     my ($string) = @_;
 488     $string =~ s/(&)(amp;)+/$1/g;
 489     return $string;
 490 }
 491
 492 =head1 INTERNAL FUNCTIONS
 493
 494 =head2 _default_marc21_charconv_to_utf8
 495
 496   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 497
 498 Converts a C<MARC::Record> of unknown character set to UTF-8,
 499 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 500 to UTF-8, then a default conversion that replaces each non-ASCII
 501 character with the replacement character.
 502
 503 The C<$guessed_charset> return value contains the character set
 504 that resulted in a conversion to valid UTF-8; note that
 505 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 506 this is 'failed'.
 507
 508 =cut
 509
 510 sub _default_marc21_charconv_to_utf8 {
 511     my $marc_record = shift;
 512     my $marc_flavour = shift;
 513
 514     my $trial_marc8 = $marc_record->clone();
 515     my @all_errors = ();
 516     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 517     unless (@errors) {
 518         return $trial_marc8, 'MARC-8', [];
 519     }
 520     push @all_errors, @errors;
 521
 522     my $trial_8859_1 = $marc_record->clone();
 523     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 524     unless (@errors) {
 525         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 526                                                 # instead if we wanted to report details
 527                                                 # of the failed attempt at MARC-8 => UTF-8
 528     }
 529     push @all_errors, @errors;
 530
 531     my $default_converted = $marc_record->clone();
 532     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 533     return $default_converted, 'failed', \@all_errors;
 534 }
 535
 536 =head2 _default_unimarc_charconv_to_utf8
 537
 538   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 539
 540 Converts a C<MARC::Record> of unknown character set to UTF-8,
 541 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 542 to UTF-8, then a default conversion that replaces each non-ASCII
 543 character with the replacement character.
 544
 545 The C<$guessed_charset> return value contains the character set
 546 that resulted in a conversion to valid UTF-8; note that
 547 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 548 this is 'failed'.
 549
 550 =cut
 551
 552 sub _default_unimarc_charconv_to_utf8 {
 553     my $marc_record = shift;
 554     my $marc_flavour = shift;
 555
 556     my $trial_marc8 = $marc_record->clone();
 557     my @all_errors = ();
 558     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 559     unless (@errors) {
 560         return $trial_marc8, 'iso-5426';
 561     }
 562     push @all_errors, @errors;
 563
 564     my $trial_8859_1 = $marc_record->clone();
 565     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 566     unless (@errors) {
 567         return $trial_8859_1, 'iso-8859-1';
 568     }
 569     push @all_errors, @errors;
 570
 571     my $default_converted = $marc_record->clone();
 572     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 573     return $default_converted, 'failed', \@all_errors;
 574 }
 575
 576 =head2 _marc_marc8_to_utf8
 577
 578   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 579
 580 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 581 If the conversion fails for some reason, an
 582 appropriate messages will be placed in the returned
 583 C<@errors> array.
 584
 585 =cut
 586
 587 sub _marc_marc8_to_utf8 {
 588     my $marc_record = shift;
 589     my $marc_flavour = shift;
 590
 591     my $prev_ignore = MARC::Charset->ignore_errors();
 592     MARC::Charset->ignore_errors(1);
 593
 594     # trap warnings raised by MARC::Charset
 595     my @errors = ();
 596     local $SIG{__WARN__} = sub {
 597         my $msg = $_[0];
 598         if ($msg =~ /MARC.Charset/) {
 599             # FIXME - purpose of this regexp is to strip out the
 600             # line reference to MARC/Charset.pm, but as it
 601             # exists probably won't work quite on Windows --
 602             # some sort of minimal-bunch back-tracking RE
 603             # would be helpful here
 604             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 605             push @errors, $msg;
 606         } else {
 607             # if warning doesn't come from MARC::Charset, just
 608             # pass it on
 609             warn $msg;
 610         }
 611     };
 612
 613     foreach my $field ($marc_record->fields()) {
 614         if ($field->is_control_field()) {
 615             ; # do nothing -- control fields should not contain non-ASCII characters
 616         } else {
 617             my @converted_subfields;
 618             foreach my $subfield ($field->subfields()) {
 619                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 620                 unless (IsStringUTF8ish($utf8sf)) {
 621                     # Because of a bug in MARC::Charset 0.98, if the string
 622                     # has (a) one or more diacritics that (b) are only in character positions
 623                     # 128 to 255 inclusive, the resulting converted string is not in
 624                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 625                     # occurs, upgrade the string in place.  Moral of the story seems to be
 626                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 627                     # that the resulting string is UTF-8.
 628                     $utf8sf = Encode::encode('UTF-8', $utf8sf);
 629                 }
 630                 push @converted_subfields, $subfield->[0], $utf8sf;
 631             }
 632
 633             $field->replace_with(MARC::Field->new(
 634                 $field->tag(), $field->indicator(1), $field->indicator(2),
 635                 @converted_subfields)
 636             );
 637         }
 638     }
 639
 640     MARC::Charset->ignore_errors($prev_ignore);
 641
 642     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 643
 644     return @errors;
 645 }
 646
 647 =head2 _marc_iso5426_to_utf8
 648
 649   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 650
 651 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 652 If the conversion fails for some reason, an
 653 appropriate messages will be placed in the returned
 654 C<@errors> array.
 655
 656 FIXME - is ISO-5426 equivalent enough to MARC-8
 657 that C<MARC::Charset> can be used instead?
 658
 659 =cut
 660
 661 sub _marc_iso5426_to_utf8 {
 662     my $marc_record = shift;
 663     my $marc_flavour = shift;
 664
 665     my @errors = ();
 666
 667     foreach my $field ($marc_record->fields()) {
 668         if ($field->is_control_field()) {
 669             ; # do nothing -- control fields should not contain non-ASCII characters
 670         } else {
 671             my @converted_subfields;
 672             foreach my $subfield ($field->subfields()) {
 673                 my $utf8sf = char_decode5426($subfield->[1]);
 674                 push @converted_subfields, $subfield->[0], $utf8sf;
 675             }
 676
 677             $field->replace_with(MARC::Field->new(
 678                 $field->tag(), $field->indicator(1), $field->indicator(2),
 679                 @converted_subfields)
 680             );
 681         }
 682     }
 683
 684     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 685
 686     return @errors;
 687 }
 688
 689 =head2 _marc_to_utf8_via_text_iconv
 690
 691   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 692
 693 Convert a C<MARC::Record> to UTF-8 in-place using the
 694 C<Text::Iconv> CPAN module.  Any source encoding accepted
 695 by the user's iconv installation should work.  If
 696 the source encoding is not recognized on the user's
 697 server or the conversion fails for some reason,
 698 appropriate messages will be placed in the returned
 699 C<@errors> array.
 700
 701 =cut
 702
 703 sub _marc_to_utf8_via_text_iconv {
 704     my $marc_record = shift;
 705     my $marc_flavour = shift;
 706     my $source_encoding = shift;
 707
 708     my @errors = ();
 709     my $decoder;
 710     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 711     if ($@) {
 712         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 713         return @errors;
 714     }
 715
 716     my $prev_raise_error = Text::Iconv->raise_error();
 717     Text::Iconv->raise_error(1);
 718
 719     foreach my $field ($marc_record->fields()) {
 720         if ($field->is_control_field()) {
 721             ; # do nothing -- control fields should not contain non-ASCII characters
 722         } else {
 723             my @converted_subfields;
 724             foreach my $subfield ($field->subfields()) {
 725                 my $converted_value;
 726                 my $conversion_ok = 1;
 727                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 728                 if ($@) {
 729                     $conversion_ok = 0;
 730                     push @errors, $@;
 731                 } elsif (not defined $converted_value) {
 732                     $conversion_ok = 0;
 733                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 734                 }
 735
 736                 if ($conversion_ok) {
 737                     push @converted_subfields, $subfield->[0], $converted_value;
 738                 } else {
 739                     $converted_value = $subfield->[1];
 740                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 741                     push @converted_subfields, $subfield->[0], $converted_value;
 742                 }
 743             }
 744
 745             $field->replace_with(MARC::Field->new(
 746                 $field->tag(), $field->indicator(1), $field->indicator(2),
 747                 @converted_subfields)
 748             );
 749         }
 750     }
 751
 752     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 753     Text::Iconv->raise_error($prev_raise_error);
 754
 755     return @errors;
 756 }
 757
 758 =head2 _marc_to_utf8_replacement_char
 759
 760   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 761
 762 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 763 unsatisfactory method of replacing all non-ASCII (e.g.,
 764 where the eight bit is set) octet with the Unicode
 765 replacement character.  This is meant as a last-ditch
 766 method, and would be best used as part of a UI that
 767 lets a cataloguer pick various character conversions
 768 until he or she finds the right one.
 769
 770 =cut
 771
 772 sub _marc_to_utf8_replacement_char {
 773     my $marc_record = shift;
 774     my $marc_flavour = shift;
 775
 776     foreach my $field ($marc_record->fields()) {
 777         if ($field->is_control_field()) {
 778             ; # do nothing -- control fields should not contain non-ASCII characters
 779         } else {
 780             my @converted_subfields;
 781             foreach my $subfield ($field->subfields()) {
 782                 my $value = $subfield->[1];
 783                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 784                 push @converted_subfields, $subfield->[0], $value;
 785             }
 786
 787             $field->replace_with(MARC::Field->new(
 788                 $field->tag(), $field->indicator(1), $field->indicator(2),
 789                 @converted_subfields)
 790             );
 791         }
 792     }
 793
 794     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 795 }
 796
 797 =head2 char_decode5426
 798
 799   my $utf8string = char_decode5426($iso_5426_string);
 800
 801 Converts a string from ISO-5426 to UTF-8.
 802
 803 =cut
 804
 805
 806 my %chars;
 807
 808 ####
 809 ## 0xb
 810 $chars{0xb0}=0x0101;#3/0ayn[ain]
 811 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 812 #$chars{0xb2}=0x00e0;#'à';
 813 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 814 #$chars{0xb3}=0x00e7;#'ç';
 815 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 816 # $chars{0xb4}='è';
 817 $chars{0xb4}=0x00e8;
 818 # $chars{0xb5}='é';
 819 $chars{0xb5}=0x00e9;
 820 $chars{0xb6}=0x2021; # double dagger
 821 $chars{0xb7}=0x00b7; # middle dot
 822 $chars{0xb8}=0x2033; # double prime
 823 $chars{0xb9}=0x2019; # right single quotation mark
 824 $chars{0xba}=0x201d; # right double quotation mark
 825 $chars{0xbb}=0x00bb; # right-pointing double angle quotation mark
 826 $chars{0xbc}=0x266f; # music sharp sign
 827 $chars{0xbd}=0x02b9; # modifier letter prime
 828 $chars{0xbe}=0x02ba; # modifier letter double prime
 829 $chars{0xbf}=0x00bf; # inverted question mark
 830
 831 ####
 832 ## 0xe
 833 $chars{0xe1}=0x00c6; # latin capital letter ae
 834 $chars{0xe2}=0x0110; # latin capital letter d with stroke
 835 $chars{0xe6}=0x0132; # latin capital ligature ij
 836 $chars{0xe8}=0x0141; # latin capital letter l with stroke
 837 $chars{0xe9}=0x00d8; # latin capital letter o with stroke
 838 $chars{0xea}=0x0152; # latin capital ligature oe
 839 $chars{0xec}=0x00de; # latin capital letter thorn
 840
 841 ####
 842 ## 0xf
 843 $chars{0xf1}=0x00e6; # latin small letter ae
 844 $chars{0xf2}=0x0111; # latin small letter d with stroke
 845 $chars{0xf3}=0x00f0; # latin small letter eth
 846 $chars{0xf5}=0x0131; # latin small letter dotless i
 847 $chars{0xf6}=0x0133; # latin small ligature ij
 848 $chars{0xf8}=0x0142; # latin small letter l with stroke
 849 $chars{0xf9}=0x00f8; # latin small letter o with stroke
 850 $chars{0xfa}=0x0153; # latin small ligature oe
 851 $chars{0xfb}=0x00df; # latin small letter sharp s
 852 $chars{0xfc}=0x00fe; # latin small letter thorn
 853
 854 ####
 855 ## Others
 856 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 857 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 858 #$chars{0x81d1}=0x00b0; # FIXME useless
 859
 860 ####
 861 ## combined characters iso5426
 862
 863 $chars{0xc041}=0x1ea2; # capital a with hook above
 864 $chars{0xc045}=0x1eba; # capital e with hook above
 865 $chars{0xc049}=0x1ec8; # capital i with hook above
 866 $chars{0xc04f}=0x1ece; # capital o with hook above
 867 $chars{0xc055}=0x1ee6; # capital u with hook above
 868 $chars{0xc059}=0x1ef6; # capital y with hook above
 869 $chars{0xc061}=0x1ea3; # small a with hook above
 870 $chars{0xc065}=0x1ebb; # small e with hook above
 871 $chars{0xc069}=0x1ec9; # small i with hook above
 872 $chars{0xc06f}=0x1ecf; # small o with hook above
 873 $chars{0xc075}=0x1ee7; # small u with hook above
 874 $chars{0xc079}=0x1ef7; # small y with hook above
 875
 876         # 4/1 grave accent
 877 $chars{0xc141}=0x00c0; # capital a with grave accent
 878 $chars{0xc145}=0x00c8; # capital e with grave accent
 879 $chars{0xc149}=0x00cc; # capital i with grave accent
 880 $chars{0xc14f}=0x00d2; # capital o with grave accent
 881 $chars{0xc155}=0x00d9; # capital u with grave accent
 882 $chars{0xc157}=0x1e80; # capital w with grave
 883 $chars{0xc159}=0x1ef2; # capital y with grave
 884 $chars{0xc161}=0x00e0; # small a with grave accent
 885 $chars{0xc165}=0x00e8; # small e with grave accent
 886 $chars{0xc169}=0x00ec; # small i with grave accent
 887 $chars{0xc16f}=0x00f2; # small o with grave accent
 888 $chars{0xc175}=0x00f9; # small u with grave accent
 889 $chars{0xc177}=0x1e81; # small w with grave
 890 $chars{0xc179}=0x1ef3; # small y with grave
 891         # 4/2 acute accent
 892 $chars{0xc241}=0x00c1; # capital a with acute accent
 893 $chars{0xc243}=0x0106; # capital c with acute accent
 894 $chars{0xc245}=0x00c9; # capital e with acute accent
 895 $chars{0xc247}=0x01f4; # capital g with acute
 896 $chars{0xc249}=0x00cd; # capital i with acute accent
 897 $chars{0xc24b}=0x1e30; # capital k with acute
 898 $chars{0xc24c}=0x0139; # capital l with acute accent
 899 $chars{0xc24d}=0x1e3e; # capital m with acute
 900 $chars{0xc24e}=0x0143; # capital n with acute accent
 901 $chars{0xc24f}=0x00d3; # capital o with acute accent
 902 $chars{0xc250}=0x1e54; # capital p with acute
 903 $chars{0xc252}=0x0154; # capital r with acute accent
 904 $chars{0xc253}=0x015a; # capital s with acute accent
 905 $chars{0xc255}=0x00da; # capital u with acute accent
 906 $chars{0xc257}=0x1e82; # capital w with acute
 907 $chars{0xc259}=0x00dd; # capital y with acute accent
 908 $chars{0xc25a}=0x0179; # capital z with acute accent
 909 $chars{0xc261}=0x00e1; # small a with acute accent
 910 $chars{0xc263}=0x0107; # small c with acute accent
 911 $chars{0xc265}=0x00e9; # small e with acute accent
 912 $chars{0xc267}=0x01f5; # small g with acute
 913 $chars{0xc269}=0x00ed; # small i with acute accent
 914 $chars{0xc26b}=0x1e31; # small k with acute
 915 $chars{0xc26c}=0x013a; # small l with acute accent
 916 $chars{0xc26d}=0x1e3f; # small m with acute
 917 $chars{0xc26e}=0x0144; # small n with acute accent
 918 $chars{0xc26f}=0x00f3; # small o with acute accent
 919 $chars{0xc270}=0x1e55; # small p with acute
 920 $chars{0xc272}=0x0155; # small r with acute accent
 921 $chars{0xc273}=0x015b; # small s with acute accent
 922 $chars{0xc275}=0x00fa; # small u with acute accent
 923 $chars{0xc277}=0x1e83; # small w with acute
 924 $chars{0xc279}=0x00fd; # small y with acute accent
 925 $chars{0xc27a}=0x017a; # small z with acute accent
 926 $chars{0xc2e1}=0x01fc; # capital ae with acute
 927 $chars{0xc2f1}=0x01fd; # small ae with acute
 928        # 4/3 circumflex accent
 929 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 930 $chars{0xc343}=0x0108; # capital c with circumflex
 931 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 932 $chars{0xc347}=0x011c; # capital g with circumflex
 933 $chars{0xc348}=0x0124; # capital h with circumflex
 934 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 935 $chars{0xc34a}=0x0134; # capital j with circumflex
 936 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 937 $chars{0xc353}=0x015c; # capital s with circumflex
 938 $chars{0xc355}=0x00db; # capital u with circumflex
 939 $chars{0xc357}=0x0174; # capital w with circumflex
 940 $chars{0xc359}=0x0176; # capital y with circumflex
 941 $chars{0xc35a}=0x1e90; # capital z with circumflex
 942 $chars{0xc361}=0x00e2; # small a with circumflex accent
 943 $chars{0xc363}=0x0109; # small c with circumflex
 944 $chars{0xc365}=0x00ea; # small e with circumflex accent
 945 $chars{0xc367}=0x011d; # small g with circumflex
 946 $chars{0xc368}=0x0125; # small h with circumflex
 947 $chars{0xc369}=0x00ee; # small i with circumflex accent
 948 $chars{0xc36a}=0x0135; # small j with circumflex
 949 $chars{0xc36e}=0x00f1; # small n with tilde
 950 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 951 $chars{0xc373}=0x015d; # small s with circumflex
 952 $chars{0xc375}=0x00fb; # small u with circumflex
 953 $chars{0xc377}=0x0175; # small w with circumflex
 954 $chars{0xc379}=0x0177; # small y with circumflex
 955 $chars{0xc37a}=0x1e91; # small z with circumflex
 956         # 4/4 tilde
 957 $chars{0xc441}=0x00c3; # capital a with tilde
 958 $chars{0xc445}=0x1ebc; # capital e with tilde
 959 $chars{0xc449}=0x0128; # capital i with tilde
 960 $chars{0xc44e}=0x00d1; # capital n with tilde
 961 $chars{0xc44f}=0x00d5; # capital o with tilde
 962 $chars{0xc455}=0x0168; # capital u with tilde
 963 $chars{0xc456}=0x1e7c; # capital v with tilde
 964 $chars{0xc459}=0x1ef8; # capital y with tilde
 965 $chars{0xc461}=0x00e3; # small a with tilde
 966 $chars{0xc465}=0x1ebd; # small e with tilde
 967 $chars{0xc469}=0x0129; # small i with tilde
 968 $chars{0xc46e}=0x00f1; # small n with tilde
 969 $chars{0xc46f}=0x00f5; # small o with tilde
 970 $chars{0xc475}=0x0169; # small u with tilde
 971 $chars{0xc476}=0x1e7d; # small v with tilde
 972 $chars{0xc479}=0x1ef9; # small y with tilde
 973     # 4/5 macron
 974 $chars{0xc541}=0x0100; # capital a with macron
 975 $chars{0xc545}=0x0112; # capital e with macron
 976 $chars{0xc547}=0x1e20; # capital g with macron
 977 $chars{0xc549}=0x012a; # capital i with macron
 978 $chars{0xc54f}=0x014c; # capital o with macron
 979 $chars{0xc555}=0x016a; # capital u with macron
 980 $chars{0xc561}=0x0101; # small a with macron
 981 $chars{0xc565}=0x0113; # small e with macron
 982 $chars{0xc567}=0x1e21; # small g with macron
 983 $chars{0xc569}=0x012b; # small i with macron
 984 $chars{0xc56f}=0x014d; # small o with macron
 985 $chars{0xc575}=0x016b; # small u with macron
 986 $chars{0xc572}=0x0159; # small r with macron
 987 $chars{0xc5e1}=0x01e2; # capital ae with macron
 988 $chars{0xc5f1}=0x01e3; # small ae with macron
 989         # 4/6 breve
 990 $chars{0xc641}=0x0102; # capital a with breve
 991 $chars{0xc645}=0x0114; # capital e with breve
 992 $chars{0xc647}=0x011e; # capital g with breve
 993 $chars{0xc649}=0x012c; # capital i with breve
 994 $chars{0xc64f}=0x014e; # capital o with breve
 995 $chars{0xc655}=0x016c; # capital u with breve
 996 $chars{0xc661}=0x0103; # small a with breve
 997 $chars{0xc665}=0x0115; # small e with breve
 998 $chars{0xc667}=0x011f; # small g with breve
 999 $chars{0xc669}=0x012d; # small i with breve
1000 $chars{0xc66f}=0x014f; # small o with breve
1001 $chars{0xc675}=0x016d; # small u with breve
1002         # 4/7 dot above
1003 $chars{0xc7b0}=0x01e1; # Ain with dot above
1004 $chars{0xc742}=0x1e02; # capital b with dot above
1005 $chars{0xc743}=0x010a; # capital c with dot above
1006 $chars{0xc744}=0x1e0a; # capital d with dot above
1007 $chars{0xc745}=0x0116; # capital e with dot above
1008 $chars{0xc746}=0x1e1e; # capital f with dot above
1009 $chars{0xc747}=0x0120; # capital g with dot above
1010 $chars{0xc748}=0x1e22; # capital h with dot above
1011 $chars{0xc749}=0x0130; # capital i with dot above
1012 $chars{0xc74d}=0x1e40; # capital m with dot above
1013 $chars{0xc74e}=0x1e44; # capital n with dot above
1014 $chars{0xc750}=0x1e56; # capital p with dot above
1015 $chars{0xc752}=0x1e58; # capital r with dot above
1016 $chars{0xc753}=0x1e60; # capital s with dot above
1017 $chars{0xc754}=0x1e6a; # capital t with dot above
1018 $chars{0xc757}=0x1e86; # capital w with dot above
1019 $chars{0xc758}=0x1e8a; # capital x with dot above
1020 $chars{0xc759}=0x1e8e; # capital y with dot above
1021 $chars{0xc75a}=0x017b; # capital z with dot above
1022 $chars{0xc761}=0x0227; # small b with dot above
1023 $chars{0xc762}=0x1e03; # small b with dot above
1024 $chars{0xc763}=0x010b; # small c with dot above
1025 $chars{0xc764}=0x1e0b; # small d with dot above
1026 $chars{0xc765}=0x0117; # small e with dot above
1027 $chars{0xc766}=0x1e1f; # small f with dot above
1028 $chars{0xc767}=0x0121; # small g with dot above
1029 $chars{0xc768}=0x1e23; # small h with dot above
1030 $chars{0xc76d}=0x1e41; # small m with dot above
1031 $chars{0xc76e}=0x1e45; # small n with dot above
1032 $chars{0xc770}=0x1e57; # small p with dot above
1033 $chars{0xc772}=0x1e59; # small r with dot above
1034 $chars{0xc773}=0x1e61; # small s with dot above
1035 $chars{0xc774}=0x1e6b; # small t with dot above
1036 $chars{0xc777}=0x1e87; # small w with dot above
1037 $chars{0xc778}=0x1e8b; # small x with dot above
1038 $chars{0xc779}=0x1e8f; # small y with dot above
1039 $chars{0xc77a}=0x017c; # small z with dot above
1040         # 4/8 trema, diaresis
1041 $chars{0xc820}=0x00a8; # diaeresis
1042 $chars{0xc841}=0x00c4; # capital a with diaeresis
1043 $chars{0xc845}=0x00cb; # capital e with diaeresis
1044 $chars{0xc848}=0x1e26; # capital h with diaeresis
1045 $chars{0xc849}=0x00cf; # capital i with diaeresis
1046 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1047 $chars{0xc855}=0x00dc; # capital u with diaeresis
1048 $chars{0xc857}=0x1e84; # capital w with diaeresis
1049 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1050 $chars{0xc859}=0x0178; # capital y with diaeresis
1051 $chars{0xc861}=0x00e4; # small a with diaeresis
1052 $chars{0xc865}=0x00eb; # small e with diaeresis
1053 $chars{0xc868}=0x1e27; # small h with diaeresis
1054 $chars{0xc869}=0x00ef; # small i with diaeresis
1055 $chars{0xc86f}=0x00f6; # small o with diaeresis
1056 $chars{0xc874}=0x1e97; # small t with diaeresis
1057 $chars{0xc875}=0x00fc; # small u with diaeresis
1058 $chars{0xc877}=0x1e85; # small w with diaeresis
1059 $chars{0xc878}=0x1e8d; # small x with diaeresis
1060 $chars{0xc879}=0x00ff; # small y with diaeresis
1061         # 4/9 umlaut
1062 $chars{0xc920}=0x00a8; # [diaeresis]
1063 $chars{0xc961}=0x00e4; # a with umlaut
1064 $chars{0xc965}=0x00eb; # e with umlaut
1065 $chars{0xc969}=0x00ef; # i with umlaut
1066 $chars{0xc96f}=0x00f6; # o with umlaut
1067 $chars{0xc975}=0x00fc; # u with umlaut
1068         # 4/10 circle above
1069 $chars{0xca41}=0x00c5; # capital a with ring above
1070 $chars{0xcaad}=0x016e; # capital u with ring above
1071 $chars{0xca61}=0x00e5; # small a with ring above
1072 $chars{0xca75}=0x016f; # small u with ring above
1073 $chars{0xca77}=0x1e98; # small w with ring above
1074 $chars{0xca79}=0x1e99; # small y with ring above
1075         # 4/11 high comma off centre
1076         # 4/12 inverted high comma centred
1077         # 4/13 double acute accent
1078 $chars{0xcd4f}=0x0150; # capital o with double acute
1079 $chars{0xcd55}=0x0170; # capital u with double acute
1080 $chars{0xcd6f}=0x0151; # small o with double acute
1081 $chars{0xcd75}=0x0171; # small u with double acute
1082         # 4/14 horn
1083 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1084 $chars{0xce55}=0x01af; # latin capital letter u with horn
1085 $chars{0xce74}=0x01a1; # latin small letter o with horn
1086 $chars{0xce75}=0x01b0; # latin small letter u with horn
1087         # 4/15 caron (hacek
1088 $chars{0xcf41}=0x01cd; # capital a with caron
1089 $chars{0xcf43}=0x010c; # capital c with caron
1090 $chars{0xcf44}=0x010e; # capital d with caron
1091 $chars{0xcf45}=0x011a; # capital e with caron
1092 $chars{0xcf47}=0x01e6; # capital g with caron
1093 $chars{0xcf49}=0x01cf; # capital i with caron
1094 $chars{0xcf4b}=0x01e8; # capital k with caron
1095 $chars{0xcf4c}=0x013d; # capital l with caron
1096 $chars{0xcf4e}=0x0147; # capital n with caron
1097 $chars{0xcf4f}=0x01d1; # capital o with caron
1098 $chars{0xcf52}=0x0158; # capital r with caron
1099 $chars{0xcf53}=0x0160; # capital s with caron
1100 $chars{0xcf54}=0x0164; # capital t with caron
1101 $chars{0xcf55}=0x01d3; # capital u with caron
1102 $chars{0xcf5a}=0x017d; # capital z with caron
1103 $chars{0xcf61}=0x01ce; # small a with caron
1104 $chars{0xcf63}=0x010d; # small c with caron
1105 $chars{0xcf64}=0x010f; # small d with caron
1106 $chars{0xcf65}=0x011b; # small e with caron
1107 $chars{0xcf67}=0x01e7; # small g with caron
1108 $chars{0xcf69}=0x01d0; # small i with caron
1109 $chars{0xcf6a}=0x01f0; # small j with caron
1110 $chars{0xcf6b}=0x01e9; # small k with caron
1111 $chars{0xcf6c}=0x013e; # small l with caron
1112 $chars{0xcf6e}=0x0148; # small n with caron
1113 $chars{0xcf6f}=0x01d2; # small o with caron
1114 $chars{0xcf72}=0x0159; # small r with caron
1115 $chars{0xcf73}=0x0161; # small s with caron
1116 $chars{0xcf74}=0x0165; # small t with caron
1117 $chars{0xcf75}=0x01d4; # small u with caron
1118 $chars{0xcf7a}=0x017e; # small z with caron
1119         # 5/0 cedilla
1120 $chars{0xd020}=0x00b8; # cedilla
1121 $chars{0xd043}=0x00c7; # capital c with cedilla
1122 $chars{0xd044}=0x1e10; # capital d with cedilla
1123 $chars{0xd047}=0x0122; # capital g with cedilla
1124 $chars{0xd048}=0x1e28; # capital h with cedilla
1125 $chars{0xd04b}=0x0136; # capital k with cedilla
1126 $chars{0xd04c}=0x013b; # capital l with cedilla
1127 $chars{0xd04e}=0x0145; # capital n with cedilla
1128 $chars{0xd052}=0x0156; # capital r with cedilla
1129 $chars{0xd053}=0x015e; # capital s with cedilla
1130 $chars{0xd054}=0x0162; # capital t with cedilla
1131 $chars{0xd063}=0x00e7; # small c with cedilla
1132 $chars{0xd064}=0x1e11; # small d with cedilla
1133 $chars{0xd065}=0x0119; # small e with cedilla
1134 $chars{0xd067}=0x0123; # small g with cedilla
1135 $chars{0xd068}=0x1e29; # small h with cedilla
1136 $chars{0xd06b}=0x0137; # small k with cedilla
1137 $chars{0xd06c}=0x013c; # small l with cedilla
1138 $chars{0xd06e}=0x0146; # small n with cedilla
1139 $chars{0xd072}=0x0157; # small r with cedilla
1140 $chars{0xd073}=0x015f; # small s with cedilla
1141 $chars{0xd074}=0x0163; # small t with cedilla
1142         # 5/1 rude
1143         # 5/2 hook to left
1144         # 5/3 ogonek (hook to right
1145 $chars{0xd320}=0x02db; # ogonek
1146 $chars{0xd341}=0x0104; # capital a with ogonek
1147 $chars{0xd345}=0x0118; # capital e with ogonek
1148 $chars{0xd349}=0x012e; # capital i with ogonek
1149 $chars{0xd34f}=0x01ea; # capital o with ogonek
1150 $chars{0xd355}=0x0172; # capital u with ogonek
1151 $chars{0xd361}=0x0105; # small a with ogonek
1152 $chars{0xd365}=0x0119; # small e with ogonek
1153 $chars{0xd369}=0x012f; # small i with ogonek
1154 $chars{0xd36f}=0x01eb; # small o with ogonek
1155 $chars{0xd375}=0x0173; # small u with ogonek
1156         # 5/4 circle below
1157 $chars{0xd441}=0x1e00; # capital a with ring below
1158 $chars{0xd461}=0x1e01; # small a with ring below
1159         # 5/5 half circle below
1160 $chars{0xd548}=0x1e2a; # capital h with breve below
1161 $chars{0xd568}=0x1e2b; # small h with breve below
1162         # 5/6 dot below
1163 $chars{0xd641}=0x1ea0; # capital a with dot below
1164 $chars{0xd642}=0x1e04; # capital b with dot below
1165 $chars{0xd644}=0x1e0c; # capital d with dot below
1166 $chars{0xd645}=0x1eb8; # capital e with dot below
1167 $chars{0xd648}=0x1e24; # capital h with dot below
1168 $chars{0xd649}=0x1eca; # capital i with dot below
1169 $chars{0xd64b}=0x1e32; # capital k with dot below
1170 $chars{0xd64c}=0x1e36; # capital l with dot below
1171 $chars{0xd64d}=0x1e42; # capital m with dot below
1172 $chars{0xd64e}=0x1e46; # capital n with dot below
1173 $chars{0xd64f}=0x1ecc; # capital o with dot below
1174 $chars{0xd652}=0x1e5a; # capital r with dot below
1175 $chars{0xd653}=0x1e62; # capital s with dot below
1176 $chars{0xd654}=0x1e6c; # capital t with dot below
1177 $chars{0xd655}=0x1ee4; # capital u with dot below
1178 $chars{0xd656}=0x1e7e; # capital v with dot below
1179 $chars{0xd657}=0x1e88; # capital w with dot below
1180 $chars{0xd659}=0x1ef4; # capital y with dot below
1181 $chars{0xd65a}=0x1e92; # capital z with dot below
1182 $chars{0xd661}=0x1ea1; # small a with dot below
1183 $chars{0xd662}=0x1e05; # small b with dot below
1184 $chars{0xd664}=0x1e0d; # small d with dot below
1185 $chars{0xd665}=0x1eb9; # small e with dot below
1186 $chars{0xd668}=0x1e25; # small h with dot below
1187 $chars{0xd669}=0x1ecb; # small i with dot below
1188 $chars{0xd66b}=0x1e33; # small k with dot below
1189 $chars{0xd66c}=0x1e37; # small l with dot below
1190 $chars{0xd66d}=0x1e43; # small m with dot below
1191 $chars{0xd66e}=0x1e47; # small n with dot below
1192 $chars{0xd66f}=0x1ecd; # small o with dot below
1193 $chars{0xd672}=0x1e5b; # small r with dot below
1194 $chars{0xd673}=0x1e63; # small s with dot below
1195 $chars{0xd674}=0x1e6d; # small t with dot below
1196 $chars{0xd675}=0x1ee5; # small u with dot below
1197 $chars{0xd676}=0x1e7f; # small v with dot below
1198 $chars{0xd677}=0x1e89; # small w with dot below
1199 $chars{0xd679}=0x1ef5; # small y with dot below
1200 $chars{0xd67a}=0x1e93; # small z with dot below
1201         # 5/7 double dot below
1202 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1203 $chars{0xd775}=0x1e73; # small u with diaeresis below
1204         # 5/8 underline
1205 $chars{0xd820}=0x005f; # underline
1206         # 5/9 double underline
1207 $chars{0xd920}=0x2017; # double underline
1208         # 5/10 small low vertical bar
1209 $chars{0xda20}=0x02cc; #
1210         # 5/11 circumflex below
1211         # 5/12 (this position shall not be used)
1212         # 5/13 left half of ligature sign and of double tilde
1213         # 5/14 right half of ligature sign
1214         # 5/15 right half of double tilde
1215 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1216
1217 sub char_decode5426 {
1218     my ( $string) = @_;
1219     my $result;
1220
1221     my @data = unpack("C*", $string);
1222     my @characters;
1223     my $length=scalar(@data);
1224     for (my $i = 0; $i < scalar(@data); $i++) {
1225       my $char= $data[$i];
1226       if ($char >= 0x00 && $char <= 0x7F){
1227         #IsAscii
1228
1229           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1230       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1231         #Combined Char
1232         my $convchar ;
1233         if ($chars{$char*256+$data[$i+1]}) {
1234           $convchar= $chars{$char * 256 + $data[$i+1]};
1235           $i++;
1236 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1237         } elsif ($chars{$char})  {
1238           $convchar= $chars{$char};
1239 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1240         }else {
1241           $convchar=$char;
1242         }
1243         push @characters,$convchar;
1244       } else {
1245         my $convchar;
1246         if ($chars{$char})  {
1247           $convchar= $chars{$char};
1248 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1249         }else {
1250 #            printf "char %x $char\n",$char;
1251           $convchar=$char;
1252         }
1253         push @characters,$convchar;
1254       }
1255     }
1256     $result=pack "U*",@characters;
1257 #     $result=~s/\x01//;
1258 #     $result=~s/\x00//;
1259      $result=~s/\x0f//;
1260      $result=~s/\x1b.//;
1261      $result=~s/\x0e//;
1262      $result=~s/\x1b\x5b//;
1263 #   map{printf "%x",$_} @characters;
1264 #   printf "\n";
1265   return $result;
1266 }
1267
1268 1;
1269
1270
1271 =head1 AUTHOR
1272
1273 Koha Development Team <http://koha-community.org/>
1274
1275 Galen Charlton <galen.charlton@liblime.com>
1276
1277 =cut