C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 use Modern::Perl;
  21
  22 use MARC::Charset;
  23 use Text::Iconv;
  24 use Unicode::Normalize qw( NFC NFD );
  25 use Encode;
  26
  27 use Koha::Logger;
  28
  29 our (@ISA, @EXPORT_OK);
  30 BEGIN {
  31     require Exporter;
  32     @ISA    = qw(Exporter);
  33     @EXPORT_OK = qw(
  34         NormalizeString
  35         IsStringUTF8ish
  36         MarcToUTF8Record
  37         SetUTF8Flag
  38         SetMarcUnicodeFlag
  39         StripNonXmlChars
  40         nsb_clean
  41         SanitizeRecord
  42     );
  43 }
  44
  45 =encoding UTF-8
  46
  47 =head1 NAME
  48
  49 C4::Charset - utilities for handling character set conversions.
  50
  51 =head1 SYNOPSIS
  52
  53   use C4::Charset;
  54
  55 =head1 DESCRIPTION
  56
  57 This module contains routines for dealing with character set
  58 conversions, particularly for MARC records.
  59
  60 A variety of character encodings are in use by various MARC
  61 standards, and even more character encodings are used by
  62 non-standard MARC records.  The various MARC formats generally
  63 do not do a good job of advertising a given record's character
  64 encoding, and even when a record does advertise its encoding,
  65 e.g., via the Leader/09, experience has shown that one cannot
  66 trust it.
  67
  68 Ultimately, all MARC records are stored in Koha in UTF-8 and
  69 must be converted from whatever the source character encoding is.
  70 The goal of this module is to ensure that these conversions
  71 take place accurately.  When a character conversion cannot take
  72 place, or at least not accurately, the module was provide
  73 enough information to allow user-facing code to inform the user
  74 on how to deal with the situation.
  75
  76 =cut
  77
  78 =head1 FUNCTIONS
  79
  80 =head2 IsStringUTF8ish
  81
  82   my $is_utf8 = IsStringUTF8ish($str);
  83
  84 Determines if C<$str> is valid UTF-8.  This can mean
  85 one of two things:
  86
  87 =over
  88
  89 =item *
  90
  91 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  92
  93 =item *
  94
  95 The Perl UTF-8 flag is B<not> set, but the octets contain
  96 valid UTF-8.
  97
  98 =back
  99
 100 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 101 because in one could be presented with a MARC blob that is
 102 not actually in UTF-8 but whose sequence of octets appears to be
 103 valid UTF-8.  The rest of the MARC character conversion functions
 104 will assume that this situation occur does not very often.
 105
 106 =cut
 107
 108 sub IsStringUTF8ish {
 109     my $str = shift;
 110
 111     return 1 if Encode::is_utf8($str);
 112     return utf8::decode( $str );
 113 }
 114
 115 =head2 SetUTF8Flag
 116
 117   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 118
 119 This function sets the PERL UTF8 flag for data.
 120 It is required when using new_from_usmarc
 121 since MARC::File::USMARC does not handle PERL UTF8 setting.
 122 When editing unicode marc records fields and subfields, you
 123 would end up in double encoding without using this function.
 124
 125 If $nfd is set, string normalization will use NFD instead of NFC
 126
 127 FIXME
 128 In my opinion, this function belongs to MARC::Record and not
 129 to this package.
 130 But since it handles charset, and MARC::Record, it finds its way in that package
 131
 132 =cut
 133
 134 sub SetUTF8Flag{
 135     my ($record, $nfd)=@_;
 136     return unless ($record && $record->fields());
 137     foreach my $field ($record->fields()){
 138         if ($field->tag()>=10){
 139             my @subfields;
 140             foreach my $subfield ($field->subfields()){
 141                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 142             }
 143             eval {
 144                 my $newfield=MARC::Field->new(
 145                             $field->tag(),
 146                             $field->indicator(1),
 147                             $field->indicator(2),
 148                             @subfields
 149                         );
 150                 $field->replace_with($newfield);
 151             };
 152             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 153         }
 154     }
 155 }
 156
 157 =head2 NormalizeString
 158
 159     my $normalized_string=NormalizeString($string,$nfd,$transform);
 160
 161 Given a string
 162 nfd : If you want to set NFD and not NFC
 163 transform : If you expect all the signs to be removed
 164
 165 Sets the PERL UTF8 Flag on your initial data if need be
 166 and applies cleaning if required
 167
 168 Returns a utf8 NFC normalized string
 169
 170 Sample code :
 171    my $string=NormalizeString ("l'ornithoptère");
 172    #results into ornithoptère in NFC form and sets UTF8 Flag
 173
 174 =cut
 175
 176
 177 sub NormalizeString{
 178         my ($string,$nfd,$transform)=@_;
 179     return $string unless defined($string); # force scalar context return.
 180     $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
 181         if ($nfd){
 182                 $string= NFD($string);
 183         }
 184         else {
 185                 $string=NFC($string);
 186         }
 187         if ($transform){
 188     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 189         #removing one letter words "d'" "l'"  was changed into "d " "l "
 190     $string=~s/\b\S\b//g;
 191     $string=~s/\s+$//g;
 192         }
 193     return $string;
 194 }
 195
 196 =head2 MarcToUTF8Record
 197
 198   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 199                                         $marc_flavour, [, $source_encoding]);
 200
 201 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 202 optional source encoding, return a C<MARC::Record> that is
 203 converted to UTF-8.
 204
 205 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 206 is not guaranteed to have been converted correctly.  Specifically,
 207 if C<$converted_from> is 'failed', the MARC record returned failed
 208 character conversion and had each of its non-ASCII octets changed
 209 to the Unicode replacement character.
 210
 211 If the source encoding was not specified, this routine will
 212 try to guess it; the character encoding used for a successful
 213 conversion is returned in C<$converted_from>.
 214
 215 =cut
 216
 217 sub MarcToUTF8Record {
 218     my $marc = shift;
 219     my $marc_flavour = shift;
 220     my $source_encoding = shift;
 221     my $marc_record;
 222     my $marc_blob_is_utf8 = 0;
 223     if (ref($marc) eq 'MARC::Record') {
 224         my $marc_blob = $marc->as_usmarc();
 225         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 226         $marc_record = $marc;
 227     } else {
 228         # dealing with a MARC blob
 229
 230         # remove any ersatz whitespace from the beginning and
 231         # end of the MARC blob -- these can creep into MARC
 232         # files produced by several sources -- caller really
 233         # should be doing this, however
 234         $marc =~ s/^\s+//;
 235         $marc =~ s/\s+$//;
 236         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 237         eval {
 238             $marc_record = MARC::Record->new_from_usmarc($marc);
 239         };
 240         if ($@) {
 241             # if we fail the first time, one likely problem
 242             # is that we have a MARC21 record that says that it's
 243             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 244             # We'll try parsing it again.
 245             substr($marc, 9, 1) = ' ';
 246             eval {
 247                 $marc_record = MARC::Record->new_from_usmarc($marc);
 248             };
 249             if ($@) {
 250                 # it's hopeless; return an empty MARC::Record
 251                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 252             }
 253         }
 254     }
 255
 256     # If we do not know the source encoding, try some guesses
 257     # as follows:
 258     #   1. Record is UTF-8 already.
 259     #   2. If MARC flavor is MARC21, then
 260     #      a. record is MARC-8
 261     #      b. record is ISO-8859-1
 262     #   3. If MARC flavor is UNIMARC, then
 263     if (not defined $source_encoding) {
 264         if ($marc_blob_is_utf8) {
 265             # note that for MARC21 we are not bothering to check
 266             # if the Leader/09 is set to 'a' or not -- because
 267             # of problems with various ILSs (including Koha in the
 268             # past, alas), this just is not trustworthy.
 269             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 270             return $marc_record, 'UTF-8', [];
 271         } else {
 272             if ($marc_flavour eq 'MARC21') {
 273                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 274             } elsif ($marc_flavour =~/UNIMARC/) {
 275                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 276             } else {
 277                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 278             }
 279         }
 280     } else {
 281         # caller knows the character encoding
 282         my $original_marc_record = $marc_record->clone();
 283         my @errors;
 284         if ($source_encoding =~ /utf-?8/i) {
 285             if ($marc_blob_is_utf8) {
 286                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 287                 return $marc_record, 'UTF-8', [];
 288             } else {
 289                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 290             }
 291         } elsif ($source_encoding =~ /marc-?8/i) {
 292             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 293         } elsif ($source_encoding =~ /5426/) {
 294             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 295         } else {
 296             # assume any other character encoding is for Text::Iconv
 297             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 298         }
 299
 300         if (@errors) {
 301             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 302             return $original_marc_record, 'failed', \@errors;
 303         } else {
 304             return $marc_record, $source_encoding, [];
 305         }
 306     }
 307
 308 }
 309
 310 =head2 SetMarcUnicodeFlag
 311
 312   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 313
 314 Set both the internal MARC::Record encoding flag
 315 and the appropriate Leader/09 (MARC21) or
 316 100/26-29 (UNIMARC) to indicate that the record
 317 is in UTF-8.  Note that this does B<not> do
 318 any actual character conversion.
 319
 320 =cut
 321
 322 sub SetMarcUnicodeFlag {
 323     my $marc_record = shift;
 324     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 325
 326     $marc_record->encoding('UTF-8');
 327     if ($marc_flavour eq 'MARC21') {
 328         my $leader = $marc_record->leader();
 329         substr($leader, 9, 1) = 'a';
 330         $marc_record->leader($leader);
 331     } elsif ($marc_flavour =~/UNIMARC/) {
 332         require C4::Context;
 333         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 334         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 335         my $string;
 336                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 337                 $string=$marc_record->subfield( 100, "a" );
 338         if (defined $string && length($string)==$subflength) {
 339                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 340         }
 341         else {
 342             $string = POSIX::strftime( "%Y%m%d", localtime );
 343             $string =~ s/\-//g;
 344             $string = sprintf( "%-*s", $subflength, $string );
 345             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 346         }
 347         substr( $string, $encodingposition, 3, "y50" );
 348         if ( $marc_record->subfield( 100, "a" ) ) {
 349                         $marc_record->field('100')->update(a=>$string);
 350                 }
 351                 else {
 352             $marc_record->insert_grouped_field(
 353                 MARC::Field->new( 100, '', '', "a" => $string ) );
 354         }
 355         Koha::Logger->get->debug("encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 ));
 356     } else {
 357         warn "Unrecognized marcflavour: $marc_flavour";
 358     }
 359 }
 360
 361 =head2 StripNonXmlChars
 362
 363   my $new_str = StripNonXmlChars($old_str);
 364
 365 Given a string, return a copy with the
 366 characters that are illegal in XML
 367 removed.
 368
 369 This function exists to work around a problem
 370 that can occur with badly-encoded MARC records.
 371 Specifically, if a UTF-8 MARC record also
 372 has excape (\x1b) characters, MARC::File::XML
 373 will let the escape characters pass through
 374 when as_xml() or as_xml_record() is called.  The
 375 problem is that the escape character is not
 376 legal in well-formed XML documents, so when
 377 MARC::File::XML attempts to parse such a record,
 378 the XML parser will fail.
 379
 380 Stripping such characters will allow a
 381 MARC::Record->new_from_xml()
 382 to work, at the possible risk of some data loss.
 383
 384 =cut
 385
 386 sub StripNonXmlChars {
 387     my $str = shift;
 388     if (!defined($str) || $str eq ""){
 389         return "";
 390     }
 391     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 392     return $str;
 393 }
 394
 395
 396
 397 =head2 nsb_clean
 398
 399 =over 4
 400
 401 nsb_clean($string);
 402
 403 =back
 404
 405 Removes Non Sorting Block characters
 406
 407 =cut
 408 sub nsb_clean {
 409     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 410     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 411     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 412     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 413     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 414
 415     # handles non sorting blocks
 416     my ($string) = @_ ;
 417     $_ = $string ;
 418     s/($C2){0,1}($NSB|$NSB2)//g ;
 419     s/($C2){0,1}($NSE|$NSE2)//g ;
 420     $string = $_ ;
 421
 422     return($string) ;
 423 }
 424
 425
 426 =head2 SanitizeRecord
 427
 428 SanitizeRecord($marcrecord);
 429
 430 Sanitize a record
 431 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
 432 It cleans any string with '&amp;amp;...', replacing it by '&'
 433
 434 =cut
 435
 436 sub SanitizeRecord {
 437     my ( $record, $biblionumber ) = @_;
 438     my $string;
 439     my $record_modified = 0;
 440     my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
 441     my ( $url_field, $url_subfield ) =
 442       C4::Biblio::GetMarcFromKohaField( 'biblioitems.url' );
 443     foreach my $field ( $record->fields() ) {
 444         if ( $field->is_control_field() ) {
 445             my $value           = $field->data();
 446             my $sanitized_value = _clean_ampersand($value);
 447             $record_modified = 1 if $sanitized_value ne $value;
 448             $field->update($sanitized_value);
 449         }
 450         else {
 451             my @subfields = $field->subfields();
 452             my @new_subfields;
 453             foreach my $subfield (@subfields) {
 454                 next
 455                   if $url_field eq $field->tag()
 456                       and $url_subfield eq $subfield->[0];
 457                 my $value           = $subfield->[1];
 458                 my $sanitized_value = _clean_ampersand($value);
 459                 push @new_subfields, $subfield->[0] => $sanitized_value;
 460                 $record_modified = 1 if $sanitized_value ne $value;
 461             }
 462             if ( scalar(@new_subfields) > 0 ) {
 463                 my $new_field = eval {
 464                     MARC::Field->new(
 465                         $field->tag(),        $field->indicator(1),
 466                         $field->indicator(2), @new_subfields
 467                     );
 468                 };
 469                 if ($@) {
 470                     warn "error : $@";
 471                 }
 472                 else {
 473                     $field->replace_with($new_field);
 474                 }
 475
 476             }
 477         }
 478     }
 479
 480     return $record, $record_modified;
 481 }
 482
 483 sub _clean_ampersand {
 484     my ($string) = @_;
 485     $string =~ s/(&)(amp;)+/$1/g;
 486     return $string;
 487 }
 488
 489 =head1 INTERNAL FUNCTIONS
 490
 491 =head2 _default_marc21_charconv_to_utf8
 492
 493   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 494
 495 Converts a C<MARC::Record> of unknown character set to UTF-8,
 496 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 497 to UTF-8, then a default conversion that replaces each non-ASCII
 498 character with the replacement character.
 499
 500 The C<$guessed_charset> return value contains the character set
 501 that resulted in a conversion to valid UTF-8; note that
 502 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 503 this is 'failed'.
 504
 505 =cut
 506
 507 sub _default_marc21_charconv_to_utf8 {
 508     my $marc_record = shift;
 509     my $marc_flavour = shift;
 510
 511     my $trial_marc8 = $marc_record->clone();
 512     my @all_errors = ();
 513     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 514     unless (@errors) {
 515         return $trial_marc8, 'MARC-8', [];
 516     }
 517     push @all_errors, @errors;
 518
 519     my $trial_8859_1 = $marc_record->clone();
 520     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 521     unless (@errors) {
 522         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 523                                                 # instead if we wanted to report details
 524                                                 # of the failed attempt at MARC-8 => UTF-8
 525     }
 526     push @all_errors, @errors;
 527
 528     my $default_converted = $marc_record->clone();
 529     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 530     return $default_converted, 'failed', \@all_errors;
 531 }
 532
 533 =head2 _default_unimarc_charconv_to_utf8
 534
 535   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 536
 537 Converts a C<MARC::Record> of unknown character set to UTF-8,
 538 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 539 to UTF-8, then a default conversion that replaces each non-ASCII
 540 character with the replacement character.
 541
 542 The C<$guessed_charset> return value contains the character set
 543 that resulted in a conversion to valid UTF-8; note that
 544 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 545 this is 'failed'.
 546
 547 =cut
 548
 549 sub _default_unimarc_charconv_to_utf8 {
 550     my $marc_record = shift;
 551     my $marc_flavour = shift;
 552
 553     my $trial_marc8 = $marc_record->clone();
 554     my @all_errors = ();
 555     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 556     unless (@errors) {
 557         return $trial_marc8, 'iso-5426';
 558     }
 559     push @all_errors, @errors;
 560
 561     my $trial_8859_1 = $marc_record->clone();
 562     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 563     unless (@errors) {
 564         return $trial_8859_1, 'iso-8859-1';
 565     }
 566     push @all_errors, @errors;
 567
 568     my $default_converted = $marc_record->clone();
 569     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 570     return $default_converted, 'failed', \@all_errors;
 571 }
 572
 573 =head2 _marc_marc8_to_utf8
 574
 575   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 576
 577 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 578 If the conversion fails for some reason, an
 579 appropriate messages will be placed in the returned
 580 C<@errors> array.
 581
 582 =cut
 583
 584 sub _marc_marc8_to_utf8 {
 585     my $marc_record = shift;
 586     my $marc_flavour = shift;
 587
 588     my $prev_ignore = MARC::Charset->ignore_errors();
 589     MARC::Charset->ignore_errors(1);
 590
 591     # trap warnings raised by MARC::Charset
 592     my @errors = ();
 593     local $SIG{__WARN__} = sub {
 594         my $msg = $_[0];
 595         if ($msg =~ /MARC.Charset/) {
 596             # FIXME - purpose of this regexp is to strip out the
 597             # line reference to MARC/Charset.pm, but as it
 598             # exists probably won't work quite on Windows --
 599             # some sort of minimal-bunch back-tracking RE
 600             # would be helpful here
 601             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 602             push @errors, $msg;
 603         } else {
 604             # if warning doesn't come from MARC::Charset, just
 605             # pass it on
 606             warn $msg;
 607         }
 608     };
 609
 610     foreach my $field ($marc_record->fields()) {
 611         if ($field->is_control_field()) {
 612             ; # do nothing -- control fields should not contain non-ASCII characters
 613         } else {
 614             my @converted_subfields;
 615             foreach my $subfield ($field->subfields()) {
 616                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 617                 unless (IsStringUTF8ish($utf8sf)) {
 618                     # Because of a bug in MARC::Charset 0.98, if the string
 619                     # has (a) one or more diacritics that (b) are only in character positions
 620                     # 128 to 255 inclusive, the resulting converted string is not in
 621                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 622                     # occurs, upgrade the string in place.  Moral of the story seems to be
 623                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 624                     # that the resulting string is UTF-8.
 625                     $utf8sf = Encode::encode('UTF-8', $utf8sf);
 626                 }
 627                 push @converted_subfields, $subfield->[0], $utf8sf;
 628             }
 629
 630             $field->replace_with(MARC::Field->new(
 631                 $field->tag(), $field->indicator(1), $field->indicator(2),
 632                 @converted_subfields)
 633             );
 634         }
 635     }
 636
 637     MARC::Charset->ignore_errors($prev_ignore);
 638
 639     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 640
 641     return @errors;
 642 }
 643
 644 =head2 _marc_iso5426_to_utf8
 645
 646   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 647
 648 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 649 If the conversion fails for some reason, an
 650 appropriate messages will be placed in the returned
 651 C<@errors> array.
 652
 653 FIXME - is ISO-5426 equivalent enough to MARC-8
 654 that C<MARC::Charset> can be used instead?
 655
 656 =cut
 657
 658 sub _marc_iso5426_to_utf8 {
 659     my $marc_record = shift;
 660     my $marc_flavour = shift;
 661
 662     my @errors = ();
 663
 664     foreach my $field ($marc_record->fields()) {
 665         if ($field->is_control_field()) {
 666             ; # do nothing -- control fields should not contain non-ASCII characters
 667         } else {
 668             my @converted_subfields;
 669             foreach my $subfield ($field->subfields()) {
 670                 my $utf8sf = char_decode5426($subfield->[1]);
 671                 push @converted_subfields, $subfield->[0], $utf8sf;
 672             }
 673
 674             $field->replace_with(MARC::Field->new(
 675                 $field->tag(), $field->indicator(1), $field->indicator(2),
 676                 @converted_subfields)
 677             );
 678         }
 679     }
 680
 681     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 682
 683     return @errors;
 684 }
 685
 686 =head2 _marc_to_utf8_via_text_iconv
 687
 688   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 689
 690 Convert a C<MARC::Record> to UTF-8 in-place using the
 691 C<Text::Iconv> CPAN module.  Any source encoding accepted
 692 by the user's iconv installation should work.  If
 693 the source encoding is not recognized on the user's
 694 server or the conversion fails for some reason,
 695 appropriate messages will be placed in the returned
 696 C<@errors> array.
 697
 698 =cut
 699
 700 sub _marc_to_utf8_via_text_iconv {
 701     my $marc_record = shift;
 702     my $marc_flavour = shift;
 703     my $source_encoding = shift;
 704
 705     my @errors = ();
 706     my $decoder;
 707     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 708     if ($@) {
 709         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 710         return @errors;
 711     }
 712
 713     my $prev_raise_error = Text::Iconv->raise_error();
 714     Text::Iconv->raise_error(1);
 715
 716     foreach my $field ($marc_record->fields()) {
 717         if ($field->is_control_field()) {
 718             ; # do nothing -- control fields should not contain non-ASCII characters
 719         } else {
 720             my @converted_subfields;
 721             foreach my $subfield ($field->subfields()) {
 722                 my $converted_value;
 723                 my $conversion_ok = 1;
 724                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 725                 if ($@) {
 726                     $conversion_ok = 0;
 727                     push @errors, $@;
 728                 } elsif (not defined $converted_value) {
 729                     $conversion_ok = 0;
 730                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 731                 }
 732
 733                 if ($conversion_ok) {
 734                     push @converted_subfields, $subfield->[0], $converted_value;
 735                 } else {
 736                     $converted_value = $subfield->[1];
 737                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 738                     push @converted_subfields, $subfield->[0], $converted_value;
 739                 }
 740             }
 741
 742             $field->replace_with(MARC::Field->new(
 743                 $field->tag(), $field->indicator(1), $field->indicator(2),
 744                 @converted_subfields)
 745             );
 746         }
 747     }
 748
 749     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 750     Text::Iconv->raise_error($prev_raise_error);
 751
 752     return @errors;
 753 }
 754
 755 =head2 _marc_to_utf8_replacement_char
 756
 757   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 758
 759 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 760 unsatisfactory method of replacing all non-ASCII (e.g.,
 761 where the eight bit is set) octet with the Unicode
 762 replacement character.  This is meant as a last-ditch
 763 method, and would be best used as part of a UI that
 764 lets a cataloguer pick various character conversions
 765 until they find the right one.
 766
 767 =cut
 768
 769 sub _marc_to_utf8_replacement_char {
 770     my $marc_record = shift;
 771     my $marc_flavour = shift;
 772
 773     foreach my $field ($marc_record->fields()) {
 774         if ($field->is_control_field()) {
 775             ; # do nothing -- control fields should not contain non-ASCII characters
 776         } else {
 777             my @converted_subfields;
 778             foreach my $subfield ($field->subfields()) {
 779                 my $value = $subfield->[1];
 780                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 781                 push @converted_subfields, $subfield->[0], $value;
 782             }
 783
 784             $field->replace_with(MARC::Field->new(
 785                 $field->tag(), $field->indicator(1), $field->indicator(2),
 786                 @converted_subfields)
 787             );
 788         }
 789     }
 790
 791     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 792 }
 793
 794 =head2 char_decode5426
 795
 796   my $utf8string = char_decode5426($iso_5426_string);
 797
 798 Converts a string from ISO-5426 to UTF-8.
 799
 800 =cut
 801
 802
 803 my %chars;
 804
 805 ####
 806 ## 0xb
 807 $chars{0xb0}=0x0101;#3/0ayn[ain]
 808 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 809 #$chars{0xb2}=0x00e0;#'à';
 810 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 811 #$chars{0xb3}=0x00e7;#'ç';
 812 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 813 # $chars{0xb4}='è';
 814 $chars{0xb4}=0x00e8;
 815 # $chars{0xb5}='é';
 816 $chars{0xb5}=0x00e9;
 817 $chars{0xb6}=0x2021; # double dagger
 818 $chars{0xb7}=0x00b7; # middle dot
 819 $chars{0xb8}=0x2033; # double prime
 820 $chars{0xb9}=0x2019; # right single quotation mark
 821 $chars{0xba}=0x201d; # right double quotation mark
 822 $chars{0xbb}=0x00bb; # right-pointing double angle quotation mark
 823 $chars{0xbc}=0x266f; # music sharp sign
 824 $chars{0xbd}=0x02b9; # modifier letter prime
 825 $chars{0xbe}=0x02ba; # modifier letter double prime
 826 $chars{0xbf}=0x00bf; # inverted question mark
 827
 828 ####
 829 ## 0xe
 830 $chars{0xe1}=0x00c6; # latin capital letter ae
 831 $chars{0xe2}=0x0110; # latin capital letter d with stroke
 832 $chars{0xe6}=0x0132; # latin capital ligature ij
 833 $chars{0xe8}=0x0141; # latin capital letter l with stroke
 834 $chars{0xe9}=0x00d8; # latin capital letter o with stroke
 835 $chars{0xea}=0x0152; # latin capital ligature oe
 836 $chars{0xec}=0x00de; # latin capital letter thorn
 837
 838 ####
 839 ## 0xf
 840 $chars{0xf1}=0x00e6; # latin small letter ae
 841 $chars{0xf2}=0x0111; # latin small letter d with stroke
 842 $chars{0xf3}=0x00f0; # latin small letter eth
 843 $chars{0xf5}=0x0131; # latin small letter dotless i
 844 $chars{0xf6}=0x0133; # latin small ligature ij
 845 $chars{0xf8}=0x0142; # latin small letter l with stroke
 846 $chars{0xf9}=0x00f8; # latin small letter o with stroke
 847 $chars{0xfa}=0x0153; # latin small ligature oe
 848 $chars{0xfb}=0x00df; # latin small letter sharp s
 849 $chars{0xfc}=0x00fe; # latin small letter thorn
 850
 851 ####
 852 ## Others
 853 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 854 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 855 #$chars{0x81d1}=0x00b0; # FIXME useless
 856
 857 ####
 858 ## combined characters iso5426
 859
 860 $chars{0xc041}=0x1ea2; # capital a with hook above
 861 $chars{0xc045}=0x1eba; # capital e with hook above
 862 $chars{0xc049}=0x1ec8; # capital i with hook above
 863 $chars{0xc04f}=0x1ece; # capital o with hook above
 864 $chars{0xc055}=0x1ee6; # capital u with hook above
 865 $chars{0xc059}=0x1ef6; # capital y with hook above
 866 $chars{0xc061}=0x1ea3; # small a with hook above
 867 $chars{0xc065}=0x1ebb; # small e with hook above
 868 $chars{0xc069}=0x1ec9; # small i with hook above
 869 $chars{0xc06f}=0x1ecf; # small o with hook above
 870 $chars{0xc075}=0x1ee7; # small u with hook above
 871 $chars{0xc079}=0x1ef7; # small y with hook above
 872
 873         # 4/1 grave accent
 874 $chars{0xc141}=0x00c0; # capital a with grave accent
 875 $chars{0xc145}=0x00c8; # capital e with grave accent
 876 $chars{0xc149}=0x00cc; # capital i with grave accent
 877 $chars{0xc14f}=0x00d2; # capital o with grave accent
 878 $chars{0xc155}=0x00d9; # capital u with grave accent
 879 $chars{0xc157}=0x1e80; # capital w with grave
 880 $chars{0xc159}=0x1ef2; # capital y with grave
 881 $chars{0xc161}=0x00e0; # small a with grave accent
 882 $chars{0xc165}=0x00e8; # small e with grave accent
 883 $chars{0xc169}=0x00ec; # small i with grave accent
 884 $chars{0xc16f}=0x00f2; # small o with grave accent
 885 $chars{0xc175}=0x00f9; # small u with grave accent
 886 $chars{0xc177}=0x1e81; # small w with grave
 887 $chars{0xc179}=0x1ef3; # small y with grave
 888         # 4/2 acute accent
 889 $chars{0xc241}=0x00c1; # capital a with acute accent
 890 $chars{0xc243}=0x0106; # capital c with acute accent
 891 $chars{0xc245}=0x00c9; # capital e with acute accent
 892 $chars{0xc247}=0x01f4; # capital g with acute
 893 $chars{0xc249}=0x00cd; # capital i with acute accent
 894 $chars{0xc24b}=0x1e30; # capital k with acute
 895 $chars{0xc24c}=0x0139; # capital l with acute accent
 896 $chars{0xc24d}=0x1e3e; # capital m with acute
 897 $chars{0xc24e}=0x0143; # capital n with acute accent
 898 $chars{0xc24f}=0x00d3; # capital o with acute accent
 899 $chars{0xc250}=0x1e54; # capital p with acute
 900 $chars{0xc252}=0x0154; # capital r with acute accent
 901 $chars{0xc253}=0x015a; # capital s with acute accent
 902 $chars{0xc255}=0x00da; # capital u with acute accent
 903 $chars{0xc257}=0x1e82; # capital w with acute
 904 $chars{0xc259}=0x00dd; # capital y with acute accent
 905 $chars{0xc25a}=0x0179; # capital z with acute accent
 906 $chars{0xc261}=0x00e1; # small a with acute accent
 907 $chars{0xc263}=0x0107; # small c with acute accent
 908 $chars{0xc265}=0x00e9; # small e with acute accent
 909 $chars{0xc267}=0x01f5; # small g with acute
 910 $chars{0xc269}=0x00ed; # small i with acute accent
 911 $chars{0xc26b}=0x1e31; # small k with acute
 912 $chars{0xc26c}=0x013a; # small l with acute accent
 913 $chars{0xc26d}=0x1e3f; # small m with acute
 914 $chars{0xc26e}=0x0144; # small n with acute accent
 915 $chars{0xc26f}=0x00f3; # small o with acute accent
 916 $chars{0xc270}=0x1e55; # small p with acute
 917 $chars{0xc272}=0x0155; # small r with acute accent
 918 $chars{0xc273}=0x015b; # small s with acute accent
 919 $chars{0xc275}=0x00fa; # small u with acute accent
 920 $chars{0xc277}=0x1e83; # small w with acute
 921 $chars{0xc279}=0x00fd; # small y with acute accent
 922 $chars{0xc27a}=0x017a; # small z with acute accent
 923 $chars{0xc2e1}=0x01fc; # capital ae with acute
 924 $chars{0xc2f1}=0x01fd; # small ae with acute
 925        # 4/3 circumflex accent
 926 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 927 $chars{0xc343}=0x0108; # capital c with circumflex
 928 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 929 $chars{0xc347}=0x011c; # capital g with circumflex
 930 $chars{0xc348}=0x0124; # capital h with circumflex
 931 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 932 $chars{0xc34a}=0x0134; # capital j with circumflex
 933 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 934 $chars{0xc353}=0x015c; # capital s with circumflex
 935 $chars{0xc355}=0x00db; # capital u with circumflex
 936 $chars{0xc357}=0x0174; # capital w with circumflex
 937 $chars{0xc359}=0x0176; # capital y with circumflex
 938 $chars{0xc35a}=0x1e90; # capital z with circumflex
 939 $chars{0xc361}=0x00e2; # small a with circumflex accent
 940 $chars{0xc363}=0x0109; # small c with circumflex
 941 $chars{0xc365}=0x00ea; # small e with circumflex accent
 942 $chars{0xc367}=0x011d; # small g with circumflex
 943 $chars{0xc368}=0x0125; # small h with circumflex
 944 $chars{0xc369}=0x00ee; # small i with circumflex accent
 945 $chars{0xc36a}=0x0135; # small j with circumflex
 946 $chars{0xc36e}=0x00f1; # small n with tilde
 947 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 948 $chars{0xc373}=0x015d; # small s with circumflex
 949 $chars{0xc375}=0x00fb; # small u with circumflex
 950 $chars{0xc377}=0x0175; # small w with circumflex
 951 $chars{0xc379}=0x0177; # small y with circumflex
 952 $chars{0xc37a}=0x1e91; # small z with circumflex
 953         # 4/4 tilde
 954 $chars{0xc441}=0x00c3; # capital a with tilde
 955 $chars{0xc445}=0x1ebc; # capital e with tilde
 956 $chars{0xc449}=0x0128; # capital i with tilde
 957 $chars{0xc44e}=0x00d1; # capital n with tilde
 958 $chars{0xc44f}=0x00d5; # capital o with tilde
 959 $chars{0xc455}=0x0168; # capital u with tilde
 960 $chars{0xc456}=0x1e7c; # capital v with tilde
 961 $chars{0xc459}=0x1ef8; # capital y with tilde
 962 $chars{0xc461}=0x00e3; # small a with tilde
 963 $chars{0xc465}=0x1ebd; # small e with tilde
 964 $chars{0xc469}=0x0129; # small i with tilde
 965 $chars{0xc46e}=0x00f1; # small n with tilde
 966 $chars{0xc46f}=0x00f5; # small o with tilde
 967 $chars{0xc475}=0x0169; # small u with tilde
 968 $chars{0xc476}=0x1e7d; # small v with tilde
 969 $chars{0xc479}=0x1ef9; # small y with tilde
 970     # 4/5 macron
 971 $chars{0xc541}=0x0100; # capital a with macron
 972 $chars{0xc545}=0x0112; # capital e with macron
 973 $chars{0xc547}=0x1e20; # capital g with macron
 974 $chars{0xc549}=0x012a; # capital i with macron
 975 $chars{0xc54f}=0x014c; # capital o with macron
 976 $chars{0xc555}=0x016a; # capital u with macron
 977 $chars{0xc561}=0x0101; # small a with macron
 978 $chars{0xc565}=0x0113; # small e with macron
 979 $chars{0xc567}=0x1e21; # small g with macron
 980 $chars{0xc569}=0x012b; # small i with macron
 981 $chars{0xc56f}=0x014d; # small o with macron
 982 $chars{0xc575}=0x016b; # small u with macron
 983 $chars{0xc572}=0x0159; # small r with macron
 984 $chars{0xc5e1}=0x01e2; # capital ae with macron
 985 $chars{0xc5f1}=0x01e3; # small ae with macron
 986         # 4/6 breve
 987 $chars{0xc641}=0x0102; # capital a with breve
 988 $chars{0xc645}=0x0114; # capital e with breve
 989 $chars{0xc647}=0x011e; # capital g with breve
 990 $chars{0xc649}=0x012c; # capital i with breve
 991 $chars{0xc64f}=0x014e; # capital o with breve
 992 $chars{0xc655}=0x016c; # capital u with breve
 993 $chars{0xc661}=0x0103; # small a with breve
 994 $chars{0xc665}=0x0115; # small e with breve
 995 $chars{0xc667}=0x011f; # small g with breve
 996 $chars{0xc669}=0x012d; # small i with breve
 997 $chars{0xc66f}=0x014f; # small o with breve
 998 $chars{0xc675}=0x016d; # small u with breve
 999         # 4/7 dot above
1000 $chars{0xc7b0}=0x01e1; # Ain with dot above
1001 $chars{0xc742}=0x1e02; # capital b with dot above
1002 $chars{0xc743}=0x010a; # capital c with dot above
1003 $chars{0xc744}=0x1e0a; # capital d with dot above
1004 $chars{0xc745}=0x0116; # capital e with dot above
1005 $chars{0xc746}=0x1e1e; # capital f with dot above
1006 $chars{0xc747}=0x0120; # capital g with dot above
1007 $chars{0xc748}=0x1e22; # capital h with dot above
1008 $chars{0xc749}=0x0130; # capital i with dot above
1009 $chars{0xc74d}=0x1e40; # capital m with dot above
1010 $chars{0xc74e}=0x1e44; # capital n with dot above
1011 $chars{0xc750}=0x1e56; # capital p with dot above
1012 $chars{0xc752}=0x1e58; # capital r with dot above
1013 $chars{0xc753}=0x1e60; # capital s with dot above
1014 $chars{0xc754}=0x1e6a; # capital t with dot above
1015 $chars{0xc757}=0x1e86; # capital w with dot above
1016 $chars{0xc758}=0x1e8a; # capital x with dot above
1017 $chars{0xc759}=0x1e8e; # capital y with dot above
1018 $chars{0xc75a}=0x017b; # capital z with dot above
1019 $chars{0xc761}=0x0227; # small b with dot above
1020 $chars{0xc762}=0x1e03; # small b with dot above
1021 $chars{0xc763}=0x010b; # small c with dot above
1022 $chars{0xc764}=0x1e0b; # small d with dot above
1023 $chars{0xc765}=0x0117; # small e with dot above
1024 $chars{0xc766}=0x1e1f; # small f with dot above
1025 $chars{0xc767}=0x0121; # small g with dot above
1026 $chars{0xc768}=0x1e23; # small h with dot above
1027 $chars{0xc76d}=0x1e41; # small m with dot above
1028 $chars{0xc76e}=0x1e45; # small n with dot above
1029 $chars{0xc770}=0x1e57; # small p with dot above
1030 $chars{0xc772}=0x1e59; # small r with dot above
1031 $chars{0xc773}=0x1e61; # small s with dot above
1032 $chars{0xc774}=0x1e6b; # small t with dot above
1033 $chars{0xc777}=0x1e87; # small w with dot above
1034 $chars{0xc778}=0x1e8b; # small x with dot above
1035 $chars{0xc779}=0x1e8f; # small y with dot above
1036 $chars{0xc77a}=0x017c; # small z with dot above
1037         # 4/8 trema, diaresis
1038 $chars{0xc820}=0x00a8; # diaeresis
1039 $chars{0xc841}=0x00c4; # capital a with diaeresis
1040 $chars{0xc845}=0x00cb; # capital e with diaeresis
1041 $chars{0xc848}=0x1e26; # capital h with diaeresis
1042 $chars{0xc849}=0x00cf; # capital i with diaeresis
1043 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1044 $chars{0xc855}=0x00dc; # capital u with diaeresis
1045 $chars{0xc857}=0x1e84; # capital w with diaeresis
1046 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1047 $chars{0xc859}=0x0178; # capital y with diaeresis
1048 $chars{0xc861}=0x00e4; # small a with diaeresis
1049 $chars{0xc865}=0x00eb; # small e with diaeresis
1050 $chars{0xc868}=0x1e27; # small h with diaeresis
1051 $chars{0xc869}=0x00ef; # small i with diaeresis
1052 $chars{0xc86f}=0x00f6; # small o with diaeresis
1053 $chars{0xc874}=0x1e97; # small t with diaeresis
1054 $chars{0xc875}=0x00fc; # small u with diaeresis
1055 $chars{0xc877}=0x1e85; # small w with diaeresis
1056 $chars{0xc878}=0x1e8d; # small x with diaeresis
1057 $chars{0xc879}=0x00ff; # small y with diaeresis
1058         # 4/9 umlaut
1059 $chars{0xc920}=0x00a8; # [diaeresis]
1060 $chars{0xc961}=0x00e4; # a with umlaut
1061 $chars{0xc965}=0x00eb; # e with umlaut
1062 $chars{0xc969}=0x00ef; # i with umlaut
1063 $chars{0xc96f}=0x00f6; # o with umlaut
1064 $chars{0xc975}=0x00fc; # u with umlaut
1065         # 4/10 circle above
1066 $chars{0xca41}=0x00c5; # capital a with ring above
1067 $chars{0xcaad}=0x016e; # capital u with ring above
1068 $chars{0xca61}=0x00e5; # small a with ring above
1069 $chars{0xca75}=0x016f; # small u with ring above
1070 $chars{0xca77}=0x1e98; # small w with ring above
1071 $chars{0xca79}=0x1e99; # small y with ring above
1072         # 4/11 high comma off centre
1073         # 4/12 inverted high comma centred
1074         # 4/13 double acute accent
1075 $chars{0xcd4f}=0x0150; # capital o with double acute
1076 $chars{0xcd55}=0x0170; # capital u with double acute
1077 $chars{0xcd6f}=0x0151; # small o with double acute
1078 $chars{0xcd75}=0x0171; # small u with double acute
1079         # 4/14 horn
1080 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1081 $chars{0xce55}=0x01af; # latin capital letter u with horn
1082 $chars{0xce74}=0x01a1; # latin small letter o with horn
1083 $chars{0xce75}=0x01b0; # latin small letter u with horn
1084         # 4/15 caron (hacek
1085 $chars{0xcf41}=0x01cd; # capital a with caron
1086 $chars{0xcf43}=0x010c; # capital c with caron
1087 $chars{0xcf44}=0x010e; # capital d with caron
1088 $chars{0xcf45}=0x011a; # capital e with caron
1089 $chars{0xcf47}=0x01e6; # capital g with caron
1090 $chars{0xcf49}=0x01cf; # capital i with caron
1091 $chars{0xcf4b}=0x01e8; # capital k with caron
1092 $chars{0xcf4c}=0x013d; # capital l with caron
1093 $chars{0xcf4e}=0x0147; # capital n with caron
1094 $chars{0xcf4f}=0x01d1; # capital o with caron
1095 $chars{0xcf52}=0x0158; # capital r with caron
1096 $chars{0xcf53}=0x0160; # capital s with caron
1097 $chars{0xcf54}=0x0164; # capital t with caron
1098 $chars{0xcf55}=0x01d3; # capital u with caron
1099 $chars{0xcf5a}=0x017d; # capital z with caron
1100 $chars{0xcf61}=0x01ce; # small a with caron
1101 $chars{0xcf63}=0x010d; # small c with caron
1102 $chars{0xcf64}=0x010f; # small d with caron
1103 $chars{0xcf65}=0x011b; # small e with caron
1104 $chars{0xcf67}=0x01e7; # small g with caron
1105 $chars{0xcf69}=0x01d0; # small i with caron
1106 $chars{0xcf6a}=0x01f0; # small j with caron
1107 $chars{0xcf6b}=0x01e9; # small k with caron
1108 $chars{0xcf6c}=0x013e; # small l with caron
1109 $chars{0xcf6e}=0x0148; # small n with caron
1110 $chars{0xcf6f}=0x01d2; # small o with caron
1111 $chars{0xcf72}=0x0159; # small r with caron
1112 $chars{0xcf73}=0x0161; # small s with caron
1113 $chars{0xcf74}=0x0165; # small t with caron
1114 $chars{0xcf75}=0x01d4; # small u with caron
1115 $chars{0xcf7a}=0x017e; # small z with caron
1116         # 5/0 cedilla
1117 $chars{0xd020}=0x00b8; # cedilla
1118 $chars{0xd043}=0x00c7; # capital c with cedilla
1119 $chars{0xd044}=0x1e10; # capital d with cedilla
1120 $chars{0xd047}=0x0122; # capital g with cedilla
1121 $chars{0xd048}=0x1e28; # capital h with cedilla
1122 $chars{0xd04b}=0x0136; # capital k with cedilla
1123 $chars{0xd04c}=0x013b; # capital l with cedilla
1124 $chars{0xd04e}=0x0145; # capital n with cedilla
1125 $chars{0xd052}=0x0156; # capital r with cedilla
1126 $chars{0xd053}=0x015e; # capital s with cedilla
1127 $chars{0xd054}=0x0162; # capital t with cedilla
1128 $chars{0xd063}=0x00e7; # small c with cedilla
1129 $chars{0xd064}=0x1e11; # small d with cedilla
1130 $chars{0xd065}=0x0119; # small e with cedilla
1131 $chars{0xd067}=0x0123; # small g with cedilla
1132 $chars{0xd068}=0x1e29; # small h with cedilla
1133 $chars{0xd06b}=0x0137; # small k with cedilla
1134 $chars{0xd06c}=0x013c; # small l with cedilla
1135 $chars{0xd06e}=0x0146; # small n with cedilla
1136 $chars{0xd072}=0x0157; # small r with cedilla
1137 $chars{0xd073}=0x015f; # small s with cedilla
1138 $chars{0xd074}=0x0163; # small t with cedilla
1139         # 5/1 rude
1140         # 5/2 hook to left
1141         # 5/3 ogonek (hook to right
1142 $chars{0xd320}=0x02db; # ogonek
1143 $chars{0xd341}=0x0104; # capital a with ogonek
1144 $chars{0xd345}=0x0118; # capital e with ogonek
1145 $chars{0xd349}=0x012e; # capital i with ogonek
1146 $chars{0xd34f}=0x01ea; # capital o with ogonek
1147 $chars{0xd355}=0x0172; # capital u with ogonek
1148 $chars{0xd361}=0x0105; # small a with ogonek
1149 $chars{0xd365}=0x0119; # small e with ogonek
1150 $chars{0xd369}=0x012f; # small i with ogonek
1151 $chars{0xd36f}=0x01eb; # small o with ogonek
1152 $chars{0xd375}=0x0173; # small u with ogonek
1153         # 5/4 circle below
1154 $chars{0xd441}=0x1e00; # capital a with ring below
1155 $chars{0xd461}=0x1e01; # small a with ring below
1156         # 5/5 half circle below
1157 $chars{0xd548}=0x1e2a; # capital h with breve below
1158 $chars{0xd568}=0x1e2b; # small h with breve below
1159         # 5/6 dot below
1160 $chars{0xd641}=0x1ea0; # capital a with dot below
1161 $chars{0xd642}=0x1e04; # capital b with dot below
1162 $chars{0xd644}=0x1e0c; # capital d with dot below
1163 $chars{0xd645}=0x1eb8; # capital e with dot below
1164 $chars{0xd648}=0x1e24; # capital h with dot below
1165 $chars{0xd649}=0x1eca; # capital i with dot below
1166 $chars{0xd64b}=0x1e32; # capital k with dot below
1167 $chars{0xd64c}=0x1e36; # capital l with dot below
1168 $chars{0xd64d}=0x1e42; # capital m with dot below
1169 $chars{0xd64e}=0x1e46; # capital n with dot below
1170 $chars{0xd64f}=0x1ecc; # capital o with dot below
1171 $chars{0xd652}=0x1e5a; # capital r with dot below
1172 $chars{0xd653}=0x1e62; # capital s with dot below
1173 $chars{0xd654}=0x1e6c; # capital t with dot below
1174 $chars{0xd655}=0x1ee4; # capital u with dot below
1175 $chars{0xd656}=0x1e7e; # capital v with dot below
1176 $chars{0xd657}=0x1e88; # capital w with dot below
1177 $chars{0xd659}=0x1ef4; # capital y with dot below
1178 $chars{0xd65a}=0x1e92; # capital z with dot below
1179 $chars{0xd661}=0x1ea1; # small a with dot below
1180 $chars{0xd662}=0x1e05; # small b with dot below
1181 $chars{0xd664}=0x1e0d; # small d with dot below
1182 $chars{0xd665}=0x1eb9; # small e with dot below
1183 $chars{0xd668}=0x1e25; # small h with dot below
1184 $chars{0xd669}=0x1ecb; # small i with dot below
1185 $chars{0xd66b}=0x1e33; # small k with dot below
1186 $chars{0xd66c}=0x1e37; # small l with dot below
1187 $chars{0xd66d}=0x1e43; # small m with dot below
1188 $chars{0xd66e}=0x1e47; # small n with dot below
1189 $chars{0xd66f}=0x1ecd; # small o with dot below
1190 $chars{0xd672}=0x1e5b; # small r with dot below
1191 $chars{0xd673}=0x1e63; # small s with dot below
1192 $chars{0xd674}=0x1e6d; # small t with dot below
1193 $chars{0xd675}=0x1ee5; # small u with dot below
1194 $chars{0xd676}=0x1e7f; # small v with dot below
1195 $chars{0xd677}=0x1e89; # small w with dot below
1196 $chars{0xd679}=0x1ef5; # small y with dot below
1197 $chars{0xd67a}=0x1e93; # small z with dot below
1198         # 5/7 double dot below
1199 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1200 $chars{0xd775}=0x1e73; # small u with diaeresis below
1201         # 5/8 underline
1202 $chars{0xd820}=0x005f; # underline
1203         # 5/9 double underline
1204 $chars{0xd920}=0x2017; # double underline
1205         # 5/10 small low vertical bar
1206 $chars{0xda20}=0x02cc; #
1207         # 5/11 circumflex below
1208         # 5/12 (this position shall not be used)
1209         # 5/13 left half of ligature sign and of double tilde
1210         # 5/14 right half of ligature sign
1211         # 5/15 right half of double tilde
1212 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1213
1214 sub char_decode5426 {
1215     my ( $string) = @_;
1216     my $result;
1217
1218     my @data = unpack("C*", $string);
1219     my @characters;
1220     my $length=scalar(@data);
1221     for (my $i = 0; $i < scalar(@data); $i++) {
1222       my $char= $data[$i];
1223       if ($char >= 0x00 && $char <= 0x7F){
1224         #IsAscii
1225
1226           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1227       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1228         #Combined Char
1229         my $convchar ;
1230         if ($chars{$char*256+$data[$i+1]}) {
1231           $convchar= $chars{$char * 256 + $data[$i+1]};
1232           $i++;
1233 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1234         } elsif ($chars{$char})  {
1235           $convchar= $chars{$char};
1236 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1237         }else {
1238           $convchar=$char;
1239         }
1240         push @characters,$convchar;
1241       } else {
1242         my $convchar;
1243         if ($chars{$char})  {
1244           $convchar= $chars{$char};
1245 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1246         }else {
1247 #            printf "char %x $char\n",$char;
1248           $convchar=$char;
1249         }
1250         push @characters,$convchar;
1251       }
1252     }
1253     $result=pack "U*",@characters;
1254 #     $result=~s/\x01//;
1255 #     $result=~s/\x00//;
1256      $result=~s/\x0f//;
1257      $result=~s/\x1b.//;
1258      $result=~s/\x0e//;
1259      $result=~s/\x1b\x5b//;
1260 #   map{printf "%x",$_} @characters;
1261 #   printf "\n";
1262   return $result;
1263 }
1264
1265 1;
1266
1267
1268 =head1 AUTHOR
1269
1270 Koha Development Team <http://koha-community.org/>
1271
1272 Galen Charlton <galen.charlton@liblime.com>
1273
1274 =cut