C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Context;
  26 use C4::Debug;
  27 use Unicode::Normalize;
  28 use Encode qw( decode encode is_utf8 );
  29
  30 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  31
  32 BEGIN {
  33     # set the version for version checking
  34     $VERSION = 3.07.00.049;
  35     require Exporter;
  36     @ISA    = qw(Exporter);
  37     @EXPORT = qw(
  38         NormalizeString
  39         IsStringUTF8ish
  40         MarcToUTF8Record
  41         SetUTF8Flag
  42         SetMarcUnicodeFlag
  43         StripNonXmlChars
  44         nsb_clean
  45         SanitizeRecord
  46     );
  47 }
  48
  49 =encoding UTF-8
  50
  51 =head1 NAME
  52
  53 C4::Charset - utilities for handling character set conversions.
  54
  55 =head1 SYNOPSIS
  56
  57   use C4::Charset;
  58
  59 =head1 DESCRIPTION
  60
  61 This module contains routines for dealing with character set
  62 conversions, particularly for MARC records.
  63
  64 A variety of character encodings are in use by various MARC
  65 standards, and even more character encodings are used by
  66 non-standard MARC records.  The various MARC formats generally
  67 do not do a good job of advertising a given record's character
  68 encoding, and even when a record does advertise its encoding,
  69 e.g., via the Leader/09, experience has shown that one cannot
  70 trust it.
  71
  72 Ultimately, all MARC records are stored in Koha in UTF-8 and
  73 must be converted from whatever the source character encoding is.
  74 The goal of this module is to ensure that these conversions
  75 take place accurately.  When a character conversion cannot take
  76 place, or at least not accurately, the module was provide
  77 enough information to allow user-facing code to inform the user
  78 on how to deal with the situation.
  79
  80 =cut
  81
  82 =head1 FUNCTIONS
  83
  84 =head2 IsStringUTF8ish
  85
  86   my $is_utf8 = IsStringUTF8ish($str);
  87
  88 Determines if C<$str> is valid UTF-8.  This can mean
  89 one of two things:
  90
  91 =over
  92
  93 =item *
  94
  95 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  96
  97 =item *
  98
  99 The Perl UTF-8 flag is B<not> set, but the octets contain
 100 valid UTF-8.
 101
 102 =back
 103
 104 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 105 because in one could be presented with a MARC blob that is
 106 not actually in UTF-8 but whose sequence of octets appears to be
 107 valid UTF-8.  The rest of the MARC character conversion functions
 108 will assume that this situation occur does not very often.
 109
 110 =cut
 111
 112 sub IsStringUTF8ish {
 113     my $str = shift;
 114
 115     return 1 if Encode::is_utf8($str);
 116     return utf8::decode( $str );
 117 }
 118
 119 =head2 SetUTF8Flag
 120
 121   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 122
 123 This function sets the PERL UTF8 flag for data.
 124 It is required when using new_from_usmarc
 125 since MARC::File::USMARC does not handle PERL UTF8 setting.
 126 When editing unicode marc records fields and subfields, you
 127 would end up in double encoding without using this function.
 128
 129 If $nfd is set, string normalization will use NFD instead of NFC
 130
 131 FIXME
 132 In my opinion, this function belongs to MARC::Record and not
 133 to this package.
 134 But since it handles charset, and MARC::Record, it finds its way in that package
 135
 136 =cut
 137
 138 sub SetUTF8Flag{
 139     my ($record, $nfd)=@_;
 140     return unless ($record && $record->fields());
 141     foreach my $field ($record->fields()){
 142         if ($field->tag()>=10){
 143             my @subfields;
 144             foreach my $subfield ($field->subfields()){
 145                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 146             }
 147             eval {
 148                 my $newfield=MARC::Field->new(
 149                             $field->tag(),
 150                             $field->indicator(1),
 151                             $field->indicator(2),
 152                             @subfields
 153                         );
 154                 $field->replace_with($newfield);
 155             };
 156             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 157         }
 158     }
 159 }
 160
 161 =head2 NormalizeString
 162
 163     my $normalized_string=NormalizeString($string,$nfd,$transform);
 164
 165 Given a string
 166 nfd : If you want to set NFD and not NFC
 167 transform : If you expect all the signs to be removed
 168
 169 Sets the PERL UTF8 Flag on your initial data if need be
 170 and applies cleaning if required
 171
 172 Returns a utf8 NFC normalized string
 173
 174 Sample code :
 175    my $string=NormalizeString ("l'ornithoptère");
 176    #results into ornithoptère in NFC form and sets UTF8 Flag
 177
 178 =cut
 179
 180
 181 sub NormalizeString{
 182         my ($string,$nfd,$transform)=@_;
 183     return $string unless defined($string); # force scalar context return.
 184     $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
 185         if ($nfd){
 186                 $string= NFD($string);
 187         }
 188         else {
 189                 $string=NFC($string);
 190         }
 191         if ($transform){
 192     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 193         #removing one letter words "d'" "l'"  was changed into "d " "l "
 194     $string=~s/\b\S\b//g;
 195     $string=~s/\s+$//g;
 196         }
 197     return $string;
 198 }
 199
 200 =head2 MarcToUTF8Record
 201
 202   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 203                                         $marc_flavour, [, $source_encoding]);
 204
 205 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 206 optional source encoding, return a C<MARC::Record> that is
 207 converted to UTF-8.
 208
 209 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 210 is not guaranteed to have been converted correctly.  Specifically,
 211 if C<$converted_from> is 'failed', the MARC record returned failed
 212 character conversion and had each of its non-ASCII octets changed
 213 to the Unicode replacement character.
 214
 215 If the source encoding was not specified, this routine will
 216 try to guess it; the character encoding used for a successful
 217 conversion is returned in C<$converted_from>.
 218
 219 =cut
 220
 221 sub MarcToUTF8Record {
 222     my $marc = shift;
 223     my $marc_flavour = shift;
 224     my $source_encoding = shift;
 225     my $marc_record;
 226     my $marc_blob_is_utf8 = 0;
 227     if (ref($marc) eq 'MARC::Record') {
 228         my $marc_blob = $marc->as_usmarc();
 229         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 230         $marc_record = $marc;
 231     } else {
 232         # dealing with a MARC blob
 233
 234         # remove any ersatz whitespace from the beginning and
 235         # end of the MARC blob -- these can creep into MARC
 236         # files produced by several sources -- caller really
 237         # should be doing this, however
 238         $marc =~ s/^\s+//;
 239         $marc =~ s/\s+$//;
 240         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 241         eval {
 242             $marc_record = MARC::Record->new_from_usmarc($marc);
 243         };
 244         if ($@) {
 245             # if we fail the first time, one likely problem
 246             # is that we have a MARC21 record that says that it's
 247             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 248             # We'll try parsing it again.
 249             substr($marc, 9, 1) = ' ';
 250             eval {
 251                 $marc_record = MARC::Record->new_from_usmarc($marc);
 252             };
 253             if ($@) {
 254                 # it's hopeless; return an empty MARC::Record
 255                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 256             }
 257         }
 258     }
 259
 260     # If we do not know the source encoding, try some guesses
 261     # as follows:
 262     #   1. Record is UTF-8 already.
 263     #   2. If MARC flavor is MARC21 or NORMARC, then
 264     #      a. record is MARC-8
 265     #      b. record is ISO-8859-1
 266     #   3. If MARC flavor is UNIMARC, then
 267     if (not defined $source_encoding) {
 268         if ($marc_blob_is_utf8) {
 269             # note that for MARC21/NORMARC we are not bothering to check
 270             # if the Leader/09 is set to 'a' or not -- because
 271             # of problems with various ILSs (including Koha in the
 272             # past, alas), this just is not trustworthy.
 273             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 274             return $marc_record, 'UTF-8', [];
 275         } else {
 276             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 277                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 278             } elsif ($marc_flavour =~/UNIMARC/) {
 279                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 280             } else {
 281                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 282             }
 283         }
 284     } else {
 285         # caller knows the character encoding
 286         my $original_marc_record = $marc_record->clone();
 287         my @errors;
 288         if ($source_encoding =~ /utf-?8/i) {
 289             if ($marc_blob_is_utf8) {
 290                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 291                 return $marc_record, 'UTF-8', [];
 292             } else {
 293                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 294             }
 295         } elsif ($source_encoding =~ /marc-?8/i) {
 296             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 297         } elsif ($source_encoding =~ /5426/) {
 298             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 299         } else {
 300             # assume any other character encoding is for Text::Iconv
 301             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 302         }
 303
 304         if (@errors) {
 305             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 306             return $original_marc_record, 'failed', \@errors;
 307         } else {
 308             return $marc_record, $source_encoding, [];
 309         }
 310     }
 311
 312 }
 313
 314 =head2 SetMarcUnicodeFlag
 315
 316   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 317
 318 Set both the internal MARC::Record encoding flag
 319 and the appropriate Leader/09 (MARC21) or
 320 100/26-29 (UNIMARC) to indicate that the record
 321 is in UTF-8.  Note that this does B<not> do
 322 any actual character conversion.
 323
 324 =cut
 325
 326 sub SetMarcUnicodeFlag {
 327     my $marc_record = shift;
 328     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 329
 330     $marc_record->encoding('UTF-8');
 331     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 332         my $leader = $marc_record->leader();
 333         substr($leader, 9, 1) = 'a';
 334         $marc_record->leader($leader);
 335     } elsif ($marc_flavour =~/UNIMARC/) {
 336         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 337         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 338         my $string;
 339                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 340                 $string=$marc_record->subfield( 100, "a" );
 341         if (defined $string && length($string)==$subflength) {
 342                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 343         }
 344         else {
 345             $string = POSIX::strftime( "%Y%m%d", localtime );
 346             $string =~ s/\-//g;
 347             $string = sprintf( "%-*s", $subflength, $string );
 348             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 349         }
 350         substr( $string, $encodingposition, 3, "y50" );
 351         if ( $marc_record->subfield( 100, "a" ) ) {
 352                         $marc_record->field('100')->update(a=>$string);
 353                 }
 354                 else {
 355             $marc_record->insert_grouped_field(
 356                 MARC::Field->new( 100, '', '', "a" => $string ) );
 357         }
 358                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 359     } else {
 360         warn "Unrecognized marcflavour: $marc_flavour";
 361     }
 362 }
 363
 364 =head2 StripNonXmlChars
 365
 366   my $new_str = StripNonXmlChars($old_str);
 367
 368 Given a string, return a copy with the
 369 characters that are illegal in XML
 370 removed.
 371
 372 This function exists to work around a problem
 373 that can occur with badly-encoded MARC records.
 374 Specifically, if a UTF-8 MARC record also
 375 has excape (\x1b) characters, MARC::File::XML
 376 will let the escape characters pass through
 377 when as_xml() or as_xml_record() is called.  The
 378 problem is that the escape character is not
 379 legal in well-formed XML documents, so when
 380 MARC::File::XML attempts to parse such a record,
 381 the XML parser will fail.
 382
 383 Stripping such characters will allow a
 384 MARC::Record->new_from_xml()
 385 to work, at the possible risk of some data loss.
 386
 387 =cut
 388
 389 sub StripNonXmlChars {
 390     my $str = shift;
 391     if (!defined($str) || $str eq ""){
 392         return "";
 393     }
 394     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 395     return $str;
 396 }
 397
 398
 399
 400 =head2 nsb_clean
 401
 402 =over 4
 403
 404 nsb_clean($string);
 405
 406 =back
 407
 408 Removes Non Sorting Block characters
 409
 410 =cut
 411 sub nsb_clean {
 412     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 413     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 414     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 415     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 416     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 417
 418     # handles non sorting blocks
 419     my ($string) = @_ ;
 420     $_ = $string ;
 421     s/($C2){0,1}($NSB|$NSB2)//g ;
 422     s/($C2){0,1}($NSE|$NSE2)//g ;
 423     $string = $_ ;
 424
 425     return($string) ;
 426 }
 427
 428
 429 =head2 SanitizeRecord
 430
 431 SanitizeRecord($marcrecord);
 432
 433 Sanitize a record
 434 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
 435 It cleans any string with '&amp;amp;...', replacing it by '&'
 436
 437 =cut
 438
 439 sub SanitizeRecord {
 440     my ( $record, $biblionumber ) = @_;
 441     my $string;
 442     my $record_modified = 0;
 443     my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
 444     my ( $url_field, $url_subfield ) =
 445       C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
 446     foreach my $field ( $record->fields() ) {
 447         if ( $field->is_control_field() ) {
 448             my $value           = $field->data();
 449             my $sanitized_value = _clean_ampersand($value);
 450             $record_modified = 1 if $sanitized_value ne $value;
 451             $field->update($sanitized_value);
 452         }
 453         else {
 454             my @subfields = $field->subfields();
 455             my @new_subfields;
 456             foreach my $subfield (@subfields) {
 457                 next
 458                   if $url_field eq $field->tag()
 459                       and $url_subfield eq $subfield->[0];
 460                 my $value           = $subfield->[1];
 461                 my $sanitized_value = _clean_ampersand($value);
 462                 push @new_subfields, $subfield->[0] => $sanitized_value;
 463                 $record_modified = 1 if $sanitized_value ne $value;
 464             }
 465             if ( scalar(@new_subfields) > 0 ) {
 466                 my $new_field = eval {
 467                     MARC::Field->new(
 468                         $field->tag(),        $field->indicator(1),
 469                         $field->indicator(2), @new_subfields
 470                     );
 471                 };
 472                 if ($@) {
 473                     warn "error : $@";
 474                 }
 475                 else {
 476                     $field->replace_with($new_field);
 477                 }
 478
 479             }
 480         }
 481     }
 482
 483     return $record, $record_modified;
 484 }
 485
 486 sub _clean_ampersand {
 487     my ($string) = @_;
 488     $string =~ s/(&)(amp;)+/$1/g;
 489     return $string;
 490 }
 491
 492 =head1 INTERNAL FUNCTIONS
 493
 494 =head2 _default_marc21_charconv_to_utf8
 495
 496   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 497
 498 Converts a C<MARC::Record> of unknown character set to UTF-8,
 499 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 500 to UTF-8, then a default conversion that replaces each non-ASCII
 501 character with the replacement character.
 502
 503 The C<$guessed_charset> return value contains the character set
 504 that resulted in a conversion to valid UTF-8; note that
 505 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 506 this is 'failed'.
 507
 508 =cut
 509
 510 sub _default_marc21_charconv_to_utf8 {
 511     my $marc_record = shift;
 512     my $marc_flavour = shift;
 513
 514     my $trial_marc8 = $marc_record->clone();
 515     my @all_errors = ();
 516     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 517     unless (@errors) {
 518         return $trial_marc8, 'MARC-8', [];
 519     }
 520     push @all_errors, @errors;
 521
 522     my $trial_8859_1 = $marc_record->clone();
 523     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 524     unless (@errors) {
 525         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 526                                                 # instead if we wanted to report details
 527                                                 # of the failed attempt at MARC-8 => UTF-8
 528     }
 529     push @all_errors, @errors;
 530
 531     my $default_converted = $marc_record->clone();
 532     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 533     return $default_converted, 'failed', \@all_errors;
 534 }
 535
 536 =head2 _default_unimarc_charconv_to_utf8
 537
 538   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 539
 540 Converts a C<MARC::Record> of unknown character set to UTF-8,
 541 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 542 to UTF-8, then a default conversion that replaces each non-ASCII
 543 character with the replacement character.
 544
 545 The C<$guessed_charset> return value contains the character set
 546 that resulted in a conversion to valid UTF-8; note that
 547 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 548 this is 'failed'.
 549
 550 =cut
 551
 552 sub _default_unimarc_charconv_to_utf8 {
 553     my $marc_record = shift;
 554     my $marc_flavour = shift;
 555
 556     my $trial_marc8 = $marc_record->clone();
 557     my @all_errors = ();
 558     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 559     unless (@errors) {
 560         return $trial_marc8, 'iso-5426';
 561     }
 562     push @all_errors, @errors;
 563
 564     my $trial_8859_1 = $marc_record->clone();
 565     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 566     unless (@errors) {
 567         return $trial_8859_1, 'iso-8859-1';
 568     }
 569     push @all_errors, @errors;
 570
 571     my $default_converted = $marc_record->clone();
 572     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 573     return $default_converted, 'failed', \@all_errors;
 574 }
 575
 576 =head2 _marc_marc8_to_utf8
 577
 578   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 579
 580 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 581 If the conversion fails for some reason, an
 582 appropriate messages will be placed in the returned
 583 C<@errors> array.
 584
 585 =cut
 586
 587 sub _marc_marc8_to_utf8 {
 588     my $marc_record = shift;
 589     my $marc_flavour = shift;
 590
 591     my $prev_ignore = MARC::Charset->ignore_errors();
 592     MARC::Charset->ignore_errors(1);
 593
 594     # trap warnings raised by MARC::Charset
 595     my @errors = ();
 596     local $SIG{__WARN__} = sub {
 597         my $msg = $_[0];
 598         if ($msg =~ /MARC.Charset/) {
 599             # FIXME - purpose of this regexp is to strip out the
 600             # line reference to MARC/Charset.pm, but as it
 601             # exists probably won't work quite on Windows --
 602             # some sort of minimal-bunch back-tracking RE
 603             # would be helpful here
 604             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 605             push @errors, $msg;
 606         } else {
 607             # if warning doesn't come from MARC::Charset, just
 608             # pass it on
 609             warn $msg;
 610         }
 611     };
 612
 613     foreach my $field ($marc_record->fields()) {
 614         if ($field->is_control_field()) {
 615             ; # do nothing -- control fields should not contain non-ASCII characters
 616         } else {
 617             my @converted_subfields;
 618             foreach my $subfield ($field->subfields()) {
 619                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 620                 unless (IsStringUTF8ish($utf8sf)) {
 621                     # Because of a bug in MARC::Charset 0.98, if the string
 622                     # has (a) one or more diacritics that (b) are only in character positions
 623                     # 128 to 255 inclusive, the resulting converted string is not in
 624                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 625                     # occurs, upgrade the string in place.  Moral of the story seems to be
 626                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 627                     # that the resulting string is UTF-8.
 628                     $utf8sf = Encode::encode('UTF-8', $utf8sf);
 629                 }
 630                 push @converted_subfields, $subfield->[0], $utf8sf;
 631             }
 632
 633             $field->replace_with(MARC::Field->new(
 634                 $field->tag(), $field->indicator(1), $field->indicator(2),
 635                 @converted_subfields)
 636             );
 637         }
 638     }
 639
 640     MARC::Charset->ignore_errors($prev_ignore);
 641
 642     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 643
 644     return @errors;
 645 }
 646
 647 =head2 _marc_iso5426_to_utf8
 648
 649   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 650
 651 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 652 If the conversion fails for some reason, an
 653 appropriate messages will be placed in the returned
 654 C<@errors> array.
 655
 656 FIXME - is ISO-5426 equivalent enough to MARC-8
 657 that C<MARC::Charset> can be used instead?
 658
 659 =cut
 660
 661 sub _marc_iso5426_to_utf8 {
 662     my $marc_record = shift;
 663     my $marc_flavour = shift;
 664
 665     my @errors = ();
 666
 667     foreach my $field ($marc_record->fields()) {
 668         if ($field->is_control_field()) {
 669             ; # do nothing -- control fields should not contain non-ASCII characters
 670         } else {
 671             my @converted_subfields;
 672             foreach my $subfield ($field->subfields()) {
 673                 my $utf8sf = char_decode5426($subfield->[1]);
 674                 push @converted_subfields, $subfield->[0], $utf8sf;
 675             }
 676
 677             $field->replace_with(MARC::Field->new(
 678                 $field->tag(), $field->indicator(1), $field->indicator(2),
 679                 @converted_subfields)
 680             );
 681         }
 682     }
 683
 684     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 685
 686     return @errors;
 687 }
 688
 689 =head2 _marc_to_utf8_via_text_iconv
 690
 691   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 692
 693 Convert a C<MARC::Record> to UTF-8 in-place using the
 694 C<Text::Iconv> CPAN module.  Any source encoding accepted
 695 by the user's iconv installation should work.  If
 696 the source encoding is not recognized on the user's
 697 server or the conversion fails for some reason,
 698 appropriate messages will be placed in the returned
 699 C<@errors> array.
 700
 701 =cut
 702
 703 sub _marc_to_utf8_via_text_iconv {
 704     my $marc_record = shift;
 705     my $marc_flavour = shift;
 706     my $source_encoding = shift;
 707
 708     my @errors = ();
 709     my $decoder;
 710     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 711     if ($@) {
 712         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 713         return @errors;
 714     }
 715
 716     my $prev_raise_error = Text::Iconv->raise_error();
 717     Text::Iconv->raise_error(1);
 718
 719     foreach my $field ($marc_record->fields()) {
 720         if ($field->is_control_field()) {
 721             ; # do nothing -- control fields should not contain non-ASCII characters
 722         } else {
 723             my @converted_subfields;
 724             foreach my $subfield ($field->subfields()) {
 725                 my $converted_value;
 726                 my $conversion_ok = 1;
 727                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 728                 if ($@) {
 729                     $conversion_ok = 0;
 730                     push @errors, $@;
 731                 } elsif (not defined $converted_value) {
 732                     $conversion_ok = 0;
 733                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 734                 }
 735
 736                 if ($conversion_ok) {
 737                     push @converted_subfields, $subfield->[0], $converted_value;
 738                 } else {
 739                     $converted_value = $subfield->[1];
 740                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 741                     push @converted_subfields, $subfield->[0], $converted_value;
 742                 }
 743             }
 744
 745             $field->replace_with(MARC::Field->new(
 746                 $field->tag(), $field->indicator(1), $field->indicator(2),
 747                 @converted_subfields)
 748             );
 749         }
 750     }
 751
 752     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 753     Text::Iconv->raise_error($prev_raise_error);
 754
 755     return @errors;
 756 }
 757
 758 =head2 _marc_to_utf8_replacement_char
 759
 760   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 761
 762 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 763 unsatisfactory method of replacing all non-ASCII (e.g.,
 764 where the eight bit is set) octet with the Unicode
 765 replacement character.  This is meant as a last-ditch
 766 method, and would be best used as part of a UI that
 767 lets a cataloguer pick various character conversions
 768 until he or she finds the right one.
 769
 770 =cut
 771
 772 sub _marc_to_utf8_replacement_char {
 773     my $marc_record = shift;
 774     my $marc_flavour = shift;
 775
 776     foreach my $field ($marc_record->fields()) {
 777         if ($field->is_control_field()) {
 778             ; # do nothing -- control fields should not contain non-ASCII characters
 779         } else {
 780             my @converted_subfields;
 781             foreach my $subfield ($field->subfields()) {
 782                 my $value = $subfield->[1];
 783                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 784                 push @converted_subfields, $subfield->[0], $value;
 785             }
 786
 787             $field->replace_with(MARC::Field->new(
 788                 $field->tag(), $field->indicator(1), $field->indicator(2),
 789                 @converted_subfields)
 790             );
 791         }
 792     }
 793
 794     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 795 }
 796
 797 =head2 char_decode5426
 798
 799   my $utf8string = char_decode5426($iso_5426_string);
 800
 801 Converts a string from ISO-5426 to UTF-8.
 802
 803 =cut
 804
 805
 806 my %chars;
 807 $chars{0xb0}=0x0101;#3/0ayn[ain]
 808 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 809 #$chars{0xb2}=0x00e0;#'à';
 810 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 811 #$chars{0xb3}=0x00e7;#'ç';
 812 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 813 # $chars{0xb4}='è';
 814 $chars{0xb4}=0x00e8;
 815 $chars{0xbd}=0x02b9;
 816 $chars{0xbe}=0x02ba;
 817 # $chars{0xb5}='é';
 818 $chars{0xb5}=0x00e9;
 819 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 820 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 821 $chars{0xfa}=0x0153; #oe
 822 $chars{0xea}=0x0152; #oe
 823 $chars{0x81d1}=0x00b0;
 824
 825 ####
 826 ## combined characters iso5426
 827
 828 $chars{0xc041}=0x1ea2; # capital a with hook above
 829 $chars{0xc045}=0x1eba; # capital e with hook above
 830 $chars{0xc049}=0x1ec8; # capital i with hook above
 831 $chars{0xc04f}=0x1ece; # capital o with hook above
 832 $chars{0xc055}=0x1ee6; # capital u with hook above
 833 $chars{0xc059}=0x1ef6; # capital y with hook above
 834 $chars{0xc061}=0x1ea3; # small a with hook above
 835 $chars{0xc065}=0x1ebb; # small e with hook above
 836 $chars{0xc069}=0x1ec9; # small i with hook above
 837 $chars{0xc06f}=0x1ecf; # small o with hook above
 838 $chars{0xc075}=0x1ee7; # small u with hook above
 839 $chars{0xc079}=0x1ef7; # small y with hook above
 840
 841         # 4/1 grave accent
 842 $chars{0xc141}=0x00c0; # capital a with grave accent
 843 $chars{0xc145}=0x00c8; # capital e with grave accent
 844 $chars{0xc149}=0x00cc; # capital i with grave accent
 845 $chars{0xc14f}=0x00d2; # capital o with grave accent
 846 $chars{0xc155}=0x00d9; # capital u with grave accent
 847 $chars{0xc157}=0x1e80; # capital w with grave
 848 $chars{0xc159}=0x1ef2; # capital y with grave
 849 $chars{0xc161}=0x00e0; # small a with grave accent
 850 $chars{0xc165}=0x00e8; # small e with grave accent
 851 $chars{0xc169}=0x00ec; # small i with grave accent
 852 $chars{0xc16f}=0x00f2; # small o with grave accent
 853 $chars{0xc175}=0x00f9; # small u with grave accent
 854 $chars{0xc177}=0x1e81; # small w with grave
 855 $chars{0xc179}=0x1ef3; # small y with grave
 856         # 4/2 acute accent
 857 $chars{0xc241}=0x00c1; # capital a with acute accent
 858 $chars{0xc243}=0x0106; # capital c with acute accent
 859 $chars{0xc245}=0x00c9; # capital e with acute accent
 860 $chars{0xc247}=0x01f4; # capital g with acute
 861 $chars{0xc249}=0x00cd; # capital i with acute accent
 862 $chars{0xc24b}=0x1e30; # capital k with acute
 863 $chars{0xc24c}=0x0139; # capital l with acute accent
 864 $chars{0xc24d}=0x1e3e; # capital m with acute
 865 $chars{0xc24e}=0x0143; # capital n with acute accent
 866 $chars{0xc24f}=0x00d3; # capital o with acute accent
 867 $chars{0xc250}=0x1e54; # capital p with acute
 868 $chars{0xc252}=0x0154; # capital r with acute accent
 869 $chars{0xc253}=0x015a; # capital s with acute accent
 870 $chars{0xc255}=0x00da; # capital u with acute accent
 871 $chars{0xc257}=0x1e82; # capital w with acute
 872 $chars{0xc259}=0x00dd; # capital y with acute accent
 873 $chars{0xc25a}=0x0179; # capital z with acute accent
 874 $chars{0xc261}=0x00e1; # small a with acute accent
 875 $chars{0xc263}=0x0107; # small c with acute accent
 876 $chars{0xc265}=0x00e9; # small e with acute accent
 877 $chars{0xc267}=0x01f5; # small g with acute
 878 $chars{0xc269}=0x00ed; # small i with acute accent
 879 $chars{0xc26b}=0x1e31; # small k with acute
 880 $chars{0xc26c}=0x013a; # small l with acute accent
 881 $chars{0xc26d}=0x1e3f; # small m with acute
 882 $chars{0xc26e}=0x0144; # small n with acute accent
 883 $chars{0xc26f}=0x00f3; # small o with acute accent
 884 $chars{0xc270}=0x1e55; # small p with acute
 885 $chars{0xc272}=0x0155; # small r with acute accent
 886 $chars{0xc273}=0x015b; # small s with acute accent
 887 $chars{0xc275}=0x00fa; # small u with acute accent
 888 $chars{0xc277}=0x1e83; # small w with acute
 889 $chars{0xc279}=0x00fd; # small y with acute accent
 890 $chars{0xc27a}=0x017a; # small z with acute accent
 891 $chars{0xc2e1}=0x01fc; # capital ae with acute
 892 $chars{0xc2f1}=0x01fd; # small ae with acute
 893        # 4/3 circumflex accent
 894 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 895 $chars{0xc343}=0x0108; # capital c with circumflex
 896 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 897 $chars{0xc347}=0x011c; # capital g with circumflex
 898 $chars{0xc348}=0x0124; # capital h with circumflex
 899 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 900 $chars{0xc34a}=0x0134; # capital j with circumflex
 901 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 902 $chars{0xc353}=0x015c; # capital s with circumflex
 903 $chars{0xc355}=0x00db; # capital u with circumflex
 904 $chars{0xc357}=0x0174; # capital w with circumflex
 905 $chars{0xc359}=0x0176; # capital y with circumflex
 906 $chars{0xc35a}=0x1e90; # capital z with circumflex
 907 $chars{0xc361}=0x00e2; # small a with circumflex accent
 908 $chars{0xc363}=0x0109; # small c with circumflex
 909 $chars{0xc365}=0x00ea; # small e with circumflex accent
 910 $chars{0xc367}=0x011d; # small g with circumflex
 911 $chars{0xc368}=0x0125; # small h with circumflex
 912 $chars{0xc369}=0x00ee; # small i with circumflex accent
 913 $chars{0xc36a}=0x0135; # small j with circumflex
 914 $chars{0xc36e}=0x00f1; # small n with tilde
 915 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 916 $chars{0xc373}=0x015d; # small s with circumflex
 917 $chars{0xc375}=0x00fb; # small u with circumflex
 918 $chars{0xc377}=0x0175; # small w with circumflex
 919 $chars{0xc379}=0x0177; # small y with circumflex
 920 $chars{0xc37a}=0x1e91; # small z with circumflex
 921         # 4/4 tilde
 922 $chars{0xc441}=0x00c3; # capital a with tilde
 923 $chars{0xc445}=0x1ebc; # capital e with tilde
 924 $chars{0xc449}=0x0128; # capital i with tilde
 925 $chars{0xc44e}=0x00d1; # capital n with tilde
 926 $chars{0xc44f}=0x00d5; # capital o with tilde
 927 $chars{0xc455}=0x0168; # capital u with tilde
 928 $chars{0xc456}=0x1e7c; # capital v with tilde
 929 $chars{0xc459}=0x1ef8; # capital y with tilde
 930 $chars{0xc461}=0x00e3; # small a with tilde
 931 $chars{0xc465}=0x1ebd; # small e with tilde
 932 $chars{0xc469}=0x0129; # small i with tilde
 933 $chars{0xc46e}=0x00f1; # small n with tilde
 934 $chars{0xc46f}=0x00f5; # small o with tilde
 935 $chars{0xc475}=0x0169; # small u with tilde
 936 $chars{0xc476}=0x1e7d; # small v with tilde
 937 $chars{0xc479}=0x1ef9; # small y with tilde
 938     # 4/5 macron
 939 $chars{0xc541}=0x0100; # capital a with macron
 940 $chars{0xc545}=0x0112; # capital e with macron
 941 $chars{0xc547}=0x1e20; # capital g with macron
 942 $chars{0xc549}=0x012a; # capital i with macron
 943 $chars{0xc54f}=0x014c; # capital o with macron
 944 $chars{0xc555}=0x016a; # capital u with macron
 945 $chars{0xc561}=0x0101; # small a with macron
 946 $chars{0xc565}=0x0113; # small e with macron
 947 $chars{0xc567}=0x1e21; # small g with macron
 948 $chars{0xc569}=0x012b; # small i with macron
 949 $chars{0xc56f}=0x014d; # small o with macron
 950 $chars{0xc575}=0x016b; # small u with macron
 951 $chars{0xc572}=0x0159; # small r with macron
 952 $chars{0xc5e1}=0x01e2; # capital ae with macron
 953 $chars{0xc5f1}=0x01e3; # small ae with macron
 954         # 4/6 breve
 955 $chars{0xc641}=0x0102; # capital a with breve
 956 $chars{0xc645}=0x0114; # capital e with breve
 957 $chars{0xc647}=0x011e; # capital g with breve
 958 $chars{0xc649}=0x012c; # capital i with breve
 959 $chars{0xc64f}=0x014e; # capital o with breve
 960 $chars{0xc655}=0x016c; # capital u with breve
 961 $chars{0xc661}=0x0103; # small a with breve
 962 $chars{0xc665}=0x0115; # small e with breve
 963 $chars{0xc667}=0x011f; # small g with breve
 964 $chars{0xc669}=0x012d; # small i with breve
 965 $chars{0xc66f}=0x014f; # small o with breve
 966 $chars{0xc675}=0x016d; # small u with breve
 967         # 4/7 dot above
 968 $chars{0xc7b0}=0x01e1; # Ain with dot above
 969 $chars{0xc742}=0x1e02; # capital b with dot above
 970 $chars{0xc743}=0x010a; # capital c with dot above
 971 $chars{0xc744}=0x1e0a; # capital d with dot above
 972 $chars{0xc745}=0x0116; # capital e with dot above
 973 $chars{0xc746}=0x1e1e; # capital f with dot above
 974 $chars{0xc747}=0x0120; # capital g with dot above
 975 $chars{0xc748}=0x1e22; # capital h with dot above
 976 $chars{0xc749}=0x0130; # capital i with dot above
 977 $chars{0xc74d}=0x1e40; # capital m with dot above
 978 $chars{0xc74e}=0x1e44; # capital n with dot above
 979 $chars{0xc750}=0x1e56; # capital p with dot above
 980 $chars{0xc752}=0x1e58; # capital r with dot above
 981 $chars{0xc753}=0x1e60; # capital s with dot above
 982 $chars{0xc754}=0x1e6a; # capital t with dot above
 983 $chars{0xc757}=0x1e86; # capital w with dot above
 984 $chars{0xc758}=0x1e8a; # capital x with dot above
 985 $chars{0xc759}=0x1e8e; # capital y with dot above
 986 $chars{0xc75a}=0x017b; # capital z with dot above
 987 $chars{0xc761}=0x0227; # small b with dot above
 988 $chars{0xc762}=0x1e03; # small b with dot above
 989 $chars{0xc763}=0x010b; # small c with dot above
 990 $chars{0xc764}=0x1e0b; # small d with dot above
 991 $chars{0xc765}=0x0117; # small e with dot above
 992 $chars{0xc766}=0x1e1f; # small f with dot above
 993 $chars{0xc767}=0x0121; # small g with dot above
 994 $chars{0xc768}=0x1e23; # small h with dot above
 995 $chars{0xc76d}=0x1e41; # small m with dot above
 996 $chars{0xc76e}=0x1e45; # small n with dot above
 997 $chars{0xc770}=0x1e57; # small p with dot above
 998 $chars{0xc772}=0x1e59; # small r with dot above
 999 $chars{0xc773}=0x1e61; # small s with dot above
1000 $chars{0xc774}=0x1e6b; # small t with dot above
1001 $chars{0xc777}=0x1e87; # small w with dot above
1002 $chars{0xc778}=0x1e8b; # small x with dot above
1003 $chars{0xc779}=0x1e8f; # small y with dot above
1004 $chars{0xc77a}=0x017c; # small z with dot above
1005         # 4/8 trema, diaresis
1006 $chars{0xc820}=0x00a8; # diaeresis
1007 $chars{0xc841}=0x00c4; # capital a with diaeresis
1008 $chars{0xc845}=0x00cb; # capital e with diaeresis
1009 $chars{0xc848}=0x1e26; # capital h with diaeresis
1010 $chars{0xc849}=0x00cf; # capital i with diaeresis
1011 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1012 $chars{0xc855}=0x00dc; # capital u with diaeresis
1013 $chars{0xc857}=0x1e84; # capital w with diaeresis
1014 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1015 $chars{0xc859}=0x0178; # capital y with diaeresis
1016 $chars{0xc861}=0x00e4; # small a with diaeresis
1017 $chars{0xc865}=0x00eb; # small e with diaeresis
1018 $chars{0xc868}=0x1e27; # small h with diaeresis
1019 $chars{0xc869}=0x00ef; # small i with diaeresis
1020 $chars{0xc86f}=0x00f6; # small o with diaeresis
1021 $chars{0xc874}=0x1e97; # small t with diaeresis
1022 $chars{0xc875}=0x00fc; # small u with diaeresis
1023 $chars{0xc877}=0x1e85; # small w with diaeresis
1024 $chars{0xc878}=0x1e8d; # small x with diaeresis
1025 $chars{0xc879}=0x00ff; # small y with diaeresis
1026         # 4/9 umlaut
1027 $chars{0xc920}=0x00a8; # [diaeresis]
1028 $chars{0xc961}=0x00e4; # a with umlaut
1029 $chars{0xc965}=0x00eb; # e with umlaut
1030 $chars{0xc969}=0x00ef; # i with umlaut
1031 $chars{0xc96f}=0x00f6; # o with umlaut
1032 $chars{0xc975}=0x00fc; # u with umlaut
1033         # 4/10 circle above
1034 $chars{0xca41}=0x00c5; # capital a with ring above
1035 $chars{0xcaad}=0x016e; # capital u with ring above
1036 $chars{0xca61}=0x00e5; # small a with ring above
1037 $chars{0xca75}=0x016f; # small u with ring above
1038 $chars{0xca77}=0x1e98; # small w with ring above
1039 $chars{0xca79}=0x1e99; # small y with ring above
1040         # 4/11 high comma off centre
1041         # 4/12 inverted high comma centred
1042         # 4/13 double acute accent
1043 $chars{0xcd4f}=0x0150; # capital o with double acute
1044 $chars{0xcd55}=0x0170; # capital u with double acute
1045 $chars{0xcd6f}=0x0151; # small o with double acute
1046 $chars{0xcd75}=0x0171; # small u with double acute
1047         # 4/14 horn
1048 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1049 $chars{0xce55}=0x01af; # latin capital letter u with horn
1050 $chars{0xce74}=0x01a1; # latin small letter o with horn
1051 $chars{0xce75}=0x01b0; # latin small letter u with horn
1052         # 4/15 caron (hacek
1053 $chars{0xcf41}=0x01cd; # capital a with caron
1054 $chars{0xcf43}=0x010c; # capital c with caron
1055 $chars{0xcf44}=0x010e; # capital d with caron
1056 $chars{0xcf45}=0x011a; # capital e with caron
1057 $chars{0xcf47}=0x01e6; # capital g with caron
1058 $chars{0xcf49}=0x01cf; # capital i with caron
1059 $chars{0xcf4b}=0x01e8; # capital k with caron
1060 $chars{0xcf4c}=0x013d; # capital l with caron
1061 $chars{0xcf4e}=0x0147; # capital n with caron
1062 $chars{0xcf4f}=0x01d1; # capital o with caron
1063 $chars{0xcf52}=0x0158; # capital r with caron
1064 $chars{0xcf53}=0x0160; # capital s with caron
1065 $chars{0xcf54}=0x0164; # capital t with caron
1066 $chars{0xcf55}=0x01d3; # capital u with caron
1067 $chars{0xcf5a}=0x017d; # capital z with caron
1068 $chars{0xcf61}=0x01ce; # small a with caron
1069 $chars{0xcf63}=0x010d; # small c with caron
1070 $chars{0xcf64}=0x010f; # small d with caron
1071 $chars{0xcf65}=0x011b; # small e with caron
1072 $chars{0xcf67}=0x01e7; # small g with caron
1073 $chars{0xcf69}=0x01d0; # small i with caron
1074 $chars{0xcf6a}=0x01f0; # small j with caron
1075 $chars{0xcf6b}=0x01e9; # small k with caron
1076 $chars{0xcf6c}=0x013e; # small l with caron
1077 $chars{0xcf6e}=0x0148; # small n with caron
1078 $chars{0xcf6f}=0x01d2; # small o with caron
1079 $chars{0xcf72}=0x0159; # small r with caron
1080 $chars{0xcf73}=0x0161; # small s with caron
1081 $chars{0xcf74}=0x0165; # small t with caron
1082 $chars{0xcf75}=0x01d4; # small u with caron
1083 $chars{0xcf7a}=0x017e; # small z with caron
1084         # 5/0 cedilla
1085 $chars{0xd020}=0x00b8; # cedilla
1086 $chars{0xd043}=0x00c7; # capital c with cedilla
1087 $chars{0xd044}=0x1e10; # capital d with cedilla
1088 $chars{0xd047}=0x0122; # capital g with cedilla
1089 $chars{0xd048}=0x1e28; # capital h with cedilla
1090 $chars{0xd04b}=0x0136; # capital k with cedilla
1091 $chars{0xd04c}=0x013b; # capital l with cedilla
1092 $chars{0xd04e}=0x0145; # capital n with cedilla
1093 $chars{0xd052}=0x0156; # capital r with cedilla
1094 $chars{0xd053}=0x015e; # capital s with cedilla
1095 $chars{0xd054}=0x0162; # capital t with cedilla
1096 $chars{0xd063}=0x00e7; # small c with cedilla
1097 $chars{0xd064}=0x1e11; # small d with cedilla
1098 $chars{0xd065}=0x0119; # small e with cedilla
1099 $chars{0xd067}=0x0123; # small g with cedilla
1100 $chars{0xd068}=0x1e29; # small h with cedilla
1101 $chars{0xd06b}=0x0137; # small k with cedilla
1102 $chars{0xd06c}=0x013c; # small l with cedilla
1103 $chars{0xd06e}=0x0146; # small n with cedilla
1104 $chars{0xd072}=0x0157; # small r with cedilla
1105 $chars{0xd073}=0x015f; # small s with cedilla
1106 $chars{0xd074}=0x0163; # small t with cedilla
1107         # 5/1 rude
1108         # 5/2 hook to left
1109         # 5/3 ogonek (hook to right
1110 $chars{0xd320}=0x02db; # ogonek
1111 $chars{0xd341}=0x0104; # capital a with ogonek
1112 $chars{0xd345}=0x0118; # capital e with ogonek
1113 $chars{0xd349}=0x012e; # capital i with ogonek
1114 $chars{0xd34f}=0x01ea; # capital o with ogonek
1115 $chars{0xd355}=0x0172; # capital u with ogonek
1116 $chars{0xd361}=0x0105; # small a with ogonek
1117 $chars{0xd365}=0x0119; # small e with ogonek
1118 $chars{0xd369}=0x012f; # small i with ogonek
1119 $chars{0xd36f}=0x01eb; # small o with ogonek
1120 $chars{0xd375}=0x0173; # small u with ogonek
1121         # 5/4 circle below
1122 $chars{0xd441}=0x1e00; # capital a with ring below
1123 $chars{0xd461}=0x1e01; # small a with ring below
1124         # 5/5 half circle below
1125 $chars{0xf948}=0x1e2a; # capital h with breve below
1126 $chars{0xf968}=0x1e2b; # small h with breve below
1127         # 5/6 dot below
1128 $chars{0xd641}=0x1ea0; # capital a with dot below
1129 $chars{0xd642}=0x1e04; # capital b with dot below
1130 $chars{0xd644}=0x1e0c; # capital d with dot below
1131 $chars{0xd645}=0x1eb8; # capital e with dot below
1132 $chars{0xd648}=0x1e24; # capital h with dot below
1133 $chars{0xd649}=0x1eca; # capital i with dot below
1134 $chars{0xd64b}=0x1e32; # capital k with dot below
1135 $chars{0xd64c}=0x1e36; # capital l with dot below
1136 $chars{0xd64d}=0x1e42; # capital m with dot below
1137 $chars{0xd64e}=0x1e46; # capital n with dot below
1138 $chars{0xd64f}=0x1ecc; # capital o with dot below
1139 $chars{0xd652}=0x1e5a; # capital r with dot below
1140 $chars{0xd653}=0x1e62; # capital s with dot below
1141 $chars{0xd654}=0x1e6c; # capital t with dot below
1142 $chars{0xd655}=0x1ee4; # capital u with dot below
1143 $chars{0xd656}=0x1e7e; # capital v with dot below
1144 $chars{0xd657}=0x1e88; # capital w with dot below
1145 $chars{0xd659}=0x1ef4; # capital y with dot below
1146 $chars{0xd65a}=0x1e92; # capital z with dot below
1147 $chars{0xd661}=0x1ea1; # small a with dot below
1148 $chars{0xd662}=0x1e05; # small b with dot below
1149 $chars{0xd664}=0x1e0d; # small d with dot below
1150 $chars{0xd665}=0x1eb9; # small e with dot below
1151 $chars{0xd668}=0x1e25; # small h with dot below
1152 $chars{0xd669}=0x1ecb; # small i with dot below
1153 $chars{0xd66b}=0x1e33; # small k with dot below
1154 $chars{0xd66c}=0x1e37; # small l with dot below
1155 $chars{0xd66d}=0x1e43; # small m with dot below
1156 $chars{0xd66e}=0x1e47; # small n with dot below
1157 $chars{0xd66f}=0x1ecd; # small o with dot below
1158 $chars{0xd672}=0x1e5b; # small r with dot below
1159 $chars{0xd673}=0x1e63; # small s with dot below
1160 $chars{0xd674}=0x1e6d; # small t with dot below
1161 $chars{0xd675}=0x1ee5; # small u with dot below
1162 $chars{0xd676}=0x1e7f; # small v with dot below
1163 $chars{0xd677}=0x1e89; # small w with dot below
1164 $chars{0xd679}=0x1ef5; # small y with dot below
1165 $chars{0xd67a}=0x1e93; # small z with dot below
1166         # 5/7 double dot below
1167 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1168 $chars{0xd775}=0x1e73; # small u with diaeresis below
1169         # 5/8 underline
1170 $chars{0xd820}=0x005f; # underline
1171         # 5/9 double underline
1172 $chars{0xd920}=0x2017; # double underline
1173         # 5/10 small low vertical bar
1174 $chars{0xda20}=0x02cc; #
1175         # 5/11 circumflex below
1176         # 5/12 (this position shall not be used)
1177         # 5/13 left half of ligature sign and of double tilde
1178         # 5/14 right half of ligature sign
1179         # 5/15 right half of double tilde
1180 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1181
1182 sub char_decode5426 {
1183     my ( $string) = @_;
1184     my $result;
1185
1186     my @data = unpack("C*", $string);
1187     my @characters;
1188     my $length=scalar(@data);
1189     for (my $i = 0; $i < scalar(@data); $i++) {
1190       my $char= $data[$i];
1191       if ($char >= 0x00 && $char <= 0x7F){
1192         #IsAscii
1193
1194           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1195       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1196         #Combined Char
1197         my $convchar ;
1198         if ($chars{$char*256+$data[$i+1]}) {
1199           $convchar= $chars{$char * 256 + $data[$i+1]};
1200           $i++;
1201 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1202         } elsif ($chars{$char})  {
1203           $convchar= $chars{$char};
1204 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1205         }else {
1206           $convchar=$char;
1207         }
1208         push @characters,$convchar;
1209       } else {
1210         my $convchar;
1211         if ($chars{$char})  {
1212           $convchar= $chars{$char};
1213 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1214         }else {
1215 #            printf "char %x $char\n",$char;
1216           $convchar=$char;
1217         }
1218         push @characters,$convchar;
1219       }
1220     }
1221     $result=pack "U*",@characters;
1222 #     $result=~s/\x01//;
1223 #     $result=~s/\x00//;
1224      $result=~s/\x0f//;
1225      $result=~s/\x1b.//;
1226      $result=~s/\x0e//;
1227      $result=~s/\x1b\x5b//;
1228 #   map{printf "%x",$_} @characters;
1229 #   printf "\n";
1230   return $result;
1231 }
1232
1233 1;
1234
1235
1236 =head1 AUTHOR
1237
1238 Koha Development Team <http://koha-community.org/>
1239
1240 Galen Charlton <galen.charlton@liblime.com>
1241
1242 =cut