C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Context;
  26 use C4::Debug;
  27 use Unicode::Normalize;
  28
  29 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  30
  31 BEGIN {
  32     # set the version for version checking
  33     $VERSION = 3.07.00.049;
  34     require Exporter;
  35     @ISA    = qw(Exporter);
  36     @EXPORT = qw(
  37         NormalizeString
  38         IsStringUTF8ish
  39         MarcToUTF8Record
  40         SetUTF8Flag
  41         SetMarcUnicodeFlag
  42         StripNonXmlChars
  43         nsb_clean
  44         SanitizeRecord
  45     );
  46 }
  47
  48 =encoding UTF-8
  49
  50 =head1 NAME
  51
  52 C4::Charset - utilities for handling character set conversions.
  53
  54 =head1 SYNOPSIS
  55
  56   use C4::Charset;
  57
  58 =head1 DESCRIPTION
  59
  60 This module contains routines for dealing with character set
  61 conversions, particularly for MARC records.
  62
  63 A variety of character encodings are in use by various MARC
  64 standards, and even more character encodings are used by
  65 non-standard MARC records.  The various MARC formats generally
  66 do not do a good job of advertising a given record's character
  67 encoding, and even when a record does advertise its encoding,
  68 e.g., via the Leader/09, experience has shown that one cannot
  69 trust it.
  70
  71 Ultimately, all MARC records are stored in Koha in UTF-8 and
  72 must be converted from whatever the source character encoding is.
  73 The goal of this module is to ensure that these conversions
  74 take place accurately.  When a character conversion cannot take
  75 place, or at least not accurately, the module was provide
  76 enough information to allow user-facing code to inform the user
  77 on how to deal with the situation.
  78
  79 =cut
  80
  81 =head1 FUNCTIONS
  82
  83 =head2 IsStringUTF8ish
  84
  85   my $is_utf8 = IsStringUTF8ish($str);
  86
  87 Determines if C<$str> is valid UTF-8.  This can mean
  88 one of two things:
  89
  90 =over
  91
  92 =item *
  93
  94 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  95
  96 =item *
  97
  98 The Perl UTF-8 flag is B<not> set, but the octets contain
  99 valid UTF-8.
 100
 101 =back
 102
 103 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 104 because in one could be presented with a MARC blob that is
 105 not actually in UTF-8 but whose sequence of octets appears to be
 106 valid UTF-8.  The rest of the MARC character conversion functions
 107 will assume that this situation occur does not very often.
 108
 109 =cut
 110
 111 sub IsStringUTF8ish {
 112     my $str = shift;
 113
 114     return 1 if utf8::is_utf8($str);
 115     return utf8::decode($str);
 116 }
 117
 118 =head2 SetUTF8Flag
 119
 120   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 121
 122 This function sets the PERL UTF8 flag for data.
 123 It is required when using new_from_usmarc
 124 since MARC::File::USMARC does not handle PERL UTF8 setting.
 125 When editing unicode marc records fields and subfields, you
 126 would end up in double encoding without using this function.
 127
 128 If $nfd is set, string normalization will use NFD instead of NFC
 129
 130 FIXME
 131 In my opinion, this function belongs to MARC::Record and not
 132 to this package.
 133 But since it handles charset, and MARC::Record, it finds its way in that package
 134
 135 =cut
 136
 137 sub SetUTF8Flag{
 138     my ($record, $nfd)=@_;
 139     return unless ($record && $record->fields());
 140     foreach my $field ($record->fields()){
 141         if ($field->tag()>=10){
 142             my @subfields;
 143             foreach my $subfield ($field->subfields()){
 144                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 145             }
 146             eval {
 147                 my $newfield=MARC::Field->new(
 148                             $field->tag(),
 149                             $field->indicator(1),
 150                             $field->indicator(2),
 151                             @subfields
 152                         );
 153                 $field->replace_with($newfield);
 154             };
 155             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 156         }
 157     }
 158 }
 159
 160 =head2 NormalizeString
 161
 162     my $normalized_string=NormalizeString($string,$nfd,$transform);
 163
 164 Given a string
 165 nfd : If you want to set NFD and not NFC
 166 transform : If you expect all the signs to be removed
 167
 168 Sets the PERL UTF8 Flag on your initial data if need be
 169 and applies cleaning if required
 170
 171 Returns a utf8 NFC normalized string
 172
 173 Sample code :
 174    my $string=NormalizeString ("l'ornithoptère");
 175    #results into ornithoptère in NFC form and sets UTF8 Flag
 176
 177 =cut
 178
 179
 180 sub NormalizeString{
 181         my ($string,$nfd,$transform)=@_;
 182         utf8::decode($string) unless (utf8::is_utf8($string));
 183         if ($nfd){
 184                 $string= NFD($string);
 185         }
 186         else {
 187                 $string=NFC($string);
 188         }
 189         if ($transform){
 190     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 191         #removing one letter words "d'" "l'"  was changed into "d " "l "
 192     $string=~s/\b\S\b//g;
 193     $string=~s/\s+$//g;
 194         }
 195     return $string;
 196 }
 197
 198 =head2 MarcToUTF8Record
 199
 200   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 201                                         $marc_flavour, [, $source_encoding]);
 202
 203 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 204 optional source encoding, return a C<MARC::Record> that is
 205 converted to UTF-8.
 206
 207 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 208 is not guaranteed to have been converted correctly.  Specifically,
 209 if C<$converted_from> is 'failed', the MARC record returned failed
 210 character conversion and had each of its non-ASCII octets changed
 211 to the Unicode replacement character.
 212
 213 If the source encoding was not specified, this routine will
 214 try to guess it; the character encoding used for a successful
 215 conversion is returned in C<$converted_from>.
 216
 217 =cut
 218
 219 sub MarcToUTF8Record {
 220     my $marc = shift;
 221     my $marc_flavour = shift;
 222     my $source_encoding = shift;
 223     my $marc_record;
 224     my $marc_blob_is_utf8 = 0;
 225     if (ref($marc) eq 'MARC::Record') {
 226         my $marc_blob = $marc->as_usmarc();
 227         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 228         $marc_record = $marc;
 229     } else {
 230         # dealing with a MARC blob
 231
 232         # remove any ersatz whitespace from the beginning and
 233         # end of the MARC blob -- these can creep into MARC
 234         # files produced by several sources -- caller really
 235         # should be doing this, however
 236         $marc =~ s/^\s+//;
 237         $marc =~ s/\s+$//;
 238         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 239         eval {
 240             $marc_record = MARC::Record->new_from_usmarc($marc);
 241         };
 242         if ($@) {
 243             # if we fail the first time, one likely problem
 244             # is that we have a MARC21 record that says that it's
 245             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 246             # We'll try parsing it again.
 247             substr($marc, 9, 1) = ' ';
 248             eval {
 249                 $marc_record = MARC::Record->new_from_usmarc($marc);
 250             };
 251             if ($@) {
 252                 # it's hopeless; return an empty MARC::Record
 253                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 254             }
 255         }
 256     }
 257
 258     # If we do not know the source encoding, try some guesses
 259     # as follows:
 260     #   1. Record is UTF-8 already.
 261     #   2. If MARC flavor is MARC21 or NORMARC, then
 262     #      a. record is MARC-8
 263     #      b. record is ISO-8859-1
 264     #   3. If MARC flavor is UNIMARC, then
 265     if (not defined $source_encoding) {
 266         if ($marc_blob_is_utf8) {
 267             # note that for MARC21/NORMARC we are not bothering to check
 268             # if the Leader/09 is set to 'a' or not -- because
 269             # of problems with various ILSs (including Koha in the
 270             # past, alas), this just is not trustworthy.
 271             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 272             return $marc_record, 'UTF-8', [];
 273         } else {
 274             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 275                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 276             } elsif ($marc_flavour =~/UNIMARC/) {
 277                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 278             } else {
 279                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 280             }
 281         }
 282     } else {
 283         # caller knows the character encoding
 284         my $original_marc_record = $marc_record->clone();
 285         my @errors;
 286         if ($source_encoding =~ /utf-?8/i) {
 287             if ($marc_blob_is_utf8) {
 288                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 289                 return $marc_record, 'UTF-8', [];
 290             } else {
 291                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 292             }
 293         } elsif ($source_encoding =~ /marc-?8/i) {
 294             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 295         } elsif ($source_encoding =~ /5426/) {
 296             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 297         } else {
 298             # assume any other character encoding is for Text::Iconv
 299             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 300         }
 301
 302         if (@errors) {
 303             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 304             return $original_marc_record, 'failed', \@errors;
 305         } else {
 306             return $marc_record, $source_encoding, [];
 307         }
 308     }
 309
 310 }
 311
 312 =head2 SetMarcUnicodeFlag
 313
 314   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 315
 316 Set both the internal MARC::Record encoding flag
 317 and the appropriate Leader/09 (MARC21) or
 318 100/26-29 (UNIMARC) to indicate that the record
 319 is in UTF-8.  Note that this does B<not> do
 320 any actual character conversion.
 321
 322 =cut
 323
 324 sub SetMarcUnicodeFlag {
 325     my $marc_record = shift;
 326     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 327
 328     $marc_record->encoding('UTF-8');
 329     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 330         my $leader = $marc_record->leader();
 331         substr($leader, 9, 1) = 'a';
 332         $marc_record->leader($leader);
 333     } elsif ($marc_flavour =~/UNIMARC/) {
 334         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 335         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 336         my $string;
 337                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 338                 $string=$marc_record->subfield( 100, "a" );
 339         if (defined $string && length($string)==$subflength) {
 340                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 341         }
 342         else {
 343             $string = POSIX::strftime( "%Y%m%d", localtime );
 344             $string =~ s/\-//g;
 345             $string = sprintf( "%-*s", $subflength, $string );
 346             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 347         }
 348         substr( $string, $encodingposition, 3, "y50" );
 349         if ( $marc_record->subfield( 100, "a" ) ) {
 350                         $marc_record->field('100')->update(a=>$string);
 351                 }
 352                 else {
 353             $marc_record->insert_grouped_field(
 354                 MARC::Field->new( 100, '', '', "a" => $string ) );
 355         }
 356                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 357     } else {
 358         warn "Unrecognized marcflavour: $marc_flavour";
 359     }
 360 }
 361
 362 =head2 StripNonXmlChars
 363
 364   my $new_str = StripNonXmlChars($old_str);
 365
 366 Given a string, return a copy with the
 367 characters that are illegal in XML
 368 removed.
 369
 370 This function exists to work around a problem
 371 that can occur with badly-encoded MARC records.
 372 Specifically, if a UTF-8 MARC record also
 373 has excape (\x1b) characters, MARC::File::XML
 374 will let the escape characters pass through
 375 when as_xml() or as_xml_record() is called.  The
 376 problem is that the escape character is not
 377 legal in well-formed XML documents, so when
 378 MARC::File::XML attempts to parse such a record,
 379 the XML parser will fail.
 380
 381 Stripping such characters will allow a
 382 MARC::Record->new_from_xml()
 383 to work, at the possible risk of some data loss.
 384
 385 =cut
 386
 387 sub StripNonXmlChars {
 388     my $str = shift;
 389     if (!defined($str) || $str eq ""){
 390         return "";
 391     }
 392     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 393     return $str;
 394 }
 395
 396
 397
 398 =head2 nsb_clean
 399
 400 =over 4
 401
 402 nsb_clean($string);
 403
 404 =back
 405
 406 Removes Non Sorting Block characters
 407
 408 =cut
 409 sub nsb_clean {
 410     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 411     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 412     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 413     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 414     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 415
 416     # handles non sorting blocks
 417     my ($string) = @_ ;
 418     $_ = $string ;
 419     s/($C2){0,1}($NSB|$NSB2)//g ;
 420     s/($C2){0,1}($NSE|$NSE2)//g ;
 421     $string = $_ ;
 422
 423     return($string) ;
 424 }
 425
 426
 427 =head2 SanitizeRecord
 428
 429 SanitizeRecord($marcrecord);
 430
 431 Sanitize a record
 432 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
 433 It cleans any string with '&amp;amp;...', replacing it by '&'
 434
 435 =cut
 436
 437 sub SanitizeRecord {
 438     my ( $record, $biblionumber ) = @_;
 439     my $string;
 440     my $record_modified = 0;
 441     my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
 442     my ( $url_field, $url_subfield ) =
 443       C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
 444     foreach my $field ( $record->fields() ) {
 445         if ( $field->is_control_field() ) {
 446             my $value           = $field->data();
 447             my $sanitized_value = _clean_ampersand($value);
 448             $record_modified = 1 if $sanitized_value ne $value;
 449             $field->update($sanitized_value);
 450         }
 451         else {
 452             my @subfields = $field->subfields();
 453             my @new_subfields;
 454             foreach my $subfield (@subfields) {
 455                 next
 456                   if $url_field eq $field->tag()
 457                       and $url_subfield eq $subfield->[0];
 458                 my $value           = $subfield->[1];
 459                 my $sanitized_value = _clean_ampersand($value);
 460                 push @new_subfields, $subfield->[0] => $sanitized_value;
 461                 $record_modified = 1 if $sanitized_value ne $value;
 462             }
 463             if ( scalar(@new_subfields) > 0 ) {
 464                 my $new_field = eval {
 465                     MARC::Field->new(
 466                         $field->tag(),        $field->indicator(1),
 467                         $field->indicator(2), @new_subfields
 468                     );
 469                 };
 470                 if ($@) {
 471                     warn "error : $@";
 472                 }
 473                 else {
 474                     $field->replace_with($new_field);
 475                 }
 476
 477             }
 478         }
 479     }
 480
 481     return $record, $record_modified;
 482 }
 483
 484 sub _clean_ampersand {
 485     my ($string) = @_;
 486     $string =~ s/(&)(amp;)+/$1/g;
 487     return $string;
 488 }
 489
 490 =head1 INTERNAL FUNCTIONS
 491
 492 =head2 _default_marc21_charconv_to_utf8
 493
 494   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 495
 496 Converts a C<MARC::Record> of unknown character set to UTF-8,
 497 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 498 to UTF-8, then a default conversion that replaces each non-ASCII
 499 character with the replacement character.
 500
 501 The C<$guessed_charset> return value contains the character set
 502 that resulted in a conversion to valid UTF-8; note that
 503 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 504 this is 'failed'.
 505
 506 =cut
 507
 508 sub _default_marc21_charconv_to_utf8 {
 509     my $marc_record = shift;
 510     my $marc_flavour = shift;
 511
 512     my $trial_marc8 = $marc_record->clone();
 513     my @all_errors = ();
 514     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 515     unless (@errors) {
 516         return $trial_marc8, 'MARC-8', [];
 517     }
 518     push @all_errors, @errors;
 519
 520     my $trial_8859_1 = $marc_record->clone();
 521     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 522     unless (@errors) {
 523         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 524                                                 # instead if we wanted to report details
 525                                                 # of the failed attempt at MARC-8 => UTF-8
 526     }
 527     push @all_errors, @errors;
 528
 529     my $default_converted = $marc_record->clone();
 530     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 531     return $default_converted, 'failed', \@all_errors;
 532 }
 533
 534 =head2 _default_unimarc_charconv_to_utf8
 535
 536   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 537
 538 Converts a C<MARC::Record> of unknown character set to UTF-8,
 539 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 540 to UTF-8, then a default conversion that replaces each non-ASCII
 541 character with the replacement character.
 542
 543 The C<$guessed_charset> return value contains the character set
 544 that resulted in a conversion to valid UTF-8; note that
 545 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 546 this is 'failed'.
 547
 548 =cut
 549
 550 sub _default_unimarc_charconv_to_utf8 {
 551     my $marc_record = shift;
 552     my $marc_flavour = shift;
 553
 554     my $trial_marc8 = $marc_record->clone();
 555     my @all_errors = ();
 556     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 557     unless (@errors) {
 558         return $trial_marc8, 'iso-5426';
 559     }
 560     push @all_errors, @errors;
 561
 562     my $trial_8859_1 = $marc_record->clone();
 563     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 564     unless (@errors) {
 565         return $trial_8859_1, 'iso-8859-1';
 566     }
 567     push @all_errors, @errors;
 568
 569     my $default_converted = $marc_record->clone();
 570     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 571     return $default_converted, 'failed', \@all_errors;
 572 }
 573
 574 =head2 _marc_marc8_to_utf8
 575
 576   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 577
 578 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 579 If the conversion fails for some reason, an
 580 appropriate messages will be placed in the returned
 581 C<@errors> array.
 582
 583 =cut
 584
 585 sub _marc_marc8_to_utf8 {
 586     my $marc_record = shift;
 587     my $marc_flavour = shift;
 588
 589     my $prev_ignore = MARC::Charset->ignore_errors();
 590     MARC::Charset->ignore_errors(1);
 591
 592     # trap warnings raised by MARC::Charset
 593     my @errors = ();
 594     local $SIG{__WARN__} = sub {
 595         my $msg = $_[0];
 596         if ($msg =~ /MARC.Charset/) {
 597             # FIXME - purpose of this regexp is to strip out the
 598             # line reference to MARC/Charset.pm, but as it
 599             # exists probably won't work quite on Windows --
 600             # some sort of minimal-bunch back-tracking RE
 601             # would be helpful here
 602             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 603             push @errors, $msg;
 604         } else {
 605             # if warning doesn't come from MARC::Charset, just
 606             # pass it on
 607             warn $msg;
 608         }
 609     };
 610
 611     foreach my $field ($marc_record->fields()) {
 612         if ($field->is_control_field()) {
 613             ; # do nothing -- control fields should not contain non-ASCII characters
 614         } else {
 615             my @converted_subfields;
 616             foreach my $subfield ($field->subfields()) {
 617                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 618                 unless (IsStringUTF8ish($utf8sf)) {
 619                     # Because of a bug in MARC::Charset 0.98, if the string
 620                     # has (a) one or more diacritics that (b) are only in character positions
 621                     # 128 to 255 inclusive, the resulting converted string is not in
 622                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 623                     # occurs, upgrade the string in place.  Moral of the story seems to be
 624                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 625                     # that the resulting string is UTF-8.
 626                     utf8::upgrade($utf8sf);
 627                 }
 628                 push @converted_subfields, $subfield->[0], $utf8sf;
 629             }
 630
 631             $field->replace_with(MARC::Field->new(
 632                 $field->tag(), $field->indicator(1), $field->indicator(2),
 633                 @converted_subfields)
 634             );
 635         }
 636     }
 637
 638     MARC::Charset->ignore_errors($prev_ignore);
 639
 640     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 641
 642     return @errors;
 643 }
 644
 645 =head2 _marc_iso5426_to_utf8
 646
 647   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 648
 649 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 650 If the conversion fails for some reason, an
 651 appropriate messages will be placed in the returned
 652 C<@errors> array.
 653
 654 FIXME - is ISO-5426 equivalent enough to MARC-8
 655 that C<MARC::Charset> can be used instead?
 656
 657 =cut
 658
 659 sub _marc_iso5426_to_utf8 {
 660     my $marc_record = shift;
 661     my $marc_flavour = shift;
 662
 663     my @errors = ();
 664
 665     foreach my $field ($marc_record->fields()) {
 666         if ($field->is_control_field()) {
 667             ; # do nothing -- control fields should not contain non-ASCII characters
 668         } else {
 669             my @converted_subfields;
 670             foreach my $subfield ($field->subfields()) {
 671                 my $utf8sf = char_decode5426($subfield->[1]);
 672                 push @converted_subfields, $subfield->[0], $utf8sf;
 673             }
 674
 675             $field->replace_with(MARC::Field->new(
 676                 $field->tag(), $field->indicator(1), $field->indicator(2),
 677                 @converted_subfields)
 678             );
 679         }
 680     }
 681
 682     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 683
 684     return @errors;
 685 }
 686
 687 =head2 _marc_to_utf8_via_text_iconv
 688
 689   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 690
 691 Convert a C<MARC::Record> to UTF-8 in-place using the
 692 C<Text::Iconv> CPAN module.  Any source encoding accepted
 693 by the user's iconv installation should work.  If
 694 the source encoding is not recognized on the user's
 695 server or the conversion fails for some reason,
 696 appropriate messages will be placed in the returned
 697 C<@errors> array.
 698
 699 =cut
 700
 701 sub _marc_to_utf8_via_text_iconv {
 702     my $marc_record = shift;
 703     my $marc_flavour = shift;
 704     my $source_encoding = shift;
 705
 706     my @errors = ();
 707     my $decoder;
 708     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 709     if ($@) {
 710         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 711         return @errors;
 712     }
 713
 714     my $prev_raise_error = Text::Iconv->raise_error();
 715     Text::Iconv->raise_error(1);
 716
 717     foreach my $field ($marc_record->fields()) {
 718         if ($field->is_control_field()) {
 719             ; # do nothing -- control fields should not contain non-ASCII characters
 720         } else {
 721             my @converted_subfields;
 722             foreach my $subfield ($field->subfields()) {
 723                 my $converted_value;
 724                 my $conversion_ok = 1;
 725                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 726                 if ($@) {
 727                     $conversion_ok = 0;
 728                     push @errors, $@;
 729                 } elsif (not defined $converted_value) {
 730                     $conversion_ok = 0;
 731                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 732                 }
 733
 734                 if ($conversion_ok) {
 735                     push @converted_subfields, $subfield->[0], $converted_value;
 736                 } else {
 737                     $converted_value = $subfield->[1];
 738                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 739                     push @converted_subfields, $subfield->[0], $converted_value;
 740                 }
 741             }
 742
 743             $field->replace_with(MARC::Field->new(
 744                 $field->tag(), $field->indicator(1), $field->indicator(2),
 745                 @converted_subfields)
 746             );
 747         }
 748     }
 749
 750     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 751     Text::Iconv->raise_error($prev_raise_error);
 752
 753     return @errors;
 754 }
 755
 756 =head2 _marc_to_utf8_replacement_char
 757
 758   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 759
 760 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 761 unsatisfactory method of replacing all non-ASCII (e.g.,
 762 where the eight bit is set) octet with the Unicode
 763 replacement character.  This is meant as a last-ditch
 764 method, and would be best used as part of a UI that
 765 lets a cataloguer pick various character conversions
 766 until he or she finds the right one.
 767
 768 =cut
 769
 770 sub _marc_to_utf8_replacement_char {
 771     my $marc_record = shift;
 772     my $marc_flavour = shift;
 773
 774     foreach my $field ($marc_record->fields()) {
 775         if ($field->is_control_field()) {
 776             ; # do nothing -- control fields should not contain non-ASCII characters
 777         } else {
 778             my @converted_subfields;
 779             foreach my $subfield ($field->subfields()) {
 780                 my $value = $subfield->[1];
 781                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 782                 push @converted_subfields, $subfield->[0], $value;
 783             }
 784
 785             $field->replace_with(MARC::Field->new(
 786                 $field->tag(), $field->indicator(1), $field->indicator(2),
 787                 @converted_subfields)
 788             );
 789         }
 790     }
 791
 792     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 793 }
 794
 795 =head2 char_decode5426
 796
 797   my $utf8string = char_decode5426($iso_5426_string);
 798
 799 Converts a string from ISO-5426 to UTF-8.
 800
 801 =cut
 802
 803
 804 my %chars;
 805 $chars{0xb0}=0x0101;#3/0ayn[ain]
 806 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 807 #$chars{0xb2}=0x00e0;#'à';
 808 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 809 #$chars{0xb3}=0x00e7;#'ç';
 810 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 811 # $chars{0xb4}='è';
 812 $chars{0xb4}=0x00e8;
 813 $chars{0xbd}=0x02b9;
 814 $chars{0xbe}=0x02ba;
 815 # $chars{0xb5}='é';
 816 $chars{0xb5}=0x00e9;
 817 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 818 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 819 $chars{0xfa}=0x0153; #oe
 820 $chars{0xea}=0x0152; #oe
 821 $chars{0x81d1}=0x00b0;
 822
 823 ####
 824 ## combined characters iso5426
 825
 826 $chars{0xc041}=0x1ea2; # capital a with hook above
 827 $chars{0xc045}=0x1eba; # capital e with hook above
 828 $chars{0xc049}=0x1ec8; # capital i with hook above
 829 $chars{0xc04f}=0x1ece; # capital o with hook above
 830 $chars{0xc055}=0x1ee6; # capital u with hook above
 831 $chars{0xc059}=0x1ef6; # capital y with hook above
 832 $chars{0xc061}=0x1ea3; # small a with hook above
 833 $chars{0xc065}=0x1ebb; # small e with hook above
 834 $chars{0xc069}=0x1ec9; # small i with hook above
 835 $chars{0xc06f}=0x1ecf; # small o with hook above
 836 $chars{0xc075}=0x1ee7; # small u with hook above
 837 $chars{0xc079}=0x1ef7; # small y with hook above
 838
 839         # 4/1 grave accent
 840 $chars{0xc141}=0x00c0; # capital a with grave accent
 841 $chars{0xc145}=0x00c8; # capital e with grave accent
 842 $chars{0xc149}=0x00cc; # capital i with grave accent
 843 $chars{0xc14f}=0x00d2; # capital o with grave accent
 844 $chars{0xc155}=0x00d9; # capital u with grave accent
 845 $chars{0xc157}=0x1e80; # capital w with grave
 846 $chars{0xc159}=0x1ef2; # capital y with grave
 847 $chars{0xc161}=0x00e0; # small a with grave accent
 848 $chars{0xc165}=0x00e8; # small e with grave accent
 849 $chars{0xc169}=0x00ec; # small i with grave accent
 850 $chars{0xc16f}=0x00f2; # small o with grave accent
 851 $chars{0xc175}=0x00f9; # small u with grave accent
 852 $chars{0xc177}=0x1e81; # small w with grave
 853 $chars{0xc179}=0x1ef3; # small y with grave
 854         # 4/2 acute accent
 855 $chars{0xc241}=0x00c1; # capital a with acute accent
 856 $chars{0xc243}=0x0106; # capital c with acute accent
 857 $chars{0xc245}=0x00c9; # capital e with acute accent
 858 $chars{0xc247}=0x01f4; # capital g with acute
 859 $chars{0xc249}=0x00cd; # capital i with acute accent
 860 $chars{0xc24b}=0x1e30; # capital k with acute
 861 $chars{0xc24c}=0x0139; # capital l with acute accent
 862 $chars{0xc24d}=0x1e3e; # capital m with acute
 863 $chars{0xc24e}=0x0143; # capital n with acute accent
 864 $chars{0xc24f}=0x00d3; # capital o with acute accent
 865 $chars{0xc250}=0x1e54; # capital p with acute
 866 $chars{0xc252}=0x0154; # capital r with acute accent
 867 $chars{0xc253}=0x015a; # capital s with acute accent
 868 $chars{0xc255}=0x00da; # capital u with acute accent
 869 $chars{0xc257}=0x1e82; # capital w with acute
 870 $chars{0xc259}=0x00dd; # capital y with acute accent
 871 $chars{0xc25a}=0x0179; # capital z with acute accent
 872 $chars{0xc261}=0x00e1; # small a with acute accent
 873 $chars{0xc263}=0x0107; # small c with acute accent
 874 $chars{0xc265}=0x00e9; # small e with acute accent
 875 $chars{0xc267}=0x01f5; # small g with acute
 876 $chars{0xc269}=0x00ed; # small i with acute accent
 877 $chars{0xc26b}=0x1e31; # small k with acute
 878 $chars{0xc26c}=0x013a; # small l with acute accent
 879 $chars{0xc26d}=0x1e3f; # small m with acute
 880 $chars{0xc26e}=0x0144; # small n with acute accent
 881 $chars{0xc26f}=0x00f3; # small o with acute accent
 882 $chars{0xc270}=0x1e55; # small p with acute
 883 $chars{0xc272}=0x0155; # small r with acute accent
 884 $chars{0xc273}=0x015b; # small s with acute accent
 885 $chars{0xc275}=0x00fa; # small u with acute accent
 886 $chars{0xc277}=0x1e83; # small w with acute
 887 $chars{0xc279}=0x00fd; # small y with acute accent
 888 $chars{0xc27a}=0x017a; # small z with acute accent
 889 $chars{0xc2e1}=0x01fc; # capital ae with acute
 890 $chars{0xc2f1}=0x01fd; # small ae with acute
 891        # 4/3 circumflex accent
 892 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 893 $chars{0xc343}=0x0108; # capital c with circumflex
 894 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 895 $chars{0xc347}=0x011c; # capital g with circumflex
 896 $chars{0xc348}=0x0124; # capital h with circumflex
 897 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 898 $chars{0xc34a}=0x0134; # capital j with circumflex
 899 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 900 $chars{0xc353}=0x015c; # capital s with circumflex
 901 $chars{0xc355}=0x00db; # capital u with circumflex
 902 $chars{0xc357}=0x0174; # capital w with circumflex
 903 $chars{0xc359}=0x0176; # capital y with circumflex
 904 $chars{0xc35a}=0x1e90; # capital z with circumflex
 905 $chars{0xc361}=0x00e2; # small a with circumflex accent
 906 $chars{0xc363}=0x0109; # small c with circumflex
 907 $chars{0xc365}=0x00ea; # small e with circumflex accent
 908 $chars{0xc367}=0x011d; # small g with circumflex
 909 $chars{0xc368}=0x0125; # small h with circumflex
 910 $chars{0xc369}=0x00ee; # small i with circumflex accent
 911 $chars{0xc36a}=0x0135; # small j with circumflex
 912 $chars{0xc36e}=0x00f1; # small n with tilde
 913 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 914 $chars{0xc373}=0x015d; # small s with circumflex
 915 $chars{0xc375}=0x00fb; # small u with circumflex
 916 $chars{0xc377}=0x0175; # small w with circumflex
 917 $chars{0xc379}=0x0177; # small y with circumflex
 918 $chars{0xc37a}=0x1e91; # small z with circumflex
 919         # 4/4 tilde
 920 $chars{0xc441}=0x00c3; # capital a with tilde
 921 $chars{0xc445}=0x1ebc; # capital e with tilde
 922 $chars{0xc449}=0x0128; # capital i with tilde
 923 $chars{0xc44e}=0x00d1; # capital n with tilde
 924 $chars{0xc44f}=0x00d5; # capital o with tilde
 925 $chars{0xc455}=0x0168; # capital u with tilde
 926 $chars{0xc456}=0x1e7c; # capital v with tilde
 927 $chars{0xc459}=0x1ef8; # capital y with tilde
 928 $chars{0xc461}=0x00e3; # small a with tilde
 929 $chars{0xc465}=0x1ebd; # small e with tilde
 930 $chars{0xc469}=0x0129; # small i with tilde
 931 $chars{0xc46e}=0x00f1; # small n with tilde
 932 $chars{0xc46f}=0x00f5; # small o with tilde
 933 $chars{0xc475}=0x0169; # small u with tilde
 934 $chars{0xc476}=0x1e7d; # small v with tilde
 935 $chars{0xc479}=0x1ef9; # small y with tilde
 936     # 4/5 macron
 937 $chars{0xc541}=0x0100; # capital a with macron
 938 $chars{0xc545}=0x0112; # capital e with macron
 939 $chars{0xc547}=0x1e20; # capital g with macron
 940 $chars{0xc549}=0x012a; # capital i with macron
 941 $chars{0xc54f}=0x014c; # capital o with macron
 942 $chars{0xc555}=0x016a; # capital u with macron
 943 $chars{0xc561}=0x0101; # small a with macron
 944 $chars{0xc565}=0x0113; # small e with macron
 945 $chars{0xc567}=0x1e21; # small g with macron
 946 $chars{0xc569}=0x012b; # small i with macron
 947 $chars{0xc56f}=0x014d; # small o with macron
 948 $chars{0xc575}=0x016b; # small u with macron
 949 $chars{0xc572}=0x0159; # small r with macron
 950 $chars{0xc5e1}=0x01e2; # capital ae with macron
 951 $chars{0xc5f1}=0x01e3; # small ae with macron
 952         # 4/6 breve
 953 $chars{0xc641}=0x0102; # capital a with breve
 954 $chars{0xc645}=0x0114; # capital e with breve
 955 $chars{0xc647}=0x011e; # capital g with breve
 956 $chars{0xc649}=0x012c; # capital i with breve
 957 $chars{0xc64f}=0x014e; # capital o with breve
 958 $chars{0xc655}=0x016c; # capital u with breve
 959 $chars{0xc661}=0x0103; # small a with breve
 960 $chars{0xc665}=0x0115; # small e with breve
 961 $chars{0xc667}=0x011f; # small g with breve
 962 $chars{0xc669}=0x012d; # small i with breve
 963 $chars{0xc66f}=0x014f; # small o with breve
 964 $chars{0xc675}=0x016d; # small u with breve
 965         # 4/7 dot above
 966 $chars{0xc7b0}=0x01e1; # Ain with dot above
 967 $chars{0xc742}=0x1e02; # capital b with dot above
 968 $chars{0xc743}=0x010a; # capital c with dot above
 969 $chars{0xc744}=0x1e0a; # capital d with dot above
 970 $chars{0xc745}=0x0116; # capital e with dot above
 971 $chars{0xc746}=0x1e1e; # capital f with dot above
 972 $chars{0xc747}=0x0120; # capital g with dot above
 973 $chars{0xc748}=0x1e22; # capital h with dot above
 974 $chars{0xc749}=0x0130; # capital i with dot above
 975 $chars{0xc74d}=0x1e40; # capital m with dot above
 976 $chars{0xc74e}=0x1e44; # capital n with dot above
 977 $chars{0xc750}=0x1e56; # capital p with dot above
 978 $chars{0xc752}=0x1e58; # capital r with dot above
 979 $chars{0xc753}=0x1e60; # capital s with dot above
 980 $chars{0xc754}=0x1e6a; # capital t with dot above
 981 $chars{0xc757}=0x1e86; # capital w with dot above
 982 $chars{0xc758}=0x1e8a; # capital x with dot above
 983 $chars{0xc759}=0x1e8e; # capital y with dot above
 984 $chars{0xc75a}=0x017b; # capital z with dot above
 985 $chars{0xc761}=0x0227; # small b with dot above
 986 $chars{0xc762}=0x1e03; # small b with dot above
 987 $chars{0xc763}=0x010b; # small c with dot above
 988 $chars{0xc764}=0x1e0b; # small d with dot above
 989 $chars{0xc765}=0x0117; # small e with dot above
 990 $chars{0xc766}=0x1e1f; # small f with dot above
 991 $chars{0xc767}=0x0121; # small g with dot above
 992 $chars{0xc768}=0x1e23; # small h with dot above
 993 $chars{0xc76d}=0x1e41; # small m with dot above
 994 $chars{0xc76e}=0x1e45; # small n with dot above
 995 $chars{0xc770}=0x1e57; # small p with dot above
 996 $chars{0xc772}=0x1e59; # small r with dot above
 997 $chars{0xc773}=0x1e61; # small s with dot above
 998 $chars{0xc774}=0x1e6b; # small t with dot above
 999 $chars{0xc777}=0x1e87; # small w with dot above
1000 $chars{0xc778}=0x1e8b; # small x with dot above
1001 $chars{0xc779}=0x1e8f; # small y with dot above
1002 $chars{0xc77a}=0x017c; # small z with dot above
1003         # 4/8 trema, diaresis
1004 $chars{0xc820}=0x00a8; # diaeresis
1005 $chars{0xc841}=0x00c4; # capital a with diaeresis
1006 $chars{0xc845}=0x00cb; # capital e with diaeresis
1007 $chars{0xc848}=0x1e26; # capital h with diaeresis
1008 $chars{0xc849}=0x00cf; # capital i with diaeresis
1009 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1010 $chars{0xc855}=0x00dc; # capital u with diaeresis
1011 $chars{0xc857}=0x1e84; # capital w with diaeresis
1012 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1013 $chars{0xc859}=0x0178; # capital y with diaeresis
1014 $chars{0xc861}=0x00e4; # small a with diaeresis
1015 $chars{0xc865}=0x00eb; # small e with diaeresis
1016 $chars{0xc868}=0x1e27; # small h with diaeresis
1017 $chars{0xc869}=0x00ef; # small i with diaeresis
1018 $chars{0xc86f}=0x00f6; # small o with diaeresis
1019 $chars{0xc874}=0x1e97; # small t with diaeresis
1020 $chars{0xc875}=0x00fc; # small u with diaeresis
1021 $chars{0xc877}=0x1e85; # small w with diaeresis
1022 $chars{0xc878}=0x1e8d; # small x with diaeresis
1023 $chars{0xc879}=0x00ff; # small y with diaeresis
1024         # 4/9 umlaut
1025 $chars{0xc920}=0x00a8; # [diaeresis]
1026 $chars{0xc961}=0x00e4; # a with umlaut
1027 $chars{0xc965}=0x00eb; # e with umlaut
1028 $chars{0xc969}=0x00ef; # i with umlaut
1029 $chars{0xc96f}=0x00f6; # o with umlaut
1030 $chars{0xc975}=0x00fc; # u with umlaut
1031         # 4/10 circle above
1032 $chars{0xca41}=0x00c5; # capital a with ring above
1033 $chars{0xcaad}=0x016e; # capital u with ring above
1034 $chars{0xca61}=0x00e5; # small a with ring above
1035 $chars{0xca75}=0x016f; # small u with ring above
1036 $chars{0xca77}=0x1e98; # small w with ring above
1037 $chars{0xca79}=0x1e99; # small y with ring above
1038         # 4/11 high comma off centre
1039         # 4/12 inverted high comma centred
1040         # 4/13 double acute accent
1041 $chars{0xcd4f}=0x0150; # capital o with double acute
1042 $chars{0xcd55}=0x0170; # capital u with double acute
1043 $chars{0xcd6f}=0x0151; # small o with double acute
1044 $chars{0xcd75}=0x0171; # small u with double acute
1045         # 4/14 horn
1046 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1047 $chars{0xce55}=0x01af; # latin capital letter u with horn
1048 $chars{0xce74}=0x01a1; # latin small letter o with horn
1049 $chars{0xce75}=0x01b0; # latin small letter u with horn
1050         # 4/15 caron (hacek
1051 $chars{0xcf41}=0x01cd; # capital a with caron
1052 $chars{0xcf43}=0x010c; # capital c with caron
1053 $chars{0xcf44}=0x010e; # capital d with caron
1054 $chars{0xcf45}=0x011a; # capital e with caron
1055 $chars{0xcf47}=0x01e6; # capital g with caron
1056 $chars{0xcf49}=0x01cf; # capital i with caron
1057 $chars{0xcf4b}=0x01e8; # capital k with caron
1058 $chars{0xcf4c}=0x013d; # capital l with caron
1059 $chars{0xcf4e}=0x0147; # capital n with caron
1060 $chars{0xcf4f}=0x01d1; # capital o with caron
1061 $chars{0xcf52}=0x0158; # capital r with caron
1062 $chars{0xcf53}=0x0160; # capital s with caron
1063 $chars{0xcf54}=0x0164; # capital t with caron
1064 $chars{0xcf55}=0x01d3; # capital u with caron
1065 $chars{0xcf5a}=0x017d; # capital z with caron
1066 $chars{0xcf61}=0x01ce; # small a with caron
1067 $chars{0xcf63}=0x010d; # small c with caron
1068 $chars{0xcf64}=0x010f; # small d with caron
1069 $chars{0xcf65}=0x011b; # small e with caron
1070 $chars{0xcf67}=0x01e7; # small g with caron
1071 $chars{0xcf69}=0x01d0; # small i with caron
1072 $chars{0xcf6a}=0x01f0; # small j with caron
1073 $chars{0xcf6b}=0x01e9; # small k with caron
1074 $chars{0xcf6c}=0x013e; # small l with caron
1075 $chars{0xcf6e}=0x0148; # small n with caron
1076 $chars{0xcf6f}=0x01d2; # small o with caron
1077 $chars{0xcf72}=0x0159; # small r with caron
1078 $chars{0xcf73}=0x0161; # small s with caron
1079 $chars{0xcf74}=0x0165; # small t with caron
1080 $chars{0xcf75}=0x01d4; # small u with caron
1081 $chars{0xcf7a}=0x017e; # small z with caron
1082         # 5/0 cedilla
1083 $chars{0xd020}=0x00b8; # cedilla
1084 $chars{0xd043}=0x00c7; # capital c with cedilla
1085 $chars{0xd044}=0x1e10; # capital d with cedilla
1086 $chars{0xd047}=0x0122; # capital g with cedilla
1087 $chars{0xd048}=0x1e28; # capital h with cedilla
1088 $chars{0xd04b}=0x0136; # capital k with cedilla
1089 $chars{0xd04c}=0x013b; # capital l with cedilla
1090 $chars{0xd04e}=0x0145; # capital n with cedilla
1091 $chars{0xd052}=0x0156; # capital r with cedilla
1092 $chars{0xd053}=0x015e; # capital s with cedilla
1093 $chars{0xd054}=0x0162; # capital t with cedilla
1094 $chars{0xd063}=0x00e7; # small c with cedilla
1095 $chars{0xd064}=0x1e11; # small d with cedilla
1096 $chars{0xd065}=0x0119; # small e with cedilla
1097 $chars{0xd067}=0x0123; # small g with cedilla
1098 $chars{0xd068}=0x1e29; # small h with cedilla
1099 $chars{0xd06b}=0x0137; # small k with cedilla
1100 $chars{0xd06c}=0x013c; # small l with cedilla
1101 $chars{0xd06e}=0x0146; # small n with cedilla
1102 $chars{0xd072}=0x0157; # small r with cedilla
1103 $chars{0xd073}=0x015f; # small s with cedilla
1104 $chars{0xd074}=0x0163; # small t with cedilla
1105         # 5/1 rude
1106         # 5/2 hook to left
1107         # 5/3 ogonek (hook to right
1108 $chars{0xd320}=0x02db; # ogonek
1109 $chars{0xd341}=0x0104; # capital a with ogonek
1110 $chars{0xd345}=0x0118; # capital e with ogonek
1111 $chars{0xd349}=0x012e; # capital i with ogonek
1112 $chars{0xd34f}=0x01ea; # capital o with ogonek
1113 $chars{0xd355}=0x0172; # capital u with ogonek
1114 $chars{0xd361}=0x0105; # small a with ogonek
1115 $chars{0xd365}=0x0119; # small e with ogonek
1116 $chars{0xd369}=0x012f; # small i with ogonek
1117 $chars{0xd36f}=0x01eb; # small o with ogonek
1118 $chars{0xd375}=0x0173; # small u with ogonek
1119         # 5/4 circle below
1120 $chars{0xd441}=0x1e00; # capital a with ring below
1121 $chars{0xd461}=0x1e01; # small a with ring below
1122         # 5/5 half circle below
1123 $chars{0xf948}=0x1e2a; # capital h with breve below
1124 $chars{0xf968}=0x1e2b; # small h with breve below
1125         # 5/6 dot below
1126 $chars{0xd641}=0x1ea0; # capital a with dot below
1127 $chars{0xd642}=0x1e04; # capital b with dot below
1128 $chars{0xd644}=0x1e0c; # capital d with dot below
1129 $chars{0xd645}=0x1eb8; # capital e with dot below
1130 $chars{0xd648}=0x1e24; # capital h with dot below
1131 $chars{0xd649}=0x1eca; # capital i with dot below
1132 $chars{0xd64b}=0x1e32; # capital k with dot below
1133 $chars{0xd64c}=0x1e36; # capital l with dot below
1134 $chars{0xd64d}=0x1e42; # capital m with dot below
1135 $chars{0xd64e}=0x1e46; # capital n with dot below
1136 $chars{0xd64f}=0x1ecc; # capital o with dot below
1137 $chars{0xd652}=0x1e5a; # capital r with dot below
1138 $chars{0xd653}=0x1e62; # capital s with dot below
1139 $chars{0xd654}=0x1e6c; # capital t with dot below
1140 $chars{0xd655}=0x1ee4; # capital u with dot below
1141 $chars{0xd656}=0x1e7e; # capital v with dot below
1142 $chars{0xd657}=0x1e88; # capital w with dot below
1143 $chars{0xd659}=0x1ef4; # capital y with dot below
1144 $chars{0xd65a}=0x1e92; # capital z with dot below
1145 $chars{0xd661}=0x1ea1; # small a with dot below
1146 $chars{0xd662}=0x1e05; # small b with dot below
1147 $chars{0xd664}=0x1e0d; # small d with dot below
1148 $chars{0xd665}=0x1eb9; # small e with dot below
1149 $chars{0xd668}=0x1e25; # small h with dot below
1150 $chars{0xd669}=0x1ecb; # small i with dot below
1151 $chars{0xd66b}=0x1e33; # small k with dot below
1152 $chars{0xd66c}=0x1e37; # small l with dot below
1153 $chars{0xd66d}=0x1e43; # small m with dot below
1154 $chars{0xd66e}=0x1e47; # small n with dot below
1155 $chars{0xd66f}=0x1ecd; # small o with dot below
1156 $chars{0xd672}=0x1e5b; # small r with dot below
1157 $chars{0xd673}=0x1e63; # small s with dot below
1158 $chars{0xd674}=0x1e6d; # small t with dot below
1159 $chars{0xd675}=0x1ee5; # small u with dot below
1160 $chars{0xd676}=0x1e7f; # small v with dot below
1161 $chars{0xd677}=0x1e89; # small w with dot below
1162 $chars{0xd679}=0x1ef5; # small y with dot below
1163 $chars{0xd67a}=0x1e93; # small z with dot below
1164         # 5/7 double dot below
1165 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1166 $chars{0xd775}=0x1e73; # small u with diaeresis below
1167         # 5/8 underline
1168 $chars{0xd820}=0x005f; # underline
1169         # 5/9 double underline
1170 $chars{0xd920}=0x2017; # double underline
1171         # 5/10 small low vertical bar
1172 $chars{0xda20}=0x02cc; #
1173         # 5/11 circumflex below
1174         # 5/12 (this position shall not be used)
1175         # 5/13 left half of ligature sign and of double tilde
1176         # 5/14 right half of ligature sign
1177         # 5/15 right half of double tilde
1178 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1179
1180 sub char_decode5426 {
1181     my ( $string) = @_;
1182     my $result;
1183
1184     my @data = unpack("C*", $string);
1185     my @characters;
1186     my $length=scalar(@data);
1187     for (my $i = 0; $i < scalar(@data); $i++) {
1188       my $char= $data[$i];
1189       if ($char >= 0x00 && $char <= 0x7F){
1190         #IsAscii
1191
1192           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1193       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1194         #Combined Char
1195         my $convchar ;
1196         if ($chars{$char*256+$data[$i+1]}) {
1197           $convchar= $chars{$char * 256 + $data[$i+1]};
1198           $i++;
1199 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1200         } elsif ($chars{$char})  {
1201           $convchar= $chars{$char};
1202 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1203         }else {
1204           $convchar=$char;
1205         }
1206         push @characters,$convchar;
1207       } else {
1208         my $convchar;
1209         if ($chars{$char})  {
1210           $convchar= $chars{$char};
1211 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1212         }else {
1213 #            printf "char %x $char\n",$char;
1214           $convchar=$char;
1215         }
1216         push @characters,$convchar;
1217       }
1218     }
1219     $result=pack "U*",@characters;
1220 #     $result=~s/\x01//;
1221 #     $result=~s/\x00//;
1222      $result=~s/\x0f//;
1223      $result=~s/\x1b.//;
1224      $result=~s/\x0e//;
1225      $result=~s/\x1b\x5b//;
1226 #   map{printf "%x",$_} @characters;
1227 #   printf "\n";
1228   return $result;
1229 }
1230
1231 1;
1232
1233
1234 =head1 AUTHOR
1235
1236 Koha Development Team <http://koha-community.org/>
1237
1238 Galen Charlton <galen.charlton@liblime.com>
1239
1240 =cut