3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along with
17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
18 # Suite 330, Boston, MA 02111-1307 USA
23 use MARC::Charset qw/marc8_to_utf8/;
26 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
29 # set the version for version checking
43 C4::Charset - utilities for handling character set conversions.
51 This module contains routines for dealing with character set
52 conversions, particularly for MARC records.
54 A variety of character encodings are in use by various MARC
55 standards, and even more character encodings are used by
56 non-standard MARC records. The various MARC formats generally
57 do not do a good job of advertising a given record's character
58 encoding, and even when a record does advertise its encoding,
59 e.g., via the Leader/09, experience has shown that one cannot
62 Ultimately, all MARC records are stored in Koha in UTF-8 and
63 must be converted from whatever the source character encoding is.
64 The goal of this module is to ensure that these conversions
65 take place accurately. When a character conversion cannot take
66 place, or at least not accurately, the module was provide
67 enough information to allow user-facing code to inform the user
68 on how to deal with the situation.
74 =head2 IsStringUTF8ish
78 my $is_utf8 = IsStringUTF8ish($str);
82 Determines if C<$str> is valid UTF-8. This can mean
89 The Perl UTF-8 flag is set and the string contains valid UTF-8.
93 The Perl UTF-8 flag is B<not> set, but the octets contain
98 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
99 because in one could be presented with a MARC blob that is
100 not actually in UTF-8 but whose sequence of octets appears to be
101 valid UTF-8. The rest of the MARC character conversion functions
102 will assume that this situation occur does not very often.
106 sub IsStringUTF8ish {
109 return 1 if utf8::is_utf8($str);
110 return utf8::decode($str);
113 =head2 MarcToUTF8Record
117 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
121 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
122 optional source encoding, return a C<MARC::Record> that is
125 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
126 is not guaranteed to have been converted correctly. Specifically,
127 if C<$converted_from> is 'failed', the MARC record returned failed
128 character conversion and had each of its non-ASCII octets changed
129 to the Unicode replacement character.
131 If the source encoding was not specified, this routine will
132 try to guess it; the character encoding used for a successful
133 conversion is returned in C<$converted_from>.
137 sub MarcToUTF8Record {
139 my $marc_flavour = shift;
140 my $source_encoding = shift;
143 my $marc_blob_is_utf8 = 0;
144 if (ref($marc) eq 'MARC::Record') {
145 my $marc_blob = $marc->as_usmarc();
146 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
147 $marc_record = $marc;
149 # dealing with a MARC blob
151 # remove any ersatz whitespace from the beginning and
152 # end of the MARC blob -- these can creep into MARC
153 # files produced by several sources -- caller really
154 # should be doing this, however
157 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
159 $marc_record = MARC::Record->new_from_usmarc($marc);
162 # if we fail the first time, one likely problem
163 # is that we have a MARC21 record that says that it's
164 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
165 # We'll try parsing it again.
166 substr($marc, 9, 1) = ' ';
168 $marc_record = MARC::Record->new_from_usmarc($marc);
171 # it's hopeless; return an empty MARC::Record
172 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
177 # If we do not know the source encoding, try some guesses
179 # 1. Record is UTF-8 already.
180 # 2. If MARC flavor is MARC21, then
181 # a. record is MARC-8
182 # b. record is ISO-8859-1
183 # 3. If MARC flavor is UNIMARC, then
184 if (not defined $source_encoding) {
185 if ($marc_blob_is_utf8) {
186 # note that for MARC21 we are not bothering to check
187 # if the Leader/09 is set to 'a' or not -- because
188 # of problems with various ILSs (including Koha in the
189 # past, alas), this just is not trustworthy.
190 SetMarcUnicodeFlag($marc_record, $marc_flavour);
191 return $marc_record, 'UTF-8', [];
193 if ($marc_flavour eq 'MARC21') {
194 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
195 } elsif ($marc_flavour eq 'UNIMARC') {
196 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
198 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
202 # caller knows the character encoding
203 my $original_marc_record = $marc_record->clone();
205 if ($source_encoding =~ /utf-?8/i) {
206 if ($marc_blob_is_utf8) {
207 SetMarcUnicodeFlag($marc_record, $marc_flavour);
208 return $marc_record, 'UTF-8', [];
210 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
212 } elsif ($source_encoding =~ /marc-?8/i) {
213 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
214 } elsif ($source_encoding =~ /5426/) {
215 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
217 # assume any other character encoding is for Text::Iconv
218 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
222 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
223 return $original_marc_record, 'failed', \@errors;
225 return $marc_record, $source_encoding, [];
231 =head2 SetMarcUnicodeFlag
235 SetMarcUnicodeFlag($marc_record, $marc_flavour);
239 Set both the internal MARC::Record encoding flag
240 and the appropriate Leader/09 (MARC21) or
241 100/26-29 (UNIMARC) to indicate that the record
242 is in UTF-8. Note that this does B<not> do
243 any actual character conversion.
247 sub SetMarcUnicodeFlag {
248 my $marc_record = shift;
249 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
251 $marc_record->encoding('UTF-8');
252 if ($marc_flavour eq 'MARC21') {
253 my $leader = $marc_record->leader();
254 substr($leader, 9, 1) = 'a';
255 $marc_record->leader($leader);
256 } elsif ($marc_flavour eq "UNIMARC") {
257 if (my $field = $marc_record->field('100')) {
258 my $sfa = $field->subfield('a');
261 # fix the length of the field
262 $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
263 $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
265 substr($sfa, 26, 4) = '50 ';
266 $field->update('a' => $sfa);
269 warn "Unrecognized marcflavour: $marc_flavour";
273 =head2 StripNonXmlChars
277 my $new_str = StripNonXmlChars($old_str);
281 Given a string, return a copy with the
282 characters that are illegal in XML
285 This function exists to work around a problem
286 that can occur with badly-encoded MARC records.
287 Specifically, if a UTF-8 MARC record also
288 has excape (\x1b) characters, MARC::File::XML
289 will let the escape characters pass through
290 when as_xml() or as_xml_record() is called. The
291 problem is that the escape character is not
292 legal in well-formed XML documents, so when
293 MARC::File::XML attempts to parse such a record,
294 the XML parser will fail.
296 Stripping such characters will allow a
297 MARC::Record->new_from_xml()
298 to work, at the possible risk of some data loss.
302 sub StripNonXmlChars {
304 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
308 =head1 INTERNAL FUNCTIONS
310 =head2 _default_marc21_charconv_to_utf8
314 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
318 Converts a C<MARC::Record> of unknown character set to UTF-8,
319 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
320 to UTF-8, then a default conversion that replaces each non-ASCII
321 character with the replacement character.
323 The C<$guessed_charset> return value contains the character set
324 that resulted in a conversion to valid UTF-8; note that
325 if the MARC-8 and ISO-8859-1 conversions failed, the value of
330 sub _default_marc21_charconv_to_utf8 {
331 my $marc_record = shift;
332 my $marc_flavour = shift;
334 my $trial_marc8 = $marc_record->clone();
336 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
338 return $trial_marc8, 'MARC-8', [];
340 push @all_errors, @errors;
342 my $trial_8859_1 = $marc_record->clone();
343 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
345 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
346 # instead if we wanted to report details
347 # of the failed attempt at MARC-8 => UTF-8
349 push @all_errors, @errors;
351 my $default_converted = $marc_record->clone();
352 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
353 return $default_converted, 'failed', \@all_errors;
356 =head2 _default_unimarc_charconv_to_utf8
360 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
364 Converts a C<MARC::Record> of unknown character set to UTF-8,
365 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
366 to UTF-8, then a default conversion that replaces each non-ASCII
367 character with the replacement character.
369 The C<$guessed_charset> return value contains the character set
370 that resulted in a conversion to valid UTF-8; note that
371 if the MARC-8 and ISO-8859-1 conversions failed, the value of
376 sub _default_unimarc_charconv_to_utf8 {
377 my $marc_record = shift;
378 my $marc_flavour = shift;
380 my $trial_marc8 = $marc_record->clone();
382 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
384 return $trial_marc8, 'iso-5426';
386 push @all_errors, @errors;
388 my $trial_8859_1 = $marc_record->clone();
389 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
391 return $trial_8859_1, 'iso-8859-1';
393 push @all_errors, @errors;
395 my $default_converted = $marc_record->clone();
396 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
397 return $default_converted, 'failed', \@all_errors;
400 =head2 _marc_marc8_to_utf8
404 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
408 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
409 If the conversion fails for some reason, an
410 appropriate messages will be placed in the returned
415 sub _marc_marc8_to_utf8 {
416 my $marc_record = shift;
417 my $marc_flavour = shift;
419 my $prev_ignore = MARC::Charset->ignore_errors();
420 MARC::Charset->ignore_errors(1);
422 # trap warnings raised by MARC::Charset
424 local $SIG{__WARN__} = sub {
426 if ($msg =~ /MARC.Charset/) {
427 # FIXME - purpose of this regexp is to strip out the
428 # line reference to MARC/Charset.pm, but as it
429 # exists probably won't work quite on Windows --
430 # some sort of minimal-bunch back-tracking RE
431 # would be helpful here
432 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
435 # if warning doesn't come from MARC::Charset, just
441 foreach my $field ($marc_record->fields()) {
442 if ($field->is_control_field()) {
443 ; # do nothing -- control fields should not contain non-ASCII characters
445 my @converted_subfields;
446 foreach my $subfield ($field->subfields()) {
447 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
448 unless (IsStringUTF8ish($utf8sf)) {
449 # Because of a bug in MARC::Charset 0.98, if the string
450 # has (a) one or more diacritics that (b) are only in character positions
451 # 128 to 255 inclusive, the resulting converted string is not in
452 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
453 # occurs, upgrade the string in place. Moral of the story seems to be
454 # that pack("U", ...) is better than chr(...) if you need to guarantee
455 # that the resulting string is UTF-8.
456 utf8::upgrade($utf8sf);
458 push @converted_subfields, $subfield->[0], $utf8sf;
461 $field->replace_with(MARC::Field->new(
462 $field->tag(), $field->indicator(1), $field->indicator(2),
463 @converted_subfields)
468 MARC::Charset->ignore_errors($prev_ignore);
470 SetMarcUnicodeFlag($marc_record, $marc_flavour);
475 =head2 _marc_iso5426_to_utf8
479 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
483 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
484 If the conversion fails for some reason, an
485 appropriate messages will be placed in the returned
488 FIXME - is ISO-5426 equivalent enough to MARC-8
489 that C<MARC::Charset> can be used instead?
493 sub _marc_iso5426_to_utf8 {
494 my $marc_record = shift;
495 my $marc_flavour = shift;
499 foreach my $field ($marc_record->fields()) {
500 if ($field->is_control_field()) {
501 ; # do nothing -- control fields should not contain non-ASCII characters
503 my @converted_subfields;
504 foreach my $subfield ($field->subfields()) {
505 my $utf8sf = char_decode5426($subfield->[1]);
506 push @converted_subfields, $subfield->[0], $utf8sf;
509 $field->replace_with(MARC::Field->new(
510 $field->tag(), $field->indicator(1), $field->indicator(2),
511 @converted_subfields)
516 SetMarcUnicodeFlag($marc_record, $marc_flavour);
521 =head2 _marc_to_utf8_via_text_iconv
525 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
529 Convert a C<MARC::Record> to UTF-8 in-place using the
530 C<Text::Iconv> CPAN module. Any source encoding accepted
531 by the user's iconv installation should work. If
532 the source encoding is not recognized on the user's
533 server or the conversion fails for some reason,
534 appropriate messages will be placed in the returned
539 sub _marc_to_utf8_via_text_iconv {
540 my $marc_record = shift;
541 my $marc_flavour = shift;
542 my $source_encoding = shift;
546 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
548 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
552 my $prev_raise_error = Text::Iconv->raise_error();
553 Text::Iconv->raise_error(1);
555 foreach my $field ($marc_record->fields()) {
556 if ($field->is_control_field()) {
557 ; # do nothing -- control fields should not contain non-ASCII characters
559 my @converted_subfields;
560 foreach my $subfield ($field->subfields()) {
562 my $conversion_ok = 1;
563 eval { $converted_value = $decoder->convert($subfield->[1]); };
567 } elsif (not defined $converted_value) {
569 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
572 if ($conversion_ok) {
573 push @converted_subfields, $subfield->[0], $converted_value;
575 $converted_value = $subfield->[1];
576 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
577 push @converted_subfields, $subfield->[0], $converted_value;
581 $field->replace_with(MARC::Field->new(
582 $field->tag(), $field->indicator(1), $field->indicator(2),
583 @converted_subfields)
588 SetMarcUnicodeFlag($marc_record, $marc_flavour);
589 Text::Iconv->raise_error($prev_raise_error);
594 =head2 _marc_to_utf8_replacement_char
598 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
602 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
603 unsatisfactory method of replacing all non-ASCII (e.g.,
604 where the eight bit is set) octet with the Unicode
605 replacement character. This is meant as a last-ditch
606 method, and would be best used as part of a UI that
607 lets a cataloguer pick various character conversions
608 until he or she finds the right one.
612 sub _marc_to_utf8_replacement_char {
613 my $marc_record = shift;
614 my $marc_flavour = shift;
616 foreach my $field ($marc_record->fields()) {
617 if ($field->is_control_field()) {
618 ; # do nothing -- control fields should not contain non-ASCII characters
620 my @converted_subfields;
621 foreach my $subfield ($field->subfields()) {
622 my $value = $subfield->[1];
623 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
624 push @converted_subfields, $subfield->[0], $value;
627 $field->replace_with(MARC::Field->new(
628 $field->tag(), $field->indicator(1), $field->indicator(2),
629 @converted_subfields)
634 SetMarcUnicodeFlag($marc_record, $marc_flavour);
637 =head2 char_decode5426
641 my $utf8string = char_decode5426($iso_5426_string);
645 Converts a string from ISO-5426 to UTF-8.
651 $chars{0xb0}=0x0101;#3/0ayn[ain]
652 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
653 #$chars{0xb2}=0x00e0;#'Ã ';
654 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
655 #$chars{0xb3}=0x00e7;#'ç';
656 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
661 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
662 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
663 $chars{0xfa}=0x0153;#oe
664 $chars{0x81d1}=0x00b0;
667 ## combined characters iso5426
669 $chars{0xc041}=0x1ea2; # capital a with hook above
670 $chars{0xc045}=0x1eba; # capital e with hook above
671 $chars{0xc049}=0x1ec8; # capital i with hook above
672 $chars{0xc04f}=0x1ece; # capital o with hook above
673 $chars{0xc055}=0x1ee6; # capital u with hook above
674 $chars{0xc059}=0x1ef6; # capital y with hook above
675 $chars{0xc061}=0x1ea3; # small a with hook above
676 $chars{0xc065}=0x1ebb; # small e with hook above
677 $chars{0xc069}=0x1ec9; # small i with hook above
678 $chars{0xc06f}=0x1ecf; # small o with hook above
679 $chars{0xc075}=0x1ee7; # small u with hook above
680 $chars{0xc079}=0x1ef7; # small y with hook above
683 $chars{0xc141}=0x00c0; # capital a with grave accent
684 $chars{0xc145}=0x00c8; # capital e with grave accent
685 $chars{0xc149}=0x00cc; # capital i with grave accent
686 $chars{0xc14f}=0x00d2; # capital o with grave accent
687 $chars{0xc155}=0x00d9; # capital u with grave accent
688 $chars{0xc157}=0x1e80; # capital w with grave
689 $chars{0xc159}=0x1ef2; # capital y with grave
690 $chars{0xc161}=0x00e0; # small a with grave accent
691 $chars{0xc165}=0x00e8; # small e with grave accent
692 $chars{0xc169}=0x00ec; # small i with grave accent
693 $chars{0xc16f}=0x00f2; # small o with grave accent
694 $chars{0xc175}=0x00f9; # small u with grave accent
695 $chars{0xc177}=0x1e81; # small w with grave
696 $chars{0xc179}=0x1ef3; # small y with grave
698 $chars{0xc241}=0x00c1; # capital a with acute accent
699 $chars{0xc243}=0x0106; # capital c with acute accent
700 $chars{0xc245}=0x00c9; # capital e with acute accent
701 $chars{0xc247}=0x01f4; # capital g with acute
702 $chars{0xc249}=0x00cd; # capital i with acute accent
703 $chars{0xc24b}=0x1e30; # capital k with acute
704 $chars{0xc24c}=0x0139; # capital l with acute accent
705 $chars{0xc24d}=0x1e3e; # capital m with acute
706 $chars{0xc24e}=0x0143; # capital n with acute accent
707 $chars{0xc24f}=0x00d3; # capital o with acute accent
708 $chars{0xc250}=0x1e54; # capital p with acute
709 $chars{0xc252}=0x0154; # capital r with acute accent
710 $chars{0xc253}=0x015a; # capital s with acute accent
711 $chars{0xc255}=0x00da; # capital u with acute accent
712 $chars{0xc257}=0x1e82; # capital w with acute
713 $chars{0xc259}=0x00dd; # capital y with acute accent
714 $chars{0xc25a}=0x0179; # capital z with acute accent
715 $chars{0xc261}=0x00e1; # small a with acute accent
716 $chars{0xc263}=0x0107; # small c with acute accent
717 $chars{0xc265}=0x00e9; # small e with acute accent
718 $chars{0xc267}=0x01f5; # small g with acute
719 $chars{0xc269}=0x00ed; # small i with acute accent
720 $chars{0xc26b}=0x1e31; # small k with acute
721 $chars{0xc26c}=0x013a; # small l with acute accent
722 $chars{0xc26d}=0x1e3f; # small m with acute
723 $chars{0xc26e}=0x0144; # small n with acute accent
724 $chars{0xc26f}=0x00f3; # small o with acute accent
725 $chars{0xc270}=0x1e55; # small p with acute
726 $chars{0xc272}=0x0155; # small r with acute accent
727 $chars{0xc273}=0x015b; # small s with acute accent
728 $chars{0xc275}=0x00fa; # small u with acute accent
729 $chars{0xc277}=0x1e83; # small w with acute
730 $chars{0xc279}=0x00fd; # small y with acute accent
731 $chars{0xc27a}=0x017a; # small z with acute accent
732 $chars{0xc2e1}=0x01fc; # capital ae with acute
733 $chars{0xc2f1}=0x01fd; # small ae with acute
734 # 4/3 circumflex accent
735 $chars{0xc341}=0x00c2; # capital a with circumflex accent
736 $chars{0xc343}=0x0108; # capital c with circumflex
737 $chars{0xc345}=0x00ca; # capital e with circumflex accent
738 $chars{0xc347}=0x011c; # capital g with circumflex
739 $chars{0xc348}=0x0124; # capital h with circumflex
740 $chars{0xc349}=0x00ce; # capital i with circumflex accent
741 $chars{0xc34a}=0x0134; # capital j with circumflex
742 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
743 $chars{0xc353}=0x015c; # capital s with circumflex
744 $chars{0xc355}=0x00db; # capital u with circumflex
745 $chars{0xc357}=0x0174; # capital w with circumflex
746 $chars{0xc359}=0x0176; # capital y with circumflex
747 $chars{0xc35a}=0x1e90; # capital z with circumflex
748 $chars{0xc361}=0x00e2; # small a with circumflex accent
749 $chars{0xc363}=0x0109; # small c with circumflex
750 $chars{0xc365}=0x00ea; # small e with circumflex accent
751 $chars{0xc367}=0x011d; # small g with circumflex
752 $chars{0xc368}=0x0125; # small h with circumflex
753 $chars{0xc369}=0x00ee; # small i with circumflex accent
754 $chars{0xc36a}=0x0135; # small j with circumflex
755 $chars{0xc36e}=0x00f1; # small n with tilde
756 $chars{0xc36f}=0x00f4; # small o with circumflex accent
757 $chars{0xc373}=0x015d; # small s with circumflex
758 $chars{0xc375}=0x00fb; # small u with circumflex
759 $chars{0xc377}=0x0175; # small w with circumflex
760 $chars{0xc379}=0x0177; # small y with circumflex
761 $chars{0xc37a}=0x1e91; # small z with circumflex
763 $chars{0xc441}=0x00c3; # capital a with tilde
764 $chars{0xc445}=0x1ebc; # capital e with tilde
765 $chars{0xc449}=0x0128; # capital i with tilde
766 $chars{0xc44e}=0x00d1; # capital n with tilde
767 $chars{0xc44f}=0x00d5; # capital o with tilde
768 $chars{0xc455}=0x0168; # capital u with tilde
769 $chars{0xc456}=0x1e7c; # capital v with tilde
770 $chars{0xc459}=0x1ef8; # capital y with tilde
771 $chars{0xc461}=0x00e3; # small a with tilde
772 $chars{0xc465}=0x1ebd; # small e with tilde
773 $chars{0xc469}=0x0129; # small i with tilde
774 $chars{0xc46e}=0x00f1; # small n with tilde
775 $chars{0xc46f}=0x00f5; # small o with tilde
776 $chars{0xc475}=0x0169; # small u with tilde
777 $chars{0xc476}=0x1e7d; # small v with tilde
778 $chars{0xc479}=0x1ef9; # small y with tilde
780 $chars{0xc541}=0x0100; # capital a with macron
781 $chars{0xc545}=0x0112; # capital e with macron
782 $chars{0xc547}=0x1e20; # capital g with macron
783 $chars{0xc549}=0x012a; # capital i with macron
784 $chars{0xc54f}=0x014c; # capital o with macron
785 $chars{0xc555}=0x016a; # capital u with macron
786 $chars{0xc561}=0x0101; # small a with macron
787 $chars{0xc565}=0x0113; # small e with macron
788 $chars{0xc567}=0x1e21; # small g with macron
789 $chars{0xc569}=0x012b; # small i with macron
790 $chars{0xc56f}=0x014d; # small o with macron
791 $chars{0xc575}=0x016b; # small u with macron
792 $chars{0xc572}=0x0159; # small r with macron
793 $chars{0xc5e1}=0x01e2; # capital ae with macron
794 $chars{0xc5f1}=0x01e3; # small ae with macron
796 $chars{0xc641}=0x0102; # capital a with breve
797 $chars{0xc645}=0x0114; # capital e with breve
798 $chars{0xc647}=0x011e; # capital g with breve
799 $chars{0xc649}=0x012c; # capital i with breve
800 $chars{0xc64f}=0x014e; # capital o with breve
801 $chars{0xc655}=0x016c; # capital u with breve
802 $chars{0xc661}=0x0103; # small a with breve
803 $chars{0xc665}=0x0115; # small e with breve
804 $chars{0xc667}=0x011f; # small g with breve
805 $chars{0xc669}=0x012d; # small i with breve
806 $chars{0xc66f}=0x014f; # small o with breve
807 $chars{0xc675}=0x016d; # small u with breve
809 $chars{0xc7b0}=0x01e1; # Ain with dot above
810 $chars{0xc742}=0x1e02; # capital b with dot above
811 $chars{0xc743}=0x010a; # capital c with dot above
812 $chars{0xc744}=0x1e0a; # capital d with dot above
813 $chars{0xc745}=0x0116; # capital e with dot above
814 $chars{0xc746}=0x1e1e; # capital f with dot above
815 $chars{0xc747}=0x0120; # capital g with dot above
816 $chars{0xc748}=0x1e22; # capital h with dot above
817 $chars{0xc749}=0x0130; # capital i with dot above
818 $chars{0xc74d}=0x1e40; # capital m with dot above
819 $chars{0xc74e}=0x1e44; # capital n with dot above
820 $chars{0xc750}=0x1e56; # capital p with dot above
821 $chars{0xc752}=0x1e58; # capital r with dot above
822 $chars{0xc753}=0x1e60; # capital s with dot above
823 $chars{0xc754}=0x1e6a; # capital t with dot above
824 $chars{0xc757}=0x1e86; # capital w with dot above
825 $chars{0xc758}=0x1e8a; # capital x with dot above
826 $chars{0xc759}=0x1e8e; # capital y with dot above
827 $chars{0xc75a}=0x017b; # capital z with dot above
828 $chars{0xc761}=0x0227; # small b with dot above
829 $chars{0xc762}=0x1e03; # small b with dot above
830 $chars{0xc763}=0x010b; # small c with dot above
831 $chars{0xc764}=0x1e0b; # small d with dot above
832 $chars{0xc765}=0x0117; # small e with dot above
833 $chars{0xc766}=0x1e1f; # small f with dot above
834 $chars{0xc767}=0x0121; # small g with dot above
835 $chars{0xc768}=0x1e23; # small h with dot above
836 $chars{0xc76d}=0x1e41; # small m with dot above
837 $chars{0xc76e}=0x1e45; # small n with dot above
838 $chars{0xc770}=0x1e57; # small p with dot above
839 $chars{0xc772}=0x1e59; # small r with dot above
840 $chars{0xc773}=0x1e61; # small s with dot above
841 $chars{0xc774}=0x1e6b; # small t with dot above
842 $chars{0xc777}=0x1e87; # small w with dot above
843 $chars{0xc778}=0x1e8b; # small x with dot above
844 $chars{0xc779}=0x1e8f; # small y with dot above
845 $chars{0xc77a}=0x017c; # small z with dot above
846 # 4/8 trema, diaresis
847 $chars{0xc820}=0x00a8; # diaeresis
848 $chars{0xc841}=0x00c4; # capital a with diaeresis
849 $chars{0xc845}=0x00cb; # capital e with diaeresis
850 $chars{0xc848}=0x1e26; # capital h with diaeresis
851 $chars{0xc849}=0x00cf; # capital i with diaeresis
852 $chars{0xc84f}=0x00d6; # capital o with diaeresis
853 $chars{0xc855}=0x00dc; # capital u with diaeresis
854 $chars{0xc857}=0x1e84; # capital w with diaeresis
855 $chars{0xc858}=0x1e8c; # capital x with diaeresis
856 $chars{0xc859}=0x0178; # capital y with diaeresis
857 $chars{0xc861}=0x00e4; # small a with diaeresis
858 $chars{0xc865}=0x00eb; # small e with diaeresis
859 $chars{0xc868}=0x1e27; # small h with diaeresis
860 $chars{0xc869}=0x00ef; # small i with diaeresis
861 $chars{0xc86f}=0x00f6; # small o with diaeresis
862 $chars{0xc874}=0x1e97; # small t with diaeresis
863 $chars{0xc875}=0x00fc; # small u with diaeresis
864 $chars{0xc877}=0x1e85; # small w with diaeresis
865 $chars{0xc878}=0x1e8d; # small x with diaeresis
866 $chars{0xc879}=0x00ff; # small y with diaeresis
868 $chars{0xc920}=0x00a8; # [diaeresis]
869 $chars{0xc961}=0x00e4; # a with umlaut
870 $chars{0xc965}=0x00eb; # e with umlaut
871 $chars{0xc969}=0x00ef; # i with umlaut
872 $chars{0xc96f}=0x00f6; # o with umlaut
873 $chars{0xc975}=0x00fc; # u with umlaut
875 $chars{0xca41}=0x00c5; # capital a with ring above
876 $chars{0xcaad}=0x016e; # capital u with ring above
877 $chars{0xca61}=0x00e5; # small a with ring above
878 $chars{0xca75}=0x016f; # small u with ring above
879 $chars{0xca77}=0x1e98; # small w with ring above
880 $chars{0xca79}=0x1e99; # small y with ring above
881 # 4/11 high comma off centre
882 # 4/12 inverted high comma centred
883 # 4/13 double acute accent
884 $chars{0xcd4f}=0x0150; # capital o with double acute
885 $chars{0xcd55}=0x0170; # capital u with double acute
886 $chars{0xcd6f}=0x0151; # small o with double acute
887 $chars{0xcd75}=0x0171; # small u with double acute
889 $chars{0xce54}=0x01a0; # latin capital letter o with horn
890 $chars{0xce55}=0x01af; # latin capital letter u with horn
891 $chars{0xce74}=0x01a1; # latin small letter o with horn
892 $chars{0xce75}=0x01b0; # latin small letter u with horn
894 $chars{0xcf41}=0x01cd; # capital a with caron
895 $chars{0xcf43}=0x010c; # capital c with caron
896 $chars{0xcf44}=0x010e; # capital d with caron
897 $chars{0xcf45}=0x011a; # capital e with caron
898 $chars{0xcf47}=0x01e6; # capital g with caron
899 $chars{0xcf49}=0x01cf; # capital i with caron
900 $chars{0xcf4b}=0x01e8; # capital k with caron
901 $chars{0xcf4c}=0x013d; # capital l with caron
902 $chars{0xcf4e}=0x0147; # capital n with caron
903 $chars{0xcf4f}=0x01d1; # capital o with caron
904 $chars{0xcf52}=0x0158; # capital r with caron
905 $chars{0xcf53}=0x0160; # capital s with caron
906 $chars{0xcf54}=0x0164; # capital t with caron
907 $chars{0xcf55}=0x01d3; # capital u with caron
908 $chars{0xcf5a}=0x017d; # capital z with caron
909 $chars{0xcf61}=0x01ce; # small a with caron
910 $chars{0xcf63}=0x010d; # small c with caron
911 $chars{0xcf64}=0x010f; # small d with caron
912 $chars{0xcf65}=0x011b; # small e with caron
913 $chars{0xcf67}=0x01e7; # small g with caron
914 $chars{0xcf69}=0x01d0; # small i with caron
915 $chars{0xcf6a}=0x01f0; # small j with caron
916 $chars{0xcf6b}=0x01e9; # small k with caron
917 $chars{0xcf6c}=0x013e; # small l with caron
918 $chars{0xcf6e}=0x0148; # small n with caron
919 $chars{0xcf6f}=0x01d2; # small o with caron
920 $chars{0xcf72}=0x0159; # small r with caron
921 $chars{0xcf73}=0x0161; # small s with caron
922 $chars{0xcf74}=0x0165; # small t with caron
923 $chars{0xcf75}=0x01d4; # small u with caron
924 $chars{0xcf7a}=0x017e; # small z with caron
926 $chars{0xd020}=0x00b8; # cedilla
927 $chars{0xd043}=0x00c7; # capital c with cedilla
928 $chars{0xd044}=0x1e10; # capital d with cedilla
929 $chars{0xd047}=0x0122; # capital g with cedilla
930 $chars{0xd048}=0x1e28; # capital h with cedilla
931 $chars{0xd04b}=0x0136; # capital k with cedilla
932 $chars{0xd04c}=0x013b; # capital l with cedilla
933 $chars{0xd04e}=0x0145; # capital n with cedilla
934 $chars{0xd052}=0x0156; # capital r with cedilla
935 $chars{0xd053}=0x015e; # capital s with cedilla
936 $chars{0xd054}=0x0162; # capital t with cedilla
937 $chars{0xd063}=0x00e7; # small c with cedilla
938 $chars{0xd064}=0x1e11; # small d with cedilla
939 $chars{0xd065}=0x0119; # small e with cedilla
940 $chars{0xd067}=0x0123; # small g with cedilla
941 $chars{0xd068}=0x1e29; # small h with cedilla
942 $chars{0xd06b}=0x0137; # small k with cedilla
943 $chars{0xd06c}=0x013c; # small l with cedilla
944 $chars{0xd06e}=0x0146; # small n with cedilla
945 $chars{0xd072}=0x0157; # small r with cedilla
946 $chars{0xd073}=0x015f; # small s with cedilla
947 $chars{0xd074}=0x0163; # small t with cedilla
950 # 5/3 ogonek (hook to right
951 $chars{0xd320}=0x02db; # ogonek
952 $chars{0xd341}=0x0104; # capital a with ogonek
953 $chars{0xd345}=0x0118; # capital e with ogonek
954 $chars{0xd349}=0x012e; # capital i with ogonek
955 $chars{0xd34f}=0x01ea; # capital o with ogonek
956 $chars{0xd355}=0x0172; # capital u with ogonek
957 $chars{0xd361}=0x0105; # small a with ogonek
958 $chars{0xd365}=0x0119; # small e with ogonek
959 $chars{0xd369}=0x012f; # small i with ogonek
960 $chars{0xd36f}=0x01eb; # small o with ogonek
961 $chars{0xd375}=0x0173; # small u with ogonek
963 $chars{0xd441}=0x1e00; # capital a with ring below
964 $chars{0xd461}=0x1e01; # small a with ring below
965 # 5/5 half circle below
966 $chars{0xf948}=0x1e2a; # capital h with breve below
967 $chars{0xf968}=0x1e2b; # small h with breve below
969 $chars{0xd641}=0x1ea0; # capital a with dot below
970 $chars{0xd642}=0x1e04; # capital b with dot below
971 $chars{0xd644}=0x1e0c; # capital d with dot below
972 $chars{0xd645}=0x1eb8; # capital e with dot below
973 $chars{0xd648}=0x1e24; # capital h with dot below
974 $chars{0xd649}=0x1eca; # capital i with dot below
975 $chars{0xd64b}=0x1e32; # capital k with dot below
976 $chars{0xd64c}=0x1e36; # capital l with dot below
977 $chars{0xd64d}=0x1e42; # capital m with dot below
978 $chars{0xd64e}=0x1e46; # capital n with dot below
979 $chars{0xd64f}=0x1ecc; # capital o with dot below
980 $chars{0xd652}=0x1e5a; # capital r with dot below
981 $chars{0xd653}=0x1e62; # capital s with dot below
982 $chars{0xd654}=0x1e6c; # capital t with dot below
983 $chars{0xd655}=0x1ee4; # capital u with dot below
984 $chars{0xd656}=0x1e7e; # capital v with dot below
985 $chars{0xd657}=0x1e88; # capital w with dot below
986 $chars{0xd659}=0x1ef4; # capital y with dot below
987 $chars{0xd65a}=0x1e92; # capital z with dot below
988 $chars{0xd661}=0x1ea1; # small a with dot below
989 $chars{0xd662}=0x1e05; # small b with dot below
990 $chars{0xd664}=0x1e0d; # small d with dot below
991 $chars{0xd665}=0x1eb9; # small e with dot below
992 $chars{0xd668}=0x1e25; # small h with dot below
993 $chars{0xd669}=0x1ecb; # small i with dot below
994 $chars{0xd66b}=0x1e33; # small k with dot below
995 $chars{0xd66c}=0x1e37; # small l with dot below
996 $chars{0xd66d}=0x1e43; # small m with dot below
997 $chars{0xd66e}=0x1e47; # small n with dot below
998 $chars{0xd66f}=0x1ecd; # small o with dot below
999 $chars{0xd672}=0x1e5b; # small r with dot below
1000 $chars{0xd673}=0x1e63; # small s with dot below
1001 $chars{0xd674}=0x1e6d; # small t with dot below
1002 $chars{0xd675}=0x1ee5; # small u with dot below
1003 $chars{0xd676}=0x1e7f; # small v with dot below
1004 $chars{0xd677}=0x1e89; # small w with dot below
1005 $chars{0xd679}=0x1ef5; # small y with dot below
1006 $chars{0xd67a}=0x1e93; # small z with dot below
1007 # 5/7 double dot below
1008 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1009 $chars{0xd775}=0x1e73; # small u with diaeresis below
1011 $chars{0xd820}=0x005f; # underline
1012 # 5/9 double underline
1013 $chars{0xd920}=0x2017; # double underline
1014 # 5/10 small low vertical bar
1015 $chars{0xda20}=0x02cc; #
1016 # 5/11 circumflex below
1017 # 5/12 (this position shall not be used)
1018 # 5/13 left half of ligature sign and of double tilde
1019 # 5/14 right half of ligature sign
1020 # 5/15 right half of double tilde
1021 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1023 sub char_decode5426 {
1027 my @data = unpack("C*", $string);
1029 my $length=scalar(@data);
1030 for (my $i = 0; $i < scalar(@data); $i++) {
1031 my $char= $data[$i];
1032 if ($char >= 0x00 && $char <= 0x7F){
1035 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1036 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1039 if ($chars{$char*256+$data[$i+1]}) {
1040 $convchar= $chars{$char * 256 + $data[$i+1]};
1042 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1043 } elsif ($chars{$char}) {
1044 $convchar= $chars{$char};
1045 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1049 push @characters,$convchar;
1052 if ($chars{$char}) {
1053 $convchar= $chars{$char};
1054 # printf "char %x, converted %x\n",$char,$chars{$char};
1056 # printf "char %x $char\n",$char;
1059 push @characters,$convchar;
1062 $result=pack "U*",@characters;
1063 # $result=~s/\x01//;
1064 # $result=~s/\x00//;
1068 $result=~s/\x1b\x5b//;
1069 # map{printf "%x",$_} @characters;
1079 Koha Development Team <info@koha.org>
1081 Galen Charlton <galen.charlton@liblime.com>