3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along with
17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
18 # Suite 330, Boston, MA 02111-1307 USA
21 use MARC::Charset qw/marc8_to_utf8/;
24 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
27 # set the version for version checking
40 C4::Charset - utilities for handling character set conversions.
48 This module contains routines for dealing with character set
49 conversions, particularly for MARC records.
51 A variety of character encodings are in use by various MARC
52 standards, and even more character encodings are used by
53 non-standard MARC records. The various MARC formats generally
54 do not do a good job of advertising a given record's character
55 encoding, and even when a record does advertise its encoding,
56 e.g., via the Leader/09, experience has shown that one cannot
59 Ultimately, all MARC records are stored in Koha in UTF-8 and
60 must be converted from whatever the source character encoding is.
61 The goal of this module is to ensure that these conversions
62 take place accurately. When a character conversion cannot take
63 place, or at least not accurately, the module was provide
64 enough information to allow user-facing code to inform the user
65 on how to deal with the situation.
71 =head2 IsStringUTF8ish
75 my $is_utf8 = IsStringUTF8ish($str);
79 Determines if C<$str> is valid UTF-8. This can mean
86 The Perl UTF-8 flag is set and the string contains valid UTF-8.
90 The Perl UTF-8 flag is B<not> set, but the octets contain
95 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
96 because in one could be presented with a MARC blob that is
97 not actually in UTF-8 but whose sequence of octets appears to be
98 valid UTF-8. The rest of the MARC character conversion functions
99 will assume that this situation occur does not very often.
103 sub IsStringUTF8ish {
106 return 1 if utf8::is_utf8($str);
107 return utf8::decode($str);
110 =head2 MarcToUTF8Record
114 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
118 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
119 optional source encoding, return a C<MARC::Record> that is
122 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
123 is not guaranteed to have been converted correctly. Specifically,
124 if C<$converted_from> is 'failed', the MARC record returned failed
125 character conversion and had each of its non-ASCII octets changed
126 to the Unicode replacement character.
128 If the source encoding was not specified, this routine will
129 try to guess it; the character encoding used for a successful
130 conversion is returned in C<$converted_from>.
134 sub MarcToUTF8Record {
136 my $marc_flavour = shift;
137 my $source_encoding = shift;
140 my $marc_blob_is_utf8 = 0;
141 if (ref($marc) eq 'MARC::Record') {
142 my $marc_blob = $marc->as_usmarc();
143 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
144 $marc_record = $marc;
146 # dealing with a MARC blob
148 # remove any ersatz whitespace from the beginning and
149 # end of the MARC blob -- these can creep into MARC
150 # files produced by several sources -- caller really
151 # should be doing this, however
154 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
155 $marc_record = MARC::Record->new_from_usmarc($marc);
158 # If we do not know the source encoding, try some guesses
160 # 1. Record is UTF-8 already.
161 # 2. If MARC flavor is MARC21, then
162 # a. record is MARC-8
163 # b. record is ISO-8859-1
164 # 3. If MARC flavor is UNIMARC, then
165 if (not defined $source_encoding) {
166 if ($marc_blob_is_utf8) {
167 # note that for MARC21 we are not bothering to check
168 # if the Leader/09 is set to 'a' or not -- because
169 # of problems with various ILSs (including Koha in the
170 # past, alas), this just is not trustworthy.
171 SetMarcUnicodeFlag($marc_record, $marc_flavour);
172 return $marc_record, 'UTF-8', [];
174 if ($marc_flavour eq 'MARC21') {
175 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
176 } elsif ($marc_flavour eq 'UNIMARC') {
177 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
179 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
183 # caller knows the character encoding
184 my $original_marc_record = $marc_record->clone();
186 if ($source_encoding =~ /utf-?8/i) {
187 if ($marc_blob_is_utf8) {
188 SetMarcUnicodeFlag($marc_record, $marc_flavour);
189 return $marc_record, 'UTF-8', [];
191 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
193 } elsif ($source_encoding =~ /marc-?8/i) {
194 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
195 } elsif ($source_encoding =~ /5426/) {
196 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
198 # assume any other character encoding is for Text::Iconv
199 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
203 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
204 return $original_marc_record, 'failed', \@errors;
206 return $marc_record, $source_encoding, [];
212 =head2 SetMarcUnicodeFlag
216 SetMarcUnicodeFlag($marc_record, $marc_flavour);
220 Set both the internal MARC::Record encoding flag
221 and the appropriate Leader/09 (MARC21) or
222 100/26-29 (UNIMARC) to indicate that the record
223 is in UTF-8. Note that this does B<not> do
224 any actual character conversion.
228 sub SetMarcUnicodeFlag {
229 my $marc_record = shift;
230 my $marc_flavour = shift;
232 $marc_record->encoding('UTF-8');
233 if ($marc_flavour eq 'MARC21') {
234 my $leader = $marc_record->leader();
235 substr($leader, 9, 1) = 'a';
236 $marc_record->leader($leader);
237 } elsif ($marc_flavour eq "UNIMARC") {
238 if (my $field = $marc_record->field('100')) {
239 my $sfa = $field->subfield('a');
240 substr($sfa, 26, 4) = '5050';
241 $field->update('a' => $sfa);
248 =head1 INTERNAL FUNCTIONS
250 =head2 _default_marc21_charconv_to_utf8
254 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
258 Converts a C<MARC::Record> of unknown character set to UTF-8,
259 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
260 to UTF-8, then a default conversion that replaces each non-ASCII
261 character with the replacement character.
263 The C<$guessed_charset> return value contains the character set
264 that resulted in a conversion to valid UTF-8; note that
265 if the MARC-8 and ISO-8859-1 conversions failed, the value of
270 sub _default_marc21_charconv_to_utf8 {
271 my $marc_record = shift;
272 my $marc_flavour = shift;
274 my $trial_marc8 = $marc_record->clone();
276 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
278 return $trial_marc8, 'MARC-8', [];
280 push @all_errors, @errors;
282 my $trial_8859_1 = $marc_record->clone();
283 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
285 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
286 # instead if we wanted to report details
287 # of the failed attempt at MARC-8 => UTF-8
289 push @all_errors, @errors;
291 my $default_converted = $marc_record->clone();
292 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
293 return $default_converted, 'failed', \@all_errors;
296 =head2 _default_unimarc_charconv_to_utf8
300 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
304 Converts a C<MARC::Record> of unknown character set to UTF-8,
305 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
306 to UTF-8, then a default conversion that replaces each non-ASCII
307 character with the replacement character.
309 The C<$guessed_charset> return value contains the character set
310 that resulted in a conversion to valid UTF-8; note that
311 if the MARC-8 and ISO-8859-1 conversions failed, the value of
316 sub _default_unimarc_charconv_to_utf8 {
317 my $marc_record = shift;
318 my $marc_flavour = shift;
320 my $trial_marc8 = $marc_record->clone();
322 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
324 return $trial_marc8, 'iso-5426';
326 push @all_errors, @errors;
328 my $trial_8859_1 = $marc_record->clone();
329 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
331 return $trial_8859_1, 'iso-8859-1';
333 push @all_errors, @errors;
335 my $default_converted = $marc_record->clone();
336 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
337 return $default_converted, 'failed', \@all_errors;
340 =head2 _marc_marc8_to_utf8
344 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
348 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
349 If the conversion fails for some reason, an
350 appropriate messages will be placed in the returned
355 sub _marc_marc8_to_utf8 {
356 my $marc_record = shift;
357 my $marc_flavour = shift;
359 my $prev_ignore = MARC::Charset->ignore_errors();
360 MARC::Charset->ignore_errors(1);
362 # trap warnings raised by MARC::Charset
364 local $SIG{__WARN__} = sub {
366 if ($msg =~ /MARC.Charset/) {
367 # FIXME - purpose of this regexp is to strip out the
368 # line reference to MARC/Charset.pm, but as it
369 # exists probably won't work quite on Windows --
370 # some sort of minimal-bunch back-tracking RE
371 # would be helpful here
372 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
375 # if warning doesn't come from MARC::Charset, just
381 foreach my $field ($marc_record->fields()) {
382 if ($field->is_control_field()) {
383 ; # do nothing -- control fields should not contain non-ASCII characters
385 my @converted_subfields;
386 foreach my $subfield ($field->subfields()) {
387 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
388 push @converted_subfields, $subfield->[0], $utf8sf;
391 $field->replace_with(MARC::Field->new(
392 $field->tag(), $field->indicator(1), $field->indicator(2),
393 @converted_subfields)
398 MARC::Charset->ignore_errors($prev_ignore);
400 SetMarcUnicodeFlag($marc_record, $marc_flavour);
405 =head2 _marc_iso5426_to_utf8
409 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
413 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
414 If the conversion fails for some reason, an
415 appropriate messages will be placed in the returned
418 FIXME - is ISO-5426 equivalent enough to MARC-8
419 that C<MARC::Charset> can be used instead?
423 sub _marc_iso5426_to_utf8 {
424 my $marc_record = shift;
425 my $marc_flavour = shift;
429 foreach my $field ($marc_record->fields()) {
430 if ($field->is_control_field()) {
431 ; # do nothing -- control fields should not contain non-ASCII characters
433 my @converted_subfields;
434 foreach my $subfield ($field->subfields()) {
435 my $utf8sf = char_decode5426($subfield->[1]);
436 push @converted_subfields, $subfield->[0], $utf8sf;
439 $field->replace_with(MARC::Field->new(
440 $field->tag(), $field->indicator(1), $field->indicator(2),
441 @converted_subfields)
446 SetMarcUnicodeFlag($marc_record, $marc_flavour);
451 =head2 _marc_to_utf8_via_text_iconv
455 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
459 Convert a C<MARC::Record> to UTF-8 in-place using the
460 C<Text::Iconv> CPAN module. Any source encoding accepted
461 by the user's iconv installation should work. If
462 the source encoding is not recognized on the user's
463 server or the conversion fails for some reason,
464 appropriate messages will be placed in the returned
469 sub _marc_to_utf8_via_text_iconv {
470 my $marc_record = shift;
471 my $marc_flavour = shift;
472 my $source_encoding = shift;
476 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
478 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
482 my $prev_raise_error = Text::Iconv->raise_error();
483 Text::Iconv->raise_error(1);
485 foreach my $field ($marc_record->fields()) {
486 if ($field->is_control_field()) {
487 ; # do nothing -- control fields should not contain non-ASCII characters
489 my @converted_subfields;
490 foreach my $subfield ($field->subfields()) {
492 my $conversion_ok = 1;
493 eval { $converted_value = $decoder->convert($subfield->[1]); };
497 } elsif (not defined $converted_value) {
499 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
502 if ($conversion_ok) {
503 push @converted_subfields, $subfield->[0], $converted_value;
505 $converted_value = $subfield->[1];
506 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
507 push @converted_subfields, $subfield->[0], $converted_value;
511 $field->replace_with(MARC::Field->new(
512 $field->tag(), $field->indicator(1), $field->indicator(2),
513 @converted_subfields)
518 SetMarcUnicodeFlag($marc_record, $marc_flavour);
519 Text::Iconv->raise_error($prev_raise_error);
524 =head2 _marc_to_utf8_replacement_char
528 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
532 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
533 unsatisfactory method of replacing all non-ASCII (e.g.,
534 where the eight bit is set) octet with the Unicode
535 replacement character. This is meant as a last-ditch
536 method, and would be best used as part of a UI that
537 lets a cataloguer pick various character conversions
538 until he or she finds the right one.
542 sub _marc_to_utf8_replacement_char {
543 my $marc_record = shift;
544 my $marc_flavour = shift;
546 foreach my $field ($marc_record->fields()) {
547 if ($field->is_control_field()) {
548 ; # do nothing -- control fields should not contain non-ASCII characters
550 my @converted_subfields;
551 foreach my $subfield ($field->subfields()) {
552 my $value = $subfield->[1];
553 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
554 push @converted_subfields, $subfield->[0], $value;
557 $field->replace_with(MARC::Field->new(
558 $field->tag(), $field->indicator(1), $field->indicator(2),
559 @converted_subfields)
564 SetMarcUnicodeFlag($marc_record, $marc_flavour);
567 =head2 char_decode5426
571 my $utf8string = char_decode5426($iso_5426_string);
575 Converts a string from ISO-5426 to UTF-8.
579 sub char_decode5426 {
583 $chars{0xb0}=0x0101;#3/0ayn[ain]
584 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
585 #$chars{0xb2}=0x00e0;#'Ã ';
586 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
587 #$chars{0xb3}=0x00e7;#'ç';
588 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
593 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
594 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
595 $chars{0xfa}=0x0153;#oe
596 $chars{0x81d1}=0x00b0;
599 ## combined characters iso5426
601 $chars{0xc041}=0x1ea2; # capital a with hook above
602 $chars{0xc045}=0x1eba; # capital e with hook above
603 $chars{0xc049}=0x1ec8; # capital i with hook above
604 $chars{0xc04f}=0x1ece; # capital o with hook above
605 $chars{0xc055}=0x1ee6; # capital u with hook above
606 $chars{0xc059}=0x1ef6; # capital y with hook above
607 $chars{0xc061}=0x1ea3; # small a with hook above
608 $chars{0xc065}=0x1ebb; # small e with hook above
609 $chars{0xc069}=0x1ec9; # small i with hook above
610 $chars{0xc06f}=0x1ecf; # small o with hook above
611 $chars{0xc075}=0x1ee7; # small u with hook above
612 $chars{0xc079}=0x1ef7; # small y with hook above
615 $chars{0xc141}=0x00c0; # capital a with grave accent
616 $chars{0xc145}=0x00c8; # capital e with grave accent
617 $chars{0xc149}=0x00cc; # capital i with grave accent
618 $chars{0xc14f}=0x00d2; # capital o with grave accent
619 $chars{0xc155}=0x00d9; # capital u with grave accent
620 $chars{0xc157}=0x1e80; # capital w with grave
621 $chars{0xc159}=0x1ef2; # capital y with grave
622 $chars{0xc161}=0x00e0; # small a with grave accent
623 $chars{0xc165}=0x00e8; # small e with grave accent
624 $chars{0xc169}=0x00ec; # small i with grave accent
625 $chars{0xc16f}=0x00f2; # small o with grave accent
626 $chars{0xc175}=0x00f9; # small u with grave accent
627 $chars{0xc177}=0x1e81; # small w with grave
628 $chars{0xc179}=0x1ef3; # small y with grave
630 $chars{0xc241}=0x00c1; # capital a with acute accent
631 $chars{0xc243}=0x0106; # capital c with acute accent
632 $chars{0xc245}=0x00c9; # capital e with acute accent
633 $chars{0xc247}=0x01f4; # capital g with acute
634 $chars{0xc249}=0x00cd; # capital i with acute accent
635 $chars{0xc24b}=0x1e30; # capital k with acute
636 $chars{0xc24c}=0x0139; # capital l with acute accent
637 $chars{0xc24d}=0x1e3e; # capital m with acute
638 $chars{0xc24e}=0x0143; # capital n with acute accent
639 $chars{0xc24f}=0x00d3; # capital o with acute accent
640 $chars{0xc250}=0x1e54; # capital p with acute
641 $chars{0xc252}=0x0154; # capital r with acute accent
642 $chars{0xc253}=0x015a; # capital s with acute accent
643 $chars{0xc255}=0x00da; # capital u with acute accent
644 $chars{0xc257}=0x1e82; # capital w with acute
645 $chars{0xc259}=0x00dd; # capital y with acute accent
646 $chars{0xc25a}=0x0179; # capital z with acute accent
647 $chars{0xc261}=0x00e1; # small a with acute accent
648 $chars{0xc263}=0x0107; # small c with acute accent
649 $chars{0xc265}=0x00e9; # small e with acute accent
650 $chars{0xc267}=0x01f5; # small g with acute
651 $chars{0xc269}=0x00ed; # small i with acute accent
652 $chars{0xc26b}=0x1e31; # small k with acute
653 $chars{0xc26c}=0x013a; # small l with acute accent
654 $chars{0xc26d}=0x1e3f; # small m with acute
655 $chars{0xc26e}=0x0144; # small n with acute accent
656 $chars{0xc26f}=0x00f3; # small o with acute accent
657 $chars{0xc270}=0x1e55; # small p with acute
658 $chars{0xc272}=0x0155; # small r with acute accent
659 $chars{0xc273}=0x015b; # small s with acute accent
660 $chars{0xc275}=0x00fa; # small u with acute accent
661 $chars{0xc277}=0x1e83; # small w with acute
662 $chars{0xc279}=0x00fd; # small y with acute accent
663 $chars{0xc27a}=0x017a; # small z with acute accent
664 $chars{0xc2e1}=0x01fc; # capital ae with acute
665 $chars{0xc2f1}=0x01fd; # small ae with acute
666 # 4/3 circumflex accent
667 $chars{0xc341}=0x00c2; # capital a with circumflex accent
668 $chars{0xc343}=0x0108; # capital c with circumflex
669 $chars{0xc345}=0x00ca; # capital e with circumflex accent
670 $chars{0xc347}=0x011c; # capital g with circumflex
671 $chars{0xc348}=0x0124; # capital h with circumflex
672 $chars{0xc349}=0x00ce; # capital i with circumflex accent
673 $chars{0xc34a}=0x0134; # capital j with circumflex
674 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
675 $chars{0xc353}=0x015c; # capital s with circumflex
676 $chars{0xc355}=0x00db; # capital u with circumflex
677 $chars{0xc357}=0x0174; # capital w with circumflex
678 $chars{0xc359}=0x0176; # capital y with circumflex
679 $chars{0xc35a}=0x1e90; # capital z with circumflex
680 $chars{0xc361}=0x00e2; # small a with circumflex accent
681 $chars{0xc363}=0x0109; # small c with circumflex
682 $chars{0xc365}=0x00ea; # small e with circumflex accent
683 $chars{0xc367}=0x011d; # small g with circumflex
684 $chars{0xc368}=0x0125; # small h with circumflex
685 $chars{0xc369}=0x00ee; # small i with circumflex accent
686 $chars{0xc36a}=0x0135; # small j with circumflex
687 $chars{0xc36e}=0x00f1; # small n with tilde
688 $chars{0xc36f}=0x00f4; # small o with circumflex accent
689 $chars{0xc373}=0x015d; # small s with circumflex
690 $chars{0xc375}=0x00fb; # small u with circumflex
691 $chars{0xc377}=0x0175; # small w with circumflex
692 $chars{0xc379}=0x0177; # small y with circumflex
693 $chars{0xc37a}=0x1e91; # small z with circumflex
695 $chars{0xc441}=0x00c3; # capital a with tilde
696 $chars{0xc445}=0x1ebc; # capital e with tilde
697 $chars{0xc449}=0x0128; # capital i with tilde
698 $chars{0xc44e}=0x00d1; # capital n with tilde
699 $chars{0xc44f}=0x00d5; # capital o with tilde
700 $chars{0xc455}=0x0168; # capital u with tilde
701 $chars{0xc456}=0x1e7c; # capital v with tilde
702 $chars{0xc459}=0x1ef8; # capital y with tilde
703 $chars{0xc461}=0x00e3; # small a with tilde
704 $chars{0xc465}=0x1ebd; # small e with tilde
705 $chars{0xc469}=0x0129; # small i with tilde
706 $chars{0xc46e}=0x00f1; # small n with tilde
707 $chars{0xc46f}=0x00f5; # small o with tilde
708 $chars{0xc475}=0x0169; # small u with tilde
709 $chars{0xc476}=0x1e7d; # small v with tilde
710 $chars{0xc479}=0x1ef9; # small y with tilde
712 $chars{0xc541}=0x0100; # capital a with macron
713 $chars{0xc545}=0x0112; # capital e with macron
714 $chars{0xc547}=0x1e20; # capital g with macron
715 $chars{0xc549}=0x012a; # capital i with macron
716 $chars{0xc54f}=0x014c; # capital o with macron
717 $chars{0xc555}=0x016a; # capital u with macron
718 $chars{0xc561}=0x0101; # small a with macron
719 $chars{0xc565}=0x0113; # small e with macron
720 $chars{0xc567}=0x1e21; # small g with macron
721 $chars{0xc569}=0x012b; # small i with macron
722 $chars{0xc56f}=0x014d; # small o with macron
723 $chars{0xc575}=0x016b; # small u with macron
724 $chars{0xc572}=0x0159; # small r with macron
725 $chars{0xc5e1}=0x01e2; # capital ae with macron
726 $chars{0xc5f1}=0x01e3; # small ae with macron
728 $chars{0xc641}=0x0102; # capital a with breve
729 $chars{0xc645}=0x0114; # capital e with breve
730 $chars{0xc647}=0x011e; # capital g with breve
731 $chars{0xc649}=0x012c; # capital i with breve
732 $chars{0xc64f}=0x014e; # capital o with breve
733 $chars{0xc655}=0x016c; # capital u with breve
734 $chars{0xc661}=0x0103; # small a with breve
735 $chars{0xc665}=0x0115; # small e with breve
736 $chars{0xc667}=0x011f; # small g with breve
737 $chars{0xc669}=0x012d; # small i with breve
738 $chars{0xc66f}=0x014f; # small o with breve
739 $chars{0xc675}=0x016d; # small u with breve
741 $chars{0xc7b0}=0x01e1; # Ain with dot above
742 $chars{0xc742}=0x1e02; # capital b with dot above
743 $chars{0xc743}=0x010a; # capital c with dot above
744 $chars{0xc744}=0x1e0a; # capital d with dot above
745 $chars{0xc745}=0x0116; # capital e with dot above
746 $chars{0xc746}=0x1e1e; # capital f with dot above
747 $chars{0xc747}=0x0120; # capital g with dot above
748 $chars{0xc748}=0x1e22; # capital h with dot above
749 $chars{0xc749}=0x0130; # capital i with dot above
750 $chars{0xc74d}=0x1e40; # capital m with dot above
751 $chars{0xc74e}=0x1e44; # capital n with dot above
752 $chars{0xc750}=0x1e56; # capital p with dot above
753 $chars{0xc752}=0x1e58; # capital r with dot above
754 $chars{0xc753}=0x1e60; # capital s with dot above
755 $chars{0xc754}=0x1e6a; # capital t with dot above
756 $chars{0xc757}=0x1e86; # capital w with dot above
757 $chars{0xc758}=0x1e8a; # capital x with dot above
758 $chars{0xc759}=0x1e8e; # capital y with dot above
759 $chars{0xc75a}=0x017b; # capital z with dot above
760 $chars{0xc761}=0x0227; # small b with dot above
761 $chars{0xc762}=0x1e03; # small b with dot above
762 $chars{0xc763}=0x010b; # small c with dot above
763 $chars{0xc764}=0x1e0b; # small d with dot above
764 $chars{0xc765}=0x0117; # small e with dot above
765 $chars{0xc766}=0x1e1f; # small f with dot above
766 $chars{0xc767}=0x0121; # small g with dot above
767 $chars{0xc768}=0x1e23; # small h with dot above
768 $chars{0xc76d}=0x1e41; # small m with dot above
769 $chars{0xc76e}=0x1e45; # small n with dot above
770 $chars{0xc770}=0x1e57; # small p with dot above
771 $chars{0xc772}=0x1e59; # small r with dot above
772 $chars{0xc773}=0x1e61; # small s with dot above
773 $chars{0xc774}=0x1e6b; # small t with dot above
774 $chars{0xc777}=0x1e87; # small w with dot above
775 $chars{0xc778}=0x1e8b; # small x with dot above
776 $chars{0xc779}=0x1e8f; # small y with dot above
777 $chars{0xc77a}=0x017c; # small z with dot above
778 # 4/8 trema, diaresis
779 $chars{0xc820}=0x00a8; # diaeresis
780 $chars{0xc841}=0x00c4; # capital a with diaeresis
781 $chars{0xc845}=0x00cb; # capital e with diaeresis
782 $chars{0xc848}=0x1e26; # capital h with diaeresis
783 $chars{0xc849}=0x00cf; # capital i with diaeresis
784 $chars{0xc84f}=0x00d6; # capital o with diaeresis
785 $chars{0xc855}=0x00dc; # capital u with diaeresis
786 $chars{0xc857}=0x1e84; # capital w with diaeresis
787 $chars{0xc858}=0x1e8c; # capital x with diaeresis
788 $chars{0xc859}=0x0178; # capital y with diaeresis
789 $chars{0xc861}=0x00e4; # small a with diaeresis
790 $chars{0xc865}=0x00eb; # small e with diaeresis
791 $chars{0xc868}=0x1e27; # small h with diaeresis
792 $chars{0xc869}=0x00ef; # small i with diaeresis
793 $chars{0xc86f}=0x00f6; # small o with diaeresis
794 $chars{0xc874}=0x1e97; # small t with diaeresis
795 $chars{0xc875}=0x00fc; # small u with diaeresis
796 $chars{0xc877}=0x1e85; # small w with diaeresis
797 $chars{0xc878}=0x1e8d; # small x with diaeresis
798 $chars{0xc879}=0x00ff; # small y with diaeresis
800 $chars{0xc920}=0x00a8; # [diaeresis]
801 $chars{0xc961}=0x00e4; # a with umlaut
802 $chars{0xc965}=0x00eb; # e with umlaut
803 $chars{0xc969}=0x00ef; # i with umlaut
804 $chars{0xc96f}=0x00f6; # o with umlaut
805 $chars{0xc975}=0x00fc; # u with umlaut
807 $chars{0xca41}=0x00c5; # capital a with ring above
808 $chars{0xcaad}=0x016e; # capital u with ring above
809 $chars{0xca61}=0x00e5; # small a with ring above
810 $chars{0xca75}=0x016f; # small u with ring above
811 $chars{0xca77}=0x1e98; # small w with ring above
812 $chars{0xca79}=0x1e99; # small y with ring above
813 # 4/11 high comma off centre
814 # 4/12 inverted high comma centred
815 # 4/13 double acute accent
816 $chars{0xcd4f}=0x0150; # capital o with double acute
817 $chars{0xcd55}=0x0170; # capital u with double acute
818 $chars{0xcd6f}=0x0151; # small o with double acute
819 $chars{0xcd75}=0x0171; # small u with double acute
821 $chars{0xce54}=0x01a0; # latin capital letter o with horn
822 $chars{0xce55}=0x01af; # latin capital letter u with horn
823 $chars{0xce74}=0x01a1; # latin small letter o with horn
824 $chars{0xce75}=0x01b0; # latin small letter u with horn
826 $chars{0xcf41}=0x01cd; # capital a with caron
827 $chars{0xcf43}=0x010c; # capital c with caron
828 $chars{0xcf44}=0x010e; # capital d with caron
829 $chars{0xcf45}=0x011a; # capital e with caron
830 $chars{0xcf47}=0x01e6; # capital g with caron
831 $chars{0xcf49}=0x01cf; # capital i with caron
832 $chars{0xcf4b}=0x01e8; # capital k with caron
833 $chars{0xcf4c}=0x013d; # capital l with caron
834 $chars{0xcf4e}=0x0147; # capital n with caron
835 $chars{0xcf4f}=0x01d1; # capital o with caron
836 $chars{0xcf52}=0x0158; # capital r with caron
837 $chars{0xcf53}=0x0160; # capital s with caron
838 $chars{0xcf54}=0x0164; # capital t with caron
839 $chars{0xcf55}=0x01d3; # capital u with caron
840 $chars{0xcf5a}=0x017d; # capital z with caron
841 $chars{0xcf61}=0x01ce; # small a with caron
842 $chars{0xcf63}=0x010d; # small c with caron
843 $chars{0xcf64}=0x010f; # small d with caron
844 $chars{0xcf65}=0x011b; # small e with caron
845 $chars{0xcf67}=0x01e7; # small g with caron
846 $chars{0xcf69}=0x01d0; # small i with caron
847 $chars{0xcf6a}=0x01f0; # small j with caron
848 $chars{0xcf6b}=0x01e9; # small k with caron
849 $chars{0xcf6c}=0x013e; # small l with caron
850 $chars{0xcf6e}=0x0148; # small n with caron
851 $chars{0xcf6f}=0x01d2; # small o with caron
852 $chars{0xcf72}=0x0159; # small r with caron
853 $chars{0xcf73}=0x0161; # small s with caron
854 $chars{0xcf74}=0x0165; # small t with caron
855 $chars{0xcf75}=0x01d4; # small u with caron
856 $chars{0xcf7a}=0x017e; # small z with caron
858 $chars{0xd020}=0x00b8; # cedilla
859 $chars{0xd043}=0x00c7; # capital c with cedilla
860 $chars{0xd044}=0x1e10; # capital d with cedilla
861 $chars{0xd047}=0x0122; # capital g with cedilla
862 $chars{0xd048}=0x1e28; # capital h with cedilla
863 $chars{0xd04b}=0x0136; # capital k with cedilla
864 $chars{0xd04c}=0x013b; # capital l with cedilla
865 $chars{0xd04e}=0x0145; # capital n with cedilla
866 $chars{0xd052}=0x0156; # capital r with cedilla
867 $chars{0xd053}=0x015e; # capital s with cedilla
868 $chars{0xd054}=0x0162; # capital t with cedilla
869 $chars{0xd063}=0x00e7; # small c with cedilla
870 $chars{0xd064}=0x1e11; # small d with cedilla
871 $chars{0xd065}=0x0119; # small e with cedilla
872 $chars{0xd067}=0x0123; # small g with cedilla
873 $chars{0xd068}=0x1e29; # small h with cedilla
874 $chars{0xd06b}=0x0137; # small k with cedilla
875 $chars{0xd06c}=0x013c; # small l with cedilla
876 $chars{0xd06e}=0x0146; # small n with cedilla
877 $chars{0xd072}=0x0157; # small r with cedilla
878 $chars{0xd073}=0x015f; # small s with cedilla
879 $chars{0xd074}=0x0163; # small t with cedilla
882 # 5/3 ogonek (hook to right
883 $chars{0xd320}=0x02db; # ogonek
884 $chars{0xd341}=0x0104; # capital a with ogonek
885 $chars{0xd345}=0x0118; # capital e with ogonek
886 $chars{0xd349}=0x012e; # capital i with ogonek
887 $chars{0xd34f}=0x01ea; # capital o with ogonek
888 $chars{0xd355}=0x0172; # capital u with ogonek
889 $chars{0xd361}=0x0105; # small a with ogonek
890 $chars{0xd365}=0x0119; # small e with ogonek
891 $chars{0xd369}=0x012f; # small i with ogonek
892 $chars{0xd36f}=0x01eb; # small o with ogonek
893 $chars{0xd375}=0x0173; # small u with ogonek
895 $chars{0xd441}=0x1e00; # capital a with ring below
896 $chars{0xd461}=0x1e01; # small a with ring below
897 # 5/5 half circle below
898 $chars{0xf948}=0x1e2a; # capital h with breve below
899 $chars{0xf968}=0x1e2b; # small h with breve below
901 $chars{0xd641}=0x1ea0; # capital a with dot below
902 $chars{0xd642}=0x1e04; # capital b with dot below
903 $chars{0xd644}=0x1e0c; # capital d with dot below
904 $chars{0xd645}=0x1eb8; # capital e with dot below
905 $chars{0xd648}=0x1e24; # capital h with dot below
906 $chars{0xd649}=0x1eca; # capital i with dot below
907 $chars{0xd64b}=0x1e32; # capital k with dot below
908 $chars{0xd64c}=0x1e36; # capital l with dot below
909 $chars{0xd64d}=0x1e42; # capital m with dot below
910 $chars{0xd64e}=0x1e46; # capital n with dot below
911 $chars{0xd64f}=0x1ecc; # capital o with dot below
912 $chars{0xd652}=0x1e5a; # capital r with dot below
913 $chars{0xd653}=0x1e62; # capital s with dot below
914 $chars{0xd654}=0x1e6c; # capital t with dot below
915 $chars{0xd655}=0x1ee4; # capital u with dot below
916 $chars{0xd656}=0x1e7e; # capital v with dot below
917 $chars{0xd657}=0x1e88; # capital w with dot below
918 $chars{0xd659}=0x1ef4; # capital y with dot below
919 $chars{0xd65a}=0x1e92; # capital z with dot below
920 $chars{0xd661}=0x1ea1; # small a with dot below
921 $chars{0xd662}=0x1e05; # small b with dot below
922 $chars{0xd664}=0x1e0d; # small d with dot below
923 $chars{0xd665}=0x1eb9; # small e with dot below
924 $chars{0xd668}=0x1e25; # small h with dot below
925 $chars{0xd669}=0x1ecb; # small i with dot below
926 $chars{0xd66b}=0x1e33; # small k with dot below
927 $chars{0xd66c}=0x1e37; # small l with dot below
928 $chars{0xd66d}=0x1e43; # small m with dot below
929 $chars{0xd66e}=0x1e47; # small n with dot below
930 $chars{0xd66f}=0x1ecd; # small o with dot below
931 $chars{0xd672}=0x1e5b; # small r with dot below
932 $chars{0xd673}=0x1e63; # small s with dot below
933 $chars{0xd674}=0x1e6d; # small t with dot below
934 $chars{0xd675}=0x1ee5; # small u with dot below
935 $chars{0xd676}=0x1e7f; # small v with dot below
936 $chars{0xd677}=0x1e89; # small w with dot below
937 $chars{0xd679}=0x1ef5; # small y with dot below
938 $chars{0xd67a}=0x1e93; # small z with dot below
939 # 5/7 double dot below
940 $chars{0xd755}=0x1e72; # capital u with diaeresis below
941 $chars{0xd775}=0x1e73; # small u with diaeresis below
943 $chars{0xd820}=0x005f; # underline
944 # 5/9 double underline
945 $chars{0xd920}=0x2017; # double underline
946 # 5/10 small low vertical bar
947 $chars{0xda20}=0x02cc; #
948 # 5/11 circumflex below
949 # 5/12 (this position shall not be used)
950 # 5/13 left half of ligature sign and of double tilde
951 # 5/14 right half of ligature sign
952 # 5/15 right half of double tilde
953 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
954 my @data = unpack("C*", $string);
956 my $length=scalar(@data);
957 for (my $i = 0; $i < scalar(@data); $i++) {
959 if ($char >= 0x00 && $char <= 0x7F){
962 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
963 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
966 if ($chars{$char*256+$data[$i+1]}) {
967 $convchar= $chars{$char * 256 + $data[$i+1]};
969 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
970 } elsif ($chars{$char}) {
971 $convchar= $chars{$char};
972 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
976 push @characters,$convchar;
980 $convchar= $chars{$char};
981 # printf "char %x, converted %x\n",$char,$chars{$char};
983 # printf "char %x $char\n",$char;
986 push @characters,$convchar;
989 $result=pack "U*",@characters;
995 $result=~s/\x1b\x5b//;
996 # map{printf "%x",$_} @characters;
1006 Koha Development Team <info@koha.org>
1008 Galen Charlton <galen.charlton@liblime.com>