3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along with
17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
18 # Suite 330, Boston, MA 02111-1307 USA
23 use MARC::Charset qw/marc8_to_utf8/;
26 use Unicode::Normalize;
28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
31 # set the version for version checking
46 C4::Charset - utilities for handling character set conversions.
54 This module contains routines for dealing with character set
55 conversions, particularly for MARC records.
57 A variety of character encodings are in use by various MARC
58 standards, and even more character encodings are used by
59 non-standard MARC records. The various MARC formats generally
60 do not do a good job of advertising a given record's character
61 encoding, and even when a record does advertise its encoding,
62 e.g., via the Leader/09, experience has shown that one cannot
65 Ultimately, all MARC records are stored in Koha in UTF-8 and
66 must be converted from whatever the source character encoding is.
67 The goal of this module is to ensure that these conversions
68 take place accurately. When a character conversion cannot take
69 place, or at least not accurately, the module was provide
70 enough information to allow user-facing code to inform the user
71 on how to deal with the situation.
77 =head2 IsStringUTF8ish
81 my $is_utf8 = IsStringUTF8ish($str);
85 Determines if C<$str> is valid UTF-8. This can mean
92 The Perl UTF-8 flag is set and the string contains valid UTF-8.
96 The Perl UTF-8 flag is B<not> set, but the octets contain
101 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
102 because in one could be presented with a MARC blob that is
103 not actually in UTF-8 but whose sequence of octets appears to be
104 valid UTF-8. The rest of the MARC character conversion functions
105 will assume that this situation occur does not very often.
109 sub IsStringUTF8ish {
112 return 1 if utf8::is_utf8($str);
113 return utf8::decode($str);
120 my $marc_record = SetUTF8Flag($marc_record);
124 This function sets the PERL UTF8 flag for data.
125 It is required when using new_from_usmarc
126 since MARC::File::USMARC does not handle PERL UTF8 setting.
127 When editing unicode marc records fields and subfields, you
128 would end up in double encoding without using this function.
131 In my opinion, this function belongs to MARC::Record and not
133 But since it handles charset, and MARC::Record, it finds its way in that package
139 return unless ($record && $record->fields());
140 foreach my $field ($record->fields()){
141 if ($field->tag()>=10){
143 foreach my $subfield ($field->subfields()){
144 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
146 my $newfield=MARC::Field->new(
148 $field->indicator(1),
149 $field->indicator(2),
152 $field->replace_with($newfield);
157 =head2 NormalizeString
161 my $normalized_string=NormalizeString($string);
166 nfc : If you want to set NFC and not NFD
167 transform : If you expect all the signs to be removed
168 Sets the PERL UTF8 Flag on your initial data if need be
169 and applies cleaning if required
171 Returns a utf8 NFD normalized string
174 my $string=NormalizeString ("l'ornithoptère");
175 #results into ornithoptère in NFD form and sets UTF8 Flag
179 my ($string,$nfc,$transform)=@_;
180 utf8::decode($string) unless (utf8::is_utf8($string));
182 $string= NFD($string);
185 $string=NFC($string);
188 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
189 #removing one letter words "d'" "l'" was changed into "d " "l "
190 $string=~s/\b\S\b//g;
196 =head2 MarcToUTF8Record
200 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
204 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
205 optional source encoding, return a C<MARC::Record> that is
208 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
209 is not guaranteed to have been converted correctly. Specifically,
210 if C<$converted_from> is 'failed', the MARC record returned failed
211 character conversion and had each of its non-ASCII octets changed
212 to the Unicode replacement character.
214 If the source encoding was not specified, this routine will
215 try to guess it; the character encoding used for a successful
216 conversion is returned in C<$converted_from>.
220 sub MarcToUTF8Record {
222 my $marc_flavour = shift;
223 my $source_encoding = shift;
225 my $marc_blob_is_utf8 = 0;
226 if (ref($marc) eq 'MARC::Record') {
227 my $marc_blob = $marc->as_usmarc();
228 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
229 $marc_record = $marc;
231 # dealing with a MARC blob
233 # remove any ersatz whitespace from the beginning and
234 # end of the MARC blob -- these can creep into MARC
235 # files produced by several sources -- caller really
236 # should be doing this, however
239 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
241 $marc_record = MARC::Record->new_from_usmarc($marc);
244 # if we fail the first time, one likely problem
245 # is that we have a MARC21 record that says that it's
246 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
247 # We'll try parsing it again.
248 substr($marc, 9, 1) = ' ';
250 $marc_record = MARC::Record->new_from_usmarc($marc);
253 # it's hopeless; return an empty MARC::Record
254 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
259 # If we do not know the source encoding, try some guesses
261 # 1. Record is UTF-8 already.
262 # 2. If MARC flavor is MARC21, then
263 # a. record is MARC-8
264 # b. record is ISO-8859-1
265 # 3. If MARC flavor is UNIMARC, then
266 if (not defined $source_encoding) {
267 if ($marc_blob_is_utf8) {
268 # note that for MARC21 we are not bothering to check
269 # if the Leader/09 is set to 'a' or not -- because
270 # of problems with various ILSs (including Koha in the
271 # past, alas), this just is not trustworthy.
272 SetMarcUnicodeFlag($marc_record, $marc_flavour);
273 return $marc_record, 'UTF-8', [];
275 if ($marc_flavour eq 'MARC21') {
276 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
277 } elsif ($marc_flavour =~/UNIMARC/) {
278 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
280 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
284 # caller knows the character encoding
285 my $original_marc_record = $marc_record->clone();
287 if ($source_encoding =~ /utf-?8/i) {
288 if ($marc_blob_is_utf8) {
289 SetMarcUnicodeFlag($marc_record, $marc_flavour);
290 return $marc_record, 'UTF-8', [];
292 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
294 } elsif ($source_encoding =~ /marc-?8/i) {
295 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
296 } elsif ($source_encoding =~ /5426/) {
297 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
299 # assume any other character encoding is for Text::Iconv
300 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
304 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
305 return $original_marc_record, 'failed', \@errors;
307 return $marc_record, $source_encoding, [];
313 =head2 SetMarcUnicodeFlag
317 SetMarcUnicodeFlag($marc_record, $marc_flavour);
321 Set both the internal MARC::Record encoding flag
322 and the appropriate Leader/09 (MARC21) or
323 100/26-29 (UNIMARC) to indicate that the record
324 is in UTF-8. Note that this does B<not> do
325 any actual character conversion.
329 sub SetMarcUnicodeFlag {
330 my $marc_record = shift;
331 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
333 $marc_record->encoding('UTF-8');
334 if ($marc_flavour eq 'MARC21') {
335 my $leader = $marc_record->leader();
336 substr($leader, 9, 1) = 'a';
337 $marc_record->leader($leader);
338 } elsif ($marc_flavour =~/UNIMARC/) {
340 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
341 $string=$marc_record->subfield( 100, "a" );
342 if (defined $string && length($string)==$subflength) {
343 $string = substr $string, 0,$subflength if (length($string)>$subflength);
346 $string = POSIX::strftime( "%Y%m%d", localtime );
348 $string = sprintf( "%-*s", $subflength, $string );
350 substr( $string, $encodingposition, 8, "frey50 " );
351 if ( $marc_record->subfield( 100, "a" ) ) {
352 $marc_record->field('100')->update(a=>$string);
355 $marc_record->insert_grouped_field(
356 MARC::Field->new( 100, '', '', "a" => $string ) );
358 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
360 warn "Unrecognized marcflavour: $marc_flavour";
364 =head2 StripNonXmlChars
368 my $new_str = StripNonXmlChars($old_str);
372 Given a string, return a copy with the
373 characters that are illegal in XML
376 This function exists to work around a problem
377 that can occur with badly-encoded MARC records.
378 Specifically, if a UTF-8 MARC record also
379 has excape (\x1b) characters, MARC::File::XML
380 will let the escape characters pass through
381 when as_xml() or as_xml_record() is called. The
382 problem is that the escape character is not
383 legal in well-formed XML documents, so when
384 MARC::File::XML attempts to parse such a record,
385 the XML parser will fail.
387 Stripping such characters will allow a
388 MARC::Record->new_from_xml()
389 to work, at the possible risk of some data loss.
393 sub StripNonXmlChars {
395 if (!defined($str) || $str eq ""){
398 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
402 =head1 INTERNAL FUNCTIONS
404 =head2 _default_marc21_charconv_to_utf8
408 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
412 Converts a C<MARC::Record> of unknown character set to UTF-8,
413 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
414 to UTF-8, then a default conversion that replaces each non-ASCII
415 character with the replacement character.
417 The C<$guessed_charset> return value contains the character set
418 that resulted in a conversion to valid UTF-8; note that
419 if the MARC-8 and ISO-8859-1 conversions failed, the value of
424 sub _default_marc21_charconv_to_utf8 {
425 my $marc_record = shift;
426 my $marc_flavour = shift;
428 my $trial_marc8 = $marc_record->clone();
430 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
432 return $trial_marc8, 'MARC-8', [];
434 push @all_errors, @errors;
436 my $trial_8859_1 = $marc_record->clone();
437 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
439 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
440 # instead if we wanted to report details
441 # of the failed attempt at MARC-8 => UTF-8
443 push @all_errors, @errors;
445 my $default_converted = $marc_record->clone();
446 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
447 return $default_converted, 'failed', \@all_errors;
450 =head2 _default_unimarc_charconv_to_utf8
454 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
458 Converts a C<MARC::Record> of unknown character set to UTF-8,
459 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
460 to UTF-8, then a default conversion that replaces each non-ASCII
461 character with the replacement character.
463 The C<$guessed_charset> return value contains the character set
464 that resulted in a conversion to valid UTF-8; note that
465 if the MARC-8 and ISO-8859-1 conversions failed, the value of
470 sub _default_unimarc_charconv_to_utf8 {
471 my $marc_record = shift;
472 my $marc_flavour = shift;
474 my $trial_marc8 = $marc_record->clone();
476 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
478 return $trial_marc8, 'iso-5426';
480 push @all_errors, @errors;
482 my $trial_8859_1 = $marc_record->clone();
483 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
485 return $trial_8859_1, 'iso-8859-1';
487 push @all_errors, @errors;
489 my $default_converted = $marc_record->clone();
490 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
491 return $default_converted, 'failed', \@all_errors;
494 =head2 _marc_marc8_to_utf8
498 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
502 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
503 If the conversion fails for some reason, an
504 appropriate messages will be placed in the returned
509 sub _marc_marc8_to_utf8 {
510 my $marc_record = shift;
511 my $marc_flavour = shift;
513 my $prev_ignore = MARC::Charset->ignore_errors();
514 MARC::Charset->ignore_errors(1);
516 # trap warnings raised by MARC::Charset
518 local $SIG{__WARN__} = sub {
520 if ($msg =~ /MARC.Charset/) {
521 # FIXME - purpose of this regexp is to strip out the
522 # line reference to MARC/Charset.pm, but as it
523 # exists probably won't work quite on Windows --
524 # some sort of minimal-bunch back-tracking RE
525 # would be helpful here
526 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
529 # if warning doesn't come from MARC::Charset, just
535 foreach my $field ($marc_record->fields()) {
536 if ($field->is_control_field()) {
537 ; # do nothing -- control fields should not contain non-ASCII characters
539 my @converted_subfields;
540 foreach my $subfield ($field->subfields()) {
541 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
542 unless (IsStringUTF8ish($utf8sf)) {
543 # Because of a bug in MARC::Charset 0.98, if the string
544 # has (a) one or more diacritics that (b) are only in character positions
545 # 128 to 255 inclusive, the resulting converted string is not in
546 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
547 # occurs, upgrade the string in place. Moral of the story seems to be
548 # that pack("U", ...) is better than chr(...) if you need to guarantee
549 # that the resulting string is UTF-8.
550 utf8::upgrade($utf8sf);
552 push @converted_subfields, $subfield->[0], $utf8sf;
555 $field->replace_with(MARC::Field->new(
556 $field->tag(), $field->indicator(1), $field->indicator(2),
557 @converted_subfields)
562 MARC::Charset->ignore_errors($prev_ignore);
564 SetMarcUnicodeFlag($marc_record, $marc_flavour);
569 =head2 _marc_iso5426_to_utf8
573 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
577 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
578 If the conversion fails for some reason, an
579 appropriate messages will be placed in the returned
582 FIXME - is ISO-5426 equivalent enough to MARC-8
583 that C<MARC::Charset> can be used instead?
587 sub _marc_iso5426_to_utf8 {
588 my $marc_record = shift;
589 my $marc_flavour = shift;
593 foreach my $field ($marc_record->fields()) {
594 if ($field->is_control_field()) {
595 ; # do nothing -- control fields should not contain non-ASCII characters
597 my @converted_subfields;
598 foreach my $subfield ($field->subfields()) {
599 my $utf8sf = char_decode5426($subfield->[1]);
600 push @converted_subfields, $subfield->[0], $utf8sf;
603 $field->replace_with(MARC::Field->new(
604 $field->tag(), $field->indicator(1), $field->indicator(2),
605 @converted_subfields)
610 SetMarcUnicodeFlag($marc_record, $marc_flavour);
615 =head2 _marc_to_utf8_via_text_iconv
619 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
623 Convert a C<MARC::Record> to UTF-8 in-place using the
624 C<Text::Iconv> CPAN module. Any source encoding accepted
625 by the user's iconv installation should work. If
626 the source encoding is not recognized on the user's
627 server or the conversion fails for some reason,
628 appropriate messages will be placed in the returned
633 sub _marc_to_utf8_via_text_iconv {
634 my $marc_record = shift;
635 my $marc_flavour = shift;
636 my $source_encoding = shift;
640 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
642 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
646 my $prev_raise_error = Text::Iconv->raise_error();
647 Text::Iconv->raise_error(1);
649 foreach my $field ($marc_record->fields()) {
650 if ($field->is_control_field()) {
651 ; # do nothing -- control fields should not contain non-ASCII characters
653 my @converted_subfields;
654 foreach my $subfield ($field->subfields()) {
656 my $conversion_ok = 1;
657 eval { $converted_value = $decoder->convert($subfield->[1]); };
661 } elsif (not defined $converted_value) {
663 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
666 if ($conversion_ok) {
667 push @converted_subfields, $subfield->[0], $converted_value;
669 $converted_value = $subfield->[1];
670 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
671 push @converted_subfields, $subfield->[0], $converted_value;
675 $field->replace_with(MARC::Field->new(
676 $field->tag(), $field->indicator(1), $field->indicator(2),
677 @converted_subfields)
682 SetMarcUnicodeFlag($marc_record, $marc_flavour);
683 Text::Iconv->raise_error($prev_raise_error);
688 =head2 _marc_to_utf8_replacement_char
692 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
696 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
697 unsatisfactory method of replacing all non-ASCII (e.g.,
698 where the eight bit is set) octet with the Unicode
699 replacement character. This is meant as a last-ditch
700 method, and would be best used as part of a UI that
701 lets a cataloguer pick various character conversions
702 until he or she finds the right one.
706 sub _marc_to_utf8_replacement_char {
707 my $marc_record = shift;
708 my $marc_flavour = shift;
710 foreach my $field ($marc_record->fields()) {
711 if ($field->is_control_field()) {
712 ; # do nothing -- control fields should not contain non-ASCII characters
714 my @converted_subfields;
715 foreach my $subfield ($field->subfields()) {
716 my $value = $subfield->[1];
717 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
718 push @converted_subfields, $subfield->[0], $value;
721 $field->replace_with(MARC::Field->new(
722 $field->tag(), $field->indicator(1), $field->indicator(2),
723 @converted_subfields)
728 SetMarcUnicodeFlag($marc_record, $marc_flavour);
731 =head2 char_decode5426
735 my $utf8string = char_decode5426($iso_5426_string);
739 Converts a string from ISO-5426 to UTF-8.
745 $chars{0xb0}=0x0101;#3/0ayn[ain]
746 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
747 #$chars{0xb2}=0x00e0;#'à';
748 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
749 #$chars{0xb3}=0x00e7;#'ç';
750 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
755 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
756 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
757 $chars{0xfa}=0x0153;#oe
758 $chars{0x81d1}=0x00b0;
761 ## combined characters iso5426
763 $chars{0xc041}=0x1ea2; # capital a with hook above
764 $chars{0xc045}=0x1eba; # capital e with hook above
765 $chars{0xc049}=0x1ec8; # capital i with hook above
766 $chars{0xc04f}=0x1ece; # capital o with hook above
767 $chars{0xc055}=0x1ee6; # capital u with hook above
768 $chars{0xc059}=0x1ef6; # capital y with hook above
769 $chars{0xc061}=0x1ea3; # small a with hook above
770 $chars{0xc065}=0x1ebb; # small e with hook above
771 $chars{0xc069}=0x1ec9; # small i with hook above
772 $chars{0xc06f}=0x1ecf; # small o with hook above
773 $chars{0xc075}=0x1ee7; # small u with hook above
774 $chars{0xc079}=0x1ef7; # small y with hook above
777 $chars{0xc141}=0x00c0; # capital a with grave accent
778 $chars{0xc145}=0x00c8; # capital e with grave accent
779 $chars{0xc149}=0x00cc; # capital i with grave accent
780 $chars{0xc14f}=0x00d2; # capital o with grave accent
781 $chars{0xc155}=0x00d9; # capital u with grave accent
782 $chars{0xc157}=0x1e80; # capital w with grave
783 $chars{0xc159}=0x1ef2; # capital y with grave
784 $chars{0xc161}=0x00e0; # small a with grave accent
785 $chars{0xc165}=0x00e8; # small e with grave accent
786 $chars{0xc169}=0x00ec; # small i with grave accent
787 $chars{0xc16f}=0x00f2; # small o with grave accent
788 $chars{0xc175}=0x00f9; # small u with grave accent
789 $chars{0xc177}=0x1e81; # small w with grave
790 $chars{0xc179}=0x1ef3; # small y with grave
792 $chars{0xc241}=0x00c1; # capital a with acute accent
793 $chars{0xc243}=0x0106; # capital c with acute accent
794 $chars{0xc245}=0x00c9; # capital e with acute accent
795 $chars{0xc247}=0x01f4; # capital g with acute
796 $chars{0xc249}=0x00cd; # capital i with acute accent
797 $chars{0xc24b}=0x1e30; # capital k with acute
798 $chars{0xc24c}=0x0139; # capital l with acute accent
799 $chars{0xc24d}=0x1e3e; # capital m with acute
800 $chars{0xc24e}=0x0143; # capital n with acute accent
801 $chars{0xc24f}=0x00d3; # capital o with acute accent
802 $chars{0xc250}=0x1e54; # capital p with acute
803 $chars{0xc252}=0x0154; # capital r with acute accent
804 $chars{0xc253}=0x015a; # capital s with acute accent
805 $chars{0xc255}=0x00da; # capital u with acute accent
806 $chars{0xc257}=0x1e82; # capital w with acute
807 $chars{0xc259}=0x00dd; # capital y with acute accent
808 $chars{0xc25a}=0x0179; # capital z with acute accent
809 $chars{0xc261}=0x00e1; # small a with acute accent
810 $chars{0xc263}=0x0107; # small c with acute accent
811 $chars{0xc265}=0x00e9; # small e with acute accent
812 $chars{0xc267}=0x01f5; # small g with acute
813 $chars{0xc269}=0x00ed; # small i with acute accent
814 $chars{0xc26b}=0x1e31; # small k with acute
815 $chars{0xc26c}=0x013a; # small l with acute accent
816 $chars{0xc26d}=0x1e3f; # small m with acute
817 $chars{0xc26e}=0x0144; # small n with acute accent
818 $chars{0xc26f}=0x00f3; # small o with acute accent
819 $chars{0xc270}=0x1e55; # small p with acute
820 $chars{0xc272}=0x0155; # small r with acute accent
821 $chars{0xc273}=0x015b; # small s with acute accent
822 $chars{0xc275}=0x00fa; # small u with acute accent
823 $chars{0xc277}=0x1e83; # small w with acute
824 $chars{0xc279}=0x00fd; # small y with acute accent
825 $chars{0xc27a}=0x017a; # small z with acute accent
826 $chars{0xc2e1}=0x01fc; # capital ae with acute
827 $chars{0xc2f1}=0x01fd; # small ae with acute
828 # 4/3 circumflex accent
829 $chars{0xc341}=0x00c2; # capital a with circumflex accent
830 $chars{0xc343}=0x0108; # capital c with circumflex
831 $chars{0xc345}=0x00ca; # capital e with circumflex accent
832 $chars{0xc347}=0x011c; # capital g with circumflex
833 $chars{0xc348}=0x0124; # capital h with circumflex
834 $chars{0xc349}=0x00ce; # capital i with circumflex accent
835 $chars{0xc34a}=0x0134; # capital j with circumflex
836 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
837 $chars{0xc353}=0x015c; # capital s with circumflex
838 $chars{0xc355}=0x00db; # capital u with circumflex
839 $chars{0xc357}=0x0174; # capital w with circumflex
840 $chars{0xc359}=0x0176; # capital y with circumflex
841 $chars{0xc35a}=0x1e90; # capital z with circumflex
842 $chars{0xc361}=0x00e2; # small a with circumflex accent
843 $chars{0xc363}=0x0109; # small c with circumflex
844 $chars{0xc365}=0x00ea; # small e with circumflex accent
845 $chars{0xc367}=0x011d; # small g with circumflex
846 $chars{0xc368}=0x0125; # small h with circumflex
847 $chars{0xc369}=0x00ee; # small i with circumflex accent
848 $chars{0xc36a}=0x0135; # small j with circumflex
849 $chars{0xc36e}=0x00f1; # small n with tilde
850 $chars{0xc36f}=0x00f4; # small o with circumflex accent
851 $chars{0xc373}=0x015d; # small s with circumflex
852 $chars{0xc375}=0x00fb; # small u with circumflex
853 $chars{0xc377}=0x0175; # small w with circumflex
854 $chars{0xc379}=0x0177; # small y with circumflex
855 $chars{0xc37a}=0x1e91; # small z with circumflex
857 $chars{0xc441}=0x00c3; # capital a with tilde
858 $chars{0xc445}=0x1ebc; # capital e with tilde
859 $chars{0xc449}=0x0128; # capital i with tilde
860 $chars{0xc44e}=0x00d1; # capital n with tilde
861 $chars{0xc44f}=0x00d5; # capital o with tilde
862 $chars{0xc455}=0x0168; # capital u with tilde
863 $chars{0xc456}=0x1e7c; # capital v with tilde
864 $chars{0xc459}=0x1ef8; # capital y with tilde
865 $chars{0xc461}=0x00e3; # small a with tilde
866 $chars{0xc465}=0x1ebd; # small e with tilde
867 $chars{0xc469}=0x0129; # small i with tilde
868 $chars{0xc46e}=0x00f1; # small n with tilde
869 $chars{0xc46f}=0x00f5; # small o with tilde
870 $chars{0xc475}=0x0169; # small u with tilde
871 $chars{0xc476}=0x1e7d; # small v with tilde
872 $chars{0xc479}=0x1ef9; # small y with tilde
874 $chars{0xc541}=0x0100; # capital a with macron
875 $chars{0xc545}=0x0112; # capital e with macron
876 $chars{0xc547}=0x1e20; # capital g with macron
877 $chars{0xc549}=0x012a; # capital i with macron
878 $chars{0xc54f}=0x014c; # capital o with macron
879 $chars{0xc555}=0x016a; # capital u with macron
880 $chars{0xc561}=0x0101; # small a with macron
881 $chars{0xc565}=0x0113; # small e with macron
882 $chars{0xc567}=0x1e21; # small g with macron
883 $chars{0xc569}=0x012b; # small i with macron
884 $chars{0xc56f}=0x014d; # small o with macron
885 $chars{0xc575}=0x016b; # small u with macron
886 $chars{0xc572}=0x0159; # small r with macron
887 $chars{0xc5e1}=0x01e2; # capital ae with macron
888 $chars{0xc5f1}=0x01e3; # small ae with macron
890 $chars{0xc641}=0x0102; # capital a with breve
891 $chars{0xc645}=0x0114; # capital e with breve
892 $chars{0xc647}=0x011e; # capital g with breve
893 $chars{0xc649}=0x012c; # capital i with breve
894 $chars{0xc64f}=0x014e; # capital o with breve
895 $chars{0xc655}=0x016c; # capital u with breve
896 $chars{0xc661}=0x0103; # small a with breve
897 $chars{0xc665}=0x0115; # small e with breve
898 $chars{0xc667}=0x011f; # small g with breve
899 $chars{0xc669}=0x012d; # small i with breve
900 $chars{0xc66f}=0x014f; # small o with breve
901 $chars{0xc675}=0x016d; # small u with breve
903 $chars{0xc7b0}=0x01e1; # Ain with dot above
904 $chars{0xc742}=0x1e02; # capital b with dot above
905 $chars{0xc743}=0x010a; # capital c with dot above
906 $chars{0xc744}=0x1e0a; # capital d with dot above
907 $chars{0xc745}=0x0116; # capital e with dot above
908 $chars{0xc746}=0x1e1e; # capital f with dot above
909 $chars{0xc747}=0x0120; # capital g with dot above
910 $chars{0xc748}=0x1e22; # capital h with dot above
911 $chars{0xc749}=0x0130; # capital i with dot above
912 $chars{0xc74d}=0x1e40; # capital m with dot above
913 $chars{0xc74e}=0x1e44; # capital n with dot above
914 $chars{0xc750}=0x1e56; # capital p with dot above
915 $chars{0xc752}=0x1e58; # capital r with dot above
916 $chars{0xc753}=0x1e60; # capital s with dot above
917 $chars{0xc754}=0x1e6a; # capital t with dot above
918 $chars{0xc757}=0x1e86; # capital w with dot above
919 $chars{0xc758}=0x1e8a; # capital x with dot above
920 $chars{0xc759}=0x1e8e; # capital y with dot above
921 $chars{0xc75a}=0x017b; # capital z with dot above
922 $chars{0xc761}=0x0227; # small b with dot above
923 $chars{0xc762}=0x1e03; # small b with dot above
924 $chars{0xc763}=0x010b; # small c with dot above
925 $chars{0xc764}=0x1e0b; # small d with dot above
926 $chars{0xc765}=0x0117; # small e with dot above
927 $chars{0xc766}=0x1e1f; # small f with dot above
928 $chars{0xc767}=0x0121; # small g with dot above
929 $chars{0xc768}=0x1e23; # small h with dot above
930 $chars{0xc76d}=0x1e41; # small m with dot above
931 $chars{0xc76e}=0x1e45; # small n with dot above
932 $chars{0xc770}=0x1e57; # small p with dot above
933 $chars{0xc772}=0x1e59; # small r with dot above
934 $chars{0xc773}=0x1e61; # small s with dot above
935 $chars{0xc774}=0x1e6b; # small t with dot above
936 $chars{0xc777}=0x1e87; # small w with dot above
937 $chars{0xc778}=0x1e8b; # small x with dot above
938 $chars{0xc779}=0x1e8f; # small y with dot above
939 $chars{0xc77a}=0x017c; # small z with dot above
940 # 4/8 trema, diaresis
941 $chars{0xc820}=0x00a8; # diaeresis
942 $chars{0xc841}=0x00c4; # capital a with diaeresis
943 $chars{0xc845}=0x00cb; # capital e with diaeresis
944 $chars{0xc848}=0x1e26; # capital h with diaeresis
945 $chars{0xc849}=0x00cf; # capital i with diaeresis
946 $chars{0xc84f}=0x00d6; # capital o with diaeresis
947 $chars{0xc855}=0x00dc; # capital u with diaeresis
948 $chars{0xc857}=0x1e84; # capital w with diaeresis
949 $chars{0xc858}=0x1e8c; # capital x with diaeresis
950 $chars{0xc859}=0x0178; # capital y with diaeresis
951 $chars{0xc861}=0x00e4; # small a with diaeresis
952 $chars{0xc865}=0x00eb; # small e with diaeresis
953 $chars{0xc868}=0x1e27; # small h with diaeresis
954 $chars{0xc869}=0x00ef; # small i with diaeresis
955 $chars{0xc86f}=0x00f6; # small o with diaeresis
956 $chars{0xc874}=0x1e97; # small t with diaeresis
957 $chars{0xc875}=0x00fc; # small u with diaeresis
958 $chars{0xc877}=0x1e85; # small w with diaeresis
959 $chars{0xc878}=0x1e8d; # small x with diaeresis
960 $chars{0xc879}=0x00ff; # small y with diaeresis
962 $chars{0xc920}=0x00a8; # [diaeresis]
963 $chars{0xc961}=0x00e4; # a with umlaut
964 $chars{0xc965}=0x00eb; # e with umlaut
965 $chars{0xc969}=0x00ef; # i with umlaut
966 $chars{0xc96f}=0x00f6; # o with umlaut
967 $chars{0xc975}=0x00fc; # u with umlaut
969 $chars{0xca41}=0x00c5; # capital a with ring above
970 $chars{0xcaad}=0x016e; # capital u with ring above
971 $chars{0xca61}=0x00e5; # small a with ring above
972 $chars{0xca75}=0x016f; # small u with ring above
973 $chars{0xca77}=0x1e98; # small w with ring above
974 $chars{0xca79}=0x1e99; # small y with ring above
975 # 4/11 high comma off centre
976 # 4/12 inverted high comma centred
977 # 4/13 double acute accent
978 $chars{0xcd4f}=0x0150; # capital o with double acute
979 $chars{0xcd55}=0x0170; # capital u with double acute
980 $chars{0xcd6f}=0x0151; # small o with double acute
981 $chars{0xcd75}=0x0171; # small u with double acute
983 $chars{0xce54}=0x01a0; # latin capital letter o with horn
984 $chars{0xce55}=0x01af; # latin capital letter u with horn
985 $chars{0xce74}=0x01a1; # latin small letter o with horn
986 $chars{0xce75}=0x01b0; # latin small letter u with horn
988 $chars{0xcf41}=0x01cd; # capital a with caron
989 $chars{0xcf43}=0x010c; # capital c with caron
990 $chars{0xcf44}=0x010e; # capital d with caron
991 $chars{0xcf45}=0x011a; # capital e with caron
992 $chars{0xcf47}=0x01e6; # capital g with caron
993 $chars{0xcf49}=0x01cf; # capital i with caron
994 $chars{0xcf4b}=0x01e8; # capital k with caron
995 $chars{0xcf4c}=0x013d; # capital l with caron
996 $chars{0xcf4e}=0x0147; # capital n with caron
997 $chars{0xcf4f}=0x01d1; # capital o with caron
998 $chars{0xcf52}=0x0158; # capital r with caron
999 $chars{0xcf53}=0x0160; # capital s with caron
1000 $chars{0xcf54}=0x0164; # capital t with caron
1001 $chars{0xcf55}=0x01d3; # capital u with caron
1002 $chars{0xcf5a}=0x017d; # capital z with caron
1003 $chars{0xcf61}=0x01ce; # small a with caron
1004 $chars{0xcf63}=0x010d; # small c with caron
1005 $chars{0xcf64}=0x010f; # small d with caron
1006 $chars{0xcf65}=0x011b; # small e with caron
1007 $chars{0xcf67}=0x01e7; # small g with caron
1008 $chars{0xcf69}=0x01d0; # small i with caron
1009 $chars{0xcf6a}=0x01f0; # small j with caron
1010 $chars{0xcf6b}=0x01e9; # small k with caron
1011 $chars{0xcf6c}=0x013e; # small l with caron
1012 $chars{0xcf6e}=0x0148; # small n with caron
1013 $chars{0xcf6f}=0x01d2; # small o with caron
1014 $chars{0xcf72}=0x0159; # small r with caron
1015 $chars{0xcf73}=0x0161; # small s with caron
1016 $chars{0xcf74}=0x0165; # small t with caron
1017 $chars{0xcf75}=0x01d4; # small u with caron
1018 $chars{0xcf7a}=0x017e; # small z with caron
1020 $chars{0xd020}=0x00b8; # cedilla
1021 $chars{0xd043}=0x00c7; # capital c with cedilla
1022 $chars{0xd044}=0x1e10; # capital d with cedilla
1023 $chars{0xd047}=0x0122; # capital g with cedilla
1024 $chars{0xd048}=0x1e28; # capital h with cedilla
1025 $chars{0xd04b}=0x0136; # capital k with cedilla
1026 $chars{0xd04c}=0x013b; # capital l with cedilla
1027 $chars{0xd04e}=0x0145; # capital n with cedilla
1028 $chars{0xd052}=0x0156; # capital r with cedilla
1029 $chars{0xd053}=0x015e; # capital s with cedilla
1030 $chars{0xd054}=0x0162; # capital t with cedilla
1031 $chars{0xd063}=0x00e7; # small c with cedilla
1032 $chars{0xd064}=0x1e11; # small d with cedilla
1033 $chars{0xd065}=0x0119; # small e with cedilla
1034 $chars{0xd067}=0x0123; # small g with cedilla
1035 $chars{0xd068}=0x1e29; # small h with cedilla
1036 $chars{0xd06b}=0x0137; # small k with cedilla
1037 $chars{0xd06c}=0x013c; # small l with cedilla
1038 $chars{0xd06e}=0x0146; # small n with cedilla
1039 $chars{0xd072}=0x0157; # small r with cedilla
1040 $chars{0xd073}=0x015f; # small s with cedilla
1041 $chars{0xd074}=0x0163; # small t with cedilla
1044 # 5/3 ogonek (hook to right
1045 $chars{0xd320}=0x02db; # ogonek
1046 $chars{0xd341}=0x0104; # capital a with ogonek
1047 $chars{0xd345}=0x0118; # capital e with ogonek
1048 $chars{0xd349}=0x012e; # capital i with ogonek
1049 $chars{0xd34f}=0x01ea; # capital o with ogonek
1050 $chars{0xd355}=0x0172; # capital u with ogonek
1051 $chars{0xd361}=0x0105; # small a with ogonek
1052 $chars{0xd365}=0x0119; # small e with ogonek
1053 $chars{0xd369}=0x012f; # small i with ogonek
1054 $chars{0xd36f}=0x01eb; # small o with ogonek
1055 $chars{0xd375}=0x0173; # small u with ogonek
1057 $chars{0xd441}=0x1e00; # capital a with ring below
1058 $chars{0xd461}=0x1e01; # small a with ring below
1059 # 5/5 half circle below
1060 $chars{0xf948}=0x1e2a; # capital h with breve below
1061 $chars{0xf968}=0x1e2b; # small h with breve below
1063 $chars{0xd641}=0x1ea0; # capital a with dot below
1064 $chars{0xd642}=0x1e04; # capital b with dot below
1065 $chars{0xd644}=0x1e0c; # capital d with dot below
1066 $chars{0xd645}=0x1eb8; # capital e with dot below
1067 $chars{0xd648}=0x1e24; # capital h with dot below
1068 $chars{0xd649}=0x1eca; # capital i with dot below
1069 $chars{0xd64b}=0x1e32; # capital k with dot below
1070 $chars{0xd64c}=0x1e36; # capital l with dot below
1071 $chars{0xd64d}=0x1e42; # capital m with dot below
1072 $chars{0xd64e}=0x1e46; # capital n with dot below
1073 $chars{0xd64f}=0x1ecc; # capital o with dot below
1074 $chars{0xd652}=0x1e5a; # capital r with dot below
1075 $chars{0xd653}=0x1e62; # capital s with dot below
1076 $chars{0xd654}=0x1e6c; # capital t with dot below
1077 $chars{0xd655}=0x1ee4; # capital u with dot below
1078 $chars{0xd656}=0x1e7e; # capital v with dot below
1079 $chars{0xd657}=0x1e88; # capital w with dot below
1080 $chars{0xd659}=0x1ef4; # capital y with dot below
1081 $chars{0xd65a}=0x1e92; # capital z with dot below
1082 $chars{0xd661}=0x1ea1; # small a with dot below
1083 $chars{0xd662}=0x1e05; # small b with dot below
1084 $chars{0xd664}=0x1e0d; # small d with dot below
1085 $chars{0xd665}=0x1eb9; # small e with dot below
1086 $chars{0xd668}=0x1e25; # small h with dot below
1087 $chars{0xd669}=0x1ecb; # small i with dot below
1088 $chars{0xd66b}=0x1e33; # small k with dot below
1089 $chars{0xd66c}=0x1e37; # small l with dot below
1090 $chars{0xd66d}=0x1e43; # small m with dot below
1091 $chars{0xd66e}=0x1e47; # small n with dot below
1092 $chars{0xd66f}=0x1ecd; # small o with dot below
1093 $chars{0xd672}=0x1e5b; # small r with dot below
1094 $chars{0xd673}=0x1e63; # small s with dot below
1095 $chars{0xd674}=0x1e6d; # small t with dot below
1096 $chars{0xd675}=0x1ee5; # small u with dot below
1097 $chars{0xd676}=0x1e7f; # small v with dot below
1098 $chars{0xd677}=0x1e89; # small w with dot below
1099 $chars{0xd679}=0x1ef5; # small y with dot below
1100 $chars{0xd67a}=0x1e93; # small z with dot below
1101 # 5/7 double dot below
1102 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1103 $chars{0xd775}=0x1e73; # small u with diaeresis below
1105 $chars{0xd820}=0x005f; # underline
1106 # 5/9 double underline
1107 $chars{0xd920}=0x2017; # double underline
1108 # 5/10 small low vertical bar
1109 $chars{0xda20}=0x02cc; #
1110 # 5/11 circumflex below
1111 # 5/12 (this position shall not be used)
1112 # 5/13 left half of ligature sign and of double tilde
1113 # 5/14 right half of ligature sign
1114 # 5/15 right half of double tilde
1115 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1117 sub char_decode5426 {
1121 my @data = unpack("C*", $string);
1123 my $length=scalar(@data);
1124 for (my $i = 0; $i < scalar(@data); $i++) {
1125 my $char= $data[$i];
1126 if ($char >= 0x00 && $char <= 0x7F){
1129 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1130 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1133 if ($chars{$char*256+$data[$i+1]}) {
1134 $convchar= $chars{$char * 256 + $data[$i+1]};
1136 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1137 } elsif ($chars{$char}) {
1138 $convchar= $chars{$char};
1139 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1143 push @characters,$convchar;
1146 if ($chars{$char}) {
1147 $convchar= $chars{$char};
1148 # printf "char %x, converted %x\n",$char,$chars{$char};
1150 # printf "char %x $char\n",$char;
1153 push @characters,$convchar;
1156 $result=pack "U*",@characters;
1157 # $result=~s/\x01//;
1158 # $result=~s/\x00//;
1162 $result=~s/\x1b\x5b//;
1163 # map{printf "%x",$_} @characters;
1173 Koha Development Team <info@koha.org>
1175 Galen Charlton <galen.charlton@liblime.com>