3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with Koha; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 use MARC::Charset qw/marc8_to_utf8/;
27 use Unicode::Normalize;
29 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
32 # set the version for version checking
33 $VERSION = 3.07.00.049;
52 C4::Charset - utilities for handling character set conversions.
60 This module contains routines for dealing with character set
61 conversions, particularly for MARC records.
63 A variety of character encodings are in use by various MARC
64 standards, and even more character encodings are used by
65 non-standard MARC records. The various MARC formats generally
66 do not do a good job of advertising a given record's character
67 encoding, and even when a record does advertise its encoding,
68 e.g., via the Leader/09, experience has shown that one cannot
71 Ultimately, all MARC records are stored in Koha in UTF-8 and
72 must be converted from whatever the source character encoding is.
73 The goal of this module is to ensure that these conversions
74 take place accurately. When a character conversion cannot take
75 place, or at least not accurately, the module was provide
76 enough information to allow user-facing code to inform the user
77 on how to deal with the situation.
83 =head2 IsStringUTF8ish
85 my $is_utf8 = IsStringUTF8ish($str);
87 Determines if C<$str> is valid UTF-8. This can mean
94 The Perl UTF-8 flag is set and the string contains valid UTF-8.
98 The Perl UTF-8 flag is B<not> set, but the octets contain
103 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
104 because in one could be presented with a MARC blob that is
105 not actually in UTF-8 but whose sequence of octets appears to be
106 valid UTF-8. The rest of the MARC character conversion functions
107 will assume that this situation occur does not very often.
111 sub IsStringUTF8ish {
114 return 1 if utf8::is_utf8($str);
115 return utf8::decode($str);
120 my $marc_record = SetUTF8Flag($marc_record, $nfd);
122 This function sets the PERL UTF8 flag for data.
123 It is required when using new_from_usmarc
124 since MARC::File::USMARC does not handle PERL UTF8 setting.
125 When editing unicode marc records fields and subfields, you
126 would end up in double encoding without using this function.
128 If $nfd is set, string normalization will use NFD instead of NFC
131 In my opinion, this function belongs to MARC::Record and not
133 But since it handles charset, and MARC::Record, it finds its way in that package
138 my ($record, $nfd)=@_;
139 return unless ($record && $record->fields());
140 foreach my $field ($record->fields()){
141 if ($field->tag()>=10){
143 foreach my $subfield ($field->subfields()){
144 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
147 my $newfield=MARC::Field->new(
149 $field->indicator(1),
150 $field->indicator(2),
153 $field->replace_with($newfield);
155 warn "ERROR occurred in SetUTF8Flag $@" if $@;
160 =head2 NormalizeString
162 my $normalized_string=NormalizeString($string,$nfd,$transform);
165 nfd : If you want to set NFD and not NFC
166 transform : If you expect all the signs to be removed
168 Sets the PERL UTF8 Flag on your initial data if need be
169 and applies cleaning if required
171 Returns a utf8 NFC normalized string
174 my $string=NormalizeString ("l'ornithoptère");
175 #results into ornithoptère in NFC form and sets UTF8 Flag
181 my ($string,$nfd,$transform)=@_;
182 utf8::decode($string) unless (utf8::is_utf8($string));
184 $string= NFD($string);
187 $string=NFC($string);
190 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
191 #removing one letter words "d'" "l'" was changed into "d " "l "
192 $string=~s/\b\S\b//g;
198 =head2 MarcToUTF8Record
200 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
201 $marc_flavour, [, $source_encoding]);
203 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
204 optional source encoding, return a C<MARC::Record> that is
207 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
208 is not guaranteed to have been converted correctly. Specifically,
209 if C<$converted_from> is 'failed', the MARC record returned failed
210 character conversion and had each of its non-ASCII octets changed
211 to the Unicode replacement character.
213 If the source encoding was not specified, this routine will
214 try to guess it; the character encoding used for a successful
215 conversion is returned in C<$converted_from>.
219 sub MarcToUTF8Record {
221 my $marc_flavour = shift;
222 my $source_encoding = shift;
224 my $marc_blob_is_utf8 = 0;
225 if (ref($marc) eq 'MARC::Record') {
226 my $marc_blob = $marc->as_usmarc();
227 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
228 $marc_record = $marc;
230 # dealing with a MARC blob
232 # remove any ersatz whitespace from the beginning and
233 # end of the MARC blob -- these can creep into MARC
234 # files produced by several sources -- caller really
235 # should be doing this, however
238 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
240 $marc_record = MARC::Record->new_from_usmarc($marc);
243 # if we fail the first time, one likely problem
244 # is that we have a MARC21 record that says that it's
245 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
246 # We'll try parsing it again.
247 substr($marc, 9, 1) = ' ';
249 $marc_record = MARC::Record->new_from_usmarc($marc);
252 # it's hopeless; return an empty MARC::Record
253 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
258 # If we do not know the source encoding, try some guesses
260 # 1. Record is UTF-8 already.
261 # 2. If MARC flavor is MARC21 or NORMARC, then
262 # a. record is MARC-8
263 # b. record is ISO-8859-1
264 # 3. If MARC flavor is UNIMARC, then
265 if (not defined $source_encoding) {
266 if ($marc_blob_is_utf8) {
267 # note that for MARC21/NORMARC we are not bothering to check
268 # if the Leader/09 is set to 'a' or not -- because
269 # of problems with various ILSs (including Koha in the
270 # past, alas), this just is not trustworthy.
271 SetMarcUnicodeFlag($marc_record, $marc_flavour);
272 return $marc_record, 'UTF-8', [];
274 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
275 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
276 } elsif ($marc_flavour =~/UNIMARC/) {
277 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
279 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
283 # caller knows the character encoding
284 my $original_marc_record = $marc_record->clone();
286 if ($source_encoding =~ /utf-?8/i) {
287 if ($marc_blob_is_utf8) {
288 SetMarcUnicodeFlag($marc_record, $marc_flavour);
289 return $marc_record, 'UTF-8', [];
291 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
293 } elsif ($source_encoding =~ /marc-?8/i) {
294 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
295 } elsif ($source_encoding =~ /5426/) {
296 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
298 # assume any other character encoding is for Text::Iconv
299 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
303 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
304 return $original_marc_record, 'failed', \@errors;
306 return $marc_record, $source_encoding, [];
312 =head2 SetMarcUnicodeFlag
314 SetMarcUnicodeFlag($marc_record, $marc_flavour);
316 Set both the internal MARC::Record encoding flag
317 and the appropriate Leader/09 (MARC21) or
318 100/26-29 (UNIMARC) to indicate that the record
319 is in UTF-8. Note that this does B<not> do
320 any actual character conversion.
324 sub SetMarcUnicodeFlag {
325 my $marc_record = shift;
326 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
328 $marc_record->encoding('UTF-8');
329 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
330 my $leader = $marc_record->leader();
331 substr($leader, 9, 1) = 'a';
332 $marc_record->leader($leader);
333 } elsif ($marc_flavour =~/UNIMARC/) {
334 my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
335 $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
337 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
338 $string=$marc_record->subfield( 100, "a" );
339 if (defined $string && length($string)==$subflength) {
340 $string = substr $string, 0,$subflength if (length($string)>$subflength);
343 $string = POSIX::strftime( "%Y%m%d", localtime );
345 $string = sprintf( "%-*s", $subflength, $string );
346 substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
348 substr( $string, $encodingposition, 3, "y50" );
349 if ( $marc_record->subfield( 100, "a" ) ) {
350 $marc_record->field('100')->update(a=>$string);
353 $marc_record->insert_grouped_field(
354 MARC::Field->new( 100, '', '', "a" => $string ) );
356 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
358 warn "Unrecognized marcflavour: $marc_flavour";
362 =head2 StripNonXmlChars
364 my $new_str = StripNonXmlChars($old_str);
366 Given a string, return a copy with the
367 characters that are illegal in XML
370 This function exists to work around a problem
371 that can occur with badly-encoded MARC records.
372 Specifically, if a UTF-8 MARC record also
373 has excape (\x1b) characters, MARC::File::XML
374 will let the escape characters pass through
375 when as_xml() or as_xml_record() is called. The
376 problem is that the escape character is not
377 legal in well-formed XML documents, so when
378 MARC::File::XML attempts to parse such a record,
379 the XML parser will fail.
381 Stripping such characters will allow a
382 MARC::Record->new_from_xml()
383 to work, at the possible risk of some data loss.
387 sub StripNonXmlChars {
389 if (!defined($str) || $str eq ""){
392 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
406 Removes Non Sorting Block characters
410 my $NSB = '\x88' ; # NSB : begin Non Sorting Block
411 my $NSE = '\x89' ; # NSE : Non Sorting Block end
412 my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
413 my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
414 my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
416 # handles non sorting blocks
419 s/($C2){0,1}($NSB|$NSB2)//g ;
420 s/($C2){0,1}($NSE|$NSE2)//g ;
427 =head2 SanitizeRecord
429 SanitizeRecord($marcrecord);
432 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
433 It cleans any string with '&amp;...', replacing it by '&'
438 my ( $record, $biblionumber ) = @_;
440 my $record_modified = 0;
441 my $frameworkcode = C4::Biblio::GetFrameworkCode($biblionumber);
442 my ( $url_field, $url_subfield ) =
443 C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
444 foreach my $field ( $record->fields() ) {
445 if ( $field->is_control_field() ) {
446 my $value = $field->data();
447 my $sanitized_value = _clean_ampersand($value);
448 $record_modified = 1 if $sanitized_value ne $value;
449 $field->update($sanitized_value);
452 my @subfields = $field->subfields();
454 foreach my $subfield (@subfields) {
456 if $url_field eq $field->tag()
457 and $url_subfield eq $subfield->[0];
458 my $value = $subfield->[1];
459 my $sanitized_value = _clean_ampersand($value);
460 push @new_subfields, $subfield->[0] => $sanitized_value;
461 $record_modified = 1 if $sanitized_value ne $value;
463 if ( scalar(@new_subfields) > 0 ) {
464 my $new_field = eval {
466 $field->tag(), $field->indicator(1),
467 $field->indicator(2), @new_subfields
474 $field->replace_with($new_field);
481 return $record, $record_modified;
484 sub _clean_ampersand {
486 $string =~ s/(&)(amp;)+/$1/g;
490 =head1 INTERNAL FUNCTIONS
492 =head2 _default_marc21_charconv_to_utf8
494 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
496 Converts a C<MARC::Record> of unknown character set to UTF-8,
497 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
498 to UTF-8, then a default conversion that replaces each non-ASCII
499 character with the replacement character.
501 The C<$guessed_charset> return value contains the character set
502 that resulted in a conversion to valid UTF-8; note that
503 if the MARC-8 and ISO-8859-1 conversions failed, the value of
508 sub _default_marc21_charconv_to_utf8 {
509 my $marc_record = shift;
510 my $marc_flavour = shift;
512 my $trial_marc8 = $marc_record->clone();
514 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
516 return $trial_marc8, 'MARC-8', [];
518 push @all_errors, @errors;
520 my $trial_8859_1 = $marc_record->clone();
521 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
523 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
524 # instead if we wanted to report details
525 # of the failed attempt at MARC-8 => UTF-8
527 push @all_errors, @errors;
529 my $default_converted = $marc_record->clone();
530 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
531 return $default_converted, 'failed', \@all_errors;
534 =head2 _default_unimarc_charconv_to_utf8
536 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
538 Converts a C<MARC::Record> of unknown character set to UTF-8,
539 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
540 to UTF-8, then a default conversion that replaces each non-ASCII
541 character with the replacement character.
543 The C<$guessed_charset> return value contains the character set
544 that resulted in a conversion to valid UTF-8; note that
545 if the MARC-8 and ISO-8859-1 conversions failed, the value of
550 sub _default_unimarc_charconv_to_utf8 {
551 my $marc_record = shift;
552 my $marc_flavour = shift;
554 my $trial_marc8 = $marc_record->clone();
556 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
558 return $trial_marc8, 'iso-5426';
560 push @all_errors, @errors;
562 my $trial_8859_1 = $marc_record->clone();
563 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
565 return $trial_8859_1, 'iso-8859-1';
567 push @all_errors, @errors;
569 my $default_converted = $marc_record->clone();
570 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
571 return $default_converted, 'failed', \@all_errors;
574 =head2 _marc_marc8_to_utf8
576 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
578 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
579 If the conversion fails for some reason, an
580 appropriate messages will be placed in the returned
585 sub _marc_marc8_to_utf8 {
586 my $marc_record = shift;
587 my $marc_flavour = shift;
589 my $prev_ignore = MARC::Charset->ignore_errors();
590 MARC::Charset->ignore_errors(1);
592 # trap warnings raised by MARC::Charset
594 local $SIG{__WARN__} = sub {
596 if ($msg =~ /MARC.Charset/) {
597 # FIXME - purpose of this regexp is to strip out the
598 # line reference to MARC/Charset.pm, but as it
599 # exists probably won't work quite on Windows --
600 # some sort of minimal-bunch back-tracking RE
601 # would be helpful here
602 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
605 # if warning doesn't come from MARC::Charset, just
611 foreach my $field ($marc_record->fields()) {
612 if ($field->is_control_field()) {
613 ; # do nothing -- control fields should not contain non-ASCII characters
615 my @converted_subfields;
616 foreach my $subfield ($field->subfields()) {
617 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
618 unless (IsStringUTF8ish($utf8sf)) {
619 # Because of a bug in MARC::Charset 0.98, if the string
620 # has (a) one or more diacritics that (b) are only in character positions
621 # 128 to 255 inclusive, the resulting converted string is not in
622 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
623 # occurs, upgrade the string in place. Moral of the story seems to be
624 # that pack("U", ...) is better than chr(...) if you need to guarantee
625 # that the resulting string is UTF-8.
626 utf8::upgrade($utf8sf);
628 push @converted_subfields, $subfield->[0], $utf8sf;
631 $field->replace_with(MARC::Field->new(
632 $field->tag(), $field->indicator(1), $field->indicator(2),
633 @converted_subfields)
638 MARC::Charset->ignore_errors($prev_ignore);
640 SetMarcUnicodeFlag($marc_record, $marc_flavour);
645 =head2 _marc_iso5426_to_utf8
647 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
649 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
650 If the conversion fails for some reason, an
651 appropriate messages will be placed in the returned
654 FIXME - is ISO-5426 equivalent enough to MARC-8
655 that C<MARC::Charset> can be used instead?
659 sub _marc_iso5426_to_utf8 {
660 my $marc_record = shift;
661 my $marc_flavour = shift;
665 foreach my $field ($marc_record->fields()) {
666 if ($field->is_control_field()) {
667 ; # do nothing -- control fields should not contain non-ASCII characters
669 my @converted_subfields;
670 foreach my $subfield ($field->subfields()) {
671 my $utf8sf = char_decode5426($subfield->[1]);
672 push @converted_subfields, $subfield->[0], $utf8sf;
675 $field->replace_with(MARC::Field->new(
676 $field->tag(), $field->indicator(1), $field->indicator(2),
677 @converted_subfields)
682 SetMarcUnicodeFlag($marc_record, $marc_flavour);
687 =head2 _marc_to_utf8_via_text_iconv
689 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
691 Convert a C<MARC::Record> to UTF-8 in-place using the
692 C<Text::Iconv> CPAN module. Any source encoding accepted
693 by the user's iconv installation should work. If
694 the source encoding is not recognized on the user's
695 server or the conversion fails for some reason,
696 appropriate messages will be placed in the returned
701 sub _marc_to_utf8_via_text_iconv {
702 my $marc_record = shift;
703 my $marc_flavour = shift;
704 my $source_encoding = shift;
708 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
710 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
714 my $prev_raise_error = Text::Iconv->raise_error();
715 Text::Iconv->raise_error(1);
717 foreach my $field ($marc_record->fields()) {
718 if ($field->is_control_field()) {
719 ; # do nothing -- control fields should not contain non-ASCII characters
721 my @converted_subfields;
722 foreach my $subfield ($field->subfields()) {
724 my $conversion_ok = 1;
725 eval { $converted_value = $decoder->convert($subfield->[1]); };
729 } elsif (not defined $converted_value) {
731 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
734 if ($conversion_ok) {
735 push @converted_subfields, $subfield->[0], $converted_value;
737 $converted_value = $subfield->[1];
738 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
739 push @converted_subfields, $subfield->[0], $converted_value;
743 $field->replace_with(MARC::Field->new(
744 $field->tag(), $field->indicator(1), $field->indicator(2),
745 @converted_subfields)
750 SetMarcUnicodeFlag($marc_record, $marc_flavour);
751 Text::Iconv->raise_error($prev_raise_error);
756 =head2 _marc_to_utf8_replacement_char
758 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
760 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
761 unsatisfactory method of replacing all non-ASCII (e.g.,
762 where the eight bit is set) octet with the Unicode
763 replacement character. This is meant as a last-ditch
764 method, and would be best used as part of a UI that
765 lets a cataloguer pick various character conversions
766 until he or she finds the right one.
770 sub _marc_to_utf8_replacement_char {
771 my $marc_record = shift;
772 my $marc_flavour = shift;
774 foreach my $field ($marc_record->fields()) {
775 if ($field->is_control_field()) {
776 ; # do nothing -- control fields should not contain non-ASCII characters
778 my @converted_subfields;
779 foreach my $subfield ($field->subfields()) {
780 my $value = $subfield->[1];
781 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
782 push @converted_subfields, $subfield->[0], $value;
785 $field->replace_with(MARC::Field->new(
786 $field->tag(), $field->indicator(1), $field->indicator(2),
787 @converted_subfields)
792 SetMarcUnicodeFlag($marc_record, $marc_flavour);
795 =head2 char_decode5426
797 my $utf8string = char_decode5426($iso_5426_string);
799 Converts a string from ISO-5426 to UTF-8.
805 $chars{0xb0}=0x0101;#3/0ayn[ain]
806 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
807 #$chars{0xb2}=0x00e0;#'à';
808 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
809 #$chars{0xb3}=0x00e7;#'ç';
810 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
817 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
818 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
819 $chars{0xfa}=0x0153; #oe
820 $chars{0xea}=0x0152; #oe
821 $chars{0x81d1}=0x00b0;
824 ## combined characters iso5426
826 $chars{0xc041}=0x1ea2; # capital a with hook above
827 $chars{0xc045}=0x1eba; # capital e with hook above
828 $chars{0xc049}=0x1ec8; # capital i with hook above
829 $chars{0xc04f}=0x1ece; # capital o with hook above
830 $chars{0xc055}=0x1ee6; # capital u with hook above
831 $chars{0xc059}=0x1ef6; # capital y with hook above
832 $chars{0xc061}=0x1ea3; # small a with hook above
833 $chars{0xc065}=0x1ebb; # small e with hook above
834 $chars{0xc069}=0x1ec9; # small i with hook above
835 $chars{0xc06f}=0x1ecf; # small o with hook above
836 $chars{0xc075}=0x1ee7; # small u with hook above
837 $chars{0xc079}=0x1ef7; # small y with hook above
840 $chars{0xc141}=0x00c0; # capital a with grave accent
841 $chars{0xc145}=0x00c8; # capital e with grave accent
842 $chars{0xc149}=0x00cc; # capital i with grave accent
843 $chars{0xc14f}=0x00d2; # capital o with grave accent
844 $chars{0xc155}=0x00d9; # capital u with grave accent
845 $chars{0xc157}=0x1e80; # capital w with grave
846 $chars{0xc159}=0x1ef2; # capital y with grave
847 $chars{0xc161}=0x00e0; # small a with grave accent
848 $chars{0xc165}=0x00e8; # small e with grave accent
849 $chars{0xc169}=0x00ec; # small i with grave accent
850 $chars{0xc16f}=0x00f2; # small o with grave accent
851 $chars{0xc175}=0x00f9; # small u with grave accent
852 $chars{0xc177}=0x1e81; # small w with grave
853 $chars{0xc179}=0x1ef3; # small y with grave
855 $chars{0xc241}=0x00c1; # capital a with acute accent
856 $chars{0xc243}=0x0106; # capital c with acute accent
857 $chars{0xc245}=0x00c9; # capital e with acute accent
858 $chars{0xc247}=0x01f4; # capital g with acute
859 $chars{0xc249}=0x00cd; # capital i with acute accent
860 $chars{0xc24b}=0x1e30; # capital k with acute
861 $chars{0xc24c}=0x0139; # capital l with acute accent
862 $chars{0xc24d}=0x1e3e; # capital m with acute
863 $chars{0xc24e}=0x0143; # capital n with acute accent
864 $chars{0xc24f}=0x00d3; # capital o with acute accent
865 $chars{0xc250}=0x1e54; # capital p with acute
866 $chars{0xc252}=0x0154; # capital r with acute accent
867 $chars{0xc253}=0x015a; # capital s with acute accent
868 $chars{0xc255}=0x00da; # capital u with acute accent
869 $chars{0xc257}=0x1e82; # capital w with acute
870 $chars{0xc259}=0x00dd; # capital y with acute accent
871 $chars{0xc25a}=0x0179; # capital z with acute accent
872 $chars{0xc261}=0x00e1; # small a with acute accent
873 $chars{0xc263}=0x0107; # small c with acute accent
874 $chars{0xc265}=0x00e9; # small e with acute accent
875 $chars{0xc267}=0x01f5; # small g with acute
876 $chars{0xc269}=0x00ed; # small i with acute accent
877 $chars{0xc26b}=0x1e31; # small k with acute
878 $chars{0xc26c}=0x013a; # small l with acute accent
879 $chars{0xc26d}=0x1e3f; # small m with acute
880 $chars{0xc26e}=0x0144; # small n with acute accent
881 $chars{0xc26f}=0x00f3; # small o with acute accent
882 $chars{0xc270}=0x1e55; # small p with acute
883 $chars{0xc272}=0x0155; # small r with acute accent
884 $chars{0xc273}=0x015b; # small s with acute accent
885 $chars{0xc275}=0x00fa; # small u with acute accent
886 $chars{0xc277}=0x1e83; # small w with acute
887 $chars{0xc279}=0x00fd; # small y with acute accent
888 $chars{0xc27a}=0x017a; # small z with acute accent
889 $chars{0xc2e1}=0x01fc; # capital ae with acute
890 $chars{0xc2f1}=0x01fd; # small ae with acute
891 # 4/3 circumflex accent
892 $chars{0xc341}=0x00c2; # capital a with circumflex accent
893 $chars{0xc343}=0x0108; # capital c with circumflex
894 $chars{0xc345}=0x00ca; # capital e with circumflex accent
895 $chars{0xc347}=0x011c; # capital g with circumflex
896 $chars{0xc348}=0x0124; # capital h with circumflex
897 $chars{0xc349}=0x00ce; # capital i with circumflex accent
898 $chars{0xc34a}=0x0134; # capital j with circumflex
899 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
900 $chars{0xc353}=0x015c; # capital s with circumflex
901 $chars{0xc355}=0x00db; # capital u with circumflex
902 $chars{0xc357}=0x0174; # capital w with circumflex
903 $chars{0xc359}=0x0176; # capital y with circumflex
904 $chars{0xc35a}=0x1e90; # capital z with circumflex
905 $chars{0xc361}=0x00e2; # small a with circumflex accent
906 $chars{0xc363}=0x0109; # small c with circumflex
907 $chars{0xc365}=0x00ea; # small e with circumflex accent
908 $chars{0xc367}=0x011d; # small g with circumflex
909 $chars{0xc368}=0x0125; # small h with circumflex
910 $chars{0xc369}=0x00ee; # small i with circumflex accent
911 $chars{0xc36a}=0x0135; # small j with circumflex
912 $chars{0xc36e}=0x00f1; # small n with tilde
913 $chars{0xc36f}=0x00f4; # small o with circumflex accent
914 $chars{0xc373}=0x015d; # small s with circumflex
915 $chars{0xc375}=0x00fb; # small u with circumflex
916 $chars{0xc377}=0x0175; # small w with circumflex
917 $chars{0xc379}=0x0177; # small y with circumflex
918 $chars{0xc37a}=0x1e91; # small z with circumflex
920 $chars{0xc441}=0x00c3; # capital a with tilde
921 $chars{0xc445}=0x1ebc; # capital e with tilde
922 $chars{0xc449}=0x0128; # capital i with tilde
923 $chars{0xc44e}=0x00d1; # capital n with tilde
924 $chars{0xc44f}=0x00d5; # capital o with tilde
925 $chars{0xc455}=0x0168; # capital u with tilde
926 $chars{0xc456}=0x1e7c; # capital v with tilde
927 $chars{0xc459}=0x1ef8; # capital y with tilde
928 $chars{0xc461}=0x00e3; # small a with tilde
929 $chars{0xc465}=0x1ebd; # small e with tilde
930 $chars{0xc469}=0x0129; # small i with tilde
931 $chars{0xc46e}=0x00f1; # small n with tilde
932 $chars{0xc46f}=0x00f5; # small o with tilde
933 $chars{0xc475}=0x0169; # small u with tilde
934 $chars{0xc476}=0x1e7d; # small v with tilde
935 $chars{0xc479}=0x1ef9; # small y with tilde
937 $chars{0xc541}=0x0100; # capital a with macron
938 $chars{0xc545}=0x0112; # capital e with macron
939 $chars{0xc547}=0x1e20; # capital g with macron
940 $chars{0xc549}=0x012a; # capital i with macron
941 $chars{0xc54f}=0x014c; # capital o with macron
942 $chars{0xc555}=0x016a; # capital u with macron
943 $chars{0xc561}=0x0101; # small a with macron
944 $chars{0xc565}=0x0113; # small e with macron
945 $chars{0xc567}=0x1e21; # small g with macron
946 $chars{0xc569}=0x012b; # small i with macron
947 $chars{0xc56f}=0x014d; # small o with macron
948 $chars{0xc575}=0x016b; # small u with macron
949 $chars{0xc572}=0x0159; # small r with macron
950 $chars{0xc5e1}=0x01e2; # capital ae with macron
951 $chars{0xc5f1}=0x01e3; # small ae with macron
953 $chars{0xc641}=0x0102; # capital a with breve
954 $chars{0xc645}=0x0114; # capital e with breve
955 $chars{0xc647}=0x011e; # capital g with breve
956 $chars{0xc649}=0x012c; # capital i with breve
957 $chars{0xc64f}=0x014e; # capital o with breve
958 $chars{0xc655}=0x016c; # capital u with breve
959 $chars{0xc661}=0x0103; # small a with breve
960 $chars{0xc665}=0x0115; # small e with breve
961 $chars{0xc667}=0x011f; # small g with breve
962 $chars{0xc669}=0x012d; # small i with breve
963 $chars{0xc66f}=0x014f; # small o with breve
964 $chars{0xc675}=0x016d; # small u with breve
966 $chars{0xc7b0}=0x01e1; # Ain with dot above
967 $chars{0xc742}=0x1e02; # capital b with dot above
968 $chars{0xc743}=0x010a; # capital c with dot above
969 $chars{0xc744}=0x1e0a; # capital d with dot above
970 $chars{0xc745}=0x0116; # capital e with dot above
971 $chars{0xc746}=0x1e1e; # capital f with dot above
972 $chars{0xc747}=0x0120; # capital g with dot above
973 $chars{0xc748}=0x1e22; # capital h with dot above
974 $chars{0xc749}=0x0130; # capital i with dot above
975 $chars{0xc74d}=0x1e40; # capital m with dot above
976 $chars{0xc74e}=0x1e44; # capital n with dot above
977 $chars{0xc750}=0x1e56; # capital p with dot above
978 $chars{0xc752}=0x1e58; # capital r with dot above
979 $chars{0xc753}=0x1e60; # capital s with dot above
980 $chars{0xc754}=0x1e6a; # capital t with dot above
981 $chars{0xc757}=0x1e86; # capital w with dot above
982 $chars{0xc758}=0x1e8a; # capital x with dot above
983 $chars{0xc759}=0x1e8e; # capital y with dot above
984 $chars{0xc75a}=0x017b; # capital z with dot above
985 $chars{0xc761}=0x0227; # small b with dot above
986 $chars{0xc762}=0x1e03; # small b with dot above
987 $chars{0xc763}=0x010b; # small c with dot above
988 $chars{0xc764}=0x1e0b; # small d with dot above
989 $chars{0xc765}=0x0117; # small e with dot above
990 $chars{0xc766}=0x1e1f; # small f with dot above
991 $chars{0xc767}=0x0121; # small g with dot above
992 $chars{0xc768}=0x1e23; # small h with dot above
993 $chars{0xc76d}=0x1e41; # small m with dot above
994 $chars{0xc76e}=0x1e45; # small n with dot above
995 $chars{0xc770}=0x1e57; # small p with dot above
996 $chars{0xc772}=0x1e59; # small r with dot above
997 $chars{0xc773}=0x1e61; # small s with dot above
998 $chars{0xc774}=0x1e6b; # small t with dot above
999 $chars{0xc777}=0x1e87; # small w with dot above
1000 $chars{0xc778}=0x1e8b; # small x with dot above
1001 $chars{0xc779}=0x1e8f; # small y with dot above
1002 $chars{0xc77a}=0x017c; # small z with dot above
1003 # 4/8 trema, diaresis
1004 $chars{0xc820}=0x00a8; # diaeresis
1005 $chars{0xc841}=0x00c4; # capital a with diaeresis
1006 $chars{0xc845}=0x00cb; # capital e with diaeresis
1007 $chars{0xc848}=0x1e26; # capital h with diaeresis
1008 $chars{0xc849}=0x00cf; # capital i with diaeresis
1009 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1010 $chars{0xc855}=0x00dc; # capital u with diaeresis
1011 $chars{0xc857}=0x1e84; # capital w with diaeresis
1012 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1013 $chars{0xc859}=0x0178; # capital y with diaeresis
1014 $chars{0xc861}=0x00e4; # small a with diaeresis
1015 $chars{0xc865}=0x00eb; # small e with diaeresis
1016 $chars{0xc868}=0x1e27; # small h with diaeresis
1017 $chars{0xc869}=0x00ef; # small i with diaeresis
1018 $chars{0xc86f}=0x00f6; # small o with diaeresis
1019 $chars{0xc874}=0x1e97; # small t with diaeresis
1020 $chars{0xc875}=0x00fc; # small u with diaeresis
1021 $chars{0xc877}=0x1e85; # small w with diaeresis
1022 $chars{0xc878}=0x1e8d; # small x with diaeresis
1023 $chars{0xc879}=0x00ff; # small y with diaeresis
1025 $chars{0xc920}=0x00a8; # [diaeresis]
1026 $chars{0xc961}=0x00e4; # a with umlaut
1027 $chars{0xc965}=0x00eb; # e with umlaut
1028 $chars{0xc969}=0x00ef; # i with umlaut
1029 $chars{0xc96f}=0x00f6; # o with umlaut
1030 $chars{0xc975}=0x00fc; # u with umlaut
1032 $chars{0xca41}=0x00c5; # capital a with ring above
1033 $chars{0xcaad}=0x016e; # capital u with ring above
1034 $chars{0xca61}=0x00e5; # small a with ring above
1035 $chars{0xca75}=0x016f; # small u with ring above
1036 $chars{0xca77}=0x1e98; # small w with ring above
1037 $chars{0xca79}=0x1e99; # small y with ring above
1038 # 4/11 high comma off centre
1039 # 4/12 inverted high comma centred
1040 # 4/13 double acute accent
1041 $chars{0xcd4f}=0x0150; # capital o with double acute
1042 $chars{0xcd55}=0x0170; # capital u with double acute
1043 $chars{0xcd6f}=0x0151; # small o with double acute
1044 $chars{0xcd75}=0x0171; # small u with double acute
1046 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1047 $chars{0xce55}=0x01af; # latin capital letter u with horn
1048 $chars{0xce74}=0x01a1; # latin small letter o with horn
1049 $chars{0xce75}=0x01b0; # latin small letter u with horn
1051 $chars{0xcf41}=0x01cd; # capital a with caron
1052 $chars{0xcf43}=0x010c; # capital c with caron
1053 $chars{0xcf44}=0x010e; # capital d with caron
1054 $chars{0xcf45}=0x011a; # capital e with caron
1055 $chars{0xcf47}=0x01e6; # capital g with caron
1056 $chars{0xcf49}=0x01cf; # capital i with caron
1057 $chars{0xcf4b}=0x01e8; # capital k with caron
1058 $chars{0xcf4c}=0x013d; # capital l with caron
1059 $chars{0xcf4e}=0x0147; # capital n with caron
1060 $chars{0xcf4f}=0x01d1; # capital o with caron
1061 $chars{0xcf52}=0x0158; # capital r with caron
1062 $chars{0xcf53}=0x0160; # capital s with caron
1063 $chars{0xcf54}=0x0164; # capital t with caron
1064 $chars{0xcf55}=0x01d3; # capital u with caron
1065 $chars{0xcf5a}=0x017d; # capital z with caron
1066 $chars{0xcf61}=0x01ce; # small a with caron
1067 $chars{0xcf63}=0x010d; # small c with caron
1068 $chars{0xcf64}=0x010f; # small d with caron
1069 $chars{0xcf65}=0x011b; # small e with caron
1070 $chars{0xcf67}=0x01e7; # small g with caron
1071 $chars{0xcf69}=0x01d0; # small i with caron
1072 $chars{0xcf6a}=0x01f0; # small j with caron
1073 $chars{0xcf6b}=0x01e9; # small k with caron
1074 $chars{0xcf6c}=0x013e; # small l with caron
1075 $chars{0xcf6e}=0x0148; # small n with caron
1076 $chars{0xcf6f}=0x01d2; # small o with caron
1077 $chars{0xcf72}=0x0159; # small r with caron
1078 $chars{0xcf73}=0x0161; # small s with caron
1079 $chars{0xcf74}=0x0165; # small t with caron
1080 $chars{0xcf75}=0x01d4; # small u with caron
1081 $chars{0xcf7a}=0x017e; # small z with caron
1083 $chars{0xd020}=0x00b8; # cedilla
1084 $chars{0xd043}=0x00c7; # capital c with cedilla
1085 $chars{0xd044}=0x1e10; # capital d with cedilla
1086 $chars{0xd047}=0x0122; # capital g with cedilla
1087 $chars{0xd048}=0x1e28; # capital h with cedilla
1088 $chars{0xd04b}=0x0136; # capital k with cedilla
1089 $chars{0xd04c}=0x013b; # capital l with cedilla
1090 $chars{0xd04e}=0x0145; # capital n with cedilla
1091 $chars{0xd052}=0x0156; # capital r with cedilla
1092 $chars{0xd053}=0x015e; # capital s with cedilla
1093 $chars{0xd054}=0x0162; # capital t with cedilla
1094 $chars{0xd063}=0x00e7; # small c with cedilla
1095 $chars{0xd064}=0x1e11; # small d with cedilla
1096 $chars{0xd065}=0x0119; # small e with cedilla
1097 $chars{0xd067}=0x0123; # small g with cedilla
1098 $chars{0xd068}=0x1e29; # small h with cedilla
1099 $chars{0xd06b}=0x0137; # small k with cedilla
1100 $chars{0xd06c}=0x013c; # small l with cedilla
1101 $chars{0xd06e}=0x0146; # small n with cedilla
1102 $chars{0xd072}=0x0157; # small r with cedilla
1103 $chars{0xd073}=0x015f; # small s with cedilla
1104 $chars{0xd074}=0x0163; # small t with cedilla
1107 # 5/3 ogonek (hook to right
1108 $chars{0xd320}=0x02db; # ogonek
1109 $chars{0xd341}=0x0104; # capital a with ogonek
1110 $chars{0xd345}=0x0118; # capital e with ogonek
1111 $chars{0xd349}=0x012e; # capital i with ogonek
1112 $chars{0xd34f}=0x01ea; # capital o with ogonek
1113 $chars{0xd355}=0x0172; # capital u with ogonek
1114 $chars{0xd361}=0x0105; # small a with ogonek
1115 $chars{0xd365}=0x0119; # small e with ogonek
1116 $chars{0xd369}=0x012f; # small i with ogonek
1117 $chars{0xd36f}=0x01eb; # small o with ogonek
1118 $chars{0xd375}=0x0173; # small u with ogonek
1120 $chars{0xd441}=0x1e00; # capital a with ring below
1121 $chars{0xd461}=0x1e01; # small a with ring below
1122 # 5/5 half circle below
1123 $chars{0xf948}=0x1e2a; # capital h with breve below
1124 $chars{0xf968}=0x1e2b; # small h with breve below
1126 $chars{0xd641}=0x1ea0; # capital a with dot below
1127 $chars{0xd642}=0x1e04; # capital b with dot below
1128 $chars{0xd644}=0x1e0c; # capital d with dot below
1129 $chars{0xd645}=0x1eb8; # capital e with dot below
1130 $chars{0xd648}=0x1e24; # capital h with dot below
1131 $chars{0xd649}=0x1eca; # capital i with dot below
1132 $chars{0xd64b}=0x1e32; # capital k with dot below
1133 $chars{0xd64c}=0x1e36; # capital l with dot below
1134 $chars{0xd64d}=0x1e42; # capital m with dot below
1135 $chars{0xd64e}=0x1e46; # capital n with dot below
1136 $chars{0xd64f}=0x1ecc; # capital o with dot below
1137 $chars{0xd652}=0x1e5a; # capital r with dot below
1138 $chars{0xd653}=0x1e62; # capital s with dot below
1139 $chars{0xd654}=0x1e6c; # capital t with dot below
1140 $chars{0xd655}=0x1ee4; # capital u with dot below
1141 $chars{0xd656}=0x1e7e; # capital v with dot below
1142 $chars{0xd657}=0x1e88; # capital w with dot below
1143 $chars{0xd659}=0x1ef4; # capital y with dot below
1144 $chars{0xd65a}=0x1e92; # capital z with dot below
1145 $chars{0xd661}=0x1ea1; # small a with dot below
1146 $chars{0xd662}=0x1e05; # small b with dot below
1147 $chars{0xd664}=0x1e0d; # small d with dot below
1148 $chars{0xd665}=0x1eb9; # small e with dot below
1149 $chars{0xd668}=0x1e25; # small h with dot below
1150 $chars{0xd669}=0x1ecb; # small i with dot below
1151 $chars{0xd66b}=0x1e33; # small k with dot below
1152 $chars{0xd66c}=0x1e37; # small l with dot below
1153 $chars{0xd66d}=0x1e43; # small m with dot below
1154 $chars{0xd66e}=0x1e47; # small n with dot below
1155 $chars{0xd66f}=0x1ecd; # small o with dot below
1156 $chars{0xd672}=0x1e5b; # small r with dot below
1157 $chars{0xd673}=0x1e63; # small s with dot below
1158 $chars{0xd674}=0x1e6d; # small t with dot below
1159 $chars{0xd675}=0x1ee5; # small u with dot below
1160 $chars{0xd676}=0x1e7f; # small v with dot below
1161 $chars{0xd677}=0x1e89; # small w with dot below
1162 $chars{0xd679}=0x1ef5; # small y with dot below
1163 $chars{0xd67a}=0x1e93; # small z with dot below
1164 # 5/7 double dot below
1165 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1166 $chars{0xd775}=0x1e73; # small u with diaeresis below
1168 $chars{0xd820}=0x005f; # underline
1169 # 5/9 double underline
1170 $chars{0xd920}=0x2017; # double underline
1171 # 5/10 small low vertical bar
1172 $chars{0xda20}=0x02cc; #
1173 # 5/11 circumflex below
1174 # 5/12 (this position shall not be used)
1175 # 5/13 left half of ligature sign and of double tilde
1176 # 5/14 right half of ligature sign
1177 # 5/15 right half of double tilde
1178 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1180 sub char_decode5426 {
1184 my @data = unpack("C*", $string);
1186 my $length=scalar(@data);
1187 for (my $i = 0; $i < scalar(@data); $i++) {
1188 my $char= $data[$i];
1189 if ($char >= 0x00 && $char <= 0x7F){
1192 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1193 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1196 if ($chars{$char*256+$data[$i+1]}) {
1197 $convchar= $chars{$char * 256 + $data[$i+1]};
1199 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1200 } elsif ($chars{$char}) {
1201 $convchar= $chars{$char};
1202 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1206 push @characters,$convchar;
1209 if ($chars{$char}) {
1210 $convchar= $chars{$char};
1211 # printf "char %x, converted %x\n",$char,$chars{$char};
1213 # printf "char %x $char\n",$char;
1216 push @characters,$convchar;
1219 $result=pack "U*",@characters;
1220 # $result=~s/\x01//;
1221 # $result=~s/\x00//;
1225 $result=~s/\x1b\x5b//;
1226 # map{printf "%x",$_} @characters;
1236 Koha Development Team <http://koha-community.org/>
1238 Galen Charlton <galen.charlton@liblime.com>