Jonathan Druart
9d6d641d1f
On bug 17591 we discovered that there was something weird going on with the way we export and use subroutines/modules. This patch tries to standardize our EXPORT to use EXPORT_OK only. That way we will need to explicitely define the subroutine we want to use from a module. This patch is a squashed version of: Bug 17600: After export.pl Bug 17600: After perlimport Bug 17600: Manual changes Bug 17600: Other manual changes after second perlimports run Bug 17600: Fix tests And a lot of other manual changes. export.pl is a dirty script that can be found on bug 17600. "perlimport" is: git clone https://github.com/oalders/App-perlimports.git cd App-perlimports/ cpanm --installdeps . export PERL5LIB="$PERL5LIB:/kohadevbox/koha/App-perlimports/lib" find . \( -name "*.pl" -o -name "*.pm" \) -exec perl App-perlimports/script/perlimports --inplace-edit --no-preserve-unused --filename {} \; The ideas of this patch are to: * use EXPORT_OK instead of EXPORT * perltidy the EXPORT_OK list * remove '&' before the subroutine names * remove some uneeded use statements * explicitely import the subroutines we need within the controllers or modules Note that the private subroutines (starting with _) should not be exported (and not used from outside of the module except from tests). EXPORT vs EXPORT_OK (from https://www.thegeekstuff.com/2010/06/perl-exporter-examples/) """ Export allows to export the functions and variables of modules to user’s namespace using the standard import method. This way, we don’t need to create the objects for the modules to access it’s members. @EXPORT and @EXPORT_OK are the two main variables used during export operation. @EXPORT contains list of symbols (subroutines and variables) of the module to be exported into the caller namespace. @EXPORT_OK does export of symbols on demand basis. """ If this patch caused a conflict with a patch you wrote prior to its push: * Make sure you are not reintroducing a "use" statement that has been removed * "$subroutine" is not exported by the C4::$MODULE module means that you need to add the subroutine to the @EXPORT_OK list * Bareword "$subroutine" not allowed while "strict subs" means that you didn't imported the subroutine from the module: - use $MODULE qw( $subroutine list ); You can also use the fully qualified namespace: C4::$MODULE::$subroutine Signed-off-by: Jonathan Druart <jonathan.druart@bugs.koha-community.org>
1274 lines
45 KiB
Perl
1274 lines
45 KiB
Perl
package C4::Charset;
|
|
|
|
# Copyright (C) 2008 LibLime
|
|
#
|
|
# This file is part of Koha.
|
|
#
|
|
# Koha is free software; you can redistribute it and/or modify it
|
|
# under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# Koha is distributed in the hope that it will be useful, but
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with Koha; if not, see <http://www.gnu.org/licenses>.
|
|
|
|
use Modern::Perl;
|
|
|
|
use MARC::Charset;
|
|
use Text::Iconv;
|
|
use Unicode::Normalize qw( NFC NFD );
|
|
use Encode;
|
|
|
|
use Koha::Logger;
|
|
|
|
our (@ISA, @EXPORT_OK);
|
|
BEGIN {
|
|
require Exporter;
|
|
@ISA = qw(Exporter);
|
|
@EXPORT_OK = qw(
|
|
NormalizeString
|
|
IsStringUTF8ish
|
|
MarcToUTF8Record
|
|
SetUTF8Flag
|
|
SetMarcUnicodeFlag
|
|
StripNonXmlChars
|
|
nsb_clean
|
|
SanitizeRecord
|
|
);
|
|
}
|
|
|
|
=encoding UTF-8
|
|
|
|
=head1 NAME
|
|
|
|
C4::Charset - utilities for handling character set conversions.
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use C4::Charset;
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This module contains routines for dealing with character set
|
|
conversions, particularly for MARC records.
|
|
|
|
A variety of character encodings are in use by various MARC
|
|
standards, and even more character encodings are used by
|
|
non-standard MARC records. The various MARC formats generally
|
|
do not do a good job of advertising a given record's character
|
|
encoding, and even when a record does advertise its encoding,
|
|
e.g., via the Leader/09, experience has shown that one cannot
|
|
trust it.
|
|
|
|
Ultimately, all MARC records are stored in Koha in UTF-8 and
|
|
must be converted from whatever the source character encoding is.
|
|
The goal of this module is to ensure that these conversions
|
|
take place accurately. When a character conversion cannot take
|
|
place, or at least not accurately, the module was provide
|
|
enough information to allow user-facing code to inform the user
|
|
on how to deal with the situation.
|
|
|
|
=cut
|
|
|
|
=head1 FUNCTIONS
|
|
|
|
=head2 IsStringUTF8ish
|
|
|
|
my $is_utf8 = IsStringUTF8ish($str);
|
|
|
|
Determines if C<$str> is valid UTF-8. This can mean
|
|
one of two things:
|
|
|
|
=over
|
|
|
|
=item *
|
|
|
|
The Perl UTF-8 flag is set and the string contains valid UTF-8.
|
|
|
|
=item *
|
|
|
|
The Perl UTF-8 flag is B<not> set, but the octets contain
|
|
valid UTF-8.
|
|
|
|
=back
|
|
|
|
The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
|
|
because in one could be presented with a MARC blob that is
|
|
not actually in UTF-8 but whose sequence of octets appears to be
|
|
valid UTF-8. The rest of the MARC character conversion functions
|
|
will assume that this situation occur does not very often.
|
|
|
|
=cut
|
|
|
|
sub IsStringUTF8ish {
|
|
my $str = shift;
|
|
|
|
return 1 if Encode::is_utf8($str);
|
|
return utf8::decode( $str );
|
|
}
|
|
|
|
=head2 SetUTF8Flag
|
|
|
|
my $marc_record = SetUTF8Flag($marc_record, $nfd);
|
|
|
|
This function sets the PERL UTF8 flag for data.
|
|
It is required when using new_from_usmarc
|
|
since MARC::File::USMARC does not handle PERL UTF8 setting.
|
|
When editing unicode marc records fields and subfields, you
|
|
would end up in double encoding without using this function.
|
|
|
|
If $nfd is set, string normalization will use NFD instead of NFC
|
|
|
|
FIXME
|
|
In my opinion, this function belongs to MARC::Record and not
|
|
to this package.
|
|
But since it handles charset, and MARC::Record, it finds its way in that package
|
|
|
|
=cut
|
|
|
|
sub SetUTF8Flag{
|
|
my ($record, $nfd)=@_;
|
|
return unless ($record && $record->fields());
|
|
foreach my $field ($record->fields()){
|
|
if ($field->tag()>=10){
|
|
my @subfields;
|
|
foreach my $subfield ($field->subfields()){
|
|
push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
|
|
}
|
|
eval {
|
|
my $newfield=MARC::Field->new(
|
|
$field->tag(),
|
|
$field->indicator(1),
|
|
$field->indicator(2),
|
|
@subfields
|
|
);
|
|
$field->replace_with($newfield);
|
|
};
|
|
warn "ERROR occurred in SetUTF8Flag $@" if $@;
|
|
}
|
|
}
|
|
}
|
|
|
|
=head2 NormalizeString
|
|
|
|
my $normalized_string=NormalizeString($string,$nfd,$transform);
|
|
|
|
Given a string
|
|
nfd : If you want to set NFD and not NFC
|
|
transform : If you expect all the signs to be removed
|
|
|
|
Sets the PERL UTF8 Flag on your initial data if need be
|
|
and applies cleaning if required
|
|
|
|
Returns a utf8 NFC normalized string
|
|
|
|
Sample code :
|
|
my $string=NormalizeString ("l'ornithoptère");
|
|
#results into ornithoptère in NFC form and sets UTF8 Flag
|
|
|
|
=cut
|
|
|
|
|
|
sub NormalizeString{
|
|
my ($string,$nfd,$transform)=@_;
|
|
return $string unless defined($string); # force scalar context return.
|
|
$string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
|
|
if ($nfd){
|
|
$string= NFD($string);
|
|
}
|
|
else {
|
|
$string=NFC($string);
|
|
}
|
|
if ($transform){
|
|
$string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
|
|
#removing one letter words "d'" "l'" was changed into "d " "l "
|
|
$string=~s/\b\S\b//g;
|
|
$string=~s/\s+$//g;
|
|
}
|
|
return $string;
|
|
}
|
|
|
|
=head2 MarcToUTF8Record
|
|
|
|
($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
|
|
$marc_flavour, [, $source_encoding]);
|
|
|
|
Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
|
|
optional source encoding, return a C<MARC::Record> that is
|
|
converted to UTF-8.
|
|
|
|
The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
|
|
is not guaranteed to have been converted correctly. Specifically,
|
|
if C<$converted_from> is 'failed', the MARC record returned failed
|
|
character conversion and had each of its non-ASCII octets changed
|
|
to the Unicode replacement character.
|
|
|
|
If the source encoding was not specified, this routine will
|
|
try to guess it; the character encoding used for a successful
|
|
conversion is returned in C<$converted_from>.
|
|
|
|
=cut
|
|
|
|
sub MarcToUTF8Record {
|
|
my $marc = shift;
|
|
my $marc_flavour = shift;
|
|
my $source_encoding = shift;
|
|
my $marc_record;
|
|
my $marc_blob_is_utf8 = 0;
|
|
if (ref($marc) eq 'MARC::Record') {
|
|
my $marc_blob = $marc->as_usmarc();
|
|
$marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
|
|
$marc_record = $marc;
|
|
} else {
|
|
# dealing with a MARC blob
|
|
|
|
# remove any ersatz whitespace from the beginning and
|
|
# end of the MARC blob -- these can creep into MARC
|
|
# files produced by several sources -- caller really
|
|
# should be doing this, however
|
|
$marc =~ s/^\s+//;
|
|
$marc =~ s/\s+$//;
|
|
$marc_blob_is_utf8 = IsStringUTF8ish($marc);
|
|
eval {
|
|
$marc_record = MARC::Record->new_from_usmarc($marc);
|
|
};
|
|
if ($@) {
|
|
# if we fail the first time, one likely problem
|
|
# is that we have a MARC21 record that says that it's
|
|
# UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
|
|
# We'll try parsing it again.
|
|
substr($marc, 9, 1) = ' ';
|
|
eval {
|
|
$marc_record = MARC::Record->new_from_usmarc($marc);
|
|
};
|
|
if ($@) {
|
|
# it's hopeless; return an empty MARC::Record
|
|
return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
|
|
}
|
|
}
|
|
}
|
|
|
|
# If we do not know the source encoding, try some guesses
|
|
# as follows:
|
|
# 1. Record is UTF-8 already.
|
|
# 2. If MARC flavor is MARC21 or NORMARC, then
|
|
# a. record is MARC-8
|
|
# b. record is ISO-8859-1
|
|
# 3. If MARC flavor is UNIMARC, then
|
|
if (not defined $source_encoding) {
|
|
if ($marc_blob_is_utf8) {
|
|
# note that for MARC21/NORMARC we are not bothering to check
|
|
# if the Leader/09 is set to 'a' or not -- because
|
|
# of problems with various ILSs (including Koha in the
|
|
# past, alas), this just is not trustworthy.
|
|
SetMarcUnicodeFlag($marc_record, $marc_flavour);
|
|
return $marc_record, 'UTF-8', [];
|
|
} else {
|
|
if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
|
|
return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
|
|
} elsif ($marc_flavour =~/UNIMARC/) {
|
|
return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
|
|
} else {
|
|
return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
|
|
}
|
|
}
|
|
} else {
|
|
# caller knows the character encoding
|
|
my $original_marc_record = $marc_record->clone();
|
|
my @errors;
|
|
if ($source_encoding =~ /utf-?8/i) {
|
|
if ($marc_blob_is_utf8) {
|
|
SetMarcUnicodeFlag($marc_record, $marc_flavour);
|
|
return $marc_record, 'UTF-8', [];
|
|
} else {
|
|
push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
|
|
}
|
|
} elsif ($source_encoding =~ /marc-?8/i) {
|
|
@errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
|
|
} elsif ($source_encoding =~ /5426/) {
|
|
@errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
|
|
} else {
|
|
# assume any other character encoding is for Text::Iconv
|
|
@errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
|
|
}
|
|
|
|
if (@errors) {
|
|
_marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
|
|
return $original_marc_record, 'failed', \@errors;
|
|
} else {
|
|
return $marc_record, $source_encoding, [];
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
=head2 SetMarcUnicodeFlag
|
|
|
|
SetMarcUnicodeFlag($marc_record, $marc_flavour);
|
|
|
|
Set both the internal MARC::Record encoding flag
|
|
and the appropriate Leader/09 (MARC21) or
|
|
100/26-29 (UNIMARC) to indicate that the record
|
|
is in UTF-8. Note that this does B<not> do
|
|
any actual character conversion.
|
|
|
|
=cut
|
|
|
|
sub SetMarcUnicodeFlag {
|
|
my $marc_record = shift;
|
|
my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
|
|
|
|
$marc_record->encoding('UTF-8');
|
|
if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
|
|
my $leader = $marc_record->leader();
|
|
substr($leader, 9, 1) = 'a';
|
|
$marc_record->leader($leader);
|
|
} elsif ($marc_flavour =~/UNIMARC/) {
|
|
require C4::Context;
|
|
my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
|
|
$defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
|
|
my $string;
|
|
my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
|
|
$string=$marc_record->subfield( 100, "a" );
|
|
if (defined $string && length($string)==$subflength) {
|
|
$string = substr $string, 0,$subflength if (length($string)>$subflength);
|
|
}
|
|
else {
|
|
$string = POSIX::strftime( "%Y%m%d", localtime );
|
|
$string =~ s/\-//g;
|
|
$string = sprintf( "%-*s", $subflength, $string );
|
|
substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
|
|
}
|
|
substr( $string, $encodingposition, 3, "y50" );
|
|
if ( $marc_record->subfield( 100, "a" ) ) {
|
|
$marc_record->field('100')->update(a=>$string);
|
|
}
|
|
else {
|
|
$marc_record->insert_grouped_field(
|
|
MARC::Field->new( 100, '', '', "a" => $string ) );
|
|
}
|
|
Koha::Logger->get->debug("encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 ));
|
|
} else {
|
|
warn "Unrecognized marcflavour: $marc_flavour";
|
|
}
|
|
}
|
|
|
|
=head2 StripNonXmlChars
|
|
|
|
my $new_str = StripNonXmlChars($old_str);
|
|
|
|
Given a string, return a copy with the
|
|
characters that are illegal in XML
|
|
removed.
|
|
|
|
This function exists to work around a problem
|
|
that can occur with badly-encoded MARC records.
|
|
Specifically, if a UTF-8 MARC record also
|
|
has excape (\x1b) characters, MARC::File::XML
|
|
will let the escape characters pass through
|
|
when as_xml() or as_xml_record() is called. The
|
|
problem is that the escape character is not
|
|
legal in well-formed XML documents, so when
|
|
MARC::File::XML attempts to parse such a record,
|
|
the XML parser will fail.
|
|
|
|
Stripping such characters will allow a
|
|
MARC::Record->new_from_xml()
|
|
to work, at the possible risk of some data loss.
|
|
|
|
=cut
|
|
|
|
sub StripNonXmlChars {
|
|
my $str = shift;
|
|
if (!defined($str) || $str eq ""){
|
|
return "";
|
|
}
|
|
$str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
|
|
return $str;
|
|
}
|
|
|
|
|
|
|
|
=head2 nsb_clean
|
|
|
|
=over 4
|
|
|
|
nsb_clean($string);
|
|
|
|
=back
|
|
|
|
Removes Non Sorting Block characters
|
|
|
|
=cut
|
|
sub nsb_clean {
|
|
my $NSB = '\x88' ; # NSB : begin Non Sorting Block
|
|
my $NSE = '\x89' ; # NSE : Non Sorting Block end
|
|
my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
|
|
my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
|
|
my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
|
|
|
|
# handles non sorting blocks
|
|
my ($string) = @_ ;
|
|
$_ = $string ;
|
|
s/($C2){0,1}($NSB|$NSB2)//g ;
|
|
s/($C2){0,1}($NSE|$NSE2)//g ;
|
|
$string = $_ ;
|
|
|
|
return($string) ;
|
|
}
|
|
|
|
|
|
=head2 SanitizeRecord
|
|
|
|
SanitizeRecord($marcrecord);
|
|
|
|
Sanitize a record
|
|
This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
|
|
It cleans any string with '&amp;...', replacing it by '&'
|
|
|
|
=cut
|
|
|
|
sub SanitizeRecord {
|
|
my ( $record, $biblionumber ) = @_;
|
|
my $string;
|
|
my $record_modified = 0;
|
|
my $frameworkcode = C4::Biblio::GetFrameworkCode($biblionumber);
|
|
my ( $url_field, $url_subfield ) =
|
|
C4::Biblio::GetMarcFromKohaField( 'biblioitems.url' );
|
|
foreach my $field ( $record->fields() ) {
|
|
if ( $field->is_control_field() ) {
|
|
my $value = $field->data();
|
|
my $sanitized_value = _clean_ampersand($value);
|
|
$record_modified = 1 if $sanitized_value ne $value;
|
|
$field->update($sanitized_value);
|
|
}
|
|
else {
|
|
my @subfields = $field->subfields();
|
|
my @new_subfields;
|
|
foreach my $subfield (@subfields) {
|
|
next
|
|
if $url_field eq $field->tag()
|
|
and $url_subfield eq $subfield->[0];
|
|
my $value = $subfield->[1];
|
|
my $sanitized_value = _clean_ampersand($value);
|
|
push @new_subfields, $subfield->[0] => $sanitized_value;
|
|
$record_modified = 1 if $sanitized_value ne $value;
|
|
}
|
|
if ( scalar(@new_subfields) > 0 ) {
|
|
my $new_field = eval {
|
|
MARC::Field->new(
|
|
$field->tag(), $field->indicator(1),
|
|
$field->indicator(2), @new_subfields
|
|
);
|
|
};
|
|
if ($@) {
|
|
warn "error : $@";
|
|
}
|
|
else {
|
|
$field->replace_with($new_field);
|
|
}
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
return $record, $record_modified;
|
|
}
|
|
|
|
sub _clean_ampersand {
|
|
my ($string) = @_;
|
|
$string =~ s/(&)(amp;)+/$1/g;
|
|
return $string;
|
|
}
|
|
|
|
=head1 INTERNAL FUNCTIONS
|
|
|
|
=head2 _default_marc21_charconv_to_utf8
|
|
|
|
my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
|
|
|
|
Converts a C<MARC::Record> of unknown character set to UTF-8,
|
|
first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
|
|
to UTF-8, then a default conversion that replaces each non-ASCII
|
|
character with the replacement character.
|
|
|
|
The C<$guessed_charset> return value contains the character set
|
|
that resulted in a conversion to valid UTF-8; note that
|
|
if the MARC-8 and ISO-8859-1 conversions failed, the value of
|
|
this is 'failed'.
|
|
|
|
=cut
|
|
|
|
sub _default_marc21_charconv_to_utf8 {
|
|
my $marc_record = shift;
|
|
my $marc_flavour = shift;
|
|
|
|
my $trial_marc8 = $marc_record->clone();
|
|
my @all_errors = ();
|
|
my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
|
|
unless (@errors) {
|
|
return $trial_marc8, 'MARC-8', [];
|
|
}
|
|
push @all_errors, @errors;
|
|
|
|
my $trial_8859_1 = $marc_record->clone();
|
|
@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
|
|
unless (@errors) {
|
|
return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
|
|
# instead if we wanted to report details
|
|
# of the failed attempt at MARC-8 => UTF-8
|
|
}
|
|
push @all_errors, @errors;
|
|
|
|
my $default_converted = $marc_record->clone();
|
|
_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
|
|
return $default_converted, 'failed', \@all_errors;
|
|
}
|
|
|
|
=head2 _default_unimarc_charconv_to_utf8
|
|
|
|
my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
|
|
|
|
Converts a C<MARC::Record> of unknown character set to UTF-8,
|
|
first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
|
|
to UTF-8, then a default conversion that replaces each non-ASCII
|
|
character with the replacement character.
|
|
|
|
The C<$guessed_charset> return value contains the character set
|
|
that resulted in a conversion to valid UTF-8; note that
|
|
if the MARC-8 and ISO-8859-1 conversions failed, the value of
|
|
this is 'failed'.
|
|
|
|
=cut
|
|
|
|
sub _default_unimarc_charconv_to_utf8 {
|
|
my $marc_record = shift;
|
|
my $marc_flavour = shift;
|
|
|
|
my $trial_marc8 = $marc_record->clone();
|
|
my @all_errors = ();
|
|
my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
|
|
unless (@errors) {
|
|
return $trial_marc8, 'iso-5426';
|
|
}
|
|
push @all_errors, @errors;
|
|
|
|
my $trial_8859_1 = $marc_record->clone();
|
|
@errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
|
|
unless (@errors) {
|
|
return $trial_8859_1, 'iso-8859-1';
|
|
}
|
|
push @all_errors, @errors;
|
|
|
|
my $default_converted = $marc_record->clone();
|
|
_marc_to_utf8_replacement_char($default_converted, $marc_flavour);
|
|
return $default_converted, 'failed', \@all_errors;
|
|
}
|
|
|
|
=head2 _marc_marc8_to_utf8
|
|
|
|
my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
|
|
|
|
Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
|
|
If the conversion fails for some reason, an
|
|
appropriate messages will be placed in the returned
|
|
C<@errors> array.
|
|
|
|
=cut
|
|
|
|
sub _marc_marc8_to_utf8 {
|
|
my $marc_record = shift;
|
|
my $marc_flavour = shift;
|
|
|
|
my $prev_ignore = MARC::Charset->ignore_errors();
|
|
MARC::Charset->ignore_errors(1);
|
|
|
|
# trap warnings raised by MARC::Charset
|
|
my @errors = ();
|
|
local $SIG{__WARN__} = sub {
|
|
my $msg = $_[0];
|
|
if ($msg =~ /MARC.Charset/) {
|
|
# FIXME - purpose of this regexp is to strip out the
|
|
# line reference to MARC/Charset.pm, but as it
|
|
# exists probably won't work quite on Windows --
|
|
# some sort of minimal-bunch back-tracking RE
|
|
# would be helpful here
|
|
$msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
|
|
push @errors, $msg;
|
|
} else {
|
|
# if warning doesn't come from MARC::Charset, just
|
|
# pass it on
|
|
warn $msg;
|
|
}
|
|
};
|
|
|
|
foreach my $field ($marc_record->fields()) {
|
|
if ($field->is_control_field()) {
|
|
; # do nothing -- control fields should not contain non-ASCII characters
|
|
} else {
|
|
my @converted_subfields;
|
|
foreach my $subfield ($field->subfields()) {
|
|
my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
|
|
unless (IsStringUTF8ish($utf8sf)) {
|
|
# Because of a bug in MARC::Charset 0.98, if the string
|
|
# has (a) one or more diacritics that (b) are only in character positions
|
|
# 128 to 255 inclusive, the resulting converted string is not in
|
|
# UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
|
|
# occurs, upgrade the string in place. Moral of the story seems to be
|
|
# that pack("U", ...) is better than chr(...) if you need to guarantee
|
|
# that the resulting string is UTF-8.
|
|
$utf8sf = Encode::encode('UTF-8', $utf8sf);
|
|
}
|
|
push @converted_subfields, $subfield->[0], $utf8sf;
|
|
}
|
|
|
|
$field->replace_with(MARC::Field->new(
|
|
$field->tag(), $field->indicator(1), $field->indicator(2),
|
|
@converted_subfields)
|
|
);
|
|
}
|
|
}
|
|
|
|
MARC::Charset->ignore_errors($prev_ignore);
|
|
|
|
SetMarcUnicodeFlag($marc_record, $marc_flavour);
|
|
|
|
return @errors;
|
|
}
|
|
|
|
=head2 _marc_iso5426_to_utf8
|
|
|
|
my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
|
|
|
|
Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
|
|
If the conversion fails for some reason, an
|
|
appropriate messages will be placed in the returned
|
|
C<@errors> array.
|
|
|
|
FIXME - is ISO-5426 equivalent enough to MARC-8
|
|
that C<MARC::Charset> can be used instead?
|
|
|
|
=cut
|
|
|
|
sub _marc_iso5426_to_utf8 {
|
|
my $marc_record = shift;
|
|
my $marc_flavour = shift;
|
|
|
|
my @errors = ();
|
|
|
|
foreach my $field ($marc_record->fields()) {
|
|
if ($field->is_control_field()) {
|
|
; # do nothing -- control fields should not contain non-ASCII characters
|
|
} else {
|
|
my @converted_subfields;
|
|
foreach my $subfield ($field->subfields()) {
|
|
my $utf8sf = char_decode5426($subfield->[1]);
|
|
push @converted_subfields, $subfield->[0], $utf8sf;
|
|
}
|
|
|
|
$field->replace_with(MARC::Field->new(
|
|
$field->tag(), $field->indicator(1), $field->indicator(2),
|
|
@converted_subfields)
|
|
);
|
|
}
|
|
}
|
|
|
|
SetMarcUnicodeFlag($marc_record, $marc_flavour);
|
|
|
|
return @errors;
|
|
}
|
|
|
|
=head2 _marc_to_utf8_via_text_iconv
|
|
|
|
my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
|
|
|
|
Convert a C<MARC::Record> to UTF-8 in-place using the
|
|
C<Text::Iconv> CPAN module. Any source encoding accepted
|
|
by the user's iconv installation should work. If
|
|
the source encoding is not recognized on the user's
|
|
server or the conversion fails for some reason,
|
|
appropriate messages will be placed in the returned
|
|
C<@errors> array.
|
|
|
|
=cut
|
|
|
|
sub _marc_to_utf8_via_text_iconv {
|
|
my $marc_record = shift;
|
|
my $marc_flavour = shift;
|
|
my $source_encoding = shift;
|
|
|
|
my @errors = ();
|
|
my $decoder;
|
|
eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
|
|
if ($@) {
|
|
push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
|
|
return @errors;
|
|
}
|
|
|
|
my $prev_raise_error = Text::Iconv->raise_error();
|
|
Text::Iconv->raise_error(1);
|
|
|
|
foreach my $field ($marc_record->fields()) {
|
|
if ($field->is_control_field()) {
|
|
; # do nothing -- control fields should not contain non-ASCII characters
|
|
} else {
|
|
my @converted_subfields;
|
|
foreach my $subfield ($field->subfields()) {
|
|
my $converted_value;
|
|
my $conversion_ok = 1;
|
|
eval { $converted_value = $decoder->convert($subfield->[1]); };
|
|
if ($@) {
|
|
$conversion_ok = 0;
|
|
push @errors, $@;
|
|
} elsif (not defined $converted_value) {
|
|
$conversion_ok = 0;
|
|
push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
|
|
}
|
|
|
|
if ($conversion_ok) {
|
|
push @converted_subfields, $subfield->[0], $converted_value;
|
|
} else {
|
|
$converted_value = $subfield->[1];
|
|
$converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
|
|
push @converted_subfields, $subfield->[0], $converted_value;
|
|
}
|
|
}
|
|
|
|
$field->replace_with(MARC::Field->new(
|
|
$field->tag(), $field->indicator(1), $field->indicator(2),
|
|
@converted_subfields)
|
|
);
|
|
}
|
|
}
|
|
|
|
SetMarcUnicodeFlag($marc_record, $marc_flavour);
|
|
Text::Iconv->raise_error($prev_raise_error);
|
|
|
|
return @errors;
|
|
}
|
|
|
|
=head2 _marc_to_utf8_replacement_char
|
|
|
|
_marc_to_utf8_replacement_char($marc_record, $marc_flavour);
|
|
|
|
Convert a C<MARC::Record> to UTF-8 in-place, adopting the
|
|
unsatisfactory method of replacing all non-ASCII (e.g.,
|
|
where the eight bit is set) octet with the Unicode
|
|
replacement character. This is meant as a last-ditch
|
|
method, and would be best used as part of a UI that
|
|
lets a cataloguer pick various character conversions
|
|
until they find the right one.
|
|
|
|
=cut
|
|
|
|
sub _marc_to_utf8_replacement_char {
|
|
my $marc_record = shift;
|
|
my $marc_flavour = shift;
|
|
|
|
foreach my $field ($marc_record->fields()) {
|
|
if ($field->is_control_field()) {
|
|
; # do nothing -- control fields should not contain non-ASCII characters
|
|
} else {
|
|
my @converted_subfields;
|
|
foreach my $subfield ($field->subfields()) {
|
|
my $value = $subfield->[1];
|
|
$value =~ s/[\200-\377]/\xef\xbf\xbd/g;
|
|
push @converted_subfields, $subfield->[0], $value;
|
|
}
|
|
|
|
$field->replace_with(MARC::Field->new(
|
|
$field->tag(), $field->indicator(1), $field->indicator(2),
|
|
@converted_subfields)
|
|
);
|
|
}
|
|
}
|
|
|
|
SetMarcUnicodeFlag($marc_record, $marc_flavour);
|
|
}
|
|
|
|
=head2 char_decode5426
|
|
|
|
my $utf8string = char_decode5426($iso_5426_string);
|
|
|
|
Converts a string from ISO-5426 to UTF-8.
|
|
|
|
=cut
|
|
|
|
|
|
my %chars;
|
|
|
|
####
|
|
## 0xb
|
|
$chars{0xb0}=0x0101;#3/0ayn[ain]
|
|
$chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
|
|
#$chars{0xb2}=0x00e0;#'à';
|
|
$chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
|
|
#$chars{0xb3}=0x00e7;#'ç';
|
|
$chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
|
|
# $chars{0xb4}='è';
|
|
$chars{0xb4}=0x00e8;
|
|
# $chars{0xb5}='é';
|
|
$chars{0xb5}=0x00e9;
|
|
$chars{0xb6}=0x2021; # double dagger
|
|
$chars{0xb7}=0x00b7; # middle dot
|
|
$chars{0xb8}=0x2033; # double prime
|
|
$chars{0xb9}=0x2019; # right single quotation mark
|
|
$chars{0xba}=0x201d; # right double quotation mark
|
|
$chars{0xbb}=0x00bb; # right-pointing double angle quotation mark
|
|
$chars{0xbc}=0x266f; # music sharp sign
|
|
$chars{0xbd}=0x02b9; # modifier letter prime
|
|
$chars{0xbe}=0x02ba; # modifier letter double prime
|
|
$chars{0xbf}=0x00bf; # inverted question mark
|
|
|
|
####
|
|
## 0xe
|
|
$chars{0xe1}=0x00c6; # latin capital letter ae
|
|
$chars{0xe2}=0x0110; # latin capital letter d with stroke
|
|
$chars{0xe6}=0x0132; # latin capital ligature ij
|
|
$chars{0xe8}=0x0141; # latin capital letter l with stroke
|
|
$chars{0xe9}=0x00d8; # latin capital letter o with stroke
|
|
$chars{0xea}=0x0152; # latin capital ligature oe
|
|
$chars{0xec}=0x00de; # latin capital letter thorn
|
|
|
|
####
|
|
## 0xf
|
|
$chars{0xf1}=0x00e6; # latin small letter ae
|
|
$chars{0xf2}=0x0111; # latin small letter d with stroke
|
|
$chars{0xf3}=0x00f0; # latin small letter eth
|
|
$chars{0xf5}=0x0131; # latin small letter dotless i
|
|
$chars{0xf6}=0x0133; # latin small ligature ij
|
|
$chars{0xf8}=0x0142; # latin small letter l with stroke
|
|
$chars{0xf9}=0x00f8; # latin small letter o with stroke
|
|
$chars{0xfa}=0x0153; # latin small ligature oe
|
|
$chars{0xfb}=0x00df; # latin small letter sharp s
|
|
$chars{0xfc}=0x00fe; # latin small letter thorn
|
|
|
|
####
|
|
## Others
|
|
$chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
|
|
$chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
|
|
#$chars{0x81d1}=0x00b0; # FIXME useless
|
|
|
|
####
|
|
## combined characters iso5426
|
|
|
|
$chars{0xc041}=0x1ea2; # capital a with hook above
|
|
$chars{0xc045}=0x1eba; # capital e with hook above
|
|
$chars{0xc049}=0x1ec8; # capital i with hook above
|
|
$chars{0xc04f}=0x1ece; # capital o with hook above
|
|
$chars{0xc055}=0x1ee6; # capital u with hook above
|
|
$chars{0xc059}=0x1ef6; # capital y with hook above
|
|
$chars{0xc061}=0x1ea3; # small a with hook above
|
|
$chars{0xc065}=0x1ebb; # small e with hook above
|
|
$chars{0xc069}=0x1ec9; # small i with hook above
|
|
$chars{0xc06f}=0x1ecf; # small o with hook above
|
|
$chars{0xc075}=0x1ee7; # small u with hook above
|
|
$chars{0xc079}=0x1ef7; # small y with hook above
|
|
|
|
# 4/1 grave accent
|
|
$chars{0xc141}=0x00c0; # capital a with grave accent
|
|
$chars{0xc145}=0x00c8; # capital e with grave accent
|
|
$chars{0xc149}=0x00cc; # capital i with grave accent
|
|
$chars{0xc14f}=0x00d2; # capital o with grave accent
|
|
$chars{0xc155}=0x00d9; # capital u with grave accent
|
|
$chars{0xc157}=0x1e80; # capital w with grave
|
|
$chars{0xc159}=0x1ef2; # capital y with grave
|
|
$chars{0xc161}=0x00e0; # small a with grave accent
|
|
$chars{0xc165}=0x00e8; # small e with grave accent
|
|
$chars{0xc169}=0x00ec; # small i with grave accent
|
|
$chars{0xc16f}=0x00f2; # small o with grave accent
|
|
$chars{0xc175}=0x00f9; # small u with grave accent
|
|
$chars{0xc177}=0x1e81; # small w with grave
|
|
$chars{0xc179}=0x1ef3; # small y with grave
|
|
# 4/2 acute accent
|
|
$chars{0xc241}=0x00c1; # capital a with acute accent
|
|
$chars{0xc243}=0x0106; # capital c with acute accent
|
|
$chars{0xc245}=0x00c9; # capital e with acute accent
|
|
$chars{0xc247}=0x01f4; # capital g with acute
|
|
$chars{0xc249}=0x00cd; # capital i with acute accent
|
|
$chars{0xc24b}=0x1e30; # capital k with acute
|
|
$chars{0xc24c}=0x0139; # capital l with acute accent
|
|
$chars{0xc24d}=0x1e3e; # capital m with acute
|
|
$chars{0xc24e}=0x0143; # capital n with acute accent
|
|
$chars{0xc24f}=0x00d3; # capital o with acute accent
|
|
$chars{0xc250}=0x1e54; # capital p with acute
|
|
$chars{0xc252}=0x0154; # capital r with acute accent
|
|
$chars{0xc253}=0x015a; # capital s with acute accent
|
|
$chars{0xc255}=0x00da; # capital u with acute accent
|
|
$chars{0xc257}=0x1e82; # capital w with acute
|
|
$chars{0xc259}=0x00dd; # capital y with acute accent
|
|
$chars{0xc25a}=0x0179; # capital z with acute accent
|
|
$chars{0xc261}=0x00e1; # small a with acute accent
|
|
$chars{0xc263}=0x0107; # small c with acute accent
|
|
$chars{0xc265}=0x00e9; # small e with acute accent
|
|
$chars{0xc267}=0x01f5; # small g with acute
|
|
$chars{0xc269}=0x00ed; # small i with acute accent
|
|
$chars{0xc26b}=0x1e31; # small k with acute
|
|
$chars{0xc26c}=0x013a; # small l with acute accent
|
|
$chars{0xc26d}=0x1e3f; # small m with acute
|
|
$chars{0xc26e}=0x0144; # small n with acute accent
|
|
$chars{0xc26f}=0x00f3; # small o with acute accent
|
|
$chars{0xc270}=0x1e55; # small p with acute
|
|
$chars{0xc272}=0x0155; # small r with acute accent
|
|
$chars{0xc273}=0x015b; # small s with acute accent
|
|
$chars{0xc275}=0x00fa; # small u with acute accent
|
|
$chars{0xc277}=0x1e83; # small w with acute
|
|
$chars{0xc279}=0x00fd; # small y with acute accent
|
|
$chars{0xc27a}=0x017a; # small z with acute accent
|
|
$chars{0xc2e1}=0x01fc; # capital ae with acute
|
|
$chars{0xc2f1}=0x01fd; # small ae with acute
|
|
# 4/3 circumflex accent
|
|
$chars{0xc341}=0x00c2; # capital a with circumflex accent
|
|
$chars{0xc343}=0x0108; # capital c with circumflex
|
|
$chars{0xc345}=0x00ca; # capital e with circumflex accent
|
|
$chars{0xc347}=0x011c; # capital g with circumflex
|
|
$chars{0xc348}=0x0124; # capital h with circumflex
|
|
$chars{0xc349}=0x00ce; # capital i with circumflex accent
|
|
$chars{0xc34a}=0x0134; # capital j with circumflex
|
|
$chars{0xc34f}=0x00d4; # capital o with circumflex accent
|
|
$chars{0xc353}=0x015c; # capital s with circumflex
|
|
$chars{0xc355}=0x00db; # capital u with circumflex
|
|
$chars{0xc357}=0x0174; # capital w with circumflex
|
|
$chars{0xc359}=0x0176; # capital y with circumflex
|
|
$chars{0xc35a}=0x1e90; # capital z with circumflex
|
|
$chars{0xc361}=0x00e2; # small a with circumflex accent
|
|
$chars{0xc363}=0x0109; # small c with circumflex
|
|
$chars{0xc365}=0x00ea; # small e with circumflex accent
|
|
$chars{0xc367}=0x011d; # small g with circumflex
|
|
$chars{0xc368}=0x0125; # small h with circumflex
|
|
$chars{0xc369}=0x00ee; # small i with circumflex accent
|
|
$chars{0xc36a}=0x0135; # small j with circumflex
|
|
$chars{0xc36e}=0x00f1; # small n with tilde
|
|
$chars{0xc36f}=0x00f4; # small o with circumflex accent
|
|
$chars{0xc373}=0x015d; # small s with circumflex
|
|
$chars{0xc375}=0x00fb; # small u with circumflex
|
|
$chars{0xc377}=0x0175; # small w with circumflex
|
|
$chars{0xc379}=0x0177; # small y with circumflex
|
|
$chars{0xc37a}=0x1e91; # small z with circumflex
|
|
# 4/4 tilde
|
|
$chars{0xc441}=0x00c3; # capital a with tilde
|
|
$chars{0xc445}=0x1ebc; # capital e with tilde
|
|
$chars{0xc449}=0x0128; # capital i with tilde
|
|
$chars{0xc44e}=0x00d1; # capital n with tilde
|
|
$chars{0xc44f}=0x00d5; # capital o with tilde
|
|
$chars{0xc455}=0x0168; # capital u with tilde
|
|
$chars{0xc456}=0x1e7c; # capital v with tilde
|
|
$chars{0xc459}=0x1ef8; # capital y with tilde
|
|
$chars{0xc461}=0x00e3; # small a with tilde
|
|
$chars{0xc465}=0x1ebd; # small e with tilde
|
|
$chars{0xc469}=0x0129; # small i with tilde
|
|
$chars{0xc46e}=0x00f1; # small n with tilde
|
|
$chars{0xc46f}=0x00f5; # small o with tilde
|
|
$chars{0xc475}=0x0169; # small u with tilde
|
|
$chars{0xc476}=0x1e7d; # small v with tilde
|
|
$chars{0xc479}=0x1ef9; # small y with tilde
|
|
# 4/5 macron
|
|
$chars{0xc541}=0x0100; # capital a with macron
|
|
$chars{0xc545}=0x0112; # capital e with macron
|
|
$chars{0xc547}=0x1e20; # capital g with macron
|
|
$chars{0xc549}=0x012a; # capital i with macron
|
|
$chars{0xc54f}=0x014c; # capital o with macron
|
|
$chars{0xc555}=0x016a; # capital u with macron
|
|
$chars{0xc561}=0x0101; # small a with macron
|
|
$chars{0xc565}=0x0113; # small e with macron
|
|
$chars{0xc567}=0x1e21; # small g with macron
|
|
$chars{0xc569}=0x012b; # small i with macron
|
|
$chars{0xc56f}=0x014d; # small o with macron
|
|
$chars{0xc575}=0x016b; # small u with macron
|
|
$chars{0xc572}=0x0159; # small r with macron
|
|
$chars{0xc5e1}=0x01e2; # capital ae with macron
|
|
$chars{0xc5f1}=0x01e3; # small ae with macron
|
|
# 4/6 breve
|
|
$chars{0xc641}=0x0102; # capital a with breve
|
|
$chars{0xc645}=0x0114; # capital e with breve
|
|
$chars{0xc647}=0x011e; # capital g with breve
|
|
$chars{0xc649}=0x012c; # capital i with breve
|
|
$chars{0xc64f}=0x014e; # capital o with breve
|
|
$chars{0xc655}=0x016c; # capital u with breve
|
|
$chars{0xc661}=0x0103; # small a with breve
|
|
$chars{0xc665}=0x0115; # small e with breve
|
|
$chars{0xc667}=0x011f; # small g with breve
|
|
$chars{0xc669}=0x012d; # small i with breve
|
|
$chars{0xc66f}=0x014f; # small o with breve
|
|
$chars{0xc675}=0x016d; # small u with breve
|
|
# 4/7 dot above
|
|
$chars{0xc7b0}=0x01e1; # Ain with dot above
|
|
$chars{0xc742}=0x1e02; # capital b with dot above
|
|
$chars{0xc743}=0x010a; # capital c with dot above
|
|
$chars{0xc744}=0x1e0a; # capital d with dot above
|
|
$chars{0xc745}=0x0116; # capital e with dot above
|
|
$chars{0xc746}=0x1e1e; # capital f with dot above
|
|
$chars{0xc747}=0x0120; # capital g with dot above
|
|
$chars{0xc748}=0x1e22; # capital h with dot above
|
|
$chars{0xc749}=0x0130; # capital i with dot above
|
|
$chars{0xc74d}=0x1e40; # capital m with dot above
|
|
$chars{0xc74e}=0x1e44; # capital n with dot above
|
|
$chars{0xc750}=0x1e56; # capital p with dot above
|
|
$chars{0xc752}=0x1e58; # capital r with dot above
|
|
$chars{0xc753}=0x1e60; # capital s with dot above
|
|
$chars{0xc754}=0x1e6a; # capital t with dot above
|
|
$chars{0xc757}=0x1e86; # capital w with dot above
|
|
$chars{0xc758}=0x1e8a; # capital x with dot above
|
|
$chars{0xc759}=0x1e8e; # capital y with dot above
|
|
$chars{0xc75a}=0x017b; # capital z with dot above
|
|
$chars{0xc761}=0x0227; # small b with dot above
|
|
$chars{0xc762}=0x1e03; # small b with dot above
|
|
$chars{0xc763}=0x010b; # small c with dot above
|
|
$chars{0xc764}=0x1e0b; # small d with dot above
|
|
$chars{0xc765}=0x0117; # small e with dot above
|
|
$chars{0xc766}=0x1e1f; # small f with dot above
|
|
$chars{0xc767}=0x0121; # small g with dot above
|
|
$chars{0xc768}=0x1e23; # small h with dot above
|
|
$chars{0xc76d}=0x1e41; # small m with dot above
|
|
$chars{0xc76e}=0x1e45; # small n with dot above
|
|
$chars{0xc770}=0x1e57; # small p with dot above
|
|
$chars{0xc772}=0x1e59; # small r with dot above
|
|
$chars{0xc773}=0x1e61; # small s with dot above
|
|
$chars{0xc774}=0x1e6b; # small t with dot above
|
|
$chars{0xc777}=0x1e87; # small w with dot above
|
|
$chars{0xc778}=0x1e8b; # small x with dot above
|
|
$chars{0xc779}=0x1e8f; # small y with dot above
|
|
$chars{0xc77a}=0x017c; # small z with dot above
|
|
# 4/8 trema, diaresis
|
|
$chars{0xc820}=0x00a8; # diaeresis
|
|
$chars{0xc841}=0x00c4; # capital a with diaeresis
|
|
$chars{0xc845}=0x00cb; # capital e with diaeresis
|
|
$chars{0xc848}=0x1e26; # capital h with diaeresis
|
|
$chars{0xc849}=0x00cf; # capital i with diaeresis
|
|
$chars{0xc84f}=0x00d6; # capital o with diaeresis
|
|
$chars{0xc855}=0x00dc; # capital u with diaeresis
|
|
$chars{0xc857}=0x1e84; # capital w with diaeresis
|
|
$chars{0xc858}=0x1e8c; # capital x with diaeresis
|
|
$chars{0xc859}=0x0178; # capital y with diaeresis
|
|
$chars{0xc861}=0x00e4; # small a with diaeresis
|
|
$chars{0xc865}=0x00eb; # small e with diaeresis
|
|
$chars{0xc868}=0x1e27; # small h with diaeresis
|
|
$chars{0xc869}=0x00ef; # small i with diaeresis
|
|
$chars{0xc86f}=0x00f6; # small o with diaeresis
|
|
$chars{0xc874}=0x1e97; # small t with diaeresis
|
|
$chars{0xc875}=0x00fc; # small u with diaeresis
|
|
$chars{0xc877}=0x1e85; # small w with diaeresis
|
|
$chars{0xc878}=0x1e8d; # small x with diaeresis
|
|
$chars{0xc879}=0x00ff; # small y with diaeresis
|
|
# 4/9 umlaut
|
|
$chars{0xc920}=0x00a8; # [diaeresis]
|
|
$chars{0xc961}=0x00e4; # a with umlaut
|
|
$chars{0xc965}=0x00eb; # e with umlaut
|
|
$chars{0xc969}=0x00ef; # i with umlaut
|
|
$chars{0xc96f}=0x00f6; # o with umlaut
|
|
$chars{0xc975}=0x00fc; # u with umlaut
|
|
# 4/10 circle above
|
|
$chars{0xca41}=0x00c5; # capital a with ring above
|
|
$chars{0xcaad}=0x016e; # capital u with ring above
|
|
$chars{0xca61}=0x00e5; # small a with ring above
|
|
$chars{0xca75}=0x016f; # small u with ring above
|
|
$chars{0xca77}=0x1e98; # small w with ring above
|
|
$chars{0xca79}=0x1e99; # small y with ring above
|
|
# 4/11 high comma off centre
|
|
# 4/12 inverted high comma centred
|
|
# 4/13 double acute accent
|
|
$chars{0xcd4f}=0x0150; # capital o with double acute
|
|
$chars{0xcd55}=0x0170; # capital u with double acute
|
|
$chars{0xcd6f}=0x0151; # small o with double acute
|
|
$chars{0xcd75}=0x0171; # small u with double acute
|
|
# 4/14 horn
|
|
$chars{0xce54}=0x01a0; # latin capital letter o with horn
|
|
$chars{0xce55}=0x01af; # latin capital letter u with horn
|
|
$chars{0xce74}=0x01a1; # latin small letter o with horn
|
|
$chars{0xce75}=0x01b0; # latin small letter u with horn
|
|
# 4/15 caron (hacek
|
|
$chars{0xcf41}=0x01cd; # capital a with caron
|
|
$chars{0xcf43}=0x010c; # capital c with caron
|
|
$chars{0xcf44}=0x010e; # capital d with caron
|
|
$chars{0xcf45}=0x011a; # capital e with caron
|
|
$chars{0xcf47}=0x01e6; # capital g with caron
|
|
$chars{0xcf49}=0x01cf; # capital i with caron
|
|
$chars{0xcf4b}=0x01e8; # capital k with caron
|
|
$chars{0xcf4c}=0x013d; # capital l with caron
|
|
$chars{0xcf4e}=0x0147; # capital n with caron
|
|
$chars{0xcf4f}=0x01d1; # capital o with caron
|
|
$chars{0xcf52}=0x0158; # capital r with caron
|
|
$chars{0xcf53}=0x0160; # capital s with caron
|
|
$chars{0xcf54}=0x0164; # capital t with caron
|
|
$chars{0xcf55}=0x01d3; # capital u with caron
|
|
$chars{0xcf5a}=0x017d; # capital z with caron
|
|
$chars{0xcf61}=0x01ce; # small a with caron
|
|
$chars{0xcf63}=0x010d; # small c with caron
|
|
$chars{0xcf64}=0x010f; # small d with caron
|
|
$chars{0xcf65}=0x011b; # small e with caron
|
|
$chars{0xcf67}=0x01e7; # small g with caron
|
|
$chars{0xcf69}=0x01d0; # small i with caron
|
|
$chars{0xcf6a}=0x01f0; # small j with caron
|
|
$chars{0xcf6b}=0x01e9; # small k with caron
|
|
$chars{0xcf6c}=0x013e; # small l with caron
|
|
$chars{0xcf6e}=0x0148; # small n with caron
|
|
$chars{0xcf6f}=0x01d2; # small o with caron
|
|
$chars{0xcf72}=0x0159; # small r with caron
|
|
$chars{0xcf73}=0x0161; # small s with caron
|
|
$chars{0xcf74}=0x0165; # small t with caron
|
|
$chars{0xcf75}=0x01d4; # small u with caron
|
|
$chars{0xcf7a}=0x017e; # small z with caron
|
|
# 5/0 cedilla
|
|
$chars{0xd020}=0x00b8; # cedilla
|
|
$chars{0xd043}=0x00c7; # capital c with cedilla
|
|
$chars{0xd044}=0x1e10; # capital d with cedilla
|
|
$chars{0xd047}=0x0122; # capital g with cedilla
|
|
$chars{0xd048}=0x1e28; # capital h with cedilla
|
|
$chars{0xd04b}=0x0136; # capital k with cedilla
|
|
$chars{0xd04c}=0x013b; # capital l with cedilla
|
|
$chars{0xd04e}=0x0145; # capital n with cedilla
|
|
$chars{0xd052}=0x0156; # capital r with cedilla
|
|
$chars{0xd053}=0x015e; # capital s with cedilla
|
|
$chars{0xd054}=0x0162; # capital t with cedilla
|
|
$chars{0xd063}=0x00e7; # small c with cedilla
|
|
$chars{0xd064}=0x1e11; # small d with cedilla
|
|
$chars{0xd065}=0x0119; # small e with cedilla
|
|
$chars{0xd067}=0x0123; # small g with cedilla
|
|
$chars{0xd068}=0x1e29; # small h with cedilla
|
|
$chars{0xd06b}=0x0137; # small k with cedilla
|
|
$chars{0xd06c}=0x013c; # small l with cedilla
|
|
$chars{0xd06e}=0x0146; # small n with cedilla
|
|
$chars{0xd072}=0x0157; # small r with cedilla
|
|
$chars{0xd073}=0x015f; # small s with cedilla
|
|
$chars{0xd074}=0x0163; # small t with cedilla
|
|
# 5/1 rude
|
|
# 5/2 hook to left
|
|
# 5/3 ogonek (hook to right
|
|
$chars{0xd320}=0x02db; # ogonek
|
|
$chars{0xd341}=0x0104; # capital a with ogonek
|
|
$chars{0xd345}=0x0118; # capital e with ogonek
|
|
$chars{0xd349}=0x012e; # capital i with ogonek
|
|
$chars{0xd34f}=0x01ea; # capital o with ogonek
|
|
$chars{0xd355}=0x0172; # capital u with ogonek
|
|
$chars{0xd361}=0x0105; # small a with ogonek
|
|
$chars{0xd365}=0x0119; # small e with ogonek
|
|
$chars{0xd369}=0x012f; # small i with ogonek
|
|
$chars{0xd36f}=0x01eb; # small o with ogonek
|
|
$chars{0xd375}=0x0173; # small u with ogonek
|
|
# 5/4 circle below
|
|
$chars{0xd441}=0x1e00; # capital a with ring below
|
|
$chars{0xd461}=0x1e01; # small a with ring below
|
|
# 5/5 half circle below
|
|
$chars{0xd548}=0x1e2a; # capital h with breve below
|
|
$chars{0xd568}=0x1e2b; # small h with breve below
|
|
# 5/6 dot below
|
|
$chars{0xd641}=0x1ea0; # capital a with dot below
|
|
$chars{0xd642}=0x1e04; # capital b with dot below
|
|
$chars{0xd644}=0x1e0c; # capital d with dot below
|
|
$chars{0xd645}=0x1eb8; # capital e with dot below
|
|
$chars{0xd648}=0x1e24; # capital h with dot below
|
|
$chars{0xd649}=0x1eca; # capital i with dot below
|
|
$chars{0xd64b}=0x1e32; # capital k with dot below
|
|
$chars{0xd64c}=0x1e36; # capital l with dot below
|
|
$chars{0xd64d}=0x1e42; # capital m with dot below
|
|
$chars{0xd64e}=0x1e46; # capital n with dot below
|
|
$chars{0xd64f}=0x1ecc; # capital o with dot below
|
|
$chars{0xd652}=0x1e5a; # capital r with dot below
|
|
$chars{0xd653}=0x1e62; # capital s with dot below
|
|
$chars{0xd654}=0x1e6c; # capital t with dot below
|
|
$chars{0xd655}=0x1ee4; # capital u with dot below
|
|
$chars{0xd656}=0x1e7e; # capital v with dot below
|
|
$chars{0xd657}=0x1e88; # capital w with dot below
|
|
$chars{0xd659}=0x1ef4; # capital y with dot below
|
|
$chars{0xd65a}=0x1e92; # capital z with dot below
|
|
$chars{0xd661}=0x1ea1; # small a with dot below
|
|
$chars{0xd662}=0x1e05; # small b with dot below
|
|
$chars{0xd664}=0x1e0d; # small d with dot below
|
|
$chars{0xd665}=0x1eb9; # small e with dot below
|
|
$chars{0xd668}=0x1e25; # small h with dot below
|
|
$chars{0xd669}=0x1ecb; # small i with dot below
|
|
$chars{0xd66b}=0x1e33; # small k with dot below
|
|
$chars{0xd66c}=0x1e37; # small l with dot below
|
|
$chars{0xd66d}=0x1e43; # small m with dot below
|
|
$chars{0xd66e}=0x1e47; # small n with dot below
|
|
$chars{0xd66f}=0x1ecd; # small o with dot below
|
|
$chars{0xd672}=0x1e5b; # small r with dot below
|
|
$chars{0xd673}=0x1e63; # small s with dot below
|
|
$chars{0xd674}=0x1e6d; # small t with dot below
|
|
$chars{0xd675}=0x1ee5; # small u with dot below
|
|
$chars{0xd676}=0x1e7f; # small v with dot below
|
|
$chars{0xd677}=0x1e89; # small w with dot below
|
|
$chars{0xd679}=0x1ef5; # small y with dot below
|
|
$chars{0xd67a}=0x1e93; # small z with dot below
|
|
# 5/7 double dot below
|
|
$chars{0xd755}=0x1e72; # capital u with diaeresis below
|
|
$chars{0xd775}=0x1e73; # small u with diaeresis below
|
|
# 5/8 underline
|
|
$chars{0xd820}=0x005f; # underline
|
|
# 5/9 double underline
|
|
$chars{0xd920}=0x2017; # double underline
|
|
# 5/10 small low vertical bar
|
|
$chars{0xda20}=0x02cc; #
|
|
# 5/11 circumflex below
|
|
# 5/12 (this position shall not be used)
|
|
# 5/13 left half of ligature sign and of double tilde
|
|
# 5/14 right half of ligature sign
|
|
# 5/15 right half of double tilde
|
|
# map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
|
|
|
|
sub char_decode5426 {
|
|
my ( $string) = @_;
|
|
my $result;
|
|
|
|
my @data = unpack("C*", $string);
|
|
my @characters;
|
|
my $length=scalar(@data);
|
|
for (my $i = 0; $i < scalar(@data); $i++) {
|
|
my $char= $data[$i];
|
|
if ($char >= 0x00 && $char <= 0x7F){
|
|
#IsAscii
|
|
|
|
push @characters,$char unless ($char<0x02 ||$char== 0x0F);
|
|
}elsif (($char >= 0xC0 && $char <= 0xDF)) {
|
|
#Combined Char
|
|
my $convchar ;
|
|
if ($chars{$char*256+$data[$i+1]}) {
|
|
$convchar= $chars{$char * 256 + $data[$i+1]};
|
|
$i++;
|
|
# printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
|
|
} elsif ($chars{$char}) {
|
|
$convchar= $chars{$char};
|
|
# printf "0xC char %x, converted %x\n",$char,$chars{$char};
|
|
}else {
|
|
$convchar=$char;
|
|
}
|
|
push @characters,$convchar;
|
|
} else {
|
|
my $convchar;
|
|
if ($chars{$char}) {
|
|
$convchar= $chars{$char};
|
|
# printf "char %x, converted %x\n",$char,$chars{$char};
|
|
}else {
|
|
# printf "char %x $char\n",$char;
|
|
$convchar=$char;
|
|
}
|
|
push @characters,$convchar;
|
|
}
|
|
}
|
|
$result=pack "U*",@characters;
|
|
# $result=~s/\x01//;
|
|
# $result=~s/\x00//;
|
|
$result=~s/\x0f//;
|
|
$result=~s/\x1b.//;
|
|
$result=~s/\x0e//;
|
|
$result=~s/\x1b\x5b//;
|
|
# map{printf "%x",$_} @characters;
|
|
# printf "\n";
|
|
return $result;
|
|
}
|
|
|
|
1;
|
|
|
|
|
|
=head1 AUTHOR
|
|
|
|
Koha Development Team <http://koha-community.org/>
|
|
|
|
Galen Charlton <galen.charlton@liblime.com>
|
|
|
|
=cut
|