ea50c2acb6
in ISO-8859-1. A new C4::Charset module (tentative name) has been created to guess the charset of a piece of HTML markup. The CGI programs will be modified to use this module as they are encountered during translation.
109 lines
2.7 KiB
Perl
109 lines
2.7 KiB
Perl
package C4::Charset;
|
|
|
|
# $Id$
|
|
|
|
#package to work around problems in HTTP headers
|
|
# Note: This is just a utility module; it should not be instantiated.
|
|
|
|
|
|
# Copyright 2003 Katipo Communications
|
|
#
|
|
# This file is part of Koha.
|
|
#
|
|
# Koha is free software; you can redistribute it and/or modify it under the
|
|
# terms of the GNU General Public License as published by the Free Software
|
|
# Foundation; either version 2 of the License, or (at your option) any later
|
|
# version.
|
|
#
|
|
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
|
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along with
|
|
# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
|
|
# Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
use strict;
|
|
require Exporter;
|
|
|
|
use vars qw($VERSION @ISA @EXPORT);
|
|
|
|
# set the version for version checking
|
|
$VERSION = 0.01;
|
|
|
|
=head1 NAME
|
|
|
|
C4::Charset - Functions for handling charsets in HTML pages
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use C4::Charset;
|
|
|
|
print $query->header(-type => C4::Charset::gettype($output)), $output;
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
The functions in this module peek into a piece of HTML and return strings
|
|
related to the (guessed) charset.
|
|
|
|
=head1 FUNCTIONS
|
|
|
|
=over 2
|
|
|
|
=cut
|
|
|
|
@ISA = qw(Exporter);
|
|
@EXPORT = qw(
|
|
&guesscharset
|
|
&guesstype
|
|
);
|
|
|
|
=pod
|
|
|
|
&guesscharset($output)
|
|
|
|
"Guesses" the charset from the some HTML that would be output.
|
|
|
|
C<$output> is the HTML page to be output. If it contains a META tag
|
|
with a Content-Type, the tag will be scanned for a language code.
|
|
This code is returned if it is found; undef is returned otherwise.
|
|
|
|
This function only does sloppy guessing; it will be confused by
|
|
unexpected things like SGML comments. What it basically does is to
|
|
grab something that looks like a META tag and scan it.
|
|
|
|
=cut
|
|
|
|
sub guesscharset ($) {
|
|
my($html) = @_;
|
|
my $charset = undef;
|
|
local($`, $&, $', $1, $2, $3);
|
|
# FIXME... These regular expressions will miss a lot of valid tags!
|
|
if ($html =~ /<meta\s+http-equiv=(["']?)Content-Type\1\s+content=(["'])text\/html\s*;\s*charset=([^\2\s\r\n]+)\2\s*(?:\/?)>/is) {
|
|
$charset = $3;
|
|
} elsif ($html =~ /<meta\s+content=(["'])text\/html\s*;\s*charset=([^\1\s\r\n]+)\1\s+http-equiv=(["']?)Content-Type\3\s*(?:\/?)>/is) {
|
|
$charset = $2;
|
|
}
|
|
return $charset;
|
|
} # guess
|
|
|
|
sub guesstype ($) {
|
|
my($html) = @_;
|
|
my $charset = guesscharset($html);
|
|
return defined $charset? "text/html; charset=$charset": "text/html";
|
|
}
|
|
|
|
#---------------------------------
|
|
|
|
END { } # module clean-up code here (global destructor)
|
|
|
|
1;
|
|
__END__
|
|
|
|
=back
|
|
|
|
=head1 AUTHOR
|
|
|
|
Koha Developement team <info@koha.org>
|
|
|
|
=cut
|