From b2138f5d0d0a8f1a9586a11af4dbbb5253b08ff6 Mon Sep 17 00:00:00 2001 From: acli Date: Sun, 22 Feb 2004 05:18:52 +0000 Subject: [PATCH] Handle the iso8859-1 charset somewhat, so that when the po file is in either iso8859-1 or utf8, msgmerge(1) won't crap out. The code is ugly; the conversion table is hard-coded, and in some places not very appropriate. However, this does fix the case where a few strings containing French characters can't be translated. As a side effect, tmpl_process3 can now also be used for French or other languages using iso8859-1. --- misc/translator/TmplTokenizer.pm | 10 ++- misc/translator/tmpl_process3.pl | 45 +++++++++-- misc/translator/xgettext.pl | 124 +++++++++++++++++++++++++++---- 3 files changed, 158 insertions(+), 21 deletions(-) diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index fdd1ee84cc..43d3ee31b1 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -539,7 +539,7 @@ sub quote_po ($) { return "\"$s\""; } -# Complication function that shouldn't be here +# Some functions that shouldn't be here... 
should be moved out some time sub parametrize ($@) { my($fmt, @params) = @_; my $it = ''; @@ -572,6 +572,14 @@ sub parametrize ($@) { return $it; } +sub charset_canon ($) { + my($charset) = @_; + $charset = uc($charset); + $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i; + $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case + return $charset; +} + ############################################################################### =pod diff --git a/misc/translator/tmpl_process3.pl b/misc/translator/tmpl_process3.pl index 4653c1ad93..82cd768803 100755 --- a/misc/translator/tmpl_process3.pl +++ b/misc/translator/tmpl_process3.pl @@ -26,6 +26,7 @@ use vars qw( $recursive_p ); use vars qw( $pedantic_p ); use vars qw( $href ); use vars qw( $type ); # file extension (DOS form without the dot) to match +use vars qw( $charset_in $charset_out ); ############################################################################### @@ -98,7 +99,6 @@ sub text_replace (**) { } } -# FIXME: Should we use the GNOME convention of using POTFILES.in instead? sub listfiles ($$) { my($dir, $type) = @_; my @it = (); @@ -146,6 +146,12 @@ GetOptions( VerboseWarnings::set_application_name $0; VerboseWarnings::set_pedantic_mode $pedantic_p; +# keep the buggy Locale::PO quiet if it says stupid things +$SIG{__WARN__} = sub { + my($s) = @_; + print STDERR $s unless $s =~ /^Strange line in [^:]+: #~/s + }; + my $action = shift or usage_error('You must specify an ACTION.'); usage_error('You must at least specify input and string list filenames.') if !@in_files || !defined $str_file; @@ -179,26 +185,56 @@ if (-d $in_files[0]) { } } +# restores the string list from file +$href = Locale::PO->load_file_ashash($str_file); + +# guess the charsets. 
HTML::Templates defaults to iso-8859-1 +if (defined $href) { + $charset_out = TmplTokenizer::charset_canon $2 + if $href->{'""'}->msgstr =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/; + for my $msgid (keys %$href) { + if ($msgid =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/) { + my $candidate = TmplTokenizer::charset_canon $2; + die "Conflicting charsets in msgid: $charset_in vs $candidate\n" + if defined $charset_in && $charset_in ne $candidate; + $charset_in = $2; + } + } +} +if (!defined $charset_in) { + $charset_in = TmplTokenizer::charset_canon 'iso8859-1'; + warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n"; +} + if ($action eq 'create') { # updates the list. As the list is empty, every entry will be added die "$str_file: Output file already exists" if -f $str_file; my($tmph, $tmpfile) = tmpnam(); + # Generate the temporary file that acts as /POTFILES.in for my $input (@in_files) { print $tmph "$input\n"; } close $tmph; - system {'./xgettext.pl'} ('xgettext.pl', '-s', '-f', $tmpfile, '-o', $str_file); + # Generate the specified po file ($str_file) + system ('xgettext.pl', '-s', '-f', $tmpfile, '-o', $str_file); unlink $tmpfile || warn_normal "$tmpfile: unlink failed: $!\n", undef; } elsif ($action eq 'update') { my($tmph1, $tmpfile1) = tmpnam(); my($tmph2, $tmpfile2) = tmpnam(); close $tmph2; # We just want a name + # Generate the temporary file that acts as /POTFILES.in for my $input (@in_files) { print $tmph1 "$input\n"; } close $tmph1; - system('./xgettext.pl', '-s', '-f', $tmpfile1, '-o', $tmpfile2); + # Generate the temporary file that acts as /.pot + system('./xgettext.pl', '-s', '-f', $tmpfile1, '-o', $tmpfile2, + (defined $charset_in? ('-I', $charset_in): ()), + (defined $charset_out? 
('-O', $charset_out): ())); + # Merge the temporary "pot file" with the specified po file ($str_file) + # FIXME: msgmerge(1) is a Unix dependency + # FIXME: need to check the return value system('msgmerge', '-U', '-s', $str_file, $tmpfile2); unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef; unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef; @@ -221,9 +257,6 @@ if ($action eq 'create') { open(INPUT, "<$str_file") || die "$str_file: $!\n"; close INPUT; - # restores the string list from file - $href = Locale::PO->load_file_ashash($str_file); - # creates the new tmpl file using the new translation for my $input (@in_files) { die "Assertion failed" diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl index a2c611f9f3..e777abfaaf 100755 --- a/misc/translator/xgettext.pl +++ b/misc/translator/xgettext.pl @@ -12,9 +12,50 @@ use Locale::PO; use TmplTokenizer; use VerboseWarnings; +use vars qw( $convert_from ); use vars qw( $files_from $directory $output $sort ); use vars qw( $pedantic_p ); -use vars qw( %text ); +use vars qw( %text %translation ); +use vars qw( $charset_in $charset_out ); + +############################################################################### + +use vars qw( @latin1_utf8 ); +@latin1_utf8 = ( + "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205", + "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213", + "\302\214", "\302\215", undef, undef, "\302\220", "\302\221", + "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227", + "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235", + "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243", + "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251", + "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257", + "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265", + "\302\266", "\302\267", "\302\270", "\302\271", 
"\302\272", "\302\273", + "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201", + "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207", + "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215", + "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223", + "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231", + "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237", + "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245", + "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253", + "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261", + "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267", + "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275", + "\303\276", "\303\277" ); + +sub charset_convert ($) { + my($s) = @_; + if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now + ; + } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') { + $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs; + } elsif ($charset_in ne $charset_out) { + VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef; + } + return $s; +} ############################################################################### @@ -87,6 +128,8 @@ sub generate_strings_list () { sub generate_po_file () { # We don't emit the Plural-Forms header; it's meaningless for us + my $pot_charset = (defined $charset_out? 
$charset_out: 'CHARSET'); + $pot_charset = TmplTokenizer::charset_canon $pot_charset; print OUTPUT <\\n" "Language-Team: LANGUAGE \\n" "MIME-Version: 1.0\\n" -"Content-Type: text/plain; charset=CHARSET\\n" +"Content-Type: text/plain; charset=$pot_charset\\n" "Content-Transfer-Encoding: 8bit\\n" EOF @@ -113,12 +156,53 @@ EOF for my $token (@{$text{$t}}) { my $pathname = $token->pathname; $pathname =~ s/^$directory_re//os; - printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number; + printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number + if defined $pathname && defined $token->line_number; $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED; } printf OUTPUT "#, c-format\n" if $cformat_p; - printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( $t ); - printf OUTPUT "msgstr \"\"\n\n"; + printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( charset_convert $t ); + printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}? + TmplTokenizer::quote_po( $translation{$t} ): "\"\""); + } +} + +############################################################################### + +sub convert_translation_file () { + open(INPUT, "<$convert_from") || die "$convert_from: $!\n"; + VerboseWarnings::set_input_file_name $convert_from; + while (<INPUT>) { + chomp; + my($msgid, $msgstr) = split(/\t/); + die "$convert_from: $.: Malformed tmpl_process input (no tab)\n" + unless defined $msgstr; + + # Fixup some of the bad strings + $msgid =~ s/^SELECTED>//; + + # Create dummy token + my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef ); + remember( $token, $msgid ); + $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3 + $translation{$msgid} = $msgstr unless $msgstr eq '*****'; + + if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) { + my $candidate = TmplTokenizer::charset_canon $2; + die "Conflicting charsets in msgid: $candidate vs $charset_in\n" + if defined $charset_in && $charset_in ne $candidate; + $charset_in = $candidate; + } + if 
($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) { + my $candidate = TmplTokenizer::charset_canon $2; + die "Conflicting charsets in msgid: $candidate vs $charset_out\n" + if defined $charset_out && $charset_out ne $candidate; + $charset_out = $candidate; + } + } + if (!defined $charset_in) { + $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1'; + warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n"; } } @@ -164,9 +248,13 @@ sub usage_error (;$) { Getopt::Long::config qw( bundling no_auto_abbrev ); GetOptions( + 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL + 'convert-from=s' => \$convert_from, 'D|directory=s' => \$directory, 'f|files-from=s' => \$files_from, + 'I|input-charset=s' => \$charset_in, # INTERNAL 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 }, + 'O|output-charset=s' => \$charset_out, # INTERNAL 'output|o=s' => \$output, 's|sort-output' => sub { $sort = 's' }, 'F|sort-by-file' => sub { $sort = 'F' }, @@ -176,24 +264,32 @@ GetOptions( VerboseWarnings::set_application_name $0; VerboseWarnings::set_pedantic_mode $pedantic_p; -usage_error('Missing mandatory option -f') unless defined $files_from; +usage_error('Missing mandatory option -f') + unless defined $files_from || defined $convert_from; $directory = '.' 
unless defined $directory; +usage_error('You cannot specify both --convert-from and --files-from') + if defined $convert_from && defined $files_from; + if (defined $output && $output ne '-') { open(OUTPUT, ">$output") || die "$output: $!\n"; } else { open(OUTPUT, ">&STDOUT"); } -open(INPUT, "<$files_from") || die "$files_from: $!\n"; -while (<INPUT>) { - chomp; - my $h = TmplTokenizer->new( "$directory/$_" ); - $h->set_allow_cformat( 1 ); - VerboseWarnings::set_input_file_name "$directory/$_"; - text_extract( $h ); +if (defined $files_from) { + open(INPUT, "<$files_from") || die "$files_from: $!\n"; + while (<INPUT>) { + chomp; + my $h = TmplTokenizer->new( "$directory/$_" ); + $h->set_allow_cformat( 1 ); + VerboseWarnings::set_input_file_name "$directory/$_"; + text_extract( $h ); + } + close INPUT; +} else { + convert_translation_file; } -close INPUT; generate_po_file; warn "This input will not work with Mozilla standards-compliant mode\n", undef -- 2.39.5