From 77a1d8682df8689aeb73d6a177575986db2c3e44 Mon Sep 17 00:00:00 2001 From: acli Date: Mon, 23 Feb 2004 01:21:03 +0000 Subject: [PATCH] Fold all consecutive whitespaces into single blanks. This avoids problems when minor whitespace changes occur in the original templates; it also makes the strings much easier to read (e.g., instead of "foo\n\n\t\t bar", xgettext.pl will now always generate "foo bar" and tmpl_process3.pl will understand it to be the same as the original string). --- misc/translator/TmplTokenizer.pm | 20 ++++++++++++++++++++ misc/translator/tmpl_process3.pl | 24 +++++++++++++----------- misc/translator/xgettext.pl | 6 ++++-- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index ce66ffea66..a7f850345e 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -448,9 +448,25 @@ sub _quote_cformat ($) { return $s; } +sub string_canon ($) { + my($s) = @_; + if (1) { # FIXME + # Fold all whitespace into single blanks + $s =~ s/\s+/ /gs; + } + return $s; +} + +sub _formalize_string_cformat ($) { + my($s) = @_; + return _quote_cformat string_canon $s; +} + sub _formalize ($) { my($t) = @_; return $t->type == TmplTokenType::DIRECTIVE? '%s': + $t->type == TmplTokenType::TEXT? + _formalize_string_cformat($t->string): $t->type == TmplTokenType::TAG? 
($t->string =~ /^': _quote_cformat($t->string)): _quote_cformat($t->string); @@ -598,6 +614,10 @@ sub next_token { } } } + if (defined $it && $it->type == TmplTokenType::TEXT) { + my $form = string_canon $it->string; + $it->set_form( $form ); + } return $it; } diff --git a/misc/translator/tmpl_process3.pl b/misc/translator/tmpl_process3.pl index a7637cb2c7..a77b262482 100755 --- a/misc/translator/tmpl_process3.pl +++ b/misc/translator/tmpl_process3.pl @@ -32,8 +32,14 @@ use vars qw( $charset_in $charset_out ); sub find_translation ($) { my($s) = @_; - my $key = TmplTokenizer::quote_po($s) if $s =~ /\S/; - $key = TmplTokenizer::charset_convert($key, $charset_in, $charset_out); + my $key = $s; + if ($s =~ /\S/s) { + print STDERR "DEBUG: before: ($key)\n"; + $key = TmplTokenizer::string_canon($key); + $key = TmplTokenizer::charset_convert($key, $charset_in, $charset_out); + $key = TmplTokenizer::quote_po($key); + print STDERR "DEBUG: after: ($key)\n"; + } return defined $href->{$key} && !$href->{$key}->fuzzy && length Locale::PO->dequote($href->{$key}->msgstr)? @@ -312,19 +318,15 @@ appears to be a full sentence (this actual work being done by TmplTokenizer(3)); these larger patterns appear in the translation file as c-format strings with %s. +Whitespace in extracted strings is folded to single blanks, in +order to prevent new strings from appearing when minor changes in +the original templates occur, and to prevent overly difficult to +read strings in the PO file. + =head1 BUGS The --help option has not been implemented yet. -If an extracted string contain actual text (versus tags or -TMPL_VAR directives), the strings are extracted verbatim, -resulting in unwieldy things like multiple spaces, tabs, -and/or newlines which are semantically indistinguishable -from single blanks. If the template writer changes the -spacing just a little bit, the new formatting would be -considered new strings. This is arguably wrong, and in any -case counter-productive. 
- xgettext.pl must be present in the current directory; the msgmerge(1) command must also be present in the search path. The script currently does not check carefully whether these diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl index 11a4f15bcc..2da236360f 100755 --- a/misc/translator/xgettext.pl +++ b/misc/translator/xgettext.pl @@ -55,8 +55,9 @@ sub remember ($$) { my($token, $string) = @_; # If we determine that the string is negligible, don't bother to remember unless (string_negligible_p( $string ) || token_negligible_p( $token )) { - $text{$string} = [] unless defined $text{$string}; - push @{$text{$string}}, $token; + my $key = TmplTokenizer::string_canon( $string ); + $text{$key} = [] unless defined $text{$key}; + push @{$text{$key}}, $token; } } @@ -153,6 +154,7 @@ EOF } printf OUTPUT "#, c-format\n" if $cformat_p; printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po + TmplTokenizer::string_canon TmplTokenizer::charset_convert $t, $charset_in, $charset_out; printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}? TmplTokenizer::quote_po( $translation{$t} ): "\"\""); -- 2.39.5