From 10a00d1b5004b19d89650b141c0bd783b699dbde Mon Sep 17 00:00:00 2001 From: acli Date: Sun, 22 Feb 2004 21:34:40 +0000 Subject: [PATCH] Preliminary support for "analysis" of strings with tags. Early termination of analysis if we encounter some strings, such as or | or ||, in order to avoid extracting strings that are unnecessarily long and which doesn't add any meaningful context. --- misc/translator/TmplToken.pm | 6 +++ misc/translator/TmplTokenizer.pm | 93 +++++++++++++++++++++++--------- misc/translator/tmpl_process3.pl | 20 +++++-- misc/translator/xgettext.pl | 53 ++++++++++-------- 4 files changed, 119 insertions(+), 53 deletions(-) diff --git a/misc/translator/TmplToken.pm b/misc/translator/TmplToken.pm index 854a70a304..87d360777e 100644 --- a/misc/translator/TmplToken.pm +++ b/misc/translator/TmplToken.pm @@ -87,6 +87,12 @@ sub parameters { return map { $_->type == TmplTokenType::DIRECTIVE? $_: ()} @{$this->{'_kids'}}; } +# only meaningful for TEXT_PARAMETRIZED tokens +sub anchors { + my $this = shift; + return map { $_->type == TmplTokenType::TAG && $_->string =~ /^{'_kids'}}; +} + # only meaningful for TEXT_PARAMETRIZED tokens sub form { my $this = shift; diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index 608d141001..ce66ffea66 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -420,13 +420,24 @@ sub _next_token_intermediate { return $it; } -sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token +sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED my($t) = @_; - return $t->type == TmplTokenType::TEXT + return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s) || ($t->type == TmplTokenType::DIRECTIVE && $t->string =~ /^(?:$re_tmpl_var)$/os) || ($t->type == TmplTokenType::TAG - && ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is + && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is + || ($t->string =~ /^attributes->{'type'} =~ /^(?:text)$/i))) +} + +sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED + my($t) = @_; + return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/s)) + || ($t->type == TmplTokenType::DIRECTIVE + && $t->string =~ /^(?:$re_tmpl_var)$/os) + || ($t->type == TmplTokenType::TAG + && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is || ($t->string =~ /^attributes->{'type'} =~ /^(?:text)$/i))) } @@ -439,7 +450,10 @@ sub _quote_cformat ($) { sub _formalize ($) { my($t) = @_; - return $t->type == TmplTokenType::DIRECTIVE? '%s': _quote_cformat($t->string); + return $t->type == TmplTokenType::DIRECTIVE? '%s': + $t->type == TmplTokenType::TAG? + ($t->string =~ /^': _quote_cformat($t->string)): + _quote_cformat($t->string); } sub _optimize { @@ -452,6 +466,7 @@ sub _optimize { push @{$this->{_queue}}, pop @structure; } }; + &$undo_trailing_blanks; # FIXME: If the last token is a close tag but there are no tags # FIXME: before it, drop the close tag back into the queue. This # FIXME: is an ugly hack to get rid of "foo %s" type mess. @@ -514,46 +529,56 @@ sub next_token { $it = $this->_next_token_intermediate($h); if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it && ($it->type == TmplTokenType::TEXT? - !blank_p( $it->string ): _token_groupable_p( $it ))) { + !blank_p( $it->string ): _token_groupable1_p( $it ))) { my @structure = ( $it ); - my($n_trailing_spaces, $next) = (0, undef); - my($nonblank_text_p, $parametrized_p, $next) = (0, 0); + my @tags = (); + my $next = undef; + my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0); if ($it->type == TmplTokenType::TEXT) { $nonblank_text_p = 1 if !blank_p( $it->string ); } elsif ($it->type == TmplTokenType::DIRECTIVE) { $parametrized_p = 1; + } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) { + push @tags, lc($1); + $with_anchor_p = 1 if lc($1) eq 'a'; } - for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) { + # We hate | and || in msgid strings, so we try to avoid them + for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) { $next = $this->_next_token_intermediate($h); push @structure, $next; # for consistency (with initialization) - last unless defined $next && _token_groupable_p( $next ); + last unless defined $next && _token_groupable2_p( $next ); + last if $quit_next_p; if ($next->type == TmplTokenType::TEXT) { - if (blank_p( $next->string )) { - $n_trailing_spaces += 1; - } else { - ($n_trailing_spaces, $nonblank_text_p) = (0, 1); - } + $nonblank_text_p = 1 if !blank_p( $next->string ); + $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and || } elsif ($next->type == TmplTokenType::DIRECTIVE) { - $n_trailing_spaces = 0; $parametrized_p = 1; - } else { - $n_trailing_spaces = 0; + } elsif ($next->type == TmplTokenType::TAG) { + if ($next->string =~ /^<([A-Z0-9]+)/is) { + push @tags, lc($1); + $with_anchor_p = 1 if lc($1) eq 'a'; + } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) { + my $close = lc($1); + $quit_p = 1 unless @tags && $close eq $tags[$#tags]; + $quit_next_p = 1 if $close =~ /^h\d$/; + pop @tags; + } } + last if $quit_p; } # Undo the last token push @{$this->{_queue}}, pop @structure; - # Undo trailing blank tokens - for (my $i = 0; $i < $n_trailing_spaces; $i += 1) { - push @{$this->{_queue}}, pop @structure; - } + # Simply it a bit more @structure = $this->_optimize( @structure ); if (@structure < 2) { # Nothing to do ; - } elsif ($nonblank_text_p && $parametrized_p) { + } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) { # Create the corresponding c-format string my $string = join('', map { $_->string } @structure); my $form = join('', map { _formalize $_ } @structure); + my $a_counter = 0; + $form =~ s// $a_counter += 1, "" /egs; $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname); $it->set_form( $form ); $it->set_children( @structure ); @@ -604,10 +629,10 @@ sub quote_po ($) { } # Some functions that shouldn't be here... should be moved out some time -sub parametrize ($@) { - my($fmt, @params) = @_; +sub parametrize ($$$) { + my($fmt_0, $params, $anchors) = @_; my $it = ''; - for (my $n = 0; length $fmt;) { + for (my $n = 0, my $fmt = $fmt_0; length $fmt;) { if ($fmt =~ /^[^%]+/) { $fmt = $'; $it .= $&; @@ -619,7 +644,7 @@ sub parametrize ($@) { my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3); $fmt = $'; if (!defined $width && !defined $prec) { - $it .= $params[$i] + $it .= $params->[$i] } elsif (defined $width && defined $prec && !$width && !$prec) { ; } else { @@ -633,6 +658,22 @@ sub parametrize ($@) { die "Completely confused parametrizing: $fmt\n";#XXX } } + for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) { + if ($fmt =~ /^(?:(?!).)+/is) { + $fmt = $'; + $it .= $&; + } elsif ($fmt =~ /^/is) { + $n += 1; + my $i = $1; + $fmt = $'; + my $anchor = $anchors->[$i - 1]; + warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME + unless defined $anchor; + $it .= $anchor->string; + } else { + die "Completely confused decoding anchors: $fmt\n";#XXX + } + } return $it; } diff --git a/misc/translator/tmpl_process3.pl b/misc/translator/tmpl_process3.pl index ae383fea33..a7637cb2c7 100755 --- a/misc/translator/tmpl_process3.pl +++ b/misc/translator/tmpl_process3.pl @@ -88,10 +88,10 @@ sub text_replace (**) { print $output $pre, find_translation($trimmed), $post; } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) { my $fmt = find_translation($s->form); - print $output TmplTokenizer::parametrize($fmt, map { + print $output TmplTokenizer::parametrize($fmt, [ map { my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes); $kind == TmplTokenType::TAG && %$attr? - text_replace_tag($t, $attr): $t } $s->parameters); + text_replace_tag($t, $attr): $t } $s->parameters ], [ $s->anchors ]); } elsif ($kind eq TmplTokenType::TAG && %$attr) { print $output text_replace_tag($t, $attr); } elsif (defined $t) { @@ -297,7 +297,8 @@ exit 0; =head1 DESCRIPTION This is an experimental version of the tmpl_process.pl script, -using standard gettext-style PO files. +using standard gettext-style PO files. Note that the behaviour +of this script should still be considered unstable. Currently, the create, update, and install actions have all been reimplemented and seem to work. @@ -315,8 +316,14 @@ file as c-format strings with %s. The --help option has not been implemented yet. -There are probably some real bugs too, since this has not been -tested very much. +If an extracted string contain actual text (versus tags or +TMPL_VAR directives), the strings are extracted verbatim, +resulting in unwieldy things like multiple spaces, tabs, +and/or newlines which are semantically indistinguishable +from single blanks. If the template writer changes the +spacing just a little bit, the new formatting would be +considered new strings. This is arguably wrong, and in any +case counter-productive. xgettext.pl must be present in the current directory; the msgmerge(1) command must also be present in the search path. @@ -331,6 +338,9 @@ generate GNU PO files properly; a couple of workarounds have been written in TmplTokenizer and more is likely to be needed (e.g., to get rid of the "Strange line" warning for #~). +There are probably some other bugs too, since this has not been +tested very much. + =head1 SEE ALSO xgettext.pl, diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl index 3d5d11b042..11a4f15bcc 100755 --- a/misc/translator/xgettext.pl +++ b/misc/translator/xgettext.pl @@ -21,26 +21,43 @@ use vars qw( $charset_in $charset_out ); ############################################################################### -sub negligible_p ($) { +sub string_negligible_p ($) { my($t) = @_; # a string # Don't emit pure whitespace, pure numbers, pure punctuation, # single letters, or TMPL_VAR's. # Punctuation should arguably be translated. But without context - # they are untranslatable. + # they are untranslatable. Note that $t is a string, not a token object. return !$extract_all_p && ( - TmplTokenizer::blank_p($t) # blank or TMPL_VAR + TmplTokenizer::blank_p($t) # blank or TMPL_VAR || $t =~ /^\d+$/ # purely digits - || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context + || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context || $t =~ /^[A-Za-z]$/ # single letters - ); + ) +} + +sub token_negligible_p( $ ) { + my($x) = @_; + my $t = $x->type; + return !$extract_all_p && ( + $t == TmplTokenType::TEXT? string_negligible_p( $x->string ): + $t == TmplTokenType::DIRECTIVE? 1: + $t == TmplTokenType::TEXT_PARAMETRIZED + && join( '', map { my $t = $_->type; + $t == TmplTokenType::DIRECTIVE? + '1': $t == TmplTokenType::TAG? + '': token_negligible_p( $_ )? + '': '1' } @{$x->children} ) eq '' ); } ############################################################################### sub remember ($$) { my($token, $string) = @_; - $text{$string} = [] unless defined $text{$string}; - push @{$text{$string}}, $token; + # If we determine that the string is negligible, don't bother to remember + unless (string_negligible_p( $string ) || token_negligible_p( $token )) { + $text{$string} = [] unless defined $text{$string}; + push @{$text{$string}}, $token; + } } ############################################################################### @@ -69,10 +86,8 @@ sub text_extract (*) { last unless defined $s; my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); if ($kind eq TmplTokenType::TEXT) { - #$t = TmplTokenizer::trim $t; remember( $s, $t ) if $t =~ /\S/s; } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) { - #$t = TmplTokenizer::trim $t; remember( $s, $s->form ) if $s->form =~ /\S/s; } elsif ($kind eq TmplTokenType::TAG && %$attr) { # value [tag=input], meta @@ -96,7 +111,7 @@ sub text_extract (*) { sub generate_strings_list () { # Emit all extracted strings. for my $t (string_list) { - printf OUTPUT "%s\n", $t unless negligible_p($t); + printf OUTPUT "%s\n", $t # unless negligible_p($t); } } @@ -127,7 +142,7 @@ msgstr "" EOF my $directory_re = quotemeta("$directory/"); for my $t (string_list) { - next if negligible_p($t); + #next if negligible_p($t); my $cformat_p; for my $token (@{$text{$t}}) { my $pathname = $token->pathname; @@ -316,17 +331,11 @@ the gettext format. =back -Right now it does about the same thing as text-extract2.pl but -generates gettext-style output; however, because it is scanner- -instead of parser-based, it is able to address the 4 weaknesses -listed in translator_doc.txt. Ultimately, the goal is to make -this able to do some kind of simple analysis on the input to -produce gettext-style output with c-format strings, in order to -facilitate translation to languages with a different word order -than English. - -When the above is finished, the generated po file may contain -some HTML tags in addition to %s strings. +Note that this script is experimental and should still be +considered unstable. + +Please refer to the explanation in tmpl_process3 for further +details. If you want to generate GNOME-style POTFILES.in files, such files (passed to -f) can be generated thus: -- 2.39.5