From 86e2fbbe49cd0c7b0203b3f179cd9280b14ff404 Mon Sep 17 00:00:00 2001 From: Christopher Hall Date: Fri, 25 Feb 2011 16:38:49 +1300 Subject: [PATCH] Bug 5917 : final commit from catalyst --- misc/translator/TTParser.pm | 24 +++-- misc/translator/TmplTokenType.pm | 2 +- misc/translator/TmplTokenizer.pm | 147 ++++++++++++++++--------------- misc/translator/xgettext.pl | 60 ++++++------- 4 files changed, 125 insertions(+), 108 deletions(-) diff --git a/misc/translator/TTParser.pm b/misc/translator/TTParser.pm index c86166d224..507474e67b 100755 --- a/misc/translator/TTParser.pm +++ b/misc/translator/TTParser.pm @@ -21,7 +21,7 @@ sub next_token{ } #unshift token back on @tokens -sub return_token{ +sub unshift_token{ my $self = shift; unshift @tokens, shift; } @@ -39,8 +39,9 @@ sub build_tokens{ $self->{filename} = $filename; $self->handler(start => "start", "self, line, tagname, attr, text"); #signature is start( self, linenumber, tagname, hash of attributes, origional text ) $self->handler(text => "text", "self, line, text, is_cdata"); #signature is text( self, linenumber, origional text, is_cdata ) - $self->handler(end => ""); #ignore end tags + $self->handler(end => "end", "self, line, tag, text"); #signature is end( self, linenumber, tagename, origional text ) $self->marked_sections(1); #treat anything inside CDATA tags as text, should really make it a TmplTokenType::CDATA + $self->unbroken_text(1); #make contiguous whitespace into a single token (can span multiple lines) $self->parse_file($filename); return $self; } @@ -52,7 +53,7 @@ sub text{ my $work = shift; # original text my $is_cdata = shift; while($work){ - return if $work =~ m/^\s*$/; +# return if $work =~ m/^\s*$/; # if there is a template_toolkit tag if( $work =~ m/\[%.*?\]/ ){ #everything before this tag is text (or possibly CDATA), add a text token to tokens if $` @@ -81,13 +82,14 @@ sub start{ my $self = shift; my $line = shift; my $tag = shift; - my $hash = shift; - my $text = shift; #unused atm... + my $hash = shift; #hash of attr/value pairs + my $text = shift; #origional text #return if ! $interesting_tags{$tag}; # was $hash->{$key} # print "#### " . $self->{filename} . " " . $tag . "####\n"; - my $t = TmplToken->new( $tag, TmplTokenType::TAG, $line, $self->{filename}); + my $t = TmplToken->new( $text, TmplTokenType::TAG, $line, $self->{filename}); my %attr; + # tags seem to be uses in an 'interesting' way elsewhere.. for my $key( %$hash ) { next unless defined $hash->{$key}; $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ]; @@ -96,4 +98,14 @@ sub start{ push @tokens, $t; } +#handle closing html tags +sub end{ + my $self = shift; + my $line = shift; + my $tag = shift; + my $text = shift; + # what format should this be in? + my $t = TmplToken->new( $text, TmplTokenType::TAG, $line, $self->{filename} ); +} + 1; diff --git a/misc/translator/TmplTokenType.pm b/misc/translator/TmplTokenType.pm index 217cfbbab9..bfebebbe23 100644 --- a/misc/translator/TmplTokenType.pm +++ b/misc/translator/TmplTokenType.pm @@ -108,7 +108,7 @@ something that has the form of an SGML processing instruction =item DIRECTIVE -a HTML::Template directive (whether or not embedded in an SGML comment) +a Template Toolkit directive =item COMMENT diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index b4c69584ed..22c0a13328 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -14,15 +14,11 @@ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); =head1 NAME -TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files +TmplTokenizer.pm - Simple-minded wrapper class for TTParser =head1 DESCRIPTION -Because .tmpl files contains HTML::Template directives -that tend to confuse real parsers (e.g., HTML::Parse), -it might be better to create a customized scanner -to scan the template files for tokens. -This module is a simple-minded attempt at such a scanner. +A wrapper for the functionality found in TTParser to allow an easier transition to Template Toolkit =cut @@ -148,44 +144,44 @@ sub _split_js ($) { my ($s0) = @_; my @it = (); while (length $s0) { - if ($s0 =~ /^\s+/s) { # whitespace - push @it, $&; - $s0 = $'; - } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment - push @it, $&; - $s0 = $'; - } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment - push @it, $&; - $s0 = $'; - # Keyword or identifier, ECMA-262 p.13 (section 7.5) - } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName - push @it, $&; - $s0 = $'; - # Punctuator, ECMA-262 p.13 (section 7.6) - } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) { - push @it, $&; - $s0 = $'; - # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec - } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) { - push @it, $&; - $s0 = $'; - # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3) - } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) { - push @it, $&; - $s0 = $'; - # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3) - } elsif ($s0 =~ /^0[\da-fA-F]+/s) { - push @it, $&; - $s0 = $'; - # StringLiteral, ECMA-262 p.17 (section 7.7.4) - # XXX SourceCharacter doesn't seem to be defined (?) - } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) { - push @it, $&; - $s0 = $'; - } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!! - push @it, $&; - $s0 = $'; - } + if ($s0 =~ /^\s+/s) { # whitespace + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment + push @it, $&; + $s0 = $'; + # Keyword or identifier, ECMA-262 p.13 (section 7.5) + } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName + push @it, $&; + $s0 = $'; + # Punctuator, ECMA-262 p.13 (section 7.6) + } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) { + push @it, $&; + $s0 = $'; + # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec + } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) { + push @it, $&; + $s0 = $'; + # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # StringLiteral, ECMA-262 p.17 (section 7.7.4) + # XXX SourceCharacter doesn't seem to be defined (?) + } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) { + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!! + push @it, $&; + $s0 = $'; + } } return @it; } @@ -202,28 +198,28 @@ sub _identify_js_translatables (@) { # We mark a JavaScript translatable string as in C, i.e., _("literal") # For simplicity, we ONLY look for "_" "(" StringLiteral ")" for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) { - my $reset_state_p = 0; - push @output, [0, $input[$i]]; - if ($input[$i] !~ /\S/s) { - ; - } elsif ($state == 0) { - $state = STATE_UNDERSCORE if $input[$i] eq '_'; - } elsif ($state == STATE_UNDERSCORE) { - $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0; - } elsif ($state == STATE_PARENLEFT) { - if ($input[$i] =~ /^(['"])(.*)\1$/s) { - ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2); - } else { - $state = 0; - } - } elsif ($state == STATE_STRING_LITERAL) { - if ($input[$i] eq parenright) { - $output[$j] = [1, $output[$j]->[1], $q, $s]; - } - $state = 0; - } else { - die "identify_js_translatables internal error: Unknown state $state" - } + my $reset_state_p = 0; + push @output, [0, $input[$i]]; + if ($input[$i] !~ /\S/s) { + ; + } elsif ($state == 0) { + $state = STATE_UNDERSCORE if $input[$i] eq '_'; + } elsif ($state == STATE_UNDERSCORE) { + $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0; + } elsif ($state == STATE_PARENLEFT) { + if ($input[$i] =~ /^(['"])(.*)\1$/s) { + ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2); + } else { + $state = 0; + } + } elsif ($state == STATE_STRING_LITERAL) { + if ($input[$i] eq parenright) { + $output[$j] = [1, $output[$j]->[1], $q, $s]; + } + $state = 0; + } else { + die "identify_js_translatables internal error: Unknown state $state" + } } return \@output; } @@ -233,10 +229,19 @@ sub _identify_js_translatables (@) { sub string_canon ($) { my $s = shift; # Fold all whitespace into single blanks - $s =~ s/\s+/ /gs; + $s =~ s/\s+/ /g; + return $s; +} + +# safer version used internally, preserves new lines +sub string_canon_safe ($) { + my $s = shift; + # fold tabs and spaces into single spaces + $s =~ s/[\ \t]+/ /gs; return $s; } + sub _quote_cformat{ my $s = shift; $s =~ s/%/%%/g; @@ -245,7 +250,7 @@ sub _quote_cformat{ sub _formalize_string_cformat{ my $s = shift; - return _quote_cformat( string_canon $s ); + return _quote_cformat( string_canon_safe $s ); } sub _formalize{ @@ -312,7 +317,7 @@ sub next_token { # if there is nothing in parts, return this token return $next unless @parts; # OTHERWISE, put this token back and return the parametrized string of @parts - $self->{_parser}->return_token($next); + $self->{_parser}->unshift_token($next); return $self->_parametrize_internal(@parts); } } @@ -320,7 +325,7 @@ sub next_token { ############################################################################### -# ugly method taken from old version +# function taken from old version # used by tmpl_process3 sub parametrize ($$$$) { my($fmt_0, $cformat_p, $t, $f) = @_; @@ -527,7 +532,7 @@ is different (replacing %s with %1$s, %2$s, etc.), or when certain words will require certain inflectional suffixes in sentences. Because this is an incompatible change, this mode must be explicitly -turned on using the set_cformat(1) method call. +turned on using the set_allow_cformat(1) method call. =head2 The flag characters diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl index abe315e5d7..ef1fb3db4a 100755 --- a/misc/translator/xgettext.pl +++ b/misc/translator/xgettext.pl @@ -60,9 +60,9 @@ sub remember ($$) { my($token, $string) = @_; # If we determine that the string is negligible, don't bother to remember unless (string_negligible_p( $string ) || token_negligible_p( $token )) { - my $key = TmplTokenizer::string_canon( $string ); - $text{$key} = [] unless defined $text{$key}; - push @{$text{$key}}, $token; + my $key = TmplTokenizer::string_canon( $string ); + $text{$key} = [] unless defined $text{$key}; + push @{$text{$key}}, $token; } } @@ -83,37 +83,37 @@ sub string_list () { return @t; } -############################################################################### + ############################################################################### sub text_extract (*) { my($h) = @_; for (;;) { - my $s = TmplTokenizer::next_token $h; - last unless defined $s; - my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); - if ($kind eq TmplTokenType::TEXT) { - remember( $s, $t ) if $t =~ /\S/s; - } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) { - remember( $s, $s->form ) if $s->form =~ /\S/s; - } elsif ($kind eq TmplTokenType::TAG && %$attr) { - # value [tag=input], meta - my $tag = lc($1) if $t =~ /^<(\S+)/s; - for my $a ('alt', 'content', 'title', 'value','label') { - if ($attr->{$a}) { - next if $a eq 'label' && $tag ne 'optgroup'; - next if $a eq 'content' && $tag ne 'meta'; - next if $a eq 'value' && ($tag ne 'input' - || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME - my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME - $val = TmplTokenizer::trim $val; - remember( $s, $val ) if $val =~ /\S/s; - } - } - } elsif ($s->has_js_data) { - for my $t (@{$s->js_data}) { - remember( $s, $t->[3] ) if $t->[0]; # FIXME - } - } + my $s = TmplTokenizer::next_token $h; + last unless defined $s; + my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); + if ($kind eq TmplTokenType::TEXT) { + remember( $s, $t ) if $t =~ /\S/s; + } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) { + remember( $s, $s->form ) if $s->form =~ /\S/s; + } elsif ($kind eq TmplTokenType::TAG && %$attr) { + # value [tag=input], meta + my $tag = lc($1) if $t =~ /^<(\S+)/s; + for my $a ('alt', 'content', 'title', 'value','label') { + if ($attr->{$a}) { + next if $a eq 'label' && $tag ne 'optgroup'; + next if $a eq 'content' && $tag ne 'meta'; + next if $a eq 'value' && ($tag ne 'input' + || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME + my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME + $val = TmplTokenizer::trim $val; + remember( $s, $val ) if $val =~ /\S/s; + } + } + } elsif ($s->has_js_data) { + for my $t (@{$s->js_data}) { + remember( $s, $t->[3] ) if $t->[0]; # FIXME + } + } } } -- 2.39.2