From ff532faaa627436241018737f7d4a5dcf8cf6556 Mon Sep 17 00:00:00 2001 From: Christopher Hall Date: Wed, 16 Feb 2011 14:00:10 +1300 Subject: [PATCH] Bug 5917 : initial work on translation tool conversion --- misc/translator/TTParser.pm | 99 ++++ misc/translator/TmplTokenizer.pm | 805 +++++-------------------------- misc/translator/tmpl_process3.pl | 4 +- 3 files changed, 226 insertions(+), 682 deletions(-) create mode 100755 misc/translator/TTParser.pm diff --git a/misc/translator/TTParser.pm b/misc/translator/TTParser.pm new file mode 100755 index 0000000000..c86166d224 --- /dev/null +++ b/misc/translator/TTParser.pm @@ -0,0 +1,99 @@ +#!/usr/bin/env perl +#simple parser for HTML with Template Toolkit directives. Tokens are put into @tokens and are accesible via next_token and peep_token +package TTParser; +use base qw(HTML::Parser); +use TmplToken; +use strict; +use warnings; + +#seems to be handled post tokenizer +##hash where key is tag we are interested in and the value is a hash of the attributes we want +#my %interesting_tags = ( +# img => { alt => 1 }, +#); + +#tokens found so far (used like a stack) +my ( @tokens ); + +#shiftnext token or undef +sub next_token{ + return shift @tokens; +} + +#unshift token back on @tokens +sub return_token{ + my $self = shift; + unshift @tokens, shift; +} + +#have a peep at next token +sub peep_token{ + return $tokens[0]; +} + +#wrapper for parse +#please use this method INSTEAD of the HTML::Parser->parse_file method (and HTML::Parser->parse) +#signature build_tokens( self, filename) +sub build_tokens{ + my ($self, $filename) = @_; + $self->{filename} = $filename; + $self->handler(start => "start", "self, line, tagname, attr, text"); #signature is start( self, linenumber, tagname, hash of attributes, origional text ) + $self->handler(text => "text", "self, line, text, is_cdata"); #signature is text( self, linenumber, origional text, is_cdata ) + $self->handler(end => ""); #ignore end tags + $self->marked_sections(1); #treat anything inside CDATA tags as text, should really make it a TmplTokenType::CDATA + $self->parse_file($filename); + return $self; +} + +#handle parsing of text +sub text{ + my $self = shift; + my $line = shift; + my $work = shift; # original text + my $is_cdata = shift; + while($work){ + return if $work =~ m/^\s*$/; + # if there is a template_toolkit tag + if( $work =~ m/\[%.*?\]/ ){ + #everything before this tag is text (or possibly CDATA), add a text token to tokens if $` + if( $` ){ + my $t = TmplToken->new( $`, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} ); + push @tokens, $t; + } + + #the match itself is a DIRECTIVE $& + my $t = TmplToken->new( $&, TmplTokenType::DIRECTIVE, $line, $self->{filename} ); + push @tokens, $t; + + #put work still to do back into work + $work = $' ? $' : 0; + } else { + #If there is some left over work, treat it as text token + my $t = TmplToken->new( $work, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} ); + push @tokens, $t; + last; + } + } +} + +#handle opening html tags +sub start{ + my $self = shift; + my $line = shift; + my $tag = shift; + my $hash = shift; + my $text = shift; #unused atm... + #return if ! $interesting_tags{$tag}; + # was $hash->{$key} + # print "#### " . $self->{filename} . " " . $tag . "####\n"; + my $t = TmplToken->new( $tag, TmplTokenType::TAG, $line, $self->{filename}); + my %attr; + for my $key( %$hash ) { + next unless defined $hash->{$key}; + $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ]; + } + $t->set_attributes( \%attr ); + push @tokens, $t; +} + +1; diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index 5c4f28ba5c..b4c69584ed 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -4,6 +4,7 @@ use strict; #use warnings; FIXME - Bug 2505 use TmplTokenType; use TmplToken; +use TTParser; use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic ); require Exporter; @@ -39,44 +40,12 @@ use vars qw( $pedantic_error_markup_in_pcdata_p ); ############################################################################### # Hideous stuff -use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include ); -use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl); +use vars qw( $re_xsl $re_end_entity $re_tmpl_var); BEGIN { - # $re_directive must not do any backreferences - $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # TMPL_VAR or TMPL_INCLUDE - $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # TMPL_VAR ESCAPE=1/HTML/URL + $re_tmpl_var = q{\[%\s*[get|set|default]?\s*[\w\.]+\s*[|.*?]?\s*%\]}; $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>}; - $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # Any control flow directive - $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # /LOOP or /IF or /UNLESS - $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; -} - -# Hideous stuff from subst.pl, slightly modified to use the above hideous stuff -# Note: The $re_tag's set $1 (), and $3 (rest of string) -use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag ); -use vars qw( $re_tag_strict $re_tag_compat @re_tag ); -sub re_tag ($) { - my($compat) = @_; - my $etag = $compat? '>': '<>\/'; - # This is no longer similar to the original regexp in subst.pl :-( - # Note that we don't want <> in compat mode; Mozilla knows about < - q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:} - . $re_directive - . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)}; -} -BEGIN { - $re_comment = '(?:--(?:[^-]|-[^-])*--)'; - $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace - $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag - @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1)); } - # End of the hideous stuff use vars qw( $serial ); @@ -87,14 +56,14 @@ sub FATAL_P () {'fatal-p'} sub SYNTAXERROR_P () {'syntaxerror-p'} sub FILENAME () {'input'} -sub HANDLE () {'handle'} +#sub HANDLE () {'handle'} -sub READAHEAD () {'readahead'} +#sub READAHEAD () {'readahead'} sub LINENUM_START () {'lc_0'} sub LINENUM () {'lc'} sub CDATA_MODE_P () {'cdata-mode-p'} sub CDATA_CLOSE () {'cdata-close'} -sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA +#sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true sub ALLOW_CFORMAT_P () {'allow-cformat-p'} @@ -102,11 +71,14 @@ sub ALLOW_CFORMAT_P () {'allow-cformat-p'} sub new { shift; my ($filename) = @_; - open my $handle,$filename or die "can't open $filename"; + #open my $handle,$filename or die "can't open $filename"; + my $parser = TTParser->new; + $parser->build_tokens( $filename ); bless { - filename => $filename - , handle => $handle - , readahead => [] + filename => $filename, + _parser => $parser +# , handle => $handle +# , readahead => [] } , __PACKAGE__; } @@ -119,49 +91,16 @@ sub filename { return $this->{filename}; } -sub _handle { - my $this = shift; - return $this->{handle}; -} - sub fatal_p { my $this = shift; return $this->{+FATAL_P}; } +# work around, currently not implemented sub syntaxerror_p { - my $this = shift; - return $this->{+SYNTAXERROR_P}; -} - -sub has_readahead_p { - my $this = shift; - return @{$this->{readahead}}; -} - -sub _peek_readahead { - my $this = shift; - return $this->{readahead}->[$#{$this->{readahead}}]; -} - -sub line_number_start { - my $this = shift; - return $this->{+LINENUM_START}; -} - -sub line_number { - my $this = shift; - return $this->{+LINENUM}; -} - -sub cdata_mode_p { - my $this = shift; - return $this->{+CDATA_MODE_P}; -} - -sub pcdata_mode_p { - my $this = shift; - return $this->{+PCDATA_MODE_P}; +# my $this = shift; +# return $this->{+SYNTAXERROR_P}; + return 0; } sub js_mode_p { @@ -169,11 +108,6 @@ sub js_mode_p { return $this->{+JS_MODE_P}; } -sub cdata_close { - my $this = shift; - return $this->{+CDATA_CLOSE}; -} - sub allow_cformat_p { my $this = shift; return $this->{+ALLOW_CFORMAT_P}; @@ -187,71 +121,13 @@ sub _set_fatal { return $this; } -sub _set_syntaxerror { - my $this = shift; - $this->{+SYNTAXERROR_P} = $_[0]; - return $this; -} - -sub _push_readahead { - my $this = shift; - push @{$this->{readahead}}, $_[0]; - return $this; -} - -sub _pop_readahead { - my $this = shift; - return pop @{$this->{readahead}}; -} - -sub _append_readahead { - my $this = shift; - $this->{readahead}->[$#{$this->{readahead}}] .= $_[0]; - return $this; -} - -sub _set_readahead { - my $this = shift; - $this->{readahead}->[$#{$this->{readahead}}] = $_[0]; - return $this; -} - -sub _increment_line_number { - my $this = shift; - $this->{+LINENUM} += 1; - return $this; -} - -sub _set_line_number_start { - my $this = shift; - $this->{+LINENUM_START} = $_[0]; - return $this; -} - -sub _set_cdata_mode { - my $this = shift; - $this->{+CDATA_MODE_P} = $_[0]; - return $this; -} - -sub _set_pcdata_mode { - my $this = shift; - $this->{+PCDATA_MODE_P} = $_[0]; - return $this; -} - sub _set_js_mode { my $this = shift; $this->{+JS_MODE_P} = $_[0]; return $this; } -sub _set_cdata_close { - my $this = shift; - $this->{+CDATA_CLOSE} = $_[0]; - return $this; -} - +#used in xgettext, tmpl_process3 and text-extract2 sub set_allow_cformat { my $this = shift; $this->{+ALLOW_CFORMAT_P} = $_[0]; @@ -268,7 +144,7 @@ BEGIN { sub parenleft () { '(' } sub parenright () { ')' } -sub split_js ($) { +sub _split_js ($) { my ($s0) = @_; my @it = (); while (length $s0) { @@ -320,7 +196,7 @@ sub STATE_STRING_LITERAL () { 3 } # XXX This is a crazy hack. I don't want to write an ECMAScript parser. # XXX A scanner is one thing; a parser another thing. -sub identify_js_translatables (@) { +sub _identify_js_translatables (@) { my @input = @_; my @output = (); # We mark a JavaScript translatable string as in C, i.e., _("literal") @@ -354,555 +230,98 @@ sub identify_js_translatables (@) { ############################################################################### -sub _extract_attributes ($;$) { - my $this = shift; - my($s, $lc) = @_; - my %attr; - $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags - || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags - - for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) { - my($key, $val, $val_orig, $rest) - = ($1, (defined $3? $3: defined $4? $4: $5), $2, $'); - $i += 1; - $attr{+lc($key)} = [$key, $val, $val_orig, $i]; - $s = $rest; - if ($val =~ /$re_tmpl_include/os) { - warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc; - } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) { - # XXX: we probably should not warn if key is "onclick" etc - # XXX: there's just no reasonable thing to suggest - my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML'); - undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i; - warn_pedantic - "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\"" - . ": $val_orig", - $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p - if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p); - } elsif ($val_orig !~ /^['"]/) { - my $t = $val; $t =~ s/$re_directive_control//os; - warn_pedantic - "Unquoted attribute contains character(s) that should be quoted" - . ": $val_orig", - $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p - if $t =~ /[^-\.A-Za-z0-9]/s; - } - } - my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check - if ($s2 =~ /\S/s) { # should never happen - if ($s =~ /^([^\n]*)\n/s) { # this is even worse - error_normal("Completely confused while extracting attributes: $1", $lc); - error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef); - $this->_set_fatal( 1 ); - } else { - # There's something wrong with the attribute syntax. - # We might be able to deduce a likely cause by looking more. - if ($s =~ /^[a-z0-9]/is && "" =~ /^$re_tag_compat$/s) { - warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc; - } else { - warn_normal "Strange attribute syntax: $s\n", $lc; - } - } - } - return \%attr; -} - -sub _next_token_internal { - my $this = shift; - my($h) = @_; - my($it, $kind); - my $eof_p = 0; - $this->_pop_readahead if $this->has_readahead_p - && !ref $this->_peek_readahead - && !length $this->_peek_readahead; - if (!$this->has_readahead_p) { - my $next = scalar <$h>; - $eof_p = !defined $next; - if (!$eof_p) { - $this->_increment_line_number; - $this->_push_readahead( $next ); - } - } - $this->_set_line_number_start( $this->line_number ); # remember 1st line num - if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj. - ($it, $kind) = ($this->_pop_readahead, undef); - } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do - ; - } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace - ($kind, $it) = (TmplTokenType::TEXT, $&); - $this->_set_readahead( $' ); - # FIXME the following (the [<\s] part) is an unreliable HACK :-( - } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text - ($kind, $it) = (TmplTokenType::TEXT, $&); - $this->_set_readahead( $' ); - warn_normal "Unescaped < in $it\n", $this->line_number_start - if !$this->cdata_mode_p && $it =~ /cdata_close;;) { - if ($this->cdata_mode_p) { - my $next = $this->_pop_readahead; - if ($next =~ /^$cdata_close/is) { - ($kind, $it) = (TmplTokenType::TAG, $&); - $this->_push_readahead( $' ); - $ok_p = 1; - } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) { - ($kind, $it) = (TmplTokenType::TEXT, $1); - $this->_push_readahead( "$2$'" ); - $ok_p = 1; - } else { - ($kind, $it) = (TmplTokenType::TEXT, $next); - $ok_p = 1; - } - } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) { - # If we detect a "closed start tag" but we know that the - # following token looks like a TMPL_VAR, don't stop - my($head, $tail, $post) = ($1, $2, $3); - if ($tail eq '' && $post =~ $re_tmpl_var) { - # Don't bother to show the warning if we're too confused - # FIXME. There's no method for _closed_start_tag_warning - if (!defined $this->{'_closed_start_tag_warning'} - || ($this->{'_closed_start_tag_warning'}->[0] eq $head - && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) { - warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number - if split(/\n/, $head) < 10; - } - $this->{'_closed_start_tag_warning'} = [$head, $this->line_number]; - } else { - ($kind, $it) = (TmplTokenType::TAG, "$head>"); - $this->_set_readahead( $post ); - $ok_p = 1; - warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '' - and $head ne '_peek_readahead =~ /^)$re_directive*.)*-->/os) { - ($kind, $it) = (TmplTokenType::COMMENT, $&); - $this->_set_readahead( $' ); - $ok_p = 1; - $bad_comment_p = 1; - } - last if $ok_p; - my $next = scalar <$h>; - $eof_p = !defined $next; - last if $eof_p; - $this->_increment_line_number; - $this->_append_readahead( $next ); - } - if ($kind ne TmplTokenType::TAG) { - ; - } elsif ($it =~ /^).)*-->/; - if ($kind == TmplTokenType::COMMENT && $it =~ /^