package TmplTokenizer; use strict; use TmplTokenType; use TmplToken; use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic ); require Exporter; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); ############################################################################### =head1 NAME TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files =head1 DESCRIPTION Because .tmpl files contains HTML::Template directives that tend to confuse real parsers (e.g., HTML::Parse), it might be better to create a customized scanner to scan the template files for tokens. This module is a simple-minded attempt at such a scanner. =cut ############################################################################### $VERSION = 0.02; @ISA = qw(Exporter); @EXPORT_OK = qw(); use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p ); use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p ); use vars qw( $pedantic_error_markup_in_pcdata_p ); ############################################################################### # Hideous stuff use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include ); use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl); BEGIN { # $re_directive must not do any backreferences $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; # TMPL_VAR or TMPL_INCLUDE $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; # TMPL_VAR ESCAPE=1/HTML/URL $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>}; $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; # Any control flow directive $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; # /LOOP or /IF or /UNLESS $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; } # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff # Note: The $re_tag's set $1 (), and $3 (rest of string) use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag ); use vars qw( $re_tag_strict $re_tag_compat @re_tag ); sub re_tag ($) { my($compat) = @_; my $etag = $compat? '>': '<>\/'; # This is no longer similar to the original regexp in subst.pl :-( # Note that we don't want <> in compat mode; Mozilla knows about < q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:} . $re_directive . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)}; } BEGIN { $re_comment = '(?:--(?:[^-]|-[^-])*--)'; $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1)); } # End of the hideous stuff use vars qw( $serial ); ############################################################################### sub FATAL_P () {'fatal-p'} sub SYNTAXERROR_P () {'syntaxerror-p'} sub FILENAME () {'input'} sub HANDLE () {'handle'} sub READAHEAD () {'readahead'} sub LINENUM_START () {'lc_0'} sub LINENUM () {'lc'} sub CDATA_MODE_P () {'cdata-mode-p'} sub CDATA_CLOSE () {'cdata-close'} sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true sub ALLOW_CFORMAT_P () {'allow-cformat-p'} sub new { shift; my ($filename) = @_; open my $handle,$filename or die "can't open $filename"; bless { filename => $filename , handle => $handle , readahead => [] } , __PACKAGE__; } ############################################################################### # Simple getters sub filename { my $this = shift; return $this->{filename}; } sub _handle { my $this = shift; return $this->{handle}; } sub fatal_p { my $this = shift; return $this->{+FATAL_P}; } sub syntaxerror_p { my $this = shift; return $this->{+SYNTAXERROR_P}; } sub has_readahead_p { my $this = shift; return @{$this->{readahead}}; } sub _peek_readahead { my $this = shift; return $this->{readahead}->[$#{$this->{readahead}}]; } sub line_number_start { my $this = shift; return $this->{+LINENUM_START}; } sub line_number { my $this = shift; return $this->{+LINENUM}; } sub cdata_mode_p { my $this = shift; return $this->{+CDATA_MODE_P}; } sub pcdata_mode_p { my $this = shift; return $this->{+PCDATA_MODE_P}; } sub js_mode_p { my $this = shift; return $this->{+JS_MODE_P}; } sub cdata_close { my $this = shift; return $this->{+CDATA_CLOSE}; } sub allow_cformat_p { my $this = shift; return $this->{+ALLOW_CFORMAT_P}; } # Simple setters sub _set_fatal { my $this = shift; $this->{+FATAL_P} = $_[0]; return $this; } sub _set_syntaxerror { my $this = shift; $this->{+SYNTAXERROR_P} = $_[0]; return $this; } sub _push_readahead { my $this = shift; push @{$this->{readahead}}, $_[0]; return $this; } sub _pop_readahead { my $this = shift; return pop @{$this->{readahead}}; } sub _append_readahead { my $this = shift; $this->{readahead}->[$#{$this->{readahead}}] .= $_[0]; return $this; } sub _set_readahead { my $this = shift; $this->{readahead}->[$#{$this->{readahead}}] = $_[0]; return $this; } sub _increment_line_number { my $this = shift; $this->{+LINENUM} += 1; return $this; } sub _set_line_number_start { my $this = shift; $this->{+LINENUM_START} = $_[0]; return $this; } sub _set_cdata_mode { my $this = shift; $this->{+CDATA_MODE_P} = $_[0]; return $this; } sub _set_pcdata_mode { my $this = shift; $this->{+PCDATA_MODE_P} = $_[0]; return $this; } sub _set_js_mode { my $this = shift; $this->{+JS_MODE_P} = $_[0]; return $this; } sub _set_cdata_close { my $this = shift; $this->{+CDATA_CLOSE} = $_[0]; return $this; } sub set_allow_cformat { my $this = shift; $this->{+ALLOW_CFORMAT_P} = $_[0]; return $this; } ############################################################################### use vars qw( $js_EscapeSequence ); BEGIN { # Perl quoting is really screwed up, but this common subexp is way too long $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})}; } sub parenleft () { '(' } sub parenright () { ')' } sub split_js ($) { my ($s0) = @_; my @it = (); while (length $s0) { if ($s0 =~ /^\s+/s) { # whitespace push @it, $&; $s0 = $'; } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment push @it, $&; $s0 = $'; } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment push @it, $&; $s0 = $'; # Keyword or identifier, ECMA-262 p.13 (section 7.5) } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName push @it, $&; $s0 = $'; # Punctuator, ECMA-262 p.13 (section 7.6) } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) { push @it, $&; $s0 = $'; # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) { push @it, $&; $s0 = $'; # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3) } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) { push @it, $&; $s0 = $'; # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3) } elsif ($s0 =~ /^0[\da-fA-F]+/s) { push @it, $&; $s0 = $'; # StringLiteral, ECMA-262 p.17 (section 7.7.4) # XXX SourceCharacter doesn't seem to be defined (?) } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) { push @it, $&; $s0 = $'; } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!! push @it, $&; $s0 = $'; } } return @it; } sub STATE_UNDERSCORE () { 1 } sub STATE_PARENLEFT () { 2 } sub STATE_STRING_LITERAL () { 3 } # XXX This is a crazy hack. I don't want to write an ECMAScript parser. # XXX A scanner is one thing; a parser another thing. sub identify_js_translatables (@) { my @input = @_; my @output = (); # We mark a JavaScript translatable string as in C, i.e., _("literal") # For simplicity, we ONLY look for "_" "(" StringLiteral ")" for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) { my $reset_state_p = 0; push @output, [0, $input[$i]]; if ($input[$i] !~ /\S/s) { ; } elsif ($state == 0) { $state = STATE_UNDERSCORE if $input[$i] eq '_'; } elsif ($state == STATE_UNDERSCORE) { $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0; } elsif ($state == STATE_PARENLEFT) { if ($input[$i] =~ /^(['"])(.*)\1$/s) { ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2); } else { $state = 0; } } elsif ($state == STATE_STRING_LITERAL) { if ($input[$i] eq parenright) { $output[$j] = [1, $output[$j]->[1], $q, $s]; } $state = 0; } else { die "identify_js_translatables internal error: Unknown state $state" } } return \@output; } ############################################################################### sub _extract_attributes ($;$) { my $this = shift; my($s, $lc) = @_; my %attr; $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) { my($key, $val, $val_orig, $rest) = ($1, (defined $3? $3: defined $4? $4: $5), $2, $'); $i += 1; $attr{+lc($key)} = [$key, $val, $val_orig, $i]; $s = $rest; if ($val =~ /$re_tmpl_include/os) { warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc; } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) { # XXX: we probably should not warn if key is "onclick" etc # XXX: there's just no reasonable thing to suggest my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML'); undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i; warn_pedantic "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\"" . ": $val_orig", $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p); } elsif ($val_orig !~ /^['"]/) { my $t = $val; $t =~ s/$re_directive_control//os; warn_pedantic "Unquoted attribute contains character(s) that should be quoted" . ": $val_orig", $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p if $t =~ /[^-\.A-Za-z0-9]/s; } } my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check if ($s2 =~ /\S/s) { # should never happen if ($s =~ /^([^\n]*)\n/s) { # this is even worse error_normal("Completely confused while extracting attributes: $1", $lc); error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef); $this->_set_fatal( 1 ); } else { # There's something wrong with the attribute syntax. # We might be able to deduce a likely cause by looking more. if ($s =~ /^[a-z0-9]/is && "" =~ /^$re_tag_compat$/s) { warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc; } else { warn_normal "Strange attribute syntax: $s\n", $lc; } } } return \%attr; } sub _next_token_internal { my $this = shift; my($h) = @_; my($it, $kind); my $eof_p = 0; $this->_pop_readahead if $this->has_readahead_p && !ref $this->_peek_readahead && !length $this->_peek_readahead; if (!$this->has_readahead_p) { my $next = scalar <$h>; $eof_p = !defined $next; if (!$eof_p) { $this->_increment_line_number; $this->_push_readahead( $next ); } } $this->_set_line_number_start( $this->line_number ); # remember 1st line num if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj. ($it, $kind) = ($this->_pop_readahead, undef); } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do ; } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace ($kind, $it) = (TmplTokenType::TEXT, $&); $this->_set_readahead( $' ); # FIXME the following (the [<\s] part) is an unreliable HACK :-( } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text ($kind, $it) = (TmplTokenType::TEXT, $&); $this->_set_readahead( $' ); warn_normal "Unescaped < in $it\n", $this->line_number_start if !$this->cdata_mode_p && $it =~ /cdata_close;;) { if ($this->cdata_mode_p) { my $next = $this->_pop_readahead; if ($next =~ /^$cdata_close/is) { ($kind, $it) = (TmplTokenType::TAG, $&); $this->_push_readahead( $' ); $ok_p = 1; } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) { ($kind, $it) = (TmplTokenType::TEXT, $1); $this->_push_readahead( "$2$'" ); $ok_p = 1; } else { ($kind, $it) = (TmplTokenType::TEXT, $next); $ok_p = 1; } } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) { # If we detect a "closed start tag" but we know that the # following token looks like a TMPL_VAR, don't stop my($head, $tail, $post) = ($1, $2, $3); if ($tail eq '' && $post =~ $re_tmpl_var) { # Don't bother to show the warning if we're too confused # FIXME. There's no method for _closed_start_tag_warning if (!defined $this->{'_closed_start_tag_warning'} || ($this->{'_closed_start_tag_warning'}->[0] eq $head && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) { warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number if split(/\n/, $head) < 10; } $this->{'_closed_start_tag_warning'} = [$head, $this->line_number]; } else { ($kind, $it) = (TmplTokenType::TAG, "$head>"); $this->_set_readahead( $post ); $ok_p = 1; warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq ''; } } elsif ($this->_peek_readahead =~ /^)$re_directive*.)*-->/os) { ($kind, $it) = (TmplTokenType::COMMENT, $&); $this->_set_readahead( $' ); $ok_p = 1; $bad_comment_p = 1; } last if $ok_p; my $next = scalar <$h>; $eof_p = !defined $next; last if $eof_p; $this->_increment_line_number; $this->_append_readahead( $next ); } if ($kind ne TmplTokenType::TAG) { ; } elsif ($it =~ /^).)*-->/; if ($kind == TmplTokenType::COMMENT && $it =~ /^