4 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
7 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
9 ###############################################################################
13 TmplTokenizer.pm - Simple-minded tokenizer for HTML::Template .tmpl files
17 Because .tmpl files contains HTML::Template directives
18 that tend to confuse real parsers (e.g., HTML::Parse),
19 it might be better to create a customized scanner
20 to scan the template files for tokens.
21 This module is a simple-minded attempt at such a scanner.
25 This tokenizer is mostly based
26 on Ambrose's hideous Perl script known as subst.pl.
30 ###############################################################################
46 use vars qw( $input );
47 use vars qw( $debug_dump_only_p );
48 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
49 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
50 use vars qw( $fatal_p );
52 ###############################################################################
55 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
56 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
58 # $re_directive must not do any backreferences
59 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
60 # TMPL_VAR or TMPL_INCLUDE
61 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
62 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
63 # TMPL_VAR ESCAPE=1/HTML/URL
64 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
65 # Any control flow directive
66 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
67 # /LOOP or /IF or /UNLESS
68 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
71 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
72 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
73 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
74 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
77 my $etag = $compat? '>': '<>\/';
78 # This is no longer similar to the original regexp in subst.pl :-(
79 # Note that we don't want <> in compat mode; Mozilla knows about <
80 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
82 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
85 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
86 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
87 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
88 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
89 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
92 # End of the hideous stuff
94 sub KIND_TEXT () { 'TEXT' }
95 sub KIND_CDATA () { 'CDATA' }
96 sub KIND_TAG () { 'TAG' }
97 sub KIND_DECL () { 'DECL' }
98 sub KIND_PI () { 'PI' }
99 sub KIND_DIRECTIVE () { 'HTML::Template' }
100 sub KIND_COMMENT () { 'COMMENT' } # empty DECL with exactly one SGML comment
101 sub KIND_UNKNOWN () { 'ERROR' }
103 use vars qw( $readahead $lc_0 $lc $syntaxerror_p );
104 use vars qw( $cdata_mode_p $cdata_close );
106 ###############################################################################
114 sub syntaxerror_p () {
115 return $syntaxerror_p;
118 ###############################################################################
120 sub extract_attributes ($;$) {
123 $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
124 || $s =~ /^<\S+(.*)\S$/s; # SGML-style tags
126 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
127 my($key, $val, $val_orig, $rest)
128 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
130 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
132 if ($val =~ /$re_tmpl_include/os) {
133 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
134 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
135 # XXX: we probably should not warn if key is "onclick" etc
136 # XXX: there's just no reasonable thing to suggest
137 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
138 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
140 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
142 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
143 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
144 } elsif ($val_orig !~ /^['"]/) {
145 my $t = $val; $t =~ s/$re_directive_control//os;
147 "Unquoted attribute contains character(s) that should be quoted"
149 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
150 if $t =~ /[^-\.A-Za-z0-9]/s;
153 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
154 if ($s2 =~ /\S/s) { # should never happen
155 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
156 error_normal("Completely confused while extracting attributes: $1", $lc);
157 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
160 warn_normal "Strange attribute syntax: $s\n", $lc;
166 sub next_token_internal (*) {
170 if (!defined $readahead || !length $readahead) {
171 my $next = scalar <$h>;
172 $eof_p = !defined $next;
178 $lc_0 = $lc; # remember line number of first line
179 if ($eof_p && !length $readahead) { # nothing left to do
181 } elsif ($readahead =~ /^\s+/s) { # whitespace
182 ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
183 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
184 } elsif ($readahead =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text
185 ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
186 warn_normal "Warning: Unescaped < $it\n", $lc_0
187 if !$cdata_mode_p && $it =~ /</s;
188 } else { # tag/declaration/processing instruction
192 if ($readahead =~ /^$cdata_close/) {
193 ($kind, $it, $readahead) = (KIND_TAG, $&, $');
196 ($kind, $it, $readahead) = (KIND_TEXT, $readahead, undef);
199 } elsif ($readahead =~ /^$re_tag_compat/os) {
200 ($kind, $it, $readahead) = (KIND_TAG, "$1>", $3);
202 warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq '';
203 } elsif ($readahead =~ /^<!--(?:(?!-->).)*-->/s) {
204 ($kind, $it, $readahead) = (KIND_COMMENT, $&, $');
206 warn_normal "Syntax error in comment: $&\n", $lc_0;
210 my $next = scalar <$h>;
211 $eof_p = !defined $next;
216 if ($kind ne KIND_TAG) {
218 } elsif ($it =~ /^<!/) {
220 $kind = KIND_COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
221 } elsif ($it =~ /^<\?/) {
224 if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
225 $kind = KIND_DIRECTIVE;
227 if (!$ok_p && $eof_p) {
228 ($kind, $it, $readahead) = (KIND_UNKNOWN, $readahead, undef);
232 warn_normal "Unrecognizable token found: $it\n", $lc_0
233 if $kind eq KIND_UNKNOWN;
234 return defined $it? (wantarray? ($kind, $it):
235 [$kind, $it]): undef;
241 if (!$cdata_mode_p) {
242 $it = next_token_internal($h);
243 if (defined $it && $it->[0] eq KIND_TAG) { # FIXME
244 ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
245 if $it->[1] =~ /^<(script|style|textarea)\b/i; #FIXME
246 push @$it, extract_attributes($it->[1], $lc_0); #FIXME
251 my $next = next_token_internal($h);
252 last if !defined $next;
253 if (defined $next && $next->[1] =~ /$cdata_close/i) { #FIXME
254 ($lc, $readahead) = ($lc_prev, $next->[1] . $readahead); #FIXME
257 last unless $cdata_mode_p;
258 $it .= $next->[1]; #FIXME
260 $it = [KIND_CDATA, $it]; #FIXME
261 $cdata_close = undef;
263 return defined $it? (wantarray? @$it: $it): undef;
266 ###############################################################################
270 $s =~ s/^(?:\s|\ $re_end_entity)+//os;
271 $s =~ s/(?:\s|\ $re_end_entity)+$//os;
275 ###############################################################################
279 Code could be written to detect template variables and
280 construct gettext-c-format-string-like meta-strings (e.g., "Results %s
281 through %s of %s records" that will be more likely to be translatable
282 to languages where word order is very unlike English word order.
283 This will be relatively major rework, requiring corresponding
284 rework in tmpl_process.pl