4 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
7 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
9 ###############################################################################
13 TmplTokenizer.pm - Simple-minded tokenizer for HTML::Template .tmpl files
17 Because .tmpl files contains HTML::Template directives
18 that tend to confuse real parsers (e.g., HTML::Parse),
19 it might be better to create a customized scanner
20 to scan the template files for tokens.
21 This module is a simple-minded attempt at such a scanner.
25 This tokenizer is mostly based
26 on Ambrose's hideous Perl script known as subst.pl.
30 ###############################################################################
46 use vars qw( $input );
47 use vars qw( $debug_dump_only_p );
48 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
49 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
50 use vars qw( $fatal_p );
52 ###############################################################################
55 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
56 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
58 # $re_directive must not do any backreferences
59 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
60 # TMPL_VAR or TMPL_INCLUDE
61 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
62 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
63 # TMPL_VAR ESCAPE=1/HTML/URL
64 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
65 # Any control flow directive
66 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
67 # /LOOP or /IF or /UNLESS
68 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
71 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
72 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
73 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
74 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
77 my $etag = $compat? '>': '<>\/';
78 # This is no longer similar to the original regexp in subst.pl :-(
79 # Note that we don't want <> in compat mode; Mozilla knows about <
80 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
82 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
85 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
86 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
87 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
88 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
89 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
92 # End of the hideous stuff
94 sub KIND_TEXT () { 'TEXT' }
95 sub KIND_CDATA () { 'CDATA' }
96 sub KIND_TAG () { 'TAG' }
97 sub KIND_DECL () { 'DECL' }
98 sub KIND_PI () { 'PI' }
99 sub KIND_DIRECTIVE () { 'HTML::Template' }
100 sub KIND_COMMENT () { 'COMMENT' } # empty DECL with exactly one SGML comment
101 sub KIND_UNKNOWN () { 'ERROR' }
103 use vars qw( $readahead $lc_0 $lc $syntaxerror_p );
104 use vars qw( $cdata_mode_p $cdata_close );
106 ###############################################################################
114 sub syntaxerror_p () {
115 return $syntaxerror_p;
118 ###############################################################################
120 sub extract_attributes ($;$) {
123 $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
124 || $s =~ /^<\S+(.*)\S$/s; # SGML-style tags
126 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
127 my($key, $val, $val_orig, $rest)
128 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
130 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
132 if ($val =~ /$re_tmpl_include/os) {
133 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
134 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
135 # XXX: we probably should not warn if key is "onclick" etc
136 # XXX: there's just no reasonable thing to suggest
137 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
138 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
140 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
142 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
143 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
144 } elsif ($val_orig !~ /^['"]/) {
145 my $t = $val; $t =~ s/$re_directive_control//os;
147 "Unquoted attribute contains character(s) that should be quoted"
149 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
150 if $t =~ /[^-\.A-Za-z0-9]/s;
153 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
154 if ($s2 =~ /\S/s) { # should never happen
155 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
156 error_normal("Completely confused while extracting attributes: $1", $lc);
157 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
160 warn_normal "Strange attribute syntax: $s\n", $lc;
166 sub next_token_internal (*) {
170 if (!defined $readahead || !length $readahead) {
171 my $next = scalar <$h>;
172 $eof_p = !defined $next;
178 $lc_0 = $lc; # remember line number of first line
179 if ($eof_p && !length $readahead) { # nothing left to do
181 } elsif ($readahead =~ /^\s+/s) { # whitespace
182 ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
183 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
184 } elsif ($readahead =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text
185 ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
186 warn_normal "Warning: Unescaped < $it\n", $lc_0
187 if !$cdata_mode_p && $it =~ /</s;
188 } else { # tag/declaration/processing instruction
192 if ($readahead =~ /^$cdata_close/) {
193 ($kind, $it, $readahead) = (KIND_TAG, $&, $');
196 ($kind, $it, $readahead) = (KIND_TEXT, $readahead, undef);
199 } elsif ($readahead =~ /^$re_tag_compat/os) {
200 ($kind, $it, $readahead) = (KIND_TAG, "$1>", $3);
202 warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq '';
203 } elsif ($readahead =~ /^<!--(?:(?!-->).)*-->/s) {
204 ($kind, $it, $readahead) = (KIND_COMMENT, $&, $');
206 warn_normal "Syntax error in comment: $&\n", $lc_0;
210 my $next = scalar <$h>;
211 $eof_p = !defined $next;
216 if ($kind ne KIND_TAG) {
218 } elsif ($it =~ /^<!/) {
220 $kind = KIND_COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
221 } elsif ($it =~ /^<\?/) {
224 if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
225 $kind = KIND_DIRECTIVE;
227 if (!$ok_p && $eof_p) {
228 ($kind, $it, $readahead) = (KIND_UNKNOWN, $readahead, undef);
232 warn_normal "Unrecognizable token found: $it\n", $lc_0
233 if $kind eq KIND_UNKNOWN;
234 return defined $it? (wantarray? ($kind, $it):
235 [$kind, $it]): undef;
241 if (!$cdata_mode_p) {
242 $it = next_token_internal($h);
243 if (defined $it && $it->[0] eq KIND_TAG) { # FIXME
244 ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
245 if $it->[1] =~ /^<(script|style|textarea)\b/i; #FIXME
246 push @$it, extract_attributes($it->[1], $lc_0); #FIXME
251 my $next = next_token_internal($h);
252 last if !defined $next;
253 if (defined $next && $next->[1] =~ /$cdata_close/i) { #FIXME
254 ($lc, $readahead) = ($lc_prev, $next->[1] . $readahead); #FIXME
257 last unless $cdata_mode_p;
258 $it .= $next->[1]; #FIXME
260 $it = [KIND_CDATA, $it]; #FIXME
261 $cdata_close = undef;
263 return defined $it? (wantarray? @$it: $it): undef;
266 ###############################################################################
268 sub debug_dump (*) { # for testing only
270 print "re_tag_compat is /$re_tag_compat/\n";
272 my $s = next_token $h;
273 last unless defined $s;
274 printf "%s\n", ('-' x 79);
275 my($kind, $t, $attr) = @$s; # FIXME
276 printf "%s:\n", $kind;
277 printf "%4dH%s\n", length($t),
278 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
279 if ($kind eq KIND_TAG && %$attr) {
280 printf "Attributes:\n";
281 for my $a (keys %$attr) {
282 my($key, $val, $val_orig, $order) = @{$attr->{$a}};
283 printf "%s = %dH%s -- %s\n", $a, length $val,
284 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $val)),
291 ###############################################################################
295 $s =~ s/^(?:\s|\ $re_end_entity)+//os;
296 $s =~ s/(?:\s|\ $re_end_entity)+$//os;
300 ###############################################################################
302 sub text_extract (*) {
306 my $s = next_token $h;
307 last unless defined $s;
308 my($kind, $t, $attr) = @$s; # FIXME
309 if ($kind eq KIND_TEXT) {
311 $text{$t} = 1 if $t =~ /\S/s;
312 } elsif ($kind eq KIND_TAG && %$attr) {
313 # value [tag=input], meta
314 my $tag = lc($1) if $t =~ /^<(\S+)/s;
315 for my $a ('alt', 'content', 'title', 'value') {
317 next if $a eq 'content' && $tag ne 'meta';
318 next if $a eq 'value' && ($tag ne 'input'
319 || (ref $attr->{'type'} && $attr->{'type'}->[1] eq 'hidden')); # FIXME
320 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
322 $text{$val} = 1 if $val =~ /\S/s;
327 # Emit all extracted strings.
328 # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
329 for my $t (keys %text) {
331 unless $t =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os || $t =~ /^\d+$/;
335 ###############################################################################
339 my $h = $exitcode? *STDERR: *STDOUT;
342 Extract strings from HTML file.
344 --debug-dump-only Do not extract strings; but display scanned tokens
345 -f, --file=FILE Extract from the specified FILE
346 --pedantic-warnings Issue warnings even for detected problems which
347 are likely to be harmless
348 --help Display this help and exit
353 ###############################################################################
355 sub usage_error (;$) {
356 print STDERR "$_[0]\n" if @_;
357 print STDERR "Try `$0 --help' for more information.\n";
361 ###############################################################################
365 Code could be written to detect template variables and
366 construct gettext-c-format-string-like meta-strings (e.g., "Results %s
367 through %s of %s records" that will be more likely to be translatable
368 to languages where word order is very unlike English word order.
369 This will be relatively major rework, requiring corresponding
370 rework in tmpl_process.pl