6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 This tokenizer is mostly based
28 on Ambrose's hideous Perl script known as subst.pl.
32 ###############################################################################
39 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
40 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
42 ###############################################################################
45 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
46 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
48 # $re_directive must not do any backreferences
49 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
50 # TMPL_VAR or TMPL_INCLUDE
51 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
52 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
53 # TMPL_VAR ESCAPE=1/HTML/URL
54 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
55 # Any control flow directive
56 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
57 # /LOOP or /IF or /UNLESS
58 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
61 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
62 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
63 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
64 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
67 my $etag = $compat? '>': '<>\/';
68 # This is no longer similar to the original regexp in subst.pl :-(
69 # Note that we don't want <> in compat mode; Mozilla knows about <
70 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
72 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
75 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
76 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
77 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
78 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
79 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
82 # End of the hideous stuff
84 use vars qw( $serial );
86 ###############################################################################
88 sub FATAL_P () {'fatal-p'}
89 sub SYNTAXERROR_P () {'syntaxerror-p'}
91 sub FILENAME () {'input'}
92 sub HANDLE () {'handle'}
94 sub READAHEAD () {'readahead'}
95 sub LINENUM_START () {'lc_0'}
97 sub CDATA_MODE_P () {'cdata-mode-p'}
98 sub CDATA_CLOSE () {'cdata-close'}
103 my $class = ref($this) || $this;
107 my $handle = sprintf('TMPLTOKENIZER%d', $serial);
111 open($handle, "<$input") || die "$input: $!\n";
113 $self->{+FILENAME} = $input;
114 $self->{+HANDLE} = $handle;
115 $self->{+READAHEAD} = [];
119 ###############################################################################
125 return $this->{+FILENAME};
130 return $this->{+HANDLE};
135 return $this->{+FATAL_P};
140 return $this->{+SYNTAXERROR_P};
143 sub has_readahead_p {
145 return @{$this->{+READAHEAD}};
148 sub _peek_readahead {
150 return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
153 sub line_number_start {
155 return $this->{+LINENUM_START};
160 return $this->{+LINENUM};
165 return $this->{+CDATA_MODE_P};
170 return $this->{+CDATA_CLOSE};
177 $this->{+FATAL_P} = $_[0];
181 sub _set_syntaxerror {
183 $this->{+SYNTAXERROR_P} = $_[0];
187 sub _push_readahead {
189 push @{$this->{+READAHEAD}}, $_[0];
195 return pop @{$this->{+READAHEAD}};
198 sub _append_readahead {
200 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
206 $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
210 sub _increment_line_number {
212 $this->{+LINENUM} += 1;
216 sub _set_line_number_start {
218 $this->{+LINENUM_START} = $_[0];
222 sub _set_cdata_mode {
224 $this->{+CDATA_MODE_P} = $_[0];
228 sub _set_cdata_close {
230 $this->{+CDATA_CLOSE} = $_[0];
234 ###############################################################################
236 sub _extract_attributes ($;$) {
240 $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
241 || $s =~ /^<\S+(.*)\S$/s; # SGML-style tags
243 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
244 my($key, $val, $val_orig, $rest)
245 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
247 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
249 if ($val =~ /$re_tmpl_include/os) {
250 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
251 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
252 # XXX: we probably should not warn if key is "onclick" etc
253 # XXX: there's just no reasonable thing to suggest
254 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
255 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
257 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
259 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
260 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
261 } elsif ($val_orig !~ /^['"]/) {
262 my $t = $val; $t =~ s/$re_directive_control//os;
264 "Unquoted attribute contains character(s) that should be quoted"
266 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
267 if $t =~ /[^-\.A-Za-z0-9]/s;
270 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
271 if ($s2 =~ /\S/s) { # should never happen
272 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
273 error_normal("Completely confused while extracting attributes: $1", $lc);
274 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
275 $this->_set_fatal( 1 );
277 warn_normal "Strange attribute syntax: $s\n", $lc;
283 sub _next_token_internal {
288 $this->_pop_readahead if $this->has_readahead_p
289 && !ref $this->_peek_readahead
290 && !length $this->_peek_readahead;
291 if (!$this->has_readahead_p) {
292 my $next = scalar <$h>;
293 $eof_p = !defined $next;
295 $this->_increment_line_number;
296 $this->_push_readahead( $next );
299 $this->_set_line_number_start( $this->line_number ); # remember 1st line num
300 if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
301 ($it, $kind) = ($this->_pop_readahead, undef);
302 } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
304 } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
305 ($kind, $it) = (TmplTokenType::TEXT, $&);
306 $this->_set_readahead( $' );
307 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
308 } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text
309 ($kind, $it) = (TmplTokenType::TEXT, $&);
310 $this->_set_readahead( $' );
311 warn_normal "Unescaped < in $it\n", $this->line_number_start
312 if !$this->cdata_mode_p && $it =~ /</s;
313 } else { # tag/declaration/processing instruction
315 for (my $cdata_close = $this->cdata_close;;) {
316 if ($this->cdata_mode_p) {
317 if ($this->_peek_readahead =~ /^$cdata_close/) {
318 ($kind, $it) = (TmplTokenType::TAG, $&);
319 $this->_set_readahead( $' );
322 ($kind, $it) = (TmplTokenType::TEXT, $this->_pop_readahead);
325 } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
326 ($kind, $it) = (TmplTokenType::TAG, "$1>");
327 $this->_set_readahead( $3 );
329 warn_normal "SGML \"closed start tag\" notation: $1<\n", $this->line_number_start if $2 eq '';
330 } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
331 ($kind, $it) = (TmplTokenType::COMMENT, $&);
332 $this->_set_readahead( $' );
334 warn_normal "Syntax error in comment: $&\n", $this->line_number_start;
335 $this->_set_syntaxerror( 1 );
338 my $next = scalar <$h>;
339 $eof_p = !defined $next;
341 $this->_increment_line_number;
342 $this->_append_readahead( $next );
344 if ($kind ne TmplTokenType::TAG) {
346 } elsif ($it =~ /^<!/) {
347 $kind = TmplTokenType::DECL;
348 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
349 } elsif ($it =~ /^<\?/) {
350 $kind = TmplTokenType::PI;
352 if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
353 $kind = TmplTokenType::DIRECTIVE;
355 if (!$ok_p && $eof_p) {
356 ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
357 $this->_set_readahead, undef;
358 $this->_set_syntaxerror( 1 );
361 warn_normal "Unrecognizable token found: $it\n", $this->line_number_start
362 if $kind eq TmplTokenType::UNKNOWN;
363 return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
368 my $h = $this->_handle;
370 if (!$this->cdata_mode_p) {
371 $it = $this->_next_token_internal($h);
372 if (defined $it && $it->type eq TmplTokenType::TAG) {
373 if ($it->string =~ /^<(script|style|textarea)\b/i) {
374 $this->_set_cdata_mode( 1 );
375 $this->_set_cdata_close( "</$1\\s*>" );
377 $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
380 for ($it = '', my $cdata_close = $this->cdata_close;;) {
381 my $next = $this->_next_token_internal($h);
382 last if !defined $next;
383 if (defined $next && $next->string =~ /$cdata_close/i) {
384 $this->_push_readahead( $next ); # push entire TmplToken object
385 $this->_set_cdata_mode( 0 );
387 last unless $this->cdata_mode_p;
388 $it .= $next->string;
390 $it = TmplToken->new( $it, TmplTokenType::CDATA, $this->line_number );
391 $this->_set_cdata_close, undef;
396 ###############################################################################
398 # Other easy functions
402 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
407 $s =~ s/^(?:\s|\ $re_end_entity)+//os;
408 $s =~ s/(?:\s|\ $re_end_entity)+$//os;
412 ###############################################################################
416 Code could be written to detect template variables and
417 construct gettext-c-format-string-like meta-strings (e.g., "Results %s
418 through %s of %s records" that will be more likely to be translatable
419 to languages where word order is very unlike English word order.
420 This will be relatively major rework, requiring corresponding
421 rework in tmpl_process.pl
423 Gettext-style line number references would also be very helpful in
424 disambiguating the strings. Ultimately, we should generate and work
425 with gettext-style po files, so that translators are able to use
426 tools designed for gettext.
428 An example of a string untranslatable to Chinese is "Accounts for";
429 "Accounts for %s", however, would be translatable. Short words like
430 "in" would also be untranslatable, not only to Chinese, but also to
431 languages requiring declension of nouns.