6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 This tokenizer is mostly based
28 on Ambrose's hideous Perl script known as subst.pl.
32 ###############################################################################
39 use vars qw( $input );
40 use vars qw( $debug_dump_only_p );
41 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
42 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
43 use vars qw( $fatal_p );
45 ###############################################################################
48 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
49 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
51 # $re_directive must not do any backreferences
52 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
53 # TMPL_VAR or TMPL_INCLUDE
54 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
55 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
56 # TMPL_VAR ESCAPE=1/HTML/URL
57 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
58 # Any control flow directive
59 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
60 # /LOOP or /IF or /UNLESS
61 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
64 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
65 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
66 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
67 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
70 my $etag = $compat? '>': '<>\/';
71 # This is no longer similar to the original regexp in subst.pl :-(
72 # Note that we don't want <> in compat mode; Mozilla knows about <
73 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
75 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
78 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
79 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
80 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
81 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
82 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
85 # End of the hideous stuff
87 use vars qw( @readahead $lc_0 $lc $syntaxerror_p );
88 use vars qw( $cdata_mode_p $cdata_close );
90 ###############################################################################
98 sub syntaxerror_p () {
99 return $syntaxerror_p;
102 ###############################################################################
104 sub extract_attributes ($;$) {
107 $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
108 || $s =~ /^<\S+(.*)\S$/s; # SGML-style tags
110 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
111 my($key, $val, $val_orig, $rest)
112 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
114 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
116 if ($val =~ /$re_tmpl_include/os) {
117 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
118 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
119 # XXX: we probably should not warn if key is "onclick" etc
120 # XXX: there's just no reasonable thing to suggest
121 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
122 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
124 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
126 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
127 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
128 } elsif ($val_orig !~ /^['"]/) {
129 my $t = $val; $t =~ s/$re_directive_control//os;
131 "Unquoted attribute contains character(s) that should be quoted"
133 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
134 if $t =~ /[^-\.A-Za-z0-9]/s;
137 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
138 if ($s2 =~ /\S/s) { # should never happen
139 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
140 error_normal("Completely confused while extracting attributes: $1", $lc);
141 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
144 warn_normal "Strange attribute syntax: $s\n", $lc;
150 sub next_token_internal (*) {
154 pop @readahead if @readahead && !ref $readahead[$#readahead]
155 && !length $readahead[$#readahead];
157 my $next = scalar <$h>;
158 $eof_p = !defined $next;
161 push @readahead, $next;
164 $lc_0 = $lc; # remember line number of first line
165 if (@readahead && ref $readahead[$#readahead]) { # TmplToken object
166 my $t = pop @readahead;
167 ($it, $kind, local $lc) = ($t->string, $t->type, $t->line_number);
168 } elsif ($eof_p && !@readahead) { # nothing left to do
170 } elsif ($readahead[$#readahead] =~ /^\s+/s) { # whitespace
171 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TEXT, $&, $');
172 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
173 } elsif ($readahead[$#readahead] =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text
174 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TEXT, $&, $');
175 warn_normal "Unescaped < in $it\n", $lc_0
176 if !$cdata_mode_p && $it =~ /</s;
177 } else { # tag/declaration/processing instruction
181 if ($readahead[$#readahead] =~ /^$cdata_close/) {
182 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TAG, $&, $');
185 ($kind, $it) = (TmplTokenType::TEXT, pop @readahead);
188 } elsif ($readahead[$#readahead] =~ /^$re_tag_compat/os) {
189 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TAG, "$1>", $3);
191 warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq '';
192 } elsif ($readahead[$#readahead] =~ /^<!--(?:(?!-->).)*-->/s) {
193 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::COMMENT, $&, $');
195 warn_normal "Syntax error in comment: $&\n", $lc_0;
199 my $next = scalar <$h>;
200 $eof_p = !defined $next;
203 $readahead[$#readahead] .= $next;
205 if ($kind ne TmplTokenType::TAG) {
207 } elsif ($it =~ /^<!/) {
208 $kind = TmplTokenType::DECL;
209 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
210 } elsif ($it =~ /^<\?/) {
211 $kind = TmplTokenType::PI;
213 if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
214 $kind = TmplTokenType::DIRECTIVE;
216 if (!$ok_p && $eof_p) {
217 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::UNKNOWN, $readahead[$#readahead], undef);
221 warn_normal "Unrecognizable token found: $it\n", $lc_0
222 if $kind eq TmplTokenType::UNKNOWN;
223 return defined $it? TmplToken->new($it, $kind, $lc): undef;
229 if (!$cdata_mode_p) {
230 $it = next_token_internal($h);
231 if (defined $it && $it->type eq TmplTokenType::TAG) {
232 ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
233 if $it->string =~ /^<(script|style|textarea)\b/i;
234 $it->set_attributes( extract_attributes($it->string, $lc_0) );
239 my $next = next_token_internal($h);
240 last if !defined $next;
241 if (defined $next && $next->string =~ /$cdata_close/i) {
242 push @readahead, $next; # push the entire TmplToken object
246 last unless $cdata_mode_p;
247 $it .= $next->string;
249 $it = TmplToken->new( $it, TmplTokenType::CDATA, $lc );
250 $cdata_close = undef;
255 ###############################################################################
257 # Other easy functions
261 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
266 $s =~ s/^(?:\s|\ $re_end_entity)+//os;
267 $s =~ s/(?:\s|\ $re_end_entity)+$//os;
271 ###############################################################################
275 Code could be written to detect template variables and
276 construct gettext-c-format-string-like meta-strings (e.g., "Results %s
277 through %s of %s records" that will be more likely to be translatable
278 to languages where word order is very unlike English word order.
279 This will be relatively major rework, requiring corresponding
280 rework in tmpl_process.pl
282 Gettext-style line number references would also be very helpful in
283 disambiguating the strings. Ultimately, we should generate and work
284 with gettext-style po files, so that translators are able to use
285 tools designed for gettext.
287 An example of a string untranslatable to Chinese is "Accounts for";
288 "Accounts for %s", however, would be translatable. Short words like
289 "in" would also be untranslatable, not only to Chinese, but also to
290 languages requiring declension of nouns.