misc/translator/TmplTokenizer.pm

   1 package TmplTokenizer;
   2
   3 use strict;
   4 use TmplTokenType;
   5 use TmplToken;
   6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
   7 require Exporter;
   8
   9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10
  11 ###############################################################################
  12
  13 =head1 NAME
  14
  15 TmplTokenizer.pm - Simple-minded tokenizer for HTML::Template .tmpl files
  16
  17 =head1 DESCRIPTION
  18
  19 Because .tmpl files contains HTML::Template directives
  20 that tend to confuse real parsers (e.g., HTML::Parse),
  21 it might be better to create a customized scanner
  22 to scan the template files for tokens.
  23 This module is a simple-minded attempt at such a scanner.
  24
  25 =head1 HISTORY
  26
  27 This tokenizer is mostly based
  28 on Ambrose's hideous Perl script known as subst.pl.
  29
  30 =cut
  31
  32 ###############################################################################
  33
  34 $VERSION = 0.01;
  35
  36 @ISA = qw(Exporter);
  37 @EXPORT_OK = qw();
  38
  39 use vars qw( $input );
  40 use vars qw( $debug_dump_only_p );
  41 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  42 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
  43 use vars qw( $fatal_p );
  44
  45 ###############################################################################
  46
  47 # Hideous stuff
  48 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
  49 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
  50 BEGIN {
  51     # $re_directive must not do any backreferences
  52     $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  53     # TMPL_VAR or TMPL_INCLUDE
  54     $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  55     $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  56     # TMPL_VAR ESCAPE=1/HTML/URL
  57     $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  58     # Any control flow directive
  59     $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  60     # /LOOP or /IF or /UNLESS
  61     $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  62 }
  63
  64 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
  65 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
  66 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
  67 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
  68 sub re_tag ($) {
  69    my($compat) = @_;
  70    my $etag = $compat? '>': '<>\/';
  71    # This is no longer similar to the original regexp in subst.pl :-(
  72    # Note that we don't want <> in compat mode; Mozilla knows about <
  73    q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
  74    . $re_directive
  75    . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
  76 }
  77 BEGIN {
  78     $re_comment = '(?:--(?:[^-]|-[^-])*--)';
  79     $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
  80     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
  81     $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
  82     @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
  83 }
  84
  85 # End of the hideous stuff
  86
  87 use vars qw( @readahead $lc_0 $lc $syntaxerror_p );
  88 use vars qw( $cdata_mode_p $cdata_close );
  89
  90 ###############################################################################
  91
  92 # Easy accessors
  93
  94 sub fatal_p () {
  95     return $fatal_p;
  96 }
  97
  98 sub syntaxerror_p () {
  99     return $syntaxerror_p;
 100 }
 101
 102 ###############################################################################
 103
 104 sub extract_attributes ($;$) {
 105     my($s, $lc) = @_;
 106     my %attr;
 107     $s = $1 if $s =~ /^<\S+(.*)\/\S$/s  # XML-style self-closing tags
 108             || $s =~ /^<\S+(.*)\S$/s;   # SGML-style tags
 109
 110     for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
 111         my($key, $val, $val_orig, $rest)
 112                 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
 113         $i += 1;
 114         $attr{+lc($key)} = [$key, $val, $val_orig, $i];
 115         $s = $rest;
 116         if ($val =~ /$re_tmpl_include/os) {
 117             warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
 118         } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
 119             # XXX: we probably should not warn if key is "onclick" etc
 120             # XXX: there's just no reasonable thing to suggest
 121             my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
 122             undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
 123             warn_pedantic
 124                     "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
 125                         . ": $val_orig",
 126                     $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
 127                 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
 128         } elsif ($val_orig !~ /^['"]/) {
 129             my $t = $val; $t =~ s/$re_directive_control//os;
 130             warn_pedantic
 131                 "Unquoted attribute contains character(s) that should be quoted"
 132                     . ": $val_orig",
 133                 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
 134                 if $t =~ /[^-\.A-Za-z0-9]/s;
 135         }
 136     }
 137     my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
 138     if ($s2 =~ /\S/s) { # should never happen
 139         if ($s =~ /^([^\n]*)\n/s) { # this is even worse
 140             error_normal("Completely confused while extracting attributes: $1", $lc);
 141             error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
 142             $fatal_p = 1;
 143         } else {
 144             warn_normal "Strange attribute syntax: $s\n", $lc;
 145         }
 146     }
 147     return \%attr;
 148 }
 149
 150 sub next_token_internal (*) {
 151     my($h) = @_;
 152     my($it, $kind);
 153     my $eof_p = 0;
 154     pop @readahead if @readahead && !ref $readahead[$#readahead]
 155             && !length $readahead[$#readahead];
 156     if (!@readahead) {
 157         my $next = scalar <$h>;
 158         $eof_p = !defined $next;
 159         if (!$eof_p) {
 160             $lc += 1;
 161             push @readahead, $next;
 162         }
 163     }
 164     $lc_0 = $lc;                        # remember line number of first line
 165     if (@readahead && ref $readahead[$#readahead]) {    # TmplToken object
 166         my $t = pop @readahead;
 167         ($it, $kind, local $lc) = ($t->string, $t->type, $t->line_number);
 168     } elsif ($eof_p && !@readahead) {   # nothing left to do
 169         ;
 170     } elsif ($readahead[$#readahead] =~ /^\s+/s) {      # whitespace
 171         ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TEXT, $&, $');
 172     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
 173     } elsif ($readahead[$#readahead] =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text
 174         ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TEXT, $&, $');
 175         warn_normal "Unescaped < in $it\n", $lc_0
 176                 if !$cdata_mode_p && $it =~ /</s;
 177     } else {                            # tag/declaration/processing instruction
 178         my $ok_p = 0;
 179         for (;;) {
 180             if ($cdata_mode_p) {
 181                 if ($readahead[$#readahead] =~ /^$cdata_close/) {
 182                     ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TAG, $&, $');
 183                     $ok_p = 1;
 184                 } else {
 185                     ($kind, $it) = (TmplTokenType::TEXT, pop @readahead);
 186                     $ok_p = 1;
 187                 }
 188             } elsif ($readahead[$#readahead] =~ /^$re_tag_compat/os) {
 189                 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::TAG, "$1>", $3);
 190                 $ok_p = 1;
 191                 warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq '';
 192             } elsif ($readahead[$#readahead] =~ /^<!--(?:(?!-->).)*-->/s) {
 193                 ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::COMMENT, $&, $');
 194                 $ok_p = 1;
 195                 warn_normal "Syntax error in comment: $&\n", $lc_0;
 196                 $syntaxerror_p = 1;
 197             }
 198         last if $ok_p;
 199             my $next = scalar <$h>;
 200             $eof_p = !defined $next;
 201         last if $eof_p;
 202             $lc += 1;
 203             $readahead[$#readahead] .= $next;
 204         }
 205         if ($kind ne TmplTokenType::TAG) {
 206             ;
 207         } elsif ($it =~ /^<!/) {
 208             $kind = TmplTokenType::DECL;
 209             $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 210         } elsif ($it =~ /^<\?/) {
 211             $kind = TmplTokenType::PI;
 212         }
 213         if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
 214             $kind = TmplTokenType::DIRECTIVE;
 215         }
 216         if (!$ok_p && $eof_p) {
 217             ($kind, $it, $readahead[$#readahead]) = (TmplTokenType::UNKNOWN, $readahead[$#readahead], undef);
 218             $syntaxerror_p = 1;
 219         }
 220     }
 221     warn_normal "Unrecognizable token found: $it\n", $lc_0
 222             if $kind eq TmplTokenType::UNKNOWN;
 223     return defined $it? TmplToken->new($it, $kind, $lc): undef;
 224 }
 225
 226 sub next_token (*) {
 227     my($h) = @_;
 228     my $it;
 229     if (!$cdata_mode_p) {
 230         $it = next_token_internal($h);
 231         if (defined $it && $it->type eq TmplTokenType::TAG) {
 232             ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
 233                     if $it->string =~ /^<(script|style|textarea)\b/i;
 234             $it->set_attributes( extract_attributes($it->string, $lc_0) );
 235         }
 236     } else {
 237         for ($it = '';;) {
 238             my $lc_prev = $lc;
 239             my $next = next_token_internal($h);
 240         last if !defined $next;
 241             if (defined $next && $next->string =~ /$cdata_close/i) {
 242                 push @readahead, $next; # push the entire TmplToken object
 243                 #$lc = $lc_prev; XXX
 244                 $cdata_mode_p = 0;
 245             }
 246         last unless $cdata_mode_p;
 247             $it .= $next->string;
 248         }
 249         $it = TmplToken->new( $it, TmplTokenType::CDATA, $lc );
 250         $cdata_close = undef;
 251     }
 252     return $it;
 253 }
 254
 255 ###############################################################################
 256
 257 # Other easy functions
 258
 259 sub blank_p ($) {
 260     my($s) = @_;
 261     return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var)*$/os;
 262 }
 263
 264 sub trim ($) {
 265     my($s) = @_;
 266     $s =~ s/^(?:\s|\&nbsp$re_end_entity)+//os;
 267     $s =~ s/(?:\s|\&nbsp$re_end_entity)+$//os;
 268     return $s;
 269 }
 270
 271 ###############################################################################
 272
 273 =head1 FUTURE PLANS
 274
 275 Code could be written to detect template variables and
 276 construct gettext-c-format-string-like meta-strings (e.g., "Results %s
 277 through %s of %s records" that will be more likely to be translatable
 278 to languages where word order is very unlike English word order.
 279 This will be relatively major rework, requiring corresponding
 280 rework in tmpl_process.pl
 281
 282 Gettext-style line number references would also be very helpful in
 283 disambiguating the strings. Ultimately, we should generate and work
 284 with gettext-style po files, so that translators are able to use
 285 tools designed for gettext.
 286
 287 An example of a string untranslatable to Chinese is "Accounts for";
 288 "Accounts for %s", however, would be translatable. Short words like
 289 "in" would also be untranslatable, not only to Chinese, but also to
 290 languages requiring declension of nouns.
 291
 292 =cut
 293
 294 1;