misc/translator/TmplTokenizer.pm

   1 package TmplTokenizer;
   2
   3 use strict;
   4 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
   5 require Exporter;
   6
   7 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
   8
   9 ###############################################################################
  10
  11 =head1 NAME
  12
  13 TmplTokenizer.pm - Simple-minded tokenizer for HTML::Template .tmpl files
  14
  15 =head1 DESCRIPTION
  16
  17 Because .tmpl files contains HTML::Template directives
  18 that tend to confuse real parsers (e.g., HTML::Parse),
  19 it might be better to create a customized scanner
  20 to scan the template files for tokens.
  21 This module is a simple-minded attempt at such a scanner.
  22
  23 =head1 HISTORY
  24
  25 This tokenizer is mostly based
  26 on Ambrose's hideous Perl script known as subst.pl.
  27
  28 =cut
  29
  30 ###############################################################################
  31
  32 $VERSION = 0.01;
  33
  34 @ISA = qw(Exporter);
  35 @EXPORT_OK = qw(
  36     &KIND_TEXT
  37     &KIND_CDATA
  38     &KIND_TAG
  39     &KIND_DECL
  40     &KIND_PI
  41     &KIND_DIRECTIVE
  42     &KIND_COMMENT
  43     &KIND_UNKNOWN
  44 );
  45
  46 use vars qw( $input );
  47 use vars qw( $debug_dump_only_p );
  48 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  49 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
  50 use vars qw( $fatal_p );
  51
  52 ###############################################################################
  53
  54 # Hideous stuff
  55 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
  56 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
  57 BEGIN {
  58     # $re_directive must not do any backreferences
  59     $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  60     # TMPL_VAR or TMPL_INCLUDE
  61     $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  62     $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  63     # TMPL_VAR ESCAPE=1/HTML/URL
  64     $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  65     # Any control flow directive
  66     $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  67     # /LOOP or /IF or /UNLESS
  68     $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  69 }
  70
  71 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
  72 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
  73 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
  74 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
  75 sub re_tag ($) {
  76    my($compat) = @_;
  77    my $etag = $compat? '>': '<>\/';
  78    # This is no longer similar to the original regexp in subst.pl :-(
  79    # Note that we don't want <> in compat mode; Mozilla knows about <
  80    q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
  81    . $re_directive
  82    . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
  83 }
  84 BEGIN {
  85     $re_comment = '(?:--(?:[^-]|-[^-])*--)';
  86     $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
  87     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
  88     $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
  89     @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
  90 }
  91
  92 # End of the hideous stuff
  93
  94 sub KIND_TEXT      () { 'TEXT' }
  95 sub KIND_CDATA     () { 'CDATA' }
  96 sub KIND_TAG       () { 'TAG' }
  97 sub KIND_DECL      () { 'DECL' }
  98 sub KIND_PI        () { 'PI' }
  99 sub KIND_DIRECTIVE () { 'HTML::Template' }
 100 sub KIND_COMMENT   () { 'COMMENT' }   # empty DECL with exactly one SGML comment
 101 sub KIND_UNKNOWN   () { 'ERROR' }
 102
 103 use vars qw( $readahead $lc_0 $lc $syntaxerror_p );
 104 use vars qw( $cdata_mode_p $cdata_close );
 105
 106 ###############################################################################
 107
 108 # Easy accessors
 109
 110 sub fatal_p () {
 111     return $fatal_p;
 112 }
 113
 114 sub syntaxerror_p () {
 115     return $syntaxerror_p;
 116 }
 117
 118 ###############################################################################
 119
 120 sub extract_attributes ($;$) {
 121     my($s, $lc) = @_;
 122     my %attr;
 123     $s = $1 if $s =~ /^<\S+(.*)\/\S$/s  # XML-style self-closing tags
 124             || $s =~ /^<\S+(.*)\S$/s;   # SGML-style tags
 125
 126     for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
 127         my($key, $val, $val_orig, $rest)
 128                 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
 129         $i += 1;
 130         $attr{+lc($key)} = [$key, $val, $val_orig, $i];
 131         $s = $rest;
 132         if ($val =~ /$re_tmpl_include/os) {
 133             warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
 134         } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
 135             # XXX: we probably should not warn if key is "onclick" etc
 136             # XXX: there's just no reasonable thing to suggest
 137             my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
 138             undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
 139             warn_pedantic
 140                     "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
 141                         . ": $val_orig",
 142                     $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
 143                 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
 144         } elsif ($val_orig !~ /^['"]/) {
 145             my $t = $val; $t =~ s/$re_directive_control//os;
 146             warn_pedantic
 147                 "Unquoted attribute contains character(s) that should be quoted"
 148                     . ": $val_orig",
 149                 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
 150                 if $t =~ /[^-\.A-Za-z0-9]/s;
 151         }
 152     }
 153     my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
 154     if ($s2 =~ /\S/s) { # should never happen
 155         if ($s =~ /^([^\n]*)\n/s) { # this is even worse
 156             error_normal("Completely confused while extracting attributes: $1", $lc);
 157             error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
 158             $fatal_p = 1;
 159         } else {
 160             warn_normal "Strange attribute syntax: $s\n", $lc;
 161         }
 162     }
 163     return \%attr;
 164 }
 165
 166 sub next_token_internal (*) {
 167     my($h) = @_;
 168     my($it, $kind);
 169     my $eof_p = 0;
 170     if (!defined $readahead || !length $readahead) {
 171         my $next = scalar <$h>;
 172         $eof_p = !defined $next;
 173         if (!$eof_p) {
 174             $lc += 1;
 175             $readahead .= $next;
 176         }
 177     }
 178     $lc_0 = $lc;                        # remember line number of first line
 179     if ($eof_p && !length $readahead) { # nothing left to do
 180         ;
 181     } elsif ($readahead =~ /^\s+/s) {   # whitespace
 182         ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
 183     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
 184     } elsif ($readahead =~ /^(?:[^<]|<[<\s])+/s) {      # non-space normal text
 185         ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
 186         warn_normal "Warning: Unescaped < $it\n", $lc_0
 187                 if !$cdata_mode_p && $it =~ /</s;
 188     } else {                            # tag/declaration/processing instruction
 189         my $ok_p = 0;
 190         for (;;) {
 191             if ($cdata_mode_p) {
 192                 if ($readahead =~ /^$cdata_close/) {
 193                     ($kind, $it, $readahead) = (KIND_TAG, $&, $');
 194                     $ok_p = 1;
 195                 } else {
 196                     ($kind, $it, $readahead) = (KIND_TEXT, $readahead, undef);
 197                     $ok_p = 1;
 198                 }
 199             } elsif ($readahead =~ /^$re_tag_compat/os) {
 200                 ($kind, $it, $readahead) = (KIND_TAG, "$1>", $3);
 201                 $ok_p = 1;
 202                 warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq '';
 203             } elsif ($readahead =~ /^<!--(?:(?!-->).)*-->/s) {
 204                 ($kind, $it, $readahead) = (KIND_COMMENT, $&, $');
 205                 $ok_p = 1;
 206                 warn_normal "Syntax error in comment: $&\n", $lc_0;
 207                 $syntaxerror_p = 1;
 208             }
 209         last if $ok_p;
 210             my $next = scalar <$h>;
 211             $eof_p = !defined $next;
 212         last if $eof_p;
 213             $lc += 1;
 214             $readahead .= $next;
 215         }
 216         if ($kind ne KIND_TAG) {
 217             ;
 218         } elsif ($it =~ /^<!/) {
 219             $kind = KIND_DECL;
 220             $kind = KIND_COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 221         } elsif ($it =~ /^<\?/) {
 222             $kind = KIND_PI;
 223         }
 224         if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
 225             $kind = KIND_DIRECTIVE;
 226         }
 227         if (!$ok_p && $eof_p) {
 228             ($kind, $it, $readahead) = (KIND_UNKNOWN, $readahead, undef);
 229             $syntaxerror_p = 1;
 230         }
 231     }
 232     warn_normal "Unrecognizable token found: $it\n", $lc_0
 233             if $kind eq KIND_UNKNOWN;
 234     return defined $it? (wantarray? ($kind, $it):
 235                                     [$kind, $it]): undef;
 236 }
 237
 238 sub next_token (*) {
 239     my($h) = @_;
 240     my $it;
 241     if (!$cdata_mode_p) {
 242         $it = next_token_internal($h);
 243         if (defined $it && $it->[0] eq KIND_TAG) { # FIXME
 244             ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
 245                     if $it->[1] =~ /^<(script|style|textarea)\b/i; #FIXME
 246             push @$it, extract_attributes($it->[1], $lc_0); #FIXME
 247         }
 248     } else {
 249         for ($it = '';;) {
 250             my $lc_prev = $lc;
 251             my $next = next_token_internal($h);
 252         last if !defined $next;
 253             if (defined $next && $next->[1] =~ /$cdata_close/i) { #FIXME
 254                 ($lc, $readahead) = ($lc_prev, $next->[1] . $readahead); #FIXME
 255                 $cdata_mode_p = 0;
 256             }
 257         last unless $cdata_mode_p;
 258             $it .= $next->[1]; #FIXME
 259         }
 260         $it = [KIND_CDATA, $it]; #FIXME
 261         $cdata_close = undef;
 262     }
 263     return defined $it? (wantarray? @$it: $it): undef;
 264 }
 265
 266 ###############################################################################
 267
 268 sub trim ($) {
 269     my($s) = @_;
 270     $s =~ s/^(?:\s|\&nbsp$re_end_entity)+//os;
 271     $s =~ s/(?:\s|\&nbsp$re_end_entity)+$//os;
 272     return $s;
 273 }
 274
 275 ###############################################################################
 276
 277 =head1 FUTURE PLANS
 278
 279 Code could be written to detect template variables and
 280 construct gettext-c-format-string-like meta-strings (e.g., "Results %s
 281 through %s of %s records" that will be more likely to be translatable
 282 to languages where word order is very unlike English word order.
 283 This will be relatively major rework, requiring corresponding
 284 rework in tmpl_process.pl
 285
 286 =cut