misc/translator/TmplTokenizer.pm

   1 package TmplTokenizer;
   2
   3 use strict;
   4 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
   5 require Exporter;
   6
   7 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
   8
   9 ###############################################################################
  10
  11 =head1 NAME
  12
  13 TmplTokenizer.pm - Simple-minded tokenizer for HTML::Template .tmpl files
  14
  15 =head1 DESCRIPTION
  16
  17 Because .tmpl files contains HTML::Template directives
  18 that tend to confuse real parsers (e.g., HTML::Parse),
  19 it might be better to create a customized scanner
  20 to scan the template files for tokens.
  21 This module is a simple-minded attempt at such a scanner.
  22
  23 =head1 HISTORY
  24
  25 This tokenizer is mostly based
  26 on Ambrose's hideous Perl script known as subst.pl.
  27
  28 =cut
  29
  30 ###############################################################################
  31
  32 $VERSION = 0.01;
  33
  34 @ISA = qw(Exporter);
  35 @EXPORT_OK = qw(
  36     &KIND_TEXT
  37     &KIND_CDATA
  38     &KIND_TAG
  39     &KIND_DECL
  40     &KIND_PI
  41     &KIND_DIRECTIVE
  42     &KIND_COMMENT
  43     &KIND_UNKNOWN
  44 );
  45
  46 use vars qw( $input );
  47 use vars qw( $debug_dump_only_p );
  48 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  49 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
  50 use vars qw( $fatal_p );
  51
  52 ###############################################################################
  53
  54 # Hideous stuff
  55 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
  56 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
  57 BEGIN {
  58     # $re_directive must not do any backreferences
  59     $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  60     # TMPL_VAR or TMPL_INCLUDE
  61     $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  62     $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  63     # TMPL_VAR ESCAPE=1/HTML/URL
  64     $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  65     # Any control flow directive
  66     $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  67     # /LOOP or /IF or /UNLESS
  68     $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
  69 }
  70
  71 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
  72 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
  73 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
  74 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
  75 sub re_tag ($) {
  76    my($compat) = @_;
  77    my $etag = $compat? '>': '<>\/';
  78    # This is no longer similar to the original regexp in subst.pl :-(
  79    # Note that we don't want <> in compat mode; Mozilla knows about <
  80    q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
  81    . $re_directive
  82    . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
  83 }
  84 BEGIN {
  85     $re_comment = '(?:--(?:[^-]|-[^-])*--)';
  86     $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
  87     $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
  88     $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
  89     @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
  90 }
  91
  92 # End of the hideous stuff
  93
  94 sub KIND_TEXT      () { 'TEXT' }
  95 sub KIND_CDATA     () { 'CDATA' }
  96 sub KIND_TAG       () { 'TAG' }
  97 sub KIND_DECL      () { 'DECL' }
  98 sub KIND_PI        () { 'PI' }
  99 sub KIND_DIRECTIVE () { 'HTML::Template' }
 100 sub KIND_COMMENT   () { 'COMMENT' }   # empty DECL with exactly one SGML comment
 101 sub KIND_UNKNOWN   () { 'ERROR' }
 102
 103 use vars qw( $readahead $lc_0 $lc $syntaxerror_p );
 104 use vars qw( $cdata_mode_p $cdata_close );
 105
 106 ###############################################################################
 107
 108 # Easy accessors
 109
 110 sub fatal_p () {
 111     return $fatal_p;
 112 }
 113
 114 sub syntaxerror_p () {
 115     return $syntaxerror_p;
 116 }
 117
 118 ###############################################################################
 119
 120 sub extract_attributes ($;$) {
 121     my($s, $lc) = @_;
 122     my %attr;
 123     $s = $1 if $s =~ /^<\S+(.*)\/\S$/s  # XML-style self-closing tags
 124             || $s =~ /^<\S+(.*)\S$/s;   # SGML-style tags
 125
 126     for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
 127         my($key, $val, $val_orig, $rest)
 128                 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
 129         $i += 1;
 130         $attr{+lc($key)} = [$key, $val, $val_orig, $i];
 131         $s = $rest;
 132         if ($val =~ /$re_tmpl_include/os) {
 133             warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
 134         } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
 135             # XXX: we probably should not warn if key is "onclick" etc
 136             # XXX: there's just no reasonable thing to suggest
 137             my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
 138             undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
 139             warn_pedantic
 140                     "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
 141                         . ": $val_orig",
 142                     $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
 143                 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
 144         } elsif ($val_orig !~ /^['"]/) {
 145             my $t = $val; $t =~ s/$re_directive_control//os;
 146             warn_pedantic
 147                 "Unquoted attribute contains character(s) that should be quoted"
 148                     . ": $val_orig",
 149                 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
 150                 if $t =~ /[^-\.A-Za-z0-9]/s;
 151         }
 152     }
 153     my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
 154     if ($s2 =~ /\S/s) { # should never happen
 155         if ($s =~ /^([^\n]*)\n/s) { # this is even worse
 156             error_normal("Completely confused while extracting attributes: $1", $lc);
 157             error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
 158             $fatal_p = 1;
 159         } else {
 160             warn_normal "Strange attribute syntax: $s\n", $lc;
 161         }
 162     }
 163     return \%attr;
 164 }
 165
 166 sub next_token_internal (*) {
 167     my($h) = @_;
 168     my($it, $kind);
 169     my $eof_p = 0;
 170     if (!defined $readahead || !length $readahead) {
 171         my $next = scalar <$h>;
 172         $eof_p = !defined $next;
 173         if (!$eof_p) {
 174             $lc += 1;
 175             $readahead .= $next;
 176         }
 177     }
 178     $lc_0 = $lc;                        # remember line number of first line
 179     if ($eof_p && !length $readahead) { # nothing left to do
 180         ;
 181     } elsif ($readahead =~ /^\s+/s) {   # whitespace
 182         ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
 183     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
 184     } elsif ($readahead =~ /^(?:[^<]|<[<\s])+/s) {      # non-space normal text
 185         ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
 186         warn_normal "Warning: Unescaped < $it\n", $lc_0
 187                 if !$cdata_mode_p && $it =~ /</s;
 188     } else {                            # tag/declaration/processing instruction
 189         my $ok_p = 0;
 190         for (;;) {
 191             if ($cdata_mode_p) {
 192                 if ($readahead =~ /^$cdata_close/) {
 193                     ($kind, $it, $readahead) = (KIND_TAG, $&, $');
 194                     $ok_p = 1;
 195                 } else {
 196                     ($kind, $it, $readahead) = (KIND_TEXT, $readahead, undef);
 197                     $ok_p = 1;
 198                 }
 199             } elsif ($readahead =~ /^$re_tag_compat/os) {
 200                 ($kind, $it, $readahead) = (KIND_TAG, "$1>", $3);
 201                 $ok_p = 1;
 202                 warn_normal "SGML \"closed start tag\" notation: $1<\n", $lc_0 if $2 eq '';
 203             } elsif ($readahead =~ /^<!--(?:(?!-->).)*-->/s) {
 204                 ($kind, $it, $readahead) = (KIND_COMMENT, $&, $');
 205                 $ok_p = 1;
 206                 warn_normal "Syntax error in comment: $&\n", $lc_0;
 207                 $syntaxerror_p = 1;
 208             }
 209         last if $ok_p;
 210             my $next = scalar <$h>;
 211             $eof_p = !defined $next;
 212         last if $eof_p;
 213             $lc += 1;
 214             $readahead .= $next;
 215         }
 216         if ($kind ne KIND_TAG) {
 217             ;
 218         } elsif ($it =~ /^<!/) {
 219             $kind = KIND_DECL;
 220             $kind = KIND_COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 221         } elsif ($it =~ /^<\?/) {
 222             $kind = KIND_PI;
 223         }
 224         if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
 225             $kind = KIND_DIRECTIVE;
 226         }
 227         if (!$ok_p && $eof_p) {
 228             ($kind, $it, $readahead) = (KIND_UNKNOWN, $readahead, undef);
 229             $syntaxerror_p = 1;
 230         }
 231     }
 232     warn_normal "Unrecognizable token found: $it\n", $lc_0
 233             if $kind eq KIND_UNKNOWN;
 234     return defined $it? (wantarray? ($kind, $it):
 235                                     [$kind, $it]): undef;
 236 }
 237
 238 sub next_token (*) {
 239     my($h) = @_;
 240     my $it;
 241     if (!$cdata_mode_p) {
 242         $it = next_token_internal($h);
 243         if (defined $it && $it->[0] eq KIND_TAG) { # FIXME
 244             ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
 245                     if $it->[1] =~ /^<(script|style|textarea)\b/i; #FIXME
 246             push @$it, extract_attributes($it->[1], $lc_0); #FIXME
 247         }
 248     } else {
 249         for ($it = '';;) {
 250             my $lc_prev = $lc;
 251             my $next = next_token_internal($h);
 252         last if !defined $next;
 253             if (defined $next && $next->[1] =~ /$cdata_close/i) { #FIXME
 254                 ($lc, $readahead) = ($lc_prev, $next->[1] . $readahead); #FIXME
 255                 $cdata_mode_p = 0;
 256             }
 257         last unless $cdata_mode_p;
 258             $it .= $next->[1]; #FIXME
 259         }
 260         $it = [KIND_CDATA, $it]; #FIXME
 261         $cdata_close = undef;
 262     }
 263     return defined $it? (wantarray? @$it: $it): undef;
 264 }
 265
 266 ###############################################################################
 267
 268 sub debug_dump (*) { # for testing only
 269     my($h) = @_;
 270     print "re_tag_compat is /$re_tag_compat/\n";
 271     for (;;) {
 272         my $s = next_token $h;
 273     last unless defined $s;
 274         printf "%s\n", ('-' x 79);
 275         my($kind, $t, $attr) = @$s; # FIXME
 276         printf "%s:\n", $kind;
 277         printf "%4dH%s\n", length($t),
 278                 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
 279         if ($kind eq KIND_TAG && %$attr) {
 280             printf "Attributes:\n";
 281             for my $a (keys %$attr) {
 282                 my($key, $val, $val_orig, $order) = @{$attr->{$a}};
 283                 printf "%s = %dH%s -- %s\n", $a, length $val,
 284                 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $val)),
 285                 $val_orig;
 286             }
 287         }
 288     }
 289 }
 290
 291 ###############################################################################
 292
 293 sub trim ($) {
 294     my($s) = @_;
 295     $s =~ s/^(?:\s|\&nbsp$re_end_entity)+//os;
 296     $s =~ s/(?:\s|\&nbsp$re_end_entity)+$//os;
 297     return $s;
 298 }
 299
 300 ###############################################################################
 301
 302 sub text_extract (*) {
 303     my($h) = @_;
 304     my %text = ();
 305     for (;;) {
 306         my $s = next_token $h;
 307     last unless defined $s;
 308         my($kind, $t, $attr) = @$s; # FIXME
 309         if ($kind eq KIND_TEXT) {
 310             $t = trim $t;
 311             $text{$t} = 1 if $t =~ /\S/s;
 312         } elsif ($kind eq KIND_TAG && %$attr) {
 313             # value [tag=input], meta
 314             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 315             for my $a ('alt', 'content', 'title', 'value') {
 316                 if ($attr->{$a}) {
 317                     next if $a eq 'content' && $tag ne 'meta';
 318                     next if $a eq 'value' && ($tag ne 'input'
 319                         || (ref $attr->{'type'} && $attr->{'type'}->[1] eq 'hidden')); # FIXME
 320                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 321                     $val = trim $val;
 322                     $text{$val} = 1 if $val =~ /\S/s;
 323                 }
 324             }
 325         }
 326     }
 327     # Emit all extracted strings.
 328     # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
 329     for my $t (keys %text) {
 330         printf "%s\n", $t
 331             unless $t =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var)*$/os || $t =~ /^\d+$/;
 332     }
 333 }
 334
 335 ###############################################################################
 336
 337 sub usage ($) {
 338     my($exitcode) = @_;
 339     my $h = $exitcode? *STDERR: *STDOUT;
 340     print $h <<EOF;
 341 Usage: $0 [OPTIONS]
 342 Extract strings from HTML file.
 343
 344       --debug-dump-only     Do not extract strings; but display scanned tokens
 345   -f, --file=FILE           Extract from the specified FILE
 346       --pedantic-warnings   Issue warnings even for detected problems which
 347                             are likely to be harmless
 348       --help                Display this help and exit
 349 EOF
 350     exit($exitcode);
 351 }
 352
 353 ###############################################################################
 354
 355 sub usage_error (;$) {
 356     print STDERR "$_[0]\n" if @_;
 357     print STDERR "Try `$0 --help' for more information.\n";
 358     exit(-1);
 359 }
 360
 361 ###############################################################################
 362
 363 =head1 FUTURE PLANS
 364
 365 Code could be written to detect template variables and
 366 construct gettext-c-format-string-like meta-strings (e.g., "Results %s
 367 through %s of %s records" that will be more likely to be translatable
 368 to languages where word order is very unlike English word order.
 369 This will be relatively major rework, requiring corresponding
 370 rework in tmpl_process.pl
 371
 372 =cut