3 # Test filter partially based on Ambrose's hideous subst.pl code
4 # The idea is that the .tmpl files are not valid HTML, and as a result
5 # HTML::Parse would be completely confused by these templates.
6 # This is just a simple scanner (not a parser) & should give better results.
8 # This script is meant to be a drop-in replacement of text-extract.pl
13 use vars qw( $input );
14 use vars qw( $debug_dump_only_p );
16 ###############################################################################
18 # Hideous stuff from subst.pl
19 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
20 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
21 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
24 my $etag = $compat? '>': '<>\/';
25 # See the file "subst.pl.test1" for how the following mess is derived
26 q{(<\/?(?:|(?:"[^"]*"|'[^']*'|--(?:[^-]|-[^-])*--|(?:[^-"'} . $etag . q{]|-[^-]))+))([} . $etag . q{])(.*)};
29 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
30 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
31 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
32 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
33 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
36 # End of the hideous stuff
38 use vars qw( $re_directive );
40 # $re_directive must not do any backreferences
41 $re_directive = q{<(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)\b(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s*(?:--)?>};
44 sub KIND_TEXT () { 'TEXT' }
45 sub KIND_CDATA () { 'CDATA' }
46 sub KIND_TAG () { 'TAG' }
47 sub KIND_DECL () { 'DECL' }
48 sub KIND_PI () { 'PI' }
49 sub KIND_DIRECTIVE () { 'HTML::Template' }
50 sub KIND_COMMENT () { 'COMMENT' } # empty DECL with exactly one SGML comment
51 sub KIND_UNKNOWN () { 'ERROR' }
53 use vars qw( $readahead $lc_0 $lc $syntaxerror_p );
54 use vars qw( $cdata_mode_p $cdata_close );
56 sub extract_attributes ($;$) {
59 $s = $1 if $s =~ /^<\S+(.*)\S$/s; # should be always true
60 for (my $i = 0; $s =~ /^\s+(?:([a-zA-Z][-a-zA-Z0-9]*)=)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|(($re_directive|[^\s<>])+))/os;) {
61 my($key, $val, $val_orig, $rest)
62 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
64 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
67 if ($s =~ /\S/s) { # should never happen
68 warn "Warning: Strange attribute syntax"
69 . (defined $lc? " in line $lc": '') . ": $s\n";
75 sub next_token_internal (*) {
79 if (!defined $readahead || !length $readahead) {
80 my $next = scalar <$h>;
81 $eof_p = !defined $next;
87 $lc_0 = $lc; # remember line number of first line
88 if ($eof_p && !length $readahead) { # nothing left to do
90 } elsif ($readahead =~ /^\s+/s) { # whitespace
91 ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
92 } elsif ($readahead =~ /^[^<]+/s) { # non-whitespace normal text
93 ($kind, $it, $readahead) = (KIND_TEXT, $&, $');
94 } else { # tag/declaration/processing instruction
98 if ($readahead =~ /^$cdata_close/) {
99 ($kind, $it, $readahead) = (KIND_TAG, $&, $');
102 ($kind, $it, $readahead) = (KIND_TEXT, $readahead, undef);
105 } elsif ($readahead =~ /^$re_tag_compat/os) {
106 ($kind, $it, $readahead) = (KIND_TAG, "$1$2", $3);
108 } elsif ($readahead =~ /^<!--(?:(?!-->).)*-->/s) {
109 ($kind, $it, $readahead) = (KIND_COMMENT, $&, $');
111 warn "Warning: Syntax error in comment at line $lc_0: $&\n";
115 my $next = scalar <$h>;
116 $eof_p = !defined $next;
121 if ($kind ne KIND_TAG) {
123 } elsif ($it =~ /^<!/) {
125 $kind = KIND_COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
126 } elsif ($it =~ /^<\?/) {
129 if ($it =~ /^$re_directive/ios && !$cdata_mode_p) {
130 $kind = KIND_DIRECTIVE;
132 ($kind, $it) = (KIND_UNKNOWN, $readahead)
133 if !$ok_p && $eof_p && !length $readahead;
135 return defined $it? (wantarray? ($kind, $it):
136 [$kind, $it]): undef;
142 if (!$cdata_mode_p) {
143 $it = next_token_internal($h);
144 if (defined $it && $it->[0] eq KIND_TAG) { # FIXME
145 ($cdata_mode_p, $cdata_close) = (1, "</$1\\s*>")
146 if $it->[1] =~ /^<(script|style|textarea)\b/i; #FIXME
147 push @$it, extract_attributes($it->[1], $lc); #FIXME
152 my $next = next_token_internal($h);
153 last if !defined $next;
154 if (defined $next && $next->[1] =~ /$cdata_close/i) { #FIXME
155 ($lc, $readahead) = ($lc_prev, $next->[1] . $readahead); #FIXME
158 last unless $cdata_mode_p;
159 $it .= $next->[1]; #FIXME
161 $it = [KIND_CDATA, $it] if defined $it; #FIXME
162 $cdata_close = undef;
164 return defined $it? (wantarray? @$it: $it): undef;
167 ###############################################################################
169 sub debug_dump (*) { # for testing only
171 print "re_tag_compat is /$re_tag_compat/\n";
173 my $s = next_token $h;
174 last unless defined $s;
175 printf "%s\n", ('-' x 79);
176 my($kind, $t, $attr) = @$s; # FIXME
177 printf "%s:\n", $kind;
178 printf "%4dH%s\n", length($t),
179 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
180 if ($kind eq KIND_TAG && %$attr) {
181 printf "Attributes:\n";
182 for my $a (keys %$attr) {
183 my($key, $val, $val_orig, $order) = @{$attr->{$a}};
184 printf "%s = %dH%s -- %s\n", $a, length $val,
185 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $val)),
192 ###############################################################################
194 sub text_extract (*) {
198 my $s = next_token $h;
199 last unless defined $s;
200 my($kind, $t, $attr) = @$s; # FIXME
201 if ($kind eq KIND_TEXT) {
203 $text{$t} = 1 if $t =~ /\S/s; # FIXME... trailing whitespace
204 } elsif ($kind eq KIND_TAG && %$attr) {
205 # value [tag=input], meta
206 my $tag = lc($1) if $t =~ /^<(\S+)/s;
207 for my $a ('alt', 'content', 'title', 'value') {
209 next if $a eq 'content' && $tag ne 'meta';
210 next if $a eq 'value' && ($tag ne 'input'
211 || (ref $attr->{'type'} && $attr->{'type'}->[1] eq 'hidden'));
212 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
214 $text{$val} = 1 if $val =~ /\S/s;
219 for my $t (keys %text) {
220 printf "%s\n", $t unless $t =~ /^(?:\s|\ )*$/s;
224 ###############################################################################
227 'f|file=s' => \$input,
228 'debug-dump-only-p' => \$debug_dump_only_p,
231 open(INPUT, "<$input") || die "$0: $input: $!\n";
232 if ($debug_dump_only_p) {
235 text_extract(*INPUT);
238 warn "Warning: This input will not work with Mozilla standards-compliant mode\n"