3 # Test filter partially based on Ambrose's hideous subst.pl code
4 # The idea is that the .tmpl files are not valid HTML, and as a result
5 # HTML::Parse would be completely confused by these templates.
6 # This is just a simple scanner (not a parser) & should give better results.
8 # This script is meant to be a drop-in replacement of text-extract.pl
10 # A grander plan: Code could be written to detect template variables and
11 # construct gettext-c-format-string-like meta-strings (e.g., "Results %s
12 # through %s of %s records" that will be more likely to be translatable
13 # to languages where word order is very unlike English word order.
14 # --> This will be relatively major rework, and requires corresponding
15 # rework in tmpl_process.pl
22 use vars qw( $input );
23 use vars qw( $debug_dump_only_p );
24 use vars qw( $pedantic_p );
26 ###############################################################################
28 sub debug_dump (*) { # for testing only
30 print "re_tag_compat is /$TmplTokenizer::re_tag_compat/\n";
32 my $s = TmplTokenizer::next_token $h;
33 last unless defined $s;
34 printf "%s\n", ('-' x 79);
35 my($kind, $t, $attr) = @$s; # FIXME
36 printf "%s:\n", $kind;
37 printf "%4dH%s\n", length($t),
38 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
39 if ($kind eq TmplTokenizer::KIND_TAG && %$attr) {
40 printf "Attributes:\n";
41 for my $a (keys %$attr) {
42 my($key, $val, $val_orig, $order) = @{$attr->{$a}};
43 printf "%s = %dH%s -- %s\n", $a, length $val,
44 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $val)),
51 ###############################################################################
55 $s =~ s/^(?:\s|\ $TmplTokenizer::re_end_entity)+//os;
56 $s =~ s/(?:\s|\ $TmplTokenizer::re_end_entity)+$//os;
60 ###############################################################################
62 sub text_extract (*) {
66 my $s = TmplTokenizer::next_token $h;
67 last unless defined $s;
68 my($kind, $t, $attr) = @$s; # FIXME
69 if ($kind eq TmplTokenizer::KIND_TEXT) {
71 $text{$t} = 1 if $t =~ /\S/s;
72 } elsif ($kind eq TmplTokenizer::KIND_TAG && %$attr) {
73 # value [tag=input], meta
74 my $tag = lc($1) if $t =~ /^<(\S+)/s;
75 for my $a ('alt', 'content', 'title', 'value') {
77 next if $a eq 'content' && $tag ne 'meta';
78 next if $a eq 'value' && ($tag ne 'input'
79 || (ref $attr->{'type'} && $attr->{'type'}->[1] eq 'hidden')); # FIXME
80 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
82 $text{$val} = 1 if $val =~ /\S/s;
87 # Emit all extracted strings.
88 # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
89 for my $t (keys %text) {
91 unless $t =~ /^(?:\s|\ $TmplTokenizer::re_end_entity|$TmplTokenizer::re_tmpl_var)*$/os || $t =~ /^\d+$/;
95 ###############################################################################
99 my $h = $exitcode? *STDERR: *STDOUT;
102 Extract strings from HTML file.
104 --debug-dump-only Do not extract strings; but display scanned tokens
105 -f, --file=FILE Extract from the specified FILE
106 --pedantic-warnings Issue warnings even for detected problems which
107 are likely to be harmless
108 --help Display this help and exit
113 ###############################################################################
115 sub usage_error (;$) {
116 print STDERR "$_[0]\n" if @_;
117 print STDERR "Try `$0 --help' for more information.\n";
121 ###############################################################################
124 'f|file=s' => \$input,
125 'debug-dump-only' => \$debug_dump_only_p,
126 'pedantic-warnings' => sub { $pedantic_p = 1 },
127 'help' => sub { usage(0) },
130 VerboseWarnings::set_application_name $0;
131 VerboseWarnings::set_input_file_name $input;
132 VerboseWarnings::set_pedantic_mode $pedantic_p;
134 usage_error('Missing mandatory option -f') unless defined $input;
136 open(INPUT, "<$input") || die "$0: $input: $!\n";
137 if ($debug_dump_only_p) {
140 text_extract(*INPUT);
143 warn "This input will not work with Mozilla standards-compliant mode\n", undef
144 if TmplTokenizer::syntaxerror_p;
148 exit(-1) if TmplTokenizer::fatal_p;