misc/translator/text-extract2.pl

   1 #!/usr/bin/perl
   2
   3 # Test filter partially based on Ambrose's hideous subst.pl code
   4 # The idea is that the .tmpl files are not valid HTML, and as a result
   5 # HTML::Parse would be completely confused by these templates.
   6 # This is just a simple scanner (not a parser) & should give better results.
   7
   8 # This script is meant to be a drop-in replacement of text-extract.pl
   9
  10 # A grander plan: Code could be written to detect template variables and
  11 # construct gettext-c-format-string-like meta-strings (e.g., "Results %s
  12 # through %s of %s records" that will be more likely to be translatable
  13 # to languages where word order is very unlike English word order.
  14 # --> This will be relatively major rework, and requires corresponding
  15 # rework in tmpl_process.pl
  16
  17 use Getopt::Long;
  18 use TmplTokenizer;
  19 use VerboseWarnings;
  20 use strict;
  21
  22 use vars qw( $input );
  23 use vars qw( $debug_dump_only_p );
  24 use vars qw( $pedantic_p );
  25
  26 ###############################################################################
  27
  28 sub debug_dump (*) { # for testing only
  29     my($h) = @_;
  30     print "re_tag_compat is /$TmplTokenizer::re_tag_compat/\n";
  31     for (;;) {
  32         my $s = TmplTokenizer::next_token $h;
  33     last unless defined $s;
  34         printf "%s\n", ('-' x 79);
  35         my($kind, $t, $attr) = @$s; # FIXME
  36         printf "%s:\n", $kind;
  37         printf "%4dH%s\n", length($t),
  38                 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
  39         if ($kind eq TmplTokenizer::KIND_TAG && %$attr) {
  40             printf "Attributes:\n";
  41             for my $a (keys %$attr) {
  42                 my($key, $val, $val_orig, $order) = @{$attr->{$a}};
  43                 printf "%s = %dH%s -- %s\n", $a, length $val,
  44                 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $val)),
  45                 $val_orig;
  46             }
  47         }
  48     }
  49 }
  50
  51 ###############################################################################
  52
  53 sub trim ($) {
  54     my($s) = @_;
  55     $s =~ s/^(?:\s|\&nbsp$TmplTokenizer::re_end_entity)+//os;
  56     $s =~ s/(?:\s|\&nbsp$TmplTokenizer::re_end_entity)+$//os;
  57     return $s;
  58 }
  59
  60 ###############################################################################
  61
  62 sub text_extract (*) {
  63     my($h) = @_;
  64     my %text = ();
  65     for (;;) {
  66         my $s = TmplTokenizer::next_token $h;
  67     last unless defined $s;
  68         my($kind, $t, $attr) = @$s; # FIXME
  69         if ($kind eq TmplTokenizer::KIND_TEXT) {
  70             $t = trim $t;
  71             $text{$t} = 1 if $t =~ /\S/s;
  72         } elsif ($kind eq TmplTokenizer::KIND_TAG && %$attr) {
  73             # value [tag=input], meta
  74             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  75             for my $a ('alt', 'content', 'title', 'value') {
  76                 if ($attr->{$a}) {
  77                     next if $a eq 'content' && $tag ne 'meta';
  78                     next if $a eq 'value' && ($tag ne 'input'
  79                         || (ref $attr->{'type'} && $attr->{'type'}->[1] eq 'hidden')); # FIXME
  80                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
  81                     $val = trim $val;
  82                     $text{$val} = 1 if $val =~ /\S/s;
  83                 }
  84             }
  85         }
  86     }
  87     # Emit all extracted strings.
  88     # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
  89     for my $t (keys %text) {
  90         printf "%s\n", $t
  91             unless $t =~ /^(?:\s|\&nbsp$TmplTokenizer::re_end_entity|$TmplTokenizer::re_tmpl_var)*$/os || $t =~ /^\d+$/;
  92     }
  93 }
  94
  95 ###############################################################################
  96
  97 sub usage ($) {
  98     my($exitcode) = @_;
  99     my $h = $exitcode? *STDERR: *STDOUT;
 100     print $h <<EOF;
 101 Usage: $0 [OPTIONS]
 102 Extract strings from HTML file.
 103
 104       --debug-dump-only     Do not extract strings; but display scanned tokens
 105   -f, --file=FILE           Extract from the specified FILE
 106       --pedantic-warnings   Issue warnings even for detected problems which
 107                             are likely to be harmless
 108       --help                Display this help and exit
 109 EOF
 110     exit($exitcode);
 111 }
 112
 113 ###############################################################################
 114
 115 sub usage_error (;$) {
 116     print STDERR "$_[0]\n" if @_;
 117     print STDERR "Try `$0 --help' for more information.\n";
 118     exit(-1);
 119 }
 120
 121 ###############################################################################
 122
 123 GetOptions(
 124     'f|file=s'          => \$input,
 125     'debug-dump-only'   => \$debug_dump_only_p,
 126     'pedantic-warnings' => sub { $pedantic_p = 1 },
 127     'help'              => sub { usage(0) },
 128 ) || usage_error;
 129
 130 VerboseWarnings::set_application_name $0;
 131 VerboseWarnings::set_input_file_name $input;
 132 VerboseWarnings::set_pedantic_mode $pedantic_p;
 133
 134 usage_error('Missing mandatory option -f') unless defined $input;
 135
 136 open(INPUT, "<$input") || die "$0: $input: $!\n";
 137 if ($debug_dump_only_p) {
 138     debug_dump(*INPUT);
 139 } else {
 140     text_extract(*INPUT);
 141 }
 142
 143 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 144         if TmplTokenizer::syntaxerror_p;
 145
 146 close INPUT;
 147
 148 exit(-1) if TmplTokenizer::fatal_p;