2d132d2b6b
using C-like _("some translatable string") notation. English templates will need to be modified.
166 lines
5.1 KiB
Perl
Executable file
166 lines
5.1 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
# Test filter partially based on Ambrose's hideous subst.pl code
|
|
# The idea is that the .tmpl files are not valid HTML, and as a result
|
|
# HTML::Parse would be completely confused by these templates.
|
|
# This is just a simple scanner (not a parser) & should give better results.
|
|
|
|
# This script is meant to be a drop-in replacement of text-extract.pl
|
|
|
|
# A grander plan: Code could be written to detect template variables and
|
|
# construct gettext-c-format-string-like meta-strings (e.g., "Results %s
|
|
# through %s of %s records" that will be more likely to be translatable
|
|
# to languages where word order is very unlike English word order.
|
|
# --> This will be relatively major rework, and requires corresponding
|
|
# rework in tmpl_process.pl
|
|
|
|
use Getopt::Long;
|
|
use TmplTokenizer;
|
|
use VerboseWarnings;
|
|
use strict;
|
|
|
|
use vars qw( $input );
|
|
use vars qw( $debug_dump_only_p );
|
|
use vars qw( $pedantic_p );
|
|
use vars qw( $allow_cformat_p ); # FOR TESTING PURPOSES ONLY!!
|
|
|
|
###############################################################################
|
|
|
|
sub underline ($) { # for testing only
|
|
my($s) = @_;
|
|
join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $s));
|
|
}
|
|
|
|
sub debug_dump ($) { # for testing only
|
|
my($h) = @_;
|
|
print "re_tag_compat is /", TmplTokenizer::re_tag(1), "/\n";
|
|
for (;;) {
|
|
my $s = TmplTokenizer::next_token $h;
|
|
last unless defined $s;
|
|
printf "%s\n", ('-' x 79);
|
|
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
|
|
printf "%s [line %d]:\n", $kind->to_string, $s->line_number;
|
|
printf "%4dH%s\n", length($t), underline($t);
|
|
if ($kind == TmplTokenType::TAG && %$attr) {
|
|
printf "Attributes:\n";
|
|
for my $a (keys %$attr) {
|
|
my($key, $val, $val_orig, $order) = @{$attr->{$a}};
|
|
printf "%s = %dH%s -- %s\n", $a, length $val, underline $val,
|
|
$val_orig;
|
|
}
|
|
}
|
|
if ($kind == TmplTokenType::TEXT_PARAMETRIZED) {
|
|
printf "Form (c-format string):\n";
|
|
printf "%dH%s\n", length $s->form, underline $s->form;
|
|
printf "Parameters:\n";
|
|
my $i = 1;
|
|
for my $a ($s->parameters) {
|
|
my $t = $a->string;
|
|
printf "%%%d\$s = %dH%s\n", $i, length $t, underline $t;
|
|
$i += 1;
|
|
}
|
|
}
|
|
if ($s->has_js_data) {
|
|
printf "JavaScript translatable strings:\n";
|
|
for my $t (@{$s->js_data}) {
|
|
printf "%dH%s\n", length $t->[3], underline $t->[3] if $t->[0]; # FIXME
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub text_extract ($) {
|
|
my($h) = @_;
|
|
my %text = ();
|
|
for (;;) {
|
|
my $s = TmplTokenizer::next_token $h;
|
|
last unless defined $s;
|
|
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
|
|
if ($kind == TmplTokenType::TEXT) {
|
|
$t = TmplTokenizer::trim $t;
|
|
$text{$t} = 1 if $t =~ /\S/s;
|
|
} elsif ($kind == TmplTokenType::TAG && %$attr) {
|
|
# value [tag=input], meta
|
|
my $tag = lc($1) if $t =~ /^<(\S+)/s;
|
|
for my $a ('alt', 'content', 'title', 'value') {
|
|
if ($attr->{$a}) {
|
|
next if $a eq 'content' && $tag ne 'meta';
|
|
next if $a eq 'value' && ($tag ne 'input'
|
|
|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
|
|
my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
|
|
$val = TmplTokenizer::trim $val;
|
|
$text{$val} = 1 if $val =~ /\S/s;
|
|
}
|
|
}
|
|
} elsif ($s->has_js_data) {
|
|
for my $t (@{$s->js_data}) {
|
|
remember( $s, $t->[3] ) if $t->[0]; # FIXME
|
|
}
|
|
}
|
|
}
|
|
# Emit all extracted strings.
|
|
# Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
|
|
for my $t (keys %text) {
|
|
printf "%s\n", $t
|
|
unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub usage ($) {
|
|
my($exitcode) = @_;
|
|
my $h = $exitcode? *STDERR: *STDOUT;
|
|
print $h <<EOF;
|
|
Usage: $0 [OPTIONS]
|
|
Extract strings from HTML file.
|
|
|
|
--debug-dump-only Do not extract strings; but display scanned tokens
|
|
-f, --file=FILE Extract from the specified FILE
|
|
--pedantic-warnings Issue warnings even for detected problems which
|
|
are likely to be harmless
|
|
--help Display this help and exit
|
|
EOF
|
|
exit($exitcode);
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub usage_error (;$) {
|
|
print STDERR "$_[0]\n" if @_;
|
|
print STDERR "Try `$0 --help' for more information.\n";
|
|
exit(-1);
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
GetOptions(
|
|
'enable-cformat' => \$allow_cformat_p,
|
|
'f|file=s' => \$input,
|
|
'debug-dump-only' => \$debug_dump_only_p,
|
|
'pedantic-warnings' => sub { $pedantic_p = 1 },
|
|
'help' => sub { usage(0) },
|
|
) || usage_error;
|
|
|
|
VerboseWarnings::set_application_name $0;
|
|
VerboseWarnings::set_input_file_name $input;
|
|
VerboseWarnings::set_pedantic_mode $pedantic_p;
|
|
|
|
usage_error('Missing mandatory option -f') unless defined $input;
|
|
|
|
my $h = TmplTokenizer->new( $input );
|
|
$h->set_allow_cformat( 1 ) if $allow_cformat_p;
|
|
if ($debug_dump_only_p) {
|
|
debug_dump( $h );
|
|
} else {
|
|
text_extract( $h );
|
|
}
|
|
|
|
warn "This input will not work with Mozilla standards-compliant mode\n", undef
|
|
if TmplTokenizer::syntaxerror_p;
|
|
|
|
close INPUT;
|
|
|
|
exit(-1) if TmplTokenizer::fatal_p;
|