8b57901d85
word order is too different than the word order of the target language to yield meaningful translations. The new scripts use a different translation file format (namely standard gettext-style PO files). This seems to reasonably work (e.g., producing an empty en_GB translation then installing seems to not corrupt the "translated" files), but it likely will still contain some bugs. There is also little documentation, but try to run perldoc on the .p[lm] files to see what's there. There are also some spurious warnings (both from bugs in the new scripts and from buggy third- party Locale::PO module).
156 lines
4.8 KiB
Perl
Executable file
156 lines
4.8 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
# Test filter partially based on Ambrose's hideous subst.pl code
|
|
# The idea is that the .tmpl files are not valid HTML, and as a result
|
|
# HTML::Parse would be completely confused by these templates.
|
|
# This is just a simple scanner (not a parser) & should give better results.
|
|
|
|
# This script is meant to be a drop-in replacement of text-extract.pl
|
|
|
|
# A grander plan: Code could be written to detect template variables and
|
|
# construct gettext-c-format-string-like meta-strings (e.g., "Results %s
|
|
# through %s of %s records" that will be more likely to be translatable
|
|
# to languages where word order is very unlike English word order.
|
|
# --> This will be relatively major rework, and requires corresponding
|
|
# rework in tmpl_process.pl
|
|
|
|
use Getopt::Long;
|
|
use TmplTokenizer;
|
|
use VerboseWarnings;
|
|
use strict;
|
|
|
|
use vars qw( $input );
|
|
use vars qw( $debug_dump_only_p );
|
|
use vars qw( $pedantic_p );
|
|
use vars qw( $allow_cformat_p ); # FOR TESTING PURPOSES ONLY!!
|
|
|
|
###############################################################################
|
|
|
|
sub underline ($) { # for testing only
|
|
my($s) = @_;
|
|
join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $s));
|
|
}
|
|
|
|
sub debug_dump ($) { # for testing only
|
|
my($h) = @_;
|
|
print "re_tag_compat is /", TmplTokenizer::re_tag(1), "/\n";
|
|
for (;;) {
|
|
my $s = TmplTokenizer::next_token $h;
|
|
last unless defined $s;
|
|
printf "%s\n", ('-' x 79);
|
|
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
|
|
printf "%s [line %d]:\n", $kind->to_string, $s->line_number;
|
|
printf "%4dH%s\n", length($t), underline($t);
|
|
if ($kind == TmplTokenType::TAG && %$attr) {
|
|
printf "Attributes:\n";
|
|
for my $a (keys %$attr) {
|
|
my($key, $val, $val_orig, $order) = @{$attr->{$a}};
|
|
printf "%s = %dH%s -- %s\n", $a, length $val, underline $val,
|
|
$val_orig;
|
|
}
|
|
}
|
|
if ($kind == TmplTokenType::TEXT_PARAMETRIZED) {
|
|
printf "Form (c-format string):\n";
|
|
printf "%dH%s\n", length $s->form, underline $s->form;
|
|
printf "Parameters:\n";
|
|
my $i = 1;
|
|
for my $a ($s->parameters) {
|
|
my $t = $a->string;
|
|
printf "%%%d\$s = %dH%s\n", $i, length $t, underline $t;
|
|
$i += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub text_extract ($) {
|
|
my($h) = @_;
|
|
my %text = ();
|
|
for (;;) {
|
|
my $s = TmplTokenizer::next_token $h;
|
|
last unless defined $s;
|
|
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
|
|
if ($kind == TmplTokenType::TEXT) {
|
|
$t = TmplTokenizer::trim $t;
|
|
$text{$t} = 1 if $t =~ /\S/s;
|
|
} elsif ($kind == TmplTokenType::TAG && %$attr) {
|
|
# value [tag=input], meta
|
|
my $tag = lc($1) if $t =~ /^<(\S+)/s;
|
|
for my $a ('alt', 'content', 'title', 'value') {
|
|
if ($attr->{$a}) {
|
|
next if $a eq 'content' && $tag ne 'meta';
|
|
next if $a eq 'value' && ($tag ne 'input'
|
|
|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
|
|
my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
|
|
$val = TmplTokenizer::trim $val;
|
|
$text{$val} = 1 if $val =~ /\S/s;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
# Emit all extracted strings.
|
|
# Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
|
|
for my $t (keys %text) {
|
|
printf "%s\n", $t
|
|
unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub usage ($) {
|
|
my($exitcode) = @_;
|
|
my $h = $exitcode? *STDERR: *STDOUT;
|
|
print $h <<EOF;
|
|
Usage: $0 [OPTIONS]
|
|
Extract strings from HTML file.
|
|
|
|
--debug-dump-only Do not extract strings; but display scanned tokens
|
|
-f, --file=FILE Extract from the specified FILE
|
|
--pedantic-warnings Issue warnings even for detected problems which
|
|
are likely to be harmless
|
|
--help Display this help and exit
|
|
EOF
|
|
exit($exitcode);
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub usage_error (;$) {
|
|
print STDERR "$_[0]\n" if @_;
|
|
print STDERR "Try `$0 --help' for more information.\n";
|
|
exit(-1);
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
GetOptions(
|
|
'enable-cformat' => \$allow_cformat_p,
|
|
'f|file=s' => \$input,
|
|
'debug-dump-only' => \$debug_dump_only_p,
|
|
'pedantic-warnings' => sub { $pedantic_p = 1 },
|
|
'help' => sub { usage(0) },
|
|
) || usage_error;
|
|
|
|
VerboseWarnings::set_application_name $0;
|
|
VerboseWarnings::set_input_file_name $input;
|
|
VerboseWarnings::set_pedantic_mode $pedantic_p;
|
|
|
|
usage_error('Missing mandatory option -f') unless defined $input;
|
|
|
|
my $h = TmplTokenizer->new( $input );
|
|
$h->set_allow_cformat( 1 ) if $allow_cformat_p;
|
|
if ($debug_dump_only_p) {
|
|
debug_dump( $h );
|
|
} else {
|
|
text_extract( $h );
|
|
}
|
|
|
|
warn "This input will not work with Mozilla standards-compliant mode\n", undef
|
|
if TmplTokenizer::syntaxerror_p;
|
|
|
|
close INPUT;
|
|
|
|
exit(-1) if TmplTokenizer::fatal_p;
|