Frédéric Demians
a41100a445
This test validate Template Toolkit (TT) Koha files. For the time being an unique validation is done: Test if TT files contain TT directive within HTML tag. For example: <li[% IF This kind of constuction MUST be avoided because it break Koha translation process. This patch transform also translation specific modules into C4 modules in order to be able to use them in test case: C4::TTPaser C4::TmplToken C4::TmplTokenType This patch is a Perl adaptation of a Haskell script from Frère Sébastien Marie. Signed-off-by: Katrin Fischer <Katrin.Fischer.83@web.de> Notes on testing: - translate install de-DE - worked ok - translate update de-DE > translate install de-DE - worked ok - running the test xt/tt_valid.t - worked ok and pointed out lots of problems. Found no problems. Signed-off-by: Katrin Fischer <Katrin.Fischer.83@web.de> Signed-off-by: Chris Cormack <chrisc@catalyst.net.nz>
478 lines
16 KiB
Perl
Executable file
478 lines
16 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
=head1 NAME
|
|
|
|
xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
|
|
|
|
=cut
|
|
|
|
use strict;
|
|
use warnings;
|
|
use Getopt::Long;
|
|
use POSIX;
|
|
use Locale::PO;
|
|
use TmplTokenizer;
|
|
use VerboseWarnings;
|
|
|
|
use vars qw( $convert_from );
|
|
use vars qw( $files_from $directory $output $sort );
|
|
use vars qw( $extract_all_p );
|
|
use vars qw( $pedantic_p );
|
|
use vars qw( %text %translation );
|
|
use vars qw( $charset_in $charset_out );
|
|
use vars qw( $disable_fuzzy_p );
|
|
use vars qw( $verbose_p );
|
|
use vars qw( $po_mode_p );
|
|
|
|
###############################################################################
|
|
|
|
sub string_negligible_p ($) {
|
|
my($t) = @_; # a string
|
|
# Don't emit pure whitespace, pure numbers, pure punctuation,
|
|
# single letters, or TMPL_VAR's.
|
|
# Punctuation should arguably be translated. But without context
|
|
# they are untranslatable. Note that $t is a string, not a token object.
|
|
return !$extract_all_p && (
|
|
TmplTokenizer::blank_p($t) # blank or TMPL_VAR
|
|
|| $t =~ /^\d+$/ # purely digits
|
|
|| $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
|
|
|| $t =~ /^[A-Za-z]$/ # single letters
|
|
)
|
|
}
|
|
|
|
sub token_negligible_p( $ ) {
|
|
my($x) = @_;
|
|
my $t = $x->type;
|
|
return !$extract_all_p && (
|
|
$t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
|
|
$t == C4::TmplTokenType::DIRECTIVE? 1:
|
|
$t == C4::TmplTokenType::TEXT_PARAMETRIZED
|
|
&& join( '', map { my $t = $_->type;
|
|
$t == C4::TmplTokenType::DIRECTIVE?
|
|
'1': $t == C4::TmplTokenType::TAG?
|
|
'': token_negligible_p( $_ )?
|
|
'': '1' } @{$x->children} ) eq '' );
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub remember ($$) {
|
|
my($token, $string) = @_;
|
|
# If we determine that the string is negligible, don't bother to remember
|
|
unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
|
|
my $key = TmplTokenizer::string_canon( $string );
|
|
$text{$key} = [] unless defined $text{$key};
|
|
push @{$text{$key}}, $token;
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub string_list () {
|
|
my @t = keys %text;
|
|
# The real gettext tools seems to sort case sensitively; I don't know why
|
|
@t = sort { $a cmp $b } @t if $sort eq 's';
|
|
@t = sort {
|
|
my @aa = sort { $a->pathname cmp $b->pathname
|
|
|| $a->line_number <=> $b->line_number } @{$text{$a}};
|
|
my @bb = sort { $a->pathname cmp $b->pathname
|
|
|| $a->line_number <=> $b->line_number } @{$text{$b}};
|
|
$aa[0]->pathname cmp $bb[0]->pathname
|
|
|| $aa[0]->line_number <=> $bb[0]->line_number;
|
|
} @t if $sort eq 'F';
|
|
return @t;
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub text_extract (*) {
|
|
my($h) = @_;
|
|
for (;;) {
|
|
my $s = TmplTokenizer::next_token $h;
|
|
last unless defined $s;
|
|
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
|
|
if ($kind eq C4::TmplTokenType::TEXT) {
|
|
if ($t =~ /\S/s && $t !~ /<!/){
|
|
remember( $s, $t );
|
|
}
|
|
} elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
|
|
if ($s->form =~ /\S/s && $s->form !~ /<!/){
|
|
remember( $s, $s->form );
|
|
}
|
|
} elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
|
|
# value [tag=input], meta
|
|
my $tag = lc($1) if $t =~ /^<(\S+)/s;
|
|
for my $a ('alt', 'content', 'title', 'value','label') {
|
|
if ($attr->{$a}) {
|
|
next if $a eq 'label' && $tag ne 'optgroup';
|
|
next if $a eq 'content' && $tag ne 'meta';
|
|
next if $a eq 'value' && ($tag ne 'input'
|
|
|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
|
|
my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
|
|
$val = TmplTokenizer::trim $val;
|
|
remember( $s, $val ) if $val =~ /\S/s;
|
|
}
|
|
}
|
|
} elsif ($s->has_js_data) {
|
|
for my $t (@{$s->js_data}) {
|
|
remember( $s, $t->[3] ) if $t->[0]; # FIXME
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub generate_strings_list () {
|
|
# Emit all extracted strings.
|
|
for my $t (string_list) {
|
|
printf OUTPUT "%s\n", $t;
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub generate_po_file () {
|
|
# We don't emit the Plural-Forms header; it's meaningless for us
|
|
my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
|
|
$pot_charset = TmplTokenizer::charset_canon $pot_charset;
|
|
# Time stamps aren't exactly right semantically. I don't know how to fix it.
|
|
my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
|
|
my $time_pot = $time;
|
|
my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
|
|
print OUTPUT <<EOF;
|
|
# SOME DESCRIPTIVE TITLE.
|
|
# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
|
|
# This file is distributed under the same license as the PACKAGE package.
|
|
# FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
|
|
#
|
|
EOF
|
|
print OUTPUT <<EOF unless $disable_fuzzy_p;
|
|
#, fuzzy
|
|
EOF
|
|
print OUTPUT <<EOF;
|
|
msgid ""
|
|
msgstr ""
|
|
"Project-Id-Version: PACKAGE VERSION\\n"
|
|
"POT-Creation-Date: $time_pot\\n"
|
|
"PO-Revision-Date: $time_po\\n"
|
|
"Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
|
|
"Language-Team: LANGUAGE <LL\@li.org>\\n"
|
|
"MIME-Version: 1.0\\n"
|
|
"Content-Type: text/plain; charset=$pot_charset\\n"
|
|
"Content-Transfer-Encoding: 8bit\\n"
|
|
|
|
EOF
|
|
my $directory_re = quotemeta("$directory/");
|
|
for my $t (string_list) {
|
|
if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
|
|
my($token, $n) = ($text{$t}->[0], 0);
|
|
printf OUTPUT "#. For the first occurrence,\n"
|
|
if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
|
|
for my $param ($token->parameters_and_fields) {
|
|
$n += 1;
|
|
my $type = $param->type;
|
|
my $subtype = ($type == C4::TmplTokenType::TAG
|
|
&& $param->string =~ /^<input\b/is?
|
|
$param->attributes->{'type'}->[1]: undef);
|
|
my $fmt = TmplTokenizer::_formalize( $param );
|
|
$fmt =~ s/^%/%$n\$/;
|
|
if ($type == C4::TmplTokenType::DIRECTIVE) {
|
|
# $type = "Template::Toolkit Directive";
|
|
$type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
|
|
my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
|
|
$2: undef;
|
|
printf OUTPUT "#. %s: %s\n", $fmt,
|
|
"$type" . (defined $name? " name=$name": '');
|
|
} else {
|
|
my $name = $param->attributes->{'name'};
|
|
my $value = $param->attributes->{'value'}
|
|
unless $subtype =~ /^(?:text)$/;
|
|
printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
|
|
. (defined $name? " name=$name->[1]": '')
|
|
. (defined $value? " value=$value->[1]": '');
|
|
}
|
|
}
|
|
} elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
|
|
my($token) = ($text{$t}->[0]);
|
|
printf OUTPUT "#. For the first occurrence,\n"
|
|
if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
|
|
if ($token->string =~ /^<meta\b/is) {
|
|
my $type = $token->attributes->{'http-equiv'}->[1];
|
|
print OUTPUT "#. META http-equiv=$type\n" if defined $type;
|
|
} elsif ($token->string =~ /^<([a-z0-9]+)/is) {
|
|
my $tag = uc($1);
|
|
my $type = (lc($tag) eq 'input'?
|
|
$token->attributes->{'type'}: undef);
|
|
my $name = $token->attributes->{'name'};
|
|
printf OUTPUT "#. %s\n", $tag
|
|
. (defined $type? " type=$type->[1]": '')
|
|
. (defined $name? " name=$name->[1]": '');
|
|
}
|
|
} elsif ($text{$t}->[0]->has_js_data) {
|
|
printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
|
|
printf OUTPUT "#. SCRIPT\n";
|
|
}
|
|
my $cformat_p;
|
|
for my $token (@{$text{$t}}) {
|
|
my $pathname = $token->pathname;
|
|
$pathname =~ s/^$directory_re//os;
|
|
$pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
|
|
printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
|
|
if defined $pathname && defined $token->line_number;
|
|
$cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
|
|
}
|
|
printf OUTPUT "#, c-format\n" if $cformat_p;
|
|
printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
|
|
TmplTokenizer::string_canon
|
|
TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
|
|
printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
|
|
TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub convert_translation_file () {
|
|
open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
|
|
VerboseWarnings::set_input_file_name $convert_from;
|
|
while (<INPUT>) {
|
|
chomp;
|
|
my($msgid, $msgstr) = split(/\t/);
|
|
die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
|
|
unless defined $msgstr;
|
|
|
|
# Fixup some of the bad strings
|
|
$msgid =~ s/^SELECTED>//;
|
|
|
|
# Create dummy token
|
|
my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
|
|
remember( $token, $msgid );
|
|
$msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
|
|
$translation{$msgid} = $msgstr unless $msgstr eq '*****';
|
|
|
|
if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
|
|
my $candidate = TmplTokenizer::charset_canon $2;
|
|
die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
|
|
if defined $charset_in && $charset_in ne $candidate;
|
|
$charset_in = $candidate;
|
|
}
|
|
if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
|
|
my $candidate = TmplTokenizer::charset_canon $2;
|
|
die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
|
|
if defined $charset_out && $charset_out ne $candidate;
|
|
$charset_out = $candidate;
|
|
}
|
|
}
|
|
# The following assumption is correct; that's what HTML::Template assumes
|
|
if (!defined $charset_in) {
|
|
$charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
|
|
warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
|
|
}
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub usage ($) {
|
|
my($exitcode) = @_;
|
|
my $h = $exitcode? *STDERR: *STDOUT;
|
|
print $h <<EOF;
|
|
Usage: $0 [OPTIONS]
|
|
Extract translatable strings from given HTML::Template input files.
|
|
|
|
Input file location:
|
|
-f, --files-from=FILE Get list of input files from FILE
|
|
-D, --directory=DIRECTORY Add DIRECTORY to list for input files search
|
|
|
|
Output file location:
|
|
-o, --output=FILE Write output to specified file
|
|
|
|
HTML::Template options:
|
|
-a, --extract-all Extract all strings
|
|
--pedantic-warnings Issue warnings even for detected problems
|
|
which are likely to be harmless
|
|
|
|
Output details:
|
|
-s, --sort-output generate sorted output
|
|
-F, --sort-by-file sort output by file location
|
|
-v, --verbose explain what is being done
|
|
|
|
Informative output:
|
|
--help Display this help and exit
|
|
|
|
Try `perldoc $0' for perhaps more information.
|
|
EOF
|
|
exit($exitcode);
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
sub usage_error (;$) {
|
|
print STDERR "$_[0]\n" if @_;
|
|
print STDERR "Try `$0 --help' for more information.\n";
|
|
exit(-1);
|
|
}
|
|
|
|
###############################################################################
|
|
|
|
Getopt::Long::config qw( bundling no_auto_abbrev );
|
|
GetOptions(
|
|
'a|extract-all' => \$extract_all_p,
|
|
'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
|
|
'convert-from=s' => \$convert_from,
|
|
'D|directory=s' => \$directory,
|
|
'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
|
|
'f|files-from=s' => \$files_from,
|
|
'I|input-charset=s' => \$charset_in, # INTERNAL
|
|
'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
|
|
'O|output-charset=s' => \$charset_out, # INTERNAL
|
|
'output|o=s' => \$output,
|
|
'po-mode' => \$po_mode_p, # INTERNAL
|
|
's|sort-output' => sub { $sort = 's' },
|
|
'F|sort-by-file' => sub { $sort = 'F' },
|
|
'v|verbose' => \$verbose_p,
|
|
'help' => sub { usage(0) },
|
|
) || usage_error;
|
|
|
|
VerboseWarnings::set_application_name $0;
|
|
VerboseWarnings::set_pedantic_mode $pedantic_p;
|
|
|
|
usage_error('Missing mandatory option -f')
|
|
unless defined $files_from || defined $convert_from;
|
|
$directory = '.' unless defined $directory;
|
|
|
|
usage_error('You cannot specify both --convert-from and --files-from')
|
|
if defined $convert_from && defined $files_from;
|
|
|
|
if (defined $output && $output ne '-') {
|
|
print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
|
|
open(OUTPUT, ">$output") || die "$output: $!\n";
|
|
} else {
|
|
print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
|
|
open(OUTPUT, ">&STDOUT");
|
|
}
|
|
binmode( OUTPUT, ":utf8" );
|
|
|
|
if (defined $files_from) {
|
|
print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
|
|
open(INPUT, "<$files_from") || die "$files_from: $!\n";
|
|
while (<INPUT>) {
|
|
chomp;
|
|
my $input = /^\//? $_: "$directory/$_";
|
|
my $h = TmplTokenizer->new( $input );
|
|
$h->set_allow_cformat( 1 );
|
|
VerboseWarnings::set_input_file_name $input;
|
|
print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
|
|
text_extract( $h );
|
|
}
|
|
close INPUT;
|
|
} else {
|
|
print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
|
|
convert_translation_file;
|
|
}
|
|
generate_po_file;
|
|
|
|
warn "This input will not work with Mozilla standards-compliant mode\n", undef
|
|
if TmplTokenizer::syntaxerror_p;
|
|
|
|
|
|
exit(-1) if TmplTokenizer::fatal_p;
|
|
|
|
###############################################################################
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This is an experimental script based on the modularized
|
|
text-extract2.pl script. It has behaviour similar to
|
|
xgettext(1), and generates gettext-compatible output files.
|
|
|
|
A gettext-like format provides the following advantages:
|
|
|
|
=over
|
|
|
|
=item -
|
|
|
|
Translation to non-English-like languages with different word
|
|
order: gettext's c-format strings can theoretically be
|
|
emulated if we are able to do some analysis on the .tmpl input
|
|
and treat <TMPL_VAR> in a way similar to %s.
|
|
|
|
=item -
|
|
|
|
Context for the extracted strings: the gettext format provides
|
|
the filenames and line numbers where each string can be found.
|
|
The translator can read the source file and see the context,
|
|
in case the string by itself can mean several different things.
|
|
|
|
=item -
|
|
|
|
Place for the translator to add comments about the translations.
|
|
|
|
=item -
|
|
|
|
Gettext-compatible tools, if any, might be usable if we adopt
|
|
the gettext format.
|
|
|
|
=back
|
|
|
|
This script has already been in use for over a year and should
|
|
be reasonable stable. Nevertheless, it is still somewhat
|
|
experimental and there are still some issues.
|
|
|
|
Please refer to the explanation in tmpl_process3 for further
|
|
details.
|
|
|
|
If you want to generate GNOME-style POTFILES.in files, such
|
|
files (passed to -f) can be generated thus:
|
|
|
|
(cd ../.. && find koha-tmpl/opac-tmpl/default/en \
|
|
-name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
|
|
(cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
|
|
-name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
|
|
|
|
This is, however, quite pointless, because the "create" and
|
|
"update" actions have already been implemented in tmpl_process3.pl.
|
|
|
|
=head2 Strings inside JavaScript
|
|
|
|
In the SCRIPT elements, the script will attempt to scan for
|
|
_("I<string literal>") patterns, and extract the I<string literal>
|
|
as a translatable string.
|
|
|
|
Note that the C-like _(...) notation is required.
|
|
|
|
The JavaScript must actually define a _ function
|
|
so that the code remains correct JavaScript.
|
|
A suitable definition of such a function can be
|
|
|
|
function _(s) { return s } // dummy function for gettext
|
|
|
|
=head1 SEE ALSO
|
|
|
|
tmpl_process3.pl,
|
|
xgettext(1),
|
|
Locale::PO(3),
|
|
translator_doc.txt
|
|
|
|
=head1 BUGS
|
|
|
|
There probably are some. Bugs related to scanning of <INPUT>
|
|
tags seem to be especially likely to be present.
|
|
|
|
Its diagnostics are probably too verbose.
|
|
|
|
When a <TMPL_VAR> within a JavaScript-related attribute is
|
|
detected, the script currently displays no warnings at all.
|
|
It might be good to display some kind of warning.
|
|
|
|
Its sort order (-s option) seems to be different than the real
|
|
xgettext(1)'s sort option. This will result in translation
|
|
strings inside the generated PO file spuriously moving about
|
|
when tmpl_process3.pl calls msgmerge(1) to update the PO file.
|
|
|
|
If a Javascript string has leading spaces, it will
|
|
generate strings with spurious leading spaces,
|
|
leading to failure to match the strings when actually generating
|
|
translated files.
|
|
|
|
=cut
|