3 # This file is part of Koha.
5 # Koha is free software; you can redistribute it and/or modify it
6 # under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # Koha is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
25 use lib $FindBin::Bin;
29 use Getopt::Long qw( GetOptions );
35 use vars qw( $convert_from );
36 use vars qw( $files_from $directory $output $sort );
37 use vars qw( $extract_all_p );
38 use vars qw( $pedantic_p );
39 use vars qw( %text %translation );
40 use vars qw( $charset_in $charset_out );
41 use vars qw( $disable_fuzzy_p );
42 use vars qw( $verbose_p );
43 use vars qw( $po_mode_p );
47 ###############################################################################
49 sub string_negligible_p {
50 my($t) = @_; # a string
51 # Don't emit pure whitespace, pure numbers, pure punctuation,
52 # single letters, or TMPL_VAR's.
53 # Punctuation should arguably be translated. But without context
54 # they are untranslatable. Note that $t is a string, not a token object.
55 return !$extract_all_p && (
56 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
57 || $t =~ /^\d+$/ # purely digits
58 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
59 || $t =~ /^[A-Za-z]$/ # single letters
60 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
61 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
62 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
66 sub token_negligible_p {
69 return !$extract_all_p && (
70 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
71 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
72 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
77 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
78 : $t == C4::TmplTokenType::TAG() ? ''
79 : token_negligible_p($_) ? ''
86 ###############################################################################
89 my($token, $string) = @_;
90 # If we determine that the string is negligible, don't bother to remember
91 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
92 my $key = TmplTokenizer::string_canon( $string );
93 $text{$key} = [] unless defined $text{$key};
94 push @{$text{$key}}, $token;
98 ###############################################################################
102 # The real gettext tools seems to sort case sensitively; I don't know why
103 @t = sort { $a cmp $b } @t if $sort eq 's';
105 my @aa = sort { $a->pathname cmp $b->pathname
106 || $a->line_number <=> $b->line_number } @{$text{$a}};
107 my @bb = sort { $a->pathname cmp $b->pathname
108 || $a->line_number <=> $b->line_number } @{$text{$b}};
109 $aa[0]->pathname cmp $bb[0]->pathname
110 || $aa[0]->line_number <=> $bb[0]->line_number;
111 } @t if $sort eq 'F';
115 ###############################################################################
120 my $s = TmplTokenizer::next_token($h);
121 last unless defined $s;
122 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
123 if ($kind eq C4::TmplTokenType::TEXT) {
124 if ($t =~ /\S/s && $t !~ /<!/){
127 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
128 if ($s->form =~ /\S/s && $s->form !~ /<!/){
129 remember( $s, $s->form );
131 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
132 # value [tag=input], meta
134 $tag = lc($1) if $t =~ /^<(\S+)/s;
135 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
137 next if $a eq 'label' && $tag ne 'optgroup';
138 next if $a eq 'content' && $tag ne 'meta';
139 next if $a eq 'value' && ($tag ne 'input'
140 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
141 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
142 $val = TmplTokenizer::trim($val);
143 # for selected attributes replace '[%..%]' with '%s' globally
144 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
145 $val =~ s/\[\%.*?\%\]/\%s/g;
147 # save attribute text for translation
148 remember( $s, $val ) if $val =~ /\S/s;
151 } elsif ($s->has_js_data) {
152 for my $t (@{$s->js_data}) {
153 remember( $s, $t->[3] ) if $t->[0]; # FIXME
159 ###############################################################################
161 sub generate_po_file {
162 # We don't emit the Plural-Forms header; it's meaningless for us
163 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
164 $pot_charset = TmplTokenizer::charset_canon($pot_charset);
165 # Time stamps aren't exactly right semantically. I don't know how to fix it.
166 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
167 my $time_pot = $time;
168 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
170 # SOME DESCRIPTIVE TITLE.
171 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
172 # This file is distributed under the same license as the PACKAGE package.
173 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
176 print $OUTPUT <<EOF unless $disable_fuzzy_p;
182 "Project-Id-Version: Koha\\n"
183 "POT-Creation-Date: $time_pot\\n"
184 "PO-Revision-Date: $time_po\\n"
185 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
186 "Language-Team: LANGUAGE <LL\@li.org>\\n"
187 "MIME-Version: 1.0\\n"
188 "Content-Type: text/plain; charset=$pot_charset\\n"
189 "Content-Transfer-Encoding: 8bit\\n"
192 my $directory_re = quotemeta("$directory/");
193 for my $t (string_list) {
194 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
195 my($token, $n) = ($text{$t}->[0], 0);
196 printf $OUTPUT "#. For the first occurrence,\n"
197 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
198 for my $param ($token->parameters_and_fields) {
200 my $type = $param->type;
201 my $subtype = ($type == C4::TmplTokenType::TAG
202 && $param->string =~ /^<input\b/is?
203 $param->attributes->{'type'}->[1]: undef);
204 my $fmt = TmplTokenizer::_formalize( $param );
206 if ($type == C4::TmplTokenType::DIRECTIVE) {
207 # $type = "Template::Toolkit Directive";
208 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
209 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
211 printf $OUTPUT "#. %s: %s\n", $fmt,
212 "$type" . (defined $name? " name=$name": '');
214 my $name = $param->attributes->{'name'};
216 $value = $param->attributes->{'value'}
217 unless $subtype =~ /^(?:text)$/;
218 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
219 . (defined $name? " name=$name->[1]": '')
220 . (defined $value? " value=$value->[1]": '');
223 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
224 my($token) = ($text{$t}->[0]);
225 printf $OUTPUT "#. For the first occurrence,\n"
226 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
227 if ($token->string =~ /^<meta\b/is) {
228 my $type = $token->attributes->{'http-equiv'}->[1];
229 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
230 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
232 my $type = (lc($tag) eq 'input'?
233 $token->attributes->{'type'}: undef);
234 my $name = $token->attributes->{'name'};
235 printf $OUTPUT "#. %s\n", $tag
236 . (defined $type? " type=$type->[1]": '')
237 . (defined $name? " name=$name->[1]": '');
239 } elsif ($text{$t}->[0]->has_js_data) {
240 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
241 printf $OUTPUT "#. SCRIPT\n";
244 for my $token (@{$text{$t}}) {
245 my $pathname = $token->pathname;
246 $pathname =~ s/^$directory_re//os;
247 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
248 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
249 if defined $pathname && defined $token->line_number;
250 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
252 printf $OUTPUT "#, c-format\n" if $cformat_p;
253 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
254 TmplTokenizer::string_canon(
255 TmplTokenizer::charset_convert($t, $charset_in, $charset_out)
258 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
259 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
263 ###############################################################################
265 sub convert_translation_file {
266 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
267 VerboseWarnings::set_input_file_name($convert_from);
270 my($msgid, $msgstr) = split(/\t/);
271 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
272 unless defined $msgstr;
274 # Fixup some of the bad strings
275 $msgid =~ s/^SELECTED>//;
278 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
279 remember( $token, $msgid );
280 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
281 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
283 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
284 my $candidate = TmplTokenizer::charset_canon($2);
285 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
286 if defined $charset_in && $charset_in ne $candidate;
287 $charset_in = $candidate;
289 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
290 my $candidate = TmplTokenizer::charset_canon($2);
291 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
292 if defined $charset_out && $charset_out ne $candidate;
293 $charset_out = $candidate;
296 # The following assumption is correct; that's what HTML::Template assumes
297 if (!defined $charset_in) {
298 $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
299 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
303 ###############################################################################
307 my $h = $exitcode? *STDERR: *STDOUT;
310 Extract translatable strings from given HTML::Template input files.
313 -f, --files-from=FILE Get list of input files from FILE
314 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
316 Output file location:
317 -o, --output=FILE Write output to specified file
319 HTML::Template options:
320 -a, --extract-all Extract all strings
321 --pedantic-warnings Issue warnings even for detected problems
322 which are likely to be harmless
325 -s, --sort-output generate sorted output
326 -F, --sort-by-file sort output by file location
327 -v, --verbose explain what is being done
330 --help Display this help and exit
332 Try `perldoc $0' for perhaps more information.
337 ###############################################################################
340 print STDERR "$_[0]\n" if @_;
341 print STDERR "Try `$0 --help' for more information.\n";
345 ###############################################################################
347 Getopt::Long::config qw( bundling no_auto_abbrev );
349 'a|extract-all' => \$extract_all_p,
350 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
351 'convert-from=s' => \$convert_from,
352 'D|directory=s' => \$directory,
353 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
354 'f|files-from=s' => \$files_from,
355 'I|input-charset=s' => \$charset_in, # INTERNAL
356 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
357 'O|output-charset=s' => \$charset_out, # INTERNAL
358 'output|o=s' => \$output,
359 'po-mode' => \$po_mode_p, # INTERNAL
360 's|sort-output' => sub { $sort = 's' },
361 'F|sort-by-file' => sub { $sort = 'F' },
362 'v|verbose' => \$verbose_p,
363 'help' => sub { usage(0) },
366 VerboseWarnings::set_application_name($0);
367 VerboseWarnings::set_pedantic_mode($pedantic_p);
369 usage_error('Missing mandatory option -f')
370 unless defined $files_from || defined $convert_from;
371 $directory = '.' unless defined $directory;
373 usage_error('You cannot specify both --convert-from and --files-from')
374 if defined $convert_from && defined $files_from;
376 if (defined $output && $output ne '-') {
377 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
378 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
380 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
381 open($OUTPUT, q{>}, "&STDOUT");
384 if (defined $files_from) {
385 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
386 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
389 my $input = /^\//? $_: "$directory/$_";
390 my $h = TmplTokenizer->new( $input );
391 $h->set_allow_cformat( 1 );
392 VerboseWarnings::set_input_file_name($input);
393 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
398 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
399 convert_translation_file;
403 warn "This input will not work with Mozilla standards-compliant mode\n", undef
404 if TmplTokenizer::syntaxerror_p;
407 exit(-1) if TmplTokenizer::fatal_p;
409 ###############################################################################
413 This script has behaviour similar to
414 xgettext(1), and generates gettext-compatible output files.
416 A gettext-like format provides the following advantages:
422 Translation to non-English-like languages with different word
423 order: gettext's c-format strings can theoretically be
424 emulated if we are able to do some analysis on the .tt input
425 and treat <TMPL_VAR> in a way similar to %s.
429 Context for the extracted strings: the gettext format provides
430 the filenames and line numbers where each string can be found.
431 The translator can read the source file and see the context,
432 in case the string by itself can mean several different things.
436 Place for the translator to add comments about the translations.
440 Gettext-compatible tools, if any, might be usable if we adopt
445 This script has already been in use for over a year and should
446 be reasonable stable. Nevertheless, it is still somewhat
447 experimental and there are still some issues.
449 Please refer to the explanation in tmpl_process3 for further
452 If you want to generate GNOME-style POTFILES.in files, such
453 files (passed to -f) can be generated thus:
455 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
456 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
457 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
458 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
460 This is, however, quite pointless, because the "create" and
461 "update" actions have already been implemented in tmpl_process3.pl.
463 =head2 Strings inside JavaScript
465 In the SCRIPT elements, the script will attempt to scan for
466 _("I<string literal>") patterns, and extract the I<string literal>
467 as a translatable string.
469 Note that the C-like _(...) notation is required.
471 The JavaScript must actually define a _ function
472 so that the code remains correct JavaScript.
473 A suitable definition of such a function can be
475 function _(s) { return s } // dummy function for gettext
486 There probably are some. Bugs related to scanning of <INPUT>
487 tags seem to be especially likely to be present.
489 Its diagnostics are probably too verbose.
491 When a <TMPL_VAR> within a JavaScript-related attribute is
492 detected, the script currently displays no warnings at all.
493 It might be good to display some kind of warning.
495 Its sort order (-s option) seems to be different than the real
496 xgettext(1)'s sort option. This will result in translation
497 strings inside the generated PO file spuriously moving about
498 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
500 If a Javascript string has leading spaces, it will
501 generate strings with spurious leading spaces,
502 leading to failure to match the strings when actually generating