3 # This file is part of Koha.
5 # Koha is free software; you can redistribute it and/or modify it
6 # under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # Koha is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
25 use lib $FindBin::Bin;
35 use vars qw( $convert_from );
36 use vars qw( $files_from $directory $output $sort );
37 use vars qw( $extract_all_p );
38 use vars qw( $pedantic_p );
39 use vars qw( %text %translation );
40 use vars qw( $charset_in $charset_out );
41 use vars qw( $disable_fuzzy_p );
42 use vars qw( $verbose_p );
43 use vars qw( $po_mode_p );
47 ###############################################################################
49 sub string_negligible_p {
50 my($t) = @_; # a string
51 # Don't emit pure whitespace, pure numbers, pure punctuation,
52 # single letters, or TMPL_VAR's.
53 # Punctuation should arguably be translated. But without context
54 # they are untranslatable. Note that $t is a string, not a token object.
55 return !$extract_all_p && (
56 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
57 || $t =~ /^\d+$/ # purely digits
58 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
59 || $t =~ /^[A-Za-z]$/ # single letters
60 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
61 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
62 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
66 sub token_negligible_p {
69 return !$extract_all_p && (
70 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
71 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
72 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
77 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
78 : $t == C4::TmplTokenType::TAG() ? ''
79 : token_negligible_p($_) ? ''
86 ###############################################################################
89 my($token, $string) = @_;
90 # If we determine that the string is negligible, don't bother to remember
91 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
92 my $key = TmplTokenizer::string_canon( $string );
93 $text{$key} = [] unless defined $text{$key};
94 push @{$text{$key}}, $token;
98 ###############################################################################
102 # The real gettext tools seems to sort case sensitively; I don't know why
103 @t = sort { $a cmp $b } @t if $sort eq 's';
105 my @aa = sort { $a->pathname cmp $b->pathname
106 || $a->line_number <=> $b->line_number } @{$text{$a}};
107 my @bb = sort { $a->pathname cmp $b->pathname
108 || $a->line_number <=> $b->line_number } @{$text{$b}};
109 $aa[0]->pathname cmp $bb[0]->pathname
110 || $aa[0]->line_number <=> $bb[0]->line_number;
111 } @t if $sort eq 'F';
115 ###############################################################################
120 my $s = TmplTokenizer::next_token($h);
121 last unless defined $s;
122 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
123 if ($kind eq C4::TmplTokenType::TEXT) {
124 if ($t =~ /\S/s && $t !~ /<!/){
127 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
128 if ($s->form =~ /\S/s && $s->form !~ /<!/){
129 remember( $s, $s->form );
131 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
132 # value [tag=input], meta
134 $tag = lc($1) if $t =~ /^<(\S+)/s;
135 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
137 next if $a eq 'label' && $tag ne 'optgroup';
138 next if $a eq 'content' && $tag ne 'meta';
139 next if $a eq 'value' && ($tag ne 'input'
140 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
141 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
142 $val = TmplTokenizer::trim($val);
143 # for selected attributes replace '[%..%]' with '%s' globally
144 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
145 $val =~ s/\[\%.*?\%\]/\%s/g;
147 # save attribute text for translation
148 remember( $s, $val ) if $val =~ /\S/s;
151 } elsif ($s->has_js_data) {
152 for my $t (@{$s->js_data}) {
153 remember( $s, $t->[3] ) if $t->[0]; # FIXME
159 ###############################################################################
161 sub generate_strings_list {
162 # Emit all extracted strings.
163 for my $t (string_list) {
164 printf $OUTPUT "%s\n", $t;
168 ###############################################################################
170 sub generate_po_file {
171 # We don't emit the Plural-Forms header; it's meaningless for us
172 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
173 $pot_charset = TmplTokenizer::charset_canon($pot_charset);
174 # Time stamps aren't exactly right semantically. I don't know how to fix it.
175 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
176 my $time_pot = $time;
177 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
179 # SOME DESCRIPTIVE TITLE.
180 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
181 # This file is distributed under the same license as the PACKAGE package.
182 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
185 print $OUTPUT <<EOF unless $disable_fuzzy_p;
191 "Project-Id-Version: Koha\\n"
192 "POT-Creation-Date: $time_pot\\n"
193 "PO-Revision-Date: $time_po\\n"
194 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
195 "Language-Team: LANGUAGE <LL\@li.org>\\n"
196 "MIME-Version: 1.0\\n"
197 "Content-Type: text/plain; charset=$pot_charset\\n"
198 "Content-Transfer-Encoding: 8bit\\n"
201 my $directory_re = quotemeta("$directory/");
202 for my $t (string_list) {
203 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
204 my($token, $n) = ($text{$t}->[0], 0);
205 printf $OUTPUT "#. For the first occurrence,\n"
206 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
207 for my $param ($token->parameters_and_fields) {
209 my $type = $param->type;
210 my $subtype = ($type == C4::TmplTokenType::TAG
211 && $param->string =~ /^<input\b/is?
212 $param->attributes->{'type'}->[1]: undef);
213 my $fmt = TmplTokenizer::_formalize( $param );
215 if ($type == C4::TmplTokenType::DIRECTIVE) {
216 # $type = "Template::Toolkit Directive";
217 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
218 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
220 printf $OUTPUT "#. %s: %s\n", $fmt,
221 "$type" . (defined $name? " name=$name": '');
223 my $name = $param->attributes->{'name'};
225 $value = $param->attributes->{'value'}
226 unless $subtype =~ /^(?:text)$/;
227 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
228 . (defined $name? " name=$name->[1]": '')
229 . (defined $value? " value=$value->[1]": '');
232 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
233 my($token) = ($text{$t}->[0]);
234 printf $OUTPUT "#. For the first occurrence,\n"
235 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
236 if ($token->string =~ /^<meta\b/is) {
237 my $type = $token->attributes->{'http-equiv'}->[1];
238 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
239 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
241 my $type = (lc($tag) eq 'input'?
242 $token->attributes->{'type'}: undef);
243 my $name = $token->attributes->{'name'};
244 printf $OUTPUT "#. %s\n", $tag
245 . (defined $type? " type=$type->[1]": '')
246 . (defined $name? " name=$name->[1]": '');
248 } elsif ($text{$t}->[0]->has_js_data) {
249 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
250 printf $OUTPUT "#. SCRIPT\n";
253 for my $token (@{$text{$t}}) {
254 my $pathname = $token->pathname;
255 $pathname =~ s/^$directory_re//os;
256 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
257 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
258 if defined $pathname && defined $token->line_number;
259 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
261 printf $OUTPUT "#, c-format\n" if $cformat_p;
262 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
263 TmplTokenizer::string_canon(
264 TmplTokenizer::charset_convert($t, $charset_in, $charset_out)
267 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
268 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
272 ###############################################################################
274 sub convert_translation_file {
275 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
276 VerboseWarnings::set_input_file_name($convert_from);
279 my($msgid, $msgstr) = split(/\t/);
280 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
281 unless defined $msgstr;
283 # Fixup some of the bad strings
284 $msgid =~ s/^SELECTED>//;
287 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
288 remember( $token, $msgid );
289 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
290 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
292 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
293 my $candidate = TmplTokenizer::charset_canon($2);
294 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
295 if defined $charset_in && $charset_in ne $candidate;
296 $charset_in = $candidate;
298 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
299 my $candidate = TmplTokenizer::charset_canon($2);
300 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
301 if defined $charset_out && $charset_out ne $candidate;
302 $charset_out = $candidate;
305 # The following assumption is correct; that's what HTML::Template assumes
306 if (!defined $charset_in) {
307 $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
308 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
312 ###############################################################################
316 my $h = $exitcode? *STDERR: *STDOUT;
319 Extract translatable strings from given HTML::Template input files.
322 -f, --files-from=FILE Get list of input files from FILE
323 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
325 Output file location:
326 -o, --output=FILE Write output to specified file
328 HTML::Template options:
329 -a, --extract-all Extract all strings
330 --pedantic-warnings Issue warnings even for detected problems
331 which are likely to be harmless
334 -s, --sort-output generate sorted output
335 -F, --sort-by-file sort output by file location
336 -v, --verbose explain what is being done
339 --help Display this help and exit
341 Try `perldoc $0' for perhaps more information.
346 ###############################################################################
349 print STDERR "$_[0]\n" if @_;
350 print STDERR "Try `$0 --help' for more information.\n";
354 ###############################################################################
356 Getopt::Long::config qw( bundling no_auto_abbrev );
358 'a|extract-all' => \$extract_all_p,
359 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
360 'convert-from=s' => \$convert_from,
361 'D|directory=s' => \$directory,
362 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
363 'f|files-from=s' => \$files_from,
364 'I|input-charset=s' => \$charset_in, # INTERNAL
365 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
366 'O|output-charset=s' => \$charset_out, # INTERNAL
367 'output|o=s' => \$output,
368 'po-mode' => \$po_mode_p, # INTERNAL
369 's|sort-output' => sub { $sort = 's' },
370 'F|sort-by-file' => sub { $sort = 'F' },
371 'v|verbose' => \$verbose_p,
372 'help' => sub { usage(0) },
375 VerboseWarnings::set_application_name($0);
376 VerboseWarnings::set_pedantic_mode($pedantic_p);
378 usage_error('Missing mandatory option -f')
379 unless defined $files_from || defined $convert_from;
380 $directory = '.' unless defined $directory;
382 usage_error('You cannot specify both --convert-from and --files-from')
383 if defined $convert_from && defined $files_from;
385 if (defined $output && $output ne '-') {
386 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
387 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
389 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
390 open($OUTPUT, ">&STDOUT");
393 if (defined $files_from) {
394 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
395 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
398 my $input = /^\//? $_: "$directory/$_";
399 my $h = TmplTokenizer->new( $input );
400 $h->set_allow_cformat( 1 );
401 VerboseWarnings::set_input_file_name($input);
402 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
407 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
408 convert_translation_file;
412 warn "This input will not work with Mozilla standards-compliant mode\n", undef
413 if TmplTokenizer::syntaxerror_p;
416 exit(-1) if TmplTokenizer::fatal_p;
418 ###############################################################################
422 This script has behaviour similar to
423 xgettext(1), and generates gettext-compatible output files.
425 A gettext-like format provides the following advantages:
431 Translation to non-English-like languages with different word
432 order: gettext's c-format strings can theoretically be
433 emulated if we are able to do some analysis on the .tt input
434 and treat <TMPL_VAR> in a way similar to %s.
438 Context for the extracted strings: the gettext format provides
439 the filenames and line numbers where each string can be found.
440 The translator can read the source file and see the context,
441 in case the string by itself can mean several different things.
445 Place for the translator to add comments about the translations.
449 Gettext-compatible tools, if any, might be usable if we adopt
454 This script has already been in use for over a year and should
455 be reasonable stable. Nevertheless, it is still somewhat
456 experimental and there are still some issues.
458 Please refer to the explanation in tmpl_process3 for further
461 If you want to generate GNOME-style POTFILES.in files, such
462 files (passed to -f) can be generated thus:
464 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
465 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
466 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
467 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
469 This is, however, quite pointless, because the "create" and
470 "update" actions have already been implemented in tmpl_process3.pl.
472 =head2 Strings inside JavaScript
474 In the SCRIPT elements, the script will attempt to scan for
475 _("I<string literal>") patterns, and extract the I<string literal>
476 as a translatable string.
478 Note that the C-like _(...) notation is required.
480 The JavaScript must actually define a _ function
481 so that the code remains correct JavaScript.
482 A suitable definition of such a function can be
484 function _(s) { return s } // dummy function for gettext
495 There probably are some. Bugs related to scanning of <INPUT>
496 tags seem to be especially likely to be present.
498 Its diagnostics are probably too verbose.
500 When a <TMPL_VAR> within a JavaScript-related attribute is
501 detected, the script currently displays no warnings at all.
502 It might be good to display some kind of warning.
504 Its sort order (-s option) seems to be different than the real
505 xgettext(1)'s sort option. This will result in translation
506 strings inside the generated PO file spuriously moving about
507 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
509 If a Javascript string has leading spaces, it will
510 generate strings with spurious leading spaces,
511 leading to failure to match the strings when actually generating