3 # This file is part of Koha.
5 # Koha is free software; you can redistribute it and/or modify it
6 # under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # Koha is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
25 use lib $FindBin::Bin;
29 use Getopt::Long qw( GetOptions );
35 use vars qw( $convert_from );
36 use vars qw( $files_from $directory $output $sort );
37 use vars qw( $extract_all_p );
38 use vars qw( $pedantic_p );
39 use vars qw( %text %translation );
40 use vars qw( $charset_in $charset_out );
41 use vars qw( $disable_fuzzy_p );
42 use vars qw( $verbose_p );
43 use vars qw( $po_mode_p );
47 ###############################################################################
49 sub string_negligible_p {
50 my($t) = @_; # a string
51 # Don't emit pure whitespace, pure numbers, pure punctuation,
52 # single letters, or TMPL_VAR's.
53 # Punctuation should arguably be translated. But without context
54 # they are untranslatable. Note that $t is a string, not a token object.
55 return !$extract_all_p && (
56 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
57 || $t =~ /^\d+$/ # purely digits
58 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
59 || $t =~ /^[A-Za-z]$/ # single letters
60 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
61 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
62 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
66 sub token_negligible_p {
69 return !$extract_all_p && (
70 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
71 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
72 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
77 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
78 : $t == C4::TmplTokenType::TAG() ? ''
79 : token_negligible_p($_) ? ''
86 ###############################################################################
89 my($token, $string) = @_;
90 # If we determine that the string is negligible, don't bother to remember
91 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
92 my $key = TmplTokenizer::string_canon( $string );
93 $text{$key} = [] unless defined $text{$key};
94 push @{$text{$key}}, $token;
98 ###############################################################################
102 # The real gettext tools seems to sort case sensitively; I don't know why
103 @t = sort { $a cmp $b } @t if $sort eq 's';
105 my @aa = sort { $a->pathname cmp $b->pathname
106 || $a->line_number <=> $b->line_number } @{$text{$a}};
107 my @bb = sort { $a->pathname cmp $b->pathname
108 || $a->line_number <=> $b->line_number } @{$text{$b}};
109 $aa[0]->pathname cmp $bb[0]->pathname
110 || $aa[0]->line_number <=> $bb[0]->line_number;
111 } @t if $sort eq 'F';
115 ###############################################################################
120 my $s = TmplTokenizer::next_token($h);
121 last unless defined $s;
122 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
123 if ($kind eq C4::TmplTokenType::TEXT) {
124 if ($t =~ /\S/s && $t !~ /<!/){
127 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
128 if ($s->form =~ /\S/s && $s->form !~ /<!/){
129 remember( $s, $s->form );
131 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
132 # value [tag=input], meta
134 $tag = lc($1) if $t =~ /^<(\S+)/s;
135 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
137 next if $a eq 'label' && $tag ne 'optgroup';
138 next if $a eq 'content' && $tag ne 'meta';
139 next if $a eq 'value' && ($tag ne 'input'
140 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
141 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
142 $val = TmplTokenizer::trim($val);
143 # for selected attributes replace '[%..%]' with '%s' globally
144 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
145 $val =~ s/\[\%.*?\%\]/\%s/g;
147 # save attribute text for translation
148 remember( $s, $val ) if $val =~ /\S/s;
151 } elsif ($s->has_js_data) {
152 for my $t (@{$s->js_data}) {
153 remember( $s, $t->[3] ) if $t->[0]; # FIXME
159 ###############################################################################
161 sub generate_po_file {
162 # We don't emit the Plural-Forms header; it's meaningless for us
163 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
164 $pot_charset = TmplTokenizer::charset_canon($pot_charset);
165 # Time stamps aren't exactly right semantically. I don't know how to fix it.
166 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
167 my $time_pot = $time;
168 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
170 # SOME DESCRIPTIVE TITLE.
171 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
172 # This file is distributed under the same license as the PACKAGE package.
173 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
176 print $OUTPUT <<EOF unless $disable_fuzzy_p;
182 "Project-Id-Version: Koha\\n"
183 "POT-Creation-Date: $time_pot\\n"
184 "PO-Revision-Date: $time_po\\n"
185 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
186 "Language-Team: LANGUAGE <LL\@li.org>\\n"
187 "MIME-Version: 1.0\\n"
188 "Content-Type: text/plain; charset=$pot_charset\\n"
189 "Content-Transfer-Encoding: 8bit\\n"
192 my $directory_re = quotemeta("$directory/");
194 for my $t ( keys %text ) {
195 my @ordered_tokens = sort {
196 $a->pathname cmp $b->pathname
197 || $a->line_number cmp $b->line_number
199 my $token = $ordered_tokens[0];
201 if ( $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED ) {
203 printf $OUTPUT "#. For the first occurrence,\n"
204 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
205 for my $param ( $token->parameters_and_fields ) {
207 my $type = $param->type;
208 my $subtype = ( $type == C4::TmplTokenType::TAG
209 && $param->string =~ /^<input\b/is?
210 $param->attributes->{'type'}->[1] : undef );
211 my $fmt = TmplTokenizer::_formalize($param);
213 if ( $type == C4::TmplTokenType::DIRECTIVE ) {
215 # $type = "Template::Toolkit Directive";
216 $type = $param->string =~ /\[%(.*?)%\]/is ? $1 : 'ERROR';
217 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
219 printf $OUTPUT "#. %s: %s\n", $fmt,
220 "$type" . ( defined $name ? " name=$name" : '' );
222 my $name = $param->attributes->{'name'};
224 $value = $param->attributes->{'value'}
225 unless $subtype =~ /^(?:text)$/;
226 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
227 . ( defined $name ? " name=$name->[1]" : '' )
228 . ( defined $value ? " value=$value->[1]" : '' );
231 } elsif ( $token->type == C4::TmplTokenType::TAG ) {
232 printf $OUTPUT "#. For the first occurrence,\n"
233 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
234 if ( $token->string =~ /^<meta\b/is ) {
235 my $type = $token->attributes->{'http-equiv'}->[1];
236 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
237 } elsif ( $token->string =~ /^<([a-z0-9]+)/is ) {
239 my $type = ( lc($tag) eq 'input'?
240 $token->attributes->{'type'}: undef );
241 my $name = $token->attributes->{'name'};
242 printf $OUTPUT "#. %s\n", $tag
243 . (defined $type? " type=$type->[1]": '')
244 . (defined $name? " name=$name->[1]": '');
246 } elsif ( $token->has_js_data ) {
247 printf $OUTPUT "#. For the first occurrence,\n" if @{ $text{$t} } > 1;
248 printf $OUTPUT "#. SCRIPT\n";
252 for my $token ( @{ $text{$t} } ) {
253 my $pathname = $token->pathname;
254 $pathname =~ s/^$directory_re//os;
255 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
256 push @{ $location->{$pathname} }, $token->line_number
257 if defined $pathname && defined $token->line_number;
258 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
261 for my $pathname ( sort keys %$location ) {
262 for my $line_number ( @{ $location->{$pathname} } ) {
263 printf $OUTPUT "#: %s:%d\n", $pathname, $line_number;
267 printf $OUTPUT "#, c-format\n" if $cformat_p;
268 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
269 TmplTokenizer::string_canon(
270 TmplTokenizer::charset_convert( $t, $charset_in, $charset_out )
273 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
274 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
278 ###############################################################################
280 sub convert_translation_file {
281 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
282 VerboseWarnings::set_input_file_name($convert_from);
285 my($msgid, $msgstr) = split(/\t/);
286 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
287 unless defined $msgstr;
289 # Fixup some of the bad strings
290 $msgid =~ s/^SELECTED>//;
293 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
294 remember( $token, $msgid );
295 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
296 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
298 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
299 my $candidate = TmplTokenizer::charset_canon($2);
300 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
301 if defined $charset_in && $charset_in ne $candidate;
302 $charset_in = $candidate;
304 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
305 my $candidate = TmplTokenizer::charset_canon($2);
306 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
307 if defined $charset_out && $charset_out ne $candidate;
308 $charset_out = $candidate;
311 # The following assumption is correct; that's what HTML::Template assumes
312 if (!defined $charset_in) {
313 $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
314 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
318 ###############################################################################
322 my $h = $exitcode? *STDERR: *STDOUT;
325 Extract translatable strings from given HTML::Template input files.
328 -f, --files-from=FILE Get list of input files from FILE
329 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
331 Output file location:
332 -o, --output=FILE Write output to specified file
334 HTML::Template options:
335 -a, --extract-all Extract all strings
336 --pedantic-warnings Issue warnings even for detected problems
337 which are likely to be harmless
340 -s, --sort-output generate sorted output
341 -F, --sort-by-file sort output by file location
342 -v, --verbose explain what is being done
345 --help Display this help and exit
347 Try `perldoc $0' for perhaps more information.
352 ###############################################################################
355 print STDERR "$_[0]\n" if @_;
356 print STDERR "Try `$0 --help' for more information.\n";
360 ###############################################################################
362 Getopt::Long::config qw( bundling no_auto_abbrev );
364 'a|extract-all' => \$extract_all_p,
365 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
366 'convert-from=s' => \$convert_from,
367 'D|directory=s' => \$directory,
368 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
369 'f|files-from=s' => \$files_from,
370 'I|input-charset=s' => \$charset_in, # INTERNAL
371 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
372 'O|output-charset=s' => \$charset_out, # INTERNAL
373 'output|o=s' => \$output,
374 'po-mode' => \$po_mode_p, # INTERNAL
375 's|sort-output' => sub { $sort = 's' },
376 'F|sort-by-file' => sub { $sort = 'F' },
377 'v|verbose' => \$verbose_p,
378 'help' => sub { usage(0) },
381 VerboseWarnings::set_application_name($0);
382 VerboseWarnings::set_pedantic_mode($pedantic_p);
384 usage_error('Missing mandatory option -f')
385 unless defined $files_from || defined $convert_from;
386 $directory = '.' unless defined $directory;
388 usage_error('You cannot specify both --convert-from and --files-from')
389 if defined $convert_from && defined $files_from;
391 if (defined $output && $output ne '-') {
392 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
393 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
395 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
396 open($OUTPUT, q{>}, "&STDOUT");
399 if (defined $files_from) {
400 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
401 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
404 my $input = /^\//? $_: "$directory/$_";
405 my $h = TmplTokenizer->new( $input );
406 $h->set_allow_cformat( 1 );
407 VerboseWarnings::set_input_file_name($input);
408 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
413 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
414 convert_translation_file;
418 warn "This input will not work with Mozilla standards-compliant mode\n", undef
419 if TmplTokenizer::syntaxerror_p;
422 exit(-1) if TmplTokenizer::fatal_p;
424 ###############################################################################
428 This script has behaviour similar to
429 xgettext(1), and generates gettext-compatible output files.
431 A gettext-like format provides the following advantages:
437 Translation to non-English-like languages with different word
438 order: gettext's c-format strings can theoretically be
439 emulated if we are able to do some analysis on the .tt input
440 and treat <TMPL_VAR> in a way similar to %s.
444 Context for the extracted strings: the gettext format provides
445 the filenames and line numbers where each string can be found.
446 The translator can read the source file and see the context,
447 in case the string by itself can mean several different things.
451 Place for the translator to add comments about the translations.
455 Gettext-compatible tools, if any, might be usable if we adopt
460 This script has already been in use for over a year and should
461 be reasonable stable. Nevertheless, it is still somewhat
462 experimental and there are still some issues.
464 Please refer to the explanation in tmpl_process3 for further
467 If you want to generate GNOME-style POTFILES.in files, such
468 files (passed to -f) can be generated thus:
470 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
471 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
472 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
473 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
475 This is, however, quite pointless, because the "create" and
476 "update" actions have already been implemented in tmpl_process3.pl.
478 =head2 Strings inside JavaScript
480 In the SCRIPT elements, the script will attempt to scan for
481 _("I<string literal>") patterns, and extract the I<string literal>
482 as a translatable string.
484 Note that the C-like _(...) notation is required.
486 The JavaScript must actually define a _ function
487 so that the code remains correct JavaScript.
488 A suitable definition of such a function can be
490 function _(s) { return s } // dummy function for gettext
501 There probably are some. Bugs related to scanning of <INPUT>
502 tags seem to be especially likely to be present.
504 Its diagnostics are probably too verbose.
506 When a <TMPL_VAR> within a JavaScript-related attribute is
507 detected, the script currently displays no warnings at all.
508 It might be good to display some kind of warning.
510 Its sort order (-s option) seems to be different than the real
511 xgettext(1)'s sort option. This will result in translation
512 strings inside the generated PO file spuriously moving about
513 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
515 If a Javascript string has leading spaces, it will
516 generate strings with spurious leading spaces,
517 leading to failure to match the strings when actually generating