3 # This file is part of Koha.
5 # Koha is free software; you can redistribute it and/or modify it
6 # under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # Koha is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
25 use lib $FindBin::Bin;
29 use Getopt::Long qw( GetOptions );
35 use vars qw( $convert_from );
36 use vars qw( $files_from $directory $output $sort );
37 use vars qw( $extract_all_p );
38 use vars qw( $pedantic_p );
39 use vars qw( %text %translation );
40 use vars qw( $charset_in $charset_out );
41 use vars qw( $disable_fuzzy_p );
42 use vars qw( $verbose_p );
43 use vars qw( $po_mode_p );
47 ###############################################################################
49 sub string_negligible_p {
50 my($t) = @_; # a string
51 # Don't emit pure whitespace, pure numbers, pure punctuation,
52 # single letters, or TMPL_VAR's.
53 # Punctuation should arguably be translated. But without context
54 # they are untranslatable. Note that $t is a string, not a token object.
55 return !$extract_all_p && (
56 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
57 || $t =~ /^\d+$/ # purely digits
58 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
59 || $t =~ /^[A-Za-z]$/ # single letters
60 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
61 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
62 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
66 sub token_negligible_p {
69 return !$extract_all_p && (
70 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
71 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
72 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
77 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
78 : $t == C4::TmplTokenType::TAG() ? ''
79 : token_negligible_p($_) ? ''
86 ###############################################################################
89 my($token, $string) = @_;
90 # If we determine that the string is negligible, don't bother to remember
91 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
92 my $key = TmplTokenizer::string_canon( $string );
93 $text{$key} = [] unless defined $text{$key};
94 push @{$text{$key}}, $token;
98 ###############################################################################
102 # The real gettext tools seems to sort case sensitively; I don't know why
103 @t = sort { $a cmp $b } @t if $sort eq 's';
105 my @aa = sort { $a->pathname cmp $b->pathname
106 || $a->line_number <=> $b->line_number } @{$text{$a}};
107 my @bb = sort { $a->pathname cmp $b->pathname
108 || $a->line_number <=> $b->line_number } @{$text{$b}};
109 $aa[0]->pathname cmp $bb[0]->pathname
110 || $aa[0]->line_number <=> $bb[0]->line_number;
111 } @t if $sort eq 'F';
115 ###############################################################################
120 my $s = TmplTokenizer::next_token($h);
121 last unless defined $s;
122 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
123 if ($kind eq C4::TmplTokenType::TEXT) {
124 if ($t =~ /\S/s && $t !~ /<!/){
127 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
128 if ($s->form =~ /\S/s && $s->form !~ /<!/){
129 remember( $s, $s->form );
131 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
132 # value [tag=input], meta
134 $tag = lc($1) if $t =~ /^<(\S+)/s;
135 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
137 next if $a eq 'label' && $tag ne 'optgroup';
138 next if $a eq 'content';
139 next if $a eq 'value' && ($tag ne 'input'
140 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
141 next if $tag eq 'meta';
142 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
143 $val = TmplTokenizer::trim($val);
144 # for selected attributes replace '[%..%]' with '%s' globally
145 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
146 $val =~ s/\[\%.*?\%\]/\%s/g;
148 # save attribute text for translation
149 remember( $s, $val ) if $val =~ /\S/s;
152 } elsif ($s->has_js_data) {
153 for my $t (@{$s->js_data}) {
154 remember( $s, $t->[3] ) if $t->[0]; # FIXME
160 ###############################################################################
162 sub generate_po_file {
163 # We don't emit the Plural-Forms header; it's meaningless for us
164 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
165 $pot_charset = TmplTokenizer::charset_canon($pot_charset);
166 # Time stamps aren't exactly right semantically. I don't know how to fix it.
167 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
168 my $time_pot = $time;
169 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
171 # SOME DESCRIPTIVE TITLE.
172 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
173 # This file is distributed under the same license as the PACKAGE package.
174 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
177 print $OUTPUT <<EOF unless $disable_fuzzy_p;
183 "Project-Id-Version: Koha\\n"
184 "POT-Creation-Date: $time_pot\\n"
185 "PO-Revision-Date: $time_po\\n"
186 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
187 "Language-Team: LANGUAGE <LL\@li.org>\\n"
188 "MIME-Version: 1.0\\n"
189 "Content-Type: text/plain; charset=$pot_charset\\n"
190 "Content-Transfer-Encoding: 8bit\\n"
193 my $directory_re = quotemeta("$directory/");
195 for my $t ( keys %text ) {
196 my @ordered_tokens = sort {
197 $a->pathname cmp $b->pathname
198 || $a->line_number cmp $b->line_number
200 my $token = $ordered_tokens[0];
202 if ( $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED ) {
204 printf $OUTPUT "#. For the first occurrence,\n"
205 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
206 for my $param ( $token->parameters_and_fields ) {
208 my $type = $param->type;
209 my $subtype = ( $type == C4::TmplTokenType::TAG
210 && $param->string =~ /^<input\b/is?
211 $param->attributes->{'type'}->[1] : undef );
212 my $fmt = TmplTokenizer::_formalize($param);
214 if ( $type == C4::TmplTokenType::DIRECTIVE ) {
216 # $type = "Template::Toolkit Directive";
217 $type = $param->string =~ /\[%(.*?)%\]/is ? $1 : 'ERROR';
218 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
220 printf $OUTPUT "#. %s: %s\n", $fmt,
221 "$type" . ( defined $name ? " name=$name" : '' );
223 my $name = $param->attributes->{'name'};
225 $value = $param->attributes->{'value'}
226 unless $subtype =~ /^(?:text)$/;
227 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
228 . ( defined $name ? " name=$name->[1]" : '' )
229 . ( defined $value ? " value=$value->[1]" : '' );
232 } elsif ( $token->type == C4::TmplTokenType::TAG ) {
233 printf $OUTPUT "#. For the first occurrence,\n"
234 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
235 if ( $token->string =~ /^<meta\b/is ) {
236 my $type = $token->attributes->{'http-equiv'}->[1];
237 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
238 } elsif ( $token->string =~ /^<([a-z0-9]+)/is ) {
240 my $type = ( lc($tag) eq 'input'?
241 $token->attributes->{'type'}: undef );
242 my $name = $token->attributes->{'name'};
243 printf $OUTPUT "#. %s\n", $tag
244 . (defined $type? " type=$type->[1]": '')
245 . (defined $name? " name=$name->[1]": '');
247 } elsif ( $token->has_js_data ) {
248 printf $OUTPUT "#. For the first occurrence,\n" if @{ $text{$t} } > 1;
249 printf $OUTPUT "#. SCRIPT\n";
253 for my $token ( @{ $text{$t} } ) {
254 my $pathname = $token->pathname;
255 $pathname =~ s/^$directory_re//os;
256 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
257 push @{ $location->{$pathname} }, $token->line_number
258 if defined $pathname && defined $token->line_number;
259 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
262 for my $pathname ( sort keys %$location ) {
263 for my $line_number ( @{ $location->{$pathname} } ) {
264 printf $OUTPUT "#: %s:%d\n", $pathname, $line_number;
268 printf $OUTPUT "#, c-format\n" if $cformat_p;
269 my $msgid = TmplTokenizer::string_canon( TmplTokenizer::charset_convert( $t, $charset_in, $charset_out ) );
270 printf $OUTPUT "msgid %s\n", ( defined $msgid && length $msgid ? Locale::PO->quote($msgid) : q{""} );
271 printf $OUTPUT "msgstr %s\n\n", ( defined $translation{$t} ? Locale::PO->quote( $translation{$t} ) : q{""} );
275 ###############################################################################
277 sub convert_translation_file {
278 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
279 VerboseWarnings::set_input_file_name($convert_from);
282 my($msgid, $msgstr) = split(/\t/);
283 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
284 unless defined $msgstr;
286 # Fixup some of the bad strings
287 $msgid =~ s/^SELECTED>//;
290 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
291 remember( $token, $msgid );
292 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
293 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
295 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
296 my $candidate = TmplTokenizer::charset_canon($2);
297 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
298 if defined $charset_in && $charset_in ne $candidate;
299 $charset_in = $candidate;
301 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
302 my $candidate = TmplTokenizer::charset_canon($2);
303 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
304 if defined $charset_out && $charset_out ne $candidate;
305 $charset_out = $candidate;
308 # The following assumption is correct; that's what HTML::Template assumes
309 if (!defined $charset_in) {
310 $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
311 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
315 ###############################################################################
319 my $h = $exitcode? *STDERR: *STDOUT;
322 Extract translatable strings from given HTML::Template input files.
325 -f, --files-from=FILE Get list of input files from FILE
326 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
328 Output file location:
329 -o, --output=FILE Write output to specified file
331 HTML::Template options:
332 -a, --extract-all Extract all strings
333 --pedantic-warnings Issue warnings even for detected problems
334 which are likely to be harmless
337 -s, --sort-output generate sorted output
338 -F, --sort-by-file sort output by file location
339 -v, --verbose explain what is being done
342 --help Display this help and exit
344 Try `perldoc $0' for perhaps more information.
349 ###############################################################################
352 print STDERR "$_[0]\n" if @_;
353 print STDERR "Try `$0 --help' for more information.\n";
357 ###############################################################################
359 Getopt::Long::config qw( bundling no_auto_abbrev );
361 'a|extract-all' => \$extract_all_p,
362 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
363 'convert-from=s' => \$convert_from,
364 'D|directory=s' => \$directory,
365 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
366 'f|files-from=s' => \$files_from,
367 'I|input-charset=s' => \$charset_in, # INTERNAL
368 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
369 'O|output-charset=s' => \$charset_out, # INTERNAL
370 'output|o=s' => \$output,
371 'po-mode' => \$po_mode_p, # INTERNAL
372 's|sort-output' => sub { $sort = 's' },
373 'F|sort-by-file' => sub { $sort = 'F' },
374 'v|verbose' => \$verbose_p,
375 'help' => sub { usage(0) },
378 VerboseWarnings::set_application_name($0);
379 VerboseWarnings::set_pedantic_mode($pedantic_p);
381 usage_error('Missing mandatory option -f')
382 unless defined $files_from || defined $convert_from;
383 $directory = '.' unless defined $directory;
385 usage_error('You cannot specify both --convert-from and --files-from')
386 if defined $convert_from && defined $files_from;
388 if (defined $output && $output ne '-') {
389 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
390 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
392 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
393 open($OUTPUT, q{>}, "&STDOUT");
396 if (defined $files_from) {
397 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
398 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
401 my $input = /^\//? $_: "$directory/$_";
402 my $h = TmplTokenizer->new( $input );
403 $h->set_allow_cformat( 1 );
404 VerboseWarnings::set_input_file_name($input);
405 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
410 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
411 convert_translation_file;
415 warn "This input will not work with Mozilla standards-compliant mode\n", undef
416 if TmplTokenizer::syntaxerror_p;
419 exit(-1) if TmplTokenizer::fatal_p;
421 ###############################################################################
425 This script has behaviour similar to
426 xgettext(1), and generates gettext-compatible output files.
428 A gettext-like format provides the following advantages:
434 Translation to non-English-like languages with different word
435 order: gettext's c-format strings can theoretically be
436 emulated if we are able to do some analysis on the .tt input
437 and treat <TMPL_VAR> in a way similar to %s.
441 Context for the extracted strings: the gettext format provides
442 the filenames and line numbers where each string can be found.
443 The translator can read the source file and see the context,
444 in case the string by itself can mean several different things.
448 Place for the translator to add comments about the translations.
452 Gettext-compatible tools, if any, might be usable if we adopt
457 This script has already been in use for over a year and should
458 be reasonable stable. Nevertheless, it is still somewhat
459 experimental and there are still some issues.
461 Please refer to the explanation in tmpl_process3 for further
464 If you want to generate GNOME-style POTFILES.in files, such
465 files (passed to -f) can be generated thus:
467 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
468 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
469 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
470 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
472 This is, however, quite pointless, because the "create" and
473 "update" actions have already been implemented in tmpl_process3.pl.
475 =head2 Strings inside JavaScript
477 In the SCRIPT elements, the script will attempt to scan for
478 _("I<string literal>") patterns, and extract the I<string literal>
479 as a translatable string.
481 Note that the C-like _(...) notation is required.
483 The JavaScript must actually define a _ function
484 so that the code remains correct JavaScript.
485 A suitable definition of such a function can be
487 function _(s) { return s } // dummy function for gettext
498 There probably are some. Bugs related to scanning of <INPUT>
499 tags seem to be especially likely to be present.
501 Its diagnostics are probably too verbose.
503 When a <TMPL_VAR> within a JavaScript-related attribute is
504 detected, the script currently displays no warnings at all.
505 It might be good to display some kind of warning.
507 Its sort order (-s option) seems to be different than the real
508 xgettext(1)'s sort option. This will result in translation
509 strings inside the generated PO file spuriously moving about
510 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
512 If a Javascript string has leading spaces, it will
513 generate strings with spurious leading spaces,
514 leading to failure to match the strings when actually generating