3 # This file is part of Koha.
5 # Koha is free software; you can redistribute it and/or modify it
6 # under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # Koha is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
25 use lib $FindBin::Bin;
29 use Getopt::Long qw( GetOptions );
35 use vars qw( $convert_from );
36 use vars qw( $files_from $directory $output $sort );
37 use vars qw( $extract_all_p );
38 use vars qw( $pedantic_p );
39 use vars qw( %text %translation );
40 use vars qw( $charset_in $charset_out );
41 use vars qw( $disable_fuzzy_p );
42 use vars qw( $verbose_p );
43 use vars qw( $po_mode_p );
47 ###############################################################################
49 sub string_negligible_p {
50 my($t) = @_; # a string
51 # Don't emit pure whitespace, pure numbers, pure punctuation,
52 # single letters, or TMPL_VAR's.
53 # Punctuation should arguably be translated. But without context
54 # they are untranslatable. Note that $t is a string, not a token object.
55 return !$extract_all_p && (
56 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
57 || $t =~ /^\d+$/ # purely digits
58 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
59 || $t =~ /^[A-Za-z]$/ # single letters
60 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
61 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
62 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
66 sub token_negligible_p {
69 return !$extract_all_p && (
70 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
71 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
72 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
77 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
78 : $t == C4::TmplTokenType::TAG() ? ''
79 : token_negligible_p($_) ? ''
86 ###############################################################################
89 my($token, $string) = @_;
90 # If we determine that the string is negligible, don't bother to remember
91 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
92 my $key = TmplTokenizer::string_canon( $string );
93 $text{$key} = [] unless defined $text{$key};
94 push @{$text{$key}}, $token;
98 ###############################################################################
102 # The real gettext tools seems to sort case sensitively; I don't know why
103 @t = sort { $a cmp $b } @t if $sort eq 's';
105 my @aa = sort { $a->pathname cmp $b->pathname
106 || $a->line_number <=> $b->line_number } @{$text{$a}};
107 my @bb = sort { $a->pathname cmp $b->pathname
108 || $a->line_number <=> $b->line_number } @{$text{$b}};
109 $aa[0]->pathname cmp $bb[0]->pathname
110 || $aa[0]->line_number <=> $bb[0]->line_number;
111 } @t if $sort eq 'F';
115 ###############################################################################
120 my $s = TmplTokenizer::next_token($h);
121 last unless defined $s;
122 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
123 if ($kind eq C4::TmplTokenType::TEXT) {
124 if ($t =~ /\S/s && $t !~ /<!/){
127 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
128 if ($s->form =~ /\S/s && $s->form !~ /<!/){
129 remember( $s, $s->form );
131 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
132 # value [tag=input], meta
134 $tag = lc($1) if $t =~ /^<(\S+)/s;
135 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
137 next if $a eq 'label' && $tag ne 'optgroup';
138 next if $a eq 'content' && $tag ne 'meta';
139 next if $a eq 'value' && ($tag ne 'input'
140 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
141 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
142 $val = TmplTokenizer::trim($val);
143 # for selected attributes replace '[%..%]' with '%s' globally
144 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
145 $val =~ s/\[\%.*?\%\]/\%s/g;
147 # save attribute text for translation
148 remember( $s, $val ) if $val =~ /\S/s;
151 } elsif ($s->has_js_data) {
152 for my $t (@{$s->js_data}) {
153 remember( $s, $t->[3] ) if $t->[0]; # FIXME
159 ###############################################################################
161 sub generate_po_file {
162 # We don't emit the Plural-Forms header; it's meaningless for us
163 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
164 $pot_charset = TmplTokenizer::charset_canon($pot_charset);
165 # Time stamps aren't exactly right semantically. I don't know how to fix it.
166 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
167 my $time_pot = $time;
168 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
170 # SOME DESCRIPTIVE TITLE.
171 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
172 # This file is distributed under the same license as the PACKAGE package.
173 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
176 print $OUTPUT <<EOF unless $disable_fuzzy_p;
182 "Project-Id-Version: Koha\\n"
183 "POT-Creation-Date: $time_pot\\n"
184 "PO-Revision-Date: $time_po\\n"
185 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
186 "Language-Team: LANGUAGE <LL\@li.org>\\n"
187 "MIME-Version: 1.0\\n"
188 "Content-Type: text/plain; charset=$pot_charset\\n"
189 "Content-Transfer-Encoding: 8bit\\n"
192 my $directory_re = quotemeta("$directory/");
194 for my $t ( keys %text ) {
195 my @ordered_tokens = sort {
196 $a->pathname cmp $b->pathname
197 || $a->line_number cmp $b->line_number
199 my $token = $ordered_tokens[0];
201 if ( $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED ) {
203 printf $OUTPUT "#. For the first occurrence,\n"
204 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
205 for my $param ( $token->parameters_and_fields ) {
207 my $type = $param->type;
208 my $subtype = ( $type == C4::TmplTokenType::TAG
209 && $param->string =~ /^<input\b/is?
210 $param->attributes->{'type'}->[1] : undef );
211 my $fmt = TmplTokenizer::_formalize($param);
213 if ( $type == C4::TmplTokenType::DIRECTIVE ) {
215 # $type = "Template::Toolkit Directive";
216 $type = $param->string =~ /\[%(.*?)%\]/is ? $1 : 'ERROR';
217 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
219 printf $OUTPUT "#. %s: %s\n", $fmt,
220 "$type" . ( defined $name ? " name=$name" : '' );
222 my $name = $param->attributes->{'name'};
224 $value = $param->attributes->{'value'}
225 unless $subtype =~ /^(?:text)$/;
226 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
227 . ( defined $name ? " name=$name->[1]" : '' )
228 . ( defined $value ? " value=$value->[1]" : '' );
231 } elsif ( $token->type == C4::TmplTokenType::TAG ) {
232 printf $OUTPUT "#. For the first occurrence,\n"
233 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
234 if ( $token->string =~ /^<meta\b/is ) {
235 my $type = $token->attributes->{'http-equiv'}->[1];
236 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
237 } elsif ( $token->string =~ /^<([a-z0-9]+)/is ) {
239 my $type = ( lc($tag) eq 'input'?
240 $token->attributes->{'type'}: undef );
241 my $name = $token->attributes->{'name'};
242 printf $OUTPUT "#. %s\n", $tag
243 . (defined $type? " type=$type->[1]": '')
244 . (defined $name? " name=$name->[1]": '');
246 } elsif ( $token->has_js_data ) {
247 printf $OUTPUT "#. For the first occurrence,\n" if @{ $text{$t} } > 1;
248 printf $OUTPUT "#. SCRIPT\n";
252 for my $token ( @{ $text{$t} } ) {
253 my $pathname = $token->pathname;
254 $pathname =~ s/^$directory_re//os;
255 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
256 push @{ $location->{$pathname} }, $token->line_number
257 if defined $pathname && defined $token->line_number;
258 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
261 for my $pathname ( sort keys %$location ) {
262 for my $line_number ( @{ $location->{$pathname} } ) {
263 printf $OUTPUT "#: %s:%d\n", $pathname, $line_number;
267 printf $OUTPUT "#, c-format\n" if $cformat_p;
268 my $msgid = TmplTokenizer::string_canon( TmplTokenizer::charset_convert( $t, $charset_in, $charset_out ) );
269 printf $OUTPUT "msgid %s\n", ( defined $msgid && length $msgid ? Locale::PO->quote($msgid) : q{""} );
270 printf $OUTPUT "msgstr %s\n\n", ( defined $translation{$t} ? Locale::PO->quote( $translation{$t} ) : q{""} );
274 ###############################################################################
276 sub convert_translation_file {
277 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
278 VerboseWarnings::set_input_file_name($convert_from);
281 my($msgid, $msgstr) = split(/\t/);
282 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
283 unless defined $msgstr;
285 # Fixup some of the bad strings
286 $msgid =~ s/^SELECTED>//;
289 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
290 remember( $token, $msgid );
291 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
292 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
294 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
295 my $candidate = TmplTokenizer::charset_canon($2);
296 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
297 if defined $charset_in && $charset_in ne $candidate;
298 $charset_in = $candidate;
300 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
301 my $candidate = TmplTokenizer::charset_canon($2);
302 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
303 if defined $charset_out && $charset_out ne $candidate;
304 $charset_out = $candidate;
307 # The following assumption is correct; that's what HTML::Template assumes
308 if (!defined $charset_in) {
309 $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
310 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
314 ###############################################################################
318 my $h = $exitcode? *STDERR: *STDOUT;
321 Extract translatable strings from given HTML::Template input files.
324 -f, --files-from=FILE Get list of input files from FILE
325 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
327 Output file location:
328 -o, --output=FILE Write output to specified file
330 HTML::Template options:
331 -a, --extract-all Extract all strings
332 --pedantic-warnings Issue warnings even for detected problems
333 which are likely to be harmless
336 -s, --sort-output generate sorted output
337 -F, --sort-by-file sort output by file location
338 -v, --verbose explain what is being done
341 --help Display this help and exit
343 Try `perldoc $0' for perhaps more information.
348 ###############################################################################
351 print STDERR "$_[0]\n" if @_;
352 print STDERR "Try `$0 --help' for more information.\n";
356 ###############################################################################
358 Getopt::Long::config qw( bundling no_auto_abbrev );
360 'a|extract-all' => \$extract_all_p,
361 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
362 'convert-from=s' => \$convert_from,
363 'D|directory=s' => \$directory,
364 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
365 'f|files-from=s' => \$files_from,
366 'I|input-charset=s' => \$charset_in, # INTERNAL
367 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
368 'O|output-charset=s' => \$charset_out, # INTERNAL
369 'output|o=s' => \$output,
370 'po-mode' => \$po_mode_p, # INTERNAL
371 's|sort-output' => sub { $sort = 's' },
372 'F|sort-by-file' => sub { $sort = 'F' },
373 'v|verbose' => \$verbose_p,
374 'help' => sub { usage(0) },
377 VerboseWarnings::set_application_name($0);
378 VerboseWarnings::set_pedantic_mode($pedantic_p);
380 usage_error('Missing mandatory option -f')
381 unless defined $files_from || defined $convert_from;
382 $directory = '.' unless defined $directory;
384 usage_error('You cannot specify both --convert-from and --files-from')
385 if defined $convert_from && defined $files_from;
387 if (defined $output && $output ne '-') {
388 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
389 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
391 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
392 open($OUTPUT, q{>}, "&STDOUT");
395 if (defined $files_from) {
396 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
397 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
400 my $input = /^\//? $_: "$directory/$_";
401 my $h = TmplTokenizer->new( $input );
402 $h->set_allow_cformat( 1 );
403 VerboseWarnings::set_input_file_name($input);
404 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
409 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
410 convert_translation_file;
414 warn "This input will not work with Mozilla standards-compliant mode\n", undef
415 if TmplTokenizer::syntaxerror_p;
418 exit(-1) if TmplTokenizer::fatal_p;
420 ###############################################################################
424 This script has behaviour similar to
425 xgettext(1), and generates gettext-compatible output files.
427 A gettext-like format provides the following advantages:
433 Translation to non-English-like languages with different word
434 order: gettext's c-format strings can theoretically be
435 emulated if we are able to do some analysis on the .tt input
436 and treat <TMPL_VAR> in a way similar to %s.
440 Context for the extracted strings: the gettext format provides
441 the filenames and line numbers where each string can be found.
442 The translator can read the source file and see the context,
443 in case the string by itself can mean several different things.
447 Place for the translator to add comments about the translations.
451 Gettext-compatible tools, if any, might be usable if we adopt
456 This script has already been in use for over a year and should
457 be reasonable stable. Nevertheless, it is still somewhat
458 experimental and there are still some issues.
460 Please refer to the explanation in tmpl_process3 for further
463 If you want to generate GNOME-style POTFILES.in files, such
464 files (passed to -f) can be generated thus:
466 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
467 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
468 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
469 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
471 This is, however, quite pointless, because the "create" and
472 "update" actions have already been implemented in tmpl_process3.pl.
474 =head2 Strings inside JavaScript
476 In the SCRIPT elements, the script will attempt to scan for
477 _("I<string literal>") patterns, and extract the I<string literal>
478 as a translatable string.
480 Note that the C-like _(...) notation is required.
482 The JavaScript must actually define a _ function
483 so that the code remains correct JavaScript.
484 A suitable definition of such a function can be
486 function _(s) { return s } // dummy function for gettext
497 There probably are some. Bugs related to scanning of <INPUT>
498 tags seem to be especially likely to be present.
500 Its diagnostics are probably too verbose.
502 When a <TMPL_VAR> within a JavaScript-related attribute is
503 detected, the script currently displays no warnings at all.
504 It might be good to display some kind of warning.
506 Its sort order (-s option) seems to be different than the real
507 xgettext(1)'s sort option. This will result in translation
508 strings inside the generated PO file spuriously moving about
509 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
511 If a Javascript string has leading spaces, it will
512 generate strings with spurious leading spaces,
513 leading to failure to match the strings when actually generating