misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 # This file is part of Koha.
   4 #
   5 # Koha is free software; you can redistribute it and/or modify it
   6 # under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 3 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # Koha is distributed in the hope that it will be useful, but
  11 # WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  17
  18 =head1 NAME
  19
  20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
  21
  22 =cut
  23
  24 use FindBin;
  25 use lib $FindBin::Bin;
  26
  27 use strict;
  28 use warnings;
  29 use Getopt::Long;
  30 use POSIX;
  31 use Locale::PO;
  32 use TmplTokenizer;
  33 use VerboseWarnings;
  34
  35 use vars qw( $convert_from );
  36 use vars qw( $files_from $directory $output $sort );
  37 use vars qw( $extract_all_p );
  38 use vars qw( $pedantic_p );
  39 use vars qw( %text %translation );
  40 use vars qw( $charset_in $charset_out );
  41 use vars qw( $disable_fuzzy_p );
  42 use vars qw( $verbose_p );
  43 use vars qw( $po_mode_p );
  44
  45 our $OUTPUT;
  46
  47 ###############################################################################
  48
  49 sub string_negligible_p {
  50     my($t) = @_;                                # a string
  51     # Don't emit pure whitespace, pure numbers, pure punctuation,
  52     # single letters, or TMPL_VAR's.
  53     # Punctuation should arguably be translated. But without context
  54     # they are untranslatable. Note that $t is a string, not a token object.
  55     return !$extract_all_p && (
  56                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  57             || $t =~ /^\d+$/                    # purely digits
  58             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  59             || $t =~ /^[A-Za-z]$/               # single letters
  60             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  61         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  62         || $t =~ /^\s*<\?.*\?>/                               # ignore xml prolog
  63         )
  64 }
  65
  66 sub token_negligible_p {
  67     my ($x) = @_;
  68     my $t = $x->type;
  69     return !$extract_all_p && (
  70           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  71         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  72         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  73         && join(
  74             '',
  75             map {
  76                 my $t = $_->type;
  77                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  78                   : $t == C4::TmplTokenType::TAG()       ? ''
  79                   : token_negligible_p($_)               ? ''
  80                   : '1'
  81             } @{ $x->children }
  82         ) eq ''
  83     );
  84 }
  85
  86 ###############################################################################
  87
  88 sub remember {
  89     my($token, $string) = @_;
  90     # If we determine that the string is negligible, don't bother to remember
  91     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  92         my $key = TmplTokenizer::string_canon( $string );
  93         $text{$key} = [] unless defined $text{$key};
  94         push @{$text{$key}}, $token;
  95     }
  96 }
  97
  98 ###############################################################################
  99
 100 sub string_list {
 101     my @t = keys %text;
 102     # The real gettext tools seems to sort case sensitively; I don't know why
 103     @t = sort { $a cmp $b } @t if $sort eq 's';
 104     @t = sort {
 105             my @aa = sort { $a->pathname cmp $b->pathname
 106                     || $a->line_number <=> $b->line_number } @{$text{$a}};
 107             my @bb = sort { $a->pathname cmp $b->pathname
 108                     || $a->line_number <=> $b->line_number } @{$text{$b}};
 109             $aa[0]->pathname cmp $bb[0]->pathname
 110                     || $aa[0]->line_number <=> $bb[0]->line_number;
 111         } @t if $sort eq 'F';
 112     return @t;
 113 }
 114
 115   ###############################################################################
 116
 117 sub text_extract {
 118     my($h) = @_;
 119     for (;;) {
 120         my $s = TmplTokenizer::next_token($h);
 121         last unless defined $s;
 122         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 123         if ($kind eq C4::TmplTokenType::TEXT) {
 124             if ($t =~ /\S/s && $t !~ /<!/){
 125                 remember( $s, $t );
 126             }
 127         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 128             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 129                 remember( $s, $s->form );
 130             }
 131         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 132             # value [tag=input], meta
 133             my $tag;
 134             $tag = lc($1) if $t =~ /^<(\S+)/s;
 135             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
 136                 if ($attr->{$a}) {
 137                     next if $a eq 'label' && $tag ne 'optgroup';
 138                     next if $a eq 'content' && $tag ne 'meta';
 139                     next if $a eq 'value' && ($tag ne 'input'
 140                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 141                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 142                     $val = TmplTokenizer::trim($val);
 143                     # for selected attributes replace '[%..%]' with '%s' globally
 144                     if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
 145                         $val =~ s/\[\%.*?\%\]/\%s/g;
 146                     }
 147                     # save attribute text for translation
 148                     remember( $s, $val ) if $val =~ /\S/s;
 149                 }
 150             }
 151         } elsif ($s->has_js_data) {
 152             for my $t (@{$s->js_data}) {
 153               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 154             }
 155         }
 156     }
 157 }
 158
 159 ###############################################################################
 160
 161 sub generate_strings_list {
 162     # Emit all extracted strings.
 163     for my $t (string_list) {
 164         printf $OUTPUT "%s\n", $t;
 165     }
 166 }
 167
 168 ###############################################################################
 169
 170 sub generate_po_file {
 171     # We don't emit the Plural-Forms header; it's meaningless for us
 172     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 173     $pot_charset = TmplTokenizer::charset_canon($pot_charset);
 174     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 175     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 176     my $time_pot = $time;
 177     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 178     print $OUTPUT <<EOF;
 179 # SOME DESCRIPTIVE TITLE.
 180 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 181 # This file is distributed under the same license as the PACKAGE package.
 182 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 183 #
 184 EOF
 185     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 186 #, fuzzy
 187 EOF
 188     print $OUTPUT <<EOF;
 189 msgid ""
 190 msgstr ""
 191 "Project-Id-Version: Koha\\n"
 192 "POT-Creation-Date: $time_pot\\n"
 193 "PO-Revision-Date: $time_po\\n"
 194 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 195 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 196 "MIME-Version: 1.0\\n"
 197 "Content-Type: text/plain; charset=$pot_charset\\n"
 198 "Content-Transfer-Encoding: 8bit\\n"
 199
 200 EOF
 201     my $directory_re = quotemeta("$directory/");
 202     for my $t (string_list) {
 203         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 204             my($token, $n) = ($text{$t}->[0], 0);
 205         printf $OUTPUT "#. For the first occurrence,\n"
 206                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 207             for my $param ($token->parameters_and_fields) {
 208                 $n += 1;
 209                 my $type = $param->type;
 210                 my $subtype = ($type == C4::TmplTokenType::TAG
 211                         && $param->string =~ /^<input\b/is?
 212                                 $param->attributes->{'type'}->[1]: undef);
 213                 my $fmt = TmplTokenizer::_formalize( $param );
 214                 $fmt =~ s/^%/%$n\$/;
 215                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 216 #                   $type = "Template::Toolkit Directive";
 217                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 218                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 219                             $2: undef;
 220             printf $OUTPUT "#. %s: %s\n", $fmt,
 221                         "$type" . (defined $name? " name=$name": '');
 222                 } else {
 223                     my $name = $param->attributes->{'name'};
 224             my $value;
 225             $value = $param->attributes->{'value'}
 226                             unless $subtype =~ /^(?:text)$/;
 227             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 228                             . (defined $name?  " name=$name->[1]": '')
 229                             . (defined $value? " value=$value->[1]": '');
 230                 }
 231             }
 232         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 233             my($token) = ($text{$t}->[0]);
 234         printf $OUTPUT "#. For the first occurrence,\n"
 235                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 236             if ($token->string =~ /^<meta\b/is) {
 237                 my $type = $token->attributes->{'http-equiv'}->[1];
 238         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 239             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 240                 my $tag = uc($1);
 241                 my $type = (lc($tag) eq 'input'?
 242                         $token->attributes->{'type'}: undef);
 243                 my $name = $token->attributes->{'name'};
 244         printf $OUTPUT "#. %s\n", $tag
 245                     . (defined $type? " type=$type->[1]": '')
 246                     . (defined $name? " name=$name->[1]": '');
 247             }
 248         } elsif ($text{$t}->[0]->has_js_data) {
 249         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 250         printf $OUTPUT "#. SCRIPT\n";
 251         }
 252         my $cformat_p;
 253         for my $token (@{$text{$t}}) {
 254             my $pathname = $token->pathname;
 255             $pathname =~ s/^$directory_re//os;
 256         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 257         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 258                     if defined $pathname && defined $token->line_number;
 259             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 260         }
 261         printf $OUTPUT "#, c-format\n" if $cformat_p;
 262         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
 263             TmplTokenizer::string_canon(
 264                 TmplTokenizer::charset_convert($t, $charset_in, $charset_out)
 265             )
 266         );
 267         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 268                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 269     }
 270 }
 271
 272 ###############################################################################
 273
 274 sub convert_translation_file {
 275     open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
 276     VerboseWarnings::set_input_file_name($convert_from);
 277     while (<$INPUT>) {
 278         chomp;
 279         my($msgid, $msgstr) = split(/\t/);
 280         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 281                 unless defined $msgstr;
 282
 283         # Fixup some of the bad strings
 284         $msgid =~ s/^SELECTED>//;
 285
 286         # Create dummy token
 287         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 288         remember( $token, $msgid );
 289         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 290         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 291
 292         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 293         my $candidate = TmplTokenizer::charset_canon($2);
 294             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 295                     if defined $charset_in && $charset_in ne $candidate;
 296             $charset_in = $candidate;
 297         }
 298         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 299         my $candidate = TmplTokenizer::charset_canon($2);
 300             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 301                     if defined $charset_out && $charset_out ne $candidate;
 302             $charset_out = $candidate;
 303         }
 304     }
 305     # The following assumption is correct; that's what HTML::Template assumes
 306     if (!defined $charset_in) {
 307         $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
 308         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 309     }
 310 }
 311
 312 ###############################################################################
 313
 314 sub usage {
 315     my($exitcode) = @_;
 316     my $h = $exitcode? *STDERR: *STDOUT;
 317     print $h <<EOF;
 318 Usage: $0 [OPTIONS]
 319 Extract translatable strings from given HTML::Template input files.
 320
 321 Input file location:
 322   -f, --files-from=FILE          Get list of input files from FILE
 323   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 324
 325 Output file location:
 326   -o, --output=FILE              Write output to specified file
 327
 328 HTML::Template options:
 329   -a, --extract-all              Extract all strings
 330       --pedantic-warnings        Issue warnings even for detected problems
 331                                  which are likely to be harmless
 332
 333 Output details:
 334   -s, --sort-output              generate sorted output
 335   -F, --sort-by-file             sort output by file location
 336   -v, --verbose                  explain what is being done
 337
 338 Informative output:
 339       --help                     Display this help and exit
 340
 341 Try `perldoc $0' for perhaps more information.
 342 EOF
 343     exit($exitcode);
 344 }
 345
 346 ###############################################################################
 347
 348 sub usage_error {
 349     print STDERR "$_[0]\n" if @_;
 350     print STDERR "Try `$0 --help' for more information.\n";
 351     exit(-1);
 352 }
 353
 354 ###############################################################################
 355
 356 Getopt::Long::config qw( bundling no_auto_abbrev );
 357 GetOptions(
 358     'a|extract-all'                     => \$extract_all_p,
 359     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 360     'convert-from=s'                    => \$convert_from,
 361     'D|directory=s'                     => \$directory,
 362     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 363     'f|files-from=s'                    => \$files_from,
 364     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 365     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 366     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 367     'output|o=s'                        => \$output,
 368     'po-mode'                           => \$po_mode_p,         # INTERNAL
 369     's|sort-output'                     => sub { $sort = 's' },
 370     'F|sort-by-file'                    => sub { $sort = 'F' },
 371     'v|verbose'                         => \$verbose_p,
 372     'help'                              => sub { usage(0) },
 373 ) || usage_error;
 374
 375 VerboseWarnings::set_application_name($0);
 376 VerboseWarnings::set_pedantic_mode($pedantic_p);
 377
 378 usage_error('Missing mandatory option -f')
 379         unless defined $files_from || defined $convert_from;
 380 $directory = '.' unless defined $directory;
 381
 382 usage_error('You cannot specify both --convert-from and --files-from')
 383         if defined $convert_from && defined $files_from;
 384
 385 if (defined $output && $output ne '-') {
 386     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 387     open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
 388 } else {
 389     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 390     open($OUTPUT, ">&STDOUT");
 391 }
 392
 393 if (defined $files_from) {
 394     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 395     open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
 396     while (<$INPUT>) {
 397         chomp;
 398         my $input = /^\//? $_: "$directory/$_";
 399         my $h = TmplTokenizer->new( $input );
 400         $h->set_allow_cformat( 1 );
 401     VerboseWarnings::set_input_file_name($input);
 402         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 403         text_extract( $h );
 404     }
 405     close $INPUT;
 406 } else {
 407     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 408     convert_translation_file;
 409 }
 410 generate_po_file;
 411
 412 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 413         if TmplTokenizer::syntaxerror_p;
 414
 415
 416 exit(-1) if TmplTokenizer::fatal_p;
 417
 418 ###############################################################################
 419
 420 =head1 DESCRIPTION
 421
 422 This script has behaviour similar to
 423 xgettext(1), and generates gettext-compatible output files.
 424
 425 A gettext-like format provides the following advantages:
 426
 427 =over
 428
 429 =item -
 430
 431 Translation to non-English-like languages with different word
 432 order:  gettext's c-format strings can theoretically be
 433 emulated if we are able to do some analysis on the .tt input
 434 and treat <TMPL_VAR> in a way similar to %s.
 435
 436 =item -
 437
 438 Context for the extracted strings:  the gettext format provides
 439 the filenames and line numbers where each string can be found.
 440 The translator can read the source file and see the context,
 441 in case the string by itself can mean several different things.
 442
 443 =item -
 444
 445 Place for the translator to add comments about the translations.
 446
 447 =item -
 448
 449 Gettext-compatible tools, if any, might be usable if we adopt
 450 the gettext format.
 451
 452 =back
 453
 454 This script has already been in use for over a year and should
 455 be reasonable stable. Nevertheless, it is still somewhat
 456 experimental and there are still some issues.
 457
 458 Please refer to the explanation in tmpl_process3 for further
 459 details.
 460
 461 If you want to generate GNOME-style POTFILES.in files, such
 462 files (passed to -f) can be generated thus:
 463
 464     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 465         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 466     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 467         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 468
 469 This is, however, quite pointless, because the "create" and
 470 "update" actions have already been implemented in tmpl_process3.pl.
 471
 472 =head2 Strings inside JavaScript
 473
 474 In the SCRIPT elements, the script will attempt to scan for
 475 _("I<string literal>") patterns, and extract the I<string literal>
 476 as a translatable string.
 477
 478 Note that the C-like _(...) notation is required.
 479
 480 The JavaScript must actually define a _ function
 481 so that the code remains correct JavaScript.
 482 A suitable definition of such a function can be
 483
 484         function _(s) { return s } // dummy function for gettext
 485
 486 =head1 SEE ALSO
 487
 488 tmpl_process3.pl,
 489 xgettext(1),
 490 Locale::PO(3),
 491 translator_doc.txt
 492
 493 =head1 BUGS
 494
 495 There probably are some. Bugs related to scanning of <INPUT>
 496 tags seem to be especially likely to be present.
 497
 498 Its diagnostics are probably too verbose.
 499
 500 When a <TMPL_VAR> within a JavaScript-related attribute is
 501 detected, the script currently displays no warnings at all.
 502 It might be good to display some kind of warning.
 503
 504 Its sort order (-s option) seems to be different than the real
 505 xgettext(1)'s sort option. This will result in translation
 506 strings inside the generated PO file spuriously moving about
 507 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 508
 509 If a Javascript string has leading spaces, it will
 510 generate strings with spurious leading spaces,
 511 leading to failure to match the strings when actually generating
 512 translated files.
 513
 514 =cut