misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 # This file is part of Koha.
   4 #
   5 # Koha is free software; you can redistribute it and/or modify it
   6 # under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 3 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # Koha is distributed in the hope that it will be useful, but
  11 # WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  17
  18 =head1 NAME
  19
  20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
  21
  22 =cut
  23
  24 use FindBin;
  25 use lib $FindBin::Bin;
  26
  27 use strict;
  28 use warnings;
  29 use Getopt::Long qw( GetOptions );
  30 use POSIX;
  31 use Locale::PO;
  32 use TmplTokenizer;
  33 use VerboseWarnings;
  34
  35 use vars qw( $convert_from );
  36 use vars qw( $files_from $directory $output $sort );
  37 use vars qw( $extract_all_p );
  38 use vars qw( $pedantic_p );
  39 use vars qw( %text %translation );
  40 use vars qw( $charset_in $charset_out );
  41 use vars qw( $disable_fuzzy_p );
  42 use vars qw( $verbose_p );
  43 use vars qw( $po_mode_p );
  44
  45 our $OUTPUT;
  46
  47 ###############################################################################
  48
  49 sub string_negligible_p {
  50     my($t) = @_;                                # a string
  51     # Don't emit pure whitespace, pure numbers, pure punctuation,
  52     # single letters, or TMPL_VAR's.
  53     # Punctuation should arguably be translated. But without context
  54     # they are untranslatable. Note that $t is a string, not a token object.
  55     return !$extract_all_p && (
  56                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  57             || $t =~ /^\d+$/                    # purely digits
  58             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  59             || $t =~ /^[A-Za-z]$/               # single letters
  60             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  61         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  62         || $t =~ /^\s*<\?.*\?>/                               # ignore xml prolog
  63         )
  64 }
  65
  66 sub token_negligible_p {
  67     my ($x) = @_;
  68     my $t = $x->type;
  69     return !$extract_all_p && (
  70           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  71         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  72         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  73         && join(
  74             '',
  75             map {
  76                 my $t = $_->type;
  77                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  78                   : $t == C4::TmplTokenType::TAG()       ? ''
  79                   : token_negligible_p($_)               ? ''
  80                   : '1'
  81             } @{ $x->children }
  82         ) eq ''
  83     );
  84 }
  85
  86 ###############################################################################
  87
  88 sub remember {
  89     my($token, $string) = @_;
  90     # If we determine that the string is negligible, don't bother to remember
  91     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  92         my $key = TmplTokenizer::string_canon( $string );
  93         $text{$key} = [] unless defined $text{$key};
  94         push @{$text{$key}}, $token;
  95     }
  96 }
  97
  98 ###############################################################################
  99
 100 sub string_list {
 101     my @t = keys %text;
 102     # The real gettext tools seems to sort case sensitively; I don't know why
 103     @t = sort { $a cmp $b } @t if $sort eq 's';
 104     @t = sort {
 105             my @aa = sort { $a->pathname cmp $b->pathname
 106                     || $a->line_number <=> $b->line_number } @{$text{$a}};
 107             my @bb = sort { $a->pathname cmp $b->pathname
 108                     || $a->line_number <=> $b->line_number } @{$text{$b}};
 109             $aa[0]->pathname cmp $bb[0]->pathname
 110                     || $aa[0]->line_number <=> $bb[0]->line_number;
 111         } @t if $sort eq 'F';
 112     return @t;
 113 }
 114
 115   ###############################################################################
 116
 117 sub text_extract {
 118     my($h) = @_;
 119     for (;;) {
 120         my $s = TmplTokenizer::next_token($h);
 121         last unless defined $s;
 122         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 123         if ($kind eq C4::TmplTokenType::TEXT) {
 124             if ($t =~ /\S/s && $t !~ /<!/){
 125                 remember( $s, $t );
 126             }
 127         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 128             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 129                 remember( $s, $s->form );
 130             }
 131         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 132             # value [tag=input], meta
 133             my $tag;
 134             $tag = lc($1) if $t =~ /^<(\S+)/s;
 135             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
 136                 if ($attr->{$a}) {
 137                     next if $a eq 'label' && $tag ne 'optgroup';
 138                     next if $a eq 'content';
 139                     next if $a eq 'value' && ($tag ne 'input'
 140                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 141                     next if $tag eq 'meta';
 142                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 143                     $val = TmplTokenizer::trim($val);
 144                     # for selected attributes replace '[%..%]' with '%s' globally
 145                     if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
 146                         $val =~ s/\[\%.*?\%\]/\%s/g;
 147                     }
 148                     # save attribute text for translation
 149                     remember( $s, $val ) if $val =~ /\S/s;
 150                 }
 151             }
 152         } elsif ($s->has_js_data) {
 153             for my $t (@{$s->js_data}) {
 154               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 155             }
 156         }
 157     }
 158 }
 159
 160 ###############################################################################
 161
 162 sub generate_po_file {
 163     # We don't emit the Plural-Forms header; it's meaningless for us
 164     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 165     $pot_charset = TmplTokenizer::charset_canon($pot_charset);
 166     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 167     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 168     my $time_pot = $time;
 169     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 170     print $OUTPUT <<EOF;
 171 # SOME DESCRIPTIVE TITLE.
 172 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 173 # This file is distributed under the same license as the PACKAGE package.
 174 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 175 #
 176 EOF
 177     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 178 #, fuzzy
 179 EOF
 180     print $OUTPUT <<EOF;
 181 msgid ""
 182 msgstr ""
 183 "Project-Id-Version: Koha\\n"
 184 "POT-Creation-Date: $time_pot\\n"
 185 "PO-Revision-Date: $time_po\\n"
 186 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 187 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 188 "MIME-Version: 1.0\\n"
 189 "Content-Type: text/plain; charset=$pot_charset\\n"
 190 "Content-Transfer-Encoding: 8bit\\n"
 191
 192 EOF
 193     my $directory_re = quotemeta("$directory/");
 194
 195     for my $t ( keys %text ) {
 196         my @ordered_tokens = sort {
 197                    $a->pathname cmp $b->pathname
 198                 || $a->line_number cmp $b->line_number
 199         } @{$text{$t}};
 200         my $token = $ordered_tokens[0];
 201
 202         if ( $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED ) {
 203             my $n = 0;
 204             printf $OUTPUT "#. For the first occurrence,\n"
 205                 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
 206             for my $param ( $token->parameters_and_fields ) {
 207                 $n += 1;
 208                 my $type    = $param->type;
 209                 my $subtype = ( $type == C4::TmplTokenType::TAG
 210                         && $param->string =~ /^<input\b/is?
 211                         $param->attributes->{'type'}->[1] : undef );
 212                 my $fmt = TmplTokenizer::_formalize($param);
 213                 $fmt =~ s/^%/%$n\$/;
 214                 if ( $type == C4::TmplTokenType::DIRECTIVE ) {
 215
 216                     #               $type = "Template::Toolkit Directive";
 217                     $type = $param->string =~ /\[%(.*?)%\]/is ? $1 : 'ERROR';
 218                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 219                         $2: undef;
 220                     printf $OUTPUT "#. %s: %s\n", $fmt,
 221                         "$type" . ( defined $name ? " name=$name" : '' );
 222                 } else {
 223                     my $name = $param->attributes->{'name'};
 224                     my $value;
 225                     $value = $param->attributes->{'value'}
 226                         unless $subtype =~ /^(?:text)$/;
 227                     printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 228                         . ( defined $name  ? " name=$name->[1]"   : '' )
 229                         . ( defined $value ? " value=$value->[1]" : '' );
 230                 }
 231             }
 232         } elsif ( $token->type == C4::TmplTokenType::TAG ) {
 233             printf $OUTPUT "#. For the first occurrence,\n"
 234                 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
 235             if ( $token->string =~ /^<meta\b/is ) {
 236                 my $type = $token->attributes->{'http-equiv'}->[1];
 237                 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 238             } elsif ( $token->string =~ /^<([a-z0-9]+)/is ) {
 239                 my $tag  = uc($1);
 240                 my $type = ( lc($tag) eq 'input'?
 241                     $token->attributes->{'type'}: undef );
 242                 my $name = $token->attributes->{'name'};
 243                 printf $OUTPUT "#. %s\n", $tag
 244                    . (defined $type? " type=$type->[1]": '')
 245                    . (defined $name? " name=$name->[1]": '');
 246             }
 247         } elsif ( $token->has_js_data ) {
 248             printf $OUTPUT "#. For the first occurrence,\n" if @{ $text{$t} } > 1;
 249             printf $OUTPUT "#. SCRIPT\n";
 250         }
 251         my $cformat_p;
 252         my $location = {};
 253         for my $token ( @{ $text{$t} } ) {
 254             my $pathname = $token->pathname;
 255             $pathname =~ s/^$directory_re//os;
 256             $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 257             push @{ $location->{$pathname} }, $token->line_number
 258                 if defined $pathname && defined $token->line_number;
 259             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 260         }
 261
 262         for my $pathname ( sort keys %$location ) {
 263             for my $line_number ( @{ $location->{$pathname} } ) {
 264                 printf $OUTPUT "#: %s:%d\n", $pathname, $line_number;
 265             }
 266         }
 267
 268         printf $OUTPUT "#, c-format\n" if $cformat_p;
 269         my $msgid = TmplTokenizer::string_canon( TmplTokenizer::charset_convert( $t, $charset_in, $charset_out ) );
 270         printf $OUTPUT "msgid %s\n", ( defined $msgid && length $msgid ? Locale::PO->quote($msgid) : q{""} );
 271         printf $OUTPUT "msgstr %s\n\n", ( defined $translation{$t} ? Locale::PO->quote( $translation{$t} ) : q{""} );
 272     }
 273 }
 274
 275 ###############################################################################
 276
 277 sub convert_translation_file {
 278     open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
 279     VerboseWarnings::set_input_file_name($convert_from);
 280     while (<$INPUT>) {
 281         chomp;
 282         my($msgid, $msgstr) = split(/\t/);
 283         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 284                 unless defined $msgstr;
 285
 286         # Fixup some of the bad strings
 287         $msgid =~ s/^SELECTED>//;
 288
 289         # Create dummy token
 290         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 291         remember( $token, $msgid );
 292         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 293         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 294
 295         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 296         my $candidate = TmplTokenizer::charset_canon($2);
 297             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 298                     if defined $charset_in && $charset_in ne $candidate;
 299             $charset_in = $candidate;
 300         }
 301         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 302         my $candidate = TmplTokenizer::charset_canon($2);
 303             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 304                     if defined $charset_out && $charset_out ne $candidate;
 305             $charset_out = $candidate;
 306         }
 307     }
 308     # The following assumption is correct; that's what HTML::Template assumes
 309     if (!defined $charset_in) {
 310         $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
 311         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 312     }
 313 }
 314
 315 ###############################################################################
 316
 317 sub usage {
 318     my($exitcode) = @_;
 319     my $h = $exitcode? *STDERR: *STDOUT;
 320     print $h <<EOF;
 321 Usage: $0 [OPTIONS]
 322 Extract translatable strings from given HTML::Template input files.
 323
 324 Input file location:
 325   -f, --files-from=FILE          Get list of input files from FILE
 326   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 327
 328 Output file location:
 329   -o, --output=FILE              Write output to specified file
 330
 331 HTML::Template options:
 332   -a, --extract-all              Extract all strings
 333       --pedantic-warnings        Issue warnings even for detected problems
 334                                  which are likely to be harmless
 335
 336 Output details:
 337   -s, --sort-output              generate sorted output
 338   -F, --sort-by-file             sort output by file location
 339   -v, --verbose                  explain what is being done
 340
 341 Informative output:
 342       --help                     Display this help and exit
 343
 344 Try `perldoc $0' for perhaps more information.
 345 EOF
 346     exit($exitcode);
 347 }
 348
 349 ###############################################################################
 350
 351 sub usage_error {
 352     print STDERR "$_[0]\n" if @_;
 353     print STDERR "Try `$0 --help' for more information.\n";
 354     exit(-1);
 355 }
 356
 357 ###############################################################################
 358
 359 Getopt::Long::config qw( bundling no_auto_abbrev );
 360 GetOptions(
 361     'a|extract-all'                     => \$extract_all_p,
 362     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 363     'convert-from=s'                    => \$convert_from,
 364     'D|directory=s'                     => \$directory,
 365     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 366     'f|files-from=s'                    => \$files_from,
 367     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 368     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 369     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 370     'output|o=s'                        => \$output,
 371     'po-mode'                           => \$po_mode_p,         # INTERNAL
 372     's|sort-output'                     => sub { $sort = 's' },
 373     'F|sort-by-file'                    => sub { $sort = 'F' },
 374     'v|verbose'                         => \$verbose_p,
 375     'help'                              => sub { usage(0) },
 376 ) || usage_error;
 377
 378 VerboseWarnings::set_application_name($0);
 379 VerboseWarnings::set_pedantic_mode($pedantic_p);
 380
 381 usage_error('Missing mandatory option -f')
 382         unless defined $files_from || defined $convert_from;
 383 $directory = '.' unless defined $directory;
 384
 385 usage_error('You cannot specify both --convert-from and --files-from')
 386         if defined $convert_from && defined $files_from;
 387
 388 if (defined $output && $output ne '-') {
 389     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 390     open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
 391 } else {
 392     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 393     open($OUTPUT, q{>}, "&STDOUT");
 394 }
 395
 396 if (defined $files_from) {
 397     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 398     open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
 399     while (<$INPUT>) {
 400         chomp;
 401         my $input = /^\//? $_: "$directory/$_";
 402         my $h = TmplTokenizer->new( $input );
 403         $h->set_allow_cformat( 1 );
 404     VerboseWarnings::set_input_file_name($input);
 405         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 406         text_extract( $h );
 407     }
 408     close $INPUT;
 409 } else {
 410     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 411     convert_translation_file;
 412 }
 413 generate_po_file;
 414
 415 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 416         if TmplTokenizer::syntaxerror_p;
 417
 418
 419 exit(-1) if TmplTokenizer::fatal_p;
 420
 421 ###############################################################################
 422
 423 =head1 DESCRIPTION
 424
 425 This script has behaviour similar to
 426 xgettext(1), and generates gettext-compatible output files.
 427
 428 A gettext-like format provides the following advantages:
 429
 430 =over
 431
 432 =item -
 433
 434 Translation to non-English-like languages with different word
 435 order:  gettext's c-format strings can theoretically be
 436 emulated if we are able to do some analysis on the .tt input
 437 and treat <TMPL_VAR> in a way similar to %s.
 438
 439 =item -
 440
 441 Context for the extracted strings:  the gettext format provides
 442 the filenames and line numbers where each string can be found.
 443 The translator can read the source file and see the context,
 444 in case the string by itself can mean several different things.
 445
 446 =item -
 447
 448 Place for the translator to add comments about the translations.
 449
 450 =item -
 451
 452 Gettext-compatible tools, if any, might be usable if we adopt
 453 the gettext format.
 454
 455 =back
 456
 457 This script has already been in use for over a year and should
 458 be reasonable stable. Nevertheless, it is still somewhat
 459 experimental and there are still some issues.
 460
 461 Please refer to the explanation in tmpl_process3 for further
 462 details.
 463
 464 If you want to generate GNOME-style POTFILES.in files, such
 465 files (passed to -f) can be generated thus:
 466
 467     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 468         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 469     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 470         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 471
 472 This is, however, quite pointless, because the "create" and
 473 "update" actions have already been implemented in tmpl_process3.pl.
 474
 475 =head2 Strings inside JavaScript
 476
 477 In the SCRIPT elements, the script will attempt to scan for
 478 _("I<string literal>") patterns, and extract the I<string literal>
 479 as a translatable string.
 480
 481 Note that the C-like _(...) notation is required.
 482
 483 The JavaScript must actually define a _ function
 484 so that the code remains correct JavaScript.
 485 A suitable definition of such a function can be
 486
 487         function _(s) { return s } // dummy function for gettext
 488
 489 =head1 SEE ALSO
 490
 491 tmpl_process3.pl,
 492 xgettext(1),
 493 Locale::PO(3),
 494 translator_doc.txt
 495
 496 =head1 BUGS
 497
 498 There probably are some. Bugs related to scanning of <INPUT>
 499 tags seem to be especially likely to be present.
 500
 501 Its diagnostics are probably too verbose.
 502
 503 When a <TMPL_VAR> within a JavaScript-related attribute is
 504 detected, the script currently displays no warnings at all.
 505 It might be good to display some kind of warning.
 506
 507 Its sort order (-s option) seems to be different than the real
 508 xgettext(1)'s sort option. This will result in translation
 509 strings inside the generated PO file spuriously moving about
 510 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 511
 512 If a Javascript string has leading spaces, it will
 513 generate strings with spurious leading spaces,
 514 leading to failure to match the strings when actually generating
 515 translated files.
 516
 517 =cut