misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 # This file is part of Koha.
   4 #
   5 # Koha is free software; you can redistribute it and/or modify it
   6 # under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 3 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # Koha is distributed in the hope that it will be useful, but
  11 # WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  17
  18 =head1 NAME
  19
  20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
  21
  22 =cut
  23
  24 use FindBin;
  25 use lib $FindBin::Bin;
  26
  27 use strict;
  28 use warnings;
  29 use Getopt::Long qw( GetOptions );
  30 use POSIX;
  31 use Locale::PO;
  32 use TmplTokenizer;
  33 use VerboseWarnings;
  34
  35 use vars qw( $convert_from );
  36 use vars qw( $files_from $directory $output $sort );
  37 use vars qw( $extract_all_p );
  38 use vars qw( $pedantic_p );
  39 use vars qw( %text %translation );
  40 use vars qw( $charset_in $charset_out );
  41 use vars qw( $disable_fuzzy_p );
  42 use vars qw( $verbose_p );
  43 use vars qw( $po_mode_p );
  44
  45 our $OUTPUT;
  46
  47 ###############################################################################
  48
  49 sub string_negligible_p {
  50     my($t) = @_;                                # a string
  51     # Don't emit pure whitespace, pure numbers, pure punctuation,
  52     # single letters, or TMPL_VAR's.
  53     # Punctuation should arguably be translated. But without context
  54     # they are untranslatable. Note that $t is a string, not a token object.
  55     return !$extract_all_p && (
  56                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  57             || $t =~ /^\d+$/                    # purely digits
  58             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  59             || $t =~ /^[A-Za-z]$/               # single letters
  60             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  61         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  62         || $t =~ /^\s*<\?.*\?>/                               # ignore xml prolog
  63         )
  64 }
  65
  66 sub token_negligible_p {
  67     my ($x) = @_;
  68     my $t = $x->type;
  69     return !$extract_all_p && (
  70           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  71         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  72         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  73         && join(
  74             '',
  75             map {
  76                 my $t = $_->type;
  77                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  78                   : $t == C4::TmplTokenType::TAG()       ? ''
  79                   : token_negligible_p($_)               ? ''
  80                   : '1'
  81             } @{ $x->children }
  82         ) eq ''
  83     );
  84 }
  85
  86 ###############################################################################
  87
  88 sub remember {
  89     my($token, $string) = @_;
  90     # If we determine that the string is negligible, don't bother to remember
  91     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  92         my $key = TmplTokenizer::string_canon( $string );
  93         $text{$key} = [] unless defined $text{$key};
  94         push @{$text{$key}}, $token;
  95     }
  96 }
  97
  98 ###############################################################################
  99
 100 sub string_list {
 101     my @t = keys %text;
 102     # The real gettext tools seems to sort case sensitively; I don't know why
 103     @t = sort { $a cmp $b } @t if $sort eq 's';
 104     @t = sort {
 105             my @aa = sort { $a->pathname cmp $b->pathname
 106                     || $a->line_number <=> $b->line_number } @{$text{$a}};
 107             my @bb = sort { $a->pathname cmp $b->pathname
 108                     || $a->line_number <=> $b->line_number } @{$text{$b}};
 109             $aa[0]->pathname cmp $bb[0]->pathname
 110                     || $aa[0]->line_number <=> $bb[0]->line_number;
 111         } @t if $sort eq 'F';
 112     return @t;
 113 }
 114
 115   ###############################################################################
 116
 117 sub text_extract {
 118     my($h) = @_;
 119     for (;;) {
 120         my $s = TmplTokenizer::next_token($h);
 121         last unless defined $s;
 122         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 123         if ($kind eq C4::TmplTokenType::TEXT) {
 124             if ($t =~ /\S/s && $t !~ /<!/){
 125                 remember( $s, $t );
 126             }
 127         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 128             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 129                 remember( $s, $s->form );
 130             }
 131         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 132             # value [tag=input], meta
 133             my $tag;
 134             $tag = lc($1) if $t =~ /^<(\S+)/s;
 135             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
 136                 if ($attr->{$a}) {
 137                     next if $a eq 'label' && $tag ne 'optgroup';
 138                     next if $a eq 'content' && $tag ne 'meta';
 139                     next if $a eq 'value' && ($tag ne 'input'
 140                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 141                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 142                     $val = TmplTokenizer::trim($val);
 143                     # for selected attributes replace '[%..%]' with '%s' globally
 144                     if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
 145                         $val =~ s/\[\%.*?\%\]/\%s/g;
 146                     }
 147                     # save attribute text for translation
 148                     remember( $s, $val ) if $val =~ /\S/s;
 149                 }
 150             }
 151         } elsif ($s->has_js_data) {
 152             for my $t (@{$s->js_data}) {
 153               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 154             }
 155         }
 156     }
 157 }
 158
 159 ###############################################################################
 160
 161 sub generate_po_file {
 162     # We don't emit the Plural-Forms header; it's meaningless for us
 163     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 164     $pot_charset = TmplTokenizer::charset_canon($pot_charset);
 165     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 166     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 167     my $time_pot = $time;
 168     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 169     print $OUTPUT <<EOF;
 170 # SOME DESCRIPTIVE TITLE.
 171 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 172 # This file is distributed under the same license as the PACKAGE package.
 173 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 174 #
 175 EOF
 176     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 177 #, fuzzy
 178 EOF
 179     print $OUTPUT <<EOF;
 180 msgid ""
 181 msgstr ""
 182 "Project-Id-Version: Koha\\n"
 183 "POT-Creation-Date: $time_pot\\n"
 184 "PO-Revision-Date: $time_po\\n"
 185 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 186 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 187 "MIME-Version: 1.0\\n"
 188 "Content-Type: text/plain; charset=$pot_charset\\n"
 189 "Content-Transfer-Encoding: 8bit\\n"
 190
 191 EOF
 192     my $directory_re = quotemeta("$directory/");
 193
 194     for my $t ( keys %text ) {
 195         my @ordered_tokens = sort {
 196                    $a->pathname cmp $b->pathname
 197                 || $a->line_number cmp $b->line_number
 198         } @{$text{$t}};
 199         my $token = $ordered_tokens[0];
 200
 201         if ( $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED ) {
 202             my $n = 0;
 203             printf $OUTPUT "#. For the first occurrence,\n"
 204                 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
 205             for my $param ( $token->parameters_and_fields ) {
 206                 $n += 1;
 207                 my $type    = $param->type;
 208                 my $subtype = ( $type == C4::TmplTokenType::TAG
 209                         && $param->string =~ /^<input\b/is?
 210                         $param->attributes->{'type'}->[1] : undef );
 211                 my $fmt = TmplTokenizer::_formalize($param);
 212                 $fmt =~ s/^%/%$n\$/;
 213                 if ( $type == C4::TmplTokenType::DIRECTIVE ) {
 214
 215                     #               $type = "Template::Toolkit Directive";
 216                     $type = $param->string =~ /\[%(.*?)%\]/is ? $1 : 'ERROR';
 217                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 218                         $2: undef;
 219                     printf $OUTPUT "#. %s: %s\n", $fmt,
 220                         "$type" . ( defined $name ? " name=$name" : '' );
 221                 } else {
 222                     my $name = $param->attributes->{'name'};
 223                     my $value;
 224                     $value = $param->attributes->{'value'}
 225                         unless $subtype =~ /^(?:text)$/;
 226                     printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 227                         . ( defined $name  ? " name=$name->[1]"   : '' )
 228                         . ( defined $value ? " value=$value->[1]" : '' );
 229                 }
 230             }
 231         } elsif ( $token->type == C4::TmplTokenType::TAG ) {
 232             printf $OUTPUT "#. For the first occurrence,\n"
 233                 if @{ $text{$t} } > 1 && $token->parameters_and_fields > 0;
 234             if ( $token->string =~ /^<meta\b/is ) {
 235                 my $type = $token->attributes->{'http-equiv'}->[1];
 236                 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 237             } elsif ( $token->string =~ /^<([a-z0-9]+)/is ) {
 238                 my $tag  = uc($1);
 239                 my $type = ( lc($tag) eq 'input'?
 240                     $token->attributes->{'type'}: undef );
 241                 my $name = $token->attributes->{'name'};
 242                 printf $OUTPUT "#. %s\n", $tag
 243                    . (defined $type? " type=$type->[1]": '')
 244                    . (defined $name? " name=$name->[1]": '');
 245             }
 246         } elsif ( $token->has_js_data ) {
 247             printf $OUTPUT "#. For the first occurrence,\n" if @{ $text{$t} } > 1;
 248             printf $OUTPUT "#. SCRIPT\n";
 249         }
 250         my $cformat_p;
 251         my $location = {};
 252         for my $token ( @{ $text{$t} } ) {
 253             my $pathname = $token->pathname;
 254             $pathname =~ s/^$directory_re//os;
 255             $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 256             push @{ $location->{$pathname} }, $token->line_number
 257                 if defined $pathname && defined $token->line_number;
 258             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 259         }
 260
 261         for my $pathname ( sort keys %$location ) {
 262             for my $line_number ( @{ $location->{$pathname} } ) {
 263                 printf $OUTPUT "#: %s:%d\n", $pathname, $line_number;
 264             }
 265         }
 266
 267         printf $OUTPUT "#, c-format\n" if $cformat_p;
 268         my $msgid = TmplTokenizer::string_canon( TmplTokenizer::charset_convert( $t, $charset_in, $charset_out ) );
 269         printf $OUTPUT "msgid %s\n", ( defined $msgid && length $msgid ? Locale::PO->quote($msgid) : q{""} );
 270         printf $OUTPUT "msgstr %s\n\n", ( defined $translation{$t} ? Locale::PO->quote( $translation{$t} ) : q{""} );
 271     }
 272 }
 273
 274 ###############################################################################
 275
 276 sub convert_translation_file {
 277     open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
 278     VerboseWarnings::set_input_file_name($convert_from);
 279     while (<$INPUT>) {
 280         chomp;
 281         my($msgid, $msgstr) = split(/\t/);
 282         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 283                 unless defined $msgstr;
 284
 285         # Fixup some of the bad strings
 286         $msgid =~ s/^SELECTED>//;
 287
 288         # Create dummy token
 289         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 290         remember( $token, $msgid );
 291         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 292         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 293
 294         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 295         my $candidate = TmplTokenizer::charset_canon($2);
 296             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 297                     if defined $charset_in && $charset_in ne $candidate;
 298             $charset_in = $candidate;
 299         }
 300         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 301         my $candidate = TmplTokenizer::charset_canon($2);
 302             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 303                     if defined $charset_out && $charset_out ne $candidate;
 304             $charset_out = $candidate;
 305         }
 306     }
 307     # The following assumption is correct; that's what HTML::Template assumes
 308     if (!defined $charset_in) {
 309         $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
 310         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 311     }
 312 }
 313
 314 ###############################################################################
 315
 316 sub usage {
 317     my($exitcode) = @_;
 318     my $h = $exitcode? *STDERR: *STDOUT;
 319     print $h <<EOF;
 320 Usage: $0 [OPTIONS]
 321 Extract translatable strings from given HTML::Template input files.
 322
 323 Input file location:
 324   -f, --files-from=FILE          Get list of input files from FILE
 325   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 326
 327 Output file location:
 328   -o, --output=FILE              Write output to specified file
 329
 330 HTML::Template options:
 331   -a, --extract-all              Extract all strings
 332       --pedantic-warnings        Issue warnings even for detected problems
 333                                  which are likely to be harmless
 334
 335 Output details:
 336   -s, --sort-output              generate sorted output
 337   -F, --sort-by-file             sort output by file location
 338   -v, --verbose                  explain what is being done
 339
 340 Informative output:
 341       --help                     Display this help and exit
 342
 343 Try `perldoc $0' for perhaps more information.
 344 EOF
 345     exit($exitcode);
 346 }
 347
 348 ###############################################################################
 349
 350 sub usage_error {
 351     print STDERR "$_[0]\n" if @_;
 352     print STDERR "Try `$0 --help' for more information.\n";
 353     exit(-1);
 354 }
 355
 356 ###############################################################################
 357
 358 Getopt::Long::config qw( bundling no_auto_abbrev );
 359 GetOptions(
 360     'a|extract-all'                     => \$extract_all_p,
 361     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 362     'convert-from=s'                    => \$convert_from,
 363     'D|directory=s'                     => \$directory,
 364     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 365     'f|files-from=s'                    => \$files_from,
 366     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 367     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 368     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 369     'output|o=s'                        => \$output,
 370     'po-mode'                           => \$po_mode_p,         # INTERNAL
 371     's|sort-output'                     => sub { $sort = 's' },
 372     'F|sort-by-file'                    => sub { $sort = 'F' },
 373     'v|verbose'                         => \$verbose_p,
 374     'help'                              => sub { usage(0) },
 375 ) || usage_error;
 376
 377 VerboseWarnings::set_application_name($0);
 378 VerboseWarnings::set_pedantic_mode($pedantic_p);
 379
 380 usage_error('Missing mandatory option -f')
 381         unless defined $files_from || defined $convert_from;
 382 $directory = '.' unless defined $directory;
 383
 384 usage_error('You cannot specify both --convert-from and --files-from')
 385         if defined $convert_from && defined $files_from;
 386
 387 if (defined $output && $output ne '-') {
 388     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 389     open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
 390 } else {
 391     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 392     open($OUTPUT, q{>}, "&STDOUT");
 393 }
 394
 395 if (defined $files_from) {
 396     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 397     open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
 398     while (<$INPUT>) {
 399         chomp;
 400         my $input = /^\//? $_: "$directory/$_";
 401         my $h = TmplTokenizer->new( $input );
 402         $h->set_allow_cformat( 1 );
 403     VerboseWarnings::set_input_file_name($input);
 404         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 405         text_extract( $h );
 406     }
 407     close $INPUT;
 408 } else {
 409     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 410     convert_translation_file;
 411 }
 412 generate_po_file;
 413
 414 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 415         if TmplTokenizer::syntaxerror_p;
 416
 417
 418 exit(-1) if TmplTokenizer::fatal_p;
 419
 420 ###############################################################################
 421
 422 =head1 DESCRIPTION
 423
 424 This script has behaviour similar to
 425 xgettext(1), and generates gettext-compatible output files.
 426
 427 A gettext-like format provides the following advantages:
 428
 429 =over
 430
 431 =item -
 432
 433 Translation to non-English-like languages with different word
 434 order:  gettext's c-format strings can theoretically be
 435 emulated if we are able to do some analysis on the .tt input
 436 and treat <TMPL_VAR> in a way similar to %s.
 437
 438 =item -
 439
 440 Context for the extracted strings:  the gettext format provides
 441 the filenames and line numbers where each string can be found.
 442 The translator can read the source file and see the context,
 443 in case the string by itself can mean several different things.
 444
 445 =item -
 446
 447 Place for the translator to add comments about the translations.
 448
 449 =item -
 450
 451 Gettext-compatible tools, if any, might be usable if we adopt
 452 the gettext format.
 453
 454 =back
 455
 456 This script has already been in use for over a year and should
 457 be reasonable stable. Nevertheless, it is still somewhat
 458 experimental and there are still some issues.
 459
 460 Please refer to the explanation in tmpl_process3 for further
 461 details.
 462
 463 If you want to generate GNOME-style POTFILES.in files, such
 464 files (passed to -f) can be generated thus:
 465
 466     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 467         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 468     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 469         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 470
 471 This is, however, quite pointless, because the "create" and
 472 "update" actions have already been implemented in tmpl_process3.pl.
 473
 474 =head2 Strings inside JavaScript
 475
 476 In the SCRIPT elements, the script will attempt to scan for
 477 _("I<string literal>") patterns, and extract the I<string literal>
 478 as a translatable string.
 479
 480 Note that the C-like _(...) notation is required.
 481
 482 The JavaScript must actually define a _ function
 483 so that the code remains correct JavaScript.
 484 A suitable definition of such a function can be
 485
 486         function _(s) { return s } // dummy function for gettext
 487
 488 =head1 SEE ALSO
 489
 490 tmpl_process3.pl,
 491 xgettext(1),
 492 Locale::PO(3),
 493 translator_doc.txt
 494
 495 =head1 BUGS
 496
 497 There probably are some. Bugs related to scanning of <INPUT>
 498 tags seem to be especially likely to be present.
 499
 500 Its diagnostics are probably too verbose.
 501
 502 When a <TMPL_VAR> within a JavaScript-related attribute is
 503 detected, the script currently displays no warnings at all.
 504 It might be good to display some kind of warning.
 505
 506 Its sort order (-s option) seems to be different than the real
 507 xgettext(1)'s sort option. This will result in translation
 508 strings inside the generated PO file spuriously moving about
 509 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 510
 511 If a Javascript string has leading spaces, it will
 512 generate strings with spurious leading spaces,
 513 leading to failure to match the strings when actually generating
 514 translated files.
 515
 516 =cut