misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 # This file is part of Koha.
   4 #
   5 # Koha is free software; you can redistribute it and/or modify it
   6 # under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 3 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # Koha is distributed in the hope that it will be useful, but
  11 # WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  17
  18 =head1 NAME
  19
  20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
  21
  22 =cut
  23
  24 use FindBin;
  25 use lib $FindBin::Bin;
  26
  27 use strict;
  28 use warnings;
  29 use Getopt::Long qw( GetOptions );
  30 use POSIX;
  31 use Locale::PO;
  32 use TmplTokenizer;
  33 use VerboseWarnings;
  34
  35 use vars qw( $convert_from );
  36 use vars qw( $files_from $directory $output $sort );
  37 use vars qw( $extract_all_p );
  38 use vars qw( $pedantic_p );
  39 use vars qw( %text %translation );
  40 use vars qw( $charset_in $charset_out );
  41 use vars qw( $disable_fuzzy_p );
  42 use vars qw( $verbose_p );
  43 use vars qw( $po_mode_p );
  44
  45 our $OUTPUT;
  46
  47 ###############################################################################
  48
  49 sub string_negligible_p {
  50     my($t) = @_;                                # a string
  51     # Don't emit pure whitespace, pure numbers, pure punctuation,
  52     # single letters, or TMPL_VAR's.
  53     # Punctuation should arguably be translated. But without context
  54     # they are untranslatable. Note that $t is a string, not a token object.
  55     return !$extract_all_p && (
  56                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  57             || $t =~ /^\d+$/                    # purely digits
  58             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  59             || $t =~ /^[A-Za-z]$/               # single letters
  60             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  61         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  62         || $t =~ /^\s*<\?.*\?>/                               # ignore xml prolog
  63         )
  64 }
  65
  66 sub token_negligible_p {
  67     my ($x) = @_;
  68     my $t = $x->type;
  69     return !$extract_all_p && (
  70           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  71         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  72         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  73         && join(
  74             '',
  75             map {
  76                 my $t = $_->type;
  77                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  78                   : $t == C4::TmplTokenType::TAG()       ? ''
  79                   : token_negligible_p($_)               ? ''
  80                   : '1'
  81             } @{ $x->children }
  82         ) eq ''
  83     );
  84 }
  85
  86 ###############################################################################
  87
  88 sub remember {
  89     my($token, $string) = @_;
  90     # If we determine that the string is negligible, don't bother to remember
  91     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  92         my $key = TmplTokenizer::string_canon( $string );
  93         $text{$key} = [] unless defined $text{$key};
  94         push @{$text{$key}}, $token;
  95     }
  96 }
  97
  98 ###############################################################################
  99
 100 sub string_list {
 101     my @t = keys %text;
 102     # The real gettext tools seems to sort case sensitively; I don't know why
 103     @t = sort { $a cmp $b } @t if $sort eq 's';
 104     @t = sort {
 105             my @aa = sort { $a->pathname cmp $b->pathname
 106                     || $a->line_number <=> $b->line_number } @{$text{$a}};
 107             my @bb = sort { $a->pathname cmp $b->pathname
 108                     || $a->line_number <=> $b->line_number } @{$text{$b}};
 109             $aa[0]->pathname cmp $bb[0]->pathname
 110                     || $aa[0]->line_number <=> $bb[0]->line_number;
 111         } @t if $sort eq 'F';
 112     return @t;
 113 }
 114
 115   ###############################################################################
 116
 117 sub text_extract {
 118     my($h) = @_;
 119     for (;;) {
 120         my $s = TmplTokenizer::next_token($h);
 121         last unless defined $s;
 122         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 123         if ($kind eq C4::TmplTokenType::TEXT) {
 124             if ($t =~ /\S/s && $t !~ /<!/){
 125                 remember( $s, $t );
 126             }
 127         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 128             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 129                 remember( $s, $s->form );
 130             }
 131         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 132             # value [tag=input], meta
 133             my $tag;
 134             $tag = lc($1) if $t =~ /^<(\S+)/s;
 135             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
 136                 if ($attr->{$a}) {
 137                     next if $a eq 'label' && $tag ne 'optgroup';
 138                     next if $a eq 'content' && $tag ne 'meta';
 139                     next if $a eq 'value' && ($tag ne 'input'
 140                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 141                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 142                     $val = TmplTokenizer::trim($val);
 143                     # for selected attributes replace '[%..%]' with '%s' globally
 144                     if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
 145                         $val =~ s/\[\%.*?\%\]/\%s/g;
 146                     }
 147                     # save attribute text for translation
 148                     remember( $s, $val ) if $val =~ /\S/s;
 149                 }
 150             }
 151         } elsif ($s->has_js_data) {
 152             for my $t (@{$s->js_data}) {
 153               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 154             }
 155         }
 156     }
 157 }
 158
 159 ###############################################################################
 160
 161 sub generate_po_file {
 162     # We don't emit the Plural-Forms header; it's meaningless for us
 163     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 164     $pot_charset = TmplTokenizer::charset_canon($pot_charset);
 165     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 166     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 167     my $time_pot = $time;
 168     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 169     print $OUTPUT <<EOF;
 170 # SOME DESCRIPTIVE TITLE.
 171 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 172 # This file is distributed under the same license as the PACKAGE package.
 173 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 174 #
 175 EOF
 176     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 177 #, fuzzy
 178 EOF
 179     print $OUTPUT <<EOF;
 180 msgid ""
 181 msgstr ""
 182 "Project-Id-Version: Koha\\n"
 183 "POT-Creation-Date: $time_pot\\n"
 184 "PO-Revision-Date: $time_po\\n"
 185 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 186 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 187 "MIME-Version: 1.0\\n"
 188 "Content-Type: text/plain; charset=$pot_charset\\n"
 189 "Content-Transfer-Encoding: 8bit\\n"
 190
 191 EOF
 192     my $directory_re = quotemeta("$directory/");
 193     for my $t (string_list) {
 194         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 195             my($token, $n) = ($text{$t}->[0], 0);
 196         printf $OUTPUT "#. For the first occurrence,\n"
 197                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 198             for my $param ($token->parameters_and_fields) {
 199                 $n += 1;
 200                 my $type = $param->type;
 201                 my $subtype = ($type == C4::TmplTokenType::TAG
 202                         && $param->string =~ /^<input\b/is?
 203                                 $param->attributes->{'type'}->[1]: undef);
 204                 my $fmt = TmplTokenizer::_formalize( $param );
 205                 $fmt =~ s/^%/%$n\$/;
 206                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 207 #                   $type = "Template::Toolkit Directive";
 208                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 209                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 210                             $2: undef;
 211             printf $OUTPUT "#. %s: %s\n", $fmt,
 212                         "$type" . (defined $name? " name=$name": '');
 213                 } else {
 214                     my $name = $param->attributes->{'name'};
 215             my $value;
 216             $value = $param->attributes->{'value'}
 217                             unless $subtype =~ /^(?:text)$/;
 218             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 219                             . (defined $name?  " name=$name->[1]": '')
 220                             . (defined $value? " value=$value->[1]": '');
 221                 }
 222             }
 223         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 224             my($token) = ($text{$t}->[0]);
 225         printf $OUTPUT "#. For the first occurrence,\n"
 226                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 227             if ($token->string =~ /^<meta\b/is) {
 228                 my $type = $token->attributes->{'http-equiv'}->[1];
 229         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 230             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 231                 my $tag = uc($1);
 232                 my $type = (lc($tag) eq 'input'?
 233                         $token->attributes->{'type'}: undef);
 234                 my $name = $token->attributes->{'name'};
 235         printf $OUTPUT "#. %s\n", $tag
 236                     . (defined $type? " type=$type->[1]": '')
 237                     . (defined $name? " name=$name->[1]": '');
 238             }
 239         } elsif ($text{$t}->[0]->has_js_data) {
 240         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 241         printf $OUTPUT "#. SCRIPT\n";
 242         }
 243         my $cformat_p;
 244         for my $token (@{$text{$t}}) {
 245             my $pathname = $token->pathname;
 246             $pathname =~ s/^$directory_re//os;
 247         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 248         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 249                     if defined $pathname && defined $token->line_number;
 250             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 251         }
 252         printf $OUTPUT "#, c-format\n" if $cformat_p;
 253         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
 254             TmplTokenizer::string_canon(
 255                 TmplTokenizer::charset_convert($t, $charset_in, $charset_out)
 256             )
 257         );
 258         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 259                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 260     }
 261 }
 262
 263 ###############################################################################
 264
 265 sub convert_translation_file {
 266     open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
 267     VerboseWarnings::set_input_file_name($convert_from);
 268     while (<$INPUT>) {
 269         chomp;
 270         my($msgid, $msgstr) = split(/\t/);
 271         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 272                 unless defined $msgstr;
 273
 274         # Fixup some of the bad strings
 275         $msgid =~ s/^SELECTED>//;
 276
 277         # Create dummy token
 278         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 279         remember( $token, $msgid );
 280         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 281         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 282
 283         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 284         my $candidate = TmplTokenizer::charset_canon($2);
 285             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 286                     if defined $charset_in && $charset_in ne $candidate;
 287             $charset_in = $candidate;
 288         }
 289         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 290         my $candidate = TmplTokenizer::charset_canon($2);
 291             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 292                     if defined $charset_out && $charset_out ne $candidate;
 293             $charset_out = $candidate;
 294         }
 295     }
 296     # The following assumption is correct; that's what HTML::Template assumes
 297     if (!defined $charset_in) {
 298         $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
 299         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 300     }
 301 }
 302
 303 ###############################################################################
 304
 305 sub usage {
 306     my($exitcode) = @_;
 307     my $h = $exitcode? *STDERR: *STDOUT;
 308     print $h <<EOF;
 309 Usage: $0 [OPTIONS]
 310 Extract translatable strings from given HTML::Template input files.
 311
 312 Input file location:
 313   -f, --files-from=FILE          Get list of input files from FILE
 314   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 315
 316 Output file location:
 317   -o, --output=FILE              Write output to specified file
 318
 319 HTML::Template options:
 320   -a, --extract-all              Extract all strings
 321       --pedantic-warnings        Issue warnings even for detected problems
 322                                  which are likely to be harmless
 323
 324 Output details:
 325   -s, --sort-output              generate sorted output
 326   -F, --sort-by-file             sort output by file location
 327   -v, --verbose                  explain what is being done
 328
 329 Informative output:
 330       --help                     Display this help and exit
 331
 332 Try `perldoc $0' for perhaps more information.
 333 EOF
 334     exit($exitcode);
 335 }
 336
 337 ###############################################################################
 338
 339 sub usage_error {
 340     print STDERR "$_[0]\n" if @_;
 341     print STDERR "Try `$0 --help' for more information.\n";
 342     exit(-1);
 343 }
 344
 345 ###############################################################################
 346
 347 Getopt::Long::config qw( bundling no_auto_abbrev );
 348 GetOptions(
 349     'a|extract-all'                     => \$extract_all_p,
 350     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 351     'convert-from=s'                    => \$convert_from,
 352     'D|directory=s'                     => \$directory,
 353     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 354     'f|files-from=s'                    => \$files_from,
 355     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 356     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 357     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 358     'output|o=s'                        => \$output,
 359     'po-mode'                           => \$po_mode_p,         # INTERNAL
 360     's|sort-output'                     => sub { $sort = 's' },
 361     'F|sort-by-file'                    => sub { $sort = 'F' },
 362     'v|verbose'                         => \$verbose_p,
 363     'help'                              => sub { usage(0) },
 364 ) || usage_error;
 365
 366 VerboseWarnings::set_application_name($0);
 367 VerboseWarnings::set_pedantic_mode($pedantic_p);
 368
 369 usage_error('Missing mandatory option -f')
 370         unless defined $files_from || defined $convert_from;
 371 $directory = '.' unless defined $directory;
 372
 373 usage_error('You cannot specify both --convert-from and --files-from')
 374         if defined $convert_from && defined $files_from;
 375
 376 if (defined $output && $output ne '-') {
 377     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 378     open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
 379 } else {
 380     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 381     open($OUTPUT, q{>}, "&STDOUT");
 382 }
 383
 384 if (defined $files_from) {
 385     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 386     open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
 387     while (<$INPUT>) {
 388         chomp;
 389         my $input = /^\//? $_: "$directory/$_";
 390         my $h = TmplTokenizer->new( $input );
 391         $h->set_allow_cformat( 1 );
 392     VerboseWarnings::set_input_file_name($input);
 393         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 394         text_extract( $h );
 395     }
 396     close $INPUT;
 397 } else {
 398     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 399     convert_translation_file;
 400 }
 401 generate_po_file;
 402
 403 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 404         if TmplTokenizer::syntaxerror_p;
 405
 406
 407 exit(-1) if TmplTokenizer::fatal_p;
 408
 409 ###############################################################################
 410
 411 =head1 DESCRIPTION
 412
 413 This script has behaviour similar to
 414 xgettext(1), and generates gettext-compatible output files.
 415
 416 A gettext-like format provides the following advantages:
 417
 418 =over
 419
 420 =item -
 421
 422 Translation to non-English-like languages with different word
 423 order:  gettext's c-format strings can theoretically be
 424 emulated if we are able to do some analysis on the .tt input
 425 and treat <TMPL_VAR> in a way similar to %s.
 426
 427 =item -
 428
 429 Context for the extracted strings:  the gettext format provides
 430 the filenames and line numbers where each string can be found.
 431 The translator can read the source file and see the context,
 432 in case the string by itself can mean several different things.
 433
 434 =item -
 435
 436 Place for the translator to add comments about the translations.
 437
 438 =item -
 439
 440 Gettext-compatible tools, if any, might be usable if we adopt
 441 the gettext format.
 442
 443 =back
 444
 445 This script has already been in use for over a year and should
 446 be reasonable stable. Nevertheless, it is still somewhat
 447 experimental and there are still some issues.
 448
 449 Please refer to the explanation in tmpl_process3 for further
 450 details.
 451
 452 If you want to generate GNOME-style POTFILES.in files, such
 453 files (passed to -f) can be generated thus:
 454
 455     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 456         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 457     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 458         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 459
 460 This is, however, quite pointless, because the "create" and
 461 "update" actions have already been implemented in tmpl_process3.pl.
 462
 463 =head2 Strings inside JavaScript
 464
 465 In the SCRIPT elements, the script will attempt to scan for
 466 _("I<string literal>") patterns, and extract the I<string literal>
 467 as a translatable string.
 468
 469 Note that the C-like _(...) notation is required.
 470
 471 The JavaScript must actually define a _ function
 472 so that the code remains correct JavaScript.
 473 A suitable definition of such a function can be
 474
 475         function _(s) { return s } // dummy function for gettext
 476
 477 =head1 SEE ALSO
 478
 479 tmpl_process3.pl,
 480 xgettext(1),
 481 Locale::PO(3),
 482 translator_doc.txt
 483
 484 =head1 BUGS
 485
 486 There probably are some. Bugs related to scanning of <INPUT>
 487 tags seem to be especially likely to be present.
 488
 489 Its diagnostics are probably too verbose.
 490
 491 When a <TMPL_VAR> within a JavaScript-related attribute is
 492 detected, the script currently displays no warnings at all.
 493 It might be good to display some kind of warning.
 494
 495 Its sort order (-s option) seems to be different than the real
 496 xgettext(1)'s sort option. This will result in translation
 497 strings inside the generated PO file spuriously moving about
 498 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 499
 500 If a Javascript string has leading spaces, it will
 501 generate strings with spurious leading spaces,
 502 leading to failure to match the strings when actually generating
 503 translated files.
 504
 505 =cut