misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use FindBin;
  10 use lib $FindBin::Bin;
  11
  12 use strict;
  13 use warnings;
  14 use Getopt::Long;
  15 use POSIX;
  16 use Locale::PO;
  17 use TmplTokenizer;
  18 use VerboseWarnings;
  19
  20 use vars qw( $convert_from );
  21 use vars qw( $files_from $directory $output $sort );
  22 use vars qw( $extract_all_p );
  23 use vars qw( $pedantic_p );
  24 use vars qw( %text %translation );
  25 use vars qw( $charset_in $charset_out );
  26 use vars qw( $disable_fuzzy_p );
  27 use vars qw( $verbose_p );
  28 use vars qw( $po_mode_p );
  29
  30 our $OUTPUT;
  31
  32 ###############################################################################
  33
  34 sub string_negligible_p {
  35     my($t) = @_;                                # a string
  36     # Don't emit pure whitespace, pure numbers, pure punctuation,
  37     # single letters, or TMPL_VAR's.
  38     # Punctuation should arguably be translated. But without context
  39     # they are untranslatable. Note that $t is a string, not a token object.
  40     return !$extract_all_p && (
  41                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  42             || $t =~ /^\d+$/                    # purely digits
  43             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  44             || $t =~ /^[A-Za-z]$/               # single letters
  45             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  46         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  47         )
  48 }
  49
  50 sub token_negligible_p {
  51     my ($x) = @_;
  52     my $t = $x->type;
  53     return !$extract_all_p && (
  54           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  55         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  56         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  57         && join(
  58             '',
  59             map {
  60                 my $t = $_->type;
  61                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  62                   : $t == C4::TmplTokenType::TAG()       ? ''
  63                   : token_negligible_p($_)               ? ''
  64                   : '1'
  65             } @{ $x->children }
  66         ) eq ''
  67     );
  68 }
  69
  70 ###############################################################################
  71
  72 sub remember {
  73     my($token, $string) = @_;
  74     # If we determine that the string is negligible, don't bother to remember
  75     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  76         my $key = TmplTokenizer::string_canon( $string );
  77         $text{$key} = [] unless defined $text{$key};
  78         push @{$text{$key}}, $token;
  79     }
  80 }
  81
  82 ###############################################################################
  83
  84 sub string_list {
  85     my @t = keys %text;
  86     # The real gettext tools seems to sort case sensitively; I don't know why
  87     @t = sort { $a cmp $b } @t if $sort eq 's';
  88     @t = sort {
  89             my @aa = sort { $a->pathname cmp $b->pathname
  90                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  91             my @bb = sort { $a->pathname cmp $b->pathname
  92                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  93             $aa[0]->pathname cmp $bb[0]->pathname
  94                     || $aa[0]->line_number <=> $bb[0]->line_number;
  95         } @t if $sort eq 'F';
  96     return @t;
  97 }
  98
  99   ###############################################################################
 100
 101 sub text_extract {
 102     my($h) = @_;
 103     for (;;) {
 104         my $s = TmplTokenizer::next_token $h;
 105         last unless defined $s;
 106         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 107         if ($kind eq C4::TmplTokenType::TEXT) {
 108             if ($t =~ /\S/s && $t !~ /<!/){
 109                 remember( $s, $t );
 110             }
 111         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 112             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 113                 remember( $s, $s->form );
 114             }
 115         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 116             # value [tag=input], meta
 117             my $tag;
 118             $tag = lc($1) if $t =~ /^<(\S+)/s;
 119             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 120                 if ($attr->{$a}) {
 121                     next if $a eq 'label' && $tag ne 'optgroup';
 122                     next if $a eq 'content' && $tag ne 'meta';
 123                     next if $a eq 'value' && ($tag ne 'input'
 124                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 125                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 126                     $val = TmplTokenizer::trim $val;
 127                     # for selected attributes replace '[%..%]' with '%s' globally
 128                     if ( $a =~ /title|value|alt|content|placeholder/ ) {
 129                         $val =~ s/\[\%.*?\%\]/\%s/g;
 130                     }
 131                     # save attribute text for translation
 132                     remember( $s, $val ) if $val =~ /\S/s;
 133                 }
 134             }
 135         } elsif ($s->has_js_data) {
 136             for my $t (@{$s->js_data}) {
 137               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 138             }
 139         }
 140     }
 141 }
 142
 143 ###############################################################################
 144
 145 sub generate_strings_list {
 146     # Emit all extracted strings.
 147     for my $t (string_list) {
 148         printf $OUTPUT "%s\n", $t;
 149     }
 150 }
 151
 152 ###############################################################################
 153
 154 sub generate_po_file {
 155     # We don't emit the Plural-Forms header; it's meaningless for us
 156     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 157     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 158     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 159     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 160     my $time_pot = $time;
 161     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 162     print $OUTPUT <<EOF;
 163 # SOME DESCRIPTIVE TITLE.
 164 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 165 # This file is distributed under the same license as the PACKAGE package.
 166 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 167 #
 168 EOF
 169     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 170 #, fuzzy
 171 EOF
 172     print $OUTPUT <<EOF;
 173 msgid ""
 174 msgstr ""
 175 "Project-Id-Version: PACKAGE VERSION\\n"
 176 "POT-Creation-Date: $time_pot\\n"
 177 "PO-Revision-Date: $time_po\\n"
 178 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 179 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 180 "MIME-Version: 1.0\\n"
 181 "Content-Type: text/plain; charset=$pot_charset\\n"
 182 "Content-Transfer-Encoding: 8bit\\n"
 183
 184 EOF
 185     my $directory_re = quotemeta("$directory/");
 186     for my $t (string_list) {
 187         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 188             my($token, $n) = ($text{$t}->[0], 0);
 189         printf $OUTPUT "#. For the first occurrence,\n"
 190                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 191             for my $param ($token->parameters_and_fields) {
 192                 $n += 1;
 193                 my $type = $param->type;
 194                 my $subtype = ($type == C4::TmplTokenType::TAG
 195                         && $param->string =~ /^<input\b/is?
 196                                 $param->attributes->{'type'}->[1]: undef);
 197                 my $fmt = TmplTokenizer::_formalize( $param );
 198                 $fmt =~ s/^%/%$n\$/;
 199                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 200 #                   $type = "Template::Toolkit Directive";
 201                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 202                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 203                             $2: undef;
 204             printf $OUTPUT "#. %s: %s\n", $fmt,
 205                         "$type" . (defined $name? " name=$name": '');
 206                 } else {
 207                     my $name = $param->attributes->{'name'};
 208             my $value;
 209             $value = $param->attributes->{'value'}
 210                             unless $subtype =~ /^(?:text)$/;
 211             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 212                             . (defined $name?  " name=$name->[1]": '')
 213                             . (defined $value? " value=$value->[1]": '');
 214                 }
 215             }
 216         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 217             my($token) = ($text{$t}->[0]);
 218         printf $OUTPUT "#. For the first occurrence,\n"
 219                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 220             if ($token->string =~ /^<meta\b/is) {
 221                 my $type = $token->attributes->{'http-equiv'}->[1];
 222         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 223             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 224                 my $tag = uc($1);
 225                 my $type = (lc($tag) eq 'input'?
 226                         $token->attributes->{'type'}: undef);
 227                 my $name = $token->attributes->{'name'};
 228         printf $OUTPUT "#. %s\n", $tag
 229                     . (defined $type? " type=$type->[1]": '')
 230                     . (defined $name? " name=$name->[1]": '');
 231             }
 232         } elsif ($text{$t}->[0]->has_js_data) {
 233         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 234         printf $OUTPUT "#. SCRIPT\n";
 235         }
 236         my $cformat_p;
 237         for my $token (@{$text{$t}}) {
 238             my $pathname = $token->pathname;
 239             $pathname =~ s/^$directory_re//os;
 240         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 241         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 242                     if defined $pathname && defined $token->line_number;
 243             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 244         }
 245         printf $OUTPUT "#, c-format\n" if $cformat_p;
 246         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 247                 TmplTokenizer::string_canon
 248                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 249         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 250                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 251     }
 252 }
 253
 254 ###############################################################################
 255
 256 sub convert_translation_file {
 257     open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
 258     VerboseWarnings::set_input_file_name $convert_from;
 259     while (<$INPUT>) {
 260         chomp;
 261         my($msgid, $msgstr) = split(/\t/);
 262         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 263                 unless defined $msgstr;
 264
 265         # Fixup some of the bad strings
 266         $msgid =~ s/^SELECTED>//;
 267
 268         # Create dummy token
 269         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 270         remember( $token, $msgid );
 271         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 272         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 273
 274         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 275             my $candidate = TmplTokenizer::charset_canon $2;
 276             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 277                     if defined $charset_in && $charset_in ne $candidate;
 278             $charset_in = $candidate;
 279         }
 280         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 281             my $candidate = TmplTokenizer::charset_canon $2;
 282             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 283                     if defined $charset_out && $charset_out ne $candidate;
 284             $charset_out = $candidate;
 285         }
 286     }
 287     # The following assumption is correct; that's what HTML::Template assumes
 288     if (!defined $charset_in) {
 289         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 290         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 291     }
 292 }
 293
 294 ###############################################################################
 295
 296 sub usage {
 297     my($exitcode) = @_;
 298     my $h = $exitcode? *STDERR: *STDOUT;
 299     print $h <<EOF;
 300 Usage: $0 [OPTIONS]
 301 Extract translatable strings from given HTML::Template input files.
 302
 303 Input file location:
 304   -f, --files-from=FILE          Get list of input files from FILE
 305   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 306
 307 Output file location:
 308   -o, --output=FILE              Write output to specified file
 309
 310 HTML::Template options:
 311   -a, --extract-all              Extract all strings
 312       --pedantic-warnings        Issue warnings even for detected problems
 313                                  which are likely to be harmless
 314
 315 Output details:
 316   -s, --sort-output              generate sorted output
 317   -F, --sort-by-file             sort output by file location
 318   -v, --verbose                  explain what is being done
 319
 320 Informative output:
 321       --help                     Display this help and exit
 322
 323 Try `perldoc $0' for perhaps more information.
 324 EOF
 325     exit($exitcode);
 326 }
 327
 328 ###############################################################################
 329
 330 sub usage_error {
 331     print STDERR "$_[0]\n" if @_;
 332     print STDERR "Try `$0 --help' for more information.\n";
 333     exit(-1);
 334 }
 335
 336 ###############################################################################
 337
 338 Getopt::Long::config qw( bundling no_auto_abbrev );
 339 GetOptions(
 340     'a|extract-all'                     => \$extract_all_p,
 341     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 342     'convert-from=s'                    => \$convert_from,
 343     'D|directory=s'                     => \$directory,
 344     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 345     'f|files-from=s'                    => \$files_from,
 346     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 347     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 348     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 349     'output|o=s'                        => \$output,
 350     'po-mode'                           => \$po_mode_p,         # INTERNAL
 351     's|sort-output'                     => sub { $sort = 's' },
 352     'F|sort-by-file'                    => sub { $sort = 'F' },
 353     'v|verbose'                         => \$verbose_p,
 354     'help'                              => sub { usage(0) },
 355 ) || usage_error;
 356
 357 VerboseWarnings::set_application_name $0;
 358 VerboseWarnings::set_pedantic_mode $pedantic_p;
 359
 360 usage_error('Missing mandatory option -f')
 361         unless defined $files_from || defined $convert_from;
 362 $directory = '.' unless defined $directory;
 363
 364 usage_error('You cannot specify both --convert-from and --files-from')
 365         if defined $convert_from && defined $files_from;
 366
 367 if (defined $output && $output ne '-') {
 368     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 369         open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
 370 } else {
 371     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 372     open($OUTPUT, ">&STDOUT");
 373 }
 374
 375 if (defined $files_from) {
 376     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 377     open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
 378     while (<$INPUT>) {
 379         chomp;
 380         my $input = /^\//? $_: "$directory/$_";
 381         my $h = TmplTokenizer->new( $input );
 382         $h->set_allow_cformat( 1 );
 383         VerboseWarnings::set_input_file_name $input;
 384         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 385         text_extract( $h );
 386     }
 387     close $INPUT;
 388 } else {
 389     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 390     convert_translation_file;
 391 }
 392 generate_po_file;
 393
 394 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 395         if TmplTokenizer::syntaxerror_p;
 396
 397
 398 exit(-1) if TmplTokenizer::fatal_p;
 399
 400 ###############################################################################
 401
 402 =head1 DESCRIPTION
 403
 404 This script has behaviour similar to
 405 xgettext(1), and generates gettext-compatible output files.
 406
 407 A gettext-like format provides the following advantages:
 408
 409 =over
 410
 411 =item -
 412
 413 Translation to non-English-like languages with different word
 414 order:  gettext's c-format strings can theoretically be
 415 emulated if we are able to do some analysis on the .tt input
 416 and treat <TMPL_VAR> in a way similar to %s.
 417
 418 =item -
 419
 420 Context for the extracted strings:  the gettext format provides
 421 the filenames and line numbers where each string can be found.
 422 The translator can read the source file and see the context,
 423 in case the string by itself can mean several different things.
 424
 425 =item -
 426
 427 Place for the translator to add comments about the translations.
 428
 429 =item -
 430
 431 Gettext-compatible tools, if any, might be usable if we adopt
 432 the gettext format.
 433
 434 =back
 435
 436 This script has already been in use for over a year and should
 437 be reasonable stable. Nevertheless, it is still somewhat
 438 experimental and there are still some issues.
 439
 440 Please refer to the explanation in tmpl_process3 for further
 441 details.
 442
 443 If you want to generate GNOME-style POTFILES.in files, such
 444 files (passed to -f) can be generated thus:
 445
 446     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 447         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 448     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 449         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 450
 451 This is, however, quite pointless, because the "create" and
 452 "update" actions have already been implemented in tmpl_process3.pl.
 453
 454 =head2 Strings inside JavaScript
 455
 456 In the SCRIPT elements, the script will attempt to scan for
 457 _("I<string literal>") patterns, and extract the I<string literal>
 458 as a translatable string.
 459
 460 Note that the C-like _(...) notation is required.
 461
 462 The JavaScript must actually define a _ function
 463 so that the code remains correct JavaScript.
 464 A suitable definition of such a function can be
 465
 466         function _(s) { return s } // dummy function for gettext
 467
 468 =head1 SEE ALSO
 469
 470 tmpl_process3.pl,
 471 xgettext(1),
 472 Locale::PO(3),
 473 translator_doc.txt
 474
 475 =head1 BUGS
 476
 477 There probably are some. Bugs related to scanning of <INPUT>
 478 tags seem to be especially likely to be present.
 479
 480 Its diagnostics are probably too verbose.
 481
 482 When a <TMPL_VAR> within a JavaScript-related attribute is
 483 detected, the script currently displays no warnings at all.
 484 It might be good to display some kind of warning.
 485
 486 Its sort order (-s option) seems to be different than the real
 487 xgettext(1)'s sort option. This will result in translation
 488 strings inside the generated PO file spuriously moving about
 489 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 490
 491 If a Javascript string has leading spaces, it will
 492 generate strings with spurious leading spaces,
 493 leading to failure to match the strings when actually generating
 494 translated files.
 495
 496 =cut