misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use FindBin;
  10 use lib $FindBin::Bin;
  11
  12 use strict;
  13 use warnings;
  14 use Getopt::Long;
  15 use POSIX;
  16 use Locale::PO;
  17 use TmplTokenizer;
  18 use VerboseWarnings;
  19
  20 use vars qw( $convert_from );
  21 use vars qw( $files_from $directory $output $sort );
  22 use vars qw( $extract_all_p );
  23 use vars qw( $pedantic_p );
  24 use vars qw( %text %translation );
  25 use vars qw( $charset_in $charset_out );
  26 use vars qw( $disable_fuzzy_p );
  27 use vars qw( $verbose_p );
  28 use vars qw( $po_mode_p );
  29
  30 our $OUTPUT;
  31
  32 ###############################################################################
  33
  34 sub string_negligible_p {
  35     my($t) = @_;                                # a string
  36     # Don't emit pure whitespace, pure numbers, pure punctuation,
  37     # single letters, or TMPL_VAR's.
  38     # Punctuation should arguably be translated. But without context
  39     # they are untranslatable. Note that $t is a string, not a token object.
  40     return !$extract_all_p && (
  41                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  42             || $t =~ /^\d+$/                    # purely digits
  43             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  44             || $t =~ /^[A-Za-z]$/               # single letters
  45             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  46         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  47         )
  48 }
  49
  50 sub token_negligible_p {
  51     my ($x) = @_;
  52     my $t = $x->type;
  53     return !$extract_all_p && (
  54           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  55         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  56         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  57         && join(
  58             '',
  59             map {
  60                 my $t = $_->type;
  61                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  62                   : $t == C4::TmplTokenType::TAG()       ? ''
  63                   : token_negligible_p($_)               ? ''
  64                   : '1'
  65             } @{ $x->children }
  66         ) eq ''
  67     );
  68 }
  69
  70 ###############################################################################
  71
  72 sub remember {
  73     my($token, $string) = @_;
  74     # If we determine that the string is negligible, don't bother to remember
  75     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  76         my $key = TmplTokenizer::string_canon( $string );
  77         $text{$key} = [] unless defined $text{$key};
  78         push @{$text{$key}}, $token;
  79     }
  80 }
  81
  82 ###############################################################################
  83
  84 sub string_list {
  85     my @t = keys %text;
  86     # The real gettext tools seems to sort case sensitively; I don't know why
  87     @t = sort { $a cmp $b } @t if $sort eq 's';
  88     @t = sort {
  89             my @aa = sort { $a->pathname cmp $b->pathname
  90                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  91             my @bb = sort { $a->pathname cmp $b->pathname
  92                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  93             $aa[0]->pathname cmp $bb[0]->pathname
  94                     || $aa[0]->line_number <=> $bb[0]->line_number;
  95         } @t if $sort eq 'F';
  96     return @t;
  97 }
  98
  99   ###############################################################################
 100
 101 sub text_extract {
 102     my($h) = @_;
 103     for (;;) {
 104         my $s = TmplTokenizer::next_token $h;
 105         last unless defined $s;
 106         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 107         if ($kind eq C4::TmplTokenType::TEXT) {
 108             if ($t =~ /\S/s && $t !~ /<!/){
 109                 remember( $s, $t );
 110             }
 111         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 112             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 113                 remember( $s, $s->form );
 114             }
 115         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 116             # value [tag=input], meta
 117             my $tag;
 118             $tag = lc($1) if $t =~ /^<(\S+)/s;
 119             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 120                 if ($attr->{$a}) {
 121                     next if $a eq 'label' && $tag ne 'optgroup';
 122                     next if $a eq 'content' && $tag ne 'meta';
 123                     next if $a eq 'value' && ($tag ne 'input'
 124                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 125                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 126                     $val = TmplTokenizer::trim $val;
 127                     remember( $s, $val ) if $val =~ /\S/s;
 128                 }
 129             }
 130         } elsif ($s->has_js_data) {
 131             for my $t (@{$s->js_data}) {
 132               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 133             }
 134         }
 135     }
 136 }
 137
 138 ###############################################################################
 139
 140 sub generate_strings_list {
 141     # Emit all extracted strings.
 142     for my $t (string_list) {
 143         printf $OUTPUT "%s\n", $t;
 144     }
 145 }
 146
 147 ###############################################################################
 148
 149 sub generate_po_file {
 150     # We don't emit the Plural-Forms header; it's meaningless for us
 151     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 152     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 153     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 154     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 155     my $time_pot = $time;
 156     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 157     print $OUTPUT <<EOF;
 158 # SOME DESCRIPTIVE TITLE.
 159 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 160 # This file is distributed under the same license as the PACKAGE package.
 161 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 162 #
 163 EOF
 164     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 165 #, fuzzy
 166 EOF
 167     print $OUTPUT <<EOF;
 168 msgid ""
 169 msgstr ""
 170 "Project-Id-Version: PACKAGE VERSION\\n"
 171 "POT-Creation-Date: $time_pot\\n"
 172 "PO-Revision-Date: $time_po\\n"
 173 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 174 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 175 "MIME-Version: 1.0\\n"
 176 "Content-Type: text/plain; charset=$pot_charset\\n"
 177 "Content-Transfer-Encoding: 8bit\\n"
 178
 179 EOF
 180     my $directory_re = quotemeta("$directory/");
 181     for my $t (string_list) {
 182         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 183             my($token, $n) = ($text{$t}->[0], 0);
 184         printf $OUTPUT "#. For the first occurrence,\n"
 185                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 186             for my $param ($token->parameters_and_fields) {
 187                 $n += 1;
 188                 my $type = $param->type;
 189                 my $subtype = ($type == C4::TmplTokenType::TAG
 190                         && $param->string =~ /^<input\b/is?
 191                                 $param->attributes->{'type'}->[1]: undef);
 192                 my $fmt = TmplTokenizer::_formalize( $param );
 193                 $fmt =~ s/^%/%$n\$/;
 194                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 195 #                   $type = "Template::Toolkit Directive";
 196                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 197                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 198                             $2: undef;
 199             printf $OUTPUT "#. %s: %s\n", $fmt,
 200                         "$type" . (defined $name? " name=$name": '');
 201                 } else {
 202                     my $name = $param->attributes->{'name'};
 203             my $value;
 204             $value = $param->attributes->{'value'}
 205                             unless $subtype =~ /^(?:text)$/;
 206             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 207                             . (defined $name?  " name=$name->[1]": '')
 208                             . (defined $value? " value=$value->[1]": '');
 209                 }
 210             }
 211         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 212             my($token) = ($text{$t}->[0]);
 213         printf $OUTPUT "#. For the first occurrence,\n"
 214                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 215             if ($token->string =~ /^<meta\b/is) {
 216                 my $type = $token->attributes->{'http-equiv'}->[1];
 217         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 218             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 219                 my $tag = uc($1);
 220                 my $type = (lc($tag) eq 'input'?
 221                         $token->attributes->{'type'}: undef);
 222                 my $name = $token->attributes->{'name'};
 223         printf $OUTPUT "#. %s\n", $tag
 224                     . (defined $type? " type=$type->[1]": '')
 225                     . (defined $name? " name=$name->[1]": '');
 226             }
 227         } elsif ($text{$t}->[0]->has_js_data) {
 228         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 229         printf $OUTPUT "#. SCRIPT\n";
 230         }
 231         my $cformat_p;
 232         for my $token (@{$text{$t}}) {
 233             my $pathname = $token->pathname;
 234             $pathname =~ s/^$directory_re//os;
 235         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 236         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 237                     if defined $pathname && defined $token->line_number;
 238             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 239         }
 240         printf $OUTPUT "#, c-format\n" if $cformat_p;
 241         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 242                 TmplTokenizer::string_canon
 243                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 244         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 245                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 246     }
 247 }
 248
 249 ###############################################################################
 250
 251 sub convert_translation_file {
 252     open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
 253     VerboseWarnings::set_input_file_name $convert_from;
 254     while (<$INPUT>) {
 255         chomp;
 256         my($msgid, $msgstr) = split(/\t/);
 257         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 258                 unless defined $msgstr;
 259
 260         # Fixup some of the bad strings
 261         $msgid =~ s/^SELECTED>//;
 262
 263         # Create dummy token
 264         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 265         remember( $token, $msgid );
 266         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 267         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 268
 269         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 270             my $candidate = TmplTokenizer::charset_canon $2;
 271             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 272                     if defined $charset_in && $charset_in ne $candidate;
 273             $charset_in = $candidate;
 274         }
 275         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 276             my $candidate = TmplTokenizer::charset_canon $2;
 277             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 278                     if defined $charset_out && $charset_out ne $candidate;
 279             $charset_out = $candidate;
 280         }
 281     }
 282     # The following assumption is correct; that's what HTML::Template assumes
 283     if (!defined $charset_in) {
 284         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 285         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 286     }
 287 }
 288
 289 ###############################################################################
 290
 291 sub usage {
 292     my($exitcode) = @_;
 293     my $h = $exitcode? *STDERR: *STDOUT;
 294     print $h <<EOF;
 295 Usage: $0 [OPTIONS]
 296 Extract translatable strings from given HTML::Template input files.
 297
 298 Input file location:
 299   -f, --files-from=FILE          Get list of input files from FILE
 300   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 301
 302 Output file location:
 303   -o, --output=FILE              Write output to specified file
 304
 305 HTML::Template options:
 306   -a, --extract-all              Extract all strings
 307       --pedantic-warnings        Issue warnings even for detected problems
 308                                  which are likely to be harmless
 309
 310 Output details:
 311   -s, --sort-output              generate sorted output
 312   -F, --sort-by-file             sort output by file location
 313   -v, --verbose                  explain what is being done
 314
 315 Informative output:
 316       --help                     Display this help and exit
 317
 318 Try `perldoc $0' for perhaps more information.
 319 EOF
 320     exit($exitcode);
 321 }
 322
 323 ###############################################################################
 324
 325 sub usage_error {
 326     print STDERR "$_[0]\n" if @_;
 327     print STDERR "Try `$0 --help' for more information.\n";
 328     exit(-1);
 329 }
 330
 331 ###############################################################################
 332
 333 Getopt::Long::config qw( bundling no_auto_abbrev );
 334 GetOptions(
 335     'a|extract-all'                     => \$extract_all_p,
 336     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 337     'convert-from=s'                    => \$convert_from,
 338     'D|directory=s'                     => \$directory,
 339     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 340     'f|files-from=s'                    => \$files_from,
 341     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 342     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 343     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 344     'output|o=s'                        => \$output,
 345     'po-mode'                           => \$po_mode_p,         # INTERNAL
 346     's|sort-output'                     => sub { $sort = 's' },
 347     'F|sort-by-file'                    => sub { $sort = 'F' },
 348     'v|verbose'                         => \$verbose_p,
 349     'help'                              => sub { usage(0) },
 350 ) || usage_error;
 351
 352 VerboseWarnings::set_application_name $0;
 353 VerboseWarnings::set_pedantic_mode $pedantic_p;
 354
 355 usage_error('Missing mandatory option -f')
 356         unless defined $files_from || defined $convert_from;
 357 $directory = '.' unless defined $directory;
 358
 359 usage_error('You cannot specify both --convert-from and --files-from')
 360         if defined $convert_from && defined $files_from;
 361
 362 if (defined $output && $output ne '-') {
 363     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 364         open($OUTPUT, '>', $output) || die "$output: $!\n";
 365 } else {
 366     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 367     open($OUTPUT, ">&STDOUT");
 368 }
 369
 370 if (defined $files_from) {
 371     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 372     open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
 373     while (<$INPUT>) {
 374         chomp;
 375         my $input = /^\//? $_: "$directory/$_";
 376         my $h = TmplTokenizer->new( $input );
 377         $h->set_allow_cformat( 1 );
 378         VerboseWarnings::set_input_file_name $input;
 379         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 380         text_extract( $h );
 381     }
 382     close $INPUT;
 383 } else {
 384     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 385     convert_translation_file;
 386 }
 387 generate_po_file;
 388
 389 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 390         if TmplTokenizer::syntaxerror_p;
 391
 392
 393 exit(-1) if TmplTokenizer::fatal_p;
 394
 395 ###############################################################################
 396
 397 =head1 DESCRIPTION
 398
 399 This is an experimental script based on the modularized
 400 text-extract2.pl script.  It has behaviour similar to
 401 xgettext(1), and generates gettext-compatible output files.
 402
 403 A gettext-like format provides the following advantages:
 404
 405 =over
 406
 407 =item -
 408
 409 Translation to non-English-like languages with different word
 410 order:  gettext's c-format strings can theoretically be
 411 emulated if we are able to do some analysis on the .tt input
 412 and treat <TMPL_VAR> in a way similar to %s.
 413
 414 =item -
 415
 416 Context for the extracted strings:  the gettext format provides
 417 the filenames and line numbers where each string can be found.
 418 The translator can read the source file and see the context,
 419 in case the string by itself can mean several different things.
 420
 421 =item -
 422
 423 Place for the translator to add comments about the translations.
 424
 425 =item -
 426
 427 Gettext-compatible tools, if any, might be usable if we adopt
 428 the gettext format.
 429
 430 =back
 431
 432 This script has already been in use for over a year and should
 433 be reasonable stable. Nevertheless, it is still somewhat
 434 experimental and there are still some issues.
 435
 436 Please refer to the explanation in tmpl_process3 for further
 437 details.
 438
 439 If you want to generate GNOME-style POTFILES.in files, such
 440 files (passed to -f) can be generated thus:
 441
 442     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 443         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 444     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 445         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 446
 447 This is, however, quite pointless, because the "create" and
 448 "update" actions have already been implemented in tmpl_process3.pl.
 449
 450 =head2 Strings inside JavaScript
 451
 452 In the SCRIPT elements, the script will attempt to scan for
 453 _("I<string literal>") patterns, and extract the I<string literal>
 454 as a translatable string.
 455
 456 Note that the C-like _(...) notation is required.
 457
 458 The JavaScript must actually define a _ function
 459 so that the code remains correct JavaScript.
 460 A suitable definition of such a function can be
 461
 462         function _(s) { return s } // dummy function for gettext
 463
 464 =head1 SEE ALSO
 465
 466 tmpl_process3.pl,
 467 xgettext(1),
 468 Locale::PO(3),
 469 translator_doc.txt
 470
 471 =head1 BUGS
 472
 473 There probably are some. Bugs related to scanning of <INPUT>
 474 tags seem to be especially likely to be present.
 475
 476 Its diagnostics are probably too verbose.
 477
 478 When a <TMPL_VAR> within a JavaScript-related attribute is
 479 detected, the script currently displays no warnings at all.
 480 It might be good to display some kind of warning.
 481
 482 Its sort order (-s option) seems to be different than the real
 483 xgettext(1)'s sort option. This will result in translation
 484 strings inside the generated PO file spuriously moving about
 485 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 486
 487 If a Javascript string has leading spaces, it will
 488 generate strings with spurious leading spaces,
 489 leading to failure to match the strings when actually generating
 490 translated files.
 491
 492 =cut