misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 our $OUTPUT;
  28
  29 ###############################################################################
  30
  31 sub string_negligible_p {
  32     my($t) = @_;                                # a string
  33     # Don't emit pure whitespace, pure numbers, pure punctuation,
  34     # single letters, or TMPL_VAR's.
  35     # Punctuation should arguably be translated. But without context
  36     # they are untranslatable. Note that $t is a string, not a token object.
  37     return !$extract_all_p && (
  38                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  39             || $t =~ /^\d+$/                    # purely digits
  40             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  41             || $t =~ /^[A-Za-z]$/               # single letters
  42             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  43         )
  44 }
  45
  46 sub token_negligible_p {
  47     my($x) = @_;
  48     my $t = $x->type;
  49     return !$extract_all_p && (
  50             $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
  51             $t == C4::TmplTokenType::DIRECTIVE? 1:
  52             $t == C4::TmplTokenType::TEXT_PARAMETRIZED
  53                 && join( '', map { my $t = $_->type;
  54                         $t == C4::TmplTokenType::DIRECTIVE?
  55                                 '1': $t == C4::TmplTokenType::TAG?
  56                                         '': token_negligible_p( $_ )?
  57                                         '': '1' } @{$x->children} ) eq '' );
  58 }
  59
  60 ###############################################################################
  61
  62 sub remember {
  63     my($token, $string) = @_;
  64     # If we determine that the string is negligible, don't bother to remember
  65     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  66         my $key = TmplTokenizer::string_canon( $string );
  67         $text{$key} = [] unless defined $text{$key};
  68         push @{$text{$key}}, $token;
  69     }
  70 }
  71
  72 ###############################################################################
  73
  74 sub string_list {
  75     my @t = keys %text;
  76     # The real gettext tools seems to sort case sensitively; I don't know why
  77     @t = sort { $a cmp $b } @t if $sort eq 's';
  78     @t = sort {
  79             my @aa = sort { $a->pathname cmp $b->pathname
  80                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  81             my @bb = sort { $a->pathname cmp $b->pathname
  82                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  83             $aa[0]->pathname cmp $bb[0]->pathname
  84                     || $aa[0]->line_number <=> $bb[0]->line_number;
  85         } @t if $sort eq 'F';
  86     return @t;
  87 }
  88
  89   ###############################################################################
  90
  91 sub text_extract {
  92     my($h) = @_;
  93     for (;;) {
  94         my $s = TmplTokenizer::next_token $h;
  95         last unless defined $s;
  96         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  97         if ($kind eq C4::TmplTokenType::TEXT) {
  98             if ($t =~ /\S/s && $t !~ /<!/){
  99                 remember( $s, $t );
 100             }
 101         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 102             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 103                 remember( $s, $s->form );
 104             }
 105         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 106             # value [tag=input], meta
 107             my $tag;
 108             $tag = lc($1) if $t =~ /^<(\S+)/s;
 109             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 110                 if ($attr->{$a}) {
 111                     next if $a eq 'label' && $tag ne 'optgroup';
 112                     next if $a eq 'content' && $tag ne 'meta';
 113                     next if $a eq 'value' && ($tag ne 'input'
 114                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 115                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 116                     $val = TmplTokenizer::trim $val;
 117                     remember( $s, $val ) if $val =~ /\S/s;
 118                 }
 119             }
 120         } elsif ($s->has_js_data) {
 121             for my $t (@{$s->js_data}) {
 122               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 123             }
 124         }
 125     }
 126 }
 127
 128 ###############################################################################
 129
 130 sub generate_strings_list {
 131     # Emit all extracted strings.
 132     for my $t (string_list) {
 133         printf $OUTPUT "%s\n", $t;
 134     }
 135 }
 136
 137 ###############################################################################
 138
 139 sub generate_po_file {
 140     # We don't emit the Plural-Forms header; it's meaningless for us
 141     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 142     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 143     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 144     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 145     my $time_pot = $time;
 146     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 147     print $OUTPUT <<EOF;
 148 # SOME DESCRIPTIVE TITLE.
 149 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 150 # This file is distributed under the same license as the PACKAGE package.
 151 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 152 #
 153 EOF
 154     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 155 #, fuzzy
 156 EOF
 157     print $OUTPUT <<EOF;
 158 msgid ""
 159 msgstr ""
 160 "Project-Id-Version: PACKAGE VERSION\\n"
 161 "POT-Creation-Date: $time_pot\\n"
 162 "PO-Revision-Date: $time_po\\n"
 163 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 164 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 165 "MIME-Version: 1.0\\n"
 166 "Content-Type: text/plain; charset=$pot_charset\\n"
 167 "Content-Transfer-Encoding: 8bit\\n"
 168
 169 EOF
 170     my $directory_re = quotemeta("$directory/");
 171     for my $t (string_list) {
 172         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 173             my($token, $n) = ($text{$t}->[0], 0);
 174         printf $OUTPUT "#. For the first occurrence,\n"
 175                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 176             for my $param ($token->parameters_and_fields) {
 177                 $n += 1;
 178                 my $type = $param->type;
 179                 my $subtype = ($type == C4::TmplTokenType::TAG
 180                         && $param->string =~ /^<input\b/is?
 181                                 $param->attributes->{'type'}->[1]: undef);
 182                 my $fmt = TmplTokenizer::_formalize( $param );
 183                 $fmt =~ s/^%/%$n\$/;
 184                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 185 #                   $type = "Template::Toolkit Directive";
 186                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 187                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 188                             $2: undef;
 189             printf $OUTPUT "#. %s: %s\n", $fmt,
 190                         "$type" . (defined $name? " name=$name": '');
 191                 } else {
 192                     my $name = $param->attributes->{'name'};
 193             my $value;
 194             $value = $param->attributes->{'value'}
 195                             unless $subtype =~ /^(?:text)$/;
 196             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 197                             . (defined $name?  " name=$name->[1]": '')
 198                             . (defined $value? " value=$value->[1]": '');
 199                 }
 200             }
 201         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 202             my($token) = ($text{$t}->[0]);
 203         printf $OUTPUT "#. For the first occurrence,\n"
 204                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 205             if ($token->string =~ /^<meta\b/is) {
 206                 my $type = $token->attributes->{'http-equiv'}->[1];
 207         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 208             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 209                 my $tag = uc($1);
 210                 my $type = (lc($tag) eq 'input'?
 211                         $token->attributes->{'type'}: undef);
 212                 my $name = $token->attributes->{'name'};
 213         printf $OUTPUT "#. %s\n", $tag
 214                     . (defined $type? " type=$type->[1]": '')
 215                     . (defined $name? " name=$name->[1]": '');
 216             }
 217         } elsif ($text{$t}->[0]->has_js_data) {
 218         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 219         printf $OUTPUT "#. SCRIPT\n";
 220         }
 221         my $cformat_p;
 222         for my $token (@{$text{$t}}) {
 223             my $pathname = $token->pathname;
 224             $pathname =~ s/^$directory_re//os;
 225         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 226         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 227                     if defined $pathname && defined $token->line_number;
 228             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 229         }
 230         printf $OUTPUT "#, c-format\n" if $cformat_p;
 231         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 232                 TmplTokenizer::string_canon
 233                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 234         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 235                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 236     }
 237 }
 238
 239 ###############################################################################
 240
 241 sub convert_translation_file {
 242     open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
 243     VerboseWarnings::set_input_file_name $convert_from;
 244     while (<$INPUT>) {
 245         chomp;
 246         my($msgid, $msgstr) = split(/\t/);
 247         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 248                 unless defined $msgstr;
 249
 250         # Fixup some of the bad strings
 251         $msgid =~ s/^SELECTED>//;
 252
 253         # Create dummy token
 254         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 255         remember( $token, $msgid );
 256         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 257         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 258
 259         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 260             my $candidate = TmplTokenizer::charset_canon $2;
 261             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 262                     if defined $charset_in && $charset_in ne $candidate;
 263             $charset_in = $candidate;
 264         }
 265         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 266             my $candidate = TmplTokenizer::charset_canon $2;
 267             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 268                     if defined $charset_out && $charset_out ne $candidate;
 269             $charset_out = $candidate;
 270         }
 271     }
 272     # The following assumption is correct; that's what HTML::Template assumes
 273     if (!defined $charset_in) {
 274         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 275         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 276     }
 277 }
 278
 279 ###############################################################################
 280
 281 sub usage {
 282     my($exitcode) = @_;
 283     my $h = $exitcode? *STDERR: *STDOUT;
 284     print $h <<EOF;
 285 Usage: $0 [OPTIONS]
 286 Extract translatable strings from given HTML::Template input files.
 287
 288 Input file location:
 289   -f, --files-from=FILE          Get list of input files from FILE
 290   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 291
 292 Output file location:
 293   -o, --output=FILE              Write output to specified file
 294
 295 HTML::Template options:
 296   -a, --extract-all              Extract all strings
 297       --pedantic-warnings        Issue warnings even for detected problems
 298                                  which are likely to be harmless
 299
 300 Output details:
 301   -s, --sort-output              generate sorted output
 302   -F, --sort-by-file             sort output by file location
 303   -v, --verbose                  explain what is being done
 304
 305 Informative output:
 306       --help                     Display this help and exit
 307
 308 Try `perldoc $0' for perhaps more information.
 309 EOF
 310     exit($exitcode);
 311 }
 312
 313 ###############################################################################
 314
 315 sub usage_error {
 316     print STDERR "$_[0]\n" if @_;
 317     print STDERR "Try `$0 --help' for more information.\n";
 318     exit(-1);
 319 }
 320
 321 ###############################################################################
 322
 323 Getopt::Long::config qw( bundling no_auto_abbrev );
 324 GetOptions(
 325     'a|extract-all'                     => \$extract_all_p,
 326     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 327     'convert-from=s'                    => \$convert_from,
 328     'D|directory=s'                     => \$directory,
 329     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 330     'f|files-from=s'                    => \$files_from,
 331     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 332     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 333     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 334     'output|o=s'                        => \$output,
 335     'po-mode'                           => \$po_mode_p,         # INTERNAL
 336     's|sort-output'                     => sub { $sort = 's' },
 337     'F|sort-by-file'                    => sub { $sort = 'F' },
 338     'v|verbose'                         => \$verbose_p,
 339     'help'                              => sub { usage(0) },
 340 ) || usage_error;
 341
 342 VerboseWarnings::set_application_name $0;
 343 VerboseWarnings::set_pedantic_mode $pedantic_p;
 344
 345 usage_error('Missing mandatory option -f')
 346         unless defined $files_from || defined $convert_from;
 347 $directory = '.' unless defined $directory;
 348
 349 usage_error('You cannot specify both --convert-from and --files-from')
 350         if defined $convert_from && defined $files_from;
 351
 352 if (defined $output && $output ne '-') {
 353     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 354         open($OUTPUT, '>', $output) || die "$output: $!\n";
 355 } else {
 356     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 357     open($OUTPUT, ">&STDOUT");
 358 }
 359
 360 if (defined $files_from) {
 361     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 362     open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
 363     while (<$INPUT>) {
 364         chomp;
 365         my $input = /^\//? $_: "$directory/$_";
 366         my $h = TmplTokenizer->new( $input );
 367         $h->set_allow_cformat( 1 );
 368         VerboseWarnings::set_input_file_name $input;
 369         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 370         text_extract( $h );
 371     }
 372     close $INPUT;
 373 } else {
 374     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 375     convert_translation_file;
 376 }
 377 generate_po_file;
 378
 379 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 380         if TmplTokenizer::syntaxerror_p;
 381
 382
 383 exit(-1) if TmplTokenizer::fatal_p;
 384
 385 ###############################################################################
 386
 387 =head1 DESCRIPTION
 388
 389 This is an experimental script based on the modularized
 390 text-extract2.pl script.  It has behaviour similar to
 391 xgettext(1), and generates gettext-compatible output files.
 392
 393 A gettext-like format provides the following advantages:
 394
 395 =over
 396
 397 =item -
 398
 399 Translation to non-English-like languages with different word
 400 order:  gettext's c-format strings can theoretically be
 401 emulated if we are able to do some analysis on the .tt input
 402 and treat <TMPL_VAR> in a way similar to %s.
 403
 404 =item -
 405
 406 Context for the extracted strings:  the gettext format provides
 407 the filenames and line numbers where each string can be found.
 408 The translator can read the source file and see the context,
 409 in case the string by itself can mean several different things.
 410
 411 =item -
 412
 413 Place for the translator to add comments about the translations.
 414
 415 =item -
 416
 417 Gettext-compatible tools, if any, might be usable if we adopt
 418 the gettext format.
 419
 420 =back
 421
 422 This script has already been in use for over a year and should
 423 be reasonable stable. Nevertheless, it is still somewhat
 424 experimental and there are still some issues.
 425
 426 Please refer to the explanation in tmpl_process3 for further
 427 details.
 428
 429 If you want to generate GNOME-style POTFILES.in files, such
 430 files (passed to -f) can be generated thus:
 431
 432     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 433         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 434     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 435         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 436
 437 This is, however, quite pointless, because the "create" and
 438 "update" actions have already been implemented in tmpl_process3.pl.
 439
 440 =head2 Strings inside JavaScript
 441
 442 In the SCRIPT elements, the script will attempt to scan for
 443 _("I<string literal>") patterns, and extract the I<string literal>
 444 as a translatable string.
 445
 446 Note that the C-like _(...) notation is required.
 447
 448 The JavaScript must actually define a _ function
 449 so that the code remains correct JavaScript.
 450 A suitable definition of such a function can be
 451
 452         function _(s) { return s } // dummy function for gettext
 453
 454 =head1 SEE ALSO
 455
 456 tmpl_process3.pl,
 457 xgettext(1),
 458 Locale::PO(3),
 459 translator_doc.txt
 460
 461 =head1 BUGS
 462
 463 There probably are some. Bugs related to scanning of <INPUT>
 464 tags seem to be especially likely to be present.
 465
 466 Its diagnostics are probably too verbose.
 467
 468 When a <TMPL_VAR> within a JavaScript-related attribute is
 469 detected, the script currently displays no warnings at all.
 470 It might be good to display some kind of warning.
 471
 472 Its sort order (-s option) seems to be different than the real
 473 xgettext(1)'s sort option. This will result in translation
 474 strings inside the generated PO file spuriously moving about
 475 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 476
 477 If a Javascript string has leading spaces, it will
 478 generate strings with spurious leading spaces,
 479 leading to failure to match the strings when actually generating
 480 translated files.
 481
 482 =cut