misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 ###############################################################################
  28
  29 sub string_negligible_p ($) {
  30     my($t) = @_;                                # a string
  31     # Don't emit pure whitespace, pure numbers, pure punctuation,
  32     # single letters, or TMPL_VAR's.
  33     # Punctuation should arguably be translated. But without context
  34     # they are untranslatable. Note that $t is a string, not a token object.
  35     return !$extract_all_p && (
  36                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  37             || $t =~ /^\d+$/                    # purely digits
  38             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  39             || $t =~ /^[A-Za-z]$/               # single letters
  40         )
  41 }
  42
  43 sub token_negligible_p( $ ) {
  44     my($x) = @_;
  45     my $t = $x->type;
  46     return !$extract_all_p && (
  47             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  48             $t == TmplTokenType::DIRECTIVE? 1:
  49             $t == TmplTokenType::TEXT_PARAMETRIZED
  50                 && join( '', map { my $t = $_->type;
  51                         $t == TmplTokenType::DIRECTIVE?
  52                                 '1': $t == TmplTokenType::TAG?
  53                                         '': token_negligible_p( $_ )?
  54                                         '': '1' } @{$x->children} ) eq '' );
  55 }
  56
  57 ###############################################################################
  58
  59 sub remember ($$) {
  60     my($token, $string) = @_;
  61     # If we determine that the string is negligible, don't bother to remember
  62     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  63         my $key = TmplTokenizer::string_canon( $string );
  64         $text{$key} = [] unless defined $text{$key};
  65         push @{$text{$key}}, $token;
  66     }
  67 }
  68
  69 ###############################################################################
  70
  71 sub string_list () {
  72     my @t = keys %text;
  73     # The real gettext tools seems to sort case sensitively; I don't know why
  74     @t = sort { $a cmp $b } @t if $sort eq 's';
  75     @t = sort {
  76             my @aa = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  78             my @bb = sort { $a->pathname cmp $b->pathname
  79                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  80             $aa[0]->pathname cmp $bb[0]->pathname
  81                     || $aa[0]->line_number <=> $bb[0]->line_number;
  82         } @t if $sort eq 'F';
  83     return @t;
  84 }
  85
  86   ###############################################################################
  87
  88 sub text_extract (*) {
  89     my($h) = @_;
  90     for (;;) {
  91         my $s = TmplTokenizer::next_token $h;
  92         last unless defined $s;
  93         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  94         if ($kind eq TmplTokenType::TEXT) {
  95             remember( $s, $t ) if $t =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  97             remember( $s, $s->form ) if $s->form =~ /\S/s;
  98         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  99             # value [tag=input], meta
 100             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 101             for my $a ('alt', 'content', 'title', 'value','label') {
 102                 if ($attr->{$a}) {
 103                     next if $a eq 'label' && $tag ne 'optgroup';
 104                     next if $a eq 'content' && $tag ne 'meta';
 105                     next if $a eq 'value' && ($tag ne 'input'
 106                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 107                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 108                     $val = TmplTokenizer::trim $val;
 109                     remember( $s, $val ) if $val =~ /\S/s;
 110                 }
 111             }
 112         } elsif ($s->has_js_data) {
 113             for my $t (@{$s->js_data}) {
 114               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 115             }
 116         }
 117     }
 118 }
 119
 120 ###############################################################################
 121
 122 sub generate_strings_list () {
 123     # Emit all extracted strings.
 124     for my $t (string_list) {
 125         printf OUTPUT "%s\n", $t;
 126     }
 127 }
 128
 129 ###############################################################################
 130
 131 sub generate_po_file () {
 132     # We don't emit the Plural-Forms header; it's meaningless for us
 133     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 134     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 135     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 136     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 137     my $time_pot = $time;
 138     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 139     print OUTPUT <<EOF;
 140 # SOME DESCRIPTIVE TITLE.
 141 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 142 # This file is distributed under the same license as the PACKAGE package.
 143 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 144 #
 145 EOF
 146     print OUTPUT <<EOF unless $disable_fuzzy_p;
 147 #, fuzzy
 148 EOF
 149     print OUTPUT <<EOF;
 150 msgid ""
 151 msgstr ""
 152 "Project-Id-Version: PACKAGE VERSION\\n"
 153 "POT-Creation-Date: $time_pot\\n"
 154 "PO-Revision-Date: $time_po\\n"
 155 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 156 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 157 "MIME-Version: 1.0\\n"
 158 "Content-Type: text/plain; charset=$pot_charset\\n"
 159 "Content-Transfer-Encoding: 8bit\\n"
 160
 161 EOF
 162     my $directory_re = quotemeta("$directory/");
 163     for my $t (string_list) {
 164         if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
 165             my($token, $n) = ($text{$t}->[0], 0);
 166             printf OUTPUT "#. For the first occurrence,\n"
 167                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 168             for my $param ($token->parameters_and_fields) {
 169                 $n += 1;
 170                 my $type = $param->type;
 171                 my $subtype = ($type == TmplTokenType::TAG
 172                         && $param->string =~ /^<input\b/is?
 173                                 $param->attributes->{'type'}->[1]: undef);
 174                 my $fmt = TmplTokenizer::_formalize( $param );
 175                 $fmt =~ s/^%/%$n\$/;
 176                 if ($type == TmplTokenType::DIRECTIVE) {
 177                     $type = "Template::Toolkit Directive";
 178 #                   $type = $param->string =~ /[%()+/is? $1: 'ERROR';
 179                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 180                             $2: undef;
 181                     printf OUTPUT "#. %s: %s\n", $fmt,
 182                         "$type" . (defined $name? " name=$name": '');
 183                 } else {
 184                     my $name = $param->attributes->{'name'};
 185                     my $value = $param->attributes->{'value'}
 186                             unless $subtype =~ /^(?:text)$/;
 187                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 188                             . (defined $name?  " name=$name->[1]": '')
 189                             . (defined $value? " value=$value->[1]": '');
 190                 }
 191             }
 192         } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
 193             my($token) = ($text{$t}->[0]);
 194             printf OUTPUT "#. For the first occurrence,\n"
 195                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 196             if ($token->string =~ /^<meta\b/is) {
 197                 my $type = $token->attributes->{'http-equiv'}->[1];
 198                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 199             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 200                 my $tag = uc($1);
 201                 my $type = (lc($tag) eq 'input'?
 202                         $token->attributes->{'type'}: undef);
 203                 my $name = $token->attributes->{'name'};
 204                 printf OUTPUT "#. %s\n", $tag
 205                     . (defined $type? " type=$type->[1]": '')
 206                     . (defined $name? " name=$name->[1]": '');
 207             }
 208         } elsif ($text{$t}->[0]->has_js_data) {
 209             printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 210             printf OUTPUT "#. SCRIPT\n";
 211         }
 212         my $cformat_p;
 213         for my $token (@{$text{$t}}) {
 214             my $pathname = $token->pathname;
 215             $pathname =~ s/^$directory_re//os;
 216         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 217             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 218                     if defined $pathname && defined $token->line_number;
 219             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 220         }
 221         printf OUTPUT "#, c-format\n" if $cformat_p;
 222         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 223                 TmplTokenizer::string_canon
 224                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 225         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 226                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 227     }
 228 }
 229
 230 ###############################################################################
 231
 232 sub convert_translation_file () {
 233     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 234     VerboseWarnings::set_input_file_name $convert_from;
 235     while (<INPUT>) {
 236         chomp;
 237         my($msgid, $msgstr) = split(/\t/);
 238         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 239                 unless defined $msgstr;
 240
 241         # Fixup some of the bad strings
 242         $msgid =~ s/^SELECTED>//;
 243
 244         # Create dummy token
 245         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 246         remember( $token, $msgid );
 247         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 248         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 249
 250         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 251             my $candidate = TmplTokenizer::charset_canon $2;
 252             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 253                     if defined $charset_in && $charset_in ne $candidate;
 254             $charset_in = $candidate;
 255         }
 256         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 257             my $candidate = TmplTokenizer::charset_canon $2;
 258             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 259                     if defined $charset_out && $charset_out ne $candidate;
 260             $charset_out = $candidate;
 261         }
 262     }
 263     # The following assumption is correct; that's what HTML::Template assumes
 264     if (!defined $charset_in) {
 265         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 266         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 267     }
 268 }
 269
 270 ###############################################################################
 271
 272 sub usage ($) {
 273     my($exitcode) = @_;
 274     my $h = $exitcode? *STDERR: *STDOUT;
 275     print $h <<EOF;
 276 Usage: $0 [OPTIONS]
 277 Extract translatable strings from given HTML::Template input files.
 278
 279 Input file location:
 280   -f, --files-from=FILE          Get list of input files from FILE
 281   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 282
 283 Output file location:
 284   -o, --output=FILE              Write output to specified file
 285
 286 HTML::Template options:
 287   -a, --extract-all              Extract all strings
 288       --pedantic-warnings        Issue warnings even for detected problems
 289                                  which are likely to be harmless
 290
 291 Output details:
 292   -s, --sort-output              generate sorted output
 293   -F, --sort-by-file             sort output by file location
 294   -v, --verbose                  explain what is being done
 295
 296 Informative output:
 297       --help                     Display this help and exit
 298
 299 Try `perldoc $0' for perhaps more information.
 300 EOF
 301     exit($exitcode);
 302 }
 303
 304 ###############################################################################
 305
 306 sub usage_error (;$) {
 307     print STDERR "$_[0]\n" if @_;
 308     print STDERR "Try `$0 --help' for more information.\n";
 309     exit(-1);
 310 }
 311
 312 ###############################################################################
 313
 314 Getopt::Long::config qw( bundling no_auto_abbrev );
 315 GetOptions(
 316     'a|extract-all'                     => \$extract_all_p,
 317     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 318     'convert-from=s'                    => \$convert_from,
 319     'D|directory=s'                     => \$directory,
 320     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 321     'f|files-from=s'                    => \$files_from,
 322     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 323     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 324     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 325     'output|o=s'                        => \$output,
 326     'po-mode'                           => \$po_mode_p,         # INTERNAL
 327     's|sort-output'                     => sub { $sort = 's' },
 328     'F|sort-by-file'                    => sub { $sort = 'F' },
 329     'v|verbose'                         => \$verbose_p,
 330     'help'                              => sub { usage(0) },
 331 ) || usage_error;
 332
 333 VerboseWarnings::set_application_name $0;
 334 VerboseWarnings::set_pedantic_mode $pedantic_p;
 335
 336 usage_error('Missing mandatory option -f')
 337         unless defined $files_from || defined $convert_from;
 338 $directory = '.' unless defined $directory;
 339
 340 usage_error('You cannot specify both --convert-from and --files-from')
 341         if defined $convert_from && defined $files_from;
 342
 343 if (defined $output && $output ne '-') {
 344     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 345         open(OUTPUT, ">$output") || die "$output: $!\n";
 346     binmode( OUTPUT, ":utf8" );
 347 } else {
 348     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 349     open(OUTPUT, ">&STDOUT");
 350 }
 351
 352 if (defined $files_from) {
 353     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 354     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 355     while (<INPUT>) {
 356         chomp;
 357         my $input = /^\//? $_: "$directory/$_";
 358         my $h = TmplTokenizer->new( $input );
 359         $h->set_allow_cformat( 1 );
 360         VerboseWarnings::set_input_file_name $input;
 361         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 362         text_extract( $h );
 363     }
 364     close INPUT;
 365 } else {
 366     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 367     convert_translation_file;
 368 }
 369 generate_po_file;
 370
 371 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 372         if TmplTokenizer::syntaxerror_p;
 373
 374
 375 exit(-1) if TmplTokenizer::fatal_p;
 376
 377 ###############################################################################
 378
 379 =head1 DESCRIPTION
 380
 381 This is an experimental script based on the modularized
 382 text-extract2.pl script.  It has behaviour similar to
 383 xgettext(1), and generates gettext-compatible output files.
 384
 385 A gettext-like format provides the following advantages:
 386
 387 =over
 388
 389 =item -
 390
 391 Translation to non-English-like languages with different word
 392 order:  gettext's c-format strings can theoretically be
 393 emulated if we are able to do some analysis on the .tmpl input
 394 and treat <TMPL_VAR> in a way similar to %s.
 395
 396 =item -
 397
 398 Context for the extracted strings:  the gettext format provides
 399 the filenames and line numbers where each string can be found.
 400 The translator can read the source file and see the context,
 401 in case the string by itself can mean several different things.
 402
 403 =item -
 404
 405 Place for the translator to add comments about the translations.
 406
 407 =item -
 408
 409 Gettext-compatible tools, if any, might be usable if we adopt
 410 the gettext format.
 411
 412 =back
 413
 414 This script has already been in use for over a year and should
 415 be reasonable stable. Nevertheless, it is still somewhat
 416 experimental and there are still some issues.
 417
 418 Please refer to the explanation in tmpl_process3 for further
 419 details.
 420
 421 If you want to generate GNOME-style POTFILES.in files, such
 422 files (passed to -f) can be generated thus:
 423
 424         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 425                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 426         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 427                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 428
 429 This is, however, quite pointless, because the "create" and
 430 "update" actions have already been implemented in tmpl_process3.pl.
 431
 432 =head2 Strings inside JavaScript
 433
 434 In the SCRIPT elements, the script will attempt to scan for
 435 _("I<string literal>") patterns, and extract the I<string literal>
 436 as a translatable string.
 437
 438 Note that the C-like _(...) notation is required.
 439
 440 The JavaScript must actually define a _ function
 441 so that the code remains correct JavaScript.
 442 A suitable definition of such a function can be
 443
 444         function _(s) { return s } // dummy function for gettext
 445
 446 =head1 SEE ALSO
 447
 448 tmpl_process3.pl,
 449 xgettext(1),
 450 Locale::PO(3),
 451 translator_doc.txt
 452
 453 =head1 BUGS
 454
 455 There probably are some. Bugs related to scanning of <INPUT>
 456 tags seem to be especially likely to be present.
 457
 458 Its diagnostics are probably too verbose.
 459
 460 When a <TMPL_VAR> within a JavaScript-related attribute is
 461 detected, the script currently displays no warnings at all.
 462 It might be good to display some kind of warning.
 463
 464 Its sort order (-s option) seems to be different than the real
 465 xgettext(1)'s sort option. This will result in translation
 466 strings inside the generated PO file spuriously moving about
 467 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 468
 469 If a Javascript string has leading spaces, it will
 470 generate strings with spurious leading spaces,
 471 leading to failure to match the strings when actually generating
 472 translated files.
 473
 474 =cut