misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 ###############################################################################
  28
  29 sub string_negligible_p ($) {
  30     my($t) = @_;                                # a string
  31     # Don't emit pure whitespace, pure numbers, pure punctuation,
  32     # single letters, or TMPL_VAR's.
  33     # Punctuation should arguably be translated. But without context
  34     # they are untranslatable. Note that $t is a string, not a token object.
  35     return !$extract_all_p && (
  36                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  37             || $t =~ /^\d+$/                    # purely digits
  38             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  39             || $t =~ /^[A-Za-z]$/               # single letters
  40         )
  41 }
  42
  43 sub token_negligible_p( $ ) {
  44     my($x) = @_;
  45     my $t = $x->type;
  46     return !$extract_all_p && (
  47             $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
  48             $t == C4::TmplTokenType::DIRECTIVE? 1:
  49             $t == C4::TmplTokenType::TEXT_PARAMETRIZED
  50                 && join( '', map { my $t = $_->type;
  51                         $t == C4::TmplTokenType::DIRECTIVE?
  52                                 '1': $t == C4::TmplTokenType::TAG?
  53                                         '': token_negligible_p( $_ )?
  54                                         '': '1' } @{$x->children} ) eq '' );
  55 }
  56
  57 ###############################################################################
  58
  59 sub remember ($$) {
  60     my($token, $string) = @_;
  61     # If we determine that the string is negligible, don't bother to remember
  62     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  63         my $key = TmplTokenizer::string_canon( $string );
  64         $text{$key} = [] unless defined $text{$key};
  65         push @{$text{$key}}, $token;
  66     }
  67 }
  68
  69 ###############################################################################
  70
  71 sub string_list () {
  72     my @t = keys %text;
  73     # The real gettext tools seems to sort case sensitively; I don't know why
  74     @t = sort { $a cmp $b } @t if $sort eq 's';
  75     @t = sort {
  76             my @aa = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  78             my @bb = sort { $a->pathname cmp $b->pathname
  79                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  80             $aa[0]->pathname cmp $bb[0]->pathname
  81                     || $aa[0]->line_number <=> $bb[0]->line_number;
  82         } @t if $sort eq 'F';
  83     return @t;
  84 }
  85
  86   ###############################################################################
  87
  88 sub text_extract (*) {
  89     my($h) = @_;
  90     for (;;) {
  91         my $s = TmplTokenizer::next_token $h;
  92         last unless defined $s;
  93         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  94         if ($kind eq C4::TmplTokenType::TEXT) {
  95             if ($t =~ /\S/s && $t !~ /<!/){
  96                 remember( $s, $t );
  97             }
  98         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
  99             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 100                 remember( $s, $s->form );
 101             }
 102         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 103             # value [tag=input], meta
 104             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 105             for my $a ('alt', 'content', 'title', 'value','label') {
 106                 if ($attr->{$a}) {
 107                     next if $a eq 'label' && $tag ne 'optgroup';
 108                     next if $a eq 'content' && $tag ne 'meta';
 109                     next if $a eq 'value' && ($tag ne 'input'
 110                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 111                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 112                     $val = TmplTokenizer::trim $val;
 113                     remember( $s, $val ) if $val =~ /\S/s;
 114                 }
 115             }
 116         } elsif ($s->has_js_data) {
 117             for my $t (@{$s->js_data}) {
 118               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 119             }
 120         }
 121     }
 122 }
 123
 124 ###############################################################################
 125
 126 sub generate_strings_list () {
 127     # Emit all extracted strings.
 128     for my $t (string_list) {
 129         printf OUTPUT "%s\n", $t;
 130     }
 131 }
 132
 133 ###############################################################################
 134
 135 sub generate_po_file () {
 136     # We don't emit the Plural-Forms header; it's meaningless for us
 137     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 138     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 139     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 140     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 141     my $time_pot = $time;
 142     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 143     print OUTPUT <<EOF;
 144 # SOME DESCRIPTIVE TITLE.
 145 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 146 # This file is distributed under the same license as the PACKAGE package.
 147 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 148 #
 149 EOF
 150     print OUTPUT <<EOF unless $disable_fuzzy_p;
 151 #, fuzzy
 152 EOF
 153     print OUTPUT <<EOF;
 154 msgid ""
 155 msgstr ""
 156 "Project-Id-Version: PACKAGE VERSION\\n"
 157 "POT-Creation-Date: $time_pot\\n"
 158 "PO-Revision-Date: $time_po\\n"
 159 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 160 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 161 "MIME-Version: 1.0\\n"
 162 "Content-Type: text/plain; charset=$pot_charset\\n"
 163 "Content-Transfer-Encoding: 8bit\\n"
 164
 165 EOF
 166     my $directory_re = quotemeta("$directory/");
 167     for my $t (string_list) {
 168         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 169             my($token, $n) = ($text{$t}->[0], 0);
 170             printf OUTPUT "#. For the first occurrence,\n"
 171                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 172             for my $param ($token->parameters_and_fields) {
 173                 $n += 1;
 174                 my $type = $param->type;
 175                 my $subtype = ($type == C4::TmplTokenType::TAG
 176                         && $param->string =~ /^<input\b/is?
 177                                 $param->attributes->{'type'}->[1]: undef);
 178                 my $fmt = TmplTokenizer::_formalize( $param );
 179                 $fmt =~ s/^%/%$n\$/;
 180                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 181 #                   $type = "Template::Toolkit Directive";
 182                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 183                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 184                             $2: undef;
 185                     printf OUTPUT "#. %s: %s\n", $fmt,
 186                         "$type" . (defined $name? " name=$name": '');
 187                 } else {
 188                     my $name = $param->attributes->{'name'};
 189                     my $value = $param->attributes->{'value'}
 190                             unless $subtype =~ /^(?:text)$/;
 191                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 192                             . (defined $name?  " name=$name->[1]": '')
 193                             . (defined $value? " value=$value->[1]": '');
 194                 }
 195             }
 196         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 197             my($token) = ($text{$t}->[0]);
 198             printf OUTPUT "#. For the first occurrence,\n"
 199                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 200             if ($token->string =~ /^<meta\b/is) {
 201                 my $type = $token->attributes->{'http-equiv'}->[1];
 202                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 203             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 204                 my $tag = uc($1);
 205                 my $type = (lc($tag) eq 'input'?
 206                         $token->attributes->{'type'}: undef);
 207                 my $name = $token->attributes->{'name'};
 208                 printf OUTPUT "#. %s\n", $tag
 209                     . (defined $type? " type=$type->[1]": '')
 210                     . (defined $name? " name=$name->[1]": '');
 211             }
 212         } elsif ($text{$t}->[0]->has_js_data) {
 213             printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 214             printf OUTPUT "#. SCRIPT\n";
 215         }
 216         my $cformat_p;
 217         for my $token (@{$text{$t}}) {
 218             my $pathname = $token->pathname;
 219             $pathname =~ s/^$directory_re//os;
 220         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 221             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 222                     if defined $pathname && defined $token->line_number;
 223             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 224         }
 225         printf OUTPUT "#, c-format\n" if $cformat_p;
 226         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 227                 TmplTokenizer::string_canon
 228                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 229         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 230                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 231     }
 232 }
 233
 234 ###############################################################################
 235
 236 sub convert_translation_file () {
 237     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 238     VerboseWarnings::set_input_file_name $convert_from;
 239     while (<INPUT>) {
 240         chomp;
 241         my($msgid, $msgstr) = split(/\t/);
 242         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 243                 unless defined $msgstr;
 244
 245         # Fixup some of the bad strings
 246         $msgid =~ s/^SELECTED>//;
 247
 248         # Create dummy token
 249         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 250         remember( $token, $msgid );
 251         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 252         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 253
 254         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 255             my $candidate = TmplTokenizer::charset_canon $2;
 256             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 257                     if defined $charset_in && $charset_in ne $candidate;
 258             $charset_in = $candidate;
 259         }
 260         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 261             my $candidate = TmplTokenizer::charset_canon $2;
 262             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 263                     if defined $charset_out && $charset_out ne $candidate;
 264             $charset_out = $candidate;
 265         }
 266     }
 267     # The following assumption is correct; that's what HTML::Template assumes
 268     if (!defined $charset_in) {
 269         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 270         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 271     }
 272 }
 273
 274 ###############################################################################
 275
 276 sub usage ($) {
 277     my($exitcode) = @_;
 278     my $h = $exitcode? *STDERR: *STDOUT;
 279     print $h <<EOF;
 280 Usage: $0 [OPTIONS]
 281 Extract translatable strings from given HTML::Template input files.
 282
 283 Input file location:
 284   -f, --files-from=FILE          Get list of input files from FILE
 285   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 286
 287 Output file location:
 288   -o, --output=FILE              Write output to specified file
 289
 290 HTML::Template options:
 291   -a, --extract-all              Extract all strings
 292       --pedantic-warnings        Issue warnings even for detected problems
 293                                  which are likely to be harmless
 294
 295 Output details:
 296   -s, --sort-output              generate sorted output
 297   -F, --sort-by-file             sort output by file location
 298   -v, --verbose                  explain what is being done
 299
 300 Informative output:
 301       --help                     Display this help and exit
 302
 303 Try `perldoc $0' for perhaps more information.
 304 EOF
 305     exit($exitcode);
 306 }
 307
 308 ###############################################################################
 309
 310 sub usage_error (;$) {
 311     print STDERR "$_[0]\n" if @_;
 312     print STDERR "Try `$0 --help' for more information.\n";
 313     exit(-1);
 314 }
 315
 316 ###############################################################################
 317
 318 Getopt::Long::config qw( bundling no_auto_abbrev );
 319 GetOptions(
 320     'a|extract-all'                     => \$extract_all_p,
 321     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 322     'convert-from=s'                    => \$convert_from,
 323     'D|directory=s'                     => \$directory,
 324     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 325     'f|files-from=s'                    => \$files_from,
 326     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 327     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 328     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 329     'output|o=s'                        => \$output,
 330     'po-mode'                           => \$po_mode_p,         # INTERNAL
 331     's|sort-output'                     => sub { $sort = 's' },
 332     'F|sort-by-file'                    => sub { $sort = 'F' },
 333     'v|verbose'                         => \$verbose_p,
 334     'help'                              => sub { usage(0) },
 335 ) || usage_error;
 336
 337 VerboseWarnings::set_application_name $0;
 338 VerboseWarnings::set_pedantic_mode $pedantic_p;
 339
 340 usage_error('Missing mandatory option -f')
 341         unless defined $files_from || defined $convert_from;
 342 $directory = '.' unless defined $directory;
 343
 344 usage_error('You cannot specify both --convert-from and --files-from')
 345         if defined $convert_from && defined $files_from;
 346
 347 if (defined $output && $output ne '-') {
 348     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 349         open(OUTPUT, ">$output") || die "$output: $!\n";
 350 } else {
 351     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 352     open(OUTPUT, ">&STDOUT");
 353 }
 354 binmode OUTPUT, ':encoding(UTF-8)';
 355
 356 if (defined $files_from) {
 357     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 358     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 359     while (<INPUT>) {
 360         chomp;
 361         my $input = /^\//? $_: "$directory/$_";
 362         my $h = TmplTokenizer->new( $input );
 363         $h->set_allow_cformat( 1 );
 364         VerboseWarnings::set_input_file_name $input;
 365         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 366         text_extract( $h );
 367     }
 368     close INPUT;
 369 } else {
 370     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 371     convert_translation_file;
 372 }
 373 generate_po_file;
 374
 375 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 376         if TmplTokenizer::syntaxerror_p;
 377
 378
 379 exit(-1) if TmplTokenizer::fatal_p;
 380
 381 ###############################################################################
 382
 383 =head1 DESCRIPTION
 384
 385 This is an experimental script based on the modularized
 386 text-extract2.pl script.  It has behaviour similar to
 387 xgettext(1), and generates gettext-compatible output files.
 388
 389 A gettext-like format provides the following advantages:
 390
 391 =over
 392
 393 =item -
 394
 395 Translation to non-English-like languages with different word
 396 order:  gettext's c-format strings can theoretically be
 397 emulated if we are able to do some analysis on the .tmpl input
 398 and treat <TMPL_VAR> in a way similar to %s.
 399
 400 =item -
 401
 402 Context for the extracted strings:  the gettext format provides
 403 the filenames and line numbers where each string can be found.
 404 The translator can read the source file and see the context,
 405 in case the string by itself can mean several different things.
 406
 407 =item -
 408
 409 Place for the translator to add comments about the translations.
 410
 411 =item -
 412
 413 Gettext-compatible tools, if any, might be usable if we adopt
 414 the gettext format.
 415
 416 =back
 417
 418 This script has already been in use for over a year and should
 419 be reasonable stable. Nevertheless, it is still somewhat
 420 experimental and there are still some issues.
 421
 422 Please refer to the explanation in tmpl_process3 for further
 423 details.
 424
 425 If you want to generate GNOME-style POTFILES.in files, such
 426 files (passed to -f) can be generated thus:
 427
 428         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 429                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 430         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 431                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 432
 433 This is, however, quite pointless, because the "create" and
 434 "update" actions have already been implemented in tmpl_process3.pl.
 435
 436 =head2 Strings inside JavaScript
 437
 438 In the SCRIPT elements, the script will attempt to scan for
 439 _("I<string literal>") patterns, and extract the I<string literal>
 440 as a translatable string.
 441
 442 Note that the C-like _(...) notation is required.
 443
 444 The JavaScript must actually define a _ function
 445 so that the code remains correct JavaScript.
 446 A suitable definition of such a function can be
 447
 448         function _(s) { return s } // dummy function for gettext
 449
 450 =head1 SEE ALSO
 451
 452 tmpl_process3.pl,
 453 xgettext(1),
 454 Locale::PO(3),
 455 translator_doc.txt
 456
 457 =head1 BUGS
 458
 459 There probably are some. Bugs related to scanning of <INPUT>
 460 tags seem to be especially likely to be present.
 461
 462 Its diagnostics are probably too verbose.
 463
 464 When a <TMPL_VAR> within a JavaScript-related attribute is
 465 detected, the script currently displays no warnings at all.
 466 It might be good to display some kind of warning.
 467
 468 Its sort order (-s option) seems to be different than the real
 469 xgettext(1)'s sort option. This will result in translation
 470 strings inside the generated PO file spuriously moving about
 471 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 472
 473 If a Javascript string has leading spaces, it will
 474 generate strings with spurious leading spaces,
 475 leading to failure to match the strings when actually generating
 476 translated files.
 477
 478 =cut