misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use POSIX;
  12 use Locale::PO;
  13 use TmplTokenizer;
  14 use VerboseWarnings;
  15
  16 use vars qw( $convert_from );
  17 use vars qw( $files_from $directory $output $sort );
  18 use vars qw( $extract_all_p );
  19 use vars qw( $pedantic_p );
  20 use vars qw( %text %translation );
  21 use vars qw( $charset_in $charset_out );
  22 use vars qw( $verbose_p );
  23 use vars qw( $po_mode_p );
  24
  25 ###############################################################################
  26
  27 sub string_negligible_p ($) {
  28     my($t) = @_;                                # a string
  29     # Don't emit pure whitespace, pure numbers, pure punctuation,
  30     # single letters, or TMPL_VAR's.
  31     # Punctuation should arguably be translated. But without context
  32     # they are untranslatable. Note that $t is a string, not a token object.
  33     return !$extract_all_p && (
  34                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  35             || $t =~ /^\d+$/                    # purely digits
  36             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  37             || $t =~ /^[A-Za-z]$/               # single letters
  38         )
  39 }
  40
  41 sub token_negligible_p( $ ) {
  42     my($x) = @_;
  43     my $t = $x->type;
  44     return !$extract_all_p && (
  45             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  46             $t == TmplTokenType::DIRECTIVE? 1:
  47             $t == TmplTokenType::TEXT_PARAMETRIZED
  48                 && join( '', map { my $t = $_->type;
  49                         $t == TmplTokenType::DIRECTIVE?
  50                                 '1': $t == TmplTokenType::TAG?
  51                                         '': token_negligible_p( $_ )?
  52                                         '': '1' } @{$x->children} ) eq '' );
  53 }
  54
  55 ###############################################################################
  56
  57 sub remember ($$) {
  58     my($token, $string) = @_;
  59     # If we determine that the string is negligible, don't bother to remember
  60     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  61         my $key = TmplTokenizer::string_canon( $string );
  62         $text{$key} = [] unless defined $text{$key};
  63         push @{$text{$key}}, $token;
  64     }
  65 }
  66
  67 ###############################################################################
  68
  69 sub string_list () {
  70     my @t = keys %text;
  71     # The real gettext tools seems to sort case sensitively; I don't know why
  72     @t = sort { $a cmp $b } @t if $sort eq 's';
  73     @t = sort {
  74             my @aa = sort { $a->pathname cmp $b->pathname
  75                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  76             my @bb = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  78             $aa[0]->pathname cmp $bb[0]->pathname
  79                     || $aa[0]->line_number <=> $bb[0]->line_number;
  80         } @t if $sort eq 'F';
  81     return @t;
  82 }
  83
  84 ###############################################################################
  85
  86 sub text_extract (*) {
  87     my($h) = @_;
  88     for (;;) {
  89         my $s = TmplTokenizer::next_token $h;
  90     last unless defined $s;
  91         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  92         if ($kind eq TmplTokenType::TEXT) {
  93             remember( $s, $t ) if $t =~ /\S/s;
  94         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  95             remember( $s, $s->form ) if $s->form =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  97             # value [tag=input], meta
  98             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  99             for my $a ('alt', 'content', 'title', 'value') {
 100                 if ($attr->{$a}) {
 101                     next if $a eq 'content' && $tag ne 'meta';
 102                     next if $a eq 'value' && ($tag ne 'input'
 103                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 104                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 105                     $val = TmplTokenizer::trim $val;
 106                     remember( $s, $val ) if $val =~ /\S/s;
 107                 }
 108             }
 109         }
 110     }
 111 }
 112
 113 ###############################################################################
 114
 115 sub generate_strings_list () {
 116     # Emit all extracted strings.
 117     for my $t (string_list) {
 118         printf OUTPUT "%s\n", $t # unless negligible_p($t);
 119     }
 120 }
 121
 122 ###############################################################################
 123
 124 sub generate_po_file () {
 125     # We don't emit the Plural-Forms header; it's meaningless for us
 126     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 127     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 128     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 129     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 130     my $time_pot = $time;
 131     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 132     print OUTPUT <<EOF;
 133 # SOME DESCRIPTIVE TITLE.
 134 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 135 # This file is distributed under the same license as the PACKAGE package.
 136 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 137 #
 138 #, fuzzy
 139 msgid ""
 140 msgstr ""
 141 "Project-Id-Version: PACKAGE VERSION\\n"
 142 "POT-Creation-Date: $time_pot\\n"
 143 "PO-Revision-Date: $time_po\\n"
 144 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 145 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 146 "MIME-Version: 1.0\\n"
 147 "Content-Type: text/plain; charset=$pot_charset\\n"
 148 "Content-Transfer-Encoding: 8bit\\n"
 149
 150 EOF
 151     my $directory_re = quotemeta("$directory/");
 152     for my $t (string_list) {
 153         #next if negligible_p($t);
 154         my $cformat_p;
 155         for my $token (@{$text{$t}}) {
 156             my $pathname = $token->pathname;
 157             $pathname =~ s/^$directory_re//os;
 158             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 159                     if defined $pathname && defined $token->line_number;
 160             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 161         }
 162         printf OUTPUT "#, c-format\n" if $cformat_p;
 163         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 164                 TmplTokenizer::string_canon
 165                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 166         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 167                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 168     }
 169 }
 170
 171 ###############################################################################
 172
 173 sub convert_translation_file () {
 174     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 175     VerboseWarnings::set_input_file_name $convert_from;
 176     while (<INPUT>) {
 177         chomp;
 178         my($msgid, $msgstr) = split(/\t/);
 179         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 180                 unless defined $msgstr;
 181
 182         # Fixup some of the bad strings
 183         $msgid =~ s/^SELECTED>//;
 184
 185         # Create dummy token
 186         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 187         remember( $token, $msgid );
 188         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 189         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 190
 191         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 192             my $candidate = TmplTokenizer::charset_canon $2;
 193             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 194                     if defined $charset_in && $charset_in ne $candidate;
 195             $charset_in = $candidate;
 196         }
 197         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 198             my $candidate = TmplTokenizer::charset_canon $2;
 199             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 200                     if defined $charset_out && $charset_out ne $candidate;
 201             $charset_out = $candidate;
 202         }
 203     }
 204     # The following assumption is correct; that's what HTML::Template assumes
 205     if (!defined $charset_in) {
 206         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 207         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 208     }
 209 }
 210
 211 ###############################################################################
 212
 213 sub usage ($) {
 214     my($exitcode) = @_;
 215     my $h = $exitcode? *STDERR: *STDOUT;
 216     print $h <<EOF;
 217 Usage: $0 [OPTIONS]
 218 Extract translatable strings from given HTML::Template input files.
 219
 220 Input file location:
 221   -f, --files-from=FILE          Get list of input files from FILE
 222   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 223
 224 Output file location:
 225   -o, --output=FILE              Write output to specified file
 226
 227 HTML::Template options:
 228   -a, --extract-all              Extract all strings
 229       --pedantic-warnings        Issue warnings even for detected problems
 230                                  which are likely to be harmless
 231
 232 Output details:
 233   -s, --sort-output              generate sorted output
 234   -F, --sort-by-file             sort output by file location
 235   -v, --verbose                  explain what is being done
 236
 237 Informative output:
 238       --help                     Display this help and exit
 239
 240 Try `perldoc $0' for perhaps more information.
 241 EOF
 242     exit($exitcode);
 243 }
 244
 245 ###############################################################################
 246
 247 sub usage_error (;$) {
 248     print STDERR "$_[0]\n" if @_;
 249     print STDERR "Try `$0 --help' for more information.\n";
 250     exit(-1);
 251 }
 252
 253 ###############################################################################
 254
 255 Getopt::Long::config qw( bundling no_auto_abbrev );
 256 GetOptions(
 257     'a|extract-all'                     => \$extract_all_p,
 258     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 259     'convert-from=s'                    => \$convert_from,
 260     'D|directory=s'                     => \$directory,
 261     'f|files-from=s'                    => \$files_from,
 262     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 263     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 264     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 265     'output|o=s'                        => \$output,
 266     'po-mode'                           => \$po_mode_p,         # INTERNAL
 267     's|sort-output'                     => sub { $sort = 's' },
 268     'F|sort-by-file'                    => sub { $sort = 'F' },
 269     'v|verbose'                         => \$verbose_p,
 270     'help'                              => sub { usage(0) },
 271 ) || usage_error;
 272
 273 VerboseWarnings::set_application_name $0;
 274 VerboseWarnings::set_pedantic_mode $pedantic_p;
 275
 276 usage_error('Missing mandatory option -f')
 277         unless defined $files_from || defined $convert_from;
 278 $directory = '.' unless defined $directory;
 279
 280 usage_error('You cannot specify both --convert-from and --files-from')
 281         if defined $convert_from && defined $files_from;
 282
 283 if (defined $output && $output ne '-') {
 284     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 285     open(OUTPUT, ">$output") || die "$output: $!\n";
 286 } else {
 287     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 288     open(OUTPUT, ">&STDOUT");
 289 }
 290
 291 if (defined $files_from) {
 292     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 293     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 294     while (<INPUT>) {
 295         chomp;
 296         my $input = /^\//? $_: "$directory/$_";
 297         my $h = TmplTokenizer->new( $input );
 298         $h->set_allow_cformat( 1 );
 299         VerboseWarnings::set_input_file_name $input;
 300         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 301         text_extract( $h );
 302     }
 303     close INPUT;
 304 } else {
 305     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 306     convert_translation_file;
 307 }
 308 generate_po_file;
 309
 310 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 311         if TmplTokenizer::syntaxerror_p;
 312
 313
 314 exit(-1) if TmplTokenizer::fatal_p;
 315
 316 ###############################################################################
 317
 318 =head1 DESCRIPTION
 319
 320 This is an experimental script based on the modularized
 321 text-extract2.pl script.  It has behaviour similar to
 322 xgettext(1), and generates gettext-compatible output files.
 323
 324 A gettext-like format provides the following advantages:
 325
 326 =over
 327
 328 =item -
 329
 330 (Future goal)
 331 Translation to non-English-like languages with different word
 332 order:  gettext's c-format strings can theoretically be
 333 emulated if we are able to do some analysis on the .tmpl input
 334 and treat <TMPL_VAR> in a way similar to %s.
 335
 336 =item -
 337
 338 Context for the extracted strings:  the gettext format provides
 339 the filenames and line numbers where each string can be found.
 340 The translator can read the source file and see the context,
 341 in case the string by itself can mean several different things.
 342
 343 =item -
 344
 345 Place for the translator to add comments about the translations.
 346
 347 =item -
 348
 349 Gettext-compatible tools, if any, might be usable if we adopt
 350 the gettext format.
 351
 352 =back
 353
 354 Note that this script is experimental and should still be
 355 considered unstable.
 356
 357 Please refer to the explanation in tmpl_process3 for further
 358 details.
 359
 360 If you want to generate GNOME-style POTFILES.in files, such
 361 files (passed to -f) can be generated thus:
 362
 363         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 364                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 365         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 366                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 367
 368 This is, however, quite pointless, because the "create" and
 369 "update" actions have already been implemented in tmpl_process3.pl.
 370
 371 =head1 SEE ALSO
 372
 373 tmpl_process.pl,
 374 xgettext(1),
 375 Locale::PO(3),
 376 translator_doc.txt
 377
 378 =head1 BUGS
 379
 380 There probably are some. Bugs related to scanning of <INPUT>
 381 tags seem to be especially likely to be present.
 382
 383 Its diagnostics are probably too verbose.
 384
 385 When a <TMPL_VAR> within a JavaScript-related attribute is
 386 detected, the script currently displays no warnings at all.
 387 It might be good to display some kind of warning.
 388
 389 Its sort order (-s option) seems to be different than the real
 390 xgettext(1)'s sort option. This will result in translation
 391 strings inside the generated PO file spuriously moving about
 392 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 393
 394 =cut