misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use POSIX;
  12 use Locale::PO;
  13 use TmplTokenizer;
  14 use VerboseWarnings;
  15
  16 use vars qw( $convert_from );
  17 use vars qw( $files_from $directory $output $sort );
  18 use vars qw( $extract_all_p );
  19 use vars qw( $pedantic_p );
  20 use vars qw( %text %translation );
  21 use vars qw( $charset_in $charset_out );
  22 use vars qw( $verbose_p );
  23 use vars qw( $po_mode_p );
  24
  25 ###############################################################################
  26
  27 sub string_negligible_p ($) {
  28     my($t) = @_;                                # a string
  29     # Don't emit pure whitespace, pure numbers, pure punctuation,
  30     # single letters, or TMPL_VAR's.
  31     # Punctuation should arguably be translated. But without context
  32     # they are untranslatable. Note that $t is a string, not a token object.
  33     return !$extract_all_p && (
  34                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  35             || $t =~ /^\d+$/                    # purely digits
  36             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  37             || $t =~ /^[A-Za-z]$/               # single letters
  38         )
  39 }
  40
  41 sub token_negligible_p( $ ) {
  42     my($x) = @_;
  43     my $t = $x->type;
  44     return !$extract_all_p && (
  45             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  46             $t == TmplTokenType::DIRECTIVE? 1:
  47             $t == TmplTokenType::TEXT_PARAMETRIZED
  48                 && join( '', map { my $t = $_->type;
  49                         $t == TmplTokenType::DIRECTIVE?
  50                                 '1': $t == TmplTokenType::TAG?
  51                                         '': token_negligible_p( $_ )?
  52                                         '': '1' } @{$x->children} ) eq '' );
  53 }
  54
  55 ###############################################################################
  56
  57 sub remember ($$) {
  58     my($token, $string) = @_;
  59     # If we determine that the string is negligible, don't bother to remember
  60     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  61         my $key = TmplTokenizer::string_canon( $string );
  62         $text{$key} = [] unless defined $text{$key};
  63         push @{$text{$key}}, $token;
  64     }
  65 }
  66
  67 ###############################################################################
  68
  69 sub string_list () {
  70     my @t = keys %text;
  71     # The real gettext tools seems to sort case sensitively; I don't know why
  72     @t = sort { $a cmp $b } @t if $sort eq 's';
  73     @t = sort {
  74             my @aa = sort { $a->pathname cmp $b->pathname
  75                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  76             my @bb = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  78             $aa[0]->pathname cmp $bb[0]->pathname
  79                     || $aa[0]->line_number <=> $bb[0]->line_number;
  80         } @t if $sort eq 'F';
  81     return @t;
  82 }
  83
  84 ###############################################################################
  85
  86 sub text_extract (*) {
  87     my($h) = @_;
  88     for (;;) {
  89         my $s = TmplTokenizer::next_token $h;
  90     last unless defined $s;
  91         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  92         if ($kind eq TmplTokenType::TEXT) {
  93             remember( $s, $t ) if $t =~ /\S/s;
  94         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  95             remember( $s, $s->form ) if $s->form =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  97             # value [tag=input], meta
  98             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  99             for my $a ('alt', 'content', 'title', 'value') {
 100                 if ($attr->{$a}) {
 101                     next if $a eq 'content' && $tag ne 'meta';
 102                     next if $a eq 'value' && ($tag ne 'input'
 103                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 104                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 105                     $val = TmplTokenizer::trim $val;
 106                     remember( $s, $val ) if $val =~ /\S/s;
 107                 }
 108             }
 109         }
 110     }
 111 }
 112
 113 ###############################################################################
 114
 115 sub generate_strings_list () {
 116     # Emit all extracted strings.
 117     for my $t (string_list) {
 118         printf OUTPUT "%s\n", $t # unless negligible_p($t);
 119     }
 120 }
 121
 122 ###############################################################################
 123
 124 sub generate_po_file () {
 125     # We don't emit the Plural-Forms header; it's meaningless for us
 126     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 127     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 128     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 129     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 130     my $time_pot = $time;
 131     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 132     print OUTPUT <<EOF;
 133 # SOME DESCRIPTIVE TITLE.
 134 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 135 # This file is distributed under the same license as the PACKAGE package.
 136 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 137 #
 138 #, fuzzy
 139 msgid ""
 140 msgstr ""
 141 "Project-Id-Version: PACKAGE VERSION\\n"
 142 "POT-Creation-Date: $time_pot\\n"
 143 "PO-Revision-Date: $time_po\\n"
 144 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 145 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 146 "MIME-Version: 1.0\\n"
 147 "Content-Type: text/plain; charset=$pot_charset\\n"
 148 "Content-Transfer-Encoding: 8bit\\n"
 149
 150 EOF
 151     my $directory_re = quotemeta("$directory/");
 152     for my $t (string_list) {
 153         #next if negligible_p($t);
 154         my $cformat_p;
 155         for my $token (@{$text{$t}}) {
 156             my $pathname = $token->pathname;
 157             $pathname =~ s/^$directory_re//os;
 158             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 159                     if defined $pathname && defined $token->line_number;
 160             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 161         }
 162         printf OUTPUT "#, c-format\n" if $cformat_p;
 163         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 164                 TmplTokenizer::string_canon
 165                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 166         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 167                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 168     }
 169 }
 170
 171 ###############################################################################
 172
 173 sub convert_translation_file () {
 174     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 175     VerboseWarnings::set_input_file_name $convert_from;
 176     while (<INPUT>) {
 177         chomp;
 178         my($msgid, $msgstr) = split(/\t/);
 179         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 180                 unless defined $msgstr;
 181
 182         # Fixup some of the bad strings
 183         $msgid =~ s/^SELECTED>//;
 184
 185         # Create dummy token
 186         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 187         remember( $token, $msgid );
 188         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 189         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 190
 191         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 192             my $candidate = TmplTokenizer::charset_canon $2;
 193             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 194                     if defined $charset_in && $charset_in ne $candidate;
 195             $charset_in = $candidate;
 196         }
 197         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 198             my $candidate = TmplTokenizer::charset_canon $2;
 199             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 200                     if defined $charset_out && $charset_out ne $candidate;
 201             $charset_out = $candidate;
 202         }
 203     }
 204     # The following assumption is correct; that's what HTML::Template assumes
 205     if (!defined $charset_in) {
 206         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 207         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 208     }
 209 }
 210
 211 ###############################################################################
 212
 213 sub usage ($) {
 214     my($exitcode) = @_;
 215     my $h = $exitcode? *STDERR: *STDOUT;
 216     print $h <<EOF;
 217 Usage: $0 [OPTIONS]
 218 Extract translatable strings from given HTML::Template input files.
 219
 220 Input file location:
 221   -f, --files-from=FILE          Get list of input files from FILE
 222   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 223
 224 Output file location:
 225   -o, --output=FILE              Write output to specified file
 226
 227 HTML::Template options:
 228   -a, --extract-all              Extract all strings
 229       --pedantic-warnings        Issue warnings even for detected problems
 230                                  which are likely to be harmless
 231
 232 Output details:
 233   -s, --sort-output              generate sorted output
 234   -F, --sort-by-file             sort output by file location
 235   -v, --verbose                  explain what is being done
 236
 237 Informative output:
 238       --help                     Display this help and exit
 239 EOF
 240     exit($exitcode);
 241 }
 242
 243 ###############################################################################
 244
 245 sub usage_error (;$) {
 246     print STDERR "$_[0]\n" if @_;
 247     print STDERR "Try `$0 --help' for more information.\n";
 248     exit(-1);
 249 }
 250
 251 ###############################################################################
 252
 253 Getopt::Long::config qw( bundling no_auto_abbrev );
 254 GetOptions(
 255     'a|extract-all'                     => \$extract_all_p,
 256     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 257     'convert-from=s'                    => \$convert_from,
 258     'D|directory=s'                     => \$directory,
 259     'f|files-from=s'                    => \$files_from,
 260     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 261     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 262     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 263     'output|o=s'                        => \$output,
 264     'po-mode'                           => \$po_mode_p,         # INTERNAL
 265     's|sort-output'                     => sub { $sort = 's' },
 266     'F|sort-by-file'                    => sub { $sort = 'F' },
 267     'v|verbose'                         => \$verbose_p,
 268     'help'                              => sub { usage(0) },
 269 ) || usage_error;
 270
 271 VerboseWarnings::set_application_name $0;
 272 VerboseWarnings::set_pedantic_mode $pedantic_p;
 273
 274 usage_error('Missing mandatory option -f')
 275         unless defined $files_from || defined $convert_from;
 276 $directory = '.' unless defined $directory;
 277
 278 usage_error('You cannot specify both --convert-from and --files-from')
 279         if defined $convert_from && defined $files_from;
 280
 281 if (defined $output && $output ne '-') {
 282     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 283     open(OUTPUT, ">$output") || die "$output: $!\n";
 284 } else {
 285     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 286     open(OUTPUT, ">&STDOUT");
 287 }
 288
 289 if (defined $files_from) {
 290     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 291     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 292     while (<INPUT>) {
 293         chomp;
 294         my $input = /^\//? $_: "$directory/$_";
 295         my $h = TmplTokenizer->new( $input );
 296         $h->set_allow_cformat( 1 );
 297         VerboseWarnings::set_input_file_name $input;
 298         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 299         text_extract( $h );
 300     }
 301     close INPUT;
 302 } else {
 303     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 304     convert_translation_file;
 305 }
 306 generate_po_file;
 307
 308 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 309         if TmplTokenizer::syntaxerror_p;
 310
 311
 312 exit(-1) if TmplTokenizer::fatal_p;
 313
 314 ###############################################################################
 315
 316 =head1 DESCRIPTION
 317
 318 This is an experimental script based on the modularized
 319 text-extract2.pl script.  It has behaviour similar to
 320 xgettext(1), and generates gettext-compatible output files.
 321
 322 A gettext-like format provides the following advantages:
 323
 324 =over
 325
 326 =item -
 327
 328 (Future goal)
 329 Translation to non-English-like languages with different word
 330 order:  gettext's c-format strings can theoretically be
 331 emulated if we are able to do some analysis on the .tmpl input
 332 and treat <TMPL_VAR> in a way similar to %s.
 333
 334 =item -
 335
 336 Context for the extracted strings:  the gettext format provides
 337 the filenames and line numbers where each string can be found.
 338 The translator can read the source file and see the context,
 339 in case the string by itself can mean several different things.
 340
 341 =item -
 342
 343 Place for the translator to add comments about the translations.
 344
 345 =item -
 346
 347 Gettext-compatible tools, if any, might be usable if we adopt
 348 the gettext format.
 349
 350 =back
 351
 352 Note that this script is experimental and should still be
 353 considered unstable.
 354
 355 Please refer to the explanation in tmpl_process3 for further
 356 details.
 357
 358 If you want to generate GNOME-style POTFILES.in files, such
 359 files (passed to -f) can be generated thus:
 360
        (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
                -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
        (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
                -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 365
 366 This is, however, quite pointless, because the "create" and
 367 "update" actions have already been implemented in tmpl_process3.pl.
 368
 369 =head1 SEE ALSO
 370
 371 tmpl_process.pl,
 372 xgettext(1),
 373 Locale::PO(3),
 374 translator_doc.txt
 375
 376 =head1 BUGS
 377
 378 There probably are some. Bugs related to scanning of <INPUT>
 379 tags seem to be especially likely to be present.
 380
 381 Its diagnostics are probably too verbose.
 382
 383 When a <TMPL_VAR> within a JavaScript-related attribute is
 384 detected, the script currently displays no warnings at all.
 385 It might be good to display some kind of warning.
 386
 387 Its sort order (-s option) seems to be different than the real
 388 xgettext(1)'s sort option. This will result in translation
 389 strings inside the generated PO file spuriously moving about
 390 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 391
 392 =cut