misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use Locale::PO;
  12 use TmplTokenizer;
  13 use VerboseWarnings;
  14
  15 use vars qw( $convert_from );
  16 use vars qw( $files_from $directory $output $sort );
  17 use vars qw( $extract_all_p );
  18 use vars qw( $pedantic_p );
  19 use vars qw( %text %translation );
  20 use vars qw( $charset_in $charset_out );
  21
  22 ###############################################################################
  23
  24 sub string_negligible_p ($) {
  25     my($t) = @_;                                # a string
  26     # Don't emit pure whitespace, pure numbers, pure punctuation,
  27     # single letters, or TMPL_VAR's.
  28     # Punctuation should arguably be translated. But without context
  29     # they are untranslatable. Note that $t is a string, not a token object.
  30     return !$extract_all_p && (
  31                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  32             || $t =~ /^\d+$/                    # purely digits
  33             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  34             || $t =~ /^[A-Za-z]$/               # single letters
  35         )
  36 }
  37
  38 sub token_negligible_p( $ ) {
  39     my($x) = @_;
  40     my $t = $x->type;
  41     return !$extract_all_p && (
  42             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  43             $t == TmplTokenType::DIRECTIVE? 1:
  44             $t == TmplTokenType::TEXT_PARAMETRIZED
  45                 && join( '', map { my $t = $_->type;
  46                         $t == TmplTokenType::DIRECTIVE?
  47                                 '1': $t == TmplTokenType::TAG?
  48                                         '': token_negligible_p( $_ )?
  49                                         '': '1' } @{$x->children} ) eq '' );
  50 }
  51
  52 ###############################################################################
  53
  54 sub remember ($$) {
  55     my($token, $string) = @_;
  56     # If we determine that the string is negligible, don't bother to remember
  57     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  58         my $key = TmplTokenizer::string_canon( $string );
  59         $text{$key} = [] unless defined $text{$key};
  60         push @{$text{$key}}, $token;
  61     }
  62 }
  63
  64 ###############################################################################
  65
  66 sub string_list () {
  67     my @t = keys %text;
  68     # The real gettext tools seems to sort case sensitively; I don't know why
  69     @t = sort { $a cmp $b } @t if $sort eq 's';
  70     @t = sort {
  71             my @aa = sort { $a->pathname cmp $b->pathname
  72                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  73             my @bb = sort { $a->pathname cmp $b->pathname
  74                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  75             $aa[0]->pathname cmp $bb[0]->pathname
  76                     || $aa[0]->line_number <=> $bb[0]->line_number;
  77         } @t if $sort eq 'F';
  78     return @t;
  79 }
  80
  81 ###############################################################################
  82
  83 sub text_extract (*) {
  84     my($h) = @_;
  85     for (;;) {
  86         my $s = TmplTokenizer::next_token $h;
  87     last unless defined $s;
  88         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  89         if ($kind eq TmplTokenType::TEXT) {
  90             remember( $s, $t ) if $t =~ /\S/s;
  91         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  92             remember( $s, $s->form ) if $s->form =~ /\S/s;
  93         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  94             # value [tag=input], meta
  95             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  96             for my $a ('alt', 'content', 'title', 'value') {
  97                 if ($attr->{$a}) {
  98                     next if $a eq 'content' && $tag ne 'meta';
  99                     next if $a eq 'value' && ($tag ne 'input'
 100                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 101                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 102                     $val = TmplTokenizer::trim $val;
 103                     remember( $s, $val ) if $val =~ /\S/s;
 104                 }
 105             }
 106         }
 107     }
 108 }
 109
 110 ###############################################################################
 111
 112 sub generate_strings_list () {
 113     # Emit all extracted strings.
 114     for my $t (string_list) {
 115         printf OUTPUT "%s\n", $t # unless negligible_p($t);
 116     }
 117 }
 118
 119 ###############################################################################
 120
 121 sub generate_po_file () {
 122     # We don't emit the Plural-Forms header; it's meaningless for us
 123     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 124     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 125     print OUTPUT <<EOF;
 126 # SOME DESCRIPTIVE TITLE.
 127 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 128 # This file is distributed under the same license as the PACKAGE package.
 129 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 130 #
 131 #, fuzzy
 132 msgid ""
 133 msgstr ""
 134 "Project-Id-Version: PACKAGE VERSION\\n"
 135 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
 136 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
 137 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 138 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 139 "MIME-Version: 1.0\\n"
 140 "Content-Type: text/plain; charset=$pot_charset\\n"
 141 "Content-Transfer-Encoding: 8bit\\n"
 142
 143 EOF
 144     my $directory_re = quotemeta("$directory/");
 145     for my $t (string_list) {
 146         #next if negligible_p($t);
 147         my $cformat_p;
 148         for my $token (@{$text{$t}}) {
 149             my $pathname = $token->pathname;
 150             $pathname =~ s/^$directory_re//os;
 151             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 152                     if defined $pathname && defined $token->line_number;
 153             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 154         }
 155         printf OUTPUT "#, c-format\n" if $cformat_p;
 156         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 157                 TmplTokenizer::string_canon
 158                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 159         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 160                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 161     }
 162 }
 163
 164 ###############################################################################
 165
 166 sub convert_translation_file () {
 167     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 168     VerboseWarnings::set_input_file_name $convert_from;
 169     while (<INPUT>) {
 170         chomp;
 171         my($msgid, $msgstr) = split(/\t/);
 172         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 173                 unless defined $msgstr;
 174
 175         # Fixup some of the bad strings
 176         $msgid =~ s/^SELECTED>//;
 177
 178         # Create dummy token
 179         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 180         remember( $token, $msgid );
 181         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 182         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 183
 184         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 185             my $candidate = TmplTokenizer::charset_canon $2;
 186             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 187                     if defined $charset_in && $charset_in ne $candidate;
 188             $charset_in = $candidate;
 189         }
 190         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 191             my $candidate = TmplTokenizer::charset_canon $2;
 192             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 193                     if defined $charset_out && $charset_out ne $candidate;
 194             $charset_out = $candidate;
 195         }
 196     }
 197     # The following assumption is correct; that's what HTML::Template assumes
 198     if (!defined $charset_in) {
 199         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 200         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 201     }
 202 }
 203
 204 ###############################################################################
 205
 206 sub usage ($) {
 207     my($exitcode) = @_;
 208     my $h = $exitcode? *STDERR: *STDOUT;
 209     print $h <<EOF;
 210 Usage: $0 [OPTIONS]
 211 Extract translatable strings from given HTML::Template input files.
 212
 213 Input file location:
 214   -f, --files-from=FILE          Get list of input files from FILE
 215   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 216
 217 Output file location:
 218   -o, --output=FILE              Write output to specified file
 219
 220 HTML::Template options:
 221   -a, --extract-all              Extract all strings
 222       --pedantic-warnings        Issue warnings even for detected problems
 223                                  which are likely to be harmless
 224
 225 Output details:
 226   -s, --sort-output              generate sorted output
 227   -F, --sort-by-file             sort output by file location
 228
 229 Informative output:
 230       --help                     Display this help and exit
 231 EOF
 232     exit($exitcode);
 233 }
 234
 235 ###############################################################################
 236
 237 sub usage_error (;$) {
 238     print STDERR "$_[0]\n" if @_;
 239     print STDERR "Try `$0 --help' for more information.\n";
 240     exit(-1);
 241 }
 242
 243 ###############################################################################
 244
 245 Getopt::Long::config qw( bundling no_auto_abbrev );
 246 GetOptions(
 247     'a|extract-all'                     => \$extract_all_p,
 248     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 249     'convert-from=s'                    => \$convert_from,
 250     'D|directory=s'                     => \$directory,
 251     'f|files-from=s'                    => \$files_from,
 252     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 253     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 254     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 255     'output|o=s'                        => \$output,
 256     's|sort-output'                     => sub { $sort = 's' },
 257     'F|sort-by-file'                    => sub { $sort = 'F' },
 258     'help'                              => sub { usage(0) },
 259 ) || usage_error;
 260
 261 VerboseWarnings::set_application_name $0;
 262 VerboseWarnings::set_pedantic_mode $pedantic_p;
 263
 264 usage_error('Missing mandatory option -f')
 265         unless defined $files_from || defined $convert_from;
 266 $directory = '.' unless defined $directory;
 267
 268 usage_error('You cannot specify both --convert-from and --files-from')
 269         if defined $convert_from && defined $files_from;
 270
 271 if (defined $output && $output ne '-') {
 272     open(OUTPUT, ">$output") || die "$output: $!\n";
 273 } else {
 274     open(OUTPUT, ">&STDOUT");
 275 }
 276
 277 if (defined $files_from) {
 278     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 279     while (<INPUT>) {
 280         chomp;
 281         my $h = TmplTokenizer->new( "$directory/$_" );
 282         $h->set_allow_cformat( 1 );
 283         VerboseWarnings::set_input_file_name "$directory/$_";
 284         text_extract( $h );
 285     }
 286     close INPUT;
 287 } else {
 288     convert_translation_file;
 289 }
 290 generate_po_file;
 291
 292 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 293         if TmplTokenizer::syntaxerror_p;
 294
 295
 296 exit(-1) if TmplTokenizer::fatal_p;
 297
 298 ###############################################################################
 299
 300 =head1 DESCRIPTION
 301
 302 This is an experimental script based on the modularized
 303 text-extract2.pl script.  It has behaviour similar to
 304 xgettext(1), and generates gettext-compatible output files.
 305
 306 A gettext-like format provides the following advantages:
 307
 308 =over
 309
 310 =item -
 311
 312 (Future goal)
 313 Translation to non-English-like languages with different word
 314 order:  gettext's c-format strings can theoretically be
 315 emulated if we are able to do some analysis on the .tmpl input
 316 and treat <TMPL_VAR> in a way similar to %s.
 317
 318 =item -
 319
 320 Context for the extracted strings:  the gettext format provides
 321 the filenames and line numbers where each string can be found.
 322 The translator can read the source file and see the context,
 323 in case the string by itself can mean several different things.
 324
 325 =item -
 326
 327 Place for the translator to add comments about the translations.
 328
 329 =item -
 330
 331 Gettext-compatible tools, if any, might be usable if we adopt
 332 the gettext format.
 333
 334 =back
 335
 336 Note that this script is experimental and should still be
 337 considered unstable.
 338
 339 Please refer to the explanation in tmpl_process3 for further
 340 details.
 341
 342 If you want to generate GNOME-style POTFILES.in files, such
 343 files (passed to -f) can be generated thus:
 344
 345         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 346                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 347         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 348                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 349
 350 This is, however, quite pointless, because the "create" and
 351 "update" actions have already been implemented in tmpl_process3.pl.
 352
 353 =head1 SEE ALSO
 354
 355 tmpl_process.pl,
 356 xgettext(1),
 357 Locale::PO(3),
 358 translator_doc.txt
 359
 360 =head1 BUGS
 361
 362 There probably are some. Bugs related to scanning of <INPUT>
 363 tags seem to be especially likely to be present.
 364
 365 Its diagnostics are probably too verbose.
 366
 367 When a <TMPL_VAR> within a JavaScript-related attribute is
 368 detected, the script currently displays no warnings at all.
 369 It might be good to display some kind of warning.
 370
 371 Its sort order (-s option) seems to be different than the real
 372 xgettext(1)'s sort option. This will result in translation
 373 strings inside the generated PO file spuriously moving about
 374 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 375
 376 =cut