From 8b57901d85938321f635f30bd6aaaa257846d162 Mon Sep 17 00:00:00 2001
From: acli <acli>
Date: Thu, 19 Feb 2004 21:24:30 +0000
Subject: [PATCH] New scripts for translation into Chinese and other languages
 where English word order is too different than the word order of the target
 language to yield meaningful translations.

The new scripts use a different translation file format (namely standard
gettext-style PO files).

This seems to reasonably work (e.g., producing an empty en_GB translation
then installing seems to not corrupt the "translated" files), but it likely
will still contain some bugs. There is also little documentation, but try
to run perldoc on the .p[lm] files to see what's there. There are also some
spurious warnings (both from bugs in the new scripts and from buggy third-
party Locale::PO module).
---
 misc/translator/TmplToken.pm     |  33 ++++
 misc/translator/TmplTokenType.pm |  87 +++++++--
 misc/translator/TmplTokenizer.pm | 232 ++++++++++++++++++++----
 misc/translator/text-extract2.pl |  31 +++-
 misc/translator/tmpl_process3.pl | 300 +++++++++++++++++++++++++++++++
 misc/translator/xgettext.pl      | 289 +++++++++++++++++++++++++++++
 6 files changed, 918 insertions(+), 54 deletions(-)
 create mode 100755 misc/translator/tmpl_process3.pl
 create mode 100755 misc/translator/xgettext.pl
diff --git a/misc/translator/TmplToken.pm b/misc/translator/TmplToken.pm
index fb04019165..854a70a304 100644
--- a/misc/translator/TmplToken.pm
+++ b/misc/translator/TmplToken.pm
@@ -67,6 +67,39 @@ sub set_attributes {
     return $this;
 }
 
+# only meaningful for TEXT_PARAMETRIZED tokens
+sub children {
+    my $this = shift;
+    return $this->{'_kids'};
+}
+
+# only meaningful for TEXT_PARAMETRIZED tokens
+sub set_children {
+    my $this = shift;
+    $this->{'_kids'} = ref $_[0] eq 'ARRAY'? $_[0]: \@_;
+    return $this;
+}
+
+# only meaningful for TEXT_PARAMETRIZED tokens
+# FIXME: DIRECTIVE is not necessarily TMPL_VAR !!
+sub parameters {
+    my $this = shift;
+    return map { $_->type == TmplTokenType::DIRECTIVE? $_: ()} @{$this->{'_kids'}};
+}
+
+# only meaningful for TEXT_PARAMETRIZED tokens
+sub form {
+    my $this = shift;
+    return $this->{'_form'};
+}
+
+# only meaningful for TEXT_PARAMETRIZED tokens
+sub set_form {
+    my $this = shift;
+    $this->{'_form'} = $_[0];
+    return $this;
+}
+
 ###############################################################################
 
 1;
diff --git a/misc/translator/TmplTokenType.pm b/misc/translator/TmplTokenType.pm
index 9616f1379a..75d4f5256a 100644
--- a/misc/translator/TmplTokenType.pm
+++ b/misc/translator/TmplTokenType.pm
@@ -14,6 +14,7 @@ TmplTokenType.pm - Types of TmplToken objects
 =head1 DESCRIPTION
 
 This is a Java-style "safe enum" singleton class for types of TmplToken objects.
+The predefined constants are
 
 =cut
 
@@ -24,6 +25,7 @@ $VERSION = 0.01;
 @ISA = qw(Exporter);
 @EXPORT_OK = qw(
     &TEXT
+    &TEXT_PARAMETRIZED
     &CDATA
     &TAG
     &DECL
@@ -35,7 +37,8 @@ $VERSION = 0.01;
 
 ###############################################################################
 
-use vars qw( $_text $_cdata $_tag $_decl $_pi $_directive $_comment $_unknown );
+use vars qw( $_text $_text_parametrized $_cdata
+    $_tag $_decl $_pi $_directive $_comment $_null $_unknown );
 
 BEGIN {
     my $new = sub {
@@ -46,14 +49,15 @@ BEGIN {
 	($self->{'id'}, $self->{'name'}, $self->{'desc'}) = @_;
 	return $self;
     };
-    $_text	= &$new(0, 'TEXT');
-    $_cdata	= &$new(1, 'CDATA');
-    $_tag	= &$new(2, 'TAG');
-    $_decl	= &$new(3, 'DECL');
-    $_pi	= &$new(4, 'PI');
-    $_directive	= &$new(5, 'DIRECTIVE');
-    $_comment	= &$new(6, 'COMMENT');
-    $_unknown	= &$new(7, 'UNKNOWN');
+    $_text		= &$new(0, 'TEXT');
+    $_text_parametrized	= &$new(8, 'TEXT-PARAMETRIZED');
+    $_cdata		= &$new(1, 'CDATA');
+    $_tag		= &$new(2, 'TAG');
+    $_decl		= &$new(3, 'DECL');
+    $_pi		= &$new(4, 'PI');
+    $_directive		= &$new(5, 'DIRECTIVE');
+    $_comment		= &$new(6, 'COMMENT');
+    $_unknown		= &$new(7, 'UNKNOWN');
 }
 
 sub to_string {
@@ -61,15 +65,64 @@ sub to_string {
     return $this->{'name'}
 }
 
-sub TEXT	() { $_text }
-sub CDATA	() { $_cdata }
-sub TAG		() { $_tag }
-sub DECL	() { $_decl }
-sub PI		() { $_pi }
-sub DIRECTIVE	() { $_directive }
-sub COMMENT	() { $_comment }
-sub UNKNOWN	() { $_unknown }
+sub TEXT		() { $_text }
+sub TEXT_PARAMETRIZED	() { $_text_parametrized }
+sub CDATA		() { $_cdata }
+sub TAG			() { $_tag }
+sub DECL		() { $_decl }
+sub PI			() { $_pi }
+sub DIRECTIVE		() { $_directive }
+sub COMMENT		() { $_comment }
+sub UNKNOWN		() { $_unknown }
 
 ###############################################################################
 
+=over
+
+=item TEXT
+
+normal text (#text in the DTD)
+
+=item TEXT_PARAMETRIZED
+
+parametrized normal text
+(result of simple recognition of text interspersed with <TMPL_VAR> directives;
+this has to be explicitly enabled in the scanner)
+
+=item CDATA
+
+normal text (CDATA in the DTD)
+
+=item TAG
+
+something that has the form of an HTML tag
+
+=item DECL
+
+something that has the form of an SGML declaration
+
+=item PI
+
+something that has the form of an SGML processing instruction
+
+=item DIRECTIVE
+
+a HTML::Template directive (whether or not embedded in an SGML comment)
+
+=item COMMENT
+
+something that has the form of an HTML comment
+(and is not recognized as an HTML::Template directive)
+
+=item UNKNOWN
+
+something that is not recognized at all by the scanner
+
+=back
+
+Note that end of file is currently represented by undef,
+instead of a constant predefined by this module.
+
+=cut
+
 1;
diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm
index bb37eaa10e..f3ff5b62a9 100644
--- a/misc/translator/TmplTokenizer.pm
+++ b/misc/translator/TmplTokenizer.pm
@@ -22,11 +22,6 @@ it might be better to create a customized scanner
 to scan the template files for tokens.
 This module is a simple-minded attempt at such a scanner.
 
-=head1 HISTORY
-
-This tokenizer is mostly based
-on Ambrose's hideous Perl script known as subst.pl.
-
 =cut
 
 ###############################################################################
@@ -97,6 +92,8 @@ sub LINENUM		() {'lc'}
 sub CDATA_MODE_P	() {'cdata-mode-p'}
 sub CDATA_CLOSE		() {'cdata-close'}
 
+sub ALLOW_CFORMAT_P	() {'allow-cformat-p'}
+
 sub new {
     my $this = shift;
     my($input) = @_;
@@ -170,6 +167,11 @@ sub cdata_close {
     return $this->{+CDATA_CLOSE};
 }
 
+sub allow_cformat_p {
+    my $this = shift;
+    return $this->{+ALLOW_CFORMAT_P};
+}
+
 # Simple setters
 
 sub _set_fatal {
@@ -231,6 +233,12 @@ sub _set_cdata_close {
     return $this;
 }
 
+sub set_allow_cformat {
+    my $this = shift;
+    $this->{+ALLOW_CFORMAT_P} = $_[0];
+    return $this;
+}
+
 ###############################################################################
 
 sub _extract_attributes ($;$) {
@@ -305,7 +313,7 @@ sub _next_token_internal {
 	($kind, $it) = (TmplTokenType::TEXT, $&);
 	$this->_set_readahead( $' );
     # FIXME the following (the [<\s] part) is an unreliable HACK :-(
-    } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])+/s) {	# non-space normal text
+    } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) {	# non-space normal text
 	($kind, $it) = (TmplTokenType::TEXT, $&);
 	$this->_set_readahead( $' );
 	warn_normal "Unescaped < in $it\n", $this->line_number_start
@@ -323,10 +331,17 @@ sub _next_token_internal {
 		    $ok_p = 1;
 		}
 	    } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
-		($kind, $it) = (TmplTokenType::TAG, "$1>");
-		$this->_set_readahead( $3 );
-		$ok_p = 1;
-		warn_normal "SGML \"closed start tag\" notation: $1<\n", $this->line_number_start if $2 eq '';
+		# If we detect a "closed start tag" but we know that the
+		# following token looks like a TMPL_VAR, don't stop
+		my($head, $tail, $post) = ($1, $2, $3);
+		if ($tail eq '' && $post =~ $re_tmpl_var) {
+		    warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number;
+		} else {
+		    ($kind, $it) = (TmplTokenType::TAG, "$head>");
+		    $this->_set_readahead( $post );
+		    $ok_p = 1;
+		    warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
+		}
 	    } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
 		($kind, $it) = (TmplTokenType::COMMENT, $&);
 		$this->_set_readahead( $' );
@@ -347,7 +362,7 @@ sub _next_token_internal {
 	    $kind = TmplTokenType::DECL;
 	    $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
 	    if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
-		warn_normal "Apache #include directive found instead of HTML::Template directive <TMPL_INCLUDE>", $this->line_number_start;
+		warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
 	    }
 	} elsif ($it =~ /^<\?/) {
 	    $kind = TmplTokenType::PI;
@@ -362,17 +377,17 @@ sub _next_token_internal {
 	}
     }
     warn_normal "Unrecognizable token found: $it\n", $this->line_number_start
-	    if $kind eq TmplTokenType::UNKNOWN;
+	    if $kind == TmplTokenType::UNKNOWN;
     return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
 }
 
-sub next_token {
+sub _next_token_intermediate {
     my $this = shift;
     my $h = $this->_handle;
     my $it;
     if (!$this->cdata_mode_p) {
 	$it = $this->_next_token_internal($h);
-	if (defined $it && $it->type eq TmplTokenType::TAG) {
+	if (defined $it && $it->type == TmplTokenType::TAG) {
 	    if ($it->string =~ /^<(script|style|textarea)\b/i) {
 		$this->_set_cdata_mode( 1 );
 		$this->_set_cdata_close( "</$1\\s*>" );
@@ -396,9 +411,101 @@ sub next_token {
     return $it;
 }
 
+sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token
+    my($t) = @_;
+    return $t->type == TmplTokenType::TEXT
+	|| ($t->type == TmplTokenType::DIRECTIVE
+		&& $t->string =~ /^(?:$re_tmpl_var)$/os)
+	|| ($t->type == TmplTokenType::TAG
+		&& ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is
+		|| ($t->string =~ /^<input/i
+		    && $t->attributes->{'type'} =~ /^(?:text)$/i)))
+}
+
+sub _quote_cformat ($) {
+    my($s) = @_;
+    $s =~ s/%/%%/g;
+    return $s;
+}
+
+sub _formalize ($) {
+    my($t) = @_;
+    return $t->type == TmplTokenType::DIRECTIVE? '%s': _quote_cformat($t->string);
+}
+
+sub next_token {
+    my $this = shift;
+    my $h = $this->_handle;
+    my $it;
+    $this->{_queue} = [] unless defined $this->{_queue};
+    if (@{$this->{_queue}}) {
+	$it = pop @{$this->{_queue}};
+    } else {
+	$it = $this->_next_token_intermediate($h);
+	if ($this->allow_cformat_p && defined $it
+	    && ($it->type == TmplTokenType::TEXT?
+		!blank_p( $it->string ): _token_groupable_p( $it ))) {
+	    my @structure = ( $it );
+	    my($n_trailing_spaces, $next) = (0, undef);
+	    my($nonblank_text_p, $parametrized_p, $next) = (0, 0);
+	    if ($it->type == TmplTokenType::TEXT) {
+		$nonblank_text_p = 1 if !blank_p( $it->string );
+	    } elsif ($it->type == TmplTokenType::DIRECTIVE) {
+		$parametrized_p = 1;
+	    }
+	    for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) {
+		$next = $this->_next_token_intermediate($h);
+		push @structure, $next; # for consistency (with initialization)
+	    last unless defined $next && _token_groupable_p( $next );
+		if ($next->type == TmplTokenType::TEXT) {
+		    if (blank_p( $next->string )) {
+			$n_trailing_spaces += 1;
+		    } else {
+			($n_trailing_spaces, $nonblank_text_p) = (0, 1);
+		    }
+		} elsif ($next->type == TmplTokenType::DIRECTIVE) {
+		    $n_trailing_spaces = 0;
+		    $parametrized_p = 1;
+		} else {
+		    $n_trailing_spaces = 0;
+		}
+	    }
+	    # Undo the last token
+	    push @{$this->{_queue}}, pop @structure;
+	    # Undo trailing blank tokens
+	    for (my $i = 0; $i < $n_trailing_spaces; $i += 1) {
+		push @{$this->{_queue}}, pop @structure;
+	    }
+	    if (@structure < 2) {
+		# Nothing to do
+		;
+	    } elsif ($nonblank_text_p && $parametrized_p) {
+		# Create the corresponding c-format string
+		my $string = join('', map { $_->string } @structure);
+		my $form = join('', map { _formalize $_ } @structure);
+		$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
+		$it->set_form( $form );
+		$it->set_children( @structure );
+#	    } elsif ($nonblank_text_p) {
+#		# Combine the strings
+#		my $string = join('', map { $_->string } @structure);
+#		;
+	    } else {
+		# Requeue the tokens thus seen for re-emitting
+		for (;;) {
+		    push @{$this->{_queue}}, pop @structure;
+		last if !@structure;
+		}
+		$it = pop @{$this->{_queue}};
+	    }
+	}
+    }
+    return $it;
+}
+
 ###############################################################################
 
-# Other easy functions
+# Other simple functions (These are not methods)
 
 sub blank_p ($) {
     my($s) = @_;
@@ -414,26 +521,91 @@ sub trim ($) {
     return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
 }
 
+sub quote_po ($) {
+    my($s) = @_;
+    # Locale::PO->quote is buggy, it doesn't quote newlines :-/
+    $s =~ s/([\\"])/\\\1/gs;
+    $s =~ s/\n/\\n/g;
+    return "\"$s\"";
+}
+
+# Complication function that shouldn't be here
+sub parametrize ($@) {
+    my($fmt, @params) = @_;
+    my $it = '';
+    for (my $n = 0; length $fmt;) {
+	if ($fmt =~ /^[^%]+/) {
+	    $fmt = $';
+	    $it .= $&;
+	} elsif ($fmt =~ /^%%/) {
+	    $fmt = $';
+	    $it .= '%';
+	} elsif ($fmt =~ /^%(?:(\d+)\$)?s/) {
+	    $n += 1;
+	    my $i = (defined $1? $1: $n) - 1;
+	    $fmt = $';
+	    $it .= $params[$i]
+	} elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
+	    $fmt = $';
+	    $it .= $&;
+	    die "Unknown or unsupported format specification: $&\n"; #XXX
+	} else {
+	    die "Completely confused parametrizing: $fmt\n";#XXX
+	}
+    }
+    return $it;
+}
+
 ###############################################################################
 
-=head1 FUTURE PLANS
+=pod
+
+In addition to the basic scanning, this class will also perform
+the following:
+
+=over
+
+=item -
+
+Emulation of c-format strings (see below)
+
+=item -
+
+Display of warnings for certain things that affects either the
+ability of this class to yield correct output, or things that
+are known to cause the original template to cause trouble.
+
+=item -
 
-Code could be written to detect template variables and
-construct gettext-c-format-string-like meta-strings (e.g., "Results %s
-through %s of %s records" that will be more likely to be translatable
-to languages where word order is very unlike English word order.
-This will be relatively major rework, requiring corresponding
-rework in tmpl_process.pl
+Automatic correction of some of the things warned about
+(e.g., SGML "closed start tag" notation).
 
-Gettext-style line number references would also be very helpful in
-disambiguating the strings. Ultimately, we should generate and work
-with gettext-style po files, so that translators are able to use
-tools designed for gettext.
+=back
 
-An example of a string untranslatable to Chinese is "Accounts for";
-"Accounts for %s", however, would be translatable. Short words like
-"in" would also be untranslatable, not only to Chinese, but also to
-languages requiring declension of nouns.
+=head2 c-format strings emulation
+
+Because English word order is not universal, a simple extraction
+of translatable strings may yield some strings like "Accounts for"
+or ambiguous strings like "in". This makes the resulting strings
+difficult to translate, but does not affect all languages alike.
+For example, Chinese (with a somewhat different word order) would
+be hit harder, but French would be relatively unaffected.
+
+To overcome this problem, the scanner can be configured to detect
+patterns with <TMPL_VAR> directives (as well as certain HTML tags),
+and try to construct a larger pattern that will appear in the PO
+file as c-format strings with %s placeholders. This additional
+step allows the translator to deal with cases where word order
+is different (replacing %s with %1$s, %2$s, etc.), or when certain
+words will require certain inflectional suffixes in sentences.
+
+Because this is an incompatible change, this mode must be explicitly
+turned on using the set_cformat(1) method call.
+
+=head1 HISTORY
+
+This tokenizer is mostly based
+on Ambrose's hideous Perl script known as subst.pl.
 
 =cut
 
diff --git a/misc/translator/text-extract2.pl b/misc/translator/text-extract2.pl
index d67cc21875..52dc9909a8 100755
--- a/misc/translator/text-extract2.pl
+++ b/misc/translator/text-extract2.pl
@@ -22,9 +22,15 @@ use strict;
 use vars qw( $input );
 use vars qw( $debug_dump_only_p );
 use vars qw( $pedantic_p );
+use vars qw( $allow_cformat_p ); # FOR TESTING PURPOSES ONLY!!
 
 ###############################################################################
 
+sub underline ($) { # for testing only
+    my($s) = @_;
+    join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $s));
+}
+
 sub debug_dump ($) { # for testing only
     my($h) = @_;
     print "re_tag_compat is /", TmplTokenizer::re_tag(1), "/\n";
@@ -34,17 +40,26 @@ sub debug_dump ($) { # for testing only
 	printf "%s\n", ('-' x 79);
 	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 	printf "%s [line %d]:\n", $kind->to_string, $s->line_number;
-	printf "%4dH%s\n", length($t),
-		join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $t));
-	if ($kind eq TmplTokenType::TAG && %$attr) {
+	printf "%4dH%s\n", length($t), underline($t);
+	if ($kind == TmplTokenType::TAG && %$attr) {
 	    printf "Attributes:\n";
 	    for my $a (keys %$attr) {
 		my($key, $val, $val_orig, $order) = @{$attr->{$a}};
-		printf "%s = %dH%s -- %s\n", $a, length $val,
-		join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $val)),
+		printf "%s = %dH%s -- %s\n", $a, length $val, underline $val,
 		$val_orig;
 	    }
 	}
+	if ($kind == TmplTokenType::TEXT_PARAMETRIZED) {
+	    printf "Form (c-format string):\n";
+	    printf "%dH%s\n", length $s->form, underline $s->form;
+	    printf "Parameters:\n";
+	    my $i = 1;
+	    for my $a ($s->parameters) {
+		my $t = $a->string;
+		printf "%%%d\$s = %dH%s\n", $i, length $t, underline $t;
+		$i += 1;
+	    }
+	}
     }
 }
 
@@ -57,10 +72,10 @@ sub text_extract ($) {
 	my $s = TmplTokenizer::next_token $h;
     last unless defined $s;
 	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
-	if ($kind eq TmplTokenType::TEXT) {
+	if ($kind == TmplTokenType::TEXT) {
 	    $t = TmplTokenizer::trim $t;
 	    $text{$t} = 1 if $t =~ /\S/s;
-	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
+	} elsif ($kind == TmplTokenType::TAG && %$attr) {
 	    # value [tag=input], meta
 	    my $tag = lc($1) if $t =~ /^<(\S+)/s;
 	    for my $a ('alt', 'content', 'title', 'value') {
@@ -112,6 +127,7 @@ sub usage_error (;$) {
 ###############################################################################
 
 GetOptions(
+    'enable-cformat'	=> \$allow_cformat_p,
     'f|file=s'		=> \$input,
     'debug-dump-only'	=> \$debug_dump_only_p,
     'pedantic-warnings'	=> sub { $pedantic_p = 1 },
@@ -125,6 +141,7 @@ VerboseWarnings::set_pedantic_mode $pedantic_p;
 usage_error('Missing mandatory option -f') unless defined $input;
 
 my $h = TmplTokenizer->new( $input );
+$h->set_allow_cformat( 1 ) if $allow_cformat_p;
 if ($debug_dump_only_p) {
     debug_dump( $h );
 } else {
diff --git a/misc/translator/tmpl_process3.pl b/misc/translator/tmpl_process3.pl
new file mode 100755
index 0000000000..c12df6177d
--- /dev/null
+++ b/misc/translator/tmpl_process3.pl
@@ -0,0 +1,300 @@
+#!/usr/bin/perl
+# This file is part of Koha
+# Parts copyright 2003-2004 Paul Poulain
+# Parts copyright 2003-2004 Jerome Vizcaino
+# Parts copyright 2004 Ambrose Li
+
+=head1 NAME
+
+tmpl_process3.pl - Experimental version of tmpl_process.pl
+using gettext-compatible translation files
+
+=cut
+
+use strict;
+use Getopt::Long;
+use Locale::PO;
+use File::Temp qw( :POSIX );
+use TmplTokenizer;
+use VerboseWarnings qw( error_normal warn_normal );
+
+###############################################################################
+
+use vars qw( @in_files $in_dir $str_file $out_dir );
+use vars qw( @excludes $exclude_regex );
+use vars qw( $recursive_p );
+use vars qw( $pedantic_p );
+use vars qw( $href );
+use vars qw( $type );	# file extension (DOS form without the dot) to match
+
+###############################################################################
+
+sub find_translation ($) {
+    my($s) = @_;
+    my $key = TmplTokenizer::quote_po($s) if $s =~ /\S/;
+    return defined $href->{$key}
+		&& length Locale::PO->dequote($href->{$key}->msgstr)?
+	   Locale::PO->dequote($href->{$key}->msgstr): $s;
+}
+
+sub text_replace_tag ($$) {
+    my($t, $attr) = @_;
+    my $it;
+    # value [tag=input], meta
+    my $tag = lc($1) if $t =~ /^<(\S+)/s;
+    my $translated_p = 0;
+    for my $a ('alt', 'content', 'title', 'value') {
+	if ($attr->{$a}) {
+	    next if $a eq 'content' && $tag ne 'meta';
+	    next if $a eq 'value' && ($tag ne 'input'
+		|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
+	    my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
+	    my($pre, $trimmed, $post) = TmplTokenizer::trim $val;
+	    if ($val =~ /\S/s) {
+		my $s = $pre . find_translation($trimmed) . $post;
+		if ($attr->{$a}->[1] ne $s) { #FIXME
+		    $attr->{$a}->[1] = $s; # FIXME
+		    $attr->{$a}->[2] = ($s =~ /"/s)? "'$s'": "\"$s\""; #FIXME
+		    $translated_p = 1;
+		}
+	    }
+	}
+    }
+    if ($translated_p) {
+	$it = "<$tag"
+	    . join('', map {
+		    sprintf(' %s=%s', $_, $attr->{$_}->[2]) #FIXME
+		} sort {
+		    $attr->{$a}->[3] <=> $attr->{$b}->[3] #FIXME
+		} keys %$attr)
+	    . '>';
+    } else {
+	$it = $t;
+    }
+    return $it;
+}
+
+sub text_replace (**) {
+    my($h, $output) = @_;
+    for (;;) {
+	my $s = TmplTokenizer::next_token $h;
+    last unless defined $s;
+	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
+	if ($kind eq TmplTokenType::TEXT) {
+	    my($pre, $trimmed, $post) = TmplTokenizer::trim $t;
+	    print $output $pre, find_translation($trimmed), $post;
+	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
+	    my $fmt = find_translation($s->form);
+	    print $output TmplTokenizer::parametrize($fmt, map {
+		my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
+		$kind == TmplTokenType::TAG && %$attr?
+		    text_replace_tag($t, $attr): $t } $s->parameters);
+	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
+	    print $output text_replace_tag($t, $attr);
+	} elsif (defined $t) {
+	    print $output $t;
+	}
+    }
+}
+
+# FIXME: Should we use the GNOME convention of using POTFILES.in instead?
+sub listfiles ($$) {
+    my($dir, $type) = @_;
+    my @it = ();
+    if (opendir(DIR, $dir)) {
+	my @dirent = readdir DIR;	# because DIR is shared when recursing
+	closedir DIR;
+	for my $dirent (@dirent) {
+	    my $path = "$dir/$dirent";
+	    if ($dirent =~ /^\./ || $dirent eq 'CVS' || $dirent eq 'RCS'
+	    || (defined $exclude_regex && $dirent =~ /^(?:$exclude_regex)$/)) {
+		;
+	    } elsif (-f $path) {
+		push @it, $path if !defined $type || $dirent =~ /\.(?:$type)$/;
+	    } elsif (-d $path && $recursive_p) {
+		push @it, listfiles($path, $type);
+	    }
+	}
+    } else {
+	warn_normal "$dir: $!", undef;
+    }
+    return @it;
+}
+
+###############################################################################
+
+sub usage_error (;$) {
+    for my $msg (split(/\n/, $_[0])) {
+	print STDERR "$msg\n";
+    }
+    print STDERR "Try `$0 --help' for more information.\n";
+    exit(-1);
+}
+
+###############################################################################
+
+GetOptions(
+    'input|i=s'				=> \@in_files,
+    'outputdir|o=s'			=> \$out_dir,
+    'recursive|r'			=> \$recursive_p,
+    'str-file|s=s'			=> \$str_file,
+    'exclude|x=s'			=> \@excludes,
+    'pedantic-warnings|pedantic'	=> sub { $pedantic_p = 1 },
+) || usage_error;
+
+VerboseWarnings::set_application_name $0;
+VerboseWarnings::set_pedantic_mode $pedantic_p;
+
+my $action = shift or usage_error('You must specify an ACTION.');
+usage_error('You must at least specify input and string list filenames.')
+    if !@in_files || !defined $str_file;
+
+# Type match defaults to *.tmpl plus *.inc if not specified
+$type = "tmpl|inc" if !defined($type);
+
+# Check the inputs for being files or directories
+for my $input (@in_files) {
+    usage_error("$input: Input must be a file or directory.\n"
+	    . "(Symbolic links are not supported at the moment)")
+	unless -d $input || -f $input;;
+}
+
+# Generates the global exclude regular expression
+$exclude_regex =  '(?:'.join('|', @excludes).')' if @excludes;
+
+# Generate the list of input files if a directory is specified
+if (-d $in_files[0]) {
+    die "If you specify a directory as input, you must specify only it.\n"
+	    if @in_files > 1;
+
+    # input is a directory, generates list of files to process
+    $in_dir = $in_files[0];
+    $in_dir =~ s/\/$//; # strips the trailing / if any
+    @in_files = listfiles($in_dir, $type);
+} else {
+    for my $input (@in_files) {
+	die "You cannot specify input files and directories at the same time.\n"
+		unless -f $input;
+    }
+}
+
+if ($action eq 'create')  {
+    # updates the list. As the list is empty, every entry will be added
+    die "$str_file: Output file already exists" if -f $str_file;
+    my($tmph, $tmpfile) = tmpnam();
+    for my $input (@in_files) {
+	print $tmph "$input\n";
+    }
+    close $tmph;
+    system {'./xgettext.pl'} ('xgettext.pl', '-s', '-f', $tmpfile, '-o', $str_file);
+    unlink $tmpfile || warn_normal "$tmpfile: unlink failed: $!\n", undef;
+
+} elsif ($action eq 'update') {
+    my($tmph1, $tmpfile1) = tmpnam();
+    my($tmph2, $tmpfile2) = tmpnam();
+    close $tmph2; # We just want a name
+    for my $input (@in_files) {
+	print $tmph1 "$input\n";
+    }
+    close $tmph1;
+    system('./xgettext.pl', '-s', '-f', $tmpfile1, '-o', $tmpfile2);
+    system('msgmerge', '-U', '-s', $str_file, $tmpfile2);
+    unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
+    unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;
+
+} elsif ($action eq 'install') {
+    if(!defined($out_dir)) {
+	usage_error("You must specify an output directory when using the install method.");
+    }
+	
+    if ($in_dir eq $out_dir) {
+	warn "You must specify a different input and output directory.\n";
+	exit -1;
+    }
+
+    # Make sure the output directory exists
+    # (It will auto-create it, but for compatibility we should not)
+    -d $out_dir || die "$out_dir: The directory does not exist\n";
+
+    # Try to open the file, because Locale::PO doesn't check :-/
+    open(INPUT, "<$str_file") || die "$str_file: $!\n";
+    close INPUT;
+
+    # restores the string list from file
+    $href = Locale::PO->load_file_ashash($str_file);
+
+    # creates the new tmpl file using the new translation
+    for my $input (@in_files) {
+	die "Assertion failed"
+		unless substr($input, 0, length($in_dir) + 1) eq "$in_dir/";
+
+	my $h = TmplTokenizer->new( $input );
+	$h->set_allow_cformat( 1 );
+	VerboseWarnings::set_input_file_name $input;
+
+	my $target = $out_dir . substr($input, length($in_dir));
+	my $targetdir = $` if $target =~ /[^\/]+$/s;
+	if (!-d $targetdir) {
+	    print STDERR "Making directory $targetdir...";
+	    # creates with rwxrwxr-x permissions
+	    mkdir($targetdir, 0775) || warn_normal "$targetdir: $!", undef;
+	}
+	print STDERR "Creating $target...\n";
+	open( OUTPUT, ">$target" ) || die "$target: $!\n";
+	text_replace( $h, *OUTPUT );
+	close OUTPUT;
+    }
+
+} else {
+    usage_error('Unknown action specified.');
+}
+exit 0;
+
+###############################################################################
+
+=head1 SYNOPSIS
+
+./tmpl_process3.pl [ I<tmpl_process.pl options> ]
+
+=head1 DESCRIPTION
+
+This is an experimental version of the tmpl_process.pl script,
+using standard gettext-style PO files.
+
+Currently, the create, update, and install actions have all been
+reimplemented and seem to work.
+
+The create action calls xgettext.pl to do the actual work;
+the update action calls xgettext.pl and msgmerge(1) to do the
+actual work.
+
+The script can detect <TMPL_VAR> directives embedded inside what
+appears to be a full sentence (this actual work being done by
+TmplTokenizer(3)); these larger patterns appear in the translation
+file as c-format strings with %s.
+
+=head1 BUGS
+
+The --help option has not been implemented yet.
+
+There are probably some real bugs too, since this has not been
+tested very much.
+
+xgettext.pl must be present in the current directory; the
+msgmerge(1) command must also be present in the search path.
+The script currently does not check carefully whether these
+dependent commands are present.
+
+Locale::PO(3) has a lot of bugs. It can neither parse nor
+generate GNU PO files properly; a couple of workarounds have
+been written in TmplTokenizer and more is likely to be needed
+(e.g., to get rid of the "Strange line" warning for #~).
+
+=head1 SEE ALSO
+
+xgettext.pl,
+msgmerge(1),
+Locale::PO(3),
+translator_doc.txt
+
+=cut
diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl
new file mode 100755
index 0000000000..a2c611f9f3
--- /dev/null
+++ b/misc/translator/xgettext.pl
@@ -0,0 +1,289 @@
+#!/usr/bin/perl
+
+=head1 NAME
+
+xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
+
+=cut
+
+use strict;
+use Getopt::Long;
+use Locale::PO;
+use TmplTokenizer;
+use VerboseWarnings;
+
+use vars qw( $files_from $directory $output $sort );
+use vars qw( $pedantic_p );
+use vars qw( %text );
+
+###############################################################################
+
+sub remember ($$) {
+    my($token, $string) = @_;
+    $text{$string} = [] unless defined $text{$string};
+    push @{$text{$string}}, $token;
+}
+
+###############################################################################
+
+sub string_list () {
+    my @t = keys %text;
+    # The real gettext tools seems to sort case sensitively; I don't know why
+    @t = sort { $a cmp $b } @t if $sort eq 's';
+    @t = sort {
+	    my @aa = sort { $a->pathname cmp $b->pathname
+		    || $a->line_number <=> $b->line_number } @{$text{$a}};
+	    my @bb = sort { $a->pathname cmp $b->pathname
+		    || $a->line_number <=> $b->line_number } @{$text{$b}};
+	    $aa[0]->pathname cmp $bb[0]->pathname
+		    || $aa[0]->line_number <=> $bb[0]->line_number;
+	} @t if $sort eq 'F';
+    return @t;
+}
+
+###############################################################################
+
+sub text_extract (*) {
+    my($h) = @_;
+    for (;;) {
+	my $s = TmplTokenizer::next_token $h;
+    last unless defined $s;
+	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
+	if ($kind eq TmplTokenType::TEXT) {
+	    #$t = TmplTokenizer::trim $t;
+	    remember( $s, $t ) if $t =~ /\S/s;
+	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
+	    #$t = TmplTokenizer::trim $t;
+	    remember( $s, $s->form ) if $s->form =~ /\S/s;
+	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
+	    # value [tag=input], meta
+	    my $tag = lc($1) if $t =~ /^<(\S+)/s;
+	    for my $a ('alt', 'content', 'title', 'value') {
+		if ($attr->{$a}) {
+		    next if $a eq 'content' && $tag ne 'meta';
+		    next if $a eq 'value' && ($tag ne 'input'
+			|| (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
+		    my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
+		    $val = TmplTokenizer::trim $val;
+		    remember( $s, $val ) if $val =~ /\S/s;
+		}
+	    }
+	}
+    }
+}
+
+###############################################################################
+
+sub generate_strings_list () {
+    # Emit all extracted strings.
+    # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
+    for my $t (string_list) {
+	printf OUTPUT "%s\n", $t
+	    unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
+    }
+}
+
+###############################################################################
+
+sub generate_po_file () {
+    # We don't emit the Plural-Forms header; it's meaningless for us
+    print OUTPUT <<EOF;
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\\n"
+"POT-Creation-Date: 2004-02-05 20:55-0500\\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+"Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
+"Language-Team: LANGUAGE <LL\@li.org>\\n"
+"MIME-Version: 1.0\\n"
+"Content-Type: text/plain; charset=CHARSET\\n"
+"Content-Transfer-Encoding: 8bit\\n"
+
+EOF
+    my $directory_re = quotemeta("$directory/");
+    for my $t (string_list) {
+	next if TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
+	my $cformat_p;
+	for my $token (@{$text{$t}}) {
+	    my $pathname = $token->pathname;
+	    $pathname =~ s/^$directory_re//os;
+	    printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number;
+	    $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
+	}
+	printf OUTPUT "#, c-format\n" if $cformat_p;
+	printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( $t );
+	printf OUTPUT "msgstr \"\"\n\n";
+    }
+}
+
+###############################################################################
+
+sub usage ($) {
+    my($exitcode) = @_;
+    my $h = $exitcode? *STDERR: *STDOUT;
+    print $h <<EOF;
+Usage: $0 [OPTIONS]
+Extract translatable strings from given HTML::Template input files.
+
+Input file location:
+  -f, --files-from=FILE          Get list of input files from FILE
+  -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
+
+Output file location:
+  -o, --output=FILE              Write output to specified file
+
+HTML::Template options:
+      --pedantic-warnings        Issue warnings even for detected problems
+			         which are likely to be harmless
+
+Output details:
+  -s, --sort-output              generate sorted output
+  -F, --sort-by-file             sort output by file location
+
+Informative output:
+      --help                     Display this help and exit
+EOF
+    exit($exitcode);
+}
+
+###############################################################################
+
+sub usage_error (;$) {
+    print STDERR "$_[0]\n" if @_;
+    print STDERR "Try `$0 --help' for more information.\n";
+    exit(-1);
+}
+
+###############################################################################
+
+Getopt::Long::config qw( bundling no_auto_abbrev );
+GetOptions(
+    'D|directory=s'			=> \$directory,
+    'f|files-from=s'			=> \$files_from,
+    'pedantic-warnings|pedantic'	=> sub { $pedantic_p = 1 },
+    'output|o=s'			=> \$output,
+    's|sort-output'			=> sub { $sort = 's' },
+    'F|sort-by-file'			=> sub { $sort = 'F' },
+    'help'				=> sub { usage(0) },
+) || usage_error;
+
+VerboseWarnings::set_application_name $0;
+VerboseWarnings::set_pedantic_mode $pedantic_p;
+
+usage_error('Missing mandatory option -f') unless defined $files_from;
+$directory = '.' unless defined $directory;
+
+if (defined $output && $output ne '-') {
+    open(OUTPUT, ">$output") || die "$output: $!\n";
+} else {
+    open(OUTPUT, ">&STDOUT");
+}
+
+open(INPUT, "<$files_from") || die "$files_from: $!\n";
+while (<INPUT>) {
+    chomp;
+    my $h = TmplTokenizer->new( "$directory/$_" );
+    $h->set_allow_cformat( 1 );
+    VerboseWarnings::set_input_file_name "$directory/$_";
+    text_extract( $h );
+}
+close INPUT;
+generate_po_file;
+
+warn "This input will not work with Mozilla standards-compliant mode\n", undef
+	if TmplTokenizer::syntaxerror_p;
+
+
+exit(-1) if TmplTokenizer::fatal_p;
+
+###############################################################################
+
+=head1 DESCRIPTION
+
+This is an experimental script based on the modularized
+text-extract2.pl script.  It has behaviour similar to
+xgettext(1), and generates gettext-compatible output files.
+
+A gettext-like format provides the following advantages:
+
+=over
+
+=item -
+
+(Future goal)
+Translation to non-English-like languages with different word
+order:  gettext's c-format strings can theoretically be
+emulated if we are able to do some analysis on the .tmpl input
+and treat <TMPL_VAR> in a way similar to %s.
+
+=item - 
+
+Context for the extracted strings:  the gettext format provides
+the filenames and line numbers where each string can be found.
+The translator can read the source file and see the context,
+in case the string by itself can mean several different things.
+
+=item - 
+
+Place for the translator to add comments about the translations.
+
+=item -
+
+Gettext-compatible tools, if any, might be usable if we adopt
+the gettext format.
+
+=back
+
+Right now it does about the same thing as text-extract2.pl but
+generates gettext-style output; however, because it is scanner-
+instead of parser-based, it is able to address the 4 weaknesses
+listed in translator_doc.txt.  Ultimately, the goal is to make
+this able to do some kind of simple analysis on the input to
+produce gettext-style output with c-format strings, in order to
+facilitate translation to languages with a different word order
+than English.
+
+When the above is finished, the generated po file may contain
+some HTML tags in addition to %s strings.
+
+If you want to generate GNOME-style POTFILES.in files, such
+files (passed to -f) can be generated thus:
+
+	(cd ../.. && find koha-tmpl/opac-tmpl/default/en
+		-name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
+	(cd ../.. && find koha-tmpl/intranet-tmpl/default/en
+		-name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
+
+This is, however, quite pointless, because the "create" and
+"update" actions have already been implemented in tmpl_process3.pl.
+
+=head1 SEE ALSO
+
+tmpl_process.pl,
+xgettext(1),
+Locale::PO(3),
+translator_doc.txt
+
+=head1 BUGS
+
+There probably are some. Bugs related to scanning of <INPUT>
+tags seem to be especially likely to be present.
+
+Its diagnostics are probably too verbose.
+
+When a <TMPL_VAR> within a JavaScript-related attribute is
+detected, the script currently displays no warnings at all.
+It might be good to display some kind of warning.
+
+Its sort order (-s option) seems to be different than the real
+xgettext(1)'s sort option. This will result in translation
+strings inside the generated PO file spuriously moving about
+when tmpl_process3.pl calls msgmerge(1) to update the PO file.
+
+=cut
-- 
2.39.2