[Koha-cvs] CVS: koha/misc/translator TmplToken.pm,1.3,1.4 TmplTokenizer.pm,1.21,1.22 tmpl_process3.pl,1.4,1.5 xgettext.pl,1.3,1.4

Sun Feb 22 22:34:44 CET 2004

Update of /cvsroot/koha/koha/misc/translator
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv471

Modified Files:
	TmplToken.pm TmplTokenizer.pm tmpl_process3.pl xgettext.pl 
Log Message:
Preliminary support for "analysis" of strings with <a> tags.

Early termination of analysis if we encounter some strings, such as </h1>
or | or ||, in order to avoid extracting strings that are unnecessarily
long and which doesn't add any meaningful context.

Index: TmplToken.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplToken.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** TmplToken.pm	19 Feb 2004 21:24:30 -0000	1.3
--- TmplToken.pm	22 Feb 2004 21:34:40 -0000	1.4
***************
*** 89,92 ****
--- 89,98 ----

  # only meaningful for TEXT_PARAMETRIZED tokens
+ sub anchors {
+     my $this = shift;
+     return map { $_->type == TmplTokenType::TAG && $_->string =~ /^<a\b/is? $_: ()} @{$this->{'_kids'}};
+ }
+ 
+ # only meaningful for TEXT_PARAMETRIZED tokens
  sub form {
      my $this = shift;

Index: TmplTokenizer.pm
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/TmplTokenizer.pm,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -r1.21 -r1.22
*** TmplTokenizer.pm	22 Feb 2004 09:04:53 -0000	1.21
--- TmplTokenizer.pm	22 Feb 2004 21:34:40 -0000	1.22
***************
*** 421,431 ****
  }

! sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token
      my($t) = @_;
!     return $t->type == TmplTokenType::TEXT
  	|| ($t->type == TmplTokenType::DIRECTIVE
  		&& $t->string =~ /^(?:$re_tmpl_var)$/os)
  	|| ($t->type == TmplTokenType::TAG
! 		&& ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is
  		|| ($t->string =~ /^<input/i
  		    && $t->attributes->{'type'} =~ /^(?:text)$/i)))
--- 421,442 ----
  }

! sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
      my($t) = @_;
!     return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s)
  	|| ($t->type == TmplTokenType::DIRECTIVE
  		&& $t->string =~ /^(?:$re_tmpl_var)$/os)
  	|| ($t->type == TmplTokenType::TAG
! 		&& ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
! 		|| ($t->string =~ /^<input/i
! 		    && $t->attributes->{'type'} =~ /^(?:text)$/i)))
! }
! 
! sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
!     my($t) = @_;
!     return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/s))
! 	|| ($t->type == TmplTokenType::DIRECTIVE
! 		&& $t->string =~ /^(?:$re_tmpl_var)$/os)
! 	|| ($t->type == TmplTokenType::TAG
! 		&& ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
  		|| ($t->string =~ /^<input/i
  		    && $t->attributes->{'type'} =~ /^(?:text)$/i)))
***************
*** 440,444 ****
  sub _formalize ($) {
      my($t) = @_;
!     return $t->type == TmplTokenType::DIRECTIVE? '%s': _quote_cformat($t->string);
  }

--- 451,458 ----
  sub _formalize ($) {
      my($t) = @_;
!     return $t->type == TmplTokenType::DIRECTIVE? '%s':
! 	   $t->type == TmplTokenType::TAG?
! 		   ($t->string =~ /^<a\b/is? '<a>': _quote_cformat($t->string)):
! 	       _quote_cformat($t->string);
  }

***************
*** 453,456 ****
--- 467,471 ----
  		}
  	    };
+     &$undo_trailing_blanks;
      # FIXME: If the last token is a close tag but there are no tags
      # FIXME: before it, drop the close tag back into the queue. This
***************
*** 515,558 ****
  	if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
  	    && ($it->type == TmplTokenType::TEXT?
! 		!blank_p( $it->string ): _token_groupable_p( $it ))) {
  	    my @structure = ( $it );
! 	    my($n_trailing_spaces, $next) = (0, undef);
! 	    my($nonblank_text_p, $parametrized_p, $next) = (0, 0);
  	    if ($it->type == TmplTokenType::TEXT) {
  		$nonblank_text_p = 1 if !blank_p( $it->string );
  	    } elsif ($it->type == TmplTokenType::DIRECTIVE) {
  		$parametrized_p = 1;
  	    }
! 	    for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) {
  		$next = $this->_next_token_intermediate($h);
  		push @structure, $next; # for consistency (with initialization)
! 	    last unless defined $next && _token_groupable_p( $next );
  		if ($next->type == TmplTokenType::TEXT) {
! 		    if (blank_p( $next->string )) {
! 			$n_trailing_spaces += 1;
! 		    } else {
! 			($n_trailing_spaces, $nonblank_text_p) = (0, 1);
! 		    }
  		} elsif ($next->type == TmplTokenType::DIRECTIVE) {
- 		    $n_trailing_spaces = 0;
  		    $parametrized_p = 1;
! 		} else {
! 		    $n_trailing_spaces = 0;
  		}
  	    }
  	    # Undo the last token
  	    push @{$this->{_queue}}, pop @structure;
! 	    # Undo trailing blank tokens
! 	    for (my $i = 0; $i < $n_trailing_spaces; $i += 1) {
! 		push @{$this->{_queue}}, pop @structure;
! 	    }
  	    @structure = $this->_optimize( @structure );
  	    if (@structure < 2) {
  		# Nothing to do
  		;
! 	    } elsif ($nonblank_text_p && $parametrized_p) {
  		# Create the corresponding c-format string
  		my $string = join('', map { $_->string } @structure);
  		my $form = join('', map { _formalize $_ } @structure);
  		$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
  		$it->set_form( $form );
--- 530,583 ----
  	if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
  	    && ($it->type == TmplTokenType::TEXT?
! 		!blank_p( $it->string ): _token_groupable1_p( $it ))) {
  	    my @structure = ( $it );
! 	    my @tags = ();
! 	    my $next = undef;
! 	    my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0);
  	    if ($it->type == TmplTokenType::TEXT) {
  		$nonblank_text_p = 1 if !blank_p( $it->string );
  	    } elsif ($it->type == TmplTokenType::DIRECTIVE) {
  		$parametrized_p = 1;
+ 	    } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
+ 		push @tags, lc($1);
+ 		$with_anchor_p = 1 if lc($1) eq 'a';
  	    }
! 	    # We hate | and || in msgid strings, so we try to avoid them
! 	    for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
  		$next = $this->_next_token_intermediate($h);
  		push @structure, $next; # for consistency (with initialization)
! 	    last unless defined $next && _token_groupable2_p( $next );
! 	    last if $quit_next_p;
  		if ($next->type == TmplTokenType::TEXT) {
! 		    $nonblank_text_p = 1 if !blank_p( $next->string );
! 		    $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
  		} elsif ($next->type == TmplTokenType::DIRECTIVE) {
  		    $parametrized_p = 1;
! 		} elsif ($next->type == TmplTokenType::TAG) {
! 		    if ($next->string =~ /^<([A-Z0-9]+)/is) {
! 			push @tags, lc($1);
! 			$with_anchor_p = 1 if lc($1) eq 'a';
! 		    } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
! 			my $close = lc($1);
! 			$quit_p = 1 unless @tags && $close eq $tags[$#tags];
! 			$quit_next_p = 1 if $close =~ /^h\d$/;
! 			pop @tags;
! 		    }
  		}
+ 	    last if $quit_p;
  	    }
  	    # Undo the last token
  	    push @{$this->{_queue}}, pop @structure;
! 	    # Simply it a bit more
  	    @structure = $this->_optimize( @structure );
  	    if (@structure < 2) {
  		# Nothing to do
  		;
! 	    } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) {
  		# Create the corresponding c-format string
  		my $string = join('', map { $_->string } @structure);
  		my $form = join('', map { _formalize $_ } @structure);
+ 		my $a_counter = 0;
+ 		$form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
  		$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
  		$it->set_form( $form );
***************
*** 605,612 ****

  # Some functions that shouldn't be here... should be moved out some time
! sub parametrize ($@) {
!     my($fmt, @params) = @_;
      my $it = '';
!     for (my $n = 0; length $fmt;) {
  	if ($fmt =~ /^[^%]+/) {
  	    $fmt = $';
--- 630,637 ----

  # Some functions that shouldn't be here... should be moved out some time
! sub parametrize ($$$) {
!     my($fmt_0, $params, $anchors) = @_;
      my $it = '';
!     for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
  	if ($fmt =~ /^[^%]+/) {
  	    $fmt = $';
***************
*** 620,624 ****
  	    $fmt = $';
  	    if (!defined $width && !defined $prec) {
! 		$it .= $params[$i]
  	    } elsif (defined $width && defined $prec && !$width && !$prec) {
  		;
--- 645,649 ----
  	    $fmt = $';
  	    if (!defined $width && !defined $prec) {
! 		$it .= $params->[$i]
  	    } elsif (defined $width && defined $prec && !$width && !$prec) {
  		;
***************
*** 634,637 ****
--- 659,678 ----
  	}
      }
+     for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
+ 	if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
+ 	    $fmt = $';
+ 	    $it .= $&;
+ 	} elsif ($fmt =~ /^<a(\d+)>/is) {
+ 	    $n += 1;
+ 	    my $i  = $1;
+ 	    $fmt = $';
+ 	    my $anchor = $anchors->[$i - 1];
+ 	    warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
+ 		    unless defined $anchor;
+ 	    $it .= $anchor->string;
+ 	} else {
+ 	    die "Completely confused decoding anchors: $fmt\n";#XXX
+ 	}
+     }
      return $it;
  }

Index: tmpl_process3.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/tmpl_process3.pl,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -r1.4 -r1.5
*** tmpl_process3.pl	22 Feb 2004 08:18:27 -0000	1.4
--- tmpl_process3.pl	22 Feb 2004 21:34:40 -0000	1.5
***************
*** 89,96 ****
  	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  	    my $fmt = find_translation($s->form);
! 	    print $output TmplTokenizer::parametrize($fmt, map {
  		my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
  		$kind == TmplTokenType::TAG && %$attr?
! 		    text_replace_tag($t, $attr): $t } $s->parameters);
  	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
  	    print $output text_replace_tag($t, $attr);
--- 89,96 ----
  	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  	    my $fmt = find_translation($s->form);
! 	    print $output TmplTokenizer::parametrize($fmt, [ map {
  		my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
  		$kind == TmplTokenType::TAG && %$attr?
! 		    text_replace_tag($t, $attr): $t } $s->parameters ], [ $s->anchors ]);
  	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
  	    print $output text_replace_tag($t, $attr);
***************
*** 298,302 ****

  This is an experimental version of the tmpl_process.pl script,
! using standard gettext-style PO files.

  Currently, the create, update, and install actions have all been
--- 298,303 ----

  This is an experimental version of the tmpl_process.pl script,
! using standard gettext-style PO files.  Note that the behaviour
! of this script should still be considered unstable.

  Currently, the create, update, and install actions have all been
***************
*** 316,321 ****
  The --help option has not been implemented yet.

! There are probably some real bugs too, since this has not been
! tested very much.

  xgettext.pl must be present in the current directory; the
--- 317,328 ----
  The --help option has not been implemented yet.

! If an extracted string contain actual text (versus tags or
! TMPL_VAR directives), the strings are extracted verbatim,
! resulting in unwieldy things like multiple spaces, tabs,
! and/or newlines which are semantically indistinguishable
! from single blanks. If the template writer changes the
! spacing just a little bit, the new formatting would be
! considered new strings. This is arguably wrong, and in any
! case counter-productive.

  xgettext.pl must be present in the current directory; the
***************
*** 332,335 ****
--- 339,345 ----
  (e.g., to get rid of the "Strange line" warning for #~).

+ There are probably some other bugs too, since this has not been
+ tested very much.
+ 
  =head1 SEE ALSO

Index: xgettext.pl
===================================================================
RCS file: /cvsroot/koha/koha/misc/translator/xgettext.pl,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** xgettext.pl	22 Feb 2004 06:46:17 -0000	1.3
--- xgettext.pl	22 Feb 2004 21:34:40 -0000	1.4
***************
*** 22,37 ****
  ###############################################################################

! sub negligible_p ($) {
      my($t) = @_;				# a string
      # Don't emit pure whitespace, pure numbers, pure punctuation,
      # single letters, or TMPL_VAR's.
      # Punctuation should arguably be translated. But without context
!     # they are untranslatable.
      return !$extract_all_p && (
!     	       TmplTokenizer::blank_p($t)		# blank or TMPL_VAR
  	    || $t =~ /^\d+$/			# purely digits
! 	    || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/	# pure punctuation w/o context
  	    || $t =~ /^[A-Za-z]$/		# single letters
! 	);
  }

--- 22,51 ----
  ###############################################################################

! sub string_negligible_p ($) {
      my($t) = @_;				# a string
      # Don't emit pure whitespace, pure numbers, pure punctuation,
      # single letters, or TMPL_VAR's.
      # Punctuation should arguably be translated. But without context
!     # they are untranslatable. Note that $t is a string, not a token object.
      return !$extract_all_p && (
!     	       TmplTokenizer::blank_p($t)	# blank or TMPL_VAR
  	    || $t =~ /^\d+$/			# purely digits
! 	    || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  	    || $t =~ /^[A-Za-z]$/		# single letters
! 	)
! }
! 
! sub token_negligible_p( $ ) {
!     my($x) = @_;
!     my $t = $x->type;
!     return !$extract_all_p && (
! 	    $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
! 	    $t == TmplTokenType::DIRECTIVE? 1:
! 	    $t == TmplTokenType::TEXT_PARAMETRIZED
! 		&& join( '', map { my $t = $_->type;
! 			$t == TmplTokenType::DIRECTIVE?
! 				'1': $t == TmplTokenType::TAG?
! 					'': token_negligible_p( $_ )?
! 					'': '1' } @{$x->children} ) eq '' );
  }

***************
*** 40,45 ****
  sub remember ($$) {
      my($token, $string) = @_;
!     $text{$string} = [] unless defined $text{$string};
!     push @{$text{$string}}, $token;
  }

--- 54,62 ----
  sub remember ($$) {
      my($token, $string) = @_;
!     # If we determine that the string is negligible, don't bother to remember
!     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
! 	$text{$string} = [] unless defined $text{$string};
! 	push @{$text{$string}}, $token;
!     }
  }

***************
*** 70,77 ****
  	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  	if ($kind eq TmplTokenType::TEXT) {
- 	    #$t = TmplTokenizer::trim $t;
  	    remember( $s, $t ) if $t =~ /\S/s;
  	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
- 	    #$t = TmplTokenizer::trim $t;
  	    remember( $s, $s->form ) if $s->form =~ /\S/s;
  	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
--- 87,92 ----
***************
*** 97,101 ****
      # Emit all extracted strings.
      for my $t (string_list) {
! 	printf OUTPUT "%s\n", $t unless negligible_p($t);
      }
  }
--- 112,116 ----
      # Emit all extracted strings.
      for my $t (string_list) {
! 	printf OUTPUT "%s\n", $t # unless negligible_p($t);
      }
  }
***************
*** 128,132 ****
      my $directory_re = quotemeta("$directory/");
      for my $t (string_list) {
! 	next if negligible_p($t);
  	my $cformat_p;
  	for my $token (@{$text{$t}}) {
--- 143,147 ----
      my $directory_re = quotemeta("$directory/");
      for my $t (string_list) {
! 	#next if negligible_p($t);
  	my $cformat_p;
  	for my $token (@{$text{$t}}) {
***************
*** 317,331 ****
  =back

! Right now it does about the same thing as text-extract2.pl but
! generates gettext-style output; however, because it is scanner-
! instead of parser-based, it is able to address the 4 weaknesses
! listed in translator_doc.txt.  Ultimately, the goal is to make
! this able to do some kind of simple analysis on the input to
! produce gettext-style output with c-format strings, in order to
! facilitate translation to languages with a different word order
! than English.

! When the above is finished, the generated po file may contain
! some HTML tags in addition to %s strings.

  If you want to generate GNOME-style POTFILES.in files, such
--- 332,340 ----
  =back

! Note that this script is experimental and should still be
! considered unstable.

! Please refer to the explanation in tmpl_process3 for further
! details.

  If you want to generate GNOME-style POTFILES.in files, such