2 #simple parser for HTML with Template Toolkit directives. Tokens are put into @tokens and are accessible via next_token and peep_token
4 use base qw(HTML::Parser);
9 #seems to be handled post tokenizer
10 ##hash where key is tag we are interested in and the value is a hash of the attributes we want
11 #my %interesting_tags = (
12 # img => { alt => 1 },
15 #tokens found so far (used like a stack)
18 #shift next token or undef
23 #unshift token back on @tokens
25 unshift @tokens, shift;
28 #have a peep at next token
34 #please use this method INSTEAD of the HTML::Parser->parse_file method (and HTML::Parser->parse)
35 #signature build_tokens( self, filename)
37 my ($self, $filename) = @_;
38 $self->{filename} = $filename;
39 $self->handler(start => "start", "self, line, tagname, attr, text"); #signature is start( self, linenumber, tagname, hash of attributes, original text )
40 $self->handler(text => "text", "self, line, text, is_cdata"); #signature is text( self, linenumber, original text, is_cdata )
41 $self->handler(end => "end", "self, line, tag, attr, text"); #signature is end( self, linenumber, tag, hash of attributes, original text )
42 $self->handler(declaration => "declaration", "self, line, text, is_cdata"); # declaration
43 $self->handler(comment => "comment", "self, line, text, is_cdata"); # comments
44 # $self->handler(default => "default", "self, line, text, is_cdata"); # anything else
45 $self->marked_sections(1); #treat anything inside CDATA tags as text, should really make it a C4::TmplTokenType::CDATA
46 $self->unbroken_text(1); #make contiguous whitespace into a single token (can span multiple lines)
47 $self->parse_file($filename);
51 #handle parsing of text
55 my $work = shift; # original text
58 # if there is a template_toolkit tag
59 if( $work =~ m/\[%.*?\]/ ){
60 #everything before this tag is text (or possibly CDATA), add a text token to tokens if $`
62 my $t = C4::TmplToken->new( $`, ($is_cdata? C4::TmplTokenType::CDATA : C4::TmplTokenType::TEXT), $line, $self->{filename} );
66 #the match itself is a DIRECTIVE $&
67 my $t = C4::TmplToken->new( $&, C4::TmplTokenType::DIRECTIVE, $line, $self->{filename} );
70 # put work still to do back into work
73 # If there is some left over work, treat it as text token
74 my $t = C4::TmplToken->new( $work, ($is_cdata? C4::TmplTokenType::CDATA : C4::TmplTokenType::TEXT), $line, $self->{filename} );
85 my $work = shift; #original text
87 my $t = C4::TmplToken->new( $work, ($is_cdata? C4::TmplTokenType::CDATA : C4::TmplTokenType::TEXT), $line, $self->{filename} );
94 my $work = shift; #original text
96 my $t = C4::TmplToken->new( $work, ($is_cdata? C4::TmplTokenType::CDATA : C4::TmplTokenType::TEXT), $line, $self->{filename} );
103 my $work = shift; #original text
104 my $is_cdata = shift;
105 my $t = C4::TmplToken->new( $work, ($is_cdata? C4::TmplTokenType::CDATA : C4::TmplTokenType::TEXT), $line, $self->{filename} );
110 #handle opening html tags
115 my $hash = shift; #hash of attr/value pairs
116 my $text = shift; #original text
117 my $t = C4::TmplToken->new( $text, C4::TmplTokenType::TAG, $line, $self->{filename});
119 # tags seem to be used in an 'interesting' way elsewhere..
120 for my $key( %$hash ) {
121 next unless defined $hash->{$key};
123 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 1 ];
126 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
129 $t->set_attributes( \%attr );
133 #handle closing html tags
140 # what format should this be in?
141 my $t = C4::TmplToken->new( $text, C4::TmplTokenType::TAG, $line, $self->{filename} );
143 # tags seem to be used in an 'interesting' way elsewhere..
144 for my $key( %$hash ) {
145 next unless defined $hash->{$key};
146 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
148 $t->set_attributes( \%attr );