2 #simple parser for HTML with Template Toolkit directives. Tokens are put into @tokens and are accesible via next_token and peep_token
4 use base qw(HTML::Parser);
9 #seems to be handled post tokenizer
10 ##hash where key is tag we are interested in and the value is a hash of the attributes we want
11 #my %interesting_tags = (
12 # img => { alt => 1 },
15 #tokens found so far (used like a stack)
18 #shiftnext token or undef
23 #unshift token back on @tokens
26 unshift @tokens, shift;
29 #have a peep at next token
35 #please use this method INSTEAD of the HTML::Parser->parse_file method (and HTML::Parser->parse)
36 #signature build_tokens( self, filename)
38 my ($self, $filename) = @_;
39 $self->{filename} = $filename;
40 $self->handler(start => "start", "self, line, tagname, attr, text"); #signature is start( self, linenumber, tagname, hash of attributes, origional text )
41 $self->handler(text => "text", "self, line, text, is_cdata"); #signature is text( self, linenumber, origional text, is_cdata )
42 $self->handler(end => "end", "self, line, tag, attr, text"); #signature is end( self, linenumber, tagename, origional text )
43 $self->handler(declaration => "declaration", "self, line, text, is_cdata"); # declaration
44 $self->handler(comment => "comment", "self, line, text, is_cdata"); # comments
45 $self->handler(default => "default", "self, line, text, is_cdata"); # anything else
46 $self->marked_sections(1); #treat anything inside CDATA tags as text, should really make it a TmplTokenType::CDATA
47 $self->unbroken_text(1); #make contiguous whitespace into a single token (can span multiple lines)
48 $self->parse_file($filename);
52 #handle parsing of text
56 my $work = shift; # original text
60 # warn "in text line is $line work is $work";
61 # return if $work =~ m/^\s*$/;
62 # if there is a template_toolkit tag
63 if( $work =~ m/\[%.*?\]/ ){
64 #everything before this tag is text (or possibly CDATA), add a text token to tokens if $`
66 my $t = TmplToken->new( $`, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} );
70 #the match itself is a DIRECTIVE $&
71 my $t = TmplToken->new( $&, TmplTokenType::DIRECTIVE, $line, $self->{filename} );
74 #put work still to do back into work
77 # warn "in the text else work is now $work";
78 #If there is some left over work, treat it as text token
79 my $t = TmplToken->new( $work, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} );
89 my $work = shift; #original text
91 # warn "declaration work is $work";
92 my $t = TmplToken->new( $work, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} );
99 my $work = shift; #original text
100 my $is_cdata = shift;
101 # warn "comment work is $work";
102 my $t = TmplToken->new( $work, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} );
109 my $work = shift; #original text
110 my $is_cdata = shift;
111 # warn "comment work is $work";
112 my $t = TmplToken->new( $work, ($is_cdata? TmplTokenType::CDATA : TmplTokenType::TEXT), $line, $self->{filename} );
117 #handle opening html tags
122 my $hash = shift; #hash of attr/value pairs
123 my $text = shift; #origional text
124 # warn "in start text is $text";
125 # return if ! $interesting_tags{$tag};
127 # print "#### " . $self->{filename} . " " . $tag . "####\n";
128 my $t = TmplToken->new( $text, TmplTokenType::TAG, $line, $self->{filename});
130 # tags seem to be uses in an 'interesting' way elsewhere..
131 for my $key( %$hash ) {
132 next unless defined $hash->{$key};
133 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
135 $t->set_attributes( \%attr );
139 #handle closing html tags
146 # warn "in end text is $text";
147 # what format should this be in?
148 my $t = TmplToken->new( $text, TmplTokenType::TAG, $line, $self->{filename} );
150 # tags seem to be uses in an 'interesting' way elsewhere..
151 for my $key( %$hash ) {
152 next unless defined $hash->{$key};
153 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
155 $t->set_attributes( \%attr );