1 # ---------------------------------------------------------------------
2 # Utility routines for cleaning and formatting XML related to OAI
5 # ------------------+--------------------+-----------------------------
6 # Hussein Suleman | hussein@vt.edu | www.husseinsspace.com
7 # ------------------+--------------------+-+---------------------------
8 # Department of Computer Science | www.cs.vt.edu
9 # Digital Library Research Laboratory | www.dlib.vt.edu
10 # -----------------------------------------+-------------+-------------
11 # Virginia Polytechnic Institute and State University | www.vt.edu
12 # -------------------------------------------------------+-------------
15 package C4::OAI::Utility;
18 # constructor [create mapping for latin entities to Unicode]
21 my $classname = shift;
23 my $self = { XMLindent => ' ' };
25 my @upperentities = qw (nbsp iexcl cent pound curren yen brvbar sect
26 uml copy ordf laquo not 173 reg macr deg plusmn
27 sup2 sup3 acute micro para middot cedil supl
28 ordm raquo frac14 half frac34 iquest Agrave
29 Aacute Acirc Atilde Auml Aring AElig Ccedil
30 Egrave Eacute Ecirc Euml Igrave Iacute Icirc
31 Iuml ETH Ntilde Ograve Oacute Ocirc Otilde Ouml
32 times Oslash Ugrave Uacute Ucirc Uuml Yacute
33 THORN szlig agrave aacute acirc atilde auml
34 aring aelig ccedil egrave eacute ecirc euml
35 igrave iacute icirc iuml eth ntilde ograve
36 oacute ocirc otilde ouml divide oslash ugrave
37 uacute ucirc uuml yacute thorn yuml);
38 $upperentities[12] = '#173';
40 $self->{'hashentity'} = {};
41 for ( my $i=0; $i<=$#upperentities; $i++ )
43 my $key = '&'.$upperentities[$i].';';
44 $self->{'hashentity'}->{$key}=$i+160;
47 $self->{'hashstr'} = (join (';|', @upperentities)).';';
49 bless $self, $classname;
54 # clean XML version one - for paragraphs
58 return undef if (! defined $t);
59 # make ISOlat1 entities into Unicode character entities
60 $t =~ s/&($self->{'hashstr'})/sprintf ("&#x%04X;", $self->{'hashentity'}->{$&})/geo;
61 # escape non-XML-encoded ampersands (including from other characters sets)
62 $t =~ s/&(?!((#[0-9]*)|(#x[0-9]*)|(amp)|(lt)|(gt)|(apos)|(quot));)/&/go;
63 # convert extended ascii into Unicode character entities
64 $t =~ s/[\xa0-\xff]/'&#'.ord ($&).';'/geo;
65 # remove extended ascii that doesnt translate into ISO8859/1
66 $t =~ s/[\x00-\x08\x0B\x0C\x0E-\x1f\x80-\x9f]//go;
67 # make tags delimiters into entities
70 # convert any whitespace containing lf or cr into a single cr
71 $t =~ s/(\s*[\r\n]\s+)|(\s+[\r\n]\s*)/\n/go;
72 # convert multiples spaces/tabs into a single space
74 # kill leading and terminating spaces
75 $t =~ s/^[ ]+(.+)[ ]+$/$1/;
80 # clean XML version two - for single-line streams
84 return undef if (! defined $t );
85 # make ISOlat1 entities into Unicode character entities
86 $t =~ s/&($self->{'hashstr'})/sprintf ("&#x%04X;", $self->{'hashentity'}->{$&})/geo;
87 # escape non-XML-encoded ampersands (including from other characters sets)
88 $t =~ s/&(?!((#[0-9]*)|(#x[0-9]*)|(amp)|(lt)|(gt)|(apos)|(quot));)/&/go;
89 # convert extended ascii into Unicode character entities
90 $t =~ s/[\xa0-\xff]/'&#'.ord ($&).';'/geo;
91 # remove extended ascii that doesnt translate into ISO8859/1
92 $t =~ s/[\x00-\x08\x0B\x0C\x0E-\x1f\x80-\x9f]//go;
93 # make tags delimiters into entities
97 $t =~ s/[\s\t\r\n]+/ /go;
98 # kill leading and terminating spaces
99 $t =~ s/^[ ]+(.+)[ ]+$/$1/;
104 # remove newlines and carriage returns
108 # eliminate all carriage returns and linefeeds
109 $t =~ s/[\t\r\s\n]+/ /go;
114 # convert a data structure in Perl to XML
119 # { attr1 => val1, attr2 => val2, ... },
123 # { attr1 => val1, attr2 => val2, ... },
129 # tag2 => { children },
130 # tag3 => "text string",
131 # mdorder => [ "tag1", "tag2", "tag3" ]
136 my ($self, $head, $indent) = @_;
137 $indent .= $self->{'XMLindent'};
138 my ($key, $i, $j, $buffer, @orderedkeys);
140 if (exists ($head->{'mdorder'}))
141 { @orderedkeys = @{$head->{'mdorder'}}; }
143 { @orderedkeys = keys %$head; }
144 foreach $key (@orderedkeys)
146 if ((exists ($head->{$key})) && (ref ($head->{$key}) eq 'ARRAY'))
148 foreach $i (@{$head->{$key}})
150 if (ref ($i) eq 'ARRAY')
152 my $atthash = $$i[0];
153 my $childhash = $$i[1];
155 $buffer .= "$indent<$key";
156 foreach $j (keys %$atthash)
158 $buffer .= " $j=\"$atthash->{$j}\"";
162 if (ref ($childhash) eq 'HASH')
164 $buffer .= $self->FormatXML ($childhash, $indent);
168 $buffer .= "$indent$childhash\n";
171 $buffer .= "$indent</$key>\n";
173 elsif (ref ($i) eq 'HASH')
175 my $nestedbuffer = $self->FormatXML ($i, $indent);
176 if ($nestedbuffer ne '')
178 $buffer .= "$indent<$key>\n$nestedbuffer$indent</$key>\n";
183 $buffer .= "$indent<$key>$i</$key>\n";
187 elsif ((exists ($head->{$key})) && (ref ($head->{$key}) eq 'HASH'))
189 my $nestedbuffer = $self->FormatXML ($head->{$key}, $indent);
190 if ($nestedbuffer ne '')
192 $buffer .= "$indent<$key>\n$nestedbuffer$indent</$key>\n";
195 elsif ((exists ($head->{$key})) && ($head->{$key} ne ''))
197 $buffer .= "$indent<$key>$head->{$key}</$key>\n";