4 # Copyright 2008 Tamil s.a.r.l.
6 # This file is part of Koha.
8 # Koha is free software; you can redistribute it and/or modify it
9 # under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 3 of the License, or
11 # (at your option) any later version.
13 # Koha is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with Koha; if not, see <http://www.gnu.org/licenses>.
24 use Carp qw( carp croak );
26 use Pod::Usage qw( pod2usage );
27 use Getopt::Long qw( GetOptions );
29 use Koha::Script -cron;
31 use C4::Log qw( cronlogaction );
# Main script: read a YAML configuration file and, for each cloud entry
# it contains, scan a Zebra index and write an HTML cloud to a file.
#
# Snapshot of @ARGV as one space-joined string; it is handed to
# cronlogaction() further down.
37 my $command_line_options = join(" ",@ARGV);
40 'verbose' => \$verbose,
46 pod2usage( -verbose => 2 );
# Show the usage text when help was requested or no configuration file
# name was supplied.
50 usage() if $help || !$conf;
# Record this invocation's command line through C4::Log.
52 cronlogaction({ info => $command_line_options });
55 print "Reading configuration file: $conf\n" if $verbose;
# NOTE(review): $@ is tested right after the load, but $@ is only set by an
# eval and no eval is visible around LoadFile in this excerpt -- confirm the
# call is actually wrapped, otherwise the croak below can never fire.
57 @clouds = YAML::XS::LoadFile( $conf );
59 croak "Unable to read configuration file: $conf\n" if $@;
# One cloud is produced per entry returned by LoadFile.
61 for my $cloud ( @clouds ) {
62 print "Create a cloud\n",
63 " Koha conf file: ", $cloud->{KohaConf} ? $cloud->{KohaConf} : "default", "\n",
64 " Zebra Index: ", $cloud->{ZebraIndex}, "\n",
65 " Koha Keyword: ", $cloud->{KohaIndex}, "\n",
66 " Count: ", $cloud->{Count}, "\n",
67 " Withcss: ", $cloud->{Withcss}, "\n",
68 " Output: ", $cloud->{Output}, "\n",
71 # Set Koha context if KohaConf is present
72 my $set_new_context = 0;
73 if ( $cloud->{KohaConf} ) {
74 if ( -e $cloud->{KohaConf} ) {
75 my $context = C4::Context->new( $cloud->{KohaConf} );
76 $context->set_context();
80 carp "Koha conf file doesn't exist: ", $cloud->{KohaConf}, " ; use KOHA_CONF\n";
# Build the index object for the configured Zebra index and collect its
# top terms; Count is passed to scan() as the maximum number of terms.
84 my $index = ZebraIndex->new( $cloud->{ZebraIndex} );
85 $index->scan( $cloud->{Count} );
# NOTE(review): the croak message does not include $!, so the operating
# system's reason for the failure is not reported.
87 open my $fh, ">", $cloud->{Output}
88 or croak "Unable to create file ", $cloud->{Output};
# $withcss is true when the Withcss value starts with "yes", in any letter
# case.
# NOTE(review): an entry without a Withcss key is matched as undef here.
90 my $withcss = $cloud->{Withcss} =~ /^yes/i;
91 print $fh $index->html_cloud( $cloud->{KohaIndex}, $withcss );
# The context is restored only when $set_new_context is true.
# NOTE(review): "restore_context C4::Context" is indirect-object syntax for
# C4::Context->restore_context.
93 $set_new_context && restore_context C4::Context;
# Log the end of the run.
96 cronlogaction({ action => 'End', info => "COMPLETED" });
104 use Carp qw( carp croak );
# Instance state:
#   zebra_index   index name, taken from the argument list
#   top_terms     undef at this point
#   levels_cloud  number of cloud levels, fixed at 24
109 $self->{ zebra_index } = shift;
110 $self->{ top_terms } = undef;
111 $self->{ levels_cloud } = 24;
# Probe the requested index with a scan for the term "a" on the
# "biblioserver" connection; an error left in $@ is reported as an invalid
# index name.
# NOTE(review): the eval that would populate $@ is not visible in this
# excerpt -- confirm the scan is wrapped in one.
115 my $zbiblio = C4::Context->Zconn( "biblioserver" );
117 my $ss = $zbiblio->scan_pqf(
118 '@attr 1=' . $self->{ zebra_index } . ' @attr 4=1 @attr 6=3 "a"'
121 croak "Invalid Zebra index: ", $self->{ zebra_index } if $@;
129 # Scan zebra index and populate an array of top terms
132 # $max_terms Max number of top terms
135 # An array of 4-element arrays in $self->{top_terms}
137 # [1] term number of occurrences
138 # [2] term proportional relative weight in terms set E[0-1]
139 # [3] term logarithmic relative weight E [0-levels_cloud]
141 # This array is sorted alphabetically by terms ([0])
142 # It can be easily sorted by occurrences:
143 # @t = sort { $a->[1] <=> $b->[1] } @{$self->{top_terms}};
# $index_name is the Zebra index to scan; $max_terms, taken from the
# argument list, is the number of top terms to keep.
147 my $index_name = $self->{ zebra_index };
148 my $max_terms = shift;
# Sentinel used as the starting value whenever the smallest occurrence
# count in @terms is searched for.
150 my $MAX_OCCURENCE = 1000000000;
152 my $zbiblio = C4::Context->Zconn( "biblioserver" );
153 my $number_of_terms = 0;
154 my @terms; # 2 dimensions array
155 my $min_occurence_index = -1;
162 print "$from\n" if $verbose;
# Double quotes in $from are backslash-escaped because the query built
# below opens a double-quoted PQF term (presumably $from is appended on a
# line not visible in this excerpt).
163 $from =~ s/\"/\\\"/g;
164 my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "'
166 $ss = $zbiblio->scan_pqf( $query );
172 $ss->option( rpnCharset => 'UTF-8' );
# An empty scan result ends the enclosing loop.
173 last if $ss->size() == 0;
# Top-terms selection over the scan result:
#  - while fewer than $max_terms terms are held, each [ term, occurrences ]
#    pair is appended to @terms;
#  - when the count equals $max_terms, the entry with the smallest
#    occurrence count is located (with "<=", the last of equal minima wins);
#  - a term with more occurrences than that minimum overwrites the minimum
#    entry in place, and the minimum is searched for again.
176 for my $index ( 0..$ss->size()-1 ) {
177 ($term, $occ) = $ss->display_term($index);
178 #print "$term:$occ\n";
179 if ( $number_of_terms < $max_terms ) {
180 push( @terms, [ $term, $occ ] );
182 if ( $number_of_terms == $max_terms ) {
183 $min_occurence = $MAX_OCCURENCE;
184 for (0..$number_of_terms-1) {
185 my @term = @{ $terms[$_] };
186 if ( $term[1] <= $min_occurence ) {
187 $min_occurence = $term[1];
188 $min_occurence_index = $_;
194 if ( $occ > $min_occurence) {
195 @{ $terms[$min_occurence_index] }[0] = $term;
196 @{ $terms[$min_occurence_index] }[1] = $occ;
197 $min_occurence = $MAX_OCCURENCE;
198 for (0..$max_terms-1) {
199 my @term = @{ $terms[$_] };
200 if ( $term[1] <= $min_occurence ) {
201 $min_occurence = $term[1];
202 $min_occurence_index = $_;
211 # Sort array of arrays by term occurrences, ascending
212 @terms = sort { @{$a}[1] <=> @{$b}[1] } @terms;
# After the sort, the first entry holds the smallest occurrence count and
# the last entry the largest.
# NOTE(review): assumes @terms is non-empty and every occurrence count is
# greater than 0; log() of 0 (or of an undefined value) is fatal -- confirm.
214 # A weight relative to the other terms of the set is added to each term
215 my $min = $terms[0][1];
216 my $log_min = log( $min );
217 my $max = $terms[$#terms][1];
218 my $log_max = log( $max );
219 my $delta = $max - $min;
220 $delta = 1 if $delta == 0; # Very unlikely
# When all counts are equal the logarithmic range is zero; $log_min is
# lowered by levels_cloud so that the range is no longer zero.
222 if ($log_max - $log_min == 0) {
223 $log_min = $log_min - $self->{levels_cloud};
227 $factor = $self->{levels_cloud} / ($log_max - $log_min);
# Each entry gets two more elements: [2] the linear weight
# ( count - min ) / delta, and [3] the logarithmic weight truncated to an
# integer.
230 foreach (0..$#terms) {
231 my $count = @{ $terms[$_] }[1];
232 my $weight = ( $count - $min ) / $delta;
233 my $log_weight = int( (log($count) - $log_min) * $factor);
234 push( @{ $terms[$_] }, $weight );
235 push( @{ $terms[$_] }, $log_weight );
237 $self->{ top_terms } = \@terms;
# top_terms holds a reference to @terms itself, so the alphabetical re-sort
# below is also what is seen through $self->{top_terms}.
239 # Sort array of array by terms alphabetical order
240 @terms = sort { @{$a}[0] cmp @{$b}[0] } @terms;
245 # Returns an HTML version of index top terms formatted
# $koha_index is taken from the argument list; the main script passes the
# KohaIndex value of the cloud configuration (presumably it ends up in the
# opac-search.pl link built below -- the lines doing so are not visible in
# this excerpt).
# The terms are read from $self->{top_terms}, which starts out undef and is
# filled by scan(), so scan() has to be called first.
# NOTE(review): the lines that insert each term into the markup are not
# visible here -- confirm the term is HTML- and URL-escaped.
250 my $koha_index = shift;
252 my @terms = @{ $self->{top_terms} };
265 font-weight: lighter;
266 text-decoration: none;
268 span.tagcloud0 { font-size: 12px;}
269 span.tagcloud1 { font-size: 13px;}
270 span.tagcloud2 { font-size: 14px;}
271 span.tagcloud3 { font-size: 15px;}
272 span.tagcloud4 { font-size: 16px;}
273 span.tagcloud5 { font-size: 17px;}
274 span.tagcloud6 { font-size: 18px;}
275 span.tagcloud7 { font-size: 19px;}
276 span.tagcloud8 { font-size: 20px;}
277 span.tagcloud9 { font-size: 21px;}
278 span.tagcloud10 { font-size: 22px;}
279 span.tagcloud11 { font-size: 23px;}
280 span.tagcloud12 { font-size: 24px;}
281 span.tagcloud13 { font-size: 25px;}
282 span.tagcloud14 { font-size: 26px;}
283 span.tagcloud15 { font-size: 27px;}
284 span.tagcloud16 { font-size: 28px;}
285 span.tagcloud17 { font-size: 29px;}
286 span.tagcloud18 { font-size: 30px;}
287 span.tagcloud19 { font-size: 31px;}
288 span.tagcloud20 { font-size: 32px;}
289 span.tagcloud21 { font-size: 33px;}
290 span.tagcloud22 { font-size: 34px;}
291 span.tagcloud23 { font-size: 35px;}
292 span.tagcloud24 { font-size: 36px;}
294 <div class="subjectcloud">
298 my @term = @{ $terms[$_] };
301 #print " 0=", $term[0]," - 1=", $term[1], " - 2=", $term[2], " - 3=", $term[3],"\n";
303 . '<span class="tagcloud'
306 . '<a href="/cgi-bin/koha/opac-search.pl?q='
321 cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes
327 =item cloud-kw.pl [--verbose|--help] --conf=F<cloud.conf>
329 Creates multiple HTML files containing keyword clouds with top terms sorted
330 by their logarithmic weight.
331 F<cloud.conf> is a YAML configuration file driving cloud generation
340 =item B<--conf=configuration file>
342 Specify configuration file name
344 =item B<--verbose|-v>
346 Enable script verbose mode.
350 Print this help page.
356 Configuration file looks like this:
359 # Koha configuration file for a specific installation
360 # If not present, defaults to KOHA_CONF
361 KohaConf: /home/koha/mylibrary/etc/koha-conf.xml
362 # Zebra index to scan
364 # Koha index used to link found keywords with an opac search URL
366 # Number of top keywords to use for the cloud
368 # Include CSS style directives with the cloud
369 # This could be used as a model and then CSS directives are
370 # put in the appropriate CSS file directly.
372 # HTML file where to output the cloud
373 Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html
375 KohaConf: /home/koha/yourlibrary/etc/koha-conf.xml
380 Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html
384 Generated top terms carry more information than is output for
385 the time being. Some parameters could easily be added to improve
392 In order to output terms with the number of occurrences they
393 have been found in Koha Catalogue by Zebra.
397 Number of levels in the cloud. Now 24 levels are hardcoded.
401 Weighting method used to distribute terms in the cloud. We could have two
402 values: Logarithmic and Linear. Now it's Logarithmic by default.
406 Terms are currently output in lexical order. They could be sorted