misc/cronjobs/cloud-kw.pl

   1 #!/usr/bin/perl
   2
   3 #
   4 # Copyright 2008 Tamil s.a.r.l.
   5 #
   6 # This file is part of Koha.
   7 #
   8 # Koha is free software; you can redistribute it and/or modify it
   9 # under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 3 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # Koha is distributed in the hope that it will be useful, but
  14 # WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  20
  21 use strict;
  22 use warnings;
  23 use diagnostics;
  24 use Carp qw( carp croak );
  25 use YAML::XS;
  26 use Pod::Usage qw( pod2usage );
  27 use Getopt::Long qw( GetOptions );
  28
  29 use Koha::Script -cron;
  30 use C4::Context;
  31 use C4::Log qw( cronlogaction );
  32
  33 my $verbose     = 0;
  34 my $help        = 0;
  35 my $conf        = '';
  36
  37 my $command_line_options = join(" ",@ARGV);
  38
  39 GetOptions(
  40     'verbose'   => \$verbose,
  41     'help'      => \$help,
  42     'conf=s'    => \$conf,
  43 );
  44
  45 sub usage {
  46     pod2usage( -verbose => 2 );
  47     exit;
  48 }
  49
  50 usage() if $help || !$conf;
  51
  52 cronlogaction({ info => $command_line_options });
  53
  54 my @clouds;
  55 print "Reading configuration file: $conf\n" if $verbose;
  56 eval {
  57     @clouds = YAML::XS::LoadFile( $conf );
  58 };
  59 croak "Unable to read configuration file: $conf\n" if $@;
  60
  61 for my $cloud ( @clouds ) {
  62     print "Create a cloud\n",
  63           "  Koha conf file:  ", $cloud->{KohaConf} ? $cloud->{KohaConf} : "default", "\n",
  64           "  Zebra Index:     ", $cloud->{ZebraIndex}, "\n",
  65           "  Koha Keyword:    ", $cloud->{KohaIndex}, "\n",
  66           "  Count:           ", $cloud->{Count}, "\n",
  67           "  Withcss:         ", $cloud->{Withcss}, "\n",
  68           "  Output:          ", $cloud->{Output}, "\n",
  69       if $verbose;
  70
  71     # Set Koha context if KohaConf is present
  72     my $set_new_context = 0;
  73     if ( $cloud->{KohaConf} ) {
  74         if ( -e $cloud->{KohaConf} ) {
  75             my $context = C4::Context->new( $cloud->{KohaConf} );
  76             $context->set_context();
  77             $set_new_context = 1;
  78         }
  79         else {
  80             carp "Koha conf file doesn't exist: ", $cloud->{KohaConf}, " ; use KOHA_CONF\n";
  81         }
  82     }
  83
  84     my $index = ZebraIndex->new( $cloud->{ZebraIndex} );
  85     $index->scan( $cloud->{Count} );
  86
  87     open my $fh, ">", $cloud->{Output}
  88         or croak "Unable to create file ", $cloud->{Output};
  89
  90     my $withcss = $cloud->{Withcss} =~ /^yes/i;
  91     print $fh $index->html_cloud( $cloud->{KohaIndex}, $withcss );
  92     close $fh;
  93     $set_new_context && restore_context C4::Context;
  94 }
  95
  96 cronlogaction({ action => 'End', info => "COMPLETED" });
  97
  98
  99 package ZebraIndex;
 100
 101 use strict;
 102 use warnings;
 103 use diagnostics;
 104 use Carp qw( carp croak );
 105
 106 sub new {
 107     my $self = {};
 108     my $class = shift;
 109     $self->{ zebra_index  } = shift;
 110     $self->{ top_terms    } = undef;
 111     $self->{ levels_cloud } = 24;
 112     bless $self, $class;
 113
 114     # Test Zebra index
 115     my $zbiblio = C4::Context->Zconn( "biblioserver" );
 116     eval {
 117         my $ss = $zbiblio->scan_pqf(
 118             '@attr 1=' . $self->{ zebra_index } . ' @attr 4=1 @attr 6=3 "a"'
 119         );
 120     };
 121     croak "Invalid Zebra index: ", $self->{ zebra_index } if $@;
 122
 123     return $self;
 124 }
 125
 126
 127 #
 128 # scan
 129 #   Scan zebra index and populate an array of top terms
 130 #
 131 # PARAMETERS:
 132 #   $max_terms    Max number of top terms
 133 #
 134 # RETURN:
 135 #   A 4-dimensionnal array in $self->{top_terms}
 136 #   [0] term
 137 #   [1] term number of occurrences
 138 #   [2] term proportional relative weight in terms set E[0-1]
 139 #   [3] term logarithmic relative weight E [0-levels_cloud]
 140 #
 141 #   This array is sorted alphabetically by terms ([0])
 142 #   It can be easily sorted by occurrences:
 143 #     @t = sort { $a[1] <=> $a[1] } @{$self->{top_terms}};
 144 #
 145 sub scan {
 146     my $self       = shift;
 147     my $index_name = $self->{ zebra_index };
 148     my $max_terms  = shift;
 149
 150     my $MAX_OCCURENCE = 1000000000;
 151
 152     my $zbiblio = C4::Context->Zconn( "biblioserver" );
 153     my $number_of_terms = 0;
 154     my @terms;      # 2 dimensions array
 155     my $min_occurence_index = -1;
 156     my $min_occurence;
 157     my $from = '0';
 158
 159     while (1) {
 160         my $ss;
 161         eval {
 162             print "$from\n" if $verbose;
 163             $from =~ s/\"/\\\"/g;
 164             my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "'
 165                         . $from . 'a"';
 166             $ss = $zbiblio->scan_pqf( $query );
 167         };
 168         if ($@) {
 169             chop $from;
 170             next;
 171         }
 172         $ss->option( rpnCharset => 'UTF-8' );
 173         last if $ss->size() == 0;
 174         my $term = '';
 175         my $occ = 0;
 176         for my $index ( 0..$ss->size()-1 ) {
 177             ($term, $occ) = $ss->display_term($index);
 178             #print "$term:$occ\n";
 179             if ( $number_of_terms < $max_terms ) {
 180                 push( @terms, [ $term, $occ ] );
 181                 ++$number_of_terms;
 182                 if ( $number_of_terms == $max_terms ) {
 183                     $min_occurence = $MAX_OCCURENCE;
 184                     for (0..$number_of_terms-1) {
 185                         my @term = @{ $terms[$_] };
 186                         if ( $term[1] <= $min_occurence ) {
 187                             $min_occurence       = $term[1];
 188                             $min_occurence_index = $_;
 189                         }
 190                     }
 191                 }
 192             }
 193             else {
 194                 if ( $occ > $min_occurence) {
 195                     @{ $terms[$min_occurence_index] }[0] = $term;
 196                     @{ $terms[$min_occurence_index] }[1] = $occ;
 197                     $min_occurence = $MAX_OCCURENCE;
 198                     for (0..$max_terms-1) {
 199                         my @term = @{ $terms[$_] };
 200                         if ( $term[1] <= $min_occurence ) {
 201                             $min_occurence       = $term[1];
 202                             $min_occurence_index = $_;
 203                         }
 204                     }
 205                 }
 206             }
 207         }
 208         $from = $term;
 209     }
 210
 211     # Sort array of array by terms weight
 212     @terms = sort { @{$a}[1] <=> @{$b}[1] } @terms;
 213
 214     # A relatif weight to other set terms is added to each term
 215     my $min     = $terms[0][1];
 216     my $log_min = log( $min );
 217     my $max     = $terms[$#terms][1];
 218     my $log_max = log( $max );
 219     my $delta   = $max - $min;
 220     $delta = 1 if $delta == 0; # Very unlikely
 221     my $factor;
 222     if ($log_max - $log_min == 0) {
 223         $log_min = $log_min - $self->{levels_cloud};
 224         $factor = 1;
 225     }
 226     else {
 227         $factor = $self->{levels_cloud} / ($log_max - $log_min);
 228     }
 229
 230     foreach (0..$#terms) {
 231         my $count = @{ $terms[$_] }[1];
 232         my $weight = ( $count - $min ) / $delta;
 233         my $log_weight = int( (log($count) - $log_min) * $factor);
 234         push( @{ $terms[$_] }, $weight );
 235         push( @{ $terms[$_] }, $log_weight );
 236     }
 237     $self->{ top_terms } = \@terms;
 238
 239     # Sort array of array by terms alphabetical order
 240     @terms = sort { @{$a}[0] cmp @{$b}[0] } @terms;
 241 }
 242
 243
 244 #
 245 # Returns a HTML version of index top terms formatted
 246 # as a 'tag cloud'.
 247 #
 248 sub html_cloud {
 249     my $self = shift;
 250     my $koha_index = shift;
 251     my $withcss = shift;
 252     my @terms = @{ $self->{top_terms} };
 253     my $html = '';
 254     if ( $withcss ) {
 255         $html = <<EOS;
 256 <style>
 257 .subjectcloud {
 258     text-align:  center;
 259     line-height: 16px;
 260     margin: 20px;
 261     background: #f0f0f0;
 262     padding: 3%;
 263 }
 264 .subjectcloud a {
 265     font-weight: lighter;
 266     text-decoration: none;
 267 }
 268 span.tagcloud0  { font-size: 12px;}
 269 span.tagcloud1  { font-size: 13px;}
 270 span.tagcloud2  { font-size: 14px;}
 271 span.tagcloud3  { font-size: 15px;}
 272 span.tagcloud4  { font-size: 16px;}
 273 span.tagcloud5  { font-size: 17px;}
 274 span.tagcloud6  { font-size: 18px;}
 275 span.tagcloud7  { font-size: 19px;}
 276 span.tagcloud8  { font-size: 20px;}
 277 span.tagcloud9  { font-size: 21px;}
 278 span.tagcloud10 { font-size: 22px;}
 279 span.tagcloud11 { font-size: 23px;}
 280 span.tagcloud12 { font-size: 24px;}
 281 span.tagcloud13 { font-size: 25px;}
 282 span.tagcloud14 { font-size: 26px;}
 283 span.tagcloud15 { font-size: 27px;}
 284 span.tagcloud16 { font-size: 28px;}
 285 span.tagcloud17 { font-size: 29px;}
 286 span.tagcloud18 { font-size: 30px;}
 287 span.tagcloud19 { font-size: 31px;}
 288 span.tagcloud20 { font-size: 32px;}
 289 span.tagcloud21 { font-size: 33px;}
 290 span.tagcloud22 { font-size: 34px;}
 291 span.tagcloud23 { font-size: 35px;}
 292 span.tagcloud24 { font-size: 36px;}
 293 </style>
 294 <div class="subjectcloud">
 295 EOS
 296     }
 297     for (0..$#terms) {
 298         my @term = @{ $terms[$_] };
 299         my $uri = $term[0];
 300         $uri =~ s/\(//g;
 301         #print "  0=", $term[0]," - 1=", $term[1], " - 2=", $term[2], " - 3=", $term[3],"\n";
 302         $html = $html
 303             . '<span class="tagcloud'
 304             . $term[3]
 305             . '">'
 306             . '<a href="/cgi-bin/koha/opac-search.pl?q='
 307             . $koha_index
 308             . '%3A'
 309             . $uri
 310             . '">'
 311             . $term[0]
 312             . "</a></span>\n";
 313     }
 314     $html .= "</div>\n";
 315     return $html;
 316 }
 317
 318
 319 =head1 NAME
 320
 321 cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes
 322
 323 =head1 USAGE
 324
 325 =over
 326
 327 =item cloud-kw.pl [--verbose|--help] --conf=F<cloud.conf>
 328
 329 Creates multiple HTML files containing kewords cloud with top terms sorted
 330 by their logarithmic weight.
 331 F<cloud.conf> is a YAML configuration file driving cloud generation
 332 process.
 333
 334 =back
 335
 336 =head1 PARAMETERS
 337
 338 =over
 339
 340 =item B<--conf=configuration file>
 341
 342 Specify configuration file name
 343
 344 =item B<--verbose|-v>
 345
 346 Enable script verbose mode.
 347
 348 =item B<--help|-h>
 349
 350 Print this help page.
 351
 352 =back
 353
 354 =head1 CONFIGURATION
 355
 356 Configuration file looks like that:
 357
 358  ---
 359   # Koha configuration file for a specific installation
 360   # If not present, defaults to KOHA_CONF
 361   KohaConf: /home/koha/mylibray/etc/koha-conf.xml
 362   # Zebra index to scan
 363   ZebraIndex: Author
 364   # Koha index used to link found kewords with an opac search URL
 365   KohaIndex: au
 366   # Number of top keyword to use for the cloud
 367   Count: 50
 368   # Include CSS style directives with the cloud
 369   # This could be used as a model and then CSS directives are
 370   # put in the appropriate CSS file directly.
 371   Withcss: Yes
 372   # HTML file where to output the cloud
 373   Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html
 374  ---
 375   KohaConf: /home/koha/yourlibray/etc/koha-conf.xml
 376   ZebraIndex: Subject
 377   KohaIndex: su
 378   Count: 200
 379   Withcss: no
 380   Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html
 381
 382 =head1 IMPROVEMENTS
 383
 384 Generated top terms have more informations than those outputted from
 385 the time being. Some parameters could be easily added to improve
 386 this script:
 387
 388 =over
 389
 390 =item B<WithCount>
 391
 392 In order to output terms with the number of occurrences they
 393 have been found in Koha Catalogue by Zebra.
 394
 395 =item B<CloudLevels>
 396
 397 Number of levels in the cloud. Now 24 levels are hardcoded.
 398
 399 =item B<Weithing>
 400
 401 Weighting method used to distribute terms in the cloud. We could have two
 402 values: Logarithmic and Linear. Now it's Logarithmic by default.
 403
 404 =item B<Order>
 405
 406 Now terms are outputted in the lexical order. They could be sorted
 407 by their weight.
 408
 409 =back
 410
 411 =cut
 412
 413