misc/cronjobs/cloud-kw.pl

   1 #!/usr/bin/perl
   2
   3 #
   4 # Copyright 2008 Tamil s.a.r.l.
   5 #
   6 # This software is placed under the gnu General Public License, v2
   7 # (http://www.gnu.org/licenses/gpl.html)
   8 #
   9
  10 use strict;
  11 use warnings;
  12 use diagnostics;
  13 use Carp;
  14 use YAML::Syck;
  15 use Pod::Usage;
  16 use Getopt::Long;
  17 use C4::Context;
  18
  19
  20 my $verbose     = 0;
  21 my $help        = 0;
  22 my $conf        = '';
  23 GetOptions(
  24     'verbose'   => \$verbose,
  25     'help'      => \$help,
  26     'conf=s'    => \$conf,
  27 );
  28
  29 sub usage {
  30     pod2usage( -verbose => 2 );
  31     exit;
  32 }
  33
  34 usage() if $help || !$conf;
  35
  36
  37 my @clouds;
  38 print "Reading configuration file: $conf\n" if $verbose;
  39 eval {
  40     @clouds = LoadFile( $conf );
  41 };
  42 croak "Unable to read configuration file: $conf\n" if $@;
  43
  44 for my $cloud ( @clouds ) {
  45     print "Create a cloud\n",
  46           "  Koha conf file:  ", $cloud->{KohaConf} ? $cloud->{KohaConf} : "default", "\n",
  47           "  Zebra Index:     ", $cloud->{ZebraIndex}, "\n",
  48           "  Koha Keyword:    ", $cloud->{KohaIndex}, "\n",
  49           "  Count:           ", $cloud->{Count}, "\n",
  50           "  Withcss:         ", $cloud->{Withcss}, "\n",
  51           "  Output:          ", $cloud->{Output}, "\n",
  52       if $verbose;
  53
  54     # Set Koha context if KohaConf is present
  55     my $set_new_context = 0;
  56     if ( $cloud->{KohaConf} ) {
  57         if ( -e $cloud->{KohaConf} ) {
  58             my $context = C4::Context->new( $cloud->{KohaConf} );
  59             $context->set_context();
  60             $set_new_context = 1;
  61         }
  62         else {
  63             carp "Koha conf file doesn't exist: ", $cloud->{KohaConf}, " ; use KOHA_CONF\n";
  64         }
  65     }
  66
  67     my $index = new ZebraIndex( $cloud->{ZebraIndex} );
  68     $index->scan( $cloud->{Count} );
  69
  70     open my $fh, ">", $cloud->{Output}
  71         or croak "Unable to create file ", $cloud->{Output};
  72
  73     my $withcss = $cloud->{Withcss} =~ /^yes/i;
  74     print $fh $index->html_cloud( $cloud->{KohaIndex}, $withcss );
  75     close $fh;
  76     $set_new_context && restore_context C4::Context;
  77 }
  78
  79
  80
  81 package ZebraIndex;
  82
  83 use strict;
  84 use warnings;
  85 use diagnostics;
  86 use Carp;
  87
  88 sub new {
  89     my $self = {};
  90     my $class = shift;
  91     $self->{ zebra_index  } = shift;
  92     $self->{ top_terms    } = undef;
  93     $self->{ levels_cloud } = 24;
  94     bless $self, $class;
  95
  96     # Test Zebra index
  97     my $zbiblio = C4::Context->Zconn( "biblioserver" );
  98     eval {
  99         my $ss = $zbiblio->scan_pqf(
 100             '@attr 1=' . $self->{ zebra_index } . ' @attr 4=1 @attr 6=3 "a"'
 101         );
 102     };
 103     croak "Invalid Zebra index: ", $self->{ zebra_index } if $@;
 104
 105     return $self;
 106 }
 107
 108
 109 #
 110 # scan
 111 #   Scan zebra index and populate an array of top terms
 112 #
 113 # PARAMETERS:
 114 #   $max_terms    Max number of top terms
 115 #
 116 # RETURN:
 117 #   A 4-dimensionnal array in $self->{top_terms}
 118 #   [0] term
 119 #   [1] term number of occurences
 120 #   [2] term proportional relative weight in terms set E[0-1]
 121 #   [3] term logarithmic relative weight E [0-levels_cloud]
 122 #
 123 #   This array is sorted alphabetically by terms ([0])
 124 #   It can be easily sorted by occurences:
 125 #     @t = sort { $a[1] <=> $a[1] } @{$self->{top_terms}};
 126 #
 127 sub scan {
 128     my $self       = shift;
 129     my $index_name = $self->{ zebra_index };
 130     my $max_terms  = shift;
 131
 132     my $MAX_OCCURENCE = 1000000000;
 133
 134     my $zbiblio = C4::Context->Zconn( "biblioserver" );
 135     my $number_of_terms = 0;
 136     my @terms;      # 2 dimensions array
 137     my $min_occurence_index = -1;
 138     my $min_occurence;
 139     my $from = '0';
 140
 141     while (1) {
 142         my $ss;
 143         eval {
 144             print "$from\n" if $verbose;
 145             $from =~ s/\"/\\\"/g;
 146             my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "'
 147                         . $from . 'a"';
 148             $ss = $zbiblio->scan_pqf( $query );
 149         };
 150         if ($@) {
 151             chop $from;
 152             next;
 153         }
 154         $ss->option( rpnCharset => 'UTF-8' );
 155         last if $ss->size() == 0;
 156         my $term = '';
 157         my $occ = 0;
 158         for my $index ( 0..$ss->size()-1 ) {
 159             ($term, $occ) = $ss->display_term($index);
 160             #print "$term:$occ\n";
 161             if ( $number_of_terms < $max_terms ) {
 162                 push( @terms, [ $term, $occ ] );
 163                 ++$number_of_terms;
 164                 if ( $number_of_terms == $max_terms ) {
 165                     $min_occurence = $MAX_OCCURENCE;
 166                     for (0..$number_of_terms-1) {
 167                         my @term = @{ $terms[$_] };
 168                         if ( $term[1] <= $min_occurence ) {
 169                             $min_occurence       = $term[1];
 170                             $min_occurence_index = $_;
 171                         }
 172                     }
 173                 }
 174             }
 175             else {
 176                 if ( $occ > $min_occurence) {
 177                     @{ $terms[$min_occurence_index] }[0] = $term;
 178                     @{ $terms[$min_occurence_index] }[1] = $occ;
 179                     $min_occurence = $MAX_OCCURENCE;
 180                     for (0..$max_terms-1) {
 181                         my @term = @{ $terms[$_] };
 182                         if ( $term[1] <= $min_occurence ) {
 183                             $min_occurence       = $term[1];
 184                             $min_occurence_index = $_;
 185                         }
 186                     }
 187                 }
 188             }
 189         }
 190         $from = $term;
 191     }
 192
 193     # Sort array of array by terms weight
 194     @terms = sort { @{$a}[1] <=> @{$b}[1] } @terms;
 195
 196     # A relatif weight to other set terms is added to each term
 197     my $min     = $terms[0][1];
 198     my $log_min = log( $min );
 199     my $max     = $terms[$#terms][1];
 200     my $log_max = log( $max );
 201     my $delta   = $max - $min;
 202     $delta = 1 if $delta == 0; # Very unlikely
 203     my $factor;
 204     if ($log_max - $log_min == 0) {
 205         $log_min = $log_min - $self->{levels_cloud};
 206         $factor = 1;
 207     }
 208     else {
 209         $factor = $self->{levels_cloud} / ($log_max - $log_min);
 210     }
 211
 212     foreach (0..$#terms) {
 213         my $count = @{ $terms[$_] }[1];
 214         my $weight = ( $count - $min ) / $delta;
 215         my $log_weight = int( (log($count) - $log_min) * $factor);
 216         push( @{ $terms[$_] }, $weight );
 217         push( @{ $terms[$_] }, $log_weight );
 218     }
 219     $self->{ top_terms } = \@terms;
 220
 221     # Sort array of array by terms alphabetical order
 222     @terms = sort { @{$a}[0] cmp @{$b}[0] } @terms;
 223 }
 224
 225
 226 #
 227 # Returns a HTML version of index top terms formated
 228 # as a 'tag cloud'.
 229 #
 230 sub html_cloud {
 231     my $self = shift;
 232     my $koha_index = shift;
 233     my $withcss = shift;
 234     my @terms = @{ $self->{top_terms} };
 235     my $html = '';
 236     if ( $withcss ) {
 237         $html = <<EOS;
 238 <style>
 239 .subjectcloud {
 240     text-align:  center;
 241     line-height: 16px;
 242     margin: 20px;
 243     background: #f0f0f0;
 244     padding: 3%;
 245 }
 246 .subjectcloud a {
 247     font-weight: lighter;
 248     text-decoration: none;
 249 }
 250 span.tagcloud0  { font-size: 12px;}
 251 span.tagcloud1  { font-size: 13px;}
 252 span.tagcloud2  { font-size: 14px;}
 253 span.tagcloud3  { font-size: 15px;}
 254 span.tagcloud4  { font-size: 16px;}
 255 span.tagcloud5  { font-size: 17px;}
 256 span.tagcloud6  { font-size: 18px;}
 257 span.tagcloud7  { font-size: 19px;}
 258 span.tagcloud8  { font-size: 20px;}
 259 span.tagcloud9  { font-size: 21px;}
 260 span.tagcloud10 { font-size: 22px;}
 261 span.tagcloud11 { font-size: 23px;}
 262 span.tagcloud12 { font-size: 24px;}
 263 span.tagcloud13 { font-size: 25px;}
 264 span.tagcloud14 { font-size: 26px;}
 265 span.tagcloud15 { font-size: 27px;}
 266 span.tagcloud16 { font-size: 28px;}
 267 span.tagcloud17 { font-size: 29px;}
 268 span.tagcloud18 { font-size: 30px;}
 269 span.tagcloud19 { font-size: 31px;}
 270 span.tagcloud20 { font-size: 32px;}
 271 span.tagcloud21 { font-size: 33px;}
 272 span.tagcloud22 { font-size: 34px;}
 273 span.tagcloud23 { font-size: 35px;}
 274 span.tagcloud24 { font-size: 36px;}
 275 </style>
 276 <div class="subjectcloud">
 277 EOS
 278     }
 279     for (0..$#terms) {
 280         my @term = @{ $terms[$_] };
 281         my $uri = $term[0];
 282         $uri =~ s/\(//g;
 283         #print "  0=", $term[0]," - 1=", $term[1], " - 2=", $term[2], " - 3=", $term[3],"\n";
 284         $html = $html
 285             . '<span class="tagcloud'
 286             . $term[3]
 287             . '">'
 288             . '<a href="/cgi-bin/koha/opac-search.pl?q='
 289             . $koha_index
 290             . '%3A'
 291             . $uri
 292             . '">'
 293             . $term[0]
 294             . "</a></span>\n";
 295     }
 296     $html .= "</div>\n";
 297     return $html;
 298 }
 299
 300
 301 =head1 NAME
 302
 303 cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes
 304
 305 =head1 USAGE
 306
 307 =over
 308
 309 =item cloud-kw.pl [--verbose|--help] --conf=F<cloud.conf>
 310
 311 Creates multiple HTML files containing kewords cloud with top terms sorted
 312 by their logarithmic weight.
 313 F<cloud.conf> is a YAML configuration file driving cloud generation
 314 process.
 315
 316 =back
 317
 318 =head1 PARAMETERS
 319
 320 =over
 321
 322 =item B<--conf=configuration file>
 323
 324 Specify configuration file name
 325
 326 =item B<--verbose|-v>
 327
 328 Enable script verbose mode.
 329
 330 =item B<--help|-h>
 331
 332 Print this help page.
 333
 334 =back
 335
 336 =head1 CONFIGURATION
 337
 338 Configuration file looks like that:
 339
 340  ---
 341   # Koha configuration file for a specific installation
 342   # If not present, defaults to KOHA_CONF
 343   KohaConf: /home/koha/mylibray/etc/koha-conf.xml
 344   # Zebra index to scan
 345   ZebraIndex: Author
 346   # Koha index used to link found kewords with an opac search URL
 347   KohaIndex: au
 348   # Number of top keyword to use for the cloud
 349   Count: 50
 350   # Include CSS style directives with the cloud
 351   # This could be used as a model and then CSS directives are
 352   # put in the appropriate CSS file directly.
 353   Withcss: Yes
 354   # HTML file where to output the cloud
 355   Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html
 356  ---
 357   KohaConf: /home/koha/yourlibray/etc/koha-conf.xml
 358   ZebraIndex: Subject
 359   KohaIndex: su
 360   Count: 200
 361   Withcss: no
 362   Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html
 363
 364 =head1 IMPROVEMENTS
 365
 366 Generated top terms have more informations than those outputted from
 367 the time beeing. Some parameters could be easily added to improve
 368 this script:
 369
 370 =over
 371
 372 =item B<WithCount>
 373
 374 In order to output terms with the number of occurences they
 375 have been found in Koha Catalogue by Zebra.
 376
 377 =item B<CloudLevels>
 378
 379 Number of levels in the cloud. Now 24 levels are hardcoded.
 380
 381 =item B<Weithing>
 382
 383 Weighting method used to distribute terms in the cloud. We could have two
 384 values: Logarithmic and Linear. Now it's Logarithmic by default.
 385
 386 =item B<Order>
 387
 388 Now terms are outputted in the lexical order. They could be sorted
 389 by their weight.
 390
 391 =back
 392
 393 =cut
 394
 395