misc/cronjobs/cloud-kw.pl

   1 #!/usr/bin/perl
   2
   3 #
   4 # Copyright 2008 Tamil s.a.r.l.
   5 #
   6 # This software is placed under the gnu General Public License, v2
   7 # (http://www.gnu.org/licenses/gpl.html)
   8 #
   9
  10 use strict;
  11 use warnings;
  12 use diagnostics;
  13 use Carp;
  14 use YAML::Syck;
  15 use Pod::Usage;
  16 use Getopt::Long;
  17 use C4::Context;
  18
  19
  20 my $verbose     = 0;
  21 my $help        = 0;
  22 my $conf        = '';
  23 GetOptions(
  24     'verbose'   => \$verbose,
  25     'help'      => \$help,
  26     'conf=s'    => \$conf,
  27 );
  28
  29 sub usage {
  30     pod2usage( -verbose => 2 );
  31     exit;
  32 }
  33
  34 usage() if $help || !$conf;
  35
  36
  37 my @clouds;
  38 print "Reading configuration file: $conf\n" if $verbose;
  39 eval {
  40     @clouds = LoadFile( $conf );
  41 };
  42 croak "Unable to read configuration file: $conf\n" if $@;
  43
  44 for my $cloud ( @clouds ) {
  45     print "Create a cloud\n",
  46           "  Koha conf file:  ", $cloud->{KohaConf}, "\n",
  47           "  Zebra Index:     ", $cloud->{ZebraIndex}, "\n",
  48           "  Koha Keyword:    ", $cloud->{KohaIndex}, "\n",
  49           "  Count:           ", $cloud->{Count}, "\n",
  50           "  Withcss:         ", $cloud->{Withcss}, "\n",
  51           "  Output:          ", $cloud->{Output}, "\n",
  52       if $verbose;
  53
  54     # Set Koha context
  55     my $context = new C4::Context( $cloud->{KohaConf} );
  56     $context->set_context();
  57
  58     my $index = new ZebraIndex( $cloud->{ZebraIndex} );
  59     $index->scan( $cloud->{Count} );
  60
  61     open my $fh, ">", $cloud->{Output}
  62         or croak "Unable to create file ", $cloud->{Output};
  63
  64     my $withcss = $cloud->{Withcss};
  65     $withcss =~ /^y/i;
  66     print $fh $index->html_cloud( $cloud->{KohaIndex}, $withcss );
  67     close $fh;
  68 }
  69
  70
  71
  72 package ZebraIndex;
  73
  74
  75 sub new {
  76     my $self = {};
  77     my $class = shift;
  78     $self->{ zebra_index  } = shift;
  79     $self->{ top_terms    } = undef;
  80     $self->{ levels_cloud } = 24;
  81     bless $self, $class;
  82     return $self;
  83 }
  84
  85
  86 #
  87 # scan
  88 #   Scan zebra index and populate an array of top terms
  89 #
  90 # PARAMETERS:
  91 #   $max_terms    Max number of top terms
  92 #
  93 # RETURN:
  94 #   A 4-dimensionnal array in $self->{top_terms}
  95 #   [0] term
  96 #   [1] term number of occurences
  97 #   [2] term proportional relative weight in terms set E[0-1]
  98 #   [3] term logarithmic relative weight E [0-levels_cloud]
  99 #
 100 #   This array is sorted alphabetically by terms ([0])
 101 #   It can be easily sorted by occurences:
 102 #     @t = sort { $a[1] <=> $a[1] } @{$self->{top_terms}};
 103 #
 104 sub scan {
 105     my $self       = shift;
 106     my $index_name = $self->{ zebra_index };
 107     my $max_terms  = shift;
 108
 109     my $MAX_OCCURENCE = 1000000000;
 110
 111     my $zbiblio = C4::Context->Zconn( "biblioserver" );
 112     my $number_of_terms = 0;
 113     my @terms;      # 2 dimensions array
 114     my $min_occurence_index = -1;
 115     my $min_occurence;
 116     my $from = '0';
 117     while (1) {
 118         my $ss;
 119         eval {
 120             print "$from\n" if $verbose;
 121             $from =~ s/\"/\\\"/g;
 122             my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "'
 123                         . $from . 'a"';
 124             $ss = $zbiblio->scan_pqf( $query );
 125         };
 126         if ($@) {
 127             chop $from;
 128             next;
 129         }
 130         $ss->option( rpnCharset => 'UTF-8' );
 131         last if $ss->size() == 0;
 132         my $term = '';
 133         my $occ = 0;
 134         for my $index ( 0..$ss->size()-1 ) {
 135             ($term, $occ) = $ss->display_term($index);
 136             #print "$term:$occ\n";
 137             if ( $number_of_terms < $max_terms ) {
 138                 push( @terms, [ $term, $occ ] );
 139                 ++$number_of_terms;
 140                 if ( $number_of_terms == $max_terms ) {
 141                     $min_occurence = $MAX_OCCURENCE;
 142                     for (0..$number_of_terms-1) {
 143                         my @term = @{ $terms[$_] };
 144                         if ( $term[1] <= $min_occurence ) {
 145                             $min_occurence       = $term[1];
 146                             $min_occurence_index = $_;
 147                         }
 148                     }
 149                 }
 150             }
 151             else {
 152                 if ( $occ > $min_occurence) {
 153                     @{ $terms[$min_occurence_index] }[0] = $term;
 154                     @{ $terms[$min_occurence_index] }[1] = $occ;
 155                     $min_occurence = $MAX_OCCURENCE;
 156                     for (0..$max_terms-1) {
 157                         my @term = @{ $terms[$_] };
 158                         if ( $term[1] <= $min_occurence ) {
 159                             $min_occurence       = $term[1];
 160                             $min_occurence_index = $_;
 161                         }
 162                     }
 163                 }
 164             }
 165         }
 166         $from = $term;
 167     }
 168
 169     # Sort array of array by terms weight
 170     @terms = sort { @{$a}[1] <=> @{$b}[1] } @terms;
 171
 172     # A relatif weight to other set terms is added to each term
 173     my $min     = $terms[0][1];
 174     my $log_min = log( $min );
 175     my $max     = $terms[$#terms-1][1];
 176     my $log_max = log( $max );
 177     my $delta   = $max - $min;
 178     my $factor;
 179     if ($log_max - $log_min == 0) {
 180         $log_min = $log_min - $self->{levels_cloud};
 181         $factor = 1;
 182     }
 183     else {
 184         $factor = $self->{levels_cloud} / ($log_max - $log_min);
 185     }
 186
 187     foreach (0..$#terms-1) {
 188         my $count = @{ $terms[$_] }[1];
 189         my $weight = ( $count - $min ) / $delta;
 190         my $log_weight = int( (log($count) - $log_min) * $factor);
 191         push( @{ $terms[$_] }, $weight );
 192         push( @{ $terms[$_] }, $log_weight );
 193     }
 194     $self->{ top_terms } = \@terms;
 195
 196     # Sort array of array by terms alphabetical order
 197     @terms = sort { @{$a}[0] cmp @{$b}[0] } @terms;
 198 }
 199
 200
 201 #
 202 # Returns a HTML version of index top terms formated
 203 # as a 'tag cloud'.
 204 #
 205 sub html_cloud {
 206     my $self = shift;
 207     my $koha_index = shift;
 208     my $withcss = shift;
 209     my @terms = @{ $self->{top_terms} };
 210     my $html = <<EOS;
 211 <style>
 212 .subjectcloud {
 213     text-align:  center;
 214     line-height: 16px;
 215     margin: 20px;
 216     background: #f0f0f0;
 217     padding: 3%;
 218 }
 219 .subjectcloud a {
 220     font-weight: lighter;
 221     text-decoration: none;
 222 }
 223 span.tagcloud0  { font-size: 12px;}
 224 span.tagcloud1  { font-size: 13px;}
 225 span.tagcloud2  { font-size: 14px;}
 226 span.tagcloud3  { font-size: 15px;}
 227 span.tagcloud4  { font-size: 16px;}
 228 span.tagcloud5  { font-size: 17px;}
 229 span.tagcloud6  { font-size: 18px;}
 230 span.tagcloud7  { font-size: 19px;}
 231 span.tagcloud8  { font-size: 20px;}
 232 span.tagcloud9  { font-size: 21px;}
 233 span.tagcloud10 { font-size: 22px;}
 234 span.tagcloud11 { font-size: 23px;}
 235 span.tagcloud12 { font-size: 24px;}
 236 span.tagcloud13 { font-size: 25px;}
 237 span.tagcloud14 { font-size: 26px;}
 238 span.tagcloud15 { font-size: 27px;}
 239 span.tagcloud16 { font-size: 28px;}
 240 span.tagcloud17 { font-size: 29px;}
 241 span.tagcloud18 { font-size: 30px;}
 242 span.tagcloud19 { font-size: 31px;}
 243 span.tagcloud20 { font-size: 32px;}
 244 span.tagcloud21 { font-size: 33px;}
 245 span.tagcloud22 { font-size: 34px;}
 246 span.tagcloud23 { font-size: 35px;}
 247 span.tagcloud24 { font-size: 36px;}
 248 </style>
 249 <div class="subjectcloud">
 250 EOS
 251     for (0..$#terms-1) {
 252         my @term = @{ $terms[$_] };
 253         my $uri = $term[0];
 254         $uri =~ s/\(//g;
 255         $html = $html
 256             . '<span class="tagcloud'
 257             . $term[3]
 258             . '">'
 259             . '<a href="/cgi-bin/koha/opac-search.pl?q='
 260             . $koha_index
 261             . '%3A'
 262             . $uri
 263             . '">'
 264             . $term[0]
 265             . "</a></span>\n";
 266     }
 267     $html .= "</div>\n";
 268     return $html;
 269 }
 270
 271
 272 =head1 NAME
 273
 274 cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes
 275
 276 =head1 USAGE
 277
 278 =over
 279
 280 =item cloud-kw.pl [--verbose|--help] --conf=F<cloud.conf>
 281
 282 Creates multiple HTML files containing kewords cloud.
 283 F<cloud.conf> is a YAML configuration file driving cloud generation
 284 process.
 285
 286 =back
 287
 288 =head1 PARAMETERS
 289
 290 =over
 291
 292 =item B<--conf=configuration file>
 293
 294 Specify configuration file name
 295
 296 =item B<--verbose|-v>
 297
 298 Enable script verbose mode.
 299
 300 =item B<--help|-h>
 301
 302 Print this help page.
 303
 304 =back
 305
 306 =head1 CONFIGURATION
 307
 308 Configuration file looks like that:
 309
 310  ---
 311   # Koha configuration file for a specific installation
 312   KohaConf: /home/koha/mylibray/etc/koha-conf.xml
 313   # Zebra index to scan
 314   ZebraIndex: Author
 315   # Koha index used to link found kewords with an opac search URL
 316   KohaIndex: au
 317   # Number of top keyword to used for the cloud
 318   Count: 50
 319   # Include CSS style directives with the cloud
 320   Withcss: Yes
 321   # HTML file where to output the cloud
 322   Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html
 323  ---
 324   KohaConf: /home/koha/yourlibray/etc/koha-conf.xml
 325   ZebraIndex: Subject
 326   KohaIndex: su
 327   Count: 200
 328   Withcss: no
 329   Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html
 330
 331 =cut
 332
 333