From 6345ddf493e891cf30175980ac933570c0dfbf9d Mon Sep 17 00:00:00 2001 From: Frederic Demians Date: Thu, 18 Dec 2008 18:38:25 +0100 Subject: [PATCH] A script generating keyword clouds from Zebra Indexes See embedded perldoc to see how it works. Signed-off-by: Galen Charlton --- misc/cronjobs/cloud-kw.pl | 333 ++++++++++++++++++++++++++++++++ misc/cronjobs/cloud-sample.conf | 14 ++ 2 files changed, 347 insertions(+) create mode 100755 misc/cronjobs/cloud-kw.pl create mode 100644 misc/cronjobs/cloud-sample.conf diff --git a/misc/cronjobs/cloud-kw.pl b/misc/cronjobs/cloud-kw.pl new file mode 100755 index 0000000000..8896d15d74 --- /dev/null +++ b/misc/cronjobs/cloud-kw.pl @@ -0,0 +1,333 @@ +#!/usr/bin/perl + +# +# Copyright 2008 Tamil s.a.r.l. +# +# This software is placed under the gnu General Public License, v2 +# (http://www.gnu.org/licenses/gpl.html) +# + +use strict; +use warnings; +use diagnostics; +use Carp; +use YAML::Syck; +use Pod::Usage; +use Getopt::Long; +use C4::Context; + + +my $verbose = 0; +my $help = 0; +my $conf = ''; +GetOptions( + 'verbose' => \$verbose, + 'help' => \$help, + 'conf=s' => \$conf, +); + +sub usage { + pod2usage( -verbose => 2 ); + exit; +} + +usage() if $help || !$conf; + + +my @clouds; +print "Reading configuration file: $conf\n" if $verbose; +eval { + @clouds = LoadFile( $conf ); +}; +croak "Unable to read configuration file: $conf\n" if $@; + +for my $cloud ( @clouds ) { + print "Create a cloud\n", + " Koha conf file: ", $cloud->{KohaConf}, "\n", + " Zebra Index: ", $cloud->{ZebraIndex}, "\n", + " Koha Keyword: ", $cloud->{KohaIndex}, "\n", + " Count: ", $cloud->{Count}, "\n", + " Withcss: ", $cloud->{Withcss}, "\n", + " Output: ", $cloud->{Output}, "\n", + if $verbose; + + # Set Koha context + my $context = new C4::Context( $cloud->{KohaConf} ); + $context->set_context(); + + my $index = new ZebraIndex( $cloud->{ZebraIndex} ); + $index->scan( $cloud->{Count} ); + + open my $fh, ">", $cloud->{Output} + or croak "Unable to create file ", $cloud->{Output}; + + my $withcss = $cloud->{Withcss}; + $withcss =~ /^y/i; + print $fh $index->html_cloud( $cloud->{KohaIndex}, $withcss ); + close $fh; +} + + + +package ZebraIndex; + + +sub new { + my $self = {}; + my $class = shift; + $self->{ zebra_index } = shift; + $self->{ top_terms } = undef; + $self->{ levels_cloud } = 24; + bless $self, $class; + return $self; +} + + +# +# scan +# Scan zebra index and populate an array of top terms +# +# PARAMETERS: +# $max_terms Max number of top terms +# +# RETURN: +# A 4-dimensionnal array in $self->{top_terms} +# [0] term +# [1] term number of occurences +# [2] term proportional relative weight in terms set E[0-1] +# [3] term logarithmic relative weight E [0-levels_cloud] +# +# This array is sorted alphabetically by terms ([0]) +# It can be easily sorted by occurences: +# @t = sort { $a[1] <=> $a[1] } @{$self->{top_terms}}; +# +sub scan { + my $self = shift; + my $index_name = $self->{ zebra_index }; + my $max_terms = shift; + + my $MAX_OCCURENCE = 1000000000; + + my $zbiblio = C4::Context->Zconn( "biblioserver" ); + my $number_of_terms = 0; + my @terms; # 2 dimensions array + my $min_occurence_index = -1; + my $min_occurence; + my $from = '0'; + while (1) { + my $ss; + eval { + print "$from\n" if $verbose; + $from =~ s/\"/\\\"/g; + my $query = '@attr 1=' . $index_name . ' @attr 4=1 @attr 6=3 "' + . $from . 'a"'; + $ss = $zbiblio->scan_pqf( $query ); + }; + if ($@) { + chop $from; + next; + } + $ss->option( rpnCharset => 'UTF-8' ); + last if $ss->size() == 0; + my $term = ''; + my $occ = 0; + for my $index ( 0..$ss->size()-1 ) { + ($term, $occ) = $ss->display_term($index); + #print "$term:$occ\n"; + if ( $number_of_terms < $max_terms ) { + push( @terms, [ $term, $occ ] ); + ++$number_of_terms; + if ( $number_of_terms == $max_terms ) { + $min_occurence = $MAX_OCCURENCE; + for (0..$number_of_terms-1) { + my @term = @{ $terms[$_] }; + if ( $term[1] <= $min_occurence ) { + $min_occurence = $term[1]; + $min_occurence_index = $_; + } + } + } + } + else { + if ( $occ > $min_occurence) { + @{ $terms[$min_occurence_index] }[0] = $term; + @{ $terms[$min_occurence_index] }[1] = $occ; + $min_occurence = $MAX_OCCURENCE; + for (0..$max_terms-1) { + my @term = @{ $terms[$_] }; + if ( $term[1] <= $min_occurence ) { + $min_occurence = $term[1]; + $min_occurence_index = $_; + } + } + } + } + } + $from = $term; + } + + # Sort array of array by terms weight + @terms = sort { @{$a}[1] <=> @{$b}[1] } @terms; + + # A relatif weight to other set terms is added to each term + my $min = $terms[0][1]; + my $log_min = log( $min ); + my $max = $terms[$#terms-1][1]; + my $log_max = log( $max ); + my $delta = $max - $min; + my $factor; + if ($log_max - $log_min == 0) { + $log_min = $log_min - $self->{levels_cloud}; + $factor = 1; + } + else { + $factor = $self->{levels_cloud} / ($log_max - $log_min); + } + + foreach (0..$#terms-1) { + my $count = @{ $terms[$_] }[1]; + my $weight = ( $count - $min ) / $delta; + my $log_weight = int( (log($count) - $log_min) * $factor); + push( @{ $terms[$_] }, $weight ); + push( @{ $terms[$_] }, $log_weight ); + } + $self->{ top_terms } = \@terms; + + # Sort array of array by terms alphabetical order + @terms = sort { @{$a}[0] cmp @{$b}[0] } @terms; +} + + +# +# Returns a HTML version of index top terms formated +# as a 'tag cloud'. +# +sub html_cloud { + my $self = shift; + my $koha_index = shift; + my $withcss = shift; + my @terms = @{ $self->{top_terms} }; + my $html = < +.subjectcloud { + text-align: center; + line-height: 16px; + margin: 20px; + background: #f0f0f0; + padding: 3%; +} +.subjectcloud a { + font-weight: lighter; + text-decoration: none; +} +span.tagcloud0 { font-size: 12px;} +span.tagcloud1 { font-size: 13px;} +span.tagcloud2 { font-size: 14px;} +span.tagcloud3 { font-size: 15px;} +span.tagcloud4 { font-size: 16px;} +span.tagcloud5 { font-size: 17px;} +span.tagcloud6 { font-size: 18px;} +span.tagcloud7 { font-size: 19px;} +span.tagcloud8 { font-size: 20px;} +span.tagcloud9 { font-size: 21px;} +span.tagcloud10 { font-size: 22px;} +span.tagcloud11 { font-size: 23px;} +span.tagcloud12 { font-size: 24px;} +span.tagcloud13 { font-size: 25px;} +span.tagcloud14 { font-size: 26px;} +span.tagcloud15 { font-size: 27px;} +span.tagcloud16 { font-size: 28px;} +span.tagcloud17 { font-size: 29px;} +span.tagcloud18 { font-size: 30px;} +span.tagcloud19 { font-size: 31px;} +span.tagcloud20 { font-size: 32px;} +span.tagcloud21 { font-size: 33px;} +span.tagcloud22 { font-size: 34px;} +span.tagcloud23 { font-size: 35px;} +span.tagcloud24 { font-size: 36px;} + +
+EOS + for (0..$#terms-1) { + my @term = @{ $terms[$_] }; + my $uri = $term[0]; + $uri =~ s/\(//g; + $html = $html + . '' + . '' + . $term[0] + . "\n"; + } + $html .= "
\n"; + return $html; +} + + +=head1 NAME + +cloud-kw.pl - Creates HTML keywords clouds from Koha Zebra Indexes + +=head1 USAGE + +=over + +=item cloud-kw.pl [--verbose|--help] --conf=F + +Creates multiple HTML files containing kewords cloud. +F is a YAML configuration file driving cloud generation +process. + +=back + +=head1 PARAMETERS + +=over + +=item B<--conf=configuration file> + +Specify configuration file name + +=item B<--verbose|-v> + +Enable script verbose mode. + +=item B<--help|-h> + +Print this help page. + +=back + +=head1 CONFIGURATION + +Configuration file looks like that: + + --- + # Koha configuration file for a specific installation + KohaConf: /home/koha/mylibray/etc/koha-conf.xml + # Zebra index to scan + ZebraIndex: Author + # Koha index used to link found kewords with an opac search URL + KohaIndex: au + # Number of top keyword to used for the cloud + Count: 50 + # Include CSS style directives with the cloud + Withcss: Yes + # HTML file where to output the cloud + Output: /home/koha/mylibrary/koharoot/koha-tmpl/cloud-author.html + --- + KohaConf: /home/koha/yourlibray/etc/koha-conf.xml + ZebraIndex: Subject + KohaIndex: su + Count: 200 + Withcss: no + Output: /home/koha/yourlibrary/koharoot/koha-tmpl/cloud-subject.html + +=cut + + diff --git a/misc/cronjobs/cloud-sample.conf b/misc/cronjobs/cloud-sample.conf new file mode 100644 index 0000000000..d79ed266ab --- /dev/null +++ b/misc/cronjobs/cloud-sample.conf @@ -0,0 +1,14 @@ +--- +KohaConf: /home/mylibrary/etc/koha-conf.xml +ZebraIndex: Author-name-corporate +KohaIndex: au +Count: 50 +Withcss: yes +Output: /home/mylibrary/koha/koha-tmpl/cloud-author.html +--- +KohaConf: /home/yourlibrary/etc/koha-conf.xml +ZebraIndex: Subject +KohaIndex: su +Count: 200 +Withcss: no +Output: /home/yourlibrary/koha/koha-tmpl/cloud-subject.html -- 2.20.1