From b53be9cdaf97b61e827f25c4b07b6352bc8df570 Mon Sep 17 00:00:00 2001
From: tipaul <tipaul>
Date: Wed, 25 Apr 2007 16:26:42 +0000
Subject: [PATCH] Koha 3.0 nozebra 1st commit : the script
 misc/migration_tools/rebuild_nozebra.pl build the nozebra table, and, if you
 set NoZebra to Yes, queries will be done through the nozebra table. TODO : - add nozebra
 table management on biblio editing - the index table content is hardcoded. I
 still have to add some specific systempref to let the library update it -
 manage pagination (next/previous) - manage facets WHAT works : - NZgetRecords
 : has exactly the same API & returns as zebra getQuery, except that some
 parameters are unused - search & sort works quite good - CQL parser is better
 than what I thought I could do : title="harry and sally" and
 publicationyear>2000 not itemtype=LIVR should work fine

---
 C4/Biblio.pm                            |  21 +-
 C4/Search.pm                            | 329 ++++++++++++++++++++++++
 catalogue/search.pl                     |  14 +-
 misc/migration_tools/rebuild_nozebra.pl | 100 +++++++
 4 files changed, 454 insertions(+), 10 deletions(-)
 create mode 100755 misc/migration_tools/rebuild_nozebra.pl
diff --git a/C4/Biblio.pm b/C4/Biblio.pm
index 6d73d84095..912410a707 100644
--- a/C4/Biblio.pm
+++ b/C4/Biblio.pm
@@ -1539,11 +1539,11 @@ sub GetMarcBiblio {
     $sth->execute($biblionumber);
     my ($marcxml) = $sth->fetchrow;
     MARC::File::XML->default_record_format(C4::Context->preference('marcflavour'));
-#     $marcxml =~ s/\x1e//g;
-#     $marcxml =~ s/\x1f//g;
-#     $marcxml =~ s/\x1d//g;
-#     $marcxml =~ s/\x0f//g;
-#     $marcxml =~ s/\x0c//g;
+    $marcxml =~ s/\x1e//g;
+    $marcxml =~ s/\x1f//g;
+    $marcxml =~ s/\x1d//g;
+    $marcxml =~ s/\x0f//g;
+    $marcxml =~ s/\x0c//g;
     my $record = MARC::Record->new();
     $record = MARC::Record::new_from_xml( $marcxml, "utf8",C4::Context->preference('marcflavour')) if $marcxml;
     return $record;
@@ -3691,6 +3691,17 @@ Joshua Ferraro jmf@liblime.com
 
 # $Id$
 # $Log$
+# Revision 1.200  2007/04/25 16:26:42  tipaul
+# Koha 3.0 nozebra 1st commit : the script misc/migration_tools/rebuild_nozebra.pl build the nozebra table, and, if you set NoZebra to Yes, queries will be done through the nozebra table. TODO :
+# - add nozebra table management on biblio editing
+# - the index table content is hardcoded. I still have to add some specific systempref to let the library update it
+# - manage pagination (next/previous)
+# - manage facets
+# WHAT works :
+# - NZgetRecords : has exactly the same API & returns as zebra getQuery, except that some parameters are unused
+# - search & sort works quite good
+# - CQL parser is better than what I thought I could do : title="harry and sally" and publicationyear>2000 not itemtype=LIVR should work fine
+#
 # Revision 1.199  2007/04/24 09:07:53  tipaul
 # moving dotransfer to Biblio.pm::ModItemTransfer + some CheckReserves fixes
 #
diff --git a/C4/Search.pm b/C4/Search.pm
index c4eef801f7..72001b54ad 100755
--- a/C4/Search.pm
+++ b/C4/Search.pm
@@ -53,6 +53,7 @@ This module provides the searching facilities for the Koha into a zebra catalog.
   &searchResults
   &getRecords
   &buildQuery
+  &NZgetRecords
 );
 
 # make all your functions, whether exported or not;
@@ -489,6 +490,8 @@ sub getRecords {
             }
         }
     }
+    use Data::Dumper;
+    warn Dumper($results_hashref);
     return ( undef, $results_hashref, \@facets_loop );
 }
 
@@ -1042,6 +1045,332 @@ sub searchResults {
     return @newresults;
 }
 
+
+#----------------------------------------------------------------------
+#
+# Non-Zebra GetRecords#
+#----------------------------------------------------------------------
+
+=item
+  NZgetRecords has the same API as Zebra getRecords, even if some parameters are not managed
+=cut
+
+sub NZgetRecords {
+    # Same positional arguments as getRecords ; only the query and the first
+    # sort specification are used here, the other arguments are ignored.
+    my ( $koha_query, $federated_query, $sort_by_ref, $servers_ref,
+        $results_per_page, $offset, $expanded_facet, $branches,
+        $query_type, $scan ) = @_;
+    # list of the entries matching the query, as returned by NZanalyse
+    my $matches = NZanalyse($koha_query);
+    # sorted records, in the structure returned by NZorder
+    my $ordered = NZorder( $matches, @$sort_by_ref[0] );
+    return ( undef, $ordered, undef );
+}
+
+=item
+
+  NZanalyse : get a CQL string as parameter, and returns a list of biblionumber;title,biblionumber;title,...
+  the list is built from the inverted index stored in the nozebra SQL table
+  note that title is here only for convenience : the sorting will be very fast when requested on title
+  if the sorting is requested on something else, we will have to reread all results, and that may be longer.
+
+=cut
+
+sub NZanalyse {
+    # Evaluate a CQL-like query string against the nozebra inverted index.
+    #   $string : the query, e.g.  title="harry and sally" and author=rowling
+    # Supported syntax : leaves joined by " and ", " or ", " not " (lower case,
+    # surrounded by blanks) ; a leaf is either  index operator value  (operators
+    # = > >= < <=) or plain words, that are searched in every index.
+    # Returns a string "biblionumber;title,biblionumber;title,..." (every entry
+    # is followed by a comma ; an entry can be repeated, the repetitions are
+    # used as a weight), or undef when nothing matches.
+    my ($string) = @_;
+    # if we have a ", replace the content to discard temporarily any and/or/not inside
+    # (only the 1st quoted phrase is protected ; $1 is read only when the
+    # substitution really succeeded)
+    my $commacontent;
+    if ($string =~ s/"(.*?)"/__X__/) {
+        $commacontent = $1;
+    }
+    # split the query string in 3 parts : X AND Y means : $left="X", $operand="AND" and $right="Y"
+    # then, call again NZanalyse with $left and $right
+    # (recursive until we find a leaf (=> something without and/or/not)
+    # The captures are read in list context : when the regexp does not match, the
+    # 3 variables are undef, instead of inheriting the $1/$2/$3 of an older
+    # successful match (possibly the one done by the calling NZanalyse).
+    my ($left, $operand, $right) = $string =~ /(.*)( and | or | not )(.*)/;
+    # it's not a leaf, we have a and/or/not
+    if ($operand) {
+        # reintroduce comma content if needed
+        $right =~ s/__X__/"$commacontent"/ if defined $commacontent;
+        $left =~ s/__X__/"$commacontent"/ if defined $commacontent;
+        my $leftresult = NZanalyse($left);
+        my $rightresult = NZanalyse($right);
+        # OK, we have the results for right and left part of the query
+        # depending of operand, intersect, union or exclude both lists
+        # to get a result list
+        my @leftresult = split /,/, (defined $leftresult ? $leftresult : '');
+        # lookup table of the right entries : the comparison must be done on whole
+        # entries, a regexp test would find "1;title," inside "61;title," and
+        # would interpret the title as a regexp
+        my %in_right = map { $_ => 1 } split /,/, (defined $rightresult ? $rightresult : '');
+        if ($operand eq ' and ') {
+            my $finalresult;
+            # parse the left results, and if the biblionumber exist in the right result, save it in finalresult
+            # the result is stored twice, to have the same weight for AND than OR.
+            # example : TWO : 61,61,64,121 (two is twice in the biblio #61) / TOWER : 61,64,130
+            # result : 61,61,61,61,64,64 for two AND tower : 61 has more weight than 64
+            foreach (@leftresult) {
+                if ($in_right{$_}) {
+                    $finalresult .= "$_,$_,";
+                }
+            }
+            return $finalresult;
+        } elsif ($operand eq ' or ') {
+            # just merge the 2 strings
+            return $leftresult.$rightresult;
+        } elsif ($operand eq ' not ') {
+            my $finalresult;
+            # keep the left entries that are absent from the right result
+            foreach (@leftresult) {
+                unless ($in_right{$_}) {
+                    $finalresult .= "$_,";
+                }
+            }
+            return $finalresult;
+        } else {
+            # this error is impossible, because of the regexp that isolate the operand, but just in case...
+            die "error : operand unknown : $operand for $string";
+        }
+    # it's a leaf, do the real SQL query and return the result
+    } else {
+        $string =~ s/__X__/"$commacontent"/ if defined $commacontent;
+        # parse the string in index/operator/value BEFORE the punctuation is
+        # removed (the cleanup also removes < and >, so it used to hide these
+        # operators). The 2 characters operators are tried first and the index
+        # name is matched non-greedily, otherwise >= would be read as "=".
+        my ($index, $operator, $value) = $string =~ /^(.*?)(>=|<=|=|>|<)(.*)$/;
+        my $dbh = C4::Context->dbh;
+        my ($sth, @bind, $words);
+        if ($operator) {
+            #do a specific search
+            $index =~ s/^\s+|\s+$//g;
+            $operator = 'LIKE' if $operator eq '=' and $value =~ /%/;
+            # $operator is one of the fixed strings matched above (or LIKE), so it
+            # can be put in the SQL text ; index and value use placeholders
+            $sth = $dbh->prepare("SELECT biblionumbers FROM nozebra WHERE indexname=? AND value $operator ?");
+            @bind = ($index);
+            $words = $value;
+        } else {
+            #do a complete search (all indexes)
+            $sth = $dbh->prepare("SELECT biblionumbers FROM nozebra WHERE value LIKE ?");
+            $words = $string;
+        }
+        # replace the punctuation by blanks (same character list as the one used
+        # by misc/migration_tools/rebuild_nozebra.pl to build the index)
+        $words =~ s/[-.?,;!'()\[\]{}"<>&+*\/]/ /g;
+        # split each word, query the DB and build the biblionumbers result
+        # (split ' ' ignores leading and repeated blanks : no empty word is searched)
+        my $results;
+        my $first_word = 1;
+        foreach my $word (split ' ', $words) {
+            my $biblionumbers;
+            $sth->execute(@bind, $word);
+            while (my $line = $sth->fetchrow) {
+                $biblionumbers .= $line;
+            }
+            # the 1st word gives the 1st result list, the next ones are ANDed with it.
+            # A flag is used (and not the content of $results) : a 1st word without
+            # any hit must give an empty answer, not be silently ignored
+            if ($first_word) {
+                $results = $biblionumbers;
+                $first_word = 0;
+                next;
+            }
+            my %in_results = map { $_ => 1 } split /,/, (defined $results ? $results : '');
+            my $temp;
+            foreach (split /,/, (defined $biblionumbers ? $biblionumbers : '')) {
+                # stored twice, to have the same weight as with the and operand
+                if ($in_results{$_}) {
+                    $temp .= "$_,$_,";
+                }
+            }
+            $results = $temp;
+        }
+        return $results;
+    }
+}
+
+sub NZorder {
+    # Sort the entries found by NZanalyse and return them in the same structure as getRecords.
+    #   $biblionumbers : string "biblionumber;title,biblionumber;title,..."
+    #   $ordering      : sort specification : an attribute, then <i (ascending) or >i (descending)
+    # Returns { biblioserver => { hits => number of records, RECORDS => [ records in usmarc ] } }
+    my ($biblionumbers, $ordering) = @_;
+    $ordering = '1=36 <i' unless $ordering; # order title asc by default
+    my $dbh = C4::Context->dbh;
+    if ($ordering =~ /1=9523/) { # order by POPULARITY
+        my %result;
+        my %popularity;
+        # popularity is not in MARC record, it's built from a specific query
+        my $sth = $dbh->prepare("select sum(issues) from items where biblionumber=?");
+        foreach (split /,/,$biblionumbers) {
+            my ($biblionumber,$title) = split /;/,$_;
+            $result{$biblionumber}=GetMarcBiblio($biblionumber);
+            $sth->execute($biblionumber);
+            my $popularity= $sth->fetchrow ||0;
+            # hint : the key is popularity.title because we can have
+            # many results with the same popularity. In this case, sub-ordering is done by title
+            # we also have biblionumber to avoid bug for 2 biblios with the same title & popularity
+            # the popularity has a fixed width (%10d), so the keys can be compared as strings
+            $popularity{sprintf("%10d",$popularity).$title.$biblionumber} = $biblionumber;
+        }
+        # sort the hash and return the same structure as GetRecords (Zebra querying)
+        my $result_hash;
+        my $numbers=0;
+        if ($ordering eq '1=9523 >i') { # sort popularity DESC
+            foreach my $key (sort { $b cmp $a } (keys %popularity)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$popularity{$key}}->as_usmarc();
+            }
+        } else { # sort popularity ASC
+            foreach my $key (sort (keys %popularity)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$popularity{$key}}->as_usmarc();
+            }
+        }
+        my $finalresult = {};
+        $result_hash->{'hits'} = $numbers;
+        $finalresult->{'biblioserver'} = $result_hash;
+        return $finalresult;
+    #
+    # ORDER BY author
+    # (regexp test : with an exact comparison on '1=1003 <i' the descending sort below is never reached)
+    } elsif ($ordering =~ /1=1003/){
+        my %result;
+        foreach (split /,/,$biblionumbers) {
+            my ($biblionumber,$title) = split /;/,$_;
+            my $record=GetMarcBiblio($biblionumber);
+            my $author;
+            if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
+                $author=$record->subfield('200','f');
+                $author=$record->subfield('700','a') unless $author;
+            } else {
+                $author=$record->subfield('100','a');
+            }
+            # hint : the result is sorted by author.biblionumber because we can have X biblios with the same author
+            # and we don't want to get only 1 result for each of them !!!
+            $result{$author.$biblionumber}=$record;
+        }
+        # sort the hash and return the same structure as GetRecords (Zebra querying)
+        my $result_hash;
+        my $numbers=0;
+        if ($ordering eq '1=1003 <i') { # sort by author asc
+            foreach my $key (sort (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        } else { # sort by author desc (string comparison : the keys are not numbers)
+            foreach my $key (sort { $b cmp $a } (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        }
+        my $finalresult = {};
+        $result_hash->{'hits'} = $numbers;
+        $finalresult->{'biblioserver'} = $result_hash;
+        return $finalresult;
+    #
+    # ORDER BY callnumber
+    # (regexp test, for the same reason as the author sort)
+    } elsif ($ordering =~ /1=20/){
+        my %result;
+        # NOTE(review): GetMarcFromKohaField is called with $dbh on the 1st line and without it on the 2nd one ; only one form can match its signature -- confirm against C4::Biblio
+        my ($callnumber_tag,$callnumber_subfield)=GetMarcFromKohaField($dbh,'items.itemcallnumber');
+        ($callnumber_tag,$callnumber_subfield)= GetMarcFromKohaField('biblioitems.callnumber') unless $callnumber_tag;
+        foreach (split /,/,$biblionumbers) {
+            my ($biblionumber,$title) = split /;/,$_;
+            my $record=GetMarcBiblio($biblionumber);
+            # the callnumber is read in the field found above (the author subfields
+            # were read here before, a copy/paste of the author sort)
+            my $callnumber;
+            $callnumber=$record->subfield($callnumber_tag,$callnumber_subfield) if $callnumber_tag;
+            $callnumber = '' unless defined $callnumber;
+            # hint : the result is sorted by callnumber.biblionumber because we can have X biblios with the same callnumber
+            # and we don't want to get only 1 result for each of them !!!
+            $result{$callnumber.$biblionumber}=$record;
+        }
+        # sort the hash and return the same structure as GetRecords (Zebra querying)
+        my $result_hash;
+        my $numbers=0;
+        if ($ordering eq '1=20 <i') { # sort by callnumber asc
+            foreach my $key (sort (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        } else { # sort by callnumber desc
+            foreach my $key (sort { $b cmp $a } (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        }
+        my $finalresult = {};
+        $result_hash->{'hits'} = $numbers;
+        $finalresult->{'biblioserver'} = $result_hash;
+        return $finalresult;
+    } elsif ($ordering =~ /1=31/){ #pub year
+        my %result;
+        # the field is the same for every record : found once, before the loop. NOTE(review): $dbh argument, see the callnumber sort -- confirm
+        my ($publicationyear_tag,$publicationyear_subfield)=GetMarcFromKohaField($dbh,'biblioitems.publicationyear');
+        foreach (split /,/,$biblionumbers) {
+            my ($biblionumber,$title) = split /;/,$_;
+            my $record=GetMarcBiblio($biblionumber);
+            # hint : the key is publicationyear.biblionumber, to keep every biblio published the same year
+            my $publicationyear = $publicationyear_tag ? $record->subfield($publicationyear_tag,$publicationyear_subfield) : '';
+            $result{$publicationyear.$biblionumber}=$record;
+        }
+        # sort the hash and return the same structure as GetRecords (Zebra querying)
+        my $result_hash;
+        my $numbers=0;
+        if ($ordering eq '1=31 <i') { # sort by publication year asc
+            foreach my $key (sort (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        } else { # sort by publication year desc
+            foreach my $key (sort { $b cmp $a } (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        }
+        my $finalresult = {};
+        $result_hash->{'hits'} = $numbers;
+        $finalresult->{'biblioserver'} = $result_hash;
+        return $finalresult;
+    #
+    # ORDER BY title
+    #
+    } else {
+        # the title is in the biblionumbers string, so we just need to build a hash, sort it and return
+        my %result;
+        foreach (split /,/,$biblionumbers) {
+            my ($biblionumber,$title) = split /;/,$_;
+            # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title
+            # and we don't want to get only 1 result for each of them !!!
+            $result{$title.$biblionumber}=GetMarcBiblio($biblionumber);
+        }
+        # sort the hash and return the same structure as GetRecords (Zebra querying)
+        my $result_hash;
+        my $numbers=0;
+        if ($ordering eq '1=36 <i') { # sort by title asc
+            foreach my $key (sort (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        } else { # sort by title desc
+            foreach my $key (sort { $b cmp $a } (keys %result)) {
+                $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc();
+            }
+        }
+        my $finalresult = {};
+        $result_hash->{'hits'} = $numbers;
+        $finalresult->{'biblioserver'} = $result_hash;
+        return $finalresult;
+    }
+}
 END { }    # module clean-up code here (global destructor)
 
 1;
diff --git a/catalogue/search.pl b/catalogue/search.pl
index f91081cd94..3b95cc9e77 100755
--- a/catalogue/search.pl
+++ b/catalogue/search.pl
@@ -400,11 +400,15 @@ my $facets; # this object stores the faceted results that display on the left-ha
 my @results_array;
 my $results_hashref;
 
-eval {
-
-    ($error, $results_hashref, $facets) = getRecords($koha_query,$federated_query,\@sort_by,\@servers,$results_per_page,$offset,$expanded_facet,$branches,$query_type,$scan);
-
-};
+if (C4::Context->preference('NoZebra')) {
+    eval {
+        ($error, $results_hashref, $facets) = NZgetRecords($koha_query,$federated_query,\@sort_by,\@servers,$results_per_page,$offset,$expanded_facet,$branches,$query_type,$scan);
+    };
+} else {
+    eval {
+        ($error, $results_hashref, $facets) = getRecords($koha_query,$federated_query,\@sort_by,\@servers,$results_per_page,$offset,$expanded_facet,$branches,$query_type,$scan);
+    };
+}
 if ($@ || $error) {
     $template->param(query_error => $error.$@);
 
diff --git a/misc/migration_tools/rebuild_nozebra.pl b/misc/migration_tools/rebuild_nozebra.pl
new file mode 100755
index 0000000000..6b98ec5b31
--- /dev/null
+++ b/misc/migration_tools/rebuild_nozebra.pl
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+
+use C4::Context;
+use Getopt::Long;
+use C4::Biblio;
+use C4::AuthoritiesMarc;
+
+use strict;
+#
+# script that fills the nozebra table : it reads the MARC record of each biblio, splits the
+# subfields in words and stores, for each index and each word, the list "biblionumber;title,"
+# of the biblios containing this word (subfields of no declared index go to the __RAW__ index)
+
+$|=1; # flushes output
+
+# limit for database dumping. NOTE(review): with this hard-coded value only the first 1000 biblios are indexed -- confirm it is wanted
+my $limit = "LIMIT 1000";
+my $directory;
+my $skip_export;
+my $keep_export;
+my $reset;
+my $biblios;
+my $authorities;
+GetOptions(    # NOTE(review): the options are parsed, but none of them is used by the code below
+    'd:s'      => \$directory,
+    'reset'      => \$reset,
+    's'        => \$skip_export,
+    'k'        => \$keep_export,
+    'b'        => \$biblios,
+    'a'        => \$authorities,
+    );
+
+$directory = "export" unless $directory;
+my $dbh=C4::Context->dbh;
+$dbh->do("truncate nozebra");
+my $sth;
+$sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
+$sth->execute();
+my $i=0;
+my %result;
+
+my %index = (
+    'title' => '200a,200c,200d',
+    'author' =>'200f,700*,701*,702*'
+    );
+
+# read every biblio and build the whole index in memory : $result{index name}->{word} = "biblionumber;title,..."
+while (my ($biblionumber) = $sth->fetchrow) {
+    $i++;
+    print "\r$i";
+    my $record = GetMarcBiblio($biblionumber);
+
+    # get title of the record (to store the 10 first letters with the index)
+    my $title;
+    if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
+        $title = lc($record->subfield('200','a'));
+    } else {
+        $title = lc($record->subfield('245','a'));
+    }
+    # remove blanks, commas and semicolons (they are the separators used when decoding the string for CQL retrieval)
+    $title =~ s/ |,|;//g;
+    # limit to 10 char, should be enough, and limit the DB size
+    $title = substr($title,0,10);
+    #parse each field
+    foreach my $field ($record->fields()) {
+        #parse each subfield
+        next if $field->tag <10;
+        foreach my $subfield ($field->subfields()) {
+            my $tag = $field->tag();
+            my $subfieldcode = $subfield->[0];
+            my $indexed=0;
+            # check each index to see if the subfield is stored somewhere
+            # otherwise, store it in __RAW__ index
+            foreach my $key (keys %index) {
+                if ($index{$key} =~ /\Q$tag\E\*/ or $index{$key} =~ /\Q$tag$subfieldcode\E/) {
+                    $indexed=1;
+                    my $line= lc $subfield->[1];
+                    $line =~ s/[-.?,;!'()\[\]{}"<>&+*\/]/ /g;
+                    foreach (split ' ',$line) {
+                        $result{$key}->{$_}.="$biblionumber;$title," unless $subfieldcode eq '9';
+                    }
+                }
+            }
+            # the subfield is not indexed, store it in __RAW__ index anyway
+            unless ($indexed) {
+                my $line= lc $subfield->[1];
+                $line =~ s/[-.?,;!'()\[\]{}"<>&+*\/]/ /g;
+                foreach (split ' ',$line) {
+                    $result{'__RAW__'}->{$_}.="$biblionumber;$title," unless $subfieldcode eq '9';
+                }
+            }
+        }
+    }
+}
+$sth = $dbh->prepare("INSERT INTO nozebra (indexname,value,biblionumbers) VALUES (?,?,?)");
+foreach my $key (keys %result) {
+    foreach my $index (keys %{$result{$key}}) {
+        $sth->execute($key,$index,$result{$key}->{$index});
+    }
+}
-- 
2.39.5