From b53be9cdaf97b61e827f25c4b07b6352bc8df570 Mon Sep 17 00:00:00 2001 From: tipaul Date: Wed, 25 Apr 2007 16:26:42 +0000 Subject: [PATCH] Koha 3.0 nozebra 1st commit : the script misc/migration_tools/rebuild_nozebra.pl builds the nozebra table, and, if you set NoZebra to Yes, queries will be done through the nozebra table instead of zebra. TODO : - add nozebra table management on biblio editing - the index table content is hardcoded. I still have to add some specific systempref to let the library update it - manage pagination (next/previous) - manage facets WHAT works : - NZgetRecords : has exactly the same API & returns as zebra getQuery, except that some parameters are unused - search & sort work quite well - CQL parser is better than what I thought I could do : title="harry and sally" and publicationyear>2000 not itemtype=LIVR should work fine --- C4/Biblio.pm | 21 +- C4/Search.pm | 329 ++++++++++++++++++++++++ catalogue/search.pl | 14 +- misc/migration_tools/rebuild_nozebra.pl | 100 +++++++ 4 files changed, 454 insertions(+), 10 deletions(-) create mode 100755 misc/migration_tools/rebuild_nozebra.pl diff --git a/C4/Biblio.pm b/C4/Biblio.pm index 6d73d84095..912410a707 100644 --- a/C4/Biblio.pm +++ b/C4/Biblio.pm @@ -1539,11 +1539,11 @@ sub GetMarcBiblio { $sth->execute($biblionumber); my ($marcxml) = $sth->fetchrow; MARC::File::XML->default_record_format(C4::Context->preference('marcflavour')); -# $marcxml =~ s/\x1e//g; -# $marcxml =~ s/\x1f//g; -# $marcxml =~ s/\x1d//g; -# $marcxml =~ s/\x0f//g; -# $marcxml =~ s/\x0c//g; + $marcxml =~ s/\x1e//g; + $marcxml =~ s/\x1f//g; + $marcxml =~ s/\x1d//g; + $marcxml =~ s/\x0f//g; + $marcxml =~ s/\x0c//g; my $record = MARC::Record->new(); $record = MARC::Record::new_from_xml( $marcxml, "utf8",C4::Context->preference('marcflavour')) if $marcxml; return $record; @@ -3691,6 +3691,17 @@ Joshua Ferraro jmf@liblime.com # $Id$ # $Log$ +# Revision 1.200 2007/04/25 16:26:42 tipaul +# Koha 3.0 nozebra 1st commit : the script 
misc/migration_tools/rebuild_nozebra.pl builds the nozebra table, and, if you set NoZebra to Yes, queries will be done through the nozebra table instead of zebra. TODO : +# - add nozebra table management on biblio editing +# - the index table content is hardcoded. I still have to add some specific systempref to let the library update it +# - manage pagination (next/previous) +# - manage facets +# WHAT works : +# - NZgetRecords : has exactly the same API & returns as zebra getQuery, except that some parameters are unused +# - search & sort work quite well +# - CQL parser is better than what I thought I could do : title="harry and sally" and publicationyear>2000 not itemtype=LIVR should work fine +# # Revision 1.199 2007/04/24 09:07:53 tipaul # moving dotransfer to Biblio.pm::ModItemTransfer + some CheckReserves fixes # diff --git a/C4/Search.pm b/C4/Search.pm index c4eef801f7..72001b54ad 100755 --- a/C4/Search.pm +++ b/C4/Search.pm @@ -53,6 +53,7 @@ This module provides the searching facilities for the Koha into a zebra catalog. 
&searchResults &getRecords &buildQuery + &NZgetRecords ); # make all your functions, whether exported or not; @@ -489,6 +490,8 @@ sub getRecords { } } } + use Data::Dumper; + warn Dumper($results_hashref); return ( undef, $results_hashref, \@facets_loop ); } @@ -1042,6 +1045,332 @@ sub searchResults { return @newresults; } + +#---------------------------------------------------------------------- +# +# Non-Zebra GetRecords# +#---------------------------------------------------------------------- + +=item + NZgetRecords has the same API as zebra getRecords, even if some parameters are not managed +=cut + +sub NZgetRecords { + my ( + $koha_query, $federated_query, $sort_by_ref, + $servers_ref, $results_per_page, $offset, + $expanded_facet, $branches, $query_type, + $scan + ) = @_; + my $result = NZanalyse($koha_query); +# use Data::Dumper; +# warn "==========".@$sort_by_ref[0]; + return (undef,NZorder($result,@$sort_by_ref[0]),undef); +} + +=item + + NZanalyse : takes a CQL string as parameter, and returns a list of biblionumber;title,biblionumber;title,... + the list is built from the inverted index in the nozebra SQL table + note that title is here only for convenience : the sorting will be very fast when requested on title + if the sorting is requested on something else, we will have to reread all results, and that may be longer. 
+ +=cut + +sub NZanalyse { + my ($string) = @_; + # if we have a ", replace the content to discard temporarily any and/or/not inside + my $commacontent; + if ($string =~/"/) { + $string =~ s/"(.*?)"/__X__/; + $commacontent = $1; +# print "commacontent : $commacontent\n"; + } + # split the query string in 3 parts : X AND Y means : $left="X", $operand="AND" and $right="Y" + # then, call again NZanalyse with $left and $right + # (recursive until we find a leaf (=> something without and/or/not) + $string =~ /(.*)( and | or | not )(.*)/; + my $left = $1; + my $right = $3; + my $operand = $2; + # it's not a leaf, we have a and/or/not + if ($operand) { + # reintroduce comma content if needed + $right =~ s/__X__/"$commacontent"/ if $commacontent; + $left =~ s/__X__/"$commacontent"/ if $commacontent; +# print "noeud : $left / $operand / $right\n"; + my $leftresult = NZanalyse($left); + my $rightresult = NZanalyse($right); + # OK, we have the results for right and left part of the query + # depending of operand, intersect, union or exclude both lists + # to get a result list + if ($operand eq ' and ') { + my @leftresult = split /,/, $leftresult; +# my @rightresult = split /,/,$leftresult; + my $finalresult; + # parse the left results, and if the biblionumber exist in the right result, save it in finalresult + # the result is stored twice, to have the same weight for AND than OR. 
+ # example : TWO : 61,61,64,121 (two is twice in the biblio #61) / TOWER : 61,64,130 + # result : 61,61,61,61,64,64 for two AND tower : 61 has more weight than 64 + foreach (@leftresult) { + if ($rightresult =~ "$_,") { + $finalresult .= "$_,$_,"; + } + } + return $finalresult; + } elsif ($operand eq ' or ') { + # just merge the 2 strings + return $leftresult.$rightresult; + } elsif ($operand eq ' not ') { + my @leftresult = split /,/, $leftresult; +# my @rightresult = split /,/,$leftresult; + my $finalresult; + foreach (@leftresult) { + unless ($rightresult =~ "$_,") { + $finalresult .= "$_,"; + } + } + return $finalresult; + } else { + # this error is impossible, because of the regexp that isolate the operand, but just in case... + die "error : operand unknown : $operand for $string"; + } + # it's a leaf, do the real SQL query and return the result + } else { + $string =~ s/__X__/"$commacontent"/ if $commacontent; + $string =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g; +# print "feuille : $string\n"; + # parse the string in in operator/operand/value again + $string =~ /(.*)(=|>|>=|<|<=)(.*)/; + my $left = $1; + my $operator = $2; + my $right = $3; + my $results; + if ($operator) { + #do a specific search + my $dbh = C4::Context->dbh; + $operator='LIKE' if $operator eq '=' and $right=~ /%/; + my $sth = $dbh->prepare("SELECT biblionumbers FROM nozebra WHERE indexname=? 
AND value $operator ?"); +# print "$left / $operator / $right\n"; + # split each word, query the DB and build the biblionumbers result + foreach (split / /,$right) { + my $biblionumbers; + $sth->execute($left,$_); + while (my $line = $sth->fetchrow) { + $biblionumbers .= $line; + } + # do a AND with existing list if there is one, otherwise, use the biblionumbers list as 1st result list + if ($results) { + my @leftresult = split /,/, $biblionumbers; + my $temp; + foreach (@leftresult) { + if ($results =~ "$_,") { + $temp .= "$_,$_,"; + } + } + $results = $temp; + } else { + $results = $biblionumbers; + } + } + } else { + #do a complete search (all indexes) + my $dbh = C4::Context->dbh; + my $sth = $dbh->prepare("SELECT biblionumbers FROM nozebra WHERE value LIKE ?"); + # split each word, query the DB and build the biblionumbers result + foreach (split / /,$string) { + my $biblionumbers; + $sth->execute($_); + while (my $line = $sth->fetchrow) { + $biblionumbers .= $line; + } + # do a AND with existing list if there is one, otherwise, use the biblionumbers list as 1st result list + if ($results) { + my @leftresult = split /,/, $biblionumbers; + my $temp; + foreach (@leftresult) { + if ($results =~ "$_,") { + $temp .= "$_,$_,"; + } + } + $results = $temp; + } else { + $results = $biblionumbers; + } + } + } + return $results; + } +} + +sub NZorder { + my ($biblionumbers, $ordering) = @_; + # order title asc by default + $ordering = '1=36 dbh; + # + # order by POPULARITY + # + if ($ordering =~ /1=9523/) { + my %result; + my %popularity; + # popularity is not in MARC record, it's builded from a specific query + my $sth = $dbh->prepare("select sum(issues) from items where biblionumber=?"); + foreach (split /,/,$biblionumbers) { + my ($biblionumber,$title) = split /;/,$_; + $result{$biblionumber}=GetMarcBiblio($biblionumber); + $sth->execute($biblionumber); + my $popularity= $sth->fetchrow ||0; + # hint : the key is popularity.title because we can have + # many results 
with the same popularity. In this case, sub-ordering is done by title + # we also have biblionumber to avoid bug for 2 biblios with the same title & popularity + # (infrequent, I agree, but we won't forget anything that way ;-) + $popularity{sprintf("%10d",$popularity).$title.$biblionumber} = $biblionumber; + } + # sort the hash and return the same structure as GetRecords (Zebra querying) + my $result_hash; + my $numbers=0; + if ($ordering eq '1=9523 >i') { # sort popularity DESC + foreach my $key (sort {$b <=> $a} (keys %popularity)) { + $result_hash->{'RECORDS'}[$numbers++] = $result{$popularity{$key}}->as_usmarc(); + } + } else { # sort popularity ASC + foreach my $key (sort (keys %popularity)) { + $result_hash->{'RECORDS'}[$numbers++] = $result{$popularity{$key}}->as_usmarc(); + } + } + my $finalresult=(); + $result_hash->{'hits'} = $numbers; + $finalresult->{'biblioserver'} = $result_hash; + return $finalresult; + # + # ORDER BY author + # + } elsif ($ordering eq '1=1003 preference('marcflavour') eq 'UNIMARC') { + $author=$record->subfield('200','f'); + $author=$record->subfield('700','a') unless $author; + } else { + $author=$record->subfield('100','a'); + } + # hint : the result is sorted by author.biblionumber because we can have X biblios with the same author + # and we don't want to get only 1 result for each of them !!! 
+ $result{$author.$biblionumber}=$record; + } + # sort the hash and return the same structure as GetRecords (Zebra querying) + my $result_hash; + my $numbers=0; + if ($ordering eq '1=1003 {'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } else { # sort by title ASC + foreach my $key (sort { $a <=> $b } (keys %result)) { + $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } + my $finalresult=(); + $result_hash->{'hits'} = $numbers; + $finalresult->{'biblioserver'} = $result_hash; + return $finalresult; + # + # ORDER BY callnumber + # + } elsif ($ordering eq '1=20 preference('marcflavour') eq 'UNIMARC') { + $callnumber=$record->subfield('200','f'); + } else { + $callnumber=$record->subfield('100','a'); + } + # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title + # and we don't want to get only 1 result for each of them !!! + $result{$callnumber.$biblionumber}=$record; + } + # sort the hash and return the same structure as GetRecords (Zebra querying) + my $result_hash; + my $numbers=0; + if ($ordering eq '1=1003 {'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } else { # sort by title ASC + foreach my $key (sort { $a <=> $b } (keys %result)) { + $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } + my $finalresult=(); + $result_hash->{'hits'} = $numbers; + $finalresult->{'biblioserver'} = $result_hash; + return $finalresult; + } elsif ($ordering =~ /1=31/){ #pub year + my %result; + foreach (split /,/,$biblionumbers) { + my ($biblionumber,$title) = split /;/,$_; + my $record=GetMarcBiblio($biblionumber); + my ($publicationyear_tag,$publicationyear_subfield)=GetMarcFromKohaField($dbh,'biblioitems.publicationyear'); + my $publicationyear=$record->subfield($publicationyear_tag,$publicationyear_subfield); + # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title + # and we don't want to get only 1 result for 
each of them !!! + $result{$publicationyear.$biblionumber}=$record; + } + # sort the hash and return the same structure as GetRecords (Zebra querying) + my $result_hash; + my $numbers=0; + if ($ordering eq '1=31 {'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } else { # sort by title ASC + foreach my $key (sort { $a <=> $b } (keys %result)) { + $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } + my $finalresult=(); + $result_hash->{'hits'} = $numbers; + $finalresult->{'biblioserver'} = $result_hash; + return $finalresult; + # + # ORDER BY title + # + } else { + # the title is in the biblionumbers string, so we just need to build a hash, sort it and return + my %result; + foreach (split /,/,$biblionumbers) { + my ($biblionumber,$title) = split /;/,$_; + # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title + # and we don't want to get only 1 result for each of them !!! + $result{$title.$biblionumber}=GetMarcBiblio($biblionumber); + } + # sort the hash and return the same structure as GetRecords (Zebra querying) + my $result_hash; + my $numbers=0; + if ($ordering eq '1=36 {'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } else { # sort by title ASC + foreach my $key (sort { $a <=> $b } (keys %result)) { + $result_hash->{'RECORDS'}[$numbers++] = $result{$key}->as_usmarc(); + } + } + my $finalresult=(); + $result_hash->{'hits'} = $numbers; + $finalresult->{'biblioserver'} = $result_hash; + return $finalresult; + } +} END { } # module clean-up code here (global destructor) 1; diff --git a/catalogue/search.pl b/catalogue/search.pl index f91081cd94..3b95cc9e77 100755 --- a/catalogue/search.pl +++ b/catalogue/search.pl @@ -400,11 +400,15 @@ my $facets; # this object stores the faceted results that display on the left-ha my @results_array; my $results_hashref; -eval { - - ($error, $results_hashref, $facets) = 
getRecords($koha_query,$federated_query,\@sort_by,\@servers,$results_per_page,$offset,$expanded_facet,$branches,$query_type,$scan); - -}; +if (C4::Context->preference('NoZebra')) { + eval { + ($error, $results_hashref, $facets) = NZgetRecords($koha_query,$federated_query,\@sort_by,\@servers,$results_per_page,$offset,$expanded_facet,$branches,$query_type,$scan); + }; +} else { + eval { + ($error, $results_hashref, $facets) = getRecords($koha_query,$federated_query,\@sort_by,\@servers,$results_per_page,$offset,$expanded_facet,$branches,$query_type,$scan); + }; +} if ($@ || $error) { $template->param(query_error => $error.$@); diff --git a/misc/migration_tools/rebuild_nozebra.pl b/misc/migration_tools/rebuild_nozebra.pl new file mode 100755 index 0000000000..6b98ec5b31 --- /dev/null +++ b/misc/migration_tools/rebuild_nozebra.pl @@ -0,0 +1,100 @@ +#!/usr/bin/perl + +use C4::Context; +use Getopt::Long; +use C4::Biblio; +use C4::AuthoritiesMarc; + +use strict; +# +# script that empties the nozebra table, then refills it from the MARC records of the biblios +# +# + +$|=1; # flushes output + +# limit appended to the biblionumber query: only the first 1000 biblios are indexed (NOTE(review): looks like a testing leftover - confirm) +my $limit = "LIMIT 1000"; +my $directory; +my $skip_export; +my $keep_export; +my $reset; +my $biblios; +my $authorities; +GetOptions( + 'd:s' => \$directory, + 'reset' => \$reset, + 's' => \$skip_export, + 'k' => \$keep_export, + 'b' => \$biblios, + 'a' => \$authorities, + ); + +$directory = "export" unless $directory; +my $dbh=C4::Context->dbh; +$dbh->do("truncate nozebra"); +my $sth; +$sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit"); +$sth->execute(); +my $i=0; +my %result; + +my %index = ( + 'title' => '200a,200c,200d', + 'author' =>'200f,700*,701*,702*' + ); + +$|=1; +while (my ($biblionumber) = $sth->fetchrow) { + $i++; + print "\r$i"; + my $record = GetMarcBiblio($biblionumber); + + # get title of the record (to store the 10 first letters with the index) + my $title; + if (C4::Context->preference('marcflavour') eq 'UNIMARC') { + $title = 
lc($record->subfield('200','a')); + } else { + $title = lc($record->subfield('245','a')); + } + # remove blanks, commas and semicolons (that could cause problems when decoding the string for CQL retrieval) + $title =~ s/ |,|;//g; + # limit to 10 char, should be enough, and limit the DB size + $title = substr($title,0,10); + # parse each field + foreach my $field ($record->fields()) { + # skip fields with a tag below 10, then parse each subfield + next if $field->tag <10; + foreach my $subfield ($field->subfields()) { + my $tag = $field->tag(); + my $subfieldcode = $subfield->[0]; + my $indexed=0; + # check each index : the subfield belongs to it if the index lists "tag*" or "tag" followed by the subfield code + # otherwise, store it in __RAW__ index + foreach my $key (keys %index) { + if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfieldcode/) { + $indexed=1; + my $line= lc $subfield->[1]; + $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g; + foreach (split / /,$line) { + $result{$key}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9'; + } + } + } + # the subfield is not indexed, store it in __RAW__ index anyway + unless ($indexed) { + my $line= lc $subfield->[1]; + $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g; + foreach (split / /,$line) { + $result{'__RAW__'}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9'; + } + } + } + } +} +$sth = $dbh->prepare("INSERT INTO nozebra (indexname,value,biblionumbers) VALUES (?,?,?)"); +foreach my $key (keys %result) { + foreach my $index (keys %{$result{$key}}) { + $sth->execute($key,$index,$result{$key}->{$index}); + } +} -- 2.39.5