From 4213b6ec988ab2f34a62efc5be54d5ec87ec306a Mon Sep 17 00:00:00 2001 From: tipaul Date: Wed, 2 May 2007 11:57:11 +0000 Subject: [PATCH] improving NoZebra search : - changing nozebra table to have biblionumber,title-ranking; (; is the entry separator). Now, if a value appears several times in an index, it is stored only once, with a higher ranking (the ranking is the number of times the word appeared for this index) - improving search to have ranking value (default order). The ranking is the sum of the rankings of all terms. The list is ordered by ranking+title, from highest to lowest --- C4/Search.pm | 80 +++++++++++++++++++------ misc/migration_tools/rebuild_nozebra.pl | 52 +++++++++++++--- 2 files changed, 106 insertions(+), 26 deletions(-) diff --git a/C4/Search.pm b/C4/Search.pm index 3f29418cba..66d2257f38 100755 --- a/C4/Search.pm +++ b/C4/Search.pm @@ -1225,11 +1225,11 @@ sub NZanalyse { } # do a AND with existing list if there is one, otherwise, use the biblionumbers list as 1st result list if ($results) { - my @leftresult = split /,/, $biblionumbers; + my @leftresult = split /;/, $biblionumbers; my $temp; foreach (@leftresult) { - if ($results =~ "$_,") { - $temp .= "$_,$_,"; + if ($results =~ "$_;") { + $temp .= "$_;$_;"; } } $results = $temp; @@ -1253,8 +1253,8 @@ sub NZanalyse { my @leftresult = split /,/, $biblionumbers; my $temp; foreach (@leftresult) { - if ($results =~ "$_,") { - $temp .= "$_,$_,"; + if ($results =~ "$_;") { + $temp .= "$_;$_;"; } } $results = $temp; @@ -1270,7 +1270,7 @@ sub NZanalyse { sub NZorder { my ($biblionumbers, $ordering,$results_per_page,$offset) = @_; # order title asc by default - $ordering = '1=36 dbh; @@ -1282,8 +1282,8 @@ sub NZorder { my %popularity; # popularity is not in MARC record, it's builded from a specific query my $sth = $dbh->prepare("select sum(issues) from items where biblionumber=?"); - foreach (split /,/,$biblionumbers) { - my ($biblionumber,$title) = split /;/,$_; + foreach (split /;/,$biblionumbers) { + my 
($biblionumber,$title) = split /,/,$_; $result{$biblionumber}=GetMarcBiblio($biblionumber); $sth->execute($biblionumber); my $popularity= $sth->fetchrow ||0; @@ -1314,8 +1314,8 @@ sub NZorder { # } elsif ($ordering eq '1=1003 preference('marcflavour') eq 'UNIMARC') { @@ -1349,8 +1349,8 @@ sub NZorder { # } elsif ($ordering eq '1=20 subfield($publicationyear_tag,$publicationyear_subfield); @@ -1410,13 +1410,11 @@ sub NZorder { # # ORDER BY title # - } else { + } elsif ($ordering =~ /1=36/) { # the title is in the biblionumbers string, so we just need to build a hash, sort it and return my %result; -# splice(@X,$results_per_page*(1+$offset)); -# splice(@X,0,$results_per_page*$offset); - foreach (split /,/,$biblionumbers) { - my ($biblionumber,$title) = split /;/,$_; + foreach (split /;/,$biblionumbers) { + my ($biblionumber,$title) = split /,/,$_; # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title # and we don't want to get only 1 result for each of them !!! # hint & speed improvement : we can order without reading the record @@ -1444,8 +1442,52 @@ sub NZorder { $result_hash->{'hits'} = $numbers; $finalresult->{'biblioserver'} = $result_hash; return $finalresult; + } else { + # + # order by ranking + # + # we need 2 hashes to order by ranking : the 1st one to count the ranking, the 2nd to order by ranking + my %result; + my %count_ranking; + foreach (split /;/,$biblionumbers) { + my ($biblionumber,$title) = split /,/,$_; + $title =~ /(.*)-(\d)/; + # get weight + my $ranking =$2; + # hint : the result is sorted by title.biblionumber because we can have X biblios with the same title + # and we don't want to get only 1 result for each of them !!! + # note that we + the ranking because ranking is calculated on weight of EACH term requested. 
+ # if we ask for "two towers", and "two" has weight 2 in biblio N, and "towers" has weight 4 in biblio N + # biblio N has ranking = 6 + $count_ranking{$biblionumber}=0 unless $count_ranking{$biblionumber}; + $count_ranking{$biblionumber} =+ $ranking; + } + # build the result by "inverting" the count_ranking hash + # hing : as usual, we don't order by ranking only, to avoid having only 1 result for each rank. We build an hash on concat(ranking,biblionumber) instead +# warn "counting"; + foreach (keys %count_ranking) { + warn "$_ =".sprintf("%10d",$count_ranking{$_}).'-'.$_; + $result{sprintf("%10d",$count_ranking{$_}).'-'.$_} = $_; + } + # sort the hash and return the same structure as GetRecords (Zebra querying) + my $result_hash; + my $numbers=0; + foreach my $key (sort {$b <=> $a} (keys %result)) { + warn "KEY : $key = ".$result{$key}; + $result_hash->{'RECORDS'}[$numbers++] = $result{$key}; + } + # for the requested page, replace biblionumber by the complete record + # speed improvement : avoid reading too much things + for (my $counter=$offset;$counter<=$offset+$results_per_page;$counter++) { + $result_hash->{'RECORDS'}[$counter] = GetMarcBiblio($result_hash->{'RECORDS'}[$counter])->as_usmarc; + } + my $finalresult=(); + $result_hash->{'hits'} = $numbers; + $finalresult->{'biblioserver'} = $result_hash; + return $finalresult; } } + END { } # module clean-up code here (global destructor) 1; diff --git a/misc/migration_tools/rebuild_nozebra.pl b/misc/migration_tools/rebuild_nozebra.pl index 6b98ec5b31..7e88ef4f0f 100755 --- a/misc/migration_tools/rebuild_nozebra.pl +++ b/misc/migration_tools/rebuild_nozebra.pl @@ -14,7 +14,7 @@ use strict; $|=1; # flushes output # limit for database dumping -my $limit = "LIMIT 1000"; +my $limit;# = "LIMIT 1000"; my $directory; my $skip_export; my $keep_export; @@ -32,6 +32,14 @@ GetOptions( $directory = "export" unless $directory; my $dbh=C4::Context->dbh; +$dbh->do("update systempreferences set value=1 where 
variable='NoZebra'"); +$dbh->do("CREATE TABLE `nozebra` ( + `indexname` varchar(40) character set latin1 NOT NULL, + `value` varchar(250) character set latin1 NOT NULL, + `biblionumbers` longtext character set latin1 NOT NULL, + KEY `indexname` (`indexname`), + KEY `value` (`value`)) + ENGINE=InnoDB DEFAULT CHARSET=utf8"); $dbh->do("truncate nozebra"); my $sth; $sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit"); @@ -40,8 +48,20 @@ my $i=0; my %result; my %index = ( - 'title' => '200a,200c,200d', - 'author' =>'200f,700*,701*,702*' + 'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a', + 'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,700a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d', + 'isbn' => '010a', + 'issn' => '011a', + 'biblionumber' =>'0909', + 'itemtype' => '200b', + 'language' => '010a', + 'publisher' => '210x', + 'date' => '210d', + 'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a', + 'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109', + 'subject' => '600*,601*,606*,610*', + 'dewey' => '676a', + 'host-item' => '995a,995c', ); $|=1; @@ -57,8 +77,8 @@ while (my ($biblionumber) = $sth->fetchrow) { } else { $title = lc($record->subfield('245','a')); } - # remove blancks and comma (that could cause problem when decoding the string for CQL retrieval - $title =~ s/ |,|;//g; + # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values + $title =~ s/ |,|;|\[|\]|\(|\)|\*//g; # limit to 10 char, should be enough, and limit the DB size $title = substr($title,0,10); #parse each field @@ -77,7 +97,14 @@ while (my ($biblionumber) = $sth->fetchrow) { 
my $line= lc $subfield->[1]; $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g; foreach (split / /,$line) { - $result{$key}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9'; + # see if the entry is already here + if ($result{$key}->{$_} =~ /$biblionumber,$title\-(\d);/) { + my $weight=$1+1; + $result{$key}->{$_} =~ s/$biblionumber,$title\-(\d);//; + $result{$key}->{$_} .= "$biblionumber,$title-$weight;"; + } else { + $result{$key}->{$_}.="$biblionumber,$title-1;"; + } } } } @@ -86,7 +113,15 @@ while (my ($biblionumber) = $sth->fetchrow) { my $line= lc $subfield->[1]; $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g; foreach (split / /,$line) { - $result{'__RAW__'}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9'; +# warn $record->as_formatted."$_ =>".$title; + if ($result{__RAW__}->{$_} =~ /$biblionumber,$title\-(\d);/) { + my $weight=$1+1; +# $weight++; + $result{__RAW__}->{$_} =~ s/$biblionumber,$title\-(\d);//; + $result{__RAW__}->{$_} .= "$biblionumber,$title-$weight;"; + } else { + $result{__RAW__}->{$_}.="$biblionumber,$title-1;"; + } } } } @@ -96,5 +131,8 @@ my $sth = $dbh->prepare("INSERT INTO nozebra (indexname,value,biblionumbers) VAL foreach my $key (keys %result) { foreach my $index (keys %{$result{$key}}) { $sth->execute($key,$index,$result{$key}->{$index}); + if (length($result{$key}->{$index}) >40000) { + print length($result{$key}->{$index})." for $key / $index"; + } } } -- 2.39.5