Improving NoZebra search:
- changing the nozebra table to store entries as biblionumber,title-ranking; (; is the entry separator). Now, if a value appears several times in an index, it is stored only once, with a higher ranking (the ranking is the number of times the word appeared for this index). - improving search to order by ranking value (default order). The ranking is the sum of the rankings of all terms. The list is ordered by ranking+title, from highest to lowest.
This commit is contained in:
parent
a0b19e3e94
commit
4213b6ec98
2 changed files with 106 additions and 26 deletions
80
C4/Search.pm
80
C4/Search.pm
|
@ -1225,11 +1225,11 @@ sub NZanalyse {
|
|||
}
|
||||
# do a AND with existing list if there is one, otherwise, use the biblionumbers list as 1st result list
|
||||
if ($results) {
|
||||
my @leftresult = split /,/, $biblionumbers;
|
||||
my @leftresult = split /;/, $biblionumbers;
|
||||
my $temp;
|
||||
foreach (@leftresult) {
|
||||
if ($results =~ "$_,") {
|
||||
$temp .= "$_,$_,";
|
||||
if ($results =~ "$_;") {
|
||||
$temp .= "$_;$_;";
|
||||
}
|
||||
}
|
||||
$results = $temp;
|
||||
|
@ -1253,8 +1253,8 @@ sub NZanalyse {
|
|||
my @leftresult = split /,/, $biblionumbers;
|
||||
my $temp;
|
||||
foreach (@leftresult) {
|
||||
if ($results =~ "$_,") {
|
||||
$temp .= "$_,$_,";
|
||||
if ($results =~ "$_;") {
|
||||
$temp .= "$_;$_;";
|
||||
}
|
||||
}
|
||||
$results = $temp;
|
||||
|
@ -1270,7 +1270,7 @@ sub NZanalyse {
|
|||
sub NZorder {
|
||||
my ($biblionumbers, $ordering,$results_per_page,$offset) = @_;
|
||||
# order title asc by default
|
||||
$ordering = '1=36 <i' unless $ordering;
|
||||
# $ordering = '1=36 <i' unless $ordering;
|
||||
$results_per_page=20 unless $results_per_page;
|
||||
$offset = 0 unless $offset;
|
||||
my $dbh = C4::Context->dbh;
|
||||
|
@ -1282,8 +1282,8 @@ sub NZorder {
|
|||
my %popularity;
|
||||
# popularity is not in the MARC record; it is built from a specific query
|
||||
my $sth = $dbh->prepare("select sum(issues) from items where biblionumber=?");
|
||||
foreach (split /,/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /;/,$_;
|
||||
foreach (split /;/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /,/,$_;
|
||||
$result{$biblionumber}=GetMarcBiblio($biblionumber);
|
||||
$sth->execute($biblionumber);
|
||||
my $popularity= $sth->fetchrow ||0;
|
||||
|
@ -1314,8 +1314,8 @@ sub NZorder {
|
|||
#
|
||||
} elsif ($ordering eq '1=1003 <i'){
|
||||
my %result;
|
||||
foreach (split /,/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /;/,$_;
|
||||
foreach (split /;/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /,/,$_;
|
||||
my $record=GetMarcBiblio($biblionumber);
|
||||
my $author;
|
||||
if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
|
||||
|
@ -1349,8 +1349,8 @@ sub NZorder {
|
|||
#
|
||||
} elsif ($ordering eq '1=20 <i'){
|
||||
my %result;
|
||||
foreach (split /,/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /;/,$_;
|
||||
foreach (split /;/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /,/,$_;
|
||||
my $record=GetMarcBiblio($biblionumber);
|
||||
my $callnumber;
|
||||
my ($callnumber_tag,$callnumber_subfield)=GetMarcFromKohaField($dbh,'items.itemcallnumber');
|
||||
|
@ -1382,8 +1382,8 @@ sub NZorder {
|
|||
return $finalresult;
|
||||
} elsif ($ordering =~ /1=31/){ #pub year
|
||||
my %result;
|
||||
foreach (split /,/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /;/,$_;
|
||||
foreach (split /;/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /,/,$_;
|
||||
my $record=GetMarcBiblio($biblionumber);
|
||||
my ($publicationyear_tag,$publicationyear_subfield)=GetMarcFromKohaField($dbh,'biblioitems.publicationyear');
|
||||
my $publicationyear=$record->subfield($publicationyear_tag,$publicationyear_subfield);
|
||||
|
@ -1410,13 +1410,11 @@ sub NZorder {
|
|||
#
|
||||
# ORDER BY title
|
||||
#
|
||||
} else {
|
||||
} elsif ($ordering =~ /1=36/) {
|
||||
# the title is in the biblionumbers string, so we just need to build a hash, sort it and return
|
||||
my %result;
|
||||
# splice(@X,$results_per_page*(1+$offset));
|
||||
# splice(@X,0,$results_per_page*$offset);
|
||||
foreach (split /,/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /;/,$_;
|
||||
foreach (split /;/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /,/,$_;
|
||||
# hint : the result is sorted by title.biblionumber because we can have X biblios with the same title
|
||||
# and we don't want to get only 1 result for each of them !!!
|
||||
# hint & speed improvement : we can order without reading the record
|
||||
|
@ -1444,8 +1442,52 @@ sub NZorder {
|
|||
$result_hash->{'hits'} = $numbers;
|
||||
$finalresult->{'biblioserver'} = $result_hash;
|
||||
return $finalresult;
|
||||
} else {
|
||||
#
|
||||
# order by ranking
|
||||
#
|
||||
# we need 2 hashes to order by ranking : the 1st one to count the ranking, the 2nd to order by ranking
|
||||
my %result;
|
||||
my %count_ranking;
|
||||
foreach (split /;/,$biblionumbers) {
|
||||
my ($biblionumber,$title) = split /,/,$_;
|
||||
$title =~ /(.*)-(\d)/;
|
||||
# get weight
|
||||
my $ranking =$2;
|
||||
# hint : the result is sorted by title.biblionumber because we can have X biblios with the same title
|
||||
# and we don't want to get only 1 result for each of them !!!
|
||||
# note that we + the ranking because ranking is calculated on weight of EACH term requested.
|
||||
# if we ask for "two towers", and "two" has weight 2 in biblio N, and "towers" has weight 4 in biblio N
|
||||
# biblio N has ranking = 6
|
||||
$count_ranking{$biblionumber}=0 unless $count_ranking{$biblionumber};
|
||||
$count_ranking{$biblionumber} =+ $ranking;
|
||||
}
|
||||
# build the result by "inverting" the count_ranking hash
|
||||
# hint : as usual, we don't order by ranking only, to avoid having only 1 result for each rank. We build a hash on concat(ranking,biblionumber) instead
|
||||
# warn "counting";
|
||||
foreach (keys %count_ranking) {
|
||||
warn "$_ =".sprintf("%10d",$count_ranking{$_}).'-'.$_;
|
||||
$result{sprintf("%10d",$count_ranking{$_}).'-'.$_} = $_;
|
||||
}
|
||||
# sort the hash and return the same structure as GetRecords (Zebra querying)
|
||||
my $result_hash;
|
||||
my $numbers=0;
|
||||
foreach my $key (sort {$b <=> $a} (keys %result)) {
|
||||
warn "KEY : $key = ".$result{$key};
|
||||
$result_hash->{'RECORDS'}[$numbers++] = $result{$key};
|
||||
}
|
||||
# for the requested page, replace biblionumber by the complete record
|
||||
# speed improvement : avoid reading too much things
|
||||
for (my $counter=$offset;$counter<=$offset+$results_per_page;$counter++) {
|
||||
$result_hash->{'RECORDS'}[$counter] = GetMarcBiblio($result_hash->{'RECORDS'}[$counter])->as_usmarc;
|
||||
}
|
||||
my $finalresult=();
|
||||
$result_hash->{'hits'} = $numbers;
|
||||
$finalresult->{'biblioserver'} = $result_hash;
|
||||
return $finalresult;
|
||||
}
|
||||
}
|
||||
|
||||
END { } # module clean-up code here (global destructor)
|
||||
|
||||
1;
|
||||
|
|
|
@ -14,7 +14,7 @@ use strict;
|
|||
$|=1; # flushes output
|
||||
|
||||
# limit for database dumping
|
||||
my $limit = "LIMIT 1000";
|
||||
my $limit;# = "LIMIT 1000";
|
||||
my $directory;
|
||||
my $skip_export;
|
||||
my $keep_export;
|
||||
|
@ -32,6 +32,14 @@ GetOptions(
|
|||
|
||||
$directory = "export" unless $directory;
|
||||
my $dbh=C4::Context->dbh;
|
||||
$dbh->do("update systempreferences set value=1 where variable='NoZebra'");
|
||||
$dbh->do("CREATE TABLE `nozebra` (
|
||||
`indexname` varchar(40) character set latin1 NOT NULL,
|
||||
`value` varchar(250) character set latin1 NOT NULL,
|
||||
`biblionumbers` longtext character set latin1 NOT NULL,
|
||||
KEY `indexname` (`indexname`),
|
||||
KEY `value` (`value`))
|
||||
ENGINE=InnoDB DEFAULT CHARSET=utf8");
|
||||
$dbh->do("truncate nozebra");
|
||||
my $sth;
|
||||
$sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
|
||||
|
@ -40,8 +48,20 @@ my $i=0;
|
|||
my %result;
|
||||
|
||||
my %index = (
|
||||
'title' => '200a,200c,200d',
|
||||
'author' =>'200f,700*,701*,702*'
|
||||
'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
|
||||
'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,700a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
|
||||
'isbn' => '010a',
|
||||
'issn' => '011a',
|
||||
'biblionumber' =>'0909',
|
||||
'itemtype' => '200b',
|
||||
'language' => '010a',
|
||||
'publisher' => '210x',
|
||||
'date' => '210d',
|
||||
'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
|
||||
'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109',
|
||||
'subject' => '600*,601*,606*,610*',
|
||||
'dewey' => '676a',
|
||||
'host-item' => '995a,995c',
|
||||
);
|
||||
|
||||
$|=1;
|
||||
|
@ -57,8 +77,8 @@ while (my ($biblionumber) = $sth->fetchrow) {
|
|||
} else {
|
||||
$title = lc($record->subfield('245','a'));
|
||||
}
|
||||
# remove blancks and comma (that could cause problem when decoding the string for CQL retrieval
|
||||
$title =~ s/ |,|;//g;
|
||||
# remove blanks, commas and semicolons (that could cause problems when decoding the string for CQL retrieval) and regexp-specific characters
|
||||
$title =~ s/ |,|;|\[|\]|\(|\)|\*//g;
|
||||
# limit to 10 char, should be enough, and limit the DB size
|
||||
$title = substr($title,0,10);
|
||||
#parse each field
|
||||
|
@ -77,7 +97,14 @@ while (my ($biblionumber) = $sth->fetchrow) {
|
|||
my $line= lc $subfield->[1];
|
||||
$line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
|
||||
foreach (split / /,$line) {
|
||||
$result{$key}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9';
|
||||
# see if the entry is already here
|
||||
if ($result{$key}->{$_} =~ /$biblionumber,$title\-(\d);/) {
|
||||
my $weight=$1+1;
|
||||
$result{$key}->{$_} =~ s/$biblionumber,$title\-(\d);//;
|
||||
$result{$key}->{$_} .= "$biblionumber,$title-$weight;";
|
||||
} else {
|
||||
$result{$key}->{$_}.="$biblionumber,$title-1;";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -86,7 +113,15 @@ while (my ($biblionumber) = $sth->fetchrow) {
|
|||
my $line= lc $subfield->[1];
|
||||
$line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
|
||||
foreach (split / /,$line) {
|
||||
$result{'__RAW__'}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9';
|
||||
# warn $record->as_formatted."$_ =>".$title;
|
||||
if ($result{__RAW__}->{$_} =~ /$biblionumber,$title\-(\d);/) {
|
||||
my $weight=$1+1;
|
||||
# $weight++;
|
||||
$result{__RAW__}->{$_} =~ s/$biblionumber,$title\-(\d);//;
|
||||
$result{__RAW__}->{$_} .= "$biblionumber,$title-$weight;";
|
||||
} else {
|
||||
$result{__RAW__}->{$_}.="$biblionumber,$title-1;";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -96,5 +131,8 @@ my $sth = $dbh->prepare("INSERT INTO nozebra (indexname,value,biblionumbers) VAL
|
|||
foreach my $key (keys %result) {
|
||||
foreach my $index (keys %{$result{$key}}) {
|
||||
$sth->execute($key,$index,$result{$key}->{$index});
|
||||
if (length($result{$key}->{$index}) >40000) {
|
||||
print length($result{$key}->{$index})." for $key / $index";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue