proof of concept for inverted index tables for search
how it works : * create the table marc_Tword with the following structure : CREATE TABLE `marc_Tword` ( `word` varchar(80) NOT NULL default '', `usedin` text NOT NULL, `tagsubfield` varchar(4) NOT NULL default '', PRIMARY KEY (`word`,`tagsubfield`) ) TYPE=MyISAM; * open a console & type export PERL5LIB & export KOHA_CONF as usual. * fill this table with misc/build_marc_Tword.pl. Warning, this script uses a very memory-consuming but very fast method to fill the table : it does everything in memory, then writes everything. Another method is provided (& commented out), but it's about 100 times slower (really !) * open opac-search.pl and replace use C4::SearchMarc; by use C4::SearchMarcTest; as the API hasn't changed, it will work immediately. * go to opac-search (advanced search) & search whatever you want. Should work fine. LIMITS : * build_marc_Tword has problems with extended chars (accented ones mainly). So don't be afraid if you get sql errors. They are not a problem for a POC * search results are always ordered by title, whatever you choose. * search only supports WORDA and WORDB, not yet WORDA or WORDB, nor WORDA except WORDB.
This commit is contained in:
parent
a887b00436
commit
41c209a4bd
2 changed files with 674 additions and 0 deletions
549
C4/SearchMarcTest.pm
Normal file
549
C4/SearchMarcTest.pm
Normal file
|
@ -0,0 +1,549 @@
|
|||
package C4::SearchMarcTest;
|
||||
|
||||
# Copyright 2000-2002 Katipo Communications
|
||||
#
|
||||
# This file is part of Koha.
|
||||
#
|
||||
# Koha is free software; you can redistribute it and/or modify it under the
|
||||
# terms of the GNU General Public License as published by the Free Software
|
||||
# Foundation; either version 2 of the License, or (at your option) any later
|
||||
# version.
|
||||
#
|
||||
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
||||
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along with
|
||||
# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
|
||||
# Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
use strict;
|
||||
require Exporter;
|
||||
use DBI;
|
||||
use C4::Context;
|
||||
use C4::Biblio;
|
||||
use C4::Date;
|
||||
use Date::Manip;
|
||||
|
||||
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
||||
|
||||
# set the version for version checking
|
||||
$VERSION = 0.02;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
C4::SearchMarcTest - Functions for searching the Koha MARC catalog
|
||||
|
||||
=head1 FUNCTIONS
|
||||
|
||||
This module provides the searching facilities for the Koha MARC catalog
|
||||
|
||||
A COPY of official SearchMarc, with some tests for inverted index table
|
||||
works only with 1 MARC tag/subfield
|
||||
|
||||
=cut
|
||||
|
||||
# Inherit import() from Exporter so that "use C4::SearchMarcTest;" copies the
# names listed in @EXPORT into the calling script.
@ISA = qw(Exporter);
|
||||
# Subroutines exported by default. create_request is defined in this module
# but is not part of the exported interface.
@EXPORT = qw(&catalogsearch &findseealso &findsuggestion &getMARCnotes &getMARCsubjects);
|
||||
|
||||
=head1 findsuggestion($dbh,$values);
|
||||
|
||||
=head2 $dbh is a link to the DB handler.
|
||||
|
||||
use C4::Context;
|
||||
my $dbh =C4::Context->dbh;
|
||||
|
||||
=head2 $values is a word
|
||||
|
||||
Searches words with the same soundex, ordered by frequency of use.
|
||||
Useful to suggest other searches to the users.
|
||||
|
||||
=cut
|
||||
|
||||
# findsuggestion($dbh, $values)
#
# Looks up, for every search term of at least 5 characters, the words of
# marc_word that share its soundex but are spelled differently, most
# frequently used first.
#
#   $dbh    - DBI database handle
#   $values - reference to an array of search terms
#
# Returns a reference to an array of strings "term|suggested word|count",
# with at most 10 suggestions per term.
sub findsuggestion {
    my ($dbh, $values) = @_;

    my $max_suggestions = 10;    # suggestions kept for each term
    my $sth = $dbh->prepare("SELECT count( * ) AS total, word FROM marc_word WHERE sndx_word = soundex( ? ) AND word <> ? GROUP BY word ORDER BY total DESC");

    my @results;
    foreach my $term (@$values) {
        next unless defined $term && length($term) >= 5;

        $sth->execute($term, $term);
        my $found = 0;
        # Test the limit BEFORE fetching, so that no row is read just to be
        # thrown away.
        while ($found < $max_suggestions) {
            my ($count, $word) = $sth->fetchrow or last;
            push @results, "$term|$word|$count";
            $found++;
        }
        # Rows may remain unread when the limit was hit: release them so the
        # handle is not left active.
        $sth->finish;
    }
    return \@results;
}
|
||||
|
||||
=head1 findseealso($dbh,$fields);
|
||||
|
||||
=head2 $dbh is a link to the DB handler.
|
||||
|
||||
use C4::Context;
|
||||
my $dbh =C4::Context->dbh;
|
||||
|
||||
=head2 $fields is a reference to the fields array
|
||||
|
||||
This function modify the @$fields array and add related fields to search on.
|
||||
|
||||
=cut
|
||||
|
||||
# findseealso($dbh, $fields)
#
# Extends, in place, every entry of @$fields with the "see also" list that
# the MARC framework defines for it. The tag is read from characters 1-3 of
# the entry and the subfield code from character 4 (character 0 is skipped).
#
#   $dbh    - DBI database handle, handed to MARCgettagslib
#   $fields - reference to the array of field strings to extend
sub findseealso {
    my ($dbh, $fields) = @_;
    my $tagslib = MARCgettagslib ($dbh,1);
    # foreach aliases $field to the array element, so appending to it
    # changes the caller's array.
    foreach my $field (@$fields) {
        my $tag      = substr($field, 1, 3);
        my $subfield = substr($field, 4, 1);
        my $related  = $tagslib->{$tag}->{$subfield}->{seealso};
        $field .= ',' . $related if $related;
    }
}
|
||||
|
||||
=head1 my ($results, $count) = catalogsearch($dbh, $tags, $and_or, $excluding, $operator, $value, $offset,$length,$orderby,$desc_or_asc);
|
||||
|
||||
=head2 $dbh is a link to the DB handler.
|
||||
|
||||
use C4::Context;
|
||||
my $dbh =C4::Context->dbh;
|
||||
|
||||
$tags,$and_or, $excluding, $operator, $value are references to array
|
||||
|
||||
=head2 $tags
|
||||
|
||||
contains the list of tags+subfields (for example : $@tags[0] = '200a')
|
||||
A field can be a list of fields : '200f','700a','700b','701a','701b'
|
||||
|
||||
Example
|
||||
|
||||
=head2 $and_or
|
||||
|
||||
contains a list of strings containing and or or. The 1st value is useless.
|
||||
|
||||
=head2 $excluding
|
||||
|
||||
contains 0 or 1. If 1, then the request is negated.
|
||||
|
||||
=head2 $operator
|
||||
|
||||
contains contains,=,start,>,>=,<,<= the = and start work on the complete subfield. The contains operator works on every word in the subfield.
|
||||
|
||||
examples :
|
||||
contains home, search home anywhere.
|
||||
= home, search a string being home.
|
||||
|
||||
=head2 $value
|
||||
|
||||
contains the value to search
|
||||
If it contains a * or a %, then the search is partial.
|
||||
|
||||
=head2 $offset and $length
|
||||
|
||||
returns $length results, beginning at $offset
|
||||
|
||||
=head2 $orderby
|
||||
|
||||
define the field used to order the request. Any field in the biblio/biblioitem tables can be used. DESC is possible too
|
||||
|
||||
(for example title, title DESC,...)
|
||||
|
||||
=head2 RETURNS
|
||||
|
||||
returns a reference to an array of hashes, followed by the total number of matching biblios. Each hash contains all biblio & biblioitems fields and, under the CN key, a reference to an array of item hashes. There is one item hash per item, holding its holding branch, call number, location, due date and not-for-loan status.
|
||||
|
||||
=cut
|
||||
|
||||
=head2 my $marcnotesarray = &getMARCnotes($dbh,$bibid,$marcflavour);
|
||||
|
||||
Returns a reference to an array containing all the notes stored in the MARC database for the given bibid.
|
||||
$marcflavour ("MARC21" or "UNIMARC") determines which tags are used for retrieving subjects.
|
||||
|
||||
=head2 my $marcsubjctsarray = &getMARCsubjects($dbh,$bibid,$marcflavour);
|
||||
|
||||
Returns a reference to an array containing all the subjects stored in the MARC database for the given bibid.
|
||||
$marcflavour ("MARC21" or "UNIMARC") determines which tags are used for retrieving subjects.
|
||||
|
||||
=cut
|
||||
|
||||
# _tagsubfield_list($tags)
#
# Splits a tag list as received by catalogsearch (a string such as
# "'200a','700a'") into the bare tag+subfield codes, so that they can be
# bound as placeholders instead of being pasted into the SQL text.
# Returns an empty list when $tags is undefined or holds no code.
sub _tagsubfield_list {
    my ($tags) = @_;
    return () unless defined $tags;

    my @codes;
    foreach my $code (split /,/, $tags) {
        $code =~ s/^\s+//;
        $code =~ s/\s+$//;
        $code =~ s/^(['"])(.*)\1$/$2/s;    # drop the surrounding quotes
        push @codes, $code if length $code;
    }
    return @codes;
}

# catalogsearch($dbh, $tags, $and_or, $excluding, $operator, $value,
#               $offset, $length, $orderby, $desc_or_asc)
#
# Proof-of-concept search on the marc_Tword inverted index.
#
#   $dbh       - DBI database handle
#   $tags      - array ref; $tags->[$i] is the quoted list of tag+subfield
#                codes criterion $i is restricted to (no restriction when
#                empty)
#   $operator  - array ref; only "contains" is looked at, to decide whether
#                punctuation is stripped from the value
#   $value     - array ref of search strings. NOTE: the strings are
#                normalised in place ("*" becomes "%", a leading "%" is
#                removed, punctuation becomes spaces), so the caller's array
#                is modified.
#   $offset    - index of the first result to load
#   $length    - number of results to load
#   $and_or, $excluding, $orderby, $desc_or_asc - accepted for compatibility
#                with C4::SearchMarc but not used: every word of every
#                criterion is ANDed and results are sorted by title.
#
# Returns (\@finalresult, $nbresults): a reference to an array of hashes (one
# per biblio loaded) and the total number of index entries that matched.
sub catalogsearch {
    my ($dbh, $tags, $and_or, $excluding, $operator, $value, $offset, $length, $orderby, $desc_or_asc) = @_;
    $offset = 0 unless $offset;
    $length = 0 unless $length;

    # the item.notforloan contains an integer. Every value <>0 means "book unavailable for loan".
    # but each library can have its own table of meaning for each value. Get them:
    # 1st search if there is a list of authorised values connected to items.notforloan
    my $sth = $dbh->prepare('select authorised_value from marc_subfield_structure where kohafield="items.notforloan"');
    $sth->execute;
    my %notforloanstatus;
    my ($authorised_valuecode) = $sth->fetchrow;
    if ($authorised_valuecode) {
        $sth = $dbh->prepare("select authorised_value,lib from authorised_values where category=?");
        $sth->execute($authorised_valuecode);
        while (my ($authorised_value, $lib) = $sth->fetchrow) {
            $notforloanstatus{$authorised_value} = $lib ? $lib : $authorised_value;
        }
    }

    #
    # marc_Tword PROOF OF CONCEPT BEGINNING
    #
    # fixme : only does a search on "contains every word"
    # misses :
    # - begins or is equal to
    # - excluding
    # - or

    # The index entries ("title-biblionumber") matching every word so far.
    my @result;
    # Becomes true once a first word has been looked up. An empty @result
    # must not be mistaken for "nothing searched yet", otherwise a word
    # without any hit would be silently dropped from the AND.
    my $searched = 0;

    for my $i (0 .. $#{$value}) {
        # replace * by %
        $value->[$i] =~ s/\*/%/g;
        # remove % at the beginning
        $value->[$i] =~ s/^%//g;
        $value->[$i] =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $operator->[$i] eq "contains";

        # The tag codes are bound as placeholders. When no tag is given the
        # restriction is left out (an empty "in ()" is not valid SQL).
        my @tag_codes = _tagsubfield_list($tags->[$i]);
        my $Tsql = "select tagsubfield,usedin from marc_Tword where ";
        $Tsql .= "tagsubfield in (" . join(',', ('?') x @tag_codes) . ") and " if @tag_codes;
        $Tsql .= "word like ?";
        my $Tquery = $dbh->prepare($Tsql);

        # split ' ' skips runs of blanks, so the spaces that replaced the
        # punctuation never produce an empty search word.
        foreach my $word (split ' ', $value->[$i]) {
            $Tquery->execute(@tag_codes, $word);
            warn "EXECUTING $Tsql with (" . join(',', @tag_codes, $word) . ")";

            # Entries for this word, as a set: the same biblio can come back
            # several times (several subfields, several words matching the
            # LIKE pattern) and must count only once.
            my %thisword;
            while (my ($tagsubfield, $usedin) = $Tquery->fetchrow) {
                foreach my $entry (split /,/, $usedin) {
                    # usedin begins with a ",": skip the empty 1st entry
                    $thisword{$entry} = 1 if $entry;
                }
            }

            # it's a AND : keep only what is in both sets
            if ($searched) {
                @result = grep { $thisword{$_} } @result;
            }
            else {
                @result   = keys %thisword;
                $searched = 1;
            }
        }
    }

    # Entries start with the title, so a plain string sort orders by title.
    @result = sort @result;

    # Now, loads title and author from [offset] to [offset]+[length]
    # HINT : biblionumber as bn is important. The hash fills biblionumber with items.biblionumber,
    # so if the biblio has no item, you get a not nice empty value.
    my $sth_biblio = $dbh->prepare("SELECT biblio.biblionumber as bn,biblioitems.*,biblio.*, marc_biblio.bibid,itemtypes.notforloan,itemtypes.description
        FROM biblio, marc_biblio
        LEFT JOIN biblioitems on biblio.biblionumber = biblioitems.biblionumber
        LEFT JOIN itemtypes on itemtypes.itemtype=biblioitems.itemtype
        WHERE biblio.biblionumber = marc_biblio.biblionumber AND biblio.biblionumber = ?");
    my $sth_subtitle = $dbh->prepare("SELECT subtitle FROM bibliosubtitle WHERE biblionumber=?");    # Added BY JF for Subtitles
    my $sth_itemCN   = $dbh->prepare("select items.* from items where biblionumber=? and (itemlost = 0 or itemlost is NULL)");
    my $sth_issue    = $dbh->prepare("select date_due,returndate from issues where itemnumber=?");
    my $sth_ordered  = $dbh->prepare("select count(*) from aqorders where biblionumber=? and datecancellationprinted is NULL");

    my @finalresult = ();
    warn "RESULT SIZE : " . $#result;

    # NOTE(review): the upper bound is inclusive, as in the original loop, so
    # up to $length+1 biblios are loaded - confirm what the callers expect
    # before changing it.
    my $last = $offset + $length;
    $last = $#result if $last > $#result;

    foreach my $counter ($offset .. $last) {
        # 1st, get the biblionumber : what follows the last "-" of the entry
        my ($wanted) = $result[$counter] =~ /\A.*-(.*)\z/s;
        next unless defined $wanted;    # malformed index entry

        $sth_biblio->execute($wanted);
        warn "EXECUTING SELECT biblio.biblionumber as bn,biblioitems.*,biblio.*, marc_biblio.bibid,itemtypes.notforloan,itemtypes.description FROM biblio, marc_biblio LEFT JOIN biblioitems on biblio.biblionumber = biblioitems.biblionumber LEFT JOIN itemtypes on itemtypes.itemtype=biblioitems.itemtype WHERE biblio.biblionumber = marc_biblio.biblionumber AND biblio.biblionumber = $wanted";
        my $line = $sth_biblio->fetchrow_hashref;
        # The index may be older than the catalogue: ignore entries whose
        # biblio no longer exists instead of returning an empty line.
        next unless $line;
        my $biblionumber = $line->{bn};

        # Return subtitles first ADDED BY JF
        $sth_subtitle->execute($biblionumber);
        warn "EXECUTING SELECT subtitle FROM bibliosubtitle WHERE biblionumber=$biblionumber";
        my ($subtitle) = $sth_subtitle->fetchrow;
        $subtitle = '' unless defined $subtitle;
        # /ADDED BY JF

        # search & parse all items & note itemcallnumber
        $sth_itemCN->execute($biblionumber);
        warn "EXECUTING itemCN select items.* from items where biblionumber=$biblionumber and (itemlost = 0 or itemlost is NULL)";
        my @CNresults  = ();
        my $notforloan = 1;    # to see if there is at least 1 item that can be issued
        while (my $item = $sth_itemCN->fetchrow_hashref) {
            # search if item is on loan
            my $date_due;
            $sth_issue->execute($item->{itemnumber});
            warn "EXECUTING ISSUES select date_due,returndate from issues where itemnumber=" . $item->{itemnumber};
            while (my $loan = $sth_issue->fetchrow_hashref) {
                if ($loan->{date_due} and !$loan->{returndate}) {
                    $date_due = $loan->{date_due};
                }
            }
            # store this item
            my %lineCN;
            $lineCN{holdingbranch}  = $item->{holdingbranch};
            $lineCN{itemcallnumber} = $item->{itemcallnumber};
            $lineCN{location}       = $item->{location};
            $lineCN{date_due}       = format_date($date_due);
            $lineCN{notforloan} = $notforloanstatus{$line->{notforloan}} if ($line->{notforloan});    # setting not forloan if itemtype is not for loan
            $lineCN{notforloan} = $notforloanstatus{$item->{notforloan}} if ($item->{notforloan});    # setting not forloan if this item is not for loan
            $notforloan = 0 unless ($item->{notforloan} or $item->{wthdrawn} or $item->{itemlost});
            push @CNresults, \%lineCN;
        }
        my $totalitems = scalar @CNresults;

        # save the biblio in the final array, with item and item issue status
        my %newline = %$line;
        $newline{totitem} = $totalitems;
        # if $totalitems == 0, check if it's being ordered.
        if ($totalitems == 0) {
            $sth_ordered->execute($biblionumber);
            warn "EXECUTING select count(*) from aqorders where biblionumber=$biblionumber and datecancellationprinted is NULL";
            my ($ordered) = $sth_ordered->fetchrow;
            $newline{onorder} = 1 if $ordered;
        }
        $newline{biblionumber} = $biblionumber;
        $newline{norequests}   = 0;
        $newline{norequests}   = 1 if ($line->{notforloan});                    # itemtype not issuable
        $newline{norequests}   = 1 if (!$line->{notforloan} && $notforloan);    # itemtype issuable but all items not issuable for instance
        $newline{subtitle}     = $subtitle;                                     # put the subtitle in ADDED BY JF
        $newline{CN}           = \@CNresults;
        # $#finalresult is still the index of the previous line here
        $newline{'even'} = 1 if $#finalresult % 2 == 0;
        $newline{'odd'}  = 1 if $#finalresult % 2 == 1;
        $newline{'timestamp'} = format_date($newline{timestamp});
        push @finalresult, \%newline;
    }

    my $nbresults = $#result + 1;
    return (\@finalresult, $nbresults);
}
|
||||
|
||||
# Creates the SQL Request
|
||||
|
||||
# create_request($dbh, $tags, $and_or, $operator, $value)
#
# Builds the pieces of the SQL request for a list of search criteria. At most
# the first 8 entries are considered, and entries whose value is false are
# skipped.
#
#   $dbh      - DBI database handle, used to quote the values
#   $tags     - array ref; quoted list of tag+subfield codes for each entry
#   $and_or   - array ref; connector put before each entry but the 1st
#   $operator - array ref; "start", "contains" or an SQL comparison operator
#   $value    - array ref; the values searched
#
# Returns ($sql_tables, $sql_where1, $sql_where2):
#   $sql_tables - "marc_word as m1,marc_subfield_table as m2,..."
#   $sql_where1 - the conditions on the values
#   $sql_where2 - "(m1.bibid=m2.bibid and ...)", or "" with a single table
#
# NOTE(review): $tags, $and_or and $operator are pasted into the SQL text as
# received; only the values go through $dbh->quote.
sub create_request {
    my ($dbh, $tags, $and_or, $operator, $value) = @_;

    my $sql_tables = '';    # will contain marc_subfield_table as m1,...
    my $sql_where1 = '';    # will contain the "true" where
    my @bibid_joins;        # will contain m1.bibid=m2.bibid, ...
    my $nb_active = 0;      # number of "active" entries. an entry is active if a value is provided.
    my $nb_table  = 1;      # number of tables. ++ on each entry EXCEPT when an OR is provided.

    my $maxloop = 8;        # the maximum number of words to avoid a too complex search.
    $maxloop = @$value if @$value < $maxloop;

    # 0 .. $maxloop-1 : exactly $maxloop entries, not one more
    for my $i (0 .. $maxloop - 1) {
        next unless $value->[$i];
        $nb_active++;

        my $is_first  = ($nb_active == 1);
        my $op        = defined $operator->[$i] ? $operator->[$i] : '';
        my $connector = defined $and_or->[$i] ? $and_or->[$i] : '';
        my $on_words  = ($op eq 'contains');
        # a "contains" that is not ANDed is added to the current table
        # instead of opening a new one
        my $reuse_alias = (!$is_first && $on_words && $connector ne 'and');

        unless ($reuse_alias) {
            $nb_table++ unless $is_first;
            $sql_tables .= ($on_words ? 'marc_word' : 'marc_subfield_table') . " as m$nb_table,";
        }
        my $alias = "m$nb_table";

        my ($condition, $tag_column);
        if ($on_words) {
            $condition  = "$alias.word like " . $dbh->quote($value->[$i]);
            $tag_column = "$alias.tagsubfield";
        }
        else {
            # "start" is a LIKE on the beginning of the subfield, anything
            # else is used as the comparison operator itself
            my $is_start = ($op eq 'start');
            my $sql_op   = $is_start ? 'like' : $op;
            my $operand  = $is_start ? $value->[$i] . '%' : $value->[$i];
            $condition  = "$alias.subfieldvalue $sql_op " . $dbh->quote($operand);
            $tag_column = "concat($alias.tag,$alias.subfieldcode)";
        }
        $condition .= " and $tag_column in ($tags->[$i])" if $tags->[$i];

        $sql_where1 .= "$connector " unless $is_first;
        $sql_where1 .= "($condition)";
        push @bibid_joins, "m1.bibid=$alias.bibid" unless $is_first;
    }

    my $sql_where2 = @bibid_joins ? '(' . join(' and ', @bibid_joins) . ')' : '';
    $sql_tables =~ s/,$//;    # deletes the trailing ','
    return ($sql_tables, $sql_where1, $sql_where2);
}
|
||||
|
||||
# getMARCnotes($dbh, $bibid, $marcflavour)
#
# Reads the note subfields of a record from marc_subfield_table: tags 500 to
# 599 when $marcflavour is "MARC21", tags 300 to 399 otherwise (UNIMARC is
# assumed).
#
#   $dbh         - DBI database handle. It is left connected: it belongs to
#                  the caller, who may still need it.
#   $bibid       - bibid of the MARC record
#   $marcflavour - "MARC21" or anything else for UNIMARC
#
# Returns a reference to an array of { marcnote => $text } hashes, one per
# run of consecutive subfields sharing the same tag; the values of a run are
# joined with a space.
sub getMARCnotes {
    my ($dbh, $bibid, $marcflavour) = @_;
    my ($mintag, $maxtag) = $marcflavour eq "MARC21" ? ("500", "599") : ("300", "399");

    my $sth = $dbh->prepare("SELECT subfieldvalue,tag FROM marc_subfield_table WHERE bibid=? AND tag BETWEEN ? AND ? ORDER BY tagorder");
    $sth->execute($bibid, $mintag, $maxtag);

    my @marcnotes;
    my $note = "";
    my $tag;    # tag of the note being built, undef before the 1st row

    while (my $data = $sth->fetchrow_arrayref) {
        my ($value, $thistag) = @$data;
        # keep a blank after a sentence ending with a dot
        $value .= " " if $value =~ /\.$/;

        # a new tag closes the note built so far
        if (defined $tag && $thistag ne $tag && $note ne "") {
            push @marcnotes, { marcnote => $note };
            $note = "";
        }
        # Remember the tag on EVERY row, the first one included, so that the
        # first subfield is grouped with the following ones of the same tag.
        $tag  = $thistag;
        $note = $note eq "" ? $value : "$note $value";
    }
    push @marcnotes, { marcnote => $note } if $note ne "";    # load last tag into array

    $sth->finish;
    return \@marcnotes;
} # end getMARCnotes
|
||||
|
||||
|
||||
# getMARCsubjects($dbh, $bibid, $marcflavour)
#
# Reads the subject headings of a record from marc_subfield_table: tags 600
# to 699 when $marcflavour is "MARC21", tags 600 to 619 otherwise (UNIMARC is
# assumed). Only subfields "a" are kept, and a value equal to the subject
# kept just before it is skipped.
#
#   $dbh         - DBI database handle. It is left connected: it belongs to
#                  the caller, who may still need it.
#   $bibid       - bibid of the MARC record
#   $marcflavour - "MARC21" or anything else for UNIMARC
#
# Returns a reference to an array of { MARCSUBJCT => $subject } hashes.
sub getMARCsubjects {
    my ($dbh, $bibid, $marcflavour) = @_;
    my ($mintag, $maxtag) = $marcflavour eq "MARC21" ? ("600", "699") : ("600", "619");

    my $sth = $dbh->prepare("SELECT subfieldvalue,subfieldcode FROM marc_subfield_table WHERE bibid=? AND tag BETWEEN ? AND ? ORDER BY tagorder");
    $sth->execute($bibid, $mintag, $maxtag);

    my @marcsubjcts;
    my $subjct = "";    # last subject stored
    while (my $data = $sth->fetchrow_arrayref) {
        my ($value, $subfield) = @$data;
        next unless $subfield eq "a" && $value ne $subjct;
        push @marcsubjcts, { MARCSUBJCT => $value };
        $subjct = $value;
    }

    $sth->finish;
    return \@marcsubjcts;
} #end getMARCsubjects
|
||||
|
||||
END { } # module clean-up code here (global destructor)
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=back
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Koha Developement team <info@koha.org>
|
||||
|
||||
=cut
|
125
misc/build_marc_Tword.pl
Executable file
125
misc/build_marc_Tword.pl
Executable file
|
@ -0,0 +1,125 @@
|
|||
#!/usr/bin/perl -w
#-----------------------------------
# Script Name: build_marc_Tword.pl
# Script Version: 0.1.0
# Date: 2004/06/05

# script to build a marc_Tword table.
# create the table :
# CREATE TABLE `marc_Tword` (
# `word` varchar(80) NOT NULL default '',
# `usedin` text NOT NULL,
# `tagsubfield` varchar(4) NOT NULL default '',
# PRIMARY KEY (`word`,`tagsubfield`)
#) TYPE=MyISAM;
# just to test the idea of a reversed index searching.
# reversed index for searches on Title.
# the marc_Tword table contains, for each word & marc field/subfield, the list of biblios using it, with the title.
# Each entry of the list is ",title-biblionumber".
# reminder : the inverted index is only done to search on a "contain". For a "=" or "start by", the marc_subfield_table is perfect & correctly indexed.
# if this POC becomes more than a POC, then I think we will have to build 1 table for each sorting (marc_Tword for title, Aword for author, Cword for callnumber...)

# FIXME :
# * maybe it would be OK to store only 20 char of the title.

use strict;
use locale;
use C4::Context;
use C4::Biblio;
use Time::HiRes qw(gettimeofday);

$| = 1;    # flushes output

my $dbh = C4::Context->dbh;

# fields & subfields to ignore
# in real situation, we should add a marc constraint on this.
# ideally, we should not index isbn, as every one would be different, so it makes the table very big.
# but in this case we have to find a way to automatically search "isbn = XXX" in marc_subfield_table
#
# A 3 character key ignores the whole tag, a 4 character key ignores a single
# tag+subfield.
my %ignore_list = (
    '001'  => 1,
    '010b' => 1,
    '0909' => 1,
    '090a' => 1,
    '100'  => 1,
    '105'  => 1,
    '6069' => 1,
    '7009' => 1,
    '7019' => 1,
    '7109' => 1,
    '7129' => 1,
    '9959' => 1,
);

# The chars replaced by a space before splitting into words. Must stay the
# same as what C4::SearchMarcTest::catalogsearch strips from the search string.
my $useless_chars = qr/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/;

my $starttime = gettimeofday;

# Prints the elapsed time every 30000 steps and a dot every 500 steps.
sub show_progress {
    my ($count) = @_;
    print "$count in " . (gettimeofday() - $starttime) . " s\n" unless ($count % 30000);
    print "." unless ($count % 500);
}

$dbh->do("delete from marc_Tword");

# parse every line
my $query = "SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber";
my $sth = $dbh->prepare($query);

print "******** SELECTING \n";
$sth->execute;
print "******** DONE \n";

my $insertT = $dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
my $i = 0;

# 1st version, slower, but less RAM consuming
# my $sthT    = $dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
# my $updateT = $dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
# while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
#     next if $ignore_list{$tag} or $ignore_list{$tag.$subfieldcode};
#     $subfieldvalue =~ s/$useless_chars/ /g;
#     # remove useless chars in the title.
#     $title =~ s/$useless_chars/ /g;
#     # and retrieve the reversed entry
#     foreach my $word (split ' ', $subfieldvalue) {
#         $sthT->execute($tag.$subfieldcode, $word);
#         if (my ($usedin) = $sthT->fetchrow) {
#             # add the field & save it once again.
#             $usedin .= ",$title-$biblionumber";
#             $updateT->execute($usedin, $tag.$subfieldcode, $word);
#         } else {
#             $insertT->execute($tag.$subfieldcode, $word, ",$title-$biblionumber");
#         }
#     }
#     show_progress($i);
#     $i++;
# }

# 2nd version : faster (about 100 times !), but maybe too much RAM consuming...
# $usedin_for{tagsubfield}{WORD} = ",title-biblionumber,title-biblionumber..."
# (2 levels, so that no char of the word can be mistaken for a separator)
my %usedin_for;
print "READING\n";
while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
    next unless $subfieldvalue;
    # the query uses left joins : a subfield may have no biblio attached
    next unless defined $biblionumber;
    $subfieldcode = '' unless defined $subfieldcode;
    next if $ignore_list{$tag} or $ignore_list{$tag.$subfieldcode};

    $subfieldvalue =~ s/$useless_chars/ /g;
    # remove useless chars in the title.
    $title = '' unless defined $title;
    $title =~ s/$useless_chars/ /g;

    # split ' ' skips runs of blanks : no empty word is indexed.
    # %seen keeps a word repeated in the subfield from listing the biblio twice.
    my %seen;
    foreach my $word (grep { !$seen{$_}++ } map { uc } split ' ', $subfieldvalue) {
        $usedin_for{$tag.$subfieldcode}{$word} .= ",$title-$biblionumber";
    }
    show_progress($i);
    $i++;
}

$i = 0;
print "WRITING\n";
foreach my $tagsubfield (keys %usedin_for) {
    my $usedin_of = $usedin_for{$tagsubfield};
    foreach my $word (keys %$usedin_of) {
        $insertT->execute($tagsubfield, $word, $usedin_of->{$word});
        show_progress($i);
        $i++;
    }
}

$dbh->disconnect();
|
Loading…
Reference in a new issue