From 41c209a4bd2f5ca075be562423c3903fe6c5d664 Mon Sep 17 00:00:00 2001
From: tipaul
Date: Fri, 27 May 2005 09:30:23 +0000
Subject: [PATCH] proof of concept for inverted index tables for search

how it works :
* create the table marc_Tword with the following structure :
CREATE TABLE `marc_Tword` (
`word` varchar(80) NOT NULL default '',
`usedin` text NOT NULL,
`tagsubfield` varchar(4) NOT NULL default '',
PRIMARY KEY (`word`,`tagsubfield`)
) TYPE=MyISAM;
* open a console & type export PERL5LIB & export KOHA_CONF as usual.
* fill this table with misc/build_marc_Tword.pl. Warning, this script uses a very, very memory-consuming but very fast method to fill the table : it does everything in memory, then writes everything. Another method is provided (& commented out), but it's 100 times slower (really !)
* open opac-search.pl and replace use C4::SearchMarc; by use C4::SearchMarcTest; as the API hasn't changed, it will work immediately.
* go to opac-search (advanced search) & search whatever you want. Should work fine.

LIMITS :
* build_marc_Tword has problems with extended chars (accented ones mainly). So don't be afraid if you get sql errors. They are not a problem for a POC
* search results are always ordered by title, whatever you choose.
* search only supports WORDA and WORDB, not yet WORDA or WORDB, nor WORDA except WORDB.
---
 C4/SearchMarcTest.pm     | 549 +++++++++++++++++++++++++++++++++++++++
 misc/build_marc_Tword.pl | 125 +++++++++
 2 files changed, 674 insertions(+)
 create mode 100644 C4/SearchMarcTest.pm
 create mode 100755 misc/build_marc_Tword.pl

diff --git a/C4/SearchMarcTest.pm b/C4/SearchMarcTest.pm
new file mode 100644
index 0000000000..0dd1d822cd
--- /dev/null
+++ b/C4/SearchMarcTest.pm
@@ -0,0 +1,549 @@
+package C4::SearchMarcTest;
+
+# Copyright 2000-2002 Katipo Communications
+#
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+
+use strict;
+require Exporter;
+use DBI;
+use C4::Context;
+use C4::Biblio;
+use C4::Date;
+use Date::Manip;
+
+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+
+# set the version for version checking
+$VERSION = 0.02;
+
+=head1 NAME
+
+C4::SearchMarcTest - Functions for searching the Koha MARC catalog
+
+=head1 FUNCTIONS
+
+This module provides the searching facilities for the Koha MARC catalog
+
+A COPY of official SearchMarc, with some tests for inverted index table
+works only with 1 MARC tag/subfield
+
+=cut
+
+@ISA = qw(Exporter);
+@EXPORT = qw(&catalogsearch &findseealso &findsuggestion &getMARCnotes &getMARCsubjects);
+
+=head1 findsuggestion($dbh,$values);
+
+=head2 $dbh is a link to the DB handler.
+
+use C4::Context;
+my $dbh =C4::Context->dbh;
+
+=head2 $values is a reference to an array of words
+
+Searches words with the same soundex, ordered by frequency of use.
+Useful to suggest other searches to the users.
+
+Only words of at least 5 characters are looked up, and at most 10 suggestions
+are kept for each of them.
+
+Returns a reference to an array of "searchedword|suggestedword|count" strings.
+
+=cut
+
+sub findsuggestion {
+    my ($dbh, $values) = @_;
+
+    # words sharing the soundex of the searched word (the word itself
+    # excluded), the most used first
+    my $sth = $dbh->prepare("SELECT count( * ) AS total, word FROM marc_word WHERE sndx_word = soundex( ? ) AND word <> ? GROUP BY word ORDER BY total DESC");
+    my @results;
+    foreach my $searched (@$values) {
+        # too short a word : no suggestion
+        next unless length($searched) >= 5;
+        $sth->execute($searched, $searched);
+        my $resfound = 1;
+        # keep the 10 most frequent words only
+        while ((my ($count, $word) = $sth->fetchrow) and $resfound <= 10) {
+            push @results, "$searched|$word|$count";
+            $resfound++;
+        }
+    }
+    return \@results;
+}
+
+=head1 findseealso($dbh,$fields);
+
+=head2 $dbh is a link to the DB handler.
+
+use C4::Context;
+my $dbh =C4::Context->dbh;
+
+=head2 $fields is a reference to the fields array
+
+This function modifies the @$fields array in place : each entry gets the
+"seealso" list of its tag/subfield appended (comma separated), when the MARC
+framework returned by MARCgettagslib defines one.
+
+=cut
+
+sub findseealso {
+    my ($dbh, $fields) = @_;
+    my $tagslib = MARCgettagslib($dbh, 1);
+    foreach my $field (@$fields) {
+        # the tag is read from chars 1-3 and the subfield from char 4
+        # (presumably char 0 is the opening quote of '200a' -- to be confirmed against callers)
+        my $tag      = substr($field, 1, 3);
+        my $subfield = substr($field, 4, 1);
+        my $seealso  = $tagslib->{$tag}->{$subfield}->{seealso};
+        # $field aliases the caller's array entry, so this updates @$fields
+        $field .= ',' . $seealso if $seealso;
+    }
+}
+
+=head1 my ($results, $count) = catalogsearch($dbh, $tags, $and_or, $excluding, $operator, $value, $offset,$length,$orderby,$desc_or_asc);
+
+=head2 $dbh is a link to the DB handler.
+
+use C4::Context;
+my $dbh =C4::Context->dbh;
+
+$tags,$and_or, $excluding, $operator, $value are references to array
+
+=head2 $tags
+
+contains the list of tags+subfields (for example : $@tags[0] = '200a')
+A field can be a list of fields : '200f','700a','700b','701a','701b'
+
+Example
+
+=head2 $and_or
+
+contains a list of strings containing and or or. The 1st value is useless.
+
+=head2 $excluding
+
+contains 0 or 1. If 1, then the request is negated.
+
+=head2 $operator
+
+contains contains,=,start,>,>=,<,<= the = and start work on the complete subfield. The contains operator works on every word in the subfield.
+
+examples :
+contains home, search home anywhere.
+= home, search a string being home.
+
+=head2 $value
+
+contains the value to search
+If it contains a * or a %, then the search is partial.
+
+=head2 $offset and $length
+
+returns $length results, beginning at $offset
+
+=head2 $orderby
+
+define the field used to order the request. Any field in the biblio/biblioitem tables can be used. DESC is possible too
+
+(for example title, title DESC,...)
+
+In this proof of concept $orderby, $desc_or_asc, $and_or and $excluding are
+accepted but ignored : every word must be found (AND) and the results are
+sorted on the "title-biblionumber" entries stored in marc_Tword.
+
+=head2 RETURNS
+
+returns a reference to an array of hashes, then the total number of biblios found. Each hash contains all biblio & biblioitems fields and, in CN, a reference to an array holding one hash for each item (holdingbranch, itemcallnumber, location, date_due, notforloan).
+
+=cut
+
+=head2 my $marcnotesarray = &getMARCnotes($dbh,$bibid,$marcflavour);
+
+Returns a reference to an array containing all the notes stored in the MARC database for the given bibid.
+$marcflavour ("MARC21" or "UNIMARC") determines which tags are used for retrieving notes.
+
+=head2 my $marcsubjctsarray = &getMARCsubjects($dbh,$bibid,$marcflavour);
+
+Returns a reference to an array containing all the subjects stored in the MARC database for the given bibid.
+$marcflavour ("MARC21" or "UNIMARC") determines which tags are used for retrieving subjects.
+
+=cut
+
+# Private helper : turns a tag list written as "'200a','200e'" (quotes
+# optional) into a plain list of tag+subfield codes, so that they can be bound
+# to SQL placeholders instead of being interpolated into the statement.
+# Returns an empty list when no tag is given.
+sub _tagsubfields {
+    my ($taglist) = @_;
+    return () unless defined $taglist;
+    my @codes;
+    foreach my $code (split /,/, $taglist) {
+        $code =~ s/^\s*'?//;
+        $code =~ s/'?\s*$//;
+        push @codes, $code if length $code;
+    }
+    return @codes;
+}
+
+sub catalogsearch {
+    my ($dbh, $tags, $and_or, $excluding, $operator, $value, $offset,$length,$orderby,$desc_or_asc) = @_;
+
+# the item.notforloan contains an integer. Every value <>0 means "book unavailable for loan".
+# but each library can have it's own table of meaning for each value. Get them
+# 1st search if there is a list of authorised values connected to items.notforloan
+    my $sth = $dbh->prepare('select authorised_value from marc_subfield_structure where kohafield="items.notforloan"');
+    $sth->execute;
+    my %notforloanstatus;
+    my ($authorised_valuecode) = $sth->fetchrow;
+    if ($authorised_valuecode) {
+        $sth = $dbh->prepare("select authorised_value,lib from authorised_values where category=?");
+        $sth->execute($authorised_valuecode);
+        while (my ($authorised_value,$lib) = $sth->fetchrow) {
+            $notforloanstatus{$authorised_value} = $lib?$lib:$authorised_value;
+        }
+    }
+#
+#
+# marc_T_word PROOF OF CONCEPT BEGINNING
+#
+# fixme : only do a search on "contains every word"
+# misses :
+# - begins or is equal to
+# - excluding
+# - or
+    # the global array result : "title-biblionumber" entries matching every word seen so far
+    my @result;
+    # true until the 1st word has been searched. An empty @result can not be
+    # used for that : it also means "the previous words have nothing in common"
+    my $first_word = 1;
+    for (my $i = 0 ; $i <= $#{$value} ; $i++) {
+        # NOTE : the caller's array entry is modified in place
+        # replace * by %
+        $value->[$i] =~ s/\*/%/g;
+        # remove % at the beginning
+        $value->[$i] =~ s/^%//g;
+        $value->[$i] =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g if $operator->[$i] eq "contains";
+
+        # the tags are bound, not interpolated. No tag means "search in every tag/subfield"
+        my @tagsubfields = _tagsubfields($tags->[$i]);
+        my $Tsql = "select tagsubfield,usedin from marc_Tword where word like ?";
+        $Tsql .= " and tagsubfield in (".join(',', ('?') x @tagsubfields).")" if @tagsubfields;
+        my $Tquery = $dbh->prepare($Tsql);
+
+        # 1 request for each word. split ' ' drops the empty words that
+        # consecutive spaces would otherwise produce
+        foreach my $word (split ' ', $value->[$i]) {
+            $Tquery->execute($word, @tagsubfields);
+            warn "EXECUTING $Tsql (word=$word, tagsubfield=@tagsubfields)";
+            # the entries using this word, as hash keys to remove duplicates
+            my %thiswordresults;
+            while (my ($tagsubfield,$usedin) = $Tquery->fetchrow) {
+                # usedin begins by a , so the 1st entry is empty : the if $entry skips it
+                foreach my $entry (split /,/, $usedin) {
+                    $thiswordresults{$entry} = 1 if $entry;
+                }
+            }
+            # as it's a AND, keep only the entries found for every word
+            if ($first_word) {
+                @result = keys %thiswordresults;
+                $first_word = 0;
+            } else {
+                @result = grep { $thiswordresults{$_} } @result;
+            }
+        }
+    }
+
+    # we have biblionumber array.
+    # now, sort it (the entries begin with the title, so this sorts on the title)
+    @result = sort @result;
+
+    #Now, loads title and author from [offset] to [offset]+[length]
+    # HINT : biblionumber as bn is important. The hash is fills biblionumber with items.biblionumber.
+    # so if you dont' has an item, you get a not nice empty value.
+    $sth = $dbh->prepare("SELECT biblio.biblionumber as bn,biblioitems.*,biblio.*, marc_biblio.bibid,itemtypes.notforloan,itemtypes.description
+                            FROM biblio, marc_biblio
+                            LEFT JOIN biblioitems on biblio.biblionumber = biblioitems.biblionumber
+                            LEFT JOIN itemtypes on itemtypes.itemtype=biblioitems.itemtype
+                            WHERE biblio.biblionumber = marc_biblio.biblionumber AND biblio.biblionumber = ?");
+#
+#
+# marc_Tword Proof of concept
+#
+#
+    my $sth_subtitle = $dbh->prepare("SELECT subtitle FROM bibliosubtitle WHERE biblionumber=?"); # Added BY JF for Subtitles
+    my $sth_itemCN = $dbh->prepare("select items.* from items where biblionumber=? and (itemlost = 0 or itemlost is NULL)");
+    my $sth_issue = $dbh->prepare("select date_due,returndate from issues where itemnumber=?");
+    # prepared once here instead of once for each biblio without item
+    my $sth_onorder = $dbh->prepare("select count(*) from aqorders where biblionumber=? and datecancellationprinted is NULL");
+    my @finalresult = ();
+    # parse all biblios between start & end.
+    warn "RESULT SIZE : ".$#result;
+    # NOTE(review): this window holds $length+1 entries although the POD says
+    # "returns $length results". Left unchanged, callers may rely on it -- to be confirmed.
+    for (my $counter = $offset ; ($counter <= $#result) && ($counter <= ($offset + $length)) ; $counter++) {
+        # search & parse all items & note itemcallnumber
+        # 1st, get the biblionumber : what follows the last - of the entry.
+        # A malformed entry is skipped (it used to reuse the previous match)
+        my ($indexed_biblionumber) = $result[$counter] =~ /.*-(.*)/;
+        next unless defined $indexed_biblionumber;
+        $sth->execute($indexed_biblionumber);
+        warn "EXECUTING SELECT biblio.biblionumber as bn,biblioitems.*,biblio.*, marc_biblio.bibid,itemtypes.notforloan,itemtypes.description FROM biblio, marc_biblio LEFT JOIN biblioitems on biblio.biblionumber = biblioitems.biblionumber LEFT JOIN itemtypes on itemtypes.itemtype=biblioitems.itemtype WHERE biblio.biblionumber = marc_biblio.biblionumber AND biblio.biblionumber = $indexed_biblionumber";
+        my $line = $sth->fetchrow_hashref;
+        # the index points to a biblio that can't be found : nothing to show
+        next unless $line;
+        my $biblionumber=$line->{bn};
+        # Return subtitles first ADDED BY JF
+        $sth_subtitle->execute($biblionumber);
+        warn "EXECUTING SELECT subtitle FROM bibliosubtitle WHERE biblionumber=$biblionumber";
+        my ($subtitle) = $sth_subtitle->fetchrow;
+        $subtitle = '' unless defined $subtitle;
+        # /ADDED BY JF
+
+        $sth_itemCN->execute($biblionumber);
+        warn "EXECUTING itemCN select items.* from items where biblionumber=$biblionumber and (itemlost = 0 or itemlost is NULL)";
+        my @CNresults = ();
+        my $totalitems = 0;
+        my $notforloan=1; # to see if there is at least 1 item that can be issued
+        while (my $item = $sth_itemCN->fetchrow_hashref) {
+            # parse the result, putting holdingbranch & itemcallnumber in separate array
+            # then all other fields in the main array
+
+            # search if item is on loan
+            my $date_due;
+            $sth_issue->execute($item->{itemnumber});
+            warn "EXECUTING ISSUES select date_due,returndate from issues where itemnumber=".$item->{itemnumber};
+            while (my $loan = $sth_issue->fetchrow_hashref) {
+                if ($loan->{date_due} and !$loan->{returndate}) {
+                    $date_due = $loan->{date_due};
+                }
+            }
+            # store this item
+            my %lineCN;
+            $lineCN{holdingbranch} = $item->{holdingbranch};
+            $lineCN{itemcallnumber} = $item->{itemcallnumber};
+            $lineCN{location} = $item->{location};
+            $lineCN{date_due} = format_date($date_due);
+            $lineCN{notforloan} = $notforloanstatus{$line->{notforloan}} if ($line->{notforloan}); # setting not forloan if itemtype is not for loan
+            $lineCN{notforloan} = $notforloanstatus{$item->{notforloan}} if ($item->{notforloan}); # setting not forloan it this item is not for loan
+            $notforloan=0 unless ($item->{notforloan} or $item->{wthdrawn} or $item->{itemlost});
+            push @CNresults,\%lineCN;
+            $totalitems++;
+        }
+        # save the biblio in the final array, with item and item issue status
+        my %newline = %$line;
+        $newline{totitem} = $totalitems;
+        # if $totalitems == 0, check if it's being ordered.
+        if ($totalitems == 0) {
+            $sth_onorder->execute($biblionumber);
+            warn "EXECUTING select count(*) from aqorders where biblionumber=$biblionumber and datecancellationprinted is NULL";
+            my ($ordered) = $sth_onorder->fetchrow;
+            $newline{onorder} = 1 if $ordered;
+        }
+        $newline{biblionumber} = $biblionumber;
+        $newline{norequests} = 0;
+        $newline{norequests} = 1 if ($line->{notforloan}); # itemtype not issuable
+        $newline{norequests} = 1 if (!$line->{notforloan} && $notforloan); # itemtype issuable but all items not issuable for instance
+        $newline{subtitle} = $subtitle; # put the subtitle in ADDED BY JF
+
+        $newline{CN} = \@CNresults;
+        # set before the push, so it depends on the number of lines already stored
+        $newline{'even'} = 1 if $#finalresult % 2 == 0;
+        $newline{'odd'} = 1 if $#finalresult % 2 == 1;
+        $newline{'timestamp'} = format_date($newline{timestamp});
+        push @finalresult, \%newline;
+    }
+    my $nbresults = $#result+1;
+    return (\@finalresult, $nbresults);
+}
+
+# Creates the SQL Request
+#
+# Not called by catalogsearch in this proof of concept (kept from SearchMarc).
+# Returns ($sql_tables, $sql_where1, $sql_where2) :
+# - $sql_tables : the list of tables, each one aliased m1, m2...
+# - $sql_where1 : the "true" where, 1 condition for each value provided
+# - $sql_where2 : the joins on bibid between m1 and the other tables, or "" if there is none
+# WARNING : $tags, $and_or and $operator are copied as is in the SQL. Only the values are quoted.
+
+sub create_request {
+    my ($dbh,$tags, $and_or, $operator, $value) = @_;
+
+    my $sql_tables; # will contain marc_subfield_table as m1,...
+    my $sql_where1; # will contain the "true" where
+    my $sql_where2 = "("; # will contain m1.bibid=m2.bibid
+    my $nb_active=0; # will contain the number of "active" entries. an entry is active if a value is provided.
+    my $nb_table=1; # will contain the number of table. ++ on each entry EXCEPT when an OR is provided.
+
+    my $maxloop=8; # the maximum number of words to avoid a too complex search.
+    $maxloop = @$value if @$value<$maxloop;
+
+    # strictly lower than $maxloop : <= looked at 1 entry more than the maximum
+    for(my $i=0; $i<$maxloop;$i++) {
+        next unless @$value[$i];
+        $nb_active++;
+        if ($nb_active==1) {
+            # 1st active entry : always on m1, no and/or before it
+            if (@$operator[$i] eq "start") {
+                $sql_tables .= "marc_subfield_table as m$nb_table,";
+                $sql_where1 .= "(m1.subfieldvalue like ".$dbh->quote("@$value[$i]%");
+                if (@$tags[$i]) {
+                    $sql_where1 .=" and concat(m1.tag,m1.subfieldcode) in (@$tags[$i])";
+                }
+                $sql_where1.=")";
+            } elsif (@$operator[$i] eq "contains") {
+                $sql_tables .= "marc_word as m$nb_table,";
+                $sql_where1 .= "(m1.word like ".$dbh->quote("@$value[$i]");
+                if (@$tags[$i]) {
+                    $sql_where1 .=" and m1.tagsubfield in (@$tags[$i])";
+                }
+                $sql_where1.=")";
+            } else {
+                $sql_tables .= "marc_subfield_table as m$nb_table,";
+                $sql_where1 .= "(m1.subfieldvalue @$operator[$i] ".$dbh->quote("@$value[$i]");
+                if (@$tags[$i]) {
+                    $sql_where1 .=" and concat(m1.tag,m1.subfieldcode) in (@$tags[$i])";
+                }
+                $sql_where1.=")";
+            }
+        } else {
+            if (@$operator[$i] eq "start") {
+                $nb_table++;
+                $sql_tables .= "marc_subfield_table as m$nb_table,";
+                $sql_where1 .= "@$and_or[$i] (m$nb_table.subfieldvalue like ".$dbh->quote("@$value[$i]%");
+                if (@$tags[$i]) {
+                    $sql_where1 .=" and concat(m$nb_table.tag,m$nb_table.subfieldcode) in (@$tags[$i])";
+                }
+                $sql_where1.=")";
+                $sql_where2 .= "m1.bibid=m$nb_table.bibid and ";
+            } elsif (@$operator[$i] eq "contains") {
+                if (@$and_or[$i] eq 'and') {
+                    $nb_table++;
+                    $sql_tables .= "marc_word as m$nb_table,";
+                    $sql_where1 .= "@$and_or[$i] (m$nb_table.word like ".$dbh->quote("@$value[$i]");
+                    if (@$tags[$i]) {
+                        $sql_where1 .=" and m$nb_table.tagsubfield in(@$tags[$i])";
+                    }
+                    $sql_where1.=")";
+                    $sql_where2 .= "m1.bibid=m$nb_table.bibid and ";
+                } else {
+                    # an OR is searched in the current table : no new table
+                    $sql_where1 .= "@$and_or[$i] (m$nb_table.word like ".$dbh->quote("@$value[$i]");
+                    if (@$tags[$i]) {
+                        $sql_where1 .=" and m$nb_table.tagsubfield in (@$tags[$i])";
+                    }
+                    $sql_where1.=")";
+                    $sql_where2 .= "m1.bibid=m$nb_table.bibid and ";
+                }
+            } else {
+                $nb_table++;
+                $sql_tables .= "marc_subfield_table as m$nb_table,";
+                $sql_where1 .= "@$and_or[$i] (m$nb_table.subfieldvalue @$operator[$i] ".$dbh->quote(@$value[$i]);
+                if (@$tags[$i]) {
+                    $sql_where1 .=" and concat(m$nb_table.tag,m$nb_table.subfieldcode) in (@$tags[$i])";
+                }
+                $sql_where2 .= "m1.bibid=m$nb_table.bibid and ";
+                $sql_where1.=")";
+            }
+        }
+    }
+
+    if($sql_where2 ne "(") # some datas added to sql_where2, processing
+    {
+        $sql_where2 = substr($sql_where2, 0, (length($sql_where2)-5)); # deletes the trailing ' and '
+        $sql_where2 .= ")";
+    }
+    else # no sql_where2 statement, deleting '('
+    {
+        $sql_where2 = "";
+    }
+    chop $sql_tables; # deletes the trailing ','
+    return ($sql_tables, $sql_where1, $sql_where2);
+}
+
+# Returns a reference to an array of { marcnote => "..." } hashes : 1 note for
+# each run of consecutive subfields having the same tag, the subfield values
+# being joined by a space.
+# Notes are tags 500-599 for MARC21, 300-399 otherwise (UNIMARC).
+# The $dbh handle belongs to the caller : it is NOT disconnected here.
+sub getMARCnotes {
+    my ($dbh, $bibid, $marcflavour) = @_;
+    my ($mintag, $maxtag);
+    if ($marcflavour eq "MARC21") {
+        $mintag = "500";
+        $maxtag = "599";
+    } else { # assume unimarc if not marc21
+        $mintag = "300";
+        $maxtag = "399";
+    }
+
+    my $sth=$dbh->prepare("SELECT subfieldvalue,tag FROM marc_subfield_table WHERE bibid=? AND tag BETWEEN ? AND ? ORDER BY tagorder");
+
+    $sth->execute($bibid,$mintag,$maxtag);
+
+    my @marcnotes;
+    my $note = ""; # the note being built
+    my $tag = "";  # the tag of the note being built
+
+    while (my $data=$sth->fetchrow_arrayref) {
+        my ($value, $thistag) = @$data;
+        if ($value=~/\.$/) {
+            $value=$value . "  ";
+            chop $value; # 1 space after a final dot
+        }
+        # a new tag begins : the note of the previous tag is complete
+        if ($note ne "" && $thistag ne $tag) {
+            push @marcnotes, {marcnote => $note};
+            $note = "";
+        }
+        # the tag is remembered from the 1st row on, so the 1st subfield is
+        # no longer split from the other subfields of its tag
+        $tag = $thistag;
+        $note = $note eq "" ? $value : $note." ".$value;
+    }
+
+    if ($note ne "") {
+        push @marcnotes, {marcnote => $note}; #load last tag into array
+    }
+
+    $sth->finish;
+
+    return \@marcnotes;
+} # end getMARCnotes
+
+
+# Returns a reference to an array of { MARCSUBJCT => "..." } hashes : 1 for
+# each subfield a found in tags 600-699 (MARC21) or 600-619 (otherwise,
+# UNIMARC), a value identical to the previous subject kept being skipped.
+# The $dbh handle belongs to the caller : it is NOT disconnected here.
+sub getMARCsubjects {
+    my ($dbh, $bibid, $marcflavour) = @_;
+    my ($mintag, $maxtag);
+    if ($marcflavour eq "MARC21") {
+        $mintag = "600";
+        $maxtag = "699";
+    } else { # assume unimarc if not marc21
+        $mintag = "600";
+        $maxtag = "619";
+    }
+    my $sth=$dbh->prepare("SELECT subfieldvalue,subfieldcode FROM marc_subfield_table WHERE bibid=? AND tag BETWEEN ? AND ? ORDER BY tagorder");
+
+    $sth->execute($bibid,$mintag,$maxtag);
+
+    my @marcsubjcts;
+    my $subjct = ""; # the last subject kept
+
+    while (my $data=$sth->fetchrow_arrayref) {
+        my ($value, $subfield) = @$data;
+        if ($subfield eq "a" && $value ne $subjct) {
+            push @marcsubjcts, {MARCSUBJCT => $value};
+            $subjct = $value;
+        }
+    }
+
+    $sth->finish;
+
+    return \@marcsubjcts;
+} #end getMARCsubjects
+
+END { }       # module clean-up code here (global destructor)
+
+1;
+__END__
+
+=head1 AUTHOR
+
+Koha Development team
+
+=cut
diff --git a/misc/build_marc_Tword.pl b/misc/build_marc_Tword.pl
new file mode 100755
index 0000000000..7985bb38f6
--- /dev/null
+++ b/misc/build_marc_Tword.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl -w
+#-----------------------------------
+# Script Name: build_marc_Tword.pl
+# Script Version: 0.1.0
+# Date: 2004/06/05
+
+# script to build a marc_Tword table.
+# create the table :
+# CREATE TABLE `marc_Tword` (
+# `word` varchar(80) NOT NULL default '',
+# `usedin` text NOT NULL,
+# `tagsubfield` varchar(4) NOT NULL default '',
+# PRIMARY KEY (`word`,`tagsubfield`)
+#) TYPE=MyISAM;
+# just to test the idea of a reversed index searching.
+# reversed index for searches on Title.
+# the marc_Tword table contains for each word & marc field/subfield, the list of biblios using it, with the title
+# (usedin is a list of ",title-biblionumber" entries)
+# reminder : the inverted index is only done to search on a "contain". For a "=" or "start by", the marc_subfield_table is perfect & correctly indexed.
+# if this POC becomes more than a POC, then I think we will have to build 1 table for each sorting (marc_Tword for title, Aword for author, Cword for callnumber...)
+
+# FIXME :
+# * maybe it would be OK to store only 20 char of the title.
+
+use strict;
+use locale;
+use C4::Context;
+use C4::Biblio;
+my $dbh=C4::Context->dbh;
+use Time::HiRes qw(gettimeofday);
+
+# fields & subfields to ignore
+# a key is either a tag+subfield (that subfield is ignored) or a tag alone (the whole tag is ignored)
+# in real situation, we should add a marc constraint on this.
+# ideally, we should not index isbn, as every one would be different, so it makes the table very big.
+# but in this case we have to find a way to automatically search "isbn = XXX" in marc_subfield_table
+
+my %ignore_list = (
+    '001'  => 1,
+    '010b' => 1,
+    '0909' => 1,
+    '090a' => 1,
+    '100'  => 1,
+    '105'  => 1,
+    '6069' => 1,
+    '7009' => 1,
+    '7019' => 1,
+    '7109' => 1,
+    '7129' => 1,
+    '9959' => 1,
+);
+
+my $starttime = gettimeofday();
+
+$dbh->do("delete from marc_Tword");
+
+# parse every line
+my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber";
+my $sth=$dbh->prepare($query);
+
+print "******** SELECTING \n";
+$sth->execute;
+print "******** DONE \n";
+$|=1; # flushes output
+
+# $sthT and $updateT are only used by the 1st version, commented below
+my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
+my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
+my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
+my $i=0;
+my $timeneeded;
+# 1st version, slower, but less RAM consuming
+# while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
+#     next if $ignore_list{$tag} or $ignore_list{$tag.$subfieldcode};
+#     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
+#     # remove useless chars in the title.
+#     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
+#     my @words = split ' ', $subfieldvalue;
+#     # and retrieve the reversed entry
+#     foreach my $word (@words) {
+#         $sthT->execute($tag.$subfieldcode,$word);
+#         if (my ($usedin) = $sthT->fetchrow) {
+#             # add the field & save it once again.
+#             $usedin.=",$title-$biblionumber";
+#             $updateT->execute($usedin,$tag.$subfieldcode,$word);
+#         } else {
+#             $insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
+#         }
+#     }
+#     $timeneeded = gettimeofday() - $starttime unless ($i % 100);
+#     print "$i in $timeneeded s\n" unless ($i % 100);
+#     print ".";
+#     $i++;
+# }
+
+# 2nd version : faster (about 100 times !), but maybe too much RAM consuming...
+# $largehash{tagsubfield}{WORD} is the usedin value of the marc_Tword line to
+# write. 2 levels, so that nothing has to be split again when writing.
+my %largehash;
+print "READING\n";
+while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
+    next unless $subfieldvalue;
+    # left joins : a subfield without biblio has nothing the search could show
+    next unless $biblionumber;
+    next if $ignore_list{$tag} or $ignore_list{$tag.$subfieldcode};
+    $title = '' unless defined $title;
+    $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
+    # remove useless chars in the title. It matters : the , separates the
+    # entries in usedin, and the last - is followed by the biblionumber
+    $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
+    # and build the reversed entry. split ' ' never returns an empty word
+    foreach my $word (split ' ', $subfieldvalue) {
+        $largehash{$tag.$subfieldcode}{uc($word)}.=",$title-$biblionumber";
+    }
+    $timeneeded = gettimeofday() - $starttime unless ($i % 30000);
+    print "$i in $timeneeded s\n" unless ($i % 30000);
+    print "." unless ($i % 500);
+    $i++;
+}
+$i=0;
+print "WRITING\n";
+foreach my $tagsubfield (keys %largehash) {
+    my $usedin_of = $largehash{$tagsubfield};
+    foreach my $word (keys %$usedin_of) {
+        $insertT->execute($tagsubfield,$word,$usedin_of->{$word});
+        $timeneeded = gettimeofday() - $starttime unless ($i % 30000);
+        print "$i in $timeneeded s\n" unless ($i % 30000);
+        print "." unless ($i % 500);
+        $i++;
+    }
+}
+
+$dbh->disconnect();