2 #-----------------------------------
3 # Script Name: build_marc_Tword.pl
4 # Script Version: 0.1.0
7 # script to build a marc_Tword table.
9 # CREATE TABLE `marc_Tword` (
10 # `word` varchar(80) NOT NULL default '',
11 # `usedin` text NOT NULL,
12 # `tagsubfield` varchar(4) NOT NULL default '',
13 # PRIMARY KEY (`word`,`tagsubfield`)
15 # just to test the idea of a reversed index searching.
16 # reversed index for searchs on Title.
17 # the marc_Tword table contains for each word & marc field/subfield, the list of biblios using it, with the title
18 # reminder : the inverted index is only done to search on a "contain". For a "=" or "start by", the marc_subfield_table is perfect & correctly indexed.
19 # if this POC becomes more than a POC, then I think we will have to build 1 table for each sorting (marc_Tword for title, Aword for author, Cword for callnumber...)
22 # * indexes empty words too (it's just a proof of concept)
23 # * maybe it would be OK to store only 20 char of the title.
29 my $dbh=C4::Context->dbh;
30 use Time::HiRes qw(gettimeofday);
32 # fields & subfields to ignore
33 # in real situation, we should add a marc constraint on this.
34 # ideally, we should not inde isbn, as every would be different, so it makes the table very big.
35 # but in this case we have to find a way to automatically search "isbn = XXX" in marc_subfield_table
52 my $starttime = gettimeofday;
54 $dbh->do("delete from marc_Tword");
57 my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber";
58 my $sth=$dbh->prepare($query);
60 print "******** SELECTING \n";
62 print "******** DONE \n";
63 $|=1; # flushes output
65 my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
66 my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
67 my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
70 # 1st version, slower, but less RAM consumming
71 # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
72 # next if $ignore_list{"$tag.$subfieldcode"};
73 # $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
74 # # remove useless chars in the title.
75 # $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
76 # my @words = split / /, $subfieldvalue;
77 # # and retrieve the reversed entry
78 # foreach my $word (@words) {
79 # $sthT->execute($tag.$subfieldcode,$word);
80 # if (my ($usedin) = $sthT->fetchrow) {
81 # # add the field & save it once again.
82 # $usedin.=",$biblionumber-$title";
83 # $updateT->execute($usedin,$tag.$subfieldcode,$word);
85 # $insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
88 # $timeneeded = gettimeofday - $starttime unless ($i % 100);
89 # print "$i in $timeneeded s\n" unless ($i % 100);
94 # 2nd version : faster (about 100 times !), bug maybe too much RAM consumming...
97 while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
98 next unless $subfieldvalue;
99 next if $ignore_list{$tag.$subfieldcode};
100 $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
101 # remove useless chars in the title.
102 $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
103 my @words = split / /, $subfieldvalue;
104 # and retrieve the reversed entry
105 foreach my $word (@words) {
106 my $localkey = $tag.$subfieldcode.'|'.uc($word);
107 $largehash{$localkey}.=",$title-$biblionumber";
109 $timeneeded = gettimeofday - $starttime unless ($i % 30000);
110 print "$i in $timeneeded s\n" unless ($i % 30000);
111 print "." unless ($i % 500);
116 foreach my $k (keys %largehash) {
118 $insertT->execute($1,$2,$largehash{$k});
119 $timeneeded = gettimeofday - $starttime unless ($i % 30000);
120 print "$i in $timeneeded s\n" unless ($i % 30000);
121 print "." unless ($i % 500);