misc/build_marc_Tword.pl

   1 #!/usr/bin/perl -w
   2 #-----------------------------------
   3 # Script Name: build_marc_Tword.pl
   4 # Script Version: 0.1.0
   5 # Date:  2004/06/05
   6
   7 # script to build a marc_Tword table.
   8 # create the table :
   9 # CREATE TABLE `marc_Tword` (
  10 #  `word` varchar(80) NOT NULL default '',
  11 #  `usedin` text NOT NULL,
  12 #  `tagsubfield` varchar(4) NOT NULL default '',
  13 #  PRIMARY KEY  (`word`,`tagsubfield`)
  14 #) TYPE=MyISAM;
  15 # just to test the idea of a reversed index searching.
  16 # reversed index for searchs on Title.
  17 # the marc_Tword table contains for each word & marc field/subfield, the list of biblios using it, with the title
  18 # reminder : the inverted index is only done to search on a "contain". For a "=" or "start by", the marc_subfield_table is perfect & correctly indexed.
  19 # if this POC becomes more than a POC, then I think we will have to build 1 table for each sorting (marc_Tword for title, Aword for author, Cword for callnumber...)
  20
  21 # FIXME :
  22 # * indexes empty words too (it's just a proof of concept)
  23 # * maybe it would be OK to store only 20 char of the title.
  24
  25 use strict;
  26 use locale;
  27 use C4::Context;
  28 use C4::Biblio;
  29 my $dbh=C4::Context->dbh;
  30 use Time::HiRes qw(gettimeofday);
  31
  32 # fields & subfields to ignore
  33 # in real situation, we should add a marc constraint on this.
  34 # ideally, we should not inde isbn, as every would be different, so it makes the table very big.
  35 # but in this case we have to find a way to automatically search "isbn = XXX" in marc_subfield_table
  36
  37 my %ignore_list = (
  38         '001' =>1,
  39         '010b'=>1,
  40         '0909' => 1,
  41         '090a' => 1,
  42         '100' => 1,
  43         '105' => 1,
  44         '6069' => 1,
  45         '7009' => 1,
  46         '7019' => 1,
  47         '7109' => 1,
  48         '7129' => 1,
  49         '9959' => 1,
  50 );
  51
  52 my $starttime = gettimeofday;
  53
  54 $dbh->do("delete from marc_Tword");
  55
  56 # parse every line
  57 my $query="SELECT biblio.biblionumber,tag,subfieldcode,subfieldvalue,biblio.title FROM marc_subfield_table left join marc_biblio on marc_biblio.bibid=marc_subfield_table.bibid left join biblio on marc_biblio.biblionumber=biblio.biblionumber";
  58 my $sth=$dbh->prepare($query);
  59
  60 print "******** SELECTING \n";
  61 $sth->execute;
  62 print "******** DONE \n";
  63 $|=1; # flushes output
  64
  65 my $sthT=$dbh->prepare("select usedin from marc_Tword where tagsubfield=? and word=?");
  66 my $updateT=$dbh->prepare("update marc_Tword set usedin=? where tagsubfield=? and word=?");
  67 my $insertT=$dbh->prepare("insert into marc_Tword (tagsubfield,word,usedin) values (?,?,?)");
  68 my $i=0;
  69 my $timeneeded;
  70 # 1st version, slower, but less RAM consumming
  71 # while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
  72 #       next if $ignore_list{"$tag.$subfieldcode"};
  73 #     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
  74 #       # remove useless chars in the title.
  75 #     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
  76 #     my @words = split / /, $subfieldvalue;
  77 #       # and retrieve the reversed entry
  78 #       foreach my $word (@words) {
  79 #               $sthT->execute($tag.$subfieldcode,$word);
  80 #               if (my ($usedin) = $sthT->fetchrow) {
  81 #                       # add the field & save it once again.
  82 #                       $usedin.=",$biblionumber-$title";
  83 #                       $updateT->execute($usedin,$tag.$subfieldcode,$word);
  84 #               } else {
  85 #                       $insertT->execute($tag.$subfieldcode,$word,",$title-$biblionumber");
  86 #               }
  87 #       }
  88 #       $timeneeded = gettimeofday - $starttime unless ($i % 100);
  89 #       print "$i in $timeneeded s\n" unless ($i % 100);
  90 #       print ".";
  91 #       $i++;
  92 # }
  93
  94 # 2nd version : faster (about 100 times !), bug maybe too much RAM consumming...
  95 my %largehash;
  96 print "READING\n";
  97 while (my ($biblionumber, $tag, $subfieldcode, $subfieldvalue, $title) = $sth->fetchrow) {
  98         next unless $subfieldvalue;
  99         next if $ignore_list{$tag.$subfieldcode};
 100     $subfieldvalue =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
 101         # remove useless chars in the title.
 102     $title =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\}|\/)/ /g;
 103     my @words = split / /, $subfieldvalue;
 104         # and retrieve the reversed entry
 105         foreach my $word (@words) {
 106                 my $localkey = $tag.$subfieldcode.'|'.uc($word);
 107                 $largehash{$localkey}.=",$title-$biblionumber";
 108         }
 109         $timeneeded = gettimeofday - $starttime unless ($i % 30000);
 110         print "$i in $timeneeded s\n" unless ($i % 30000);
 111         print "." unless ($i % 500);
 112         $i++;
 113 }
 114 $i=0;
 115 print "WRITING\n";
 116 foreach my $k (keys %largehash) {
 117         $k =~ /(.*)\|(.*)/;
 118         $insertT->execute($1,$2,$largehash{$k});
 119         $timeneeded = gettimeofday - $starttime unless ($i % 30000);
 120         print "$i in $timeneeded s\n" unless ($i % 30000);
 121         print "." unless ($i % 500);
 122         $i++;
 123 }
 124
 125 $dbh->disconnect();