misc/migration_tools/rebuild_nozebra.pl

   #!/usr/bin/perl

# rebuild_nozebra.pl
#
# Script that fills the `nozebra` table: a word index of the MARC
# bibliographic records, used when searching without Zebra.
#
# For every biblio, each subfield value of every non-control field is split
# into lower-cased words.  Each word is filed under every index returned by
# GetNoZebraIndexes() whose definition mentions the subfield, or under the
# catch-all "__RAW__" index when no index claims it.  One row is then written
# per (index, word); its `biblionumbers` column is a concatenation of
#
#     <biblionumber>,<first 10 chars of the cleaned title>-<weight>;
#
# entries, where <weight> is the number of times the word occurs in that
# record for that index.
#
# Options:
#   -l N     index only the first N biblios (testing aid; default: all)
#   -d DIR, -s, -k, -b, -a, --reset
#            accepted so that existing command lines keep parsing; nothing in
#            this script as shown reads them.

use C4::Context;
use Getopt::Long;
use C4::Biblio;
use C4::AuthoritiesMarc;

use strict;
use warnings;

$| = 1;    # unbuffered STDOUT, so the "\r<count>" progress counter shows up at once

# Optional cap on the number of biblios to index.  Undefined means "no cap",
# so that a plain run indexes the whole catalogue.
my $limit;
my $directory;
my $skip_export;
my $keep_export;
my $reset;
my $biblios;
my $authorities;
GetOptions(
    'd:s'   => \$directory,
    'reset' => \$reset,
    's'     => \$skip_export,
    'k'     => \$keep_export,
    'b'     => \$biblios,
    'a'     => \$authorities,
    'l=i'   => \$limit,
);

$directory = "export" unless $directory;

# "=i" guarantees an integer, so interpolating it into the SQL below is safe.
my $limit_clause = ( defined $limit && $limit > 0 ) ? "LIMIT $limit" : '';

my $dbh = C4::Context->dbh;

# Switch the NoZebra system preference on.
$dbh->do("update systempreferences set value=1 where variable='NoZebra'");

# Create the table on the first run only (IF NOT EXISTS keeps later runs from
# failing here), then empty it so the index is rebuilt from scratch.
# NOTE(review): the columns are declared latin1 while the table default is
# utf8 -- confirm this is intended for non-ASCII words.
$dbh->do("CREATE TABLE IF NOT EXISTS `nozebra` (
                `indexname` varchar(40) character set latin1 NOT NULL,
                `value` varchar(250) character set latin1 NOT NULL,
                `biblionumbers` longtext character set latin1 NOT NULL,
                KEY `indexname` (`indexname`),
                KEY `value` (`value`))
                ENGINE=InnoDB DEFAULT CHARSET=utf8");
$dbh->do("truncate nozebra");

# DISTINCT: a biblionumber appearing on several biblioitems rows would
# otherwise be indexed once per row (presumably possible -- verify against
# the schema).
my $biblio_sth = $dbh->prepare(
    "select distinct biblionumber from biblioitems order by biblionumber $limit_clause"
);
$biblio_sth->execute();

# index name => definition string; a subfield belongs to an index when the
# definition contains "<tag>*" or "<tag><subfield code>" (see the match below).
my %index = GetNoZebraIndexes();

# MARC tag/subfield mapped to biblio.title; the same for every record, so it
# is looked up once rather than per biblio.
my ( $titletag, $titlesubfield ) = GetMarcFromKohaField('biblio.title');

# $result{indexname}{word} = "biblionumber,title-weight;biblionumber,title-weight;..."
my %result;
my $i = 0;

while ( my ($biblionumber) = $biblio_sth->fetchrow_array ) {
    $i++;
    print "\r$i";

    my $record = GetMarcBiblio($biblionumber);
    unless ($record) {
        warn "\nbiblio $biblionumber: no MARC record found, skipped\n";
        next;
    }

    # Title of the record: its first 10 characters are stored with every
    # index entry.
    my $title = $record->subfield( $titletag, $titlesubfield );
    $title = '' unless defined $title;
    $title = lc $title;

    # Remove blanks, commas and other characters that could cause problems
    # when the entry string is decoded for CQL retrieval.
    $title =~ s/[ ,;\[\]()*\-'=]//g;

    # 10 characters should be enough, and keeps the table small.
    $title = substr( $title, 0, 10 );

    # Occurrences of each word in THIS record, per index.  Counting in a hash
    # and appending one finished entry per word avoids searching the growing
    # entry string with a regex built from the title (which broke on titles
    # containing regex metacharacters and on weights above 9).
    my %weight_of;

    foreach my $field ( $record->fields() ) {

        # Skip control fields (tags below 010).
        next if $field->tag < 10;
        my $tag = $field->tag();

        foreach my $subfield ( $field->subfields() ) {
            my ( $subfieldcode, $value ) = @$subfield;

            # Every index whose definition names this tag ("<tag>*") or this
            # exact subfield; \Q..\E keeps odd tags/codes from being read as
            # regex syntax.
            my @targets = grep {
                     $index{$_} =~ /\Q$tag\E\*/
                  or $index{$_} =~ /\Q$tag$subfieldcode\E/
            } keys %index;

            # Not claimed by any index: store it in the __RAW__ index anyway.
            @targets = ('__RAW__') unless @targets;

            my @words = _words_of($value);
            foreach my $key (@targets) {
                $weight_of{$key}{$_}++ foreach @words;
            }
        }
    }

    # Append this record's finished entries to the global result.
    foreach my $key ( keys %weight_of ) {
        foreach my $word ( keys %{ $weight_of{$key} } ) {
            $result{$key}->{$word} .= "$biblionumber,$title-$weight_of{$key}{$word};";
        }
    }
}

# Terminate the "\r<count>" progress line before anything else is printed.
print "\n";

my $insert_sth = $dbh->prepare(
    "INSERT INTO nozebra (indexname,value,biblionumbers) VALUES (?,?,?)"
);
foreach my $key ( keys %result ) {
    foreach my $word ( keys %{ $result{$key} } ) {
        my $biblionumbers = $result{$key}->{$word};
        $insert_sth->execute( $key, $word, $biblionumbers );

        # Report unusually long entry lists (very common words).
        if ( length($biblionumbers) > 40000 ) {
            print length($biblionumbers) . "\n for $key / $word\n";
        }
    }
}

# _words_of($text)
#
# Lower-cases $text, replaces punctuation by spaces and returns the list of
# non-empty words obtained by splitting on single spaces.  Returns an empty
# list for an undefined value.  Words are tested with length(), so the word
# "0" is kept.
sub _words_of {
    my ($text) = @_;
    return unless defined $text;

    my $line = lc $text;

    # Remove meaningless characters from the value ...
    $line =~ s/[-.?,;!'()\[\]{}"<>&+*\/=]/ /g;

    # ... and split it into words, dropping the empty strings that runs of
    # spaces produce.
    return grep { length } split / /, $line;
}