misc/migration_tools/rebuild_nozebra.pl

   1 #!/usr/bin/perl
   2
   3 use C4::Context;
   4 use Getopt::Long;
   5 use C4::Biblio;
   6 use C4::AuthoritiesMarc;
   7
   8 use strict;
   9 #
  10 # script that fills the nozebra table
  11 #
  12 #
  13
  14 $|=1; # flushes output
  15
  16 # limit for database dumping
  17 my $limit = "LIMIT 1000";
  18 my $directory;
  19 my $skip_export;
  20 my $keep_export;
  21 my $reset;
  22 my $biblios;
  23 my $authorities;
  24 GetOptions(
  25         'd:s'      => \$directory,
  26         'reset'      => \$reset,
  27         's'        => \$skip_export,
  28         'k'        => \$keep_export,
  29         'b'        => \$biblios,
  30         'a'        => \$authorities,
  31         );
  32
  33 $directory = "export" unless $directory;
  34 my $dbh=C4::Context->dbh;
  35 $dbh->do("truncate nozebra");
  36 my $sth;
  37 $sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
  38 $sth->execute();
  39 my $i=0;
  40 my %result;
  41
  42 my %index = (
  43     'title' => '200a,200c,200d',
  44     'author' =>'200f,700*,701*,702*'
  45     );
  46
  47 $|=1;
  48 while (my ($biblionumber) = $sth->fetchrow) {
  49     $i++;
  50     print "\r$i";
  51     my $record = GetMarcBiblio($biblionumber);
  52
  53     # get title of the record (to store the 10 first letters with the index)
  54     my $title;
  55     if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
  56         $title = lc($record->subfield('200','a'));
  57     } else {
  58         $title = lc($record->subfield('245','a'));
  59     }
  60     # remove blancks and comma (that could cause problem when decoding the string for CQL retrieval
  61     $title =~ s/ |,|;//g;
  62     # limit to 10 char, should be enough, and limit the DB size
  63     $title = substr($title,0,10);
  64     #parse each field
  65     foreach my $field ($record->fields()) {
  66         #parse each subfield
  67         next if $field->tag <10;
  68         foreach my $subfield ($field->subfields()) {
  69             my $tag = $field->tag();
  70             my $subfieldcode = $subfield->[0];
  71             my $indexed=0;
  72             # check each index to see if the subfield is stored somewhere
  73             # otherwise, store it in __RAW__ index
  74             foreach my $key (keys %index) {
  75                 if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfield/) {
  76                     $indexed=1;
  77                     my $line= lc $subfield->[1];
  78                     $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
  79                     foreach (split / /,$line) {
  80                         $result{$key}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9';
  81                     }
  82                 }
  83             }
  84             # the subfield is not indexed, store it in __RAW__ index anyway
  85             unless ($indexed) {
  86                 my $line= lc $subfield->[1];
  87                 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\// /g;
  88                 foreach (split / /,$line) {
  89                     $result{'__RAW__'}->{$_}.="$biblionumber;$title," unless $subfield->[0] eq '9';
  90                 }
  91             }
  92         }
  93     }
  94 }
  95 my $sth = $dbh->prepare("INSERT INTO nozebra (indexname,value,biblionumbers) VALUES (?,?,?)");
  96 foreach my $key (keys %result) {
  97     foreach my $index (keys %{$result{$key}}) {
  98         $sth->execute($key,$index,$result{$key}->{$index});
  99     }
 100 }