Koha/misc/migration_tools/rebuild_nozebra.pl

343 lines
13 KiB
Perl
Executable file

#!/usr/bin/perl
use C4::Context;
use Getopt::Long;
use C4::Biblio;
use C4::AuthoritiesMarc;
use strict;
#use warnings; FIXME - Bug 2505
#
# script that fills the nozebra table
#
#
# ---------------------------------------------------------------------------
# Command-line handling.
#   -d [DIR]      directory option (optional string value)
#   -s            rebuild the 'NoZebraIndexes' system preference
#   -h | --help   print the usage text and exit
#   --commit [N]  integer batch size read into $commit
# ---------------------------------------------------------------------------
$| = 1;    # flushes output

# limit for database dumping
my $limit;    # = "LIMIT 100";
my $directory;

#my $skip_export;
#my $keep_export;
#my $reset;
#my $biblios;
my $authorities;
my $sysprefs;
my $commit;
my $want_help;
my $result = GetOptions(
    'd:s' => \$directory,

    # 'reset' => \$reset,
    # 's' => \$skip_export, # Not used and conflicts with 's' option some lines below for sysprefs!!!
    # 'k' => \$keep_export,
    # 'b' => \$biblios,
    # 'a' => \$authorities,
    's'      => \$sysprefs,     # rebuild 'NoZebraIndexes' syspref
    'h|help' => \$want_help,

    # Integer, not float: the value is later used as the right-hand side of
    # a modulus, where a fractional value such as 0.5 truncates to zero and
    # makes perl die with "Illegal modulus zero".
    'commit:i' => \$commit,
);
if ( not $result or $want_help ) {
    print_usage();

    # An explicit help request is a success; unparsable options are not.
    exit( $result ? 0 : 1 );
}
# Prints the command-line help text for this script on STDOUT.
# Takes no arguments. $0 (the script name) is interpolated into the text.
sub print_usage {
    print <<_USAGE_;
$0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra").
Use this batch job to reindex all biblio and authority
records in your Koha database. This job is useful
only if you are NOT using Zebra ('NoZebra'); if you are
using the 'Zebra' mode, this job should NOT be used.
Parameters:
-d Temporary directory for indexing.
If not specified, "export" is used.
-s Rebuild "NoZebraIndexes" System Preference
--help or -h show this message.
_USAGE_
}    # END of print_usage sub
# ---------------------------------------------------------------------------
# Global setup: batch size, database handle, NoZebra switch, index definitions.
# ---------------------------------------------------------------------------

# Number of rows per commit; --commit overrides the default of 1000.
# Only a value of at least 1 is accepted, and it is truncated to an integer,
# because it is meant to be used as a modulus.
my $commitnum = 1000;
$commitnum = int($commit) if ( $commit && int($commit) >= 1 );
$directory = "export" unless $directory;

my $dbh = C4::Context->dbh;

# Switch the installation to NoZebra mode and start from an empty index table.
$dbh->do("update systempreferences set value=1 where variable='NoZebra'");
$dbh->do("truncate nozebra");

# Default index definitions, one "'index' => 'tag+subfield list'," entry per
# line. A trailing '*' after a tag means "every subfield of that tag".
my $unimarc_default_indexes = <<'END_UNIMARC_INDEXES';
'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,701a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
'isbn' => '010a',
'issn' => '011a',
'biblionumber' =>'0909',
'itemtype' => '200b',
'language' => '101a',
'publisher' => '210c',
'date' => '210d',
'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109,7009,7019,7029,7109,7119,7129',
'subject' => '600*,601*,606*,610*',
'dewey' => '676a',
'host-item' => '995a,995c',
END_UNIMARC_INDEXES

# The UNIMARC value has always been stored without a trailing newline.
chomp $unimarc_default_indexes;

# The MARC21 value has always been stored with a leading and a trailing
# newline; that layout is kept. The 'biblionumber', 'series' and 'host-item'
# entries previously had missing quotes/commas and are now well-formed.
my $marc21_default_indexes = "\n" . <<'END_MARC21_INDEXES';
'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a',
'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a',
'isbn' => '020a',
'issn' => '022a',
'lccn' => '010a',
'biblionumber' => '999c',
'itemtype' => '942c',
'publisher' => '260b',
'date' => '260c',
'note' => '500a, 501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a',
'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*',
'dewey' => '082',
'bc' => '952p',
'callnum' => '952o',
'an' => '6009,6109,6119',
'series' => '440*,490*',
'host-item' => '9529',
'shelf' => '952c',
'collection' => '9528',
END_MARC21_INDEXES

my %default_indexes_for = (
    UNIMARC => $unimarc_default_indexes,
    MARC21  => $marc21_default_indexes,
);

my %index = GetNoZebraIndexes();

# (Re)install the defaults when no index is defined yet or when -s was given.
# Other MARC flavours have no built-in default and are left untouched.
if ( !%index || $sysprefs ) {
    my $marcflavour = C4::Context->preference('marcflavour');
    if ( defined $marcflavour && exists $default_indexes_for{$marcflavour} ) {

        # Bind the value instead of embedding it in the SQL text, so the
        # quotes inside the index definitions need no escaping.
        $dbh->do(
            "UPDATE systempreferences SET value=? WHERE variable='NoZebraIndexes'",
            undef,
            $default_indexes_for{$marcflavour},
        );
        %index = GetNoZebraIndexes();
    }
}
# ---------------------------------------------------------------------------
# Build the bibliographic indexes in memory, then write them to `nozebra`.
#
# %result maps  index name -> word -> "biblionumber,title-weight;" entries,
# where title is the first 10 characters of the cleaned, lower-cased title
# and weight counts how often the word occurred for that record in that index.
# ---------------------------------------------------------------------------
$| = 1;
$dbh->{AutoCommit} = 0;
print "***********************************\n";
print "***** building BIBLIO indexes *****\n";
print "***********************************\n";
my $sth;
my $i = 0;
my %result;

# Characters replaced by a space before a subfield value is split into words.
# The two sets differ only by ':' (not replaced for the __RAW__ index).
my $biblio_indexed_punct = qr/[-.?,;!'()\[\]{}"<>&+*\/=:]/;
my $biblio_raw_punct     = qr/[-.?,;!'()\[\]{}"<>&+*\/=]/;

# Records one occurrence of $word for a biblio in index $indexname.
# If the record already has an entry for the word, that entry is removed and
# re-appended with its weight incremented; otherwise a weight-1 entry is added.
my $count_biblio_word = sub {
    my ( $indexname, $word, $biblionumber, $title ) = @_;
    my $entries_for = $result{$indexname} ||= {};

    # (?<![^;]) anchors the id at the start of an entry, so biblionumber 1
    # can never match inside "11,...". (\d+) accepts multi-digit weights;
    # a single \d stopped matching at weight 10 and caused duplicates.
    if ( defined $entries_for->{$word}
        && $entries_for->{$word} =~ s/(?<![^;])$biblionumber,\Q$title\E-(\d+);// )
    {
        my $weight = $1 + 1;
        $entries_for->{$word} .= "$biblionumber,$title-$weight;";
    }
    else {
        $entries_for->{$word} .= "$biblionumber,$title-1;";
    }
    return;
};

# The MARC field holding biblio.title is the same for every record, so it is
# looked up once instead of once per record.
my ( $titletag, $titlesubfield ) = GetMarcFromKohaField( 'biblio.title', '' );

$sth = $dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
$sth->execute();
while ( my ($biblionumber) = $sth->fetchrow ) {
    $i++;
    print "\r$i";
    my $record;
    eval { $record = GetMarcBiblio($biblionumber); };
    if ($@) {
        print " There was some pb getting biblionumber : " . $biblionumber . "\n";
        next;
    }
    next unless $record;

    # get title of the record (to store the 10 first letters with the index)
    my $title = lc( $record->subfield( $titletag, $titlesubfield ) );

    # remove blanks, commas (that could cause problems when decoding the string
    # for CQL retrieval) and regexp specific values
    $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|=|://g;

    # limit to 10 char, should be enough, and limit the DB size
    $title = substr( $title, 0, 10 );

    foreach my $field ( $record->fields() ) {
        next if $field->tag < 10;    # skip fields with a tag below 010
        my $tag = $field->tag();
        foreach my $subfield ( $field->subfields() ) {
            my $subfieldcode = $subfield->[0];

            # every configured index that lists this tag ("245*") or this
            # exact tag+subfield ("245a")
            my @matching_indexes = grep {
                     $index{$_} =~ /\Q$tag\E\*/
                  || $index{$_} =~ /\Q$tag$subfieldcode\E/
            } keys %index;

            if (@matching_indexes) {
                my $line = lc $subfield->[1];

                # remove meaningless characters, then split in words
                $line =~ s/$biblio_indexed_punct/ /g;
                my @words = grep { $_ } split / /, $line;    # drop empty pieces
                foreach my $indexname (@matching_indexes) {
                    foreach my $word (@words) {
                        $count_biblio_word->( $indexname, $word, $biblionumber, $title );
                    }
                }
            }
            else {
                # the subfield is not indexed, store it in __RAW__ index anyway
                my $line = lc $subfield->[1];
                $line =~ s/$biblio_raw_punct/ /g;
                foreach my $word ( grep { $_ } split / /, $line ) {
                    $count_biblio_word->( '__RAW__', $word, $biblionumber, $title );
                }
            }
        }
    }
}

print "\nInserting records...\n";
$i = 0;

# Use the batch size computed from --commit near the top of the script; it
# used to be shadowed here by a hard-coded 100. Clamp to an integer >= 1
# because it is the right-hand side of a modulus.
my $biblio_commit_every = int($commitnum) >= 1 ? int($commitnum) : 1;
$dbh->{AutoCommit} = 0;
$sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('biblioserver',?,?,?)");
foreach my $key ( keys %result ) {
    foreach my $index ( keys %{ $result{$key} } ) {
        if ( length( $result{$key}->{$index} ) > 1000000 ) {
            print "very long index (" . length( $result{$key}->{$index} ) . ")for $key / $index. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
        }
        print "\r$i";
        $i++;
        $sth->execute( $key, $index, $result{$key}->{$index} );
        $dbh->commit() if ( 0 == $i % $biblio_commit_every );
    }
}

# flush whatever is left of the last, partial batch
$dbh->commit;
print "\nbiblios done\n";
# ---------------------------------------------------------------------------
# Build the authority indexes in memory, then write them to `nozebra`.
#
# %result maps  index name -> word -> "authid,title-weight;" entries, where
# title is the first 10 characters of the cleaned $a of the authority type's
# report tag and weight counts the word's occurrences for that authority.
# ---------------------------------------------------------------------------
print "\n***********************************\n";
print "***** building AUTHORITIES indexes *****\n";
print "***********************************\n";
$sth = $dbh->prepare("select authid from auth_header order by authid $limit");
$sth->execute();
$i      = 0;
%result = ();

# Characters replaced by a space before a subfield value is split into words.
# The two sets differ only by ':' (not replaced for the __RAW__ index).
my $auth_indexed_punct = qr/[-.?,;!'()\[\]{}"<>&+*\/=:]/;
my $auth_raw_punct     = qr/[-.?,;!'()\[\]{}"<>&+*\/=]/;

# Records one occurrence of $word for an authority in index $indexname.
# If the authority already has an entry for the word, that entry is removed
# and re-appended with its weight incremented; otherwise a weight-1 entry is
# added.
my $count_auth_word = sub {
    my ( $indexname, $word, $authid, $title ) = @_;
    my $entries_for = $result{$indexname} ||= {};

    # (?<![^;]) anchors the id at the start of an entry; \Q..\E matches the
    # title literally; (\d+) accepts multi-digit weights (a single \d stopped
    # matching at weight 10 and caused duplicate entries).
    if ( defined $entries_for->{$word}
        && $entries_for->{$word} =~ s/(?<![^;])$authid,\Q$title\E-(\d+);// )
    {
        my $weight = $1 + 1;
        $entries_for->{$word} .= "$authid,$title-$weight;";
    }
    else {
        $entries_for->{$word} .= "$authid,$title-1;";
    }
    return;
};

while ( my ($authid) = $sth->fetchrow ) {
    $i++;
    print "\r$i";
    my $record;
    eval { $record = GetAuthority($authid); };
    if ($@) {
        print " There was some pb getting authnumber : " . $authid . "\n";
        next;
    }

    # Same guard as for biblios: a false return value would otherwise crash
    # on the method calls below.
    next unless $record;

    # Per-authority index definitions; this deliberately shadows the
    # file-level %index used for biblios.
    my %index;

    # for authorities, the "title" is the $a mainentry
    my $authref = C4::AuthoritiesMarc::GetAuthType( C4::AuthoritiesMarc::GetAuthTypeCode($authid) );
    warn "ERROR : authtype undefined for " . $record->as_formatted unless $authref;
    my $title = $record->subfield( $authref->{auth_tag_to_report}, 'a' );
    $index{'mainmainentry'} = $authref->{'auth_tag_to_report'} . 'a';
    $index{'mainentry'}     = $authref->{'auth_tag_to_report'} . '*';
    $index{'auth_type'}     = '152b';
    $title = '' unless defined $title;

    # remove blanks, commas (that could cause problems when decoding the string
    # for CQL retrieval) and regexp specific values
    $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|:|=//g;

    # limit to 10 char, should be enough, and limit the DB size.
    # The title is no longer passed through quotemeta: that put backslashes
    # into the stored value which the weight regex could never match, and the
    # truncation could cut an escape sequence in half. Escaping now happens
    # only inside the regex, via \Q..\E.
    $title = substr( $title, 0, 10 );

    foreach my $field ( $record->fields() ) {
        next if $field->tag < 10;    # skip fields with a tag below 010
        my $tag = $field->tag();
        foreach my $subfield ( $field->subfields() ) {
            my $subfieldcode = $subfield->[0];

            # every index that lists this tag ("200*") or this exact
            # tag+subfield ("200a")
            my @matching_indexes = grep {
                     $index{$_} =~ /\Q$tag\E\*/
                  || $index{$_} =~ /\Q$tag$subfieldcode\E/
            } keys %index;

            if (@matching_indexes) {
                my $line = lc $subfield->[1];

                # remove meaningless characters, then split in words
                $line =~ s/$auth_indexed_punct/ /g;
                my @words = grep { $_ } split / /, $line;    # drop empty pieces
                foreach my $indexname (@matching_indexes) {
                    foreach my $word (@words) {
                        $count_auth_word->( $indexname, $word, $authid, $title );
                    }
                }
            }
            else {
                # the subfield is not indexed, store it in __RAW__ index anyway
                my $line = lc $subfield->[1];
                $line =~ s/$auth_raw_punct/ /g;
                foreach my $word ( grep { $_ } split / /, $line ) {
                    $count_auth_word->( '__RAW__', $word, $authid, $title );
                }
            }
        }
    }
}

print "\nInserting...\n";
$i = 0;

# Use the file-level $commitnum instead of re-declaring a hard-coded 100
# here. Clamp to an integer >= 1 because it is the right-hand side of a
# modulus.
my $auth_commit_every = int($commitnum) >= 1 ? int($commitnum) : 1;
$dbh->{AutoCommit} = 0;
$sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('authorityserver',?,?,?)");
foreach my $key ( keys %result ) {
    foreach my $index ( keys %{ $result{$key} } ) {
        if ( length( $result{$key}->{$index} ) > 1000000 ) {
            print "very long index (" . length( $result{$key}->{$index} ) . ")for $key / $index. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
        }
        print "\r$i";
        $i++;
        $sth->execute( $key, $index, $result{$key}->{$index} );
        $dbh->commit() if ( 0 == $i % $auth_commit_every );
    }
}

# flush whatever is left of the last, partial batch
$dbh->commit;
print "\nauthorities done\n";