#!/usr/bin/perl

#
# Script that fills the nozebra table.
#
# It switches the 'NoZebra' system preference on, empties the nozebra table,
# optionally (re)installs a default 'NoZebraIndexes' preference, and then
# rebuilds the word index for every biblio record and every authority record.
#
# For each indexed word the table holds one row per (server, index name)
# whose value is a string of "id,shorttitle-weight;" entries; the weight is
# the number of times the word occurs in the record for that index.
#

use strict;
use warnings;    # enabled as requested by "FIXME - Bug 2505"

use Getopt::Long;

use C4::Context;
use C4::Biblio;
use C4::AuthoritiesMarc;

$| = 1;    # flushes output

# Characters removed from the short title stored next to each record id
# (they could cause problems when decoding the string for CQL retrieval,
# or are regexp specific).
my $TITLE_NOISE_RE = qr/[ .,;\[\]()*\-'=:]/;

# Characters turned into a space before a subfield is split into words.
# The __RAW__ index deliberately keeps ':' (as the original code did).
my $INDEXED_NOISE_RE = qr/[-.?,;!'()\[\]{}"<>&+*\/=:]/;
my $RAW_NOISE_RE     = qr/[-.?,;!'()\[\]{}"<>&+*\/=]/;

# Optional SQL LIMIT clause for database dumping, e.g. "LIMIT 100" when
# debugging. Kept as an empty string so it interpolates cleanly.
my $limit = '';

my $directory;
my $sysprefs;
my $commit;
my $want_help;

my $result = GetOptions(
    'd:s'      => \$directory,
    's'        => \$sysprefs,     # rebuild 'NoZebraIndexes' syspref
    'h|help'   => \$want_help,
    'commit:f' => \$commit,
);

if ( not $result or $want_help ) {
    print_usage();
    exit 0;
}

# Number of inserted rows between two commits. Non-positive or fractional
# values below 1 are ignored: they would end up as a zero modulus below.
my $commitnum = 1000;
$commitnum = int($commit) if $commit && $commit >= 1;

$directory = "export" unless $directory;

my $dbh = C4::Context->dbh;
$dbh->do("update systempreferences set value=1 where variable='NoZebra'");
$dbh->do("truncate nozebra");

my %index = GetNoZebraIndexes();
if ( !%index || $sysprefs ) {
    my $definition = default_index_definition( C4::Context->preference('marcflavour') );
    if ( defined $definition ) {
        $dbh->do(
            "UPDATE systempreferences SET value=? WHERE variable='NoZebraIndexes'",
            undef, $definition
        );

        # NOTE(review): if C4::Context caches preference values, this second
        # read may still see the previous value - confirm.
        %index = GetNoZebraIndexes();
    }
}

$dbh->{AutoCommit} = 0;

print "***********************************\n";
print "***** building BIBLIO indexes *****\n";
print "***********************************\n";

my $sth = $dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit");
$sth->execute();

# The MARC location of the title is the same for every record: look it up once.
my ( $titletag, $titlesubfield ) = GetMarcFromKohaField( 'biblio.title', '' );

my $i = 0;
my %result;
while ( my ($biblionumber) = $sth->fetchrow_array ) {
    $i++;
    print "\r$i";

    my $record = eval { GetMarcBiblio($biblionumber) };
    if ($@) {
        print " There was some pb getting biblionumber : " . $biblionumber . "\n";
        next;
    }
    next unless $record;

    # get title of the record (to store the 10 first letters with the index)
    my $title = $record->subfield( $titletag, $titlesubfield );
    $title = '' unless defined $title;
    $title = lc $title;
    $title =~ s/$TITLE_NOISE_RE//g;

    # limit to 10 char, should be enough, and limit the DB size
    $title = substr( $title, 0, 10 );

    index_record( $record, $biblionumber, $title, \%index, \%result );
}

print "\nInserting records...\n";
store_index( $dbh, 'biblioserver', \%result, $commitnum );
print "\nbiblios done\n";

print "\n***********************************\n";
print "***** building AUTHORITIES indexes *****\n";
print "***********************************\n";

$sth = $dbh->prepare("select authid from auth_header order by authid $limit");
$sth->execute();

$i      = 0;
%result = ();
while ( my ($authid) = $sth->fetchrow_array ) {
    $i++;
    print "\r$i";

    my $record = eval { GetAuthority($authid) };
    if ($@) {
        print " There was some pb getting authnumber : " . $authid . "\n";
        next;
    }
    next unless $record;

    # for authorities, the "title" is the $a mainentry
    my $authref = C4::AuthoritiesMarc::GetAuthType( C4::AuthoritiesMarc::GetAuthTypeCode($authid) );
    warn "ERROR : authtype undefined for " . $record->as_formatted unless $authref;

    my $report_tag = $authref ? $authref->{auth_tag_to_report} : undef;
    $report_tag = '' unless defined $report_tag;

    # The authority indexes depend on the authority type of each record.
    my %auth_index = (
        'mainmainentry' => $report_tag . 'a',
        'mainentry'     => $report_tag . '*',
        'auth_type'     => '152b',
    );

    my $title = length $report_tag ? $record->subfield( $report_tag, 'a' ) : undef;
    $title = '' unless defined $title;
    $title =~ s/$TITLE_NOISE_RE//g;

    # The stored short title is kept exactly as before (escaped, then cut to
    # 10 characters) so that existing consumers see the same value.
    # NOTE(review): cutting after quotemeta can split an escape sequence, and
    # the biblio section stores an unescaped, lower-cased title - confirm
    # which form the search code expects.
    $title = quotemeta $title;
    $title = substr( $title, 0, 10 );

    index_record( $record, $authid, $title, \%auth_index, \%result );
}

print "\nInserting...\n";
store_index( $dbh, 'authorityserver', \%result, $commitnum );
print "\nauthorities done\n";

# Print the command line help.
sub print_usage {
    print <<"_USAGE_";
$0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra").

Use this batch job to reindex all biblio and authority
records in your Koha database.  This job is useful
only if you are NOT using Zebra ('NoZebra'); if you are
using the 'Zebra'mode, this job should NOT be used.

Parameters:
    -d                      Export directory (defaults to "export").
                            Accepted for compatibility; this script
                            does not currently read or write it.
    -s                      Rebuild "NoZebraIndexes" System Preference
    --commit N              Commit the inserts every N rows
                            (default 1000).

    --help or -h            show this message.
_USAGE_
    return;
}    # END of print_usage sub

# Return the default 'NoZebraIndexes' preference value for a MARC flavour
# ('UNIMARC' or 'MARC21'), or undef for any other flavour.
# The value is a list of "'index' => 'tag+subfield,...'" entries, one per
# line; "tag*" means every subfield of the tag.
sub default_index_definition {
    my ($marcflavour) = @_;
    return unless defined $marcflavour;

    my @entries;
    if ( $marcflavour eq 'UNIMARC' ) {
        @entries = (
            q{'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a'},
            q{'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,701a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d'},
            q{'isbn' => '010a'},
            q{'issn' => '011a'},
            q{'biblionumber' =>'0909'},
            q{'itemtype' => '200b'},
            q{'language' => '101a'},
            q{'publisher' => '210c'},
            q{'date' => '210d'},
            q{'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a'},
            q{'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109,7009,7019,7029,7109,7119,7129'},
            q{'subject' => '600*,601*,606*,610*'},
            q{'dewey' => '676a'},
            q{'host-item' => '995a,995c'},
        );
    }
    elsif ( $marcflavour eq 'MARC21' ) {
        @entries = (
            q{'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a'},
            q{'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a'},
            q{'isbn' => '020a'},
            q{'issn' => '022a'},
            q{'lccn' => '010a'},
            q{'biblionumber' => '999c'},
            q{'itemtype' => '942c'},
            q{'publisher' => '260b'},
            q{'date' => '260c'},
            q{'note' => '500a, 501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a'},
            q{'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*'},
            # A bare tag never matches the "tag*" / "tag+subfield" tests used
            # by index_record, so the subfield has to be spelled out.
            q{'dewey' => '082a'},
            q{'bc' => '952p'},
            q{'callnum' => '952o'},
            q{'an' => '6009,6109,6119'},
            q{'series' => '440*,490*'},
            q{'host-item' => '9529'},
            q{'shelf' => '952c'},
            q{'collection' => '9528'},
        );
    }
    else {
        return;
    }

    # One entry per line, each terminated by a comma (trailing one included,
    # as in the previous value).
    return join( ",\n", @entries ) . ",";
}

# Lower-case $text, replace every character matching $noise_re by a space
# and return the list of non-empty words.
sub tokenize {
    my ( $text, $noise_re ) = @_;
    $text = '' unless defined $text;

    my $line = lc $text;
    $line =~ s/$noise_re/ /g;

    # skip empty values only (multiple spaces); a word such as "0" is kept
    return grep { length } split / /, $line;
}

# Record one more occurrence of $word for record $id in the $entries hash
# (word => "id,title-weight;id,title-weight;..."). If the record is already
# listed for this word its weight is incremented, otherwise it is added with
# weight 1. The (re)written entry always goes to the end of the string.
sub add_occurrence {
    my ( $entries, $word, $id, $title ) = @_;

    my $weight = 1;

    # Anchored on an entry boundary so that id 1 cannot match inside id 11,
    # with the title taken literally, and with a multi-digit weight so that
    # counting continues past 9.
    if ( defined $entries->{$word}
        && $entries->{$word} =~ s/(^|;)\Q$id\E,\Q$title\E-(\d+);/$1/ )
    {
        $weight = $2 + 1;
    }

    $entries->{$word} .= "$id,$title-$weight;";
    return;
}

# Add every word of every subfield of $record to $result.
#   $record   - MARC record object (must provide fields/tag/subfields)
#   $id       - biblionumber or authid stored in the index entries
#   $title    - short title stored next to the id
#   $index_of - hashref: index name => comma separated "tag+subfield"/"tag*"
#   $result   - hashref filled in place: index name => { word => entries }
# A subfield that belongs to no index is stored in the __RAW__ index.
sub index_record {
    my ( $record, $id, $title, $index_of, $result ) = @_;

    foreach my $field ( $record->fields() ) {
        my $tag = $field->tag();

        # Tags below 010 are skipped, as before. The comparison is numeric,
        # so silence the warning a non-numeric tag would trigger.
        my $skip = do { no warnings 'numeric'; $tag < 10 };
        next if $skip;

        foreach my $subfield ( $field->subfields() ) {
            my ( $code, $value ) = @{$subfield};
            my $indexed = 0;

            # check each index to see if the subfield is stored somewhere
            foreach my $key ( keys %{$index_of} ) {
                my $spec = $index_of->{$key};
                next unless defined $spec;
                next unless ( $spec =~ /\Q$tag\E\*/ || $spec =~ /\Q$tag$code\E/ );

                $indexed = 1;
                my $entries = ( $result->{$key} ||= {} );
                foreach my $word ( tokenize( $value, $INDEXED_NOISE_RE ) ) {
                    add_occurrence( $entries, $word, $id, $title );
                }
            }

            # the subfield is not indexed, store it in __RAW__ index anyway
            next if $indexed;
            my $raw = ( $result->{__RAW__} ||= {} );
            foreach my $word ( tokenize( $value, $RAW_NOISE_RE ) ) {
                add_occurrence( $raw, $word, $id, $title );
            }
        }
    }
    return;
}

# Insert the content of $result (index name => { word => entries }) into the
# nozebra table for $server, committing every $commitnum rows and once more
# at the end.
sub store_index {
    my ( $dbh, $server, $result, $commitnum ) = @_;

    my $insert = $dbh->prepare(
        "INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES (?,?,?,?)"
    );

    my $count = 0;
    foreach my $key ( keys %{$result} ) {
        my $entries = $result->{$key};
        foreach my $word ( keys %{$entries} ) {
            my $size = length $entries->{$word};
            if ( $size > 1000000 ) {
                print "very long index ($size)for $key / $word. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
            }
            print "\r$count";
            $count++;
            $insert->execute( $server, $key, $word, $entries->{$word} );
            $dbh->commit() if ( 0 == $count % $commitnum );
        }
    }
    $dbh->commit;
    return;
}