6 use C4::AuthoritiesMarc;
9 #use warnings; FIXME - Bug 2505
11 # script that fills the nozebra table
15 $|=1; # flushes output
17 # limit for database dumping
18 my $limit;# = "LIMIT 100";
29 my $result = GetOptions(
32 # 's' => \$skip_export, # Not used and conflicts with 's' option some lines below for sysprefs!!!
33 # 'k' => \$keep_export,
35 # 'a' => \$authorities,
36 's' => \$sysprefs, # rebuild 'NoZebraIndexes' syspref
37 'h|help' => \$want_help,
38 'commit:f' => \$commit,
41 if (not $result or $want_help) {
49 $0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra").
51 Use this batch job to reindex all biblio and authority
52 records in your Koha database. This job is useful
53 only if you are NOT using Zebra ('NoZebra'); if you are
54 using the 'Zebra'mode, this job should NOT be used.
57 -d Temporary directory for indexing.
58 If not specified, one is automatically
59 created. The export directory
60 is automatically deleted unless
61 you supply the -k switch.
63 -s Rebuild "NoZebraIndexes" System Preference
65 --help or -h show this message.
67 } # END of print_usage sub
71 $commitnum = $commit if ($commit) ; # a true --commit value overrides $commitnum
73 $directory = "export" unless $directory; # fall back to "export" when no directory is set
74 my $dbh=C4::Context->dbh;
75 $dbh->do("update systempreferences set value=1 where variable='NoZebra'"); # force the NoZebra system preference to 1
77 $dbh->do("truncate nozebra"); # empty the nozebra table; it is refilled by the INSERTs further down
79 my %index = GetNoZebraIndexes(); # index name => field list; presumably read from the NoZebraIndexes preference (helper not shown here)
81 if (!%index || $sysprefs ) {
82 if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
# Store the default UNIMARC index definitions in the 'NoZebraIndexes' system
# preference. Each entry maps an index name to the tag+subfield codes that feed
# it; "tag*" means every subfield of that tag.
# Fix: the author list named 700a twice and never 701a.
83 $dbh->do("UPDATE systempreferences SET value=\"'title' => '200a,200c,200d,200e,225a,225d,225e,225f,225h,225i,225v,500*,501*,503*,510*,512*,513*,514*,515*,516*,517*,518*,519*,520*,530*,531*,532*,540*,541*,545*,604t,610t,605a',
84 'author' =>'200f,600a,601a,604a,700a,700b,700c,700d,701a,701b,701c,701d,702a,702b,702c,702d,710a,710b,710c,710d,711a,711b,711c,711d,712a,712b,712c,712d',
87 'biblionumber' =>'0909',
90 'publisher' => '210c',
92 'note' => '300a,301a,302a,303a,304a,305a,306az,307a,308a,309a,310a,311a,312a,313a,314a,315a,316a,317a,318a,319a,320a,321a,322a,323a,324a,325a,326a,327a,328a,330a,332a,333a,336a,337a,345a',
93 'Koha-Auth-Number' => '6009,6019,6029,6039,6049,6059,6069,6109,7009,7019,7029,7109,7119,7129',
94 'subject' => '600*,601*,606*,610*',
96 'host-item' => '995a,995c',\" where variable='NoZebraIndexes'");
97 %index = GetNoZebraIndexes();
98 } elsif (C4::Context->preference('marcflavour') eq 'MARC21') {
# Store the default MARC21 index definitions in the 'NoZebraIndexes' system
# preference. Each entry maps an index name to the tag+subfield codes that feed
# it; "tag*" means every subfield of that tag.
# Fixes: the 'biblionumber' key was missing its closing quote, the 'series'
# value was not quoted at all, and the note list contained a stray space.
99 $dbh->do("UPDATE systempreferences SET value=\"
100 'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a',
101 'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a',
105 'biblionumber' => '999c',
106 'itemtype' => '942c',
107 'publisher' => '260b',
109 'note' => '500a,501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a',
110 'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*',
114 'an' => '6009,6109,6119',
115 'series' => '440*,490*',
118 'collection' => '9528',
119 \"WHERE variable='NoZebraIndexes'");
121 %index = GetNoZebraIndexes();
126 $dbh->{AutoCommit} = 0; # switch to explicit commits (see the commit() calls in the insert phase)
128 print "***********************************\n";
129 print "***** building BIBLIO indexes *****\n";
130 print "***********************************\n";
133 $sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit"); # $limit is an optional LIMIT clause; it is declared without a value near the top of the script
137 while (my ($biblionumber) = $sth->fetchrow) {
142 $record = GetMarcBiblio($biblionumber);
145 print " There was some pb getting biblionumber : ".$biblionumber."\n";
149 # Get the record's title: its first 10 characters are stored next to the biblionumber in every index entry.
150 my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title', '');
151 my $title = lc($record->subfield($titletag,$titlesubfield));
153 # Strip blanks, commas and other punctuation (they could cause problems when the string is decoded for CQL retrieval) as well as regexp-special characters.
154 $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|=|://g;
155 # Keep at most 10 characters: enough for the purpose, and it limits the DB size.
156 $title = substr($title,0,10);
158 foreach my $field ($record->fields()) {
160 next if $field->tag <10; # skip tags below 010 (presumably the MARC control fields, which carry no subfields)
161 foreach my $subfield ($field->subfields()) {
162 my $tag = $field->tag();
163 my $subfieldcode = $subfield->[0]; # each subfield is a [code, value] pair
165 # Check every index definition to see whether this tag/subfield feeds it ("tag*" or "tag+code");
166 # subfields that feed no index are stored in the __RAW__ index instead.
167 foreach my $key (keys %index) {
168 if ($index{$key} =~ /\Q$tag\E\*/ or $index{$key} =~ /\Q$tag$subfieldcode\E/) {
170 my $line= lc $subfield->[1];
171 # Replace punctuation that carries no meaning for searching with spaces...
172 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
173 # ... and split the value into words.
174 foreach (split / /,$line) {
175 next unless $_; # skip empty values (multiple spaces)
176 # remove any accented char
177 # If this record is already listed for the word, raise its weight instead of appending a duplicate. NOTE(review): (\d) captures a single digit only, so an entry whose weight exceeds 9 would no longer match - confirm against the $weight computation, which is not visible here.
178 if ($result{$key}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) {
180 $result{$key}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
181 $result{$key}->{"$_"} .= "$biblionumber,$title-$weight;";
182 # Otherwise create the entry with weight=1. Entry format: "biblionumber,title-weight;".
184 $result{$key}->{"$_"}.="$biblionumber,$title-1;";
189 # The subfield feeds no declared index: store its words in the __RAW__ index anyway.
191 my $line= lc $subfield->[1];
192 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
193 foreach (split / /,$line) {
195 # warn $record->as_formatted."$_ =>".$title;
196 if ($result{__RAW__}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) {
199 $result{__RAW__}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//;
200 $result{__RAW__}->{"$_"} .= "$biblionumber,$title-$weight;";
202 $result{__RAW__}->{"$_"}.="$biblionumber,$title-1;";
# Write the in-memory biblio index (%result: index name => word => entry list)
# into the nozebra table under server 'biblioserver'.
211 print "\nInserting records...\n";
215 $dbh->{AutoCommit} = 0;
217 $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('biblioserver',?,?,?)");
218 foreach my $key (keys %result) {
219 foreach my $index (keys %{$result{$key}}) {
220 if (length($result{$key}->{$index}) > 1000000) {
# Fix: the MySQL setting that limits statement size is max_allowed_packet (the message used to name a non-existent "max_paquet_size").
221 print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
225 $sth->execute($key,$index,$result{$key}->{$index});
# Commit whenever $i is a multiple of $commitnum ($i is presumably the running row counter; its increment is not visible here).
226 $dbh->commit() if (0 == $i % $commitnum);
228 $dbh->commit() if (0 == $i % $commitnum);
233 print "\nbiblios done\n";
# Build the in-memory authority index: for every authority record, split each
# subfield value into words and record "authid,title-weight;" entries under the
# indexes that the tag/subfield feeds (or under __RAW__ when it feeds none).
235 print "\n***********************************\n";
236 print "***** building AUTHORITIES indexes *****\n";
237 print "***********************************\n";
239 $sth=$dbh->prepare("select authid from auth_header order by authid $limit");
243 while (my ($authid) = $sth->fetchrow) {
248 $record = GetAuthority($authid);
251 print " There was some pb getting authnumber : ".$authid."\n";
256 # For authorities, the "title" is subfield a of the main-entry tag (auth_tag_to_report).
257 my $authref = C4::AuthoritiesMarc::GetAuthType(C4::AuthoritiesMarc::GetAuthTypeCode($authid));
259 warn "ERROR : authtype undefined for ".$record->as_formatted unless $authref;
260 my $title = $record->subfield($authref->{auth_tag_to_report},'a');
# Per-record index definitions derived from the authority type's main-entry tag.
261 $index{'mainmainentry'}= $authref->{'auth_tag_to_report'}.'a';
262 $index{'mainentry'} = $authref->{'auth_tag_to_report'}.'*';
263 $index{'auth_type'} = '152b';
265 # Strip blanks, commas and other punctuation (they could cause problems when the string is decoded for CQL retrieval) as well as regexp-special characters.
266 $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|:|=//g;
267 # Fix: the title is no longer run through quotemeta. That put backslashes into the stored entries, so the "already indexed" regexes below (which interpolated the escaped title) never matched the stored text, and truncating afterwards could cut an escape pair in half. The title is now quoted with \Q...\E at match time, as in the biblio loop.
268 # Keep at most 10 characters: enough for the purpose, and it limits the DB size.
269 $title = substr($title,0,10);
271 foreach my $field ($record->fields()) {
273 next if $field->tag <10; # skip tags below 010 (presumably the MARC control fields, which carry no subfields)
274 foreach my $subfield ($field->subfields()) {
275 my $tag = $field->tag();
276 my $subfieldcode = $subfield->[0]; # each subfield is a [code, value] pair
278 # Check every index definition to see whether this tag/subfield feeds it ("tag*" or "tag+code");
279 # subfields that feed no index are stored in the __RAW__ index instead.
280 foreach my $key (keys %index) {
281 if ($index{$key} =~ /\Q$tag\E\*/ or $index{$key} =~ /\Q$tag$subfieldcode\E/) {
283 my $line= lc $subfield->[1];
284 # Replace punctuation that carries no meaning for searching with spaces...
285 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=|:/ /g;
286 # ... and split the value into words.
287 foreach (split / /,$line) {
288 next unless $_; # skip empty values (multiple spaces)
289 # If this authority is already listed for the word, raise its weight instead of appending a duplicate.
290 if ($result{$key}->{"$_"} =~ /$authid,\Q$title\E\-(\d);/) {
292 $result{$key}->{"$_"} =~ s/$authid,\Q$title\E\-(\d);//;
293 $result{$key}->{"$_"} .= "$authid,$title-$weight;";
294 # Otherwise create the entry with weight=1. Entry format: "authid,title-weight;".
296 $result{$key}->{"$_"}.="$authid,$title-1;";
301 # The subfield feeds no declared index: store its words in the __RAW__ index anyway.
303 my $line= lc $subfield->[1];
304 $line =~ s/-|\.|\?|,|;|!|'|\(|\)|\[|\]|{|}|"|<|>|&|\+|\*|\/|=/ /g;
305 foreach (split / /,$line) {
307 # warn $record->as_formatted."$_ =>".$title;
308 if ($result{__RAW__}->{"$_"} =~ /$authid,\Q$title\E\-(\d);/) {
311 $result{__RAW__}->{"$_"} =~ s/$authid,\Q$title\E\-(\d);//;
312 $result{__RAW__}->{"$_"} .= "$authid,$title-$weight;";
314 $result{__RAW__}->{"$_"}.="$authid,$title-1;";
# Write the in-memory authority index (%result: index name => word => entry list)
# into the nozebra table under server 'authorityserver'.
324 print "\nInserting...\n";
328 $dbh->{AutoCommit} = 0;
329 $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('authorityserver',?,?,?)");
330 foreach my $key (keys %result) {
331 foreach my $index (keys %{$result{$key}}) {
332 if (length($result{$key}->{$index}) > 1000000) {
# Fix: the MySQL setting that limits statement size is max_allowed_packet (the message used to name a non-existent "max_paquet_size").
333 print "very long index (".length($result{$key}->{$index}).")for $key / $index. update mySQL config file if you have an error just after this warning (max_allowed_packet parameter)\n";
337 $sth->execute($key,$index,$result{$key}->{$index});
# Commit whenever $i is a multiple of $commitnum ($i is presumably the running row counter; its increment is not visible here).
338 $dbh->commit() if (0 == $i % $commitnum);
340 $dbh->commit() if (0 == $i % $commitnum);
343 print "\nauthorities done\n";