From f74823bf1bfb88b42762d8eeefad31816a576540 Mon Sep 17 00:00:00 2001
From: tipaul <tipaul>
Date: Thu, 9 Feb 2006 10:59:34 +0000
Subject: [PATCH] OK, this time it seems to work. The last blocking problem
 was... a space in recordId: (bib1,Identifier-standard) just after the comma.
 Adam agreed it was a bug, and it should be solved soon. But now we are aware,
 we can avoid putting the space !

In this commit you have everything needed to set up a working zebra DB in UNIMARC:
* collection.abs is UNIMARC specific and must be rewritten for MARC21, in marc21 directory
* pqf.properties is to be copied unmodified in the marc21 directory (can also be put somewhere else)
* rebuild_zebra.pl is SLOW, but 1 step reindexing tool, using ZOOM
* rebuild_zebra_idx.pl is FAST, but a 2-step reindexing tool, and does not use zebra: run it to create an XML file for every biblio in the /zebra/biblios directory, then run "zebraidx update biblios" in your zebra directory
* zebra.cfg is the zebra config file ;-)
* test_cql2rpn.pl is a script that will query the database and show the results. Works for me, just change the query at the beginning to get answers you expect.

What has to be done :
* benchmarking : it seems the zebraidx update is faster than lightning (400biblios/sec : 10 000biblios in 25seconds), while ZOOM indexing is slow (something like 25biblios/second) More benchmarking could be done.
* completing collection.abs for UNIMARC. I'll take care of it.
* modifying Biblio.pm to use ZOOM instead of the "zebraidx through exec" approach currently used. I'll take care of it also.
* modify the search API & tools & screens. I'll let the ball to someone else (chris ?) for this. I agree SearchMarc.pm can be dropped and replaced by something else (maybe a new-and-clean Search.pm package)
---
 misc/migration_tools/rebuild_zebra.pl     |  17 +--
 misc/migration_tools/rebuild_zebra_idx.pl |  55 ++++++++
 misc/migration_tools/test_cql2rpn.pl      |  21 +++
 misc/zebra/unimarc/collection.abs         |  32 +++++
 misc/zebra/unimarc/pqf.properties         | 150 ++++++++++++++++++++++
 misc/zebra/unimarc/zebra.cfg              |  11 +-
 6 files changed, 271 insertions(+), 15 deletions(-)
 create mode 100755 misc/migration_tools/rebuild_zebra_idx.pl
 create mode 100755 misc/migration_tools/test_cql2rpn.pl
 create mode 100644 misc/zebra/unimarc/collection.abs
 create mode 100644 misc/zebra/unimarc/pqf.properties
diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl
index b0f4cc9bf7..570905806d 100755
--- a/misc/migration_tools/rebuild_zebra.pl
+++ b/misc/migration_tools/rebuild_zebra.pl
@@ -41,11 +41,11 @@ if ($@) {
 }
 
 # first, drop Zebra DB
-eval {
-	my $Zpackage = $Zconn->package();
-	$Zpackage->option(databaseName => 'Koha');
-# 	$Zpackage->send("drop");
-};
+# eval {
+# 	my $Zpackage = $Zconn->package();
+# 	$Zpackage->option(databaseName => 'Koha');
+# # 	$Zpackage->send("drop");
+# };
 
 eval {
 	my $Zpackage = $Zconn->package();
@@ -56,19 +56,20 @@ my $cgidir = C4::Context->intranetdir ."/cgi-bin";
 unless (opendir(DIR, "$cgidir")) {
 		$cgidir = C4::Context->intranetdir."/";
 } 
-
 my $starttime = gettimeofday;
 my $sth = $dbh->prepare("select biblionumber from biblio");
 $sth->execute;
 my $i=0;
 while ((my $biblionumber) = $sth->fetchrow) {
 	my $record = XMLgetbiblio($dbh,$biblionumber);
+# 	warn "\n==============\n$record\n==================\n";
 	my $Zpackage = $Zconn->package();
 	$Zpackage->option(databaseName => 'Koha');
-	$Zpackage->option(action => "recordInsert");
+	$Zpackage->option(action => "specialUpdate");
+# 	$Zpackage->option(recordIdNumber => $biblionumber);
 	$Zpackage->option(record => $record);
 	$Zpackage->send("update");
-	$Zpackage->destroy;
+# 	$Zpackage->destroy;
 	$i++;
 	print '.';
 	print "$i\r" unless ($i % 100);
diff --git a/misc/migration_tools/rebuild_zebra_idx.pl b/misc/migration_tools/rebuild_zebra_idx.pl
new file mode 100755
index 0000000000..6ed6c4e93c
--- /dev/null
+++ b/misc/migration_tools/rebuild_zebra_idx.pl
@@ -0,0 +1,55 @@
+#!/usr/bin/perl
+# Dump every biblio as MARCXML, one file each, for "zebraidx update biblios".
+
+use strict;
+use warnings;
+# Koha modules used
+use MARC::File::USMARC;
+use MARC::Record;
+use MARC::Batch;
+use C4::Context;
+use C4::Biblio;
+use Time::HiRes qw(gettimeofday);
+
+use Getopt::Long;
+# -c must be given explicitly before any file is written.
+my ($confirm);
+GetOptions(
+    'c' => \$confirm,
+);
+
+unless ($confirm) {
+	print <<EOF
+
+script to write files for zebra DB reindexing. Once it's done, run zebraidx update biblios
+
+run the script with -c to confirm the reindexing.
+
+EOF
+;#'
+die;
+}
+
+$|=1; # flushes output
+my $dbh = C4::Context->dbh;
+my $cgidir = C4::Context->intranetdir ."/cgi-bin";
+unless (-d $cgidir) {
+		$cgidir = C4::Context->intranetdir."/";
+} 
+
+my $starttime = gettimeofday;
+my $sth = $dbh->prepare("select biblionumber from biblio");
+$sth->execute;
+my $i=0;
+while ((my $biblionumber) = $sth->fetchrow) {
+	my $record = MARCgetbiblio($dbh,$biblionumber);
+	# NOTE(review): the name ends in "iso2709" (no dot) although the content is XML.
+	my $filename = $cgidir."/zebra/biblios/BIBLIO".$biblionumber."iso2709";
+	open(my $fh, '>', $filename) or die "cannot write $filename: $!\n";
+	print {$fh} $record->as_xml();
+	close($fh) or die "cannot close $filename: $!\n";
+	$i++;
+	print "\r$i" unless ($i % 100);
+}
+my $timeneeded = gettimeofday - $starttime;
+print "\n$i MARC record done in $timeneeded seconds\n";
\ No newline at end of file
diff --git a/misc/migration_tools/test_cql2rpn.pl b/misc/migration_tools/test_cql2rpn.pl
new file mode 100755
index 0000000000..96d6203cb3
--- /dev/null
+++ b/misc/migration_tools/test_cql2rpn.pl
@@ -0,0 +1,21 @@
+#!/usr/bin/perl
+# Run one CQL query against the local Zebra server and print every hit.
+use strict;
+use warnings;
+use ZOOM;
+
+my $query="Introduction";
+warn "QUERY : $query";
+# Fail with the ZOOM diagnostic rather than a later call on an undefined handle.
+my $Zconn = eval { ZOOM::Connection->new('localhost:2100/Koha') };
+die "cannot connect to localhost:2100/Koha : $@\n" unless $Zconn;
+# NOTE(review): cqlfile is a developer-specific absolute path; adjust it locally.
+$Zconn->option(cqlfile => "/home/paul/koha.dev/head/zebra/pqf.properties");
+my $q = ZOOM::Query::CQL2RPN->new( $query, $Zconn);
+my $rs= $Zconn->search($q);
+my $n = $rs->size()-1;
+print "found ".($n+1)." results";
+for my $i (0..$n) {
+	my $rec = $rs->record($i);
+	print $rec->render();
+}
diff --git a/misc/zebra/unimarc/collection.abs b/misc/zebra/unimarc/collection.abs
new file mode 100644
index 0000000000..9ccede9ee9
--- /dev/null
+++ b/misc/zebra/unimarc/collection.abs
@@ -0,0 +1,32 @@
+# $Id$
+
+# complete UNIMARC indexing rules for Bath level 0 and 1 service
+# (author, title, subject, keyword and exact services).
+# inspired by marc21.abs, from indexdata
+# Feel free to
+# elaborate on it, and if you do, please consider sharing your additions.
+# NOTE: This is designed to be used with the grs.marcxml input filter
+# for ISO2709 (ANSI Z39.2) or grs.xml for MARCXML-formatted records. It
+# won't work for the old grs.marc input filter, which yields a different
+# internal structure.
+
+name collection
+attset bib1.att
+
+esetname F @
+esetname B @
+
+marc usmarc.mar
+xpath enable
+
+all any
+
+melm 090$a	identifier-standard,identifier-standard:p
+melm 700	author,author:p
+melm 200$a	title,title:p
+melm 200$e	title,title:p
+melm 020$a	isbn
+melm 011$a	issn
+#elm 090            Identifier-standard            -
+#elm 090/?          Identifier-standard            -
+#elm 090/?/a        Identifier-standard            !:w
diff --git a/misc/zebra/unimarc/pqf.properties b/misc/zebra/unimarc/pqf.properties
new file mode 100644
index 0000000000..b68461c410
--- /dev/null
+++ b/misc/zebra/unimarc/pqf.properties
@@ -0,0 +1,150 @@
+# $Id$
+#
+# Properties file to drive org.z3950.zing.cql.CQLNode's toPQF()
+# back-end and the YAZ CQL-to-PQF converter.  This specifies the
+# interpretation of various CQL indexes, relations, etc. in terms
+# of Type-1 query attributes.
+#
+# This configuration file generates queries using BIB-1 attributes.
+# See http://www.loc.gov/z3950/agency/zing/cql/dc-indexes.html
+# for the Maintenance Agency's work-in-progress mapping of Dublin Core
+# indexes to Attribute Architecture (util, XD and BIB-2)
+# attributes.
+
+# Identifiers for prefixes used in this file. (index.*)
+set.cql		= info:srw/cql-context-set/1/cql-v1.1
+set.rec		= info:srw/cql-context-set/2/rec-1.0
+set.dc		= info:srw/cql-context-set/1/dc-v1.1
+set.bath	= http://zing.z3950.org/cql/bath/2.0/
+
+# default set (in query)
+set		= info:srw/cql-context-set/1/dc-v1.1
+
+# The default access point and result-set references
+index.cql.serverChoice			= 1=1016
+	# srw.serverChoice is deprecated in favour of cql.serverChoice
+	# BIB-1 "any"
+
+index.rec.id				= 1=12
+
+index.dc.title				= 1=4
+index.dc.subject			= 1=21
+index.dc.creator			= 1=1003
+index.dc.author				= 1=1003
+	### Unofficial synonym for "creator"
+index.dc.editor				= 1=1020
+index.dc.publisher			= 1=1018
+index.dc.description			= 1=62
+	# "abstract"
+index.dc.date				= 1=30
+index.dc.resourceType			= 1=1031
+	# guesswork: "Material-type"
+index.dc.format				= 1=1034
+	# guesswork: "Content-type"
+index.dc.resourceIdentifier		= 1=12
+	# "Local number"
+index.dc.source				= 1=1019
+	# "Record-source"
+index.dc.language			= 1=54
+	# "Code--language"
+index.dc.relation			= 1=?
+	### No idea how to represent this
+index.dc.coverage			= 1=?
+	### No idea how to represent this
+index.dc.rights				= 1=?
+	### No idea how to represent this
+
+# Relation attributes are selected according to the CQL relation by
+# looking up the "relation.<relation>" property:
+#
+relation.<				= 2=1
+relation.le				= 2=2
+relation.eq				= 2=3
+relation.exact				= 2=3
+relation.ge				= 2=4
+relation.>				= 2=5
+relation.<>				= 2=6
+
+### These two are not really right:
+relation.all				= 2=3
+relation.any				= 2=3
+
+# BIB-1 doesn't have a server choice relation, so we just make the
+# choice here, and use equality (which is clearly correct).
+relation.scr				= 2=3
+
+# Relation modifiers.
+#
+relationModifier.relevant		= 2=102
+relationModifier.fuzzy			= 2=100
+	### 100 is "phonetic", which is not quite the same thing
+relationModifier.stem			= 2=101
+relationModifier.phonetic		= 2=100
+
+# Position attributes may be specified for anchored terms (those
+# beginning with "^", which is stripped) and unanchored (those not
+# beginning with "^").  This may change when we get a BIB-1 truncation
+# attribute that says "do what CQL does".
+#
+position.first				= 3=1 6=1
+	# "first in field"
+position.any				= 3=3 6=1
+	# "any position in field"
+position.last				= 3=4 6=1
+	# not a standard BIB-1 attribute
+position.firstAndLast			= 3=3 6=3
+	# search term is anchored to be complete field
+
+# Structure attributes may be specified for individual relations; a
+# default structure attribute may be specified by the pseudo-relation
+# "*", to be used whenever a relation not listed here occurs.
+#
+structure.exact				= 4=108
+	# string
+structure.all				= 4=2
+structure.any				= 4=2
+structure.*				= 4=1
+	# phrase
+
+# Truncation attributes used to implement CQL wildcard patterns.  The
+# simpler forms, left, right- and both-truncation will be used for the
+# simplest patterns, so that we produce PQF queries that conform more
+# closely to the Bath Profile.  However, when a more complex pattern
+# such as "foo*bar" is used, we fall back on Z39.58-style masking.
+#
+truncation.right			= 5=1
+truncation.left				= 5=2
+truncation.both				= 5=3
+truncation.none				= 5=100
+truncation.z3958			= 5=104
+
+# Finally, any additional attributes that should always be included
+# with each term can be specified in the "always" property.
+#
+always					= 6=1
+# 6=1: completeness = incomplete subfield
+
+
+# Bath Profile support, added Thu Dec 18 13:06:20 GMT 2003
+# See the Bath Profile for SRW at
+#	http://zing.z3950.org/cql/bath.html
+# including the Bath Context Set defined within that document.
+#
+# In this file, we only map index-names to BIB-1 use attributes, doing
+# so in accordance with the specifications of the Z39.50 Bath Profile,
+# and leaving the relations, wildcards, etc. to fend for themselves.
+
+index.bath.keyTitle			= 1=33
+index.bath.possessingInstitution	= 1=1044
+index.bath.name				= 1=1002
+index.bath.personalName			= 1=1
+index.bath.corporateName		= 1=2
+index.bath.conferenceName		= 1=3
+index.bath.uniformTitle			= 1=6
+index.bath.isbn				= 1=7
+index.bath.issn				= 1=8
+index.bath.geographicName		= 1=58
+index.bath.notes			= 1=63
+index.bath.topicalSubject		= 1=1079
+index.bath.genreForm			= 1=1075
+
diff --git a/misc/zebra/unimarc/zebra.cfg b/misc/zebra/unimarc/zebra.cfg
index b166abf422..db209402ed 100644
--- a/misc/zebra/unimarc/zebra.cfg
+++ b/misc/zebra/unimarc/zebra.cfg
@@ -1,18 +1,15 @@
-# Simple Zebra configuration file that defines
-# a database with USMARC records.
-# $Id$
-#
 # Where are the config files located?
 profilePath: ${srcdir:-.}:/usr/local/share/idzebra/tab/
 
+
 # Files that describe the attribute sets supported.
 attset: bib1.att
 attset: explain.att
 
-recordId: (bib1,Local-number) 
+recordId: (bib1,Identifier-standard)
 
 # Specify record type
-recordType: grs.marc.unimarc
+recordType: grs.xml
 
 # Lock File Area
 lockDir: lock
@@ -27,7 +24,7 @@ keyTmpDir: tmp
 memMax: 100
 
 perm.anonymous: rw 
-encoding utf8
+encoding utf-8
 
 storeKeys:1
 storeData:1
-- 
2.39.2