From f74823bf1bfb88b42762d8eeefad31816a576540 Mon Sep 17 00:00:00 2001 From: tipaul Date: Thu, 9 Feb 2006 10:59:34 +0000 Subject: [PATCH] OK, this time it seems to work. The last blocking problem was... a space in recordId: (bib1,Identifier-standard) just after the comma. Adam agreed it was a bug, and it should be solved soon. But now that we are aware of it, we can avoid putting the space ! In this commit you have all that is needed to set up a working zebra DB in Unimarc : * collection.abs is UNIMARC specific and must be rewritten for MARC21, in marc21 directory * pqf.properties is to be copied unmodified in the marc21 directory (can also be put somewhere else) * rebuild_zebra.pl is SLOW, but 1 step reindexing tool, using ZOOM * rebuild_zebra_idx is FAST, but 2 step reindexing tool, and does not use ZOOM. run it, it will create all biblios XML files in /zebra/biblios directory, then zebraidx update biblios in your zebra directory * zebra.cfg is the zebra config file ;-) * test_cql2rpn.pl is a script that will query the database and show the results. Works for me, just change the query at the beginning to get answers you expect. What has to be done : * benchmarking : it seems the zebraidx update is faster than lightning (400biblios/sec : 10 000biblios in 25seconds), while ZOOM indexing is slow (something like 25biblios/second) More benchmarking could be done. * completing collection.abs for UNIMARC. I'll take care of it. * modifying Biblio.pm to use ZOOM instead of the "zebraidx through exec" approach currently used. I'll take care of it also. * modify the search API & tools & screens. I'll leave this to someone else (chris ?). 
I agree SearchMarc.pm can be dropped and replaced by something else (maybe a new-and-clean Search.pm package) --- misc/migration_tools/rebuild_zebra.pl | 17 +-- misc/migration_tools/rebuild_zebra_idx.pl | 55 ++++++++ misc/migration_tools/test_cql2rpn.pl | 21 +++ misc/zebra/unimarc/collection.abs | 32 +++++ misc/zebra/unimarc/pqf.properties | 150 ++++++++++++++++++++++ misc/zebra/unimarc/zebra.cfg | 11 +- 6 files changed, 271 insertions(+), 15 deletions(-) create mode 100755 misc/migration_tools/rebuild_zebra_idx.pl create mode 100755 misc/migration_tools/test_cql2rpn.pl create mode 100644 misc/zebra/unimarc/collection.abs create mode 100644 misc/zebra/unimarc/pqf.properties diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl index b0f4cc9bf7..570905806d 100755 --- a/misc/migration_tools/rebuild_zebra.pl +++ b/misc/migration_tools/rebuild_zebra.pl @@ -41,11 +41,11 @@ if ($@) { } # first, drop Zebra DB -eval { - my $Zpackage = $Zconn->package(); - $Zpackage->option(databaseName => 'Koha'); -# $Zpackage->send("drop"); -}; +# eval { +# my $Zpackage = $Zconn->package(); +# $Zpackage->option(databaseName => 'Koha'); +# # $Zpackage->send("drop"); +# }; eval { my $Zpackage = $Zconn->package(); @@ -56,19 +56,20 @@ my $cgidir = C4::Context->intranetdir ."/cgi-bin"; unless (opendir(DIR, "$cgidir")) { $cgidir = C4::Context->intranetdir."/"; } - my $starttime = gettimeofday; my $sth = $dbh->prepare("select biblionumber from biblio"); $sth->execute; my $i=0; while ((my $biblionumber) = $sth->fetchrow) { my $record = XMLgetbiblio($dbh,$biblionumber); +# warn "\n==============\n$record\n==================\n"; my $Zpackage = $Zconn->package(); $Zpackage->option(databaseName => 'Koha'); - $Zpackage->option(action => "recordInsert"); + $Zpackage->option(action => "specialUpdate"); +# $Zpackage->option(recordIdNumber => $biblionumber); $Zpackage->option(record => $record); $Zpackage->send("update"); - $Zpackage->destroy; +# $Zpackage->destroy; 
$i++; print '.'; print "$i\r" unless ($i % 100); diff --git a/misc/migration_tools/rebuild_zebra_idx.pl b/misc/migration_tools/rebuild_zebra_idx.pl new file mode 100755 index 0000000000..6ed6c4e93c --- /dev/null +++ b/misc/migration_tools/rebuild_zebra_idx.pl @@ -0,0 +1,55 @@ +#!/usr/bin/perl +# small script that import an iso2709 file into koha 2.0 + +use strict; + +# Koha modules used +use MARC::File::USMARC; +use MARC::Record; +use MARC::Batch; +use C4::Context; +use C4::Biblio; +use Time::HiRes qw(gettimeofday); + +use Getopt::Long; +my ( $input_marc_file, $number) = ('',0); +my ($confirm); +GetOptions( + 'c' => \$confirm, +); + +unless ($confirm) { + print <dbh; +my $cgidir = C4::Context->intranetdir ."/cgi-bin"; +unless (opendir(DIR, "$cgidir")) { + $cgidir = C4::Context->intranetdir."/"; +} + +my $starttime = gettimeofday; +my $sth = $dbh->prepare("select biblionumber from biblio"); +$sth->execute; +my $i=0; +while ((my $biblionumber) = $sth->fetchrow) { + my $record = MARCgetbiblio($dbh,$biblionumber); + my $filename = $cgidir."/zebra/biblios/BIBLIO".$biblionumber."iso2709"; + open F,"> $filename"; + print F $record->as_xml(); + close F; + $i++; + print "\r$i" unless ($i % 100); +} +my $timeneeded = gettimeofday - $starttime; +print "\n$i MARC record done in $timeneeded seconds\n"; \ No newline at end of file diff --git a/misc/migration_tools/test_cql2rpn.pl b/misc/migration_tools/test_cql2rpn.pl new file mode 100755 index 0000000000..96d6203cb3 --- /dev/null +++ b/misc/migration_tools/test_cql2rpn.pl @@ -0,0 +1,21 @@ +#!/usr/bin/perl +use strict; +use ZOOM; + +my $query="Introduction"; +warn "QUERY : $query"; +my $Zconn; +eval { + $Zconn = new ZOOM::Connection('localhost:2100/Koha'); +}; +$Zconn->option(cqlfile => "/home/paul/koha.dev/head/zebra/pqf.properties"); +my $q = new ZOOM::Query::CQL2RPN( $query, $Zconn); +# warn "Q : $q"; +my $rs= $Zconn->search($q); +my $n = $rs->size()-1; +print "found ".($n+1)." 
results"; +for my $i (0..$n) { + my $rec = $rs->record($i); + print $rec->render(); +} +# warn "ERROR : ".$Zconn->errcode(); diff --git a/misc/zebra/unimarc/collection.abs b/misc/zebra/unimarc/collection.abs new file mode 100644 index 0000000000..9ccede9ee9 --- /dev/null +++ b/misc/zebra/unimarc/collection.abs @@ -0,0 +1,32 @@ +# $Id$ + +# complete UNIMARC indexing rules for Bath level 0 and 1 service +# (author, title, subject, keyword and exact services). +# inspired by marc21.abs, from indexdata +# Feel free to +# elaborate on it, and if you do, please consider sharing your additions. +# NOTE: This is designed to be used with the grs.marcxml input filter +# for ISO2709 (ANSI Z39.2) or grs.xml for MARCXML-formatted records. It +# won't work for the old grs.marc input filter, which yields a different +# internal structure. + +name collection +attset bib1.att + +esetname F @ +esetname B @ + +marc usmarc.mar +xpath enable + +all any + +melm 090$a identifier-standard,identifier-standard:p +melm 700 author,author:p +melm 200$a title,title:p +melm 200$e title,title:p +melm 020$a isbn +melm 011$a issn +#elm 090 Identifier-standard - +#elm 090/? Identifier-standard - +#elm 090/?/a Identifier-standard !:w diff --git a/misc/zebra/unimarc/pqf.properties b/misc/zebra/unimarc/pqf.properties new file mode 100644 index 0000000000..b68461c410 --- /dev/null +++ b/misc/zebra/unimarc/pqf.properties @@ -0,0 +1,150 @@ +# $Id$ +# +# Propeties file to drive org.z3950.zing.cql.CQLNode's toPQF() +# back-end and the YAZ CQL-to-PQF converter. This specifies the +# interpretation of various CQL indexes, relations, etc. in terms +# of Type-1 query attributes. +# +# This configuration file generates queries using BIB-1 attributes. +# See http://www.loc.gov/z3950/agency/zing/cql/dc-indexes.html +# for the Maintenance Agency's work-in-progress mapping of Dublin Core +# indexes to Attribute Architecture (util, XD and BIB-2) +# attributes. + +# Identifiers for prefixes used in this file. 
(index.*) +set.cql = info:srw/cql-context-set/1/cql-v1.1 +set.rec = info:srw/cql-context-set/2/rec-1.0 +set.dc = info:srw/cql-context-set/1/dc-v1.1 +set.bath = http://zing.z3950.org/cql/bath/2.0/ + +# default set (in query) +set = info:srw/cql-context-set/1/dc-v1.1 + +# The default access point and result-set references +index.cql.serverChoice = 1=1016 + # srw.serverChoice is deprecated in favour of cql.serverChoice + # BIB-1 "any" + +index.rec.id = 1=12 + +index.dc.title = 1=4 +index.dc.subject = 1=21 +index.dc.creator = 1=1003 +index.dc.author = 1=1003 + ### Unofficial synonym for "creator" +index.dc.editor = 1=1020 +index.dc.publisher = 1=1018 +index.dc.description = 1=62 + # "abstract" +index.dc.date = 1=30 +index.dc.resourceType = 1=1031 + # guesswork: "Material-type" +index.dc.format = 1=1034 + # guesswork: "Content-type" +index.dc.resourceIdentifier = 1=12 + # "Local number" +index.dc.source = 1=1019 + # "Record-source" +index.dc.language = 1=54 + # "Code--language" +index.dc.relation = 1=? + ### No idea how to represent this +index.dc.coverage = 1=? + ### No idea how to represent this +index.dc.rights = 1=? + ### No idea how to represent this + +# Relation attributes are selected according to the CQL relation by +# looking up the "relation." property: +# +relation.< = 2=1 +relation.le = 2=2 +relation.eq = 2=3 +relation.exact = 2=3 +relation.ge = 2=4 +relation.> = 2=5 +relation.<> = 2=6 + +### These two are not really right: +relation.all = 2=3 +relation.any = 2=3 + +# BIB-1 doesn't have a server choice relation, so we just make the +# choice here, and use equality (which is clearly correct). +relation.scr = 2=3 + +# Relation modifiers. 
+# +relationModifier.relevant = 2=102 +relationModifier.fuzzy = 2=100 + ### 100 is "phonetic", which is not quite the same thing +relationModifier.stem = 2=101 +relationModifier.phonetic = 2=100 + +# Position attributes may be specified for anchored terms (those +# beginning with "^", which is stripped) and unanchored (those not +# beginning with "^"). This may change when we get a BIB-1 truncation +# attribute that says "do what CQL does". +# +position.first = 3=1 6=1 + # "first in field" +position.any = 3=3 6=1 + # "any position in field" +position.last = 3=4 6=1 + # not a standard BIB-1 attribute +position.firstAndLast = 3=3 6=3 + # search term is anchored to be complete field + +# Structure attributes may be specified for individual relations; a +# default structure attribute may be specified by the pseudo-relation +# "*", to be used whenever a relation not listed here occurs. +# +structure.exact = 4=108 + # string +structure.all = 4=2 +structure.any = 4=2 +structure.* = 4=1 + # phrase + +# Truncation attributes used to implement CQL wildcard patterns. The +# simpler forms, left, right- and both-truncation will be used for the +# simplest patterns, so that we produce PQF queries that conform more +# closely to the Bath Profile. However, when a more complex pattern +# such as "foo*bar" is used, we fall back on Z39.58-style masking. +# +truncation.right = 5=1 +truncation.left = 5=2 +truncation.both = 5=3 +truncation.none = 5=100 +truncation.z3958 = 5=104 + +# Finally, any additional attributes that should always be included +# with each term can be specified in the "always" property. +# +always = 6=1 +# 6=1: completeness = incomplete subfield + + +# Bath Profile support, added Thu Dec 18 13:06:20 GMT 2003 +# See the Bath Profile for SRW at +# http://zing.z3950.org/cql/bath.html +# including the Bath Context Set defined within that document. 
+# +# In this file, we only map index-names to BIB-1 use attributes, doing +# so in accordance with the specifications of the Z39.50 Bath Profile, +# and leaving the relations, wildcards, etc. to fend for themselves. + +index.bath.keyTitle = 1=33 +index.bath.possessingInstitution = 1=1044 +index.bath.name = 1=1002 +index.bath.personalName = 1=1 +index.bath.corporateName = 1=2 +index.bath.conferenceName = 1=3 +index.bath.uniformTitle = 1=6 +index.bath.isbn = 1=7 +index.bath.issn = 1=8 +index.bath.geographicName = 1=58 +index.bath.notes = 1=63 +index.bath.topicalSubject = 1=1079 +index.bath.genreForm = 1=1075 + diff --git a/misc/zebra/unimarc/zebra.cfg b/misc/zebra/unimarc/zebra.cfg index b166abf422..db209402ed 100644 --- a/misc/zebra/unimarc/zebra.cfg +++ b/misc/zebra/unimarc/zebra.cfg @@ -1,18 +1,15 @@ -# Simple Zebra configuration file that defines -# a database with USMARC records. -# $Id$ -# # Where are the config files located? profilePath: ${srcdir:-.}:/usr/local/share/idzebra/tab/ + # Files that describe the attribute sets supported. attset: bib1.att attset: explain.att -recordId: (bib1,Local-number) +recordId: (bib1,Identifier-standard) # Specify record type -recordType: grs.marc.unimarc +recordType: grs.xml # Lock File Area lockDir: lock @@ -27,7 +24,7 @@ keyTmpDir: tmp memMax: 100 perm.anonymous: rw -encoding utf8 +encoding utf-8 storeKeys:1 storeData:1 -- 2.39.5