Koha/misc/build_marc_word.pl

#!/usr/bin/perl -w
#-----------------------------------
# Script Name: build_marc_word.pl
# Script Version: 0.1.0
# Date:  2004/06/05
# Author:  Joshua Ferraro [jmf at kados dot org]
# Description: This script builds a new marc_word
#  table with a reduced number of tags (only those
#  tags that should be searched) allowing for
#  faster and more accurate searching when used
#  with the SearchMarc routines.  Make sure that
#  the MARCaddword routine in Biblio.pm indexes
#  words as short as 1 character; otherwise,
#  searches like "O'brian, Patrick" will fail, as
#  the search routines separate that query into
#  "o", "brian", and "patrick".  (If "o" is not in
#  the database the search will fail.)
# Usage: build_marc_word.pl
# Revision History:
#    0.1.0  2004/06/11:  first working version.
#    			 Thanks to Chris Cormack
#    			 for helping with the $data object
#    			 and Stephen Hedges for providing
#    			 the list of MARC tags.
# FixMe:
#   *Should add a few parameters like 'delete from
#    marc_word' or make script ask user whether to
#    perform that task ...
#   *Add a 'status' report as the data is loaded ...
#-----------------------------------
use lib '/usr/local/koha/intranet/modules/';
use strict;
use C4::Context;
use C4::Biblio;
# Database handle obtained from C4::Context; every query in this script
# runs through it and it is disconnected once indexing has finished.
my $dbh = C4::Context->dbh();

#Here is where you name the tags that you wish to index.  If you
# are using MARC21 this set of default tags should be fine but you
# may need to add holdings tags specific to your library (e.g., holding
# branch for Nelsonville is 942k but that may not be the case for your
# library).
my @tags = (

	# Each entry is a MARC tag immediately followed by the subfield code
	# to index (e.g. '245a' = tag 245, subfield a).  Field names are taken
	# from http://lcweb.loc.gov/marc/bibliographic/ecbdhome.html

	# 0xx -- standard numbers
	'020a',    # International Standard Book Number
	'022a',    # International Standard Serial Number

	# 1xx -- main entries
	'100a',    # Main entry--personal name
	'110a',    # Main entry--corporate name
	'110b',    #   subordinate unit
	'110c',    #   location of meeting
	'111a',    # Main entry--meeting name
	'111c',    #   location of meeting
	'130a',    # Main entry--uniform title

	# 2xx -- titles and imprint
	'240a',    # Uniform title
	'245a',    # Title statement
	'245b',    #   remainder of title
	'245c',    #   statement of responsibility, etc.
	'245p',    #   name of part/section of a work
	'246a',    # Varying form of title
	'246b',    #   remainder of title
	'260b',    # Publication, distribution, etc. (imprint)

	# 4xx -- series statements
	'440a',    # Series statement/added entry--title
	'440p',    #   name of part/section of a work

	# 5xx -- notes
	'500a',    # General note
	'505t',    # Formatted contents note (t is the title)
	'511a',    # Participant or performer note
	'520a',    # Summary, etc.
	'534a',    # Original version note
	'534k',    #   key title of original
	'534t',    #   title statement of original
	'586a',    # Awards note

	# 6xx -- subject added entries
	'600a',    # Subject added entry--personal name
	'610a',    # Subject added entry--corporate name
	'611a',    # Subject added entry--meeting name
	'630a',    # Subject added entry--uniform title
	'650a',    # Subject added entry--topical term
	'651a',    # Subject added entry--geographic name

	# 7xx -- added entries
	'700a',    # Added entry--personal name
	'710a',    # Added entry--corporate name
	'711a',    # Added entry--meeting name
	'720a',    # Added entry--uncontrolled name
	'730a',    # Added entry--uniform title
	'740a',    # Added entry--uncontrolled related/analytical title
	'752a',    # Added entry--hierarchical place name

	# 8xx -- series added entries
	'800a',    # Series added entry--personal name
	'810a',    # Series added entry--corporate name
	'811a',    # Series added entry--meeting name
	'830a',    # Series added entry--uniform title

	# 9xx -- local fields
	'942k',    # Holdings branch (possibly unique to NPL; adjust for your library)
);

#note that subfieldcode in marc_subfield_table is subfieldid in marc_word ... even
#though there is another subfieldid in marc_subfield_table--very confusing naming conventions!

#For each tag we run a search to find the necessary data for building the marc_word table
# Build marc_word: for every configured tag/subfield pair, fetch the matching
# rows from marc_subfield_table and hand each one to MARCaddword (Biblio.pm),
# which performs the actual import into marc_word.

# The query text is identical for every tag, so prepare it once and simply
# re-execute it with new bind values on each iteration.
my $subfield_sth=$dbh->prepare("SELECT bibid,tag,tagorder,subfieldcode,subfieldorder,subfieldvalue FROM marc_subfield_table WHERE tag=? AND subfieldcode=?")
	or die "Unable to prepare marc_subfield_table query: ".$dbh->errstr."\n";

foreach my $this_tagid (@tags) {
	# Split the entry into tag and subfield code ("245a" -> "245" and "a").
	# Work on a copy: the loop variable is an alias for the element of @tags,
	# so substituting on it directly would strip the subfield codes out of
	# the list itself.
	my $tag=$this_tagid;
	my $subfieldid;
	if ($tag =~ s/(\D+)//) {
		$subfieldid=$1;
	}

	# Without both parts there is no tag/subfield pair to look up; report the
	# entry and move on instead of running the query with undefined binds.
	unless (defined $subfieldid && length $tag) {
		warn "Skipping malformed tag entry '$this_tagid' (expected a numeric tag followed by a subfield code, e.g. 245a)\n";
		next;
	}

	unless ($subfield_sth->execute($tag, $subfieldid)) {
		warn "Query for $this_tagid failed: ".$subfield_sth->errstr."\n";
		next;
	}

	# Pass each matching subfield on to MARCaddword to be indexed.
	while (my $data=$subfield_sth->fetchrow_hashref()) {
		MARCaddword($dbh,$data->{'bibid'},$data->{'tag'},$data->{'tagorder'},$data->{'subfieldcode'},$data->{'subfieldorder'},$data->{'subfieldvalue'});
	}

	# fetchrow_hashref returns undef both at end of data and on error, so
	# check the handle to tell a truncated result set from a complete one.
	warn "Fetch for $this_tagid ended early: ".$subfield_sth->errstr."\n" if $subfield_sth->err;
}
$dbh->disconnect();