From c75eefd1aceacc31590d7b905a9472466c3e5246 Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Mon, 26 Mar 2012 01:05:58 -0400 Subject: [PATCH] Bug 7818: utility to generate DOM indexing configs misc/maintenance/make_zebra_dom_cfg_from_record_abs: generate a DOM filter Zebra index config from a GRS-1 config Given a Zebra record.abs file containing a set of index definitions for Zebra's GRS-1 filter, write an equivalent DOM filter configuration. To generate the XSLT that is to be used by Zebra, run something like the following on the output of this utility: xsltproc ZEBRA_CFG_DIR/xsl/koha-indexdefs-to-zebra.xsl \ biblio-koha-indexdefs.xml \ > ZEBRA_CFG_DIR/marc_defs/marc21/biblios/biblio-zebra-indexdefs.xsl The above example assumes that the output of the program was named biblio-koha-indexdefs.xml. This commit also introduces Koha::Indexer::Utils, a new package for miscellaneous routines that support Koha's indexing definitions. Signed-off-by: Galen Charlton Signed-off-by: Jared Camins-Esakov Signed-off-by: Paul Poulain Signed-off-by: Chris Cormack --- Koha/Indexer/Utils.pm | 222 ++++++++++++++++++ .../make_zebra_dom_cfg_from_record_abs | 72 ++++++ 2 files changed, 294 insertions(+) create mode 100644 Koha/Indexer/Utils.pm create mode 100755 misc/maintenance/make_zebra_dom_cfg_from_record_abs diff --git a/Koha/Indexer/Utils.pm b/Koha/Indexer/Utils.pm new file mode 100644 index 0000000000..14c23673ae --- /dev/null +++ b/Koha/Indexer/Utils.pm @@ -0,0 +1,222 @@ +package Koha::Indexer::Utils; + +# Copyright (c) 2012 Equinox Software, Inc. +# This file is part of Koha. +# +# Koha is free software; you can redistribute it and/or modify it under the +# terms of the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any later +# version.
+# +# Koha is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place, +# Suite 330, Boston, MA 02111-1307 USA + +use strict; +use warnings; +use 5.010; + +use XML::LibXML; + +=head1 Koha::Indexer::Utils + +Koha::Indexer::Utils - utility functions for managing search indexes + +=head1 DESCRIPTION + +This module contains utility functions for managing various aspects +of Koha's bibliographic and authority search indexes. + +=head1 FUNCTIONS + +=cut + +=head2 zebra_record_abs_to_dom + +$dom_config = Koha::Indexer::Utils::zebra_record_abs_to_dom($record_abs_config, $marcflavour); + +Given a string containing the contents of a record.abs configuration file as +used by Zebra's GRS-1 filter, emit an equivalent DOM configuration.
+
+=cut
+
+our $idxNS = 'http://www.koha-community.org/schemas/index-defs';
+
+# Convert the text of a record.abs file (Zebra GRS-1 index definitions)
+# into a serialized XML document of Koha DOM index definitions.
+#
+# Parameters:
+#   $grs1_cfg    - entire contents of the record.abs file, as one string
+#   $marcflavour - handed on to _append_grs1_def_to_dom_cfg(), which does
+#                  not currently use it
+#
+# Returns the document as produced by toString(1).
+sub zebra_record_abs_to_dom {
+    my $grs1_cfg = shift;
+    my $marcflavour = shift;
+
+    # treat a missing configuration like an empty one
+    $grs1_cfg //= '';
+    chomp $grs1_cfg;
+    my @grs1_cfg_lines = split /\n/, $grs1_cfg, -1;
+    my $grs1_defs = [];
+
+    # generate an arrayref of structures representing
+    # each record.abs line
+    for my $i (0 .. $#grs1_cfg_lines) {
+        my $line = $grs1_cfg_lines[$i];
+
+        # drop trailing blanks and the CR of CRLF files so that they
+        # cannot end up inside an index name or index type
+        $line =~ s/\s+$//;
+        next if _can_ignore_grs1_cfg_line($line);
+
+        # lines that are neither melm nor xelm definitions do not parse;
+        # keep them as empty definitions so that they still show up as
+        # a comment in the output
+        my $grs1_def = _parse_grs1_cfg_line($line) // {};
+        $grs1_def->{orig_def} = $line;
+        $grs1_def->{lineno} = $i + 1;
+        push @$grs1_defs, $grs1_def;
+    }
+
+    # map the index definitions to a DOM tree representing
+    # the index definitions -- if you squint hard, you
+    # can see the beginnings of a more general definition language
+    # for Koha index definitions
+    my $dom_cfg = XML::LibXML::Document->new('1.0', 'utf-8');
+    my $root = $dom_cfg->createElement('index_defs');
+    $root->setNamespace($idxNS, 'kohaidx');
+    foreach my $grs1_def (@$grs1_defs) {
+        _append_grs1_def_to_dom_cfg($dom_cfg, $root, $grs1_def, $marcflavour);
+    }
+
+    # and emit the result as a string
+    $dom_cfg->setDocumentElement($root);
+    return $dom_cfg->toString(1);
+}
+
+#
+# bunch of utility functions for zebra_record_abs_to_dom
+#
+
+# Return 1 for record.abs lines that get no DOM equivalent (blank lines,
+# comments, GRS-1 directives, 'all' element lines) and 0 otherwise.
+sub _can_ignore_grs1_cfg_line {
+    my $line = shift;
+    return 1 if $line =~ /^\s*$/ or
+                $line =~ /^\s*#/ or
+                $line =~ /^(encoding|name|attset|esetname|marc|systag|xpath)/ or
+                $line =~ /^all/; # DOM filter automatically indexes all tokens, so
+                                 # no need to deal with 'all any' lines in record.abs
+    return 0;
+}
+
+# Parse one melm or '/record/...' xelm line into a definition hashref.
+# Returns undef (in scalar context) for any other kind of line.
+sub _parse_grs1_cfg_line {
+    my $line = shift;
+
+    if ($line =~ /^melm\s+(.*)/ || $line =~ m!^xelm /record/(.*)!) {
+        return _parse_xelm_melm($1);
+    }
+    return;
+}
+
+# Split the part of a melm/xelm line that follows the keyword into the
+# field (tag, optional '$subfield') and its list of index definitions.
+# Returns a hashref with keys tag, subfield and index_defs, or undef
+# (in scalar context) if no field is named at all.
+sub _parse_xelm_melm {
+    my $line = shift;
+
+    my ($field, $index_defs) = split /\s+/, $line, 2;
+    return unless defined $field && length $field;
+
+    # a field may be listed without any index definition
+    $index_defs //= '';
+    $index_defs =~ s/\s+$//;
+
+    # munge fixed field range indicators; this has to happen before
+    # splitting on commas, as range(data,N,M) contains commas itself
+    $index_defs =~ s/range\(data,(\d+),(\d+)\)/$1:$2/g;
+
+    my ($tag, $subfield) = split /\$/, $field, 2;
+    return {
+        tag => $tag,
+        subfield => $subfield,
+        index_defs => [
+            map { _parse_grs1_index_def($_) }
+            grep { length }
+            split /\s*,\s*/, $index_defs
+        ],
+    };
+}
+
+# Parse a single 'name[:type[:offset:length]]' index definition into a
+# hashref with keys name, index_type, offset and length.
+sub _parse_grs1_index_def {
+    my $index_def = shift;
+
+    my ($name, $index_type, $offset, $length) = split /:/, $index_def, -1;
+
+    # if the original index definition didn't specify an index
+    # type, set it 'w' -- the DOM filter needs the index type
+    # to be specified explicitly
+    $index_type = 'w' unless defined $index_type && length $index_type;
+    return {
+        name => $name,
+        index_type => $index_type,
+        offset => $offset,
+        length => $length,
+    };
+}
+
+# Append to $root a comment quoting the original record.abs line,
+# followed by the DOM definition element(s) for $grs1_def, if it was
+# parsed into a tag.  $marcflavour is accepted but not used.
+sub _append_grs1_def_to_dom_cfg {
+    my $dom_cfg = shift;
+    my $root = shift;
+    my $grs1_def = shift;
+    my $marcflavour = shift;
+
+    # XML does not allow '--' inside a comment, nor a comment that ends
+    # in '-', so space out any such hyphens copied from record.abs
+    my $comment_text = 'record.abs line ' .
+                       $grs1_def->{lineno} . ': ' .
+                       $grs1_def->{orig_def};
+    $comment_text =~ s/-(?=-)/- /g;
+    $comment_text .= ' ' if $comment_text =~ /-$/;
+    $root->appendChild($dom_cfg->createComment($comment_text));
+
+    my $tag = $grs1_def->{tag};
+    return unless defined $tag;
+
+    if (defined $grs1_def->{subfield}) {
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_subfields');
+        $dom_def->setAttribute('tag', $tag);
+        $dom_def->setAttribute('subfields', $grs1_def->{subfield});
+        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
+        $root->appendChild($dom_def);
+    } elsif ($tag eq 'leader') {
+        # we're the leader
+        _append_grs1_defs_for_leader($dom_cfg, $root, $grs1_def);
+    } elsif ($tag =~ /\A\d+\z/ && $tag < 10) {
+        # we're a control field; the tag is checked to be numeric first
+        # so that other tags are not compared as 0 (with a warning)
+        _append_grs1_defs_for_control_field($dom_cfg, $root, $grs1_def);
+    } else {
+        # we're indexing an entire variable data field
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_data_field');
+        $dom_def->setAttribute('tag', $tag);
+        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
+        $root->appendChild($dom_def);
+    }
+    return;
+}
+
+# Append one target_index child to $dom_def for every index definition
+# of $grs1_def.
+sub _append_target_indexes {
+    my $dom_cfg = shift;
+    my $dom_def = shift;
+    my $grs1_def = shift;
+
+    foreach my $index_def (@{ $grs1_def->{index_defs} }) {
+        _append_one_target_index($dom_cfg, $dom_def, $index_def);
+    }
+}
+
+# Append a target_index element whose text is 'name:index_type'.
+sub _append_one_target_index {
+    my $dom_cfg = shift;
+    my $dom_def = shift;
+    my $index_def = shift;
+    my $tgt_idx = $dom_cfg->createElementNS($idxNS, 'target_index');
+    my $index_name = "$index_def->{name}:$index_def->{index_type}";
+    $tgt_idx->appendText($index_name);
+    $dom_def->appendChild($tgt_idx);
+}
+
+# Copy the offset/length pair of an index definition onto $dom_def as
+# attributes; nothing is set unless both values are defined.
+sub _set_range_attributes {
+    my $dom_def = shift;
+    my $index_def = shift;
+    return unless defined $index_def->{offset} && defined $index_def->{length};
+    $dom_def->setAttribute('offset', $index_def->{offset});
+    $dom_def->setAttribute('length', $index_def->{length});
+    return;
+}
+
+# Append one index_leader element per index definition of $grs1_def.
+sub _append_grs1_defs_for_leader {
+    my $dom_cfg = shift;
+    my $root = shift;
+    my $grs1_def = shift;
+    foreach my $index_def (@{ $grs1_def->{index_defs} }) {
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_leader');
+        _set_range_attributes($dom_def, $index_def);
+        _append_one_target_index($dom_cfg, $dom_def, $index_def);
+        $root->appendChild($dom_def);
+    }
+}
+
+# Append one index_control_field element per index definition of
+# $grs1_def, each carrying the control field's tag.
+sub _append_grs1_defs_for_control_field {
+    my $dom_cfg = shift;
+    my $root = shift;
+    my $grs1_def = shift;
+    foreach my $index_def (@{ $grs1_def->{index_defs} }) {
+        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_control_field');
+        $dom_def->setAttribute('tag', $grs1_def->{tag});
+        _set_range_attributes($dom_def, $index_def);
+        _append_one_target_index($dom_cfg, $dom_def, $index_def);
+        $root->appendChild($dom_def);
+    }
+}
+
+1;
diff --git a/misc/maintenance/make_zebra_dom_cfg_from_record_abs b/misc/maintenance/make_zebra_dom_cfg_from_record_abs
new file mode 100755
index 0000000000..0c982bdff1
--- /dev/null
+++ b/misc/maintenance/make_zebra_dom_cfg_from_record_abs
@@ -0,0 +1,72 @@
+#!/usr/bin/perl
+
+# Copyright (c) 2012 Equinox Software, Inc.
+# This file is part of Koha.
+#
+# Koha is free software; you can redistribute it and/or modify it under the
+# terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+
+use strict;
+use warnings;
+use 5.010;
+
+use Koha::Indexer::Utils;
+
+use Getopt::Long;
+
+my $input_file;
+my $output_file;
+my $want_help;
+
+# '=s' (rather than ':s') makes the file name mandatory once the option
+# is given, so a bare --input or --output is reported as an error
+my $result = GetOptions(
+    'input=s'  => \$input_file,
+    'output=s' => \$output_file,
+    'help|h'   => \$want_help,
+);
+
+# an explicit request for help is not an error
+if ( $want_help ) {
+    print_usage();
+    exit 0;
+}
+
+# bad or missing options: explain on STDERR and report failure to the caller
+if ( not $result or not defined $input_file or not defined $output_file ) {
+    print_usage(\*STDERR);
+    exit 1;
+}
+
+# read and convert the input before the output file is opened, so that a
+# failure (or using the same file for both) cannot leave a truncated file
+open my $infh, '<', $input_file or die "$0: cannot open input file $input_file: $!\n";
+my $grs1_cfg = join('', <$infh>);
+close $infh;
+
+# no MARC flavour is passed; the converter does not currently use it
+my $dom_cfg = Koha::Indexer::Utils::zebra_record_abs_to_dom($grs1_cfg);
+
+open my $outfh, '>', $output_file or die "$0: cannot open output file $output_file: $!\n";
+print {$outfh} $dom_cfg;
+
+# buffered write errors only surface when the handle is closed
+close $outfh or die "$0: cannot write output file $output_file: $!\n";
+
+# Print the usage message to the given file handle (STDOUT by default).
+sub print_usage {
+    my $fh = shift // \*STDOUT;
+    print {$fh} <<_USAGE_;
+$0: generate a DOM filter Zebra index config from a GRS-1 config
+
+Given a Zebra record.abs file containing a set of index definitions for
+Zebra's GRS-1 filter, write an equivalent DOM filter configuration.
+
+To generate the XSLT that is to be used by Zebra, run something like
+the following on the output of this utility:
+
+xsltproc ZEBRA_CFG_DIR/xsl/koha-indexdefs-to-zebra.xsl \\
+    biblio-koha-indexdefs.xml \\
+    > ZEBRA_CFG_DIR/marc_defs/marc21/biblios/biblio-zebra-indexdefs.xsl
+
+The above example assumes that the output of the program was named
+biblio-koha-indexdefs.xml.
+
+Parameters:
+    --input         input file name
+    --output        output file name
+    --help or -h    show this message
+_USAGE_
+}
-- 
2.39.5