Bug 7818: utility to generate DOM indexing configs
misc/maintenance/make_zebra_dom_cfg_from_record_abs: generate a DOM filter Zebra index config from a GRS-1 config Given a Zebra record.abs file containing a set of index definitions for Zebra's GRS-1 filter, write an equivalent DOM filter configuration. To generate the XSLT that is to be used by Zebra, run something like the following on the output of this utility: xsltproc ZEBRA_CFG_DIR/xsl/koha-indexdefs-to-zebra.xsl \ biblio-koha-indexdefs.xml \ > ZEBRA_CFG_DIR/marc_defs/marc21/biblios/biblio-zebra-indexdefs.xsl The above example assumes that the output of the program was named biblio-koha-indexdefs.xml. This commit also introduces Koha::Indexer::Utils, a new package for miscellaneous routines that support Koha's indexing definitions. Signed-off-by: Galen Charlton <gmc@esilibrary.com> Signed-off-by: Jared Camins-Esakov <jcamins@cpbibliography.com> Signed-off-by: Paul Poulain <paul.poulain@biblibre.com>
This commit is contained in:
parent
f3e5160111
commit
4559fa3a27
2 changed files with 294 additions and 0 deletions
222
Koha/Indexer/Utils.pm
Normal file
222
Koha/Indexer/Utils.pm
Normal file
|
@ -0,0 +1,222 @@
|
|||
package Koha::Indexer::Utils;
|
||||
|
||||
# Copyright (c) 2012 Equinox Software, Inc.
|
||||
# This file is part of Koha.
|
||||
#
|
||||
# Koha is free software; you can redistribute it and/or modify it under the
|
||||
# terms of the GNU General Public License as published by the Free Software
|
||||
# Foundation; either version 2 of the License, or (at your option) any later
|
||||
# version.
|
||||
#
|
||||
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
||||
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along with
|
||||
# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
|
||||
# Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use 5.010;
|
||||
|
||||
use XML::LibXML;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Koha::Indexer::Utils - utility functions for managing search indexes
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module contains utility functions for managing various aspects
|
||||
of Koha's bibliographic and authority search indexes.
|
||||
|
||||
=head1 FUNCTIONS
|
||||
|
||||
=cut
|
||||
|
||||
=head2 zebra_record_abs_to_dom
|
||||
|
||||
$dom_config = Koha::Indexer::Utils::zebra_record_abs_to_dom($record_abs_config, $marcflavour);
|
||||
|
||||
Given a string containing the contents of a record.abs configuration file as
|
||||
used by Zebra's GRS-1 filter, emit an equivalent DOM configuration.
|
||||
|
||||
=cut
|
||||
|
||||
our $idxNS = 'http://www.koha-community.org/schemas/index-defs';
|
||||
|
||||
# Convert the text of a GRS-1 record.abs file into a DOM index-definition
# document and return that document serialized as a string.
#
#   $grs1_cfg    - full contents of the record.abs file, as one string
#   $marcflavour - handed through unchanged to _append_grs1_def_to_dom_cfg
sub zebra_record_abs_to_dom {
    my ($grs1_cfg, $marcflavour) = @_;

    chomp $grs1_cfg;
    my @cfg_lines = split /\n/, $grs1_cfg, -1;

    # First pass: build one structure per record.abs line that is not
    # ignorable, remembering the original text and its 1-based line
    # number so both can be echoed into the output.
    my @defs;
    my $lineno = 0;
    foreach my $cfg_line (@cfg_lines) {
        $lineno++;
        next if _can_ignore_grs1_cfg_line($cfg_line);

        my $def = _parse_grs1_cfg_line($cfg_line);
        $def->{orig_def} = $cfg_line;
        $def->{lineno}   = $lineno;
        push @defs, $def;
    }

    # Second pass: hang every definition off a namespaced <index_defs>
    # root element -- if you squint hard, this is the beginning of a more
    # general definition language for Koha index definitions.
    my $dom_cfg = XML::LibXML::Document->new('1.0', 'utf-8');
    my $root    = $dom_cfg->createElement('index_defs');
    $root->setNamespace($idxNS, 'kohaidx');
    _append_grs1_def_to_dom_cfg($dom_cfg, $root, $_, $marcflavour) for @defs;

    # Attach the root and hand back the serialized document.
    $dom_cfg->setDocumentElement($root);
    return $dom_cfg->toString(1);
}
|
||||
|
||||
#
|
||||
# bunch of utility functions for zebra_record_abs_to_dom
|
||||
#
|
||||
# Decide whether a record.abs line carries nothing that needs converting.
# Returns 1 when the line can be skipped, 0 otherwise.
sub _can_ignore_grs1_cfg_line {
    my ($cfg_line) = @_;

    # empty or whitespace-only lines
    return 1 if $cfg_line =~ /^\s*$/;

    # comment lines
    return 1 if $cfg_line =~ /^#/;

    # lines starting with one of these keywords are not index definitions
    return 1 if $cfg_line =~ /^(encoding|name|attset|esetname|marc|systag|xpath)/;

    # DOM filter automatically indexes all tokens, so no need to deal
    # with 'all any' lines in record.abs
    return 1 if $cfg_line =~ /^all/;

    return 0;
}
|
||||
|
||||
# Parse one record.abs line.  For a "melm ..." line, or an "xelm" line
# whose path starts with /record/, the remainder of the line is handed to
# _parse_xelm_melm and its result returned; any other line yields undef.
sub _parse_grs1_cfg_line {
    my ($cfg_line) = @_;

    # Try the melm form first, then fall back to the xelm /record/ form.
    my ($definition) = $cfg_line =~ /^melm\s+(.*)/;
    ($definition) = $cfg_line =~ m!^xelm /record/(.*)!
        unless defined $definition;

    my $parsed;
    $parsed = _parse_xelm_melm($definition) if defined $definition;
    return $parsed;
}
|
||||
|
||||
# Parse the body of a melm/xelm definition (everything after the keyword).
#
# The first whitespace-delimited token is the field, optionally written
# as TAG$SUBFIELD; the rest is a comma-separated list of index
# definitions.
#
# Returns a hashref with keys:
#   tag        - the part of the field before the first '$'
#   subfield   - the part after the first '$' (undef when there is none)
#   index_defs - arrayref of the structures returned by
#                _parse_grs1_index_def, one per listed index
sub _parse_xelm_melm {
    my $line = shift;

    my ($field, $index_defs) = split /\s+/, $line, 2;

    # A definition may name a field without listing any indexes (or be
    # empty altogether); default the missing pieces so that the
    # substitution and splits below do not operate on undef.
    $field      //= '';
    $index_defs //= '';

    # munge fixed field range indicators -- this must happen before the
    # split on commas below, because range(data,N,M) itself contains
    # commas
    $index_defs =~ s/range\(data,(\d+),(\d+)\)/$1:$2/g;

    my ($tag, $subfield) = split /\$/, $field, 2;
    return {
        tag        => $tag,
        subfield   => $subfield,
        index_defs => [ map { _parse_grs1_index_def($_) } split /,/, $index_defs ],
    };
}
|
||||
|
||||
# Break a single colon-separated index definition into its parts.
#
# Returns a hashref with the keys name, index_type, offset and length,
# filled positionally from the definition; parts that are absent are
# left undef, except index_type (see below).
sub _parse_grs1_index_def {
    my ($index_def) = @_;

    my %parsed;
    @parsed{qw(name index_type offset length)} = split /:/, $index_def, -1;

    # if the original index definition didn't specify an index
    # type, set it 'w' -- the DOM filter needs the index type
    # to be specified explicitly
    $parsed{index_type} //= 'w';

    return \%parsed;
}
|
||||
|
||||
# Append the DOM representation of one parsed record.abs line to $root.
#
#   $dom_cfg     - the XML::LibXML document used to create nodes
#   $root        - element that receives the new nodes
#   $grs1_def    - hashref for one line (keys used here: lineno, orig_def,
#                  tag, subfield, index_defs)
#   $marcflavour - accepted for interface compatibility; not used here
#
# A comment quoting the original line is always added.  What follows
# depends on the definition: tag plus subfield gives <index_subfields>,
# the tag 'leader' gives leader definitions, a numeric tag below 10 gives
# control-field definitions, and any other tag gives <index_data_field>.
# A definition without a tag contributes only the comment.
sub _append_grs1_def_to_dom_cfg {
    my ($dom_cfg, $root, $grs1_def, $marcflavour) = @_;

    # XML comments may not contain "--" or end in "-"; space such hyphens
    # out so an unusual record.abs line cannot make the output malformed.
    my $comment_text = 'record.abs line ' .
                       $grs1_def->{lineno} . ': ' .
                       $grs1_def->{orig_def};
    $comment_text =~ s/-(?=-)/- /g;
    $comment_text =~ s/-\z/- /;
    $root->appendChild($dom_cfg->createComment($comment_text));

    my $tag = $grs1_def->{tag};
    return unless defined $tag;

    if (defined $grs1_def->{subfield}) {
        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_subfields');
        $dom_def->setAttribute('tag', $tag);
        $dom_def->setAttribute('subfields', $grs1_def->{subfield});
        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
        $root->appendChild($dom_def);
    }
    elsif ($tag eq 'leader') {
        # we're the leader
        _append_grs1_defs_for_leader($dom_cfg, $root, $grs1_def);
    }
    elsif ($tag =~ /\A\d+\z/ and $tag < 10) {
        # we're a control field; the digits-only guard keeps a
        # non-numeric tag from being compared numerically (which would
        # warn and treat it as 0)
        _append_grs1_defs_for_control_field($dom_cfg, $root, $grs1_def);
    }
    else {
        # we're indexing an entire variable data field
        my $dom_def = $dom_cfg->createElementNS($idxNS, 'index_data_field');
        $dom_def->setAttribute('tag', $tag);
        _append_target_indexes($dom_cfg, $dom_def, $grs1_def);
        $root->appendChild($dom_def);
    }

    return;
}
|
||||
|
||||
# Add one target index child to $dom_def for every entry in the
# definition's index_defs list.
sub _append_target_indexes {
    my ($dom_cfg, $dom_def, $grs1_def) = @_;

    my @index_defs = @{ $grs1_def->{index_defs} };
    for my $one_index (@index_defs) {
        _append_one_target_index($dom_cfg, $dom_def, $one_index);
    }
}
|
||||
|
||||
# Create a namespaced <target_index> element whose text is
# "NAME:INDEX_TYPE" for the given index definition, and append it to
# $dom_def.
sub _append_one_target_index {
    my ($dom_cfg, $dom_def, $index_def) = @_;

    my $label  = $index_def->{name} . ':' . $index_def->{index_type};
    my $target = $dom_cfg->createElementNS($idxNS, 'target_index');
    $target->appendText($label);
    $dom_def->appendChild($target);
}
|
||||
|
||||
# Emit one <index_leader> element under $root for each index definition
# attached to a leader line.  The offset and length attributes are set
# only when the index definition supplies both.
sub _append_grs1_defs_for_leader {
    my ($dom_cfg, $root, $grs1_def) = @_;

    my @range_attrs = qw(offset length);

    for my $index_def (@{ $grs1_def->{index_defs} }) {
        my $leader_def = $dom_cfg->createElementNS($idxNS, 'index_leader');

        my $has_range = !grep { !defined $index_def->{$_} } @range_attrs;
        if ($has_range) {
            $leader_def->setAttribute($_, $index_def->{$_}) for @range_attrs;
        }

        _append_one_target_index($dom_cfg, $leader_def, $index_def);
        $root->appendChild($leader_def);
    }
}
|
||||
|
||||
# Emit one <index_control_field> element under $root for each index
# definition attached to a control-field line.  Every element carries the
# field's tag; offset and length are added only when the index definition
# supplies both.
sub _append_grs1_defs_for_control_field {
    my ($dom_cfg, $root, $grs1_def) = @_;

    my @range_attrs = qw(offset length);

    for my $index_def (@{ $grs1_def->{index_defs} }) {
        my $control_def = $dom_cfg->createElementNS($idxNS, 'index_control_field');
        $control_def->setAttribute('tag', $grs1_def->{tag});

        my $has_range = !grep { !defined $index_def->{$_} } @range_attrs;
        if ($has_range) {
            $control_def->setAttribute($_, $index_def->{$_}) for @range_attrs;
        }

        _append_one_target_index($dom_cfg, $control_def, $index_def);
        $root->appendChild($control_def);
    }
}
|
||||
|
||||
1;
|
72
misc/maintenance/make_zebra_dom_cfg_from_record_abs
Executable file
72
misc/maintenance/make_zebra_dom_cfg_from_record_abs
Executable file
|
@ -0,0 +1,72 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# Copyright (c) 2012 Equinox Software, Inc.
|
||||
# This file is part of Koha.
|
||||
#
|
||||
# Koha is free software; you can redistribute it and/or modify it under the
|
||||
# terms of the GNU General Public License as published by the Free Software
|
||||
# Foundation; either version 2 of the License, or (at your option) any later
|
||||
# version.
|
||||
#
|
||||
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
||||
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along with
|
||||
# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
|
||||
# Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use 5.010;
|
||||
|
||||
use Koha::Indexer::Utils;
|
||||
|
||||
use Getopt::Long;
|
||||
|
||||
# Command-line driver: read a record.abs file, convert it with
# Koha::Indexer::Utils::zebra_record_abs_to_dom, and write the result.

my $input_file;
my $output_file;
my $want_help;
my $result = GetOptions(
    'input:s'  => \$input_file,
    'output:s' => \$output_file,
    'help|h'   => \$want_help,
);

# The ':s' option form lets --input/--output be given with no value, in
# which case the variable is set to the empty string; treat that the same
# as the option being absent.
my $have_files = defined $input_file  && length $input_file
              && defined $output_file && length $output_file;

if ( not $result or $want_help or not $have_files ) {
    print_usage();
    # Only an explicit, successfully parsed --help counts as success;
    # bad or missing options are reported through a non-zero status.
    exit( ( $result and $want_help ) ? 0 : 1 );
}

open my $infh, '<', $input_file or die "$0: cannot open input file $input_file: $!\n";
my $grs1_cfg = join('', <$infh>);
close $infh;

my $dom_cfg = Koha::Indexer::Utils::zebra_record_abs_to_dom($grs1_cfg);

# Open the output only once the conversion has succeeded, so that a
# failure above does not leave an existing output file truncated.
open my $outfh, '>', $output_file or die "$0: cannot open output file $output_file: $!\n";
print {$outfh} $dom_cfg or die "$0: cannot write to output file $output_file: $!\n";
# Buffered write errors may only surface when the handle is closed.
close $outfh or die "$0: cannot close output file $output_file: $!\n";
|
||||
|
||||
# Print the usage message for this script to standard output.
#
# The closing sentence names biblio-koha-indexdefs.xml, matching the file
# that the example xsltproc command takes as its input.
sub print_usage {
    print <<"_USAGE_";
$0: generate a DOM filter Zebra index config from a GRS-1 config

Given a Zebra record.abs file containing a set of index definitions for
Zebra's GRS-1 filter, write an equivalent DOM filter configuration.

To generate the XSLT that is to be used by Zebra, run something like
the following on the output of this utility:

xsltproc ZEBRA_CFG_DIR/xsl/koha-indexdefs-to-zebra.xsl \\
biblio-koha-indexdefs.xml \\
> ZEBRA_CFG_DIR/marc_defs/marc21/biblios/biblio-zebra-indexdefs.xsl

The above example assumes that the output of the program was named
biblio-koha-indexdefs.xml.

Parameters:
--input input file name
--output output file name
--help or -h show this message
_USAGE_
}
|
Loading…
Reference in a new issue