Main Koha release repository
https://koha-community.org
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
551 lines
16 KiB
551 lines
16 KiB
package Koha::SearchEngine::Elasticsearch;
|
|
|
|
# Copyright 2015 Catalyst IT
|
|
#
|
|
# This file is part of Koha.
|
|
#
|
|
# Koha is free software; you can redistribute it and/or modify it under the
|
|
# terms of the GNU General Public License as published by the Free Software
|
|
# Foundation; either version 3 of the License, or (at your option) any later
|
|
# version.
|
|
#
|
|
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
|
|
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License along
|
|
# with Koha; if not, write to the Free Software Foundation, Inc.,
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
use base qw(Class::Accessor);
|
|
|
|
use C4::Context;
|
|
|
|
use Koha::Database;
|
|
use Koha::Exceptions::Config;
|
|
use Koha::SearchFields;
|
|
use Koha::SearchMarcMaps;
|
|
|
|
use Carp;
|
|
use JSON;
|
|
use Modern::Perl;
|
|
use Readonly;
|
|
use Search::Elasticsearch;
|
|
use Try::Tiny;
|
|
use YAML::Syck;
|
|
|
|
use Data::Dumper; # TODO remove
|
|
|
|
__PACKAGE__->mk_ro_accessors(qw( index ));
|
|
__PACKAGE__->mk_accessors(qw( sort_fields ));
|
|
|
|
# Constants to refer to the standard index names
|
|
Readonly our $BIBLIOS_INDEX => 'biblios';
|
|
Readonly our $AUTHORITIES_INDEX => 'authorities';
|
|
|
|
=head1 NAME
|
|
|
|
Koha::SearchEngine::Elasticsearch - Base module for things using elasticsearch
|
|
|
|
=head1 ACCESSORS
|
|
|
|
=over 4
|
|
|
|
=item index
|
|
|
|
The name of the index to use, generally 'biblios' or 'authorities'.
|
|
|
|
=back
|
|
|
|
=head1 FUNCTIONS
|
|
|
|
=cut
|
|
|
|
sub new {
|
|
my $class = shift @_;
|
|
my $self = $class->SUPER::new(@_);
|
|
# Check for a valid index
|
|
croak('No index name provided') unless $self->index;
|
|
return $self;
|
|
}
|
|
|
|
=head2 get_elasticsearch_params
|
|
|
|
my $params = $self->get_elasticsearch_params();
|
|
|
|
This provides a hashref that contains the parameters for connecting to the
|
|
ElasicSearch servers, in the form:
|
|
|
|
{
|
|
'nodes' => ['127.0.0.1:9200', 'anotherserver:9200'],
|
|
'index_name' => 'koha_instance_index',
|
|
}
|
|
|
|
This is configured by the following in the C<config> block in koha-conf.xml:
|
|
|
|
<elasticsearch>
|
|
<server>127.0.0.1:9200</server>
|
|
<server>anotherserver:9200</server>
|
|
<index_name>koha_instance</index_name>
|
|
</elasticsearch>
|
|
|
|
=cut
|
|
|
|
sub get_elasticsearch_params {
|
|
my ($self) = @_;
|
|
|
|
# Copy the hash so that we're not modifying the original
|
|
my $conf = C4::Context->config('elasticsearch');
|
|
die "No 'elasticsearch' block is defined in koha-conf.xml.\n" if ( !$conf );
|
|
my $es = { %{ $conf } };
|
|
|
|
# Helpfully, the multiple server lines end up in an array for us anyway
|
|
# if there are multiple ones, but not if there's only one.
|
|
my $server = $es->{server};
|
|
delete $es->{server};
|
|
if ( ref($server) eq 'ARRAY' ) {
|
|
|
|
# store it called 'nodes' (which is used by newer Search::Elasticsearch)
|
|
$es->{nodes} = $server;
|
|
}
|
|
elsif ($server) {
|
|
$es->{nodes} = [$server];
|
|
}
|
|
else {
|
|
die "No elasticsearch servers were specified in koha-conf.xml.\n";
|
|
}
|
|
die "No elasticserver index_name was specified in koha-conf.xml.\n"
|
|
if ( !$es->{index_name} );
|
|
# Append the name of this particular index to our namespace
|
|
$es->{index_name} .= '_' . $self->index;
|
|
|
|
$es->{key_prefix} = 'es_';
|
|
return $es;
|
|
}
|
|
|
|
=head2 get_elasticsearch_settings
|
|
|
|
my $settings = $self->get_elasticsearch_settings();
|
|
|
|
This provides the settings provided to elasticsearch when an index is created.
|
|
These can do things like define tokenisation methods.
|
|
|
|
A hashref containing the settings is returned.
|
|
|
|
=cut
|
|
|
|
sub get_elasticsearch_settings {
|
|
my ($self) = @_;
|
|
|
|
# Ultimately this should come from a file or something, and not be
|
|
# hardcoded.
|
|
my $settings = {
|
|
index => {
|
|
analysis => {
|
|
analyzer => {
|
|
analyser_phrase => {
|
|
tokenizer => 'icu_tokenizer',
|
|
filter => ['icu_folding'],
|
|
},
|
|
analyser_standard => {
|
|
tokenizer => 'icu_tokenizer',
|
|
filter => ['icu_folding'],
|
|
},
|
|
},
|
|
}
|
|
}
|
|
};
|
|
return $settings;
|
|
}
|
|
|
|
=head2 get_elasticsearch_mappings
|
|
|
|
my $mappings = $self->get_elasticsearch_mappings();
|
|
|
|
This provides the mappings that get passed to elasticsearch when an index is
|
|
created.
|
|
|
|
=cut
|
|
|
|
sub get_elasticsearch_mappings {
|
|
my ($self) = @_;
|
|
|
|
# TODO cache in the object?
|
|
my $mappings = {
|
|
data => {
|
|
_all => {type => "string", analyzer => "analyser_standard"},
|
|
properties => {
|
|
record => {
|
|
store => "true",
|
|
include_in_all => JSON::false,
|
|
type => "text",
|
|
},
|
|
}
|
|
}
|
|
};
|
|
my %sort_fields;
|
|
my $marcflavour = lc C4::Context->preference('marcflavour');
|
|
$self->_foreach_mapping(
|
|
sub {
|
|
my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
|
|
return if $marc_type ne $marcflavour;
|
|
# TODO if this gets any sort of complexity to it, it should
|
|
# be broken out into its own function.
|
|
|
|
# TODO be aware of date formats, but this requires pre-parsing
|
|
# as ES will simply reject anything with an invalid date.
|
|
my $es_type =
|
|
$type eq 'boolean'
|
|
? 'boolean'
|
|
: 'text';
|
|
|
|
if ($es_type eq 'boolean') {
|
|
$mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_boolean( $name, $es_type, $facet, $suggestible, $sort, $marc_type );
|
|
return; #Boolean cannot have facets nor sorting nor suggestions
|
|
} else {
|
|
$mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_default( $name, $es_type, $facet, $suggestible, $sort, $marc_type );
|
|
}
|
|
|
|
if ($facet) {
|
|
$mappings->{data}{properties}{ $name . '__facet' } = {
|
|
type => "keyword",
|
|
};
|
|
}
|
|
if ($suggestible) {
|
|
$mappings->{data}{properties}{ $name . '__suggestion' } = {
|
|
type => 'completion',
|
|
analyzer => 'simple',
|
|
search_analyzer => 'simple',
|
|
};
|
|
}
|
|
# Sort is a bit special as it can be true, false, undef.
|
|
# We care about "true" or "undef",
|
|
# "undef" means to do the default thing, which is make it sortable.
|
|
if ($sort || !defined $sort) {
|
|
$mappings->{data}{properties}{ $name . '__sort' } = {
|
|
search_analyzer => "analyser_phrase",
|
|
analyzer => "analyser_phrase",
|
|
type => "text",
|
|
include_in_all => JSON::false,
|
|
fields => {
|
|
phrase => {
|
|
type => "keyword",
|
|
},
|
|
},
|
|
};
|
|
$sort_fields{$name} = 1;
|
|
}
|
|
}
|
|
);
|
|
$self->sort_fields(\%sort_fields);
|
|
return $mappings;
|
|
}
|
|
|
|
=head2 _elasticsearch_mapping_for_*
|
|
|
|
Get the ES mappings for the given data type or a special mapping case
|
|
|
|
Receives the same parameters from the $self->_foreach_mapping() dispatcher
|
|
|
|
=cut
|
|
|
|
sub _elasticsearch_mapping_for_boolean {
|
|
my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
|
|
|
|
return {
|
|
type => $type,
|
|
null_value => 0,
|
|
};
|
|
}
|
|
|
|
sub _elasticsearch_mapping_for_default {
|
|
my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_;
|
|
|
|
return {
|
|
search_analyzer => "analyser_standard",
|
|
analyzer => "analyser_standard",
|
|
type => $type,
|
|
fields => {
|
|
phrase => {
|
|
search_analyzer => "analyser_phrase",
|
|
analyzer => "analyser_phrase",
|
|
type => "text",
|
|
},
|
|
raw => {
|
|
type => "keyword",
|
|
}
|
|
},
|
|
};
|
|
}
|
|
|
|
sub reset_elasticsearch_mappings {
|
|
my $mappings_yaml = C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/mappings.yaml';
|
|
my $indexes = LoadFile( $mappings_yaml );
|
|
|
|
while ( my ( $index_name, $fields ) = each %$indexes ) {
|
|
while ( my ( $field_name, $data ) = each %$fields ) {
|
|
my $field_type = $data->{type};
|
|
my $field_label = $data->{label};
|
|
my $mappings = $data->{mappings};
|
|
my $search_field = Koha::SearchFields->find_or_create({ name => $field_name, label => $field_label, type => $field_type }, { key => 'name' });
|
|
for my $mapping ( @$mappings ) {
|
|
my $marc_field = Koha::SearchMarcMaps->find_or_create({ index_name => $index_name, marc_type => $mapping->{marc_type}, marc_field => $mapping->{marc_field} });
|
|
$search_field->add_to_search_marc_maps($marc_field, { facet => $mapping->{facet} || 0, suggestible => $mapping->{suggestible} || 0, sort => $mapping->{sort} } );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
# This overrides the accessor provided by Class::Accessor so that if
|
|
# sort_fields isn't set, then it'll generate it.
|
|
sub sort_fields {
|
|
my $self = shift;
|
|
if (@_) {
|
|
$self->_sort_fields_accessor(@_);
|
|
return;
|
|
}
|
|
my $val = $self->_sort_fields_accessor();
|
|
return $val if $val;
|
|
|
|
# This will populate the accessor as a side effect
|
|
$self->get_elasticsearch_mappings();
|
|
return $self->_sort_fields_accessor();
|
|
}
|
|
|
|
# Provides the rules for data conversion.
|
|
sub get_fixer_rules {
|
|
my ($self) = @_;
|
|
|
|
my $marcflavour = lc C4::Context->preference('marcflavour');
|
|
my @rules;
|
|
|
|
$self->_foreach_mapping(
|
|
sub {
|
|
my ( $name, $type, $facet, $suggestible, $sort, $marc_type, $marc_field ) = @_;
|
|
return if $marc_type ne $marcflavour;
|
|
my $options ='';
|
|
|
|
push @rules, "marc_map('$marc_field','${name}.\$append', $options)";
|
|
if ($facet) {
|
|
push @rules, "marc_map('$marc_field','${name}__facet.\$append', $options)";
|
|
}
|
|
if ($suggestible) {
|
|
push @rules,
|
|
#"marc_map('$marc_field','${name}__suggestion.input.\$append', '')"; #must not have nested data structures in .input
|
|
"marc_map('$marc_field','${name}__suggestion.input.\$append')";
|
|
}
|
|
if ( $type eq 'boolean' ) {
|
|
|
|
# boolean gets special handling, basically if it doesn't exist,
|
|
# it's added and set to false. Otherwise we can't query it.
|
|
push @rules,
|
|
"unless exists('$name') add_field('$name', 0) end";
|
|
}
|
|
if ($type eq 'sum' ) {
|
|
push @rules, "sum('$name')";
|
|
}
|
|
if ($self->sort_fields()->{$name}) {
|
|
if ($sort || !defined $sort) {
|
|
push @rules, "marc_map('$marc_field','${name}__sort.\$append', $options)";
|
|
}
|
|
}
|
|
}
|
|
);
|
|
|
|
push @rules, "move_field(_id,es_id)"; #Also you must set the Catmandu::Store::ElasticSearch->new(key_prefix: 'es_');
|
|
return \@rules;
|
|
}
|
|
|
|
=head2 _foreach_mapping
|
|
|
|
$self->_foreach_mapping(
|
|
sub {
|
|
my ( $name, $type, $facet, $suggestible, $sort, $marc_type,
|
|
$marc_field )
|
|
= @_;
|
|
return unless $marc_type eq 'marc21';
|
|
print "Data comes from: " . $marc_field . "\n";
|
|
}
|
|
);
|
|
|
|
This allows you to apply a function to each entry in the elasticsearch mappings
|
|
table, in order to build the mappings for whatever is needed.
|
|
|
|
In the provided function, the files are:
|
|
|
|
=over 4
|
|
|
|
=item C<$name>
|
|
|
|
The field name for elasticsearch (corresponds to the 'mapping' column in the
|
|
database.
|
|
|
|
=item C<$type>
|
|
|
|
The type for this value, e.g. 'string'.
|
|
|
|
=item C<$facet>
|
|
|
|
True if this value should be facetised. This only really makes sense if the
|
|
field is understood by the facet processing code anyway.
|
|
|
|
=item C<$sort>
|
|
|
|
True if this is a field that a) needs special sort handling, and b) if it
|
|
should be sorted on. False if a) but not b). Undef if not a). This allows,
|
|
for example, author to be sorted on but not everything marked with "author"
|
|
to be included in that sort.
|
|
|
|
=item C<$marc_type>
|
|
|
|
A string that indicates the MARC type that this mapping is for, e.g. 'marc21',
|
|
'unimarc', 'normarc'.
|
|
|
|
=item C<$marc_field>
|
|
|
|
A string that describes the MARC field that contains the data to extract.
|
|
These are of a form suited to Catmandu's MARC fixers.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub _foreach_mapping {
|
|
my ( $self, $sub ) = @_;
|
|
|
|
# TODO use a caching framework here
|
|
my $search_fields = Koha::Database->schema->resultset('SearchField')->search(
|
|
{
|
|
'search_marc_map.index_name' => $self->index,
|
|
},
|
|
{ join => { search_marc_to_fields => 'search_marc_map' },
|
|
'+select' => [
|
|
'search_marc_to_fields.facet',
|
|
'search_marc_to_fields.suggestible',
|
|
'search_marc_to_fields.sort',
|
|
'search_marc_map.marc_type',
|
|
'search_marc_map.marc_field',
|
|
],
|
|
'+as' => [
|
|
'facet',
|
|
'suggestible',
|
|
'sort',
|
|
'marc_type',
|
|
'marc_field',
|
|
],
|
|
}
|
|
);
|
|
|
|
while ( my $search_field = $search_fields->next ) {
|
|
$sub->(
|
|
$search_field->name,
|
|
$search_field->type,
|
|
$search_field->get_column('facet'),
|
|
$search_field->get_column('suggestible'),
|
|
$search_field->get_column('sort'),
|
|
$search_field->get_column('marc_type'),
|
|
$search_field->get_column('marc_field'),
|
|
);
|
|
}
|
|
}
|
|
|
|
=head2 process_error
|
|
|
|
die process_error($@);
|
|
|
|
This parses an Elasticsearch error message and produces a human-readable
|
|
result from it. This result is probably missing all the useful information
|
|
that you might want in diagnosing an issue, so the warning is also logged.
|
|
|
|
Note that currently the resulting message is not internationalised. This
|
|
will happen eventually by some method or other.
|
|
|
|
=cut
|
|
|
|
sub process_error {
|
|
my ($self, $msg) = @_;
|
|
|
|
warn $msg; # simple logging
|
|
|
|
# This is super-primitive
|
|
return "Unable to understand your search query, please rephrase and try again.\n" if $msg =~ /ParseException/;
|
|
|
|
return "Unable to perform your search. Please try again.\n";
|
|
}
|
|
|
|
=head2 _read_configuration
|
|
|
|
my $conf = _read_configuration();
|
|
|
|
Reads the I<configuration file> and returns a hash structure with the
|
|
configuration information. It raises an exception if mandatory entries
|
|
are missing.
|
|
|
|
The hashref structure has the following form:
|
|
|
|
{
|
|
'nodes' => ['127.0.0.1:9200', 'anotherserver:9200'],
|
|
'index_name' => 'koha_instance',
|
|
}
|
|
|
|
This is configured by the following in the C<config> block in koha-conf.xml:
|
|
|
|
<elasticsearch>
|
|
<server>127.0.0.1:9200</server>
|
|
<server>anotherserver:9200</server>
|
|
<index_name>koha_instance</index_name>
|
|
</elasticsearch>
|
|
|
|
=cut
|
|
|
|
sub _read_configuration {
|
|
|
|
my $configuration;
|
|
|
|
my $conf = C4::Context->config('elasticsearch');
|
|
Koha::Exceptions::Config::MissingEntry->throw(
|
|
"Missing 'elasticsearch' block in config file")
|
|
unless defined $conf;
|
|
|
|
if ( $conf && $conf->{server} ) {
|
|
my $nodes = $conf->{server};
|
|
if ( ref($nodes) eq 'ARRAY' ) {
|
|
$configuration->{nodes} = $nodes;
|
|
}
|
|
else {
|
|
$configuration->{nodes} = [$nodes];
|
|
}
|
|
}
|
|
else {
|
|
Koha::Exceptions::Config::MissingEntry->throw(
|
|
"Missing 'server' entry in config file for elasticsearch");
|
|
}
|
|
|
|
if ( defined $conf->{index_name} ) {
|
|
$configuration->{index_name} = $conf->{index_name};
|
|
}
|
|
else {
|
|
Koha::Exceptions::Config::MissingEntry->throw(
|
|
"Missing 'index_name' entry in config file for elasticsearch");
|
|
}
|
|
|
|
return $configuration;
|
|
}
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=head1 AUTHOR
|
|
|
|
=over 4
|
|
|
|
=item Chris Cormack C<< <chrisc@catalyst.net.nz> >>
|
|
|
|
=item Robin Sheat C<< <robin@catalyst.net.nz> >>
|
|
|
|
=item Jonathan Druart C<< <jonathan.druart@bugs.koha-community.org> >>
|
|
|
|
=back
|
|
|
|
=cut
|
|
|