Main Koha release repository
https://koha-community.org
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1203 lines
39 KiB
1203 lines
39 KiB
package Koha::SearchEngine::Elasticsearch::QueryBuilder;
|
|
|
|
# This file is part of Koha.
|
|
#
|
|
# Copyright 2014 Catalyst IT Ltd.
|
|
#
|
|
# Koha is free software; you can redistribute it and/or modify it
|
|
# under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# Koha is distributed in the hope that it will be useful, but
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with Koha; if not, see <http://www.gnu.org/licenses>.
|
|
|
|
=head1 NAME
|
|
|
|
Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
|
|
query objects from user-supplied queries
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This provides the functions that take a user-supplied search query, and
|
|
provides something that can be given to elasticsearch to get answers.
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use Koha::SearchEngine::Elasticsearch::QueryBuilder;
|
|
$builder = Koha::SearchEngine::Elasticsearch->new({ index => $index });
|
|
my $simple_query = $builder->build_query("hello");
|
|
# This is currently undocumented because the original code is undocumented
|
|
my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
|
|
|
|
=head1 METHODS
|
|
|
|
=cut
|
|
|
|
use base qw(Koha::SearchEngine::Elasticsearch);
|
|
use Carp;
|
|
use JSON;
|
|
use List::MoreUtils qw/ each_array /;
|
|
use Modern::Perl;
|
|
use URI::Escape;
|
|
|
|
use C4::Context;
|
|
use Koha::Exceptions;
|
|
use Koha::Caches;
|
|
|
|
=head2 build_query
|
|
|
|
my $simple_query = $builder->build_query("hello", %options)
|
|
|
|
This will build a query that can be issued to elasticsearch from the provided
|
|
string input. This expects a lucene style search form (see
|
|
L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
|
|
for details.)
|
|
|
|
It'll make an attempt to respect the various query options.
|
|
|
|
Additional options can be provided with the C<%options> hash.
|
|
|
|
=over 4
|
|
|
|
=item sort
|
|
|
|
This should be an arrayref of hashrefs, each containing a C<field> and an
|
|
C<direction> (optional, defaults to C<asc>.) The results will be sorted
|
|
according to these values. Valid values for C<direction> are 'asc' and 'desc'.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub build_query {
    my ( $self, $query, %options ) = @_;

    # NOTE(review): the original also read the QueryStemming and
    # QueryAutoTruncate preferences here, but never used them in this
    # method — dead locals, removed. Truncation is applied by the
    # callers (see build_query_compat) before the query reaches us.
    my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

    # An undefined query means "match everything".
    $query = '*' unless defined $query;

    my $res;
    my $fields = $self->_search_fields({
        is_opac         => $options{is_opac},
        weighted_fields => $options{weighted_fields},
    });
    if ($options{whole_record}) {
        # Also search the complete indexed MARC record.
        push @$fields, 'marc_data_array.*';
    }
    $res->{query} = {
        query_string => {
            query            => $query,
            fuzziness        => $fuzzy_enabled ? 'auto' : '0',
            default_operator => 'AND',
            fields           => $fields,
            lenient          => JSON::true,
            analyze_wildcard => JSON::true,
            type             => 'cross_fields',
        }
    };

    if ( $options{sort} ) {
        foreach my $sort ( @{ $options{sort} } ) {
            my ( $f, $d ) = @$sort{qw/ field direction /};
            die "Invalid sort direction, $d"
              if $d && ( $d ne 'asc' && $d ne 'desc' );
            $d = 'asc' unless $d;

            # Map to the sortable variant of the field.
            $f = $self->_sort_field($f);
            push @{ $res->{sort} }, { $f => { order => $d } };
        }
    }

    # See _convert_facets in Search.pm for how these get turned into
    # things that Koha can use.
    my $size = C4::Context->preference('FacetMaxCount');
    $res->{aggregations} = {
        author         => { terms => { field => "author__facet", size => $size } },
        subject        => { terms => { field => "subject__facet", size => $size } },
        itype          => { terms => { field => "itype__facet", size => $size } },
        location       => { terms => { field => "location__facet", size => $size } },
        'su-geo'       => { terms => { field => "su-geo__facet", size => $size } },
        'title-series' => { terms => { field => "title-series__facet", size => $size } },
        ccode          => { terms => { field => "ccode__facet", size => $size } },
        ln             => { terms => { field => "ln__facet", size => $size } },
    };

    # Home and/or holding library facets depend on a preference.
    my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
    if (   $display_library_facets eq 'both'
        or $display_library_facets eq 'home' ) {
        $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
    }
    if (   $display_library_facets eq 'both'
        or $display_library_facets eq 'holding' ) {
        $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
    }
    return $res;
}
|
|
|
|
=head2 build_query_compat
|
|
|
|
my (
|
|
$error, $query, $simple_query, $query_cgi,
|
|
$query_desc, $limit, $limit_cgi, $limit_desc,
|
|
$stopwords_removed, $query_type
|
|
)
|
|
= $builder->build_query_compat( \@operators, \@operands, \@indexes,
|
|
\@limits, \@sort_by, $scan, $lang, $params );
|
|
|
|
This handles a search using the same api as L<C4::Search::buildQuery> does.
|
|
|
|
A very simple query will go in with C<$operands> set to ['query'], and
|
|
C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
|
|
C<$query> set to something that can perform the search, C<$simple_query>
|
|
set to just the search term, C<$query_cgi> set to something that can
|
|
reproduce this search, and C<$query_desc> set to something else.
|
|
|
|
=cut
|
|
|
|
sub build_query_compat {
    my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
        $lang, $params )
      = @_;

    my $query;
    my $query_str = '';
    my $search_param_query_str = '';
    my $limits = ();
    if ( $scan ) {
        # Scan searches are answered with an aggregation, not a full-text
        # query; see _build_scan_query.
        ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
        $search_param_query_str = $query_str;
    } else {
        my @sort_params = $self->_convert_sort_fields(@$sort_by);
        # NOTE: @index_params may contain undef entries for indexes that
        # did not convert; they are skipped below via "$index ? ... : ()".
        my @index_params = $self->_convert_index_fields(@$indexes);
        $limits = $self->_fix_limit_special_cases($orig_limits);
        if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
        # Merge the indexes in with the search terms and the operands so that
        # each search thing is a handy unit.
        unshift @$operators, undef;    # The first one can't have an op
        my @search_params;
        my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
        my $ea = each_array( @$operands, @$operators, @index_params );
        while ( my ( $oand, $otor, $index ) = $ea->() ) {
            next if ( !defined($oand) || $oand eq '' );
            $oand = $self->_clean_search_term($oand);
            $oand = $self->_truncate_terms($oand) if ($truncate);
            push @search_params, {
                operand  => $oand,                                  # the search terms
                operator => defined($otor) ? uc $otor : undef,      # AND and so on
                $index ? %$index : (),
            };
        }

        # We build a string query from limits and the queries. An alternative
        # would be to pass them separately into build_query and let it build
        # them into a structured ES query itself. Maybe later, though that'd be
        # more robust.
        $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
        $query_str = join( ' AND ',
            $search_param_query_str || (),
            $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );

        # If there's no query on the left, let's remove the junk left behind
        $query_str =~ s/^ AND //;
        my %options;
        $options{sort} = \@sort_params;
        $options{is_opac} = $params->{is_opac};
        $options{weighted_fields} = $params->{weighted_fields};
        $options{whole_record} = $params->{whole_record};
        $query = $self->build_query( $query_str, %options );
    }

    # We roughly emulate the CGI parameters of the zebra query builder
    my $query_cgi = '';
    shift @$operators;    # Shift out the one we unshifted before
    my $ea = each_array( @$operands, @$operators, @$indexes );
    while ( my ( $oand, $otor, $index ) = $ea->() ) {
        $query_cgi .= '&' if $query_cgi;
        $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
        $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
    }
    $query_cgi .= '&scan=1' if ( $scan );

    # A single-operand search is considered "simple" and is echoed back
    # verbatim as both the simple query and the description.
    my $simple_query;
    $simple_query = $operands->[0] if @$operands == 1;
    my $query_desc;
    if ( $simple_query ) {
        $query_desc = $simple_query;
    } else {
        $query_desc = $search_param_query_str;
    }
    my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
    my $limit_cgi = ( $orig_limits and @$orig_limits )
      ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
      : '';
    my $limit_desc;
    $limit_desc = "$limit" if $limit;

    # First and last two slots (error, stopwords_removed, query_type) are
    # unused with Elasticsearch but kept for C4::Search API compatibility.
    return (
        undef, $query, $simple_query, $query_cgi, $query_desc,
        $limit, $limit_cgi, $limit_desc, undef, undef
    );
}
|
|
|
|
=head2 build_authorities_query
|
|
|
|
my $query = $builder->build_authorities_query(\%search);
|
|
|
|
This takes a nice description of an authority search and turns it into a black-box
|
|
query that can then be passed to the appropriate searcher.
|
|
|
|
The search description is a hashref that looks something like:
|
|
|
|
{
|
|
searches => [
|
|
{
|
|
where => 'Heading', # search the main entry
|
|
operator => 'exact', # require an exact match
|
|
value => 'frogs', # the search string
|
|
},
|
|
{
|
|
where => '', # search all entries
|
|
operator => '', # default keyword, right truncation
|
|
value => 'pond',
|
|
},
|
|
],
|
|
sort => {
|
|
field => 'Heading',
|
|
order => 'desc',
|
|
},
|
|
authtypecode => 'TOPIC_TERM',
|
|
}
|
|
|
|
=cut
|
|
|
|
sub build_authorities_query {
    my ( $self, $search ) = @_;

    # Start by making the query parts
    my @query_parts;

    foreach my $s ( @{ $search->{searches} } ) {
        my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
        if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
            if ($wh) {
                # Match the whole field, case insensitive, UTF normalized.
                push @query_parts, { term => { "$wh.ci_raw" => $val } };
            }
            else {
                # Match the whole field for all searchable fields, case insensitive,
                # UTF normalized.
                # Given that field data is "The quick brown fox"
                # "The quick brown fox" and "the quick brown fox" will match
                # but not "quick brown fox".
                push @query_parts, {
                    multi_match => {
                        query  => $val,
                        fields => $self->_search_fields({ subfield => 'ci_raw' }),
                    }
                };
            }
        }
        elsif ( defined $op && $op eq 'start') {
            # Match the prefix within a field for all searchable fields.
            # Given that field data is "The quick brown fox"
            # "The quick bro" will match, but not "quick bro"

            # Does not seem to be a multi prefix query
            # so we need to create one
            if ($wh) {
                # Match prefix of the field.
                push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
            }
            else {
                # One prefix query per searchable field, OR'ed together
                # via a bool/should clause.
                my @prefix_queries;
                foreach my $field (@{$self->_search_fields()}) {
                    push @prefix_queries, {
                        prefix => { "$field.ci_raw" => $val }
                    };
                }
                push @query_parts, {
                    'bool' => {
                        'should' => \@prefix_queries,
                        'minimum_should_match' => 1
                    }
                };
            }
        }
        else {
            # Query all searchable fields.
            # Given that field data is "The quick brown fox"
            # a search containing any of the words will match, regardless
            # of order.

            # Clean and truncate each token in place before re-joining.
            my @tokens = $self->_split_query( $val );
            foreach my $token ( @tokens ) {
                $token = $self->_truncate_terms(
                    $self->_clean_search_term( $token )
                );
            }
            my $query = $self->_join_queries( @tokens );

            if ($wh) {
                push @query_parts, { query_string => {
                    default_field    => $wh,
                    analyze_wildcard => JSON::true,
                    query            => $query
                } };
            }
            else {
                push @query_parts, {
                    query_string => {
                        analyze_wildcard => JSON::true,
                        query            => $query,
                        fields           => $self->_search_fields(),
                    }
                };
            }
        }
    }

    # Merge the query parts appropriately
    # 'should' behaves like 'or'
    # 'must' behaves like 'and'
    # Zebra behaviour seems to match must so using that here
    my $elastic_query = {};
    $elastic_query->{bool}->{must} = \@query_parts;

    # Filter by authtypecode if set
    if ($search->{authtypecode}) {
        $elastic_query->{bool}->{filter} = {
            term => {
                "authtype.raw" => $search->{authtypecode}
            }
        };
    }

    my $query = {
        query => $elastic_query
    };

    # Add the sort stuff
    $query->{sort} = [ $search->{sort} ] if exists $search->{sort};

    return $query;
}
|
|
|
|
=head2 build_authorities_query_compat
|
|
|
|
my ($query) =
|
|
$builder->build_authorities_query_compat( \@marclist, \@and_or,
|
|
\@excluding, \@operator, \@value, $authtypecode, $orderby );
|
|
|
|
This builds a query for searching for authorities, in the style of
|
|
L<C4::AuthoritiesMarc::SearchAuthorities>.
|
|
|
|
Arguments:
|
|
|
|
=over 4
|
|
|
|
=item marclist
|
|
|
|
An arrayref containing where the particular term should be searched for.
|
|
Options are: mainmainentry, mainentry, match, match-heading, see-from, and
|
|
thesaurus. If left blank, any field is used.
|
|
|
|
=item and_or
|
|
|
|
Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
|
|
|
|
=item excluding
|
|
|
|
Also ignored.
|
|
|
|
=item operator
|
|
|
|
What form of search to do. Options are: is (phrase, no truncation, whole field
|
|
must match), = (number exact match), exact (phrase, no truncation, whole field
|
|
must match). If left blank, then word list, right truncated, anywhere is used.
|
|
|
|
=item value
|
|
|
|
The actual user-provided string value to search for.
|
|
|
|
=item authtypecode
|
|
|
|
The authority type code to search within. If blank, then all will be searched.
|
|
|
|
=item orderby
|
|
|
|
The order to sort the results by. Options are Relevance, HeadingAsc,
|
|
HeadingDsc, AuthidAsc, AuthidDsc.
|
|
|
|
=back
|
|
|
|
marclist, operator, and value must be the same length, and the values at
|
|
index /i/ all relate to each other.
|
|
|
|
This returns a query, which is a black box object that can be passed to the
|
|
appropriate search object.
|
|
|
|
=cut
|
|
|
|
# Maps the legacy marclist index names used by the authority-search API
# to the Elasticsearch field names. An empty string means "search all
# searchable fields" (see build_authorities_query).
our $koha_to_index_name = {
    mainmainentry   => 'heading-main',
    mainentry       => 'heading',
    match           => 'match',
    'match-heading' => 'match-heading',
    'see-from'      => 'match-heading-see-from',
    thesaurus       => 'subject-heading-thesaurus',
    any             => '',
    all             => ''
};
|
|
|
|
sub build_authorities_query_compat {
    my ( $self, $marclist, $and_or, $excluding, $operator, $value,
        $authtypecode, $orderby )
      = @_;

    # This turns the old-style many-options argument form into a more
    # extensible hash form that is understood by L<build_authorities_query>.
    my @searches;
    my $mappings = $self->get_elasticsearch_mappings();

    # Convert to lower case. $orderby may legitimately be absent, so
    # default it to '' to avoid an uninitialized-value warning from lc().
    $marclist = [map(lc, @{$marclist})];
    $orderby  = lc( $orderby // '' );

    my @indexes;
    # Resolve index aliases and warn about anything the mappings don't know.
    foreach my $m (@$marclist) {

        $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
        push @indexes, $m;
        warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
    }
    for ( my $i = 0 ; $i < @$value ; $i++ ) {
        next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
        push @searches,
          {
            where    => $indexes[$i],
            operator => $operator->[$i],
            value    => $value->[$i],
          };
    }

    # Translate the orderby keyword (Relevance/HeadingAsc/HeadingDsc/
    # AuthidAsc/AuthidDsc, already lower-cased) into a sort clause.
    my %sort;
    my $sort_field =
        ( $orderby =~ /^heading/ ) ? 'heading__sort'
      : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
      :                              undef;
    if ($sort_field) {
        my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
        %sort = ( $sort_field => $sort_order, );
    }
    my %search = (
        searches     => \@searches,
        authtypecode => $authtypecode,
    );
    $search{sort} = \%sort if %sort;
    my $query = $self->build_authorities_query( \%search );
    return $query;
}
|
|
|
|
=head2 _build_scan_query
|
|
|
|
my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
|
|
|
|
This will build an aggregation scan query that can be issued to elasticsearch from
|
|
the provided string input.
|
|
|
|
=cut
|
|
|
|
# Maps the short zebra-style index codes accepted by scan searches to
# the Elasticsearch field names used for the facet aggregation.
our %scan_field_convert = (
    'ti' => 'title',
    'au' => 'author',
    'su' => 'subject',
    'se' => 'title-series',
    'pb' => 'publisher',
);
|
|
|
|
sub _build_scan_query {
    my ( $self, $operands, $indexes ) = @_;

    # Default to an empty term and the 'subject' index when the caller
    # supplied nothing.
    my $term  = @$operands ? $operands->[0] : '';
    my $index = @$indexes  ? $indexes->[0]  : 'subject';

    # The index may carry a ",modifier" suffix; only the field part is used.
    my ($field) = split /,/, $index;
    $index = $scan_field_convert{$field} || $field;

    # Case-insensitive prefix filter built from the (cleaned) scan term.
    my $include = $self->_create_regex_filter( $self->_clean_search_term($term) ) . '.*';

    # Match everything; the interesting output is the terms aggregation,
    # which lists facet values starting with the term, sorted alphabetically.
    my $res = {
        query => {
            query_string => {
                query => '*'
            }
        },
        aggregations => {
            $index => {
                terms => {
                    field   => $index . '__facet',
                    order   => { '_term' => 'asc' },
                    include => $include,
                }
            }
        },
    };
    return ( $res, $term );
}
|
|
|
|
=head2 _create_regex_filter
|
|
|
|
my $filter = $builder->_create_regex_filter('term')
|
|
|
|
This will create a regex filter that can be used with an aggregation query.
|
|
|
|
=cut
|
|
|
|
sub _create_regex_filter {
    my ( $self, $term ) = @_;

    # Build a case-insensitive regex for the term: each cased character
    # becomes a two-character class like [aA]; everything else is kept
    # as its quotemeta-escaped form.
    my @pieces;
    for my $char ( split //, quotemeta($term) ) {
        my ( $lower, $upper ) = ( lc $char, uc $char );
        push @pieces, $lower ne $upper ? "[$lower$upper]" : $char;
    }
    return join '', @pieces;
}
|
|
|
|
=head2 _convert_sort_fields
|
|
|
|
my @sort_params = _convert_sort_fields(@sort_by)
|
|
|
|
Converts the zebra-style sort index information into elasticsearch-style.
|
|
|
|
C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
|
|
something that can be sent to L<build_query>.
|
|
|
|
=cut
|
|
|
|
sub _convert_sort_fields {
    my ( $self, @sort_by ) = @_;

    # Zebra sort keys mapped to index names; undef means "relevance",
    # i.e. the default order, and such entries are dropped below.
    my %field_map = (
        acqdate     => 'date-of-acquisition',
        author      => 'author',
        call_number => 'local-classification',
        popularity  => 'issues',
        relevance   => undef,    # default
        title       => 'title',
        pubdate     => 'date-of-publication',
    );
    my %order_map = (
        desc => 'desc',
        dsc  => 'desc',
        asc  => 'asc',
        az   => 'asc',
        za   => 'desc',
    );

    # Convert "field_order" strings; anything unknown is dropped.
    my @converted;
    for my $spec (@sort_by) {
        my ( $field, $dir ) = $spec =~ /(.+)_(.+)/;
        next unless defined $field && $field_map{$field};
        push @converted,
          {
            field     => $field_map{$field},
            direction => $order_map{$dir},
          };
    }
    return @converted;
}
|
|
|
|
=head2 _convert_index_fields
|
|
|
|
my @index_params = $self->_convert_index_fields(@indexes);
|
|
|
|
Converts zebra-style search index notation into elasticsearch-style.
|
|
|
|
C<@indexes> is an array of index names, as presented to L<build_query_compat>,
|
|
and it returns something that can be sent to L<build_query>.
|
|
|
|
B<TODO>: this will pull from the elasticsearch mappings table to figure out
|
|
types.
|
|
|
|
=cut
|
|
|
|
# Maps zebra-style index names (and their common aliases) to
# Elasticsearch field names. An empty string ('kw') means "no field
# prefix", i.e. keyword search across the default fields. Keys suffixed
# or prefixed with search modifiers (phr, rtrn, mc-, ...) are handled
# elsewhere; this table only resolves the field-name part.
our %index_field_convert = (
    'kw' => '',
    'ab' => 'abstract',
    'au' => 'author',
    'lcn' => 'local-classification',
    'callnum' => 'local-classification',
    'record-type' => 'rtype',
    'mc-rtype' => 'rtype',
    'mus' => 'rtype',
    'lc-card' => 'lc-card-number',
    'sn' => 'local-number',
    'biblionumber' => 'local-number',
    'yr' => 'date-of-publication',
    'pubdate' => 'date-of-publication',
    'acqdate' => 'date-of-acquisition',
    'date/time-last-modified' => 'date-time-last-modified',
    'dtlm' => 'date-time-last-modified',
    'diss' => 'dissertation-information',
    'nb' => 'isbn',
    'ns' => 'issn',
    'music-number' => 'identifier-publisher-for-music',
    'number-music-publisher' => 'identifier-publisher-for-music',
    'music' => 'identifier-publisher-for-music',
    'ident' => 'identifier-standard',
    'cpn' => 'corporate-name',
    'cfn' => 'conference-name',
    'pn' => 'personal-name',
    'pb' => 'publisher',
    'pv' => 'provider',
    'nt' => 'note',
    'notes' => 'note',
    'rcn' => 'record-control-number',
    # Subject and title variants.
    'su' => 'subject',
    'su-to' => 'subject',
    #'su-geo' => 'subject',
    'su-ut' => 'subject',
    'ti' => 'title',
    'se' => 'title-series',
    'ut' => 'title-uniform',
    'an' => 'koha-auth-number',
    'authority-number' => 'koha-auth-number',
    'at' => 'authtype',
    'he' => 'heading',
    'rank' => 'relevance',
    # Search-modifier pseudo-indexes.
    'phr' => 'st-phrase',
    'wrdl' => 'st-word-list',
    'rt' => 'right-truncation',
    'rtrn' => 'right-truncation',
    'ltrn' => 'left-truncation',
    'rltrn' => 'left-and-right',
    # Item/holdings related indexes; 'mc-' marks multiple-choice limits.
    'mc-itemtype' => 'itemtype',
    'mc-ccode' => 'ccode',
    'branch' => 'homebranch',
    'mc-loc' => 'location',
    'loc' => 'location',
    'stocknumber' => 'number-local-acquisition',
    'inv' => 'number-local-acquisition',
    'bc' => 'barcode',
    'mc-itype' => 'itype',
    'aub' => 'author-personal-bibliography',
    'auo' => 'author-in-order',
    # MARC fixed-field (008/007) based indexes.
    'ff8-22' => 'ta',
    'aud' => 'ta',
    'audience' => 'ta',
    'frequency-code' => 'ff8-18',
    'illustration-code' => 'ff8-18-21',
    'regularity-code' => 'ff8-19',
    'type-of-serial' => 'ff8-21',
    'format' => 'ff8-23',
    'conference-code' => 'ff8-29',
    'festschrift-indicator' => 'ff8-30',
    'index-indicator' => 'ff8-31',
    'fiction' => 'lf',
    'fic' => 'lf',
    'literature-code' => 'lf',
    'biography' => 'bio',
    'ff8-34' => 'bio',
    'biography-code' => 'bio',
    'l-format' => 'ff7-01-02',
    'lex' => 'lexile-number',
    'hi' => 'host-item-number',
    'itu' => 'index-term-uncontrolled',
    'itg' => 'index-term-genre',
);
|
|
# Pattern matching a single index/field name: word characters and dashes.
my $field_name_pattern = '[\w\-]+';
# Pattern matching zero or more ".subfield" suffixes after a field name.
my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
|
|
|
|
sub _convert_index_fields {
    my ( $self, @indexes ) = @_;

    my %index_type_convert =
      ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );

    # Convert according to our table, drop anything that doesn't convert.
    # If a field starts with mc- we save it as it's used (and removed) later
    # when joining things, to indicate we make it an 'OR' join.
    # (Sorry, this got a bit ugly after special cases were found.)
    # NOTE(review): entries that don't convert are returned as undef
    # rather than removed, so the result stays index-aligned with
    # @indexes; callers (build_query_compat, _convert_index_strings)
    # rely on this and handle the undef elements themselves.
    map {
        # Lower case all field names; $_ is "field" or "field,type".
        my ( $f, $t ) = map(lc, split /,/);
        my $mc = '';
        if ($f =~ /^mc-/) {
            $mc = 'mc-';
            $f =~ s/^mc-//;
        }
        my $r = {
            field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
            type => $index_type_convert{ $t // '__default' }
        };
        # Re-attach the 'mc-' marker so _join_queries knows to OR these.
        $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
        $r->{field} ? $r : undef;
    } @indexes;
}
|
|
|
|
=head2 _convert_index_strings
|
|
|
|
my @searches = $self->_convert_index_strings(@searches);
|
|
|
|
Similar to L<_convert_index_fields>, this takes strings of the form
|
|
B<field:search term> and rewrites the field from zebra-style to
|
|
elasticsearch-style. Anything it doesn't understand is returned verbatim.
|
|
|
|
=cut
|
|
|
|
sub _convert_index_strings {
    my ( $self, @searches ) = @_;

    my @converted;
    for my $search (@searches) {
        next if $search eq '';

        # Split "field:term"; anything that doesn't look like that is
        # passed through untouched.
        my ( $field, $term ) = $search =~ /^\s*([\w,-]*?):(.*)/;
        if ( !defined $field || !defined $term ) {
            push @converted, $search;
            next;
        }

        # Unknown indexes also pass through verbatim.
        my ($mapped) = $self->_convert_index_fields($field);
        if ( !defined $mapped ) {
            push @converted, $search;
            next;
        }

        # Rebuild the string with the converted field name (if any) and
        # the term decorated according to its search type.
        my $prefix = $mapped->{field} ? $mapped->{field} . ':' : '';
        push @converted,
          $prefix . $self->_modify_string_by_type( %$mapped, operand => $term );
    }
    return @converted;
}
|
|
|
|
=head2 _convert_index_strings_freeform
|
|
|
|
my $search = $self->_convert_index_strings_freeform($search);
|
|
|
|
This is similar to L<_convert_index_strings>, however it'll search out the
|
|
things to change within the string. So it can handle strings such as
|
|
C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
|
|
|
|
If there is something of the form "su,complete-subfield" or something, the
|
|
second part is stripped off as we can't yet handle that. Making it work
|
|
will have to wait for a real query parser.
|
|
|
|
=cut
|
|
|
|
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;
    # @TODO: Currently will alter also fields contained within quotes:
    # `searching for "stuff cn:123"` for example will become
    # `searching for "stuff local-number:123"
    #
    # Fixing this is tricky, one possibility:
    # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
    # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
    #
    # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
    # them back when processing is done.

    # Lower case field names (this also strips any ",modifier" suffix,
    # e.g. "su,complete-subfield:" becomes "su:").
    $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
    # Resolve possible field aliases via %index_field_convert; unknown
    # field names are left as they are.
    $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
    return $search;
}
|
|
|
|
=head2 _modify_string_by_type
|
|
|
|
my $str = $self->_modify_string_by_type(%index_field);
|
|
|
|
If you have a search term (operand) and a type (phrase, right-truncated), this
|
|
will convert the string to have the function in lucene search terms, e.g.
|
|
wrapping quotes around it.
|
|
|
|
=cut
|
|
|
|
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $type    = $idx{type} || '';
    my $operand = $idx{operand};
    return $operand unless $operand;    # Empty or undef, we can't use it.

    if ( $type eq 'right-truncate' ) {
        # Append a wildcard for right truncation.
        $operand .= '*';
    }
    elsif ( $type eq 'phrase' ) {
        # Quote the whole operand, unless it is already quoted.
        $operand = '"' . $operand . '"' unless $operand =~ /^".*"$/;
    }
    elsif ( $type eq 'st-year' ) {
        # "1990-2000" (either side optional) becomes a lucene range
        # query; a bare year is left untouched.
        if ( $operand =~ /^(.*)-(.*)$/ ) {
            my $from  = $1 || '*';
            my $until = $2 || '*';
            $operand = "[$from TO $until]";
        }
    }
    return $operand;
}
|
|
|
|
=head2 _join_queries
|
|
|
|
my $query_str = $self->_join_queries(@query_parts);
|
|
|
|
This takes a list of query parts, that might be search terms on their own, or
|
|
booleaned together, or specifying fields, or whatever, wraps them in
|
|
parentheses, and ANDs them all together. Suitable for feeding to the ES
|
|
query string query.
|
|
|
|
Note: doesn't AND them together if they specify an index that starts with "mc"
|
|
as that was a special case in the original code for dealing with multiple
|
|
choice options (you can't search for something that has an itype of A
and an itype of B otherwise.)
|
|
|
|
=cut
|
|
|
|
sub _join_queries {
    my ( $self, @parts ) = @_;

    # Separate ordinary parts from multiple-choice ('mc-' prefixed)
    # limits, stripping the marker from the latter.
    my ( @regular, @multi_choice );
    for my $part (@parts) {
        next if !defined $part || $part eq '';
        if ( $part =~ /^mc-/ ) {
            push @multi_choice, $part =~ s/^mc-//r;
        }
        else {
            push @regular, $part;
        }
    }

    my $total = @regular + @multi_choice;
    return () if $total == 0;
    # A lone part needs no parentheses or joining.
    return ( @regular, @multi_choice )[0] if $total == 1;

    # Group multiple-choice limits by field so values for the same field
    # can be OR'ed together (an item can't have two itypes at once).
    my %values_for;
    for my $limit (@multi_choice) {
        my ( $field, $value ) = split /:/, $limit, 2;
        push @{ $values_for{$field} }, $value;
    }
    my @mc_clauses = map {
        sprintf( '%s:(%s)', $_, join( ' OR ', @{ $values_for{$_} } ) )
    } sort keys %values_for;

    # Everything else is parenthesised and AND'ed together.
    return join( ' AND ', ( map { "($_)" } @regular ), @mc_clauses );
}
|
|
|
|
=head2 _make_phrases
|
|
|
|
my @phrased_queries = $self->_make_phrases(@query_parts);
|
|
|
|
This takes the supplied queries and forces them to be phrases by wrapping
|
|
quotes around them. It understands field prefixes, e.g. 'subject:' and puts
|
|
the quotes outside of them if they're there.
|
|
|
|
=cut
|
|
|
|
sub _make_phrases {
    my ( $self, @parts ) = @_;

    # Wrap each search term in double quotes, keeping a leading
    # "field:" prefix outside of the quotes. Parts the pattern does not
    # match (e.g. no colon at all) come back unchanged.
    my @phrased;
    for my $part (@parts) {
        push @phrased, $part =~ s/^\s*(\w*?:)(.*)$/$1"$2"/r;
    }
    return @phrased;
}
|
|
|
|
=head2 _create_query_string
|
|
|
|
my @query_strings = $self->_create_query_string(@queries);
|
|
|
|
Given a list of hashrefs, it will turn them into a lucene-style query string.
|
|
The hash should contain field, type (both for the indexes), operator, and
|
|
operand.
|
|
|
|
=cut
|
|
|
|
sub _create_query_string {
    my ( $self, @queries ) = @_;

    map {
        my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
        my $field = $_->{field} ? $_->{field} . ':' : '';

        my $oand = $self->_modify_string_by_type(%$_);
        # Parenthesise multi-word operands so the field prefix applies
        # to all of them; split in scalar context yields the word count.
        # Range queries (st-year) must not be wrapped or ES misparses them.
        $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
        # e.g. 'AND (title:(quick fox))'
        "$otor($field$oand)";
    } @queries;
}
|
|
|
|
=head2 _clean_search_term
|
|
|
|
my $term = $self->_clean_search_term($term);
|
|
|
|
This cleans a search term by removing any funny characters that may upset
|
|
ES and give us an error. It also calls L<_convert_index_strings_freeform>
|
|
to ensure those parts are correct.
|
|
|
|
=cut
|
|
|
|
sub _clean_search_term {
    my ( $self, $term ) = @_;

    # Lookahead for checking if we are inside quotes
    my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';

    # Some hardcoded searches (like with authorities) produce things like
    # 'an=123', when it ought to be 'an:123' for our purposes.
    $term =~ s/=/:/g;

    $term = $self->_convert_index_strings_freeform($term);
    # Zebra-style {curly} quoting becomes regular double quotes.
    $term =~ s/[{}]/"/g;

    # Remove unbalanced quotes
    # (tr/// both counts the quotes and replaces them with spaces in the
    # copy; if the count is odd we keep the quote-less copy.)
    my $unquoted = $term;
    my $count = ($unquoted =~ tr/"/ /);
    if ($count % 2 == 1) {
        $term = $unquoted;
    }

    # Remove unquoted colons that have whitespace on either side of them
    $term =~ s/(:+)(\s+)$lookahead/$2/g;
    $term =~ s/(\s+)(:+)$lookahead/$1/g;

    # Escape/unescape regex slashes per the sysprefs.
    $term = $self->_query_regex_escape_process($term);

    return $term;
}
|
|
|
|
=head2 _query_regex_escape_process
|
|
|
|
my $query = $self->_query_regex_escape_process($query);
|
|
|
|
Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
|
|
|
|
=cut
|
|
|
|
sub _query_regex_escape_process {
    my ($self, $query) = @_;
    my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
    if ($regex_escape_options ne 'dont_escape') {
        if ($regex_escape_options eq 'escape') {
            # Will escape unescaped slashes (/) while preserving
            # unescaped slashes within quotes
            # @TODO: assumes quotes are always balanced and will
            # not handle escaped quotes properly, should perhaps be
            # replaced with a more general parser solution
            # so that this function is ever only provided with unquoted
            # query parts
            $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
        }
        elsif($regex_escape_options eq 'unescape_escaped') {
            # Will unescape escaped slashes (\/) and escape
            # unescaped slashes (/) while preserving slashes within quotes
            # The same limitations as above apply for handling of quotes
            $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
        }
    }
    return $query;
}
|
|
|
|
=head2 _fix_limit_special_cases
|
|
|
|
my $limits = $self->_fix_limit_special_cases($limits);
|
|
|
|
This converts any special cases that the limit specifications have into things
|
|
that are more readily processable by the rest of the code.
|
|
|
|
The argument should be an arrayref, and it'll return an arrayref.
|
|
|
|
=cut
|
|
|
|
sub _fix_limit_special_cases {
    my ( $self, $limits ) = @_;

    my @new_lim;
    foreach my $l (@$limits) {

        # This is set up by opac-search.pl
        if ( $l =~ /^yr,st-numeric,ge=/ ) {
            # A "from X to Y" publication-year range becomes an ES range.
            my ( $start, $end ) =
              ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
            next unless defined($start) && defined($end);
            push @new_lim, "copydate:[$start TO $end]";
        }
        elsif ( $l =~ /^yr,st-numeric=/ ) {
            # A single year (or "X-Y" form) limit on the publication date.
            my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
            next unless defined($date);
            $date = $self->_modify_string_by_type( type => 'st-year', operand => $date );
            push @new_lim, "copydate:$date";
        }
        elsif ( $l =~ /^available$/ ) {
            push @new_lim, 'onloan:false';
        }
        else {
            my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
            if ( defined($field) && defined($term) ) {
                # We are quoting all the limits as phrases; stripping a
                # trailing ",phr" here prevents quoting them again later.
                # (Strip only inside this branch: running s/// on an
                # undefined $field used to warn for colon-less limits.)
                $field =~ s/,phr$//;
                push @new_lim, "$field:(\"$term\")";
            }
            else {
                push @new_lim, $l;
            }
        }
    }
    return \@new_lim;
}
|
|
|
|
=head2 _sort_field
|
|
|
|
my $field = $self->_sort_field($field);
|
|
|
|
Given a field name, this works out what the actual name of the field to sort
|
|
on should be. A '__sort' suffix is added for fields with a sort version, and
|
|
for text fields either '.phrase' (for sortable versions) or '.raw' is appended
|
|
to avoid sorting on a tokenized value.
|
|
|
|
=cut
|
|
|
|
sub _sort_field {
    my ($self, $f) = @_;

    my $mappings = $self->get_elasticsearch_mappings();
    # Is this a 'text' mapped field? Those must never be sorted directly.
    my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
    if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
        # The field has (or is assumed to have) a dedicated sort variant.
        $f .= '__sort';
    } else {
        # We need to add '.raw' to text fields without a sort field,
        # otherwise it'll sort based on the tokenised form.
        $f .= '.raw' if $textField;
    }
    return $f;
}
|
|
|
|
=head2 _truncate_terms
|
|
|
|
my $query = $self->_truncate_terms($query);
|
|
|
|
Given a string query this function appends '*' wildcard to all terms except
|
|
operands and double quoted strings.
|
|
|
|
=cut
|
|
|
|
sub _truncate_terms {
    my ( $self, $query ) = @_;

    my @result;
    for my $token ( $self->_split_query($query) ) {
        # Drop whitespace-only tokens.
        next if $token =~ /^\s*$/;

        # Leave boolean keywords and tokens already ending in a
        # non-word character (quotes, wildcards, punctuation) alone;
        # everything else gets a trailing '*' wildcard.
        my $is_keyword = grep { lc($token) eq $_ } qw/and or not/;
        if ( $is_keyword || $token =~ /\W$/ ) {
            push @result, $token;
        }
        else {
            push @result, "$token*";
        }
    }
    return join ' ', @result;
}
|
|
|
|
=head2 _split_query
|
|
|
|
my @token = $self->_split_query($query_str);
|
|
|
|
Given a string query this function splits it to tokens taking into account
|
|
any field prefixes and quoted strings.
|
|
|
|
=cut
|
|
|
|
# Split on whitespace OR on a (possibly field-prefixed) double-quoted
# string; the capturing group keeps the separators so quoted phrases
# survive as whole tokens.
my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # '"donald duck" title:"the mouse" and peter' gets split into
    # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
    my @tokens = split $tokenize_split_re, $query;

    # Filter out empty values
    @tokens = grep( /\S/, @tokens );

    return @tokens;
}
|
|
|
|
=head2 _search_fields

    my $weighted_fields = $self->_search_fields({
        is_opac => 0,
        weighted_fields => 1,
        subfield => 'raw'
    });

Generate a list of searchable fields to be used for Elasticsearch queries
applied to multiple fields.

Returns an arrayref of field names for either the OPAC or staff interface,
with possible weights and a subfield appended to each field name depending
on the options provided.

=over 4

=item C<$params>

Hashref with options. The parameter C<is_opac> indicates whether the
searchable fields for the OPAC or the staff interface should be retrieved.
If C<weighted_fields> is set, field weights will be applied to the returned
fields. C<subfield> can be used to provide a subfield that will be appended
to each field name as "C<field_name>.C<subfield>".

=back

=cut
# Build the list of searchable field names (optionally weighted and/or
# suffixed with a subfield) for the current index and interface.
# Results are cached per interface+index; see POD above for parameters.
sub _search_fields {
    my ($self, $params) = @_;
    # Default options: staff interface, no weights, no subfield.
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };
    my $cache = Koha::Caches->get_instance();
    # Cache key varies by interface and index so OPAC/staff and
    # biblios/authorities configurations do not collide.
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
    # 'unsafe' returns the cached structure without a defensive copy,
    # so it must never be mutated in place (see the map further down).
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data.
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        # Fetch only searchable, non-boolean fields mapped for this index
        # and the installation's MARC flavour.
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                # collapse: one row per field even if it has several
                # MARC mappings.
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );
        my @search_fields;
        while (my $search_field = $result->next) {
            # Each entry is [name] or [name, weight] — a weight of 0/undef
            # is dropped so the join('^') below emits no empty suffix.
            push @search_fields, [
                lc $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }
    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used)
                my ($field, $weight) = @{$_};
                ["${field}.${subfield}", $weight];
            } @{$search_fields}
        ];
    }
    if ($params->{weighted_fields}) {
        # Elasticsearch boost syntax: "field^weight" (plain "field" when
        # no weight was stored).
        return [map { join('^', @{$_}) } @{$search_fields}];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}
|
|
|
|
1;
|
|
|