Browse Source

Bug 22592: Add index scan emulation to Elasticsearch

Adds support for using the "scan indexes" action in advanced search by using faceting with a prefix filter. Requires that the field be set as facetable for anything to be found.

Test plan:
1. Apply patch
2. Go to advanced search and click "More options"
3. Select author as the search field, enter a last name and check "Scan indexes"
4. Perform search and observe the result list resembling scan results

Signed-off-by: Michal Denar <black23@gmail.com>
Signed-off-by: Séverine QUEUNE <severine.queune@bulac.fr>
Signed-off-by: Katrin Fischer <katrin.fischer.83@web.de>
Signed-off-by: Martin Renvoize <martin.renvoize@ptfs-europe.com>
remotes/origin/19.11.x
Ere Maijala 3 years ago
committed by Martin Renvoize
parent
commit
6e1d759981
Signed by: martin.renvoize GPG Key ID: 422B469130441A0F
  1. 191
      Koha/SearchEngine/Elasticsearch/QueryBuilder.pm
  2. 80
      Koha/SearchEngine/Elasticsearch/Search.pm
  3. 9
      catalogue/search.pl
  4. 3
      koha-tmpl/intranet-tmpl/prog/en/modules/catalogue/results.tt
  5. 42
      t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t
  6. 12
      t/db_dependent/Koha/SearchEngine/Elasticsearch/Search.t

191
Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

@ -141,45 +141,6 @@ sub build_query {
return $res;
}
=head2 build_browse_query
my $browse_query = $builder->build_browse_query($field, $query);
This performs a "starts with" style query on a particular field. The field
to be searched must have been indexed with an appropriate mapping as a
"phrase" subfield, which pretty much everything has.
=cut
# XXX this isn't really a browse query like we want in the end
sub build_browse_query {
my ( $self, $field, $query ) = @_;
my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
return { query => '*' } if !defined $query;
# TODO this should come from Koha::SearchEngine::Elasticsearch
my %field_whitelist = (
title => 1,
author => 1,
);
$field = 'title' if !exists $field_whitelist{$field};
my $sort = $self->_sort_field($field);
my $res = {
query => {
match_phrase_prefix => {
"$field.phrase" => {
query => $query,
operator => 'or',
fuzziness => $fuzzy_enabled ? 'auto' : '0',
}
}
},
sort => [ { $sort => { order => "asc" } } ],
};
}
=head2 build_query_compat
my (
@ -205,50 +166,58 @@ sub build_query_compat {
$lang, $params )
= @_;
#die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
my @sort_params = $self->_convert_sort_fields(@$sort_by);
my @index_params = $self->_convert_index_fields(@$indexes);
my $limits = $self->_fix_limit_special_cases($orig_limits);
if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
# Merge the indexes in with the search terms and the operands so that
# each search thing is a handy unit.
unshift @$operators, undef; # The first one can't have an op
my @search_params;
my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
my $ea = each_array( @$operands, @$operators, @index_params );
while ( my ( $oand, $otor, $index ) = $ea->() ) {
next if ( !defined($oand) || $oand eq '' );
$oand = $self->_clean_search_term($oand);
$oand = $self->_truncate_terms($oand) if ($truncate);
push @search_params, {
operand => $oand, # the search terms
operator => defined($otor) ? uc $otor : undef, # AND and so on
$index ? %$index : (),
};
}
my $query;
my $query_str = '';
my $search_param_query_str = '';
my $limits = ();
if ( $scan ) {
($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
$search_param_query_str = $query_str;
} else {
my @sort_params = $self->_convert_sort_fields(@$sort_by);
my @index_params = $self->_convert_index_fields(@$indexes);
my $limits = $self->_fix_limit_special_cases($orig_limits);
if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
# Merge the indexes in with the search terms and the operands so that
# each search thing is a handy unit.
unshift @$operators, undef; # The first one can't have an op
my @search_params;
my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
my $ea = each_array( @$operands, @$operators, @index_params );
while ( my ( $oand, $otor, $index ) = $ea->() ) {
next if ( !defined($oand) || $oand eq '' );
$oand = $self->_clean_search_term($oand);
$oand = $self->_truncate_terms($oand) if ($truncate);
push @search_params, {
operand => $oand, # the search terms
operator => defined($otor) ? uc $otor : undef, # AND and so on
$index ? %$index : (),
};
}
# We build a string query from limits and the queries. An alternative
# would be to pass them separately into build_query and let it build
# them into a structured ES query itself. Maybe later, though that'd be
# more robust.
my $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
my $query_str = join( ' AND ',
$search_param_query_str || (),
$self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
# If there's no query on the left, let's remove the junk left behind
$query_str =~ s/^ AND //;
my %options;
$options{sort} = \@sort_params;
$options{is_opac} = $params->{is_opac};
$options{weighted_fields} = $params->{weighted_fields};
$options{whole_record} = $params->{whole_record};
my $query = $self->build_query( $query_str, %options );
# We build a string query from limits and the queries. An alternative
# would be to pass them separately into build_query and let it build
# them into a structured ES query itself. Maybe later, though that'd be
# more robust.
$search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
$query_str = join( ' AND ',
$search_param_query_str || (),
$self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
# If there's no query on the left, let's remove the junk left behind
$query_str =~ s/^ AND //;
my %options;
$options{sort} = \@sort_params;
$options{is_opac} = $params->{is_opac};
$options{weighted_fields} = $params->{weighted_fields};
$options{whole_record} = $params->{whole_record};
$query = $self->build_query( $query_str, %options );
}
# We roughly emulate the CGI parameters of the zebra query builder
my $query_cgi = '';
shift @$operators; # Shift out the one we unshifted before
$ea = each_array( @$operands, @$operators, @$indexes );
my $ea = each_array( @$operands, @$operators, @$indexes );
while ( my ( $oand, $otor, $index ) = $ea->() ) {
$query_cgi .= '&' if $query_cgi;
$query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
@ -533,6 +502,70 @@ sub build_authorities_query_compat {
return $query;
}
=head2 _build_scan_query
my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
This will build an aggregation scan query that can be issued to elasticsearch from
the provided string input.
=cut
our %scan_field_convert = (
'ti' => 'title',
'au' => 'author',
'su' => 'subject',
'se' => 'title-series',
'pb' => 'publisher',
);
sub _build_scan_query {
my ( $self, $operands, $indexes ) = @_;
my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
my ( $f, $d ) = split( /,/, $index);
$index = $scan_field_convert{$f} || $f;
my $res;
$res->{query} = {
query_string => {
query => '*'
}
};
$res->{aggregations} = {
$index => {
terms => {
field => $index . '__facet',
order => { '_term' => 'asc' },
include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
}
}
};
return ($res, $term);
}
=head2 _create_regex_filter
my $filter = $builder->_create_regex_filter('term')
This will create a regex filter that can be used with an aggregation query.
=cut
sub _create_regex_filter {
my ($self, $term) = @_;
my $result = '';
foreach my $c (split(//, quotemeta($term))) {
my $lc = lc($c);
my $uc = uc($c);
$result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
}
return $result;
}
=head2 _convert_sort_fields
my @sort_params = _convert_sort_fields(@sort_by)
@ -779,7 +812,7 @@ sub _modify_string_by_type {
return $str unless $str; # Empty or undef, we can't use it.
$str .= '*' if $type eq 'right-truncate';
$str = '"' . $str . '"' if $type eq 'phrase';
$str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
if ($type eq 'st-year') {
if ($str =~ /^(.*)-(.*)$/) {
my $from = $1 || '*';

80
Koha/SearchEngine/Elasticsearch/Search.pm

@ -85,7 +85,7 @@ sub search {
my $params = $self->get_elasticsearch_params();
# 20 is the default number of results per page
$query->{size} = $count || 20;
$query->{size} = $count // 20;
# ES doesn't want pages, it wants a record to start from.
if (exists $options{offset}) {
$query->{from} = $options{offset};
@ -132,8 +132,8 @@ sub count {
my ( $error, $results, $facets ) = $search->search_compat(
$query, $simple_query, \@sort_by, \@servers,
$results_per_page, $offset, $branches, $query_type,
$scan
$results_per_page, $offset, undef, $item_types,
$query_type, $scan
)
A search interface somewhat compatible with L<C4::Search->getRecords>. Anything
@ -144,10 +144,15 @@ get ignored here, along with some other things (like C<@servers>.)
sub search_compat {
my (
$self, $query, $simple_query, $sort_by,
$servers, $results_per_page, $offset, $branches,
$query_type, $scan
$self, $query, $simple_query, $sort_by,
$servers, $results_per_page, $offset, $branches,
$item_types, $query_type, $scan
) = @_;
if ( $scan ) {
return $self->_aggregation_scan( $query, $results_per_page, $offset );
}
my %options;
if ( !defined $offset or $offset < 0 ) {
$offset = 0;
@ -509,4 +514,67 @@ sub _convert_facets {
return \@facets;
}
=head2 _aggregation_scan
my $result = $self->_aggregration_scan($query, 10, 0);
Perform an aggregation request for scan purposes.
=cut
sub _aggregation_scan {
my ($self, $query, $results_per_page, $offset) = @_;
if (!scalar(keys %{$query->{aggregations}})) {
my %result = {
biblioserver => {
hits => 0,
RECORDS => undef
}
};
return (undef, \%result, undef);
}
my ($field) = keys %{$query->{aggregations}};
$query->{aggregations}{$field}{terms}{size} = 1000;
my $results = $self->search($query, 1, 0);
# Convert each result into a MARC::Record
my (@records, $index);
# opac-search expects results to be put in the
# right place in the array, according to $offset
$index = $offset - 1;
my $count = scalar(@{$results->{aggregations}{$field}{buckets}});
for (my $index = $offset; $index - $offset < $results_per_page && $index < $count; $index++) {
my $bucket = $results->{aggregations}{$field}{buckets}->[$index];
# Scan values are expressed as:
# - MARC21: 100a (count) and 245a (term)
# - UNIMARC: 200f (count) and 200a (term)
my $marc = MARC::Record->new;
$marc->encoding('UTF-8');
if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
$marc->append_fields(
MARC::Field->new((200, ' ', ' ', 'f' => $bucket->{doc_count}))
);
$marc->append_fields(
MARC::Field->new((200, ' ', ' ', 'a' => $bucket->{key}))
);
} else {
$marc->append_fields(
MARC::Field->new((100, ' ', ' ', 'a' => $bucket->{doc_count}))
);
$marc->append_fields(
MARC::Field->new((245, ' ', ' ', 'a' => $bucket->{key}))
);
}
$records[$index] = $marc->as_usmarc();
};
# consumers of this expect a namespaced result, we provide the default
# configuration.
my %result;
$result{biblioserver}{hits} = $count;
$result{biblioserver}{RECORDS} = \@records;
return (undef, \%result, undef);
}
1;

9
catalogue/search.pl

@ -498,9 +498,10 @@ if ($query_cgi) {
my $input_value = $2;
push @query_inputs, { input_name => $input_name, input_value => Encode::decode_utf8( uri_unescape( $input_value ) ) };
if ($input_name eq 'idx') {
$scan_index_to_use = $input_value; # unless $scan_index_to_use;
# The form contains multiple fields, so take the first value as the scan index
$scan_index_to_use = $input_value unless $scan_index_to_use;
}
if ($input_name eq 'q') {
if (!defined $scan_search_term_to_use && $input_name eq 'q') {
$scan_search_term_to_use = Encode::decode_utf8( uri_unescape( $input_value ));
}
}
@ -584,8 +585,8 @@ for (my $i=0;$i<@servers;$i++) {
$template->param( EnableSearchHistory => 1 );
}
## If there's just one result, redirect to the detail page
if ($total == 1) {
## If there's just one result, redirect to the detail page unless doing an index scan
if ($total == 1 && !$scan) {
my $biblionumber = $newresults[0]->{biblionumber};
my $defaultview = C4::Context->preference('IntranetBiblioDefaultView');
my $views = { C4::Search::enabled_staff_search_views };

3
koha-tmpl/intranet-tmpl/prog/en/modules/catalogue/results.tt

@ -317,7 +317,7 @@
[% ELSE %]<option value="ti">Title</option>[% END %]
[% IF ( ms_ticommaphr ) %]<option selected="selected" value="ti,phr">Title phrase</option>
[% ELSE %]<option value="ti,phr">Title phrase</option>[% END %]
[% IF ( ms_aucommaphr ) %]<option selected="selected" value="au,phr">Author</option>
[% IF ( ms_au || ms_aucommaphr ) %]<option selected="selected" value="au,phr">Author</option>
[% ELSE %]<option value="au,phr">Author</option>[% END %]
[% IF ( ms_su ) %]<option selected="selected" value="su">Subject</option>
[% ELSE %]<option value="su">Subject</option>[% END %]
@ -335,6 +335,7 @@
[% ELSE %]<option value="ss">ISSN</option>[% END %]
</select>
<input type="hidden" name="scan" value="1" />
<input class="submit" type="submit" value="Submit" />
</td>
</tr>
</table>

42
t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t

@ -47,7 +47,8 @@ $se->mock( 'get_elasticsearch_mappings', sub {
type => 'text'
},
subject => {
type => 'text'
type => 'text',
facet => 1
},
itemnumber => {
type => 'integer'
@ -192,7 +193,7 @@ subtest 'build_authorities_query_compat() tests' => sub {
};
subtest 'build_query tests' => sub {
plan tests => 40;
plan tests => 48;
my $qb;
@ -439,6 +440,43 @@ subtest 'build_query tests' => sub {
);
is($query_cgi, 'idx=&q=title%3A%22donald%20duck%22', 'query cgi');
is($query_desc, 'title:"donald duck"', 'query desc ok');
# Scan queries
( undef, $query, $simple_query, $query_cgi, $query_desc ) = $qb->build_query_compat( undef, ['new'], ['au'], undef, undef, 1 );
is(
$query->{query}{query_string}{query},
'*',
"scan query is properly formed"
);
is_deeply(
$query->{aggregations}{'author'}{'terms'},
{
field => 'author__facet',
order => { '_term' => 'asc' },
include => '[nN][eE][wW].*'
},
"scan aggregation request is properly formed"
);
is($query_cgi, 'idx=au&q=new&scan=1', 'query cgi');
is($query_desc, 'new', 'query desc ok');
( undef, $query, $simple_query, $query_cgi, $query_desc ) = $qb->build_query_compat( undef, ['new'], [], undef, undef, 1 );
is(
$query->{query}{query_string}{query},
'*',
"scan query is properly formed"
);
is_deeply(
$query->{aggregations}{'subject'}{'terms'},
{
field => 'subject__facet',
order => { '_term' => 'asc' },
include => '[nN][eE][wW].*'
},
"scan aggregation request is properly formed"
);
is($query_cgi, 'idx=&q=new&scan=1', 'query cgi');
is($query_desc, 'new', 'query desc ok');
};

12
t/db_dependent/Koha/SearchEngine/Elasticsearch/Search.t

@ -17,7 +17,7 @@
use Modern::Perl;
use Test::More tests => 11;
use Test::More tests => 13;
use t::lib::Mocks;
use Koha::SearchEngine::Elasticsearch::QueryBuilder;
@ -100,6 +100,16 @@ SKIP: {
ok ($results = $searcher->search_compat( $query ), 'Test search_compat' );
my ( undef, $scan_query ) = $builder->build_query_compat( undef, ['easy'], [], undef, undef, 1 );
ok ((undef, $results) = $searcher->search_compat( $scan_query, undef, [], [], 20, 0, undef, undef, undef, 1 ), 'Test search_compat scan query' );
my $expected = {
biblioserver => {
hits => 0,
RECORDS => []
}
};
is_deeply($results, $expected, 'Scan query results ok');
ok (($results,$count) = $searcher->search_auth_compat ( $query ), 'Test search_auth_compat' );
is ( $count = $searcher->count_auth_use($searcher,1), 0, 'Testing count_auth_use');

Loading…
Cancel
Save