From 6e1d759981b442b00306405ddb0c0c970dea9d29 Mon Sep 17 00:00:00 2001 From: Ere Maijala Date: Thu, 28 Mar 2019 13:37:13 +0200 Subject: [PATCH] Bug 22592: Add index scan emulation to Elasticsearch MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Adds support for using the "scan indexes" action in advanced search by using faceting with a prefix filter. Requires that the field be set as facetable for anything to be found. Test plan: 1. Apply patch 2. Go to advanced search and click "More options" 3. Select author as the search field, enter a last name and check "Scan indexes" 4. Perform search and observe the result list resembling scan results Signed-off-by: Michal Denar Signed-off-by: Séverine QUEUNE Signed-off-by: Katrin Fischer Signed-off-by: Martin Renvoize --- .../Elasticsearch/QueryBuilder.pm | 191 ++++++++++-------- Koha/SearchEngine/Elasticsearch/Search.pm | 80 +++++++- catalogue/search.pl | 9 +- .../prog/en/modules/catalogue/results.tt | 3 +- .../SearchEngine/Elasticsearch/QueryBuilder.t | 42 +++- .../Koha/SearchEngine/Elasticsearch/Search.t | 12 +- 6 files changed, 244 insertions(+), 93 deletions(-) diff --git a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm index 28c9297df4..2613cc3ecd 100644 --- a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm +++ b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm @@ -141,45 +141,6 @@ sub build_query { return $res; } -=head2 build_browse_query - - my $browse_query = $builder->build_browse_query($field, $query); - -This performs a "starts with" style query on a particular field. The field -to be searched must have been indexed with an appropriate mapping as a -"phrase" subfield, which pretty much everything has. - -=cut - -# XXX this isn't really a browse query like we want in the end -sub build_browse_query { - my ( $self, $field, $query ) = @_; - - my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0; - - return { query => '*' } if !defined $query; - - # TODO this should come from Koha::SearchEngine::Elasticsearch - my %field_whitelist = ( - title => 1, - author => 1, - ); - $field = 'title' if !exists $field_whitelist{$field}; - my $sort = $self->_sort_field($field); - my $res = { - query => { - match_phrase_prefix => { - "$field.phrase" => { - query => $query, - operator => 'or', - fuzziness => $fuzzy_enabled ? 'auto' : '0', - } - } - }, - sort => [ { $sort => { order => "asc" } } ], - }; -} - =head2 build_query_compat my ( @@ -205,50 +166,58 @@ sub build_query_compat { $lang, $params ) = @_; -#die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang ); - my @sort_params = $self->_convert_sort_fields(@$sort_by); - my @index_params = $self->_convert_index_fields(@$indexes); - my $limits = $self->_fix_limit_special_cases($orig_limits); - if ( $params->{suppress} ) { push @$limits, "suppress:0"; } - # Merge the indexes in with the search terms and the operands so that - # each search thing is a handy unit. - unshift @$operators, undef; # The first one can't have an op - my @search_params; - my $truncate = C4::Context->preference("QueryAutoTruncate") || 0; - my $ea = each_array( @$operands, @$operators, @index_params ); - while ( my ( $oand, $otor, $index ) = $ea->() ) { - next if ( !defined($oand) || $oand eq '' ); - $oand = $self->_clean_search_term($oand); - $oand = $self->_truncate_terms($oand) if ($truncate); - push @search_params, { - operand => $oand, # the search terms - operator => defined($otor) ? uc $otor : undef, # AND and so on - $index ? %$index : (), - }; - } + my $query; + my $query_str = ''; + my $search_param_query_str = ''; + my $limits = (); + if ( $scan ) { + ($query, $query_str) = $self->_build_scan_query( $operands, $indexes ); + $search_param_query_str = $query_str; + } else { + my @sort_params = $self->_convert_sort_fields(@$sort_by); + my @index_params = $self->_convert_index_fields(@$indexes); + my $limits = $self->_fix_limit_special_cases($orig_limits); + if ( $params->{suppress} ) { push @$limits, "suppress:0"; } + # Merge the indexes in with the search terms and the operands so that + # each search thing is a handy unit. + unshift @$operators, undef; # The first one can't have an op + my @search_params; + my $truncate = C4::Context->preference("QueryAutoTruncate") || 0; + my $ea = each_array( @$operands, @$operators, @index_params ); + while ( my ( $oand, $otor, $index ) = $ea->() ) { + next if ( !defined($oand) || $oand eq '' ); + $oand = $self->_clean_search_term($oand); + $oand = $self->_truncate_terms($oand) if ($truncate); + push @search_params, { + operand => $oand, # the search terms + operator => defined($otor) ? uc $otor : undef, # AND and so on + $index ? %$index : (), + }; + } - # We build a string query from limits and the queries. An alternative - # would be to pass them separately into build_query and let it build - # them into a structured ES query itself. Maybe later, though that'd be - # more robust. - my $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) ); - my $query_str = join( ' AND ', - $search_param_query_str || (), - $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () ); - - # If there's no query on the left, let's remove the junk left behind - $query_str =~ s/^ AND //; - my %options; - $options{sort} = \@sort_params; - $options{is_opac} = $params->{is_opac}; - $options{weighted_fields} = $params->{weighted_fields}; - $options{whole_record} = $params->{whole_record}; - my $query = $self->build_query( $query_str, %options ); + # We build a string query from limits and the queries. An alternative + # would be to pass them separately into build_query and let it build + # them into a structured ES query itself. Maybe later, though that'd be + # more robust. + $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) ); + $query_str = join( ' AND ', + $search_param_query_str || (), + $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () ); + + # If there's no query on the left, let's remove the junk left behind + $query_str =~ s/^ AND //; + my %options; + $options{sort} = \@sort_params; + $options{is_opac} = $params->{is_opac}; + $options{weighted_fields} = $params->{weighted_fields}; + $options{whole_record} = $params->{whole_record}; + $query = $self->build_query( $query_str, %options ); + } # We roughly emulate the CGI parameters of the zebra query builder my $query_cgi = ''; shift @$operators; # Shift out the one we unshifted before - $ea = each_array( @$operands, @$operators, @$indexes ); + my $ea = each_array( @$operands, @$operators, @$indexes ); while ( my ( $oand, $otor, $index ) = $ea->() ) { $query_cgi .= '&' if $query_cgi; $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand ); @@ -533,6 +502,70 @@ sub build_authorities_query_compat { return $query; } +=head2 _build_scan_query + + my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes) + +This will build an aggregation scan query that can be issued to elasticsearch from +the provided string input. + +=cut + +our %scan_field_convert = ( + 'ti' => 'title', + 'au' => 'author', + 'su' => 'subject', + 'se' => 'title-series', + 'pb' => 'publisher', +); + +sub _build_scan_query { + my ( $self, $operands, $indexes ) = @_; + + my $term = scalar( @$operands ) == 0 ? '' : $operands->[0]; + my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0]; + + my ( $f, $d ) = split( /,/, $index); + $index = $scan_field_convert{$f} || $f; + + my $res; + $res->{query} = { + query_string => { + query => '*' + } + }; + $res->{aggregations} = { + $index => { + terms => { + field => $index . '__facet', + order => { '_term' => 'asc' }, + include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*' + } + } + }; + return ($res, $term); +} + +=head2 _create_regex_filter + + my $filter = $builder->_create_regex_filter('term') + +This will create a regex filter that can be used with an aggregation query. + +=cut + +sub _create_regex_filter { + my ($self, $term) = @_; + + my $result = ''; + foreach my $c (split(//, quotemeta($term))) { + my $lc = lc($c); + my $uc = uc($c); + $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c; + } + return $result; +} + =head2 _convert_sort_fields my @sort_params = _convert_sort_fields(@sort_by) @@ -779,7 +812,7 @@ sub _modify_string_by_type { return $str unless $str; # Empty or undef, we can't use it. $str .= '*' if $type eq 'right-truncate'; - $str = '"' . $str . '"' if $type eq 'phrase'; + $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/; if ($type eq 'st-year') { if ($str =~ /^(.*)-(.*)$/) { my $from = $1 || '*'; diff --git a/Koha/SearchEngine/Elasticsearch/Search.pm b/Koha/SearchEngine/Elasticsearch/Search.pm index 705feb27fa..635d6b5687 100644 --- a/Koha/SearchEngine/Elasticsearch/Search.pm +++ b/Koha/SearchEngine/Elasticsearch/Search.pm @@ -85,7 +85,7 @@ sub search { my $params = $self->get_elasticsearch_params(); # 20 is the default number of results per page - $query->{size} = $count || 20; + $query->{size} = $count // 20; # ES doesn't want pages, it wants a record to start from. if (exists $options{offset}) { $query->{from} = $options{offset}; @@ -132,8 +132,8 @@ sub count { my ( $error, $results, $facets ) = $search->search_compat( $query, $simple_query, \@sort_by, \@servers, - $results_per_page, $offset, $branches, $query_type, - $scan + $results_per_page, $offset, undef, $item_types, + $query_type, $scan ) A search interface somewhat compatible with LgetRecords>. Anything @@ -144,10 +144,15 @@ get ignored here, along with some other things (like C<@servers>.) sub search_compat { my ( - $self, $query, $simple_query, $sort_by, - $servers, $results_per_page, $offset, $branches, - $query_type, $scan + $self, $query, $simple_query, $sort_by, + $servers, $results_per_page, $offset, $branches, + $item_types, $query_type, $scan ) = @_; + + if ( $scan ) { + return $self->_aggregation_scan( $query, $results_per_page, $offset ); + } + my %options; if ( !defined $offset or $offset < 0 ) { $offset = 0; @@ -509,4 +514,67 @@ sub _convert_facets { return \@facets; } +=head2 _aggregation_scan + + my $result = $self->_aggregration_scan($query, 10, 0); + +Perform an aggregation request for scan purposes. + +=cut + +sub _aggregation_scan { + my ($self, $query, $results_per_page, $offset) = @_; + + if (!scalar(keys %{$query->{aggregations}})) { + my %result = { + biblioserver => { + hits => 0, + RECORDS => undef + } + }; + return (undef, \%result, undef); + } + my ($field) = keys %{$query->{aggregations}}; + $query->{aggregations}{$field}{terms}{size} = 1000; + my $results = $self->search($query, 1, 0); + + # Convert each result into a MARC::Record + my (@records, $index); + # opac-search expects results to be put in the + # right place in the array, according to $offset + $index = $offset - 1; + + my $count = scalar(@{$results->{aggregations}{$field}{buckets}}); + for (my $index = $offset; $index - $offset < $results_per_page && $index < $count; $index++) { + my $bucket = $results->{aggregations}{$field}{buckets}->[$index]; + # Scan values are expressed as: + # - MARC21: 100a (count) and 245a (term) + # - UNIMARC: 200f (count) and 200a (term) + my $marc = MARC::Record->new; + $marc->encoding('UTF-8'); + if (C4::Context->preference('marcflavour') eq 'UNIMARC') { + $marc->append_fields( + MARC::Field->new((200, ' ', ' ', 'f' => $bucket->{doc_count})) + ); + $marc->append_fields( + MARC::Field->new((200, ' ', ' ', 'a' => $bucket->{key})) + ); + } else { + $marc->append_fields( + MARC::Field->new((100, ' ', ' ', 'a' => $bucket->{doc_count})) + ); + $marc->append_fields( + MARC::Field->new((245, ' ', ' ', 'a' => $bucket->{key})) + ); + } + $records[$index] = $marc->as_usmarc(); + }; + # consumers of this expect a namespaced result, we provide the default + # configuration. + my %result; + $result{biblioserver}{hits} = $count; + $result{biblioserver}{RECORDS} = \@records; + return (undef, \%result, undef); +} + 1; diff --git a/catalogue/search.pl b/catalogue/search.pl index e1e9e85a3a..89a5211b5c 100755 --- a/catalogue/search.pl +++ b/catalogue/search.pl @@ -498,9 +498,10 @@ if ($query_cgi) { my $input_value = $2; push @query_inputs, { input_name => $input_name, input_value => Encode::decode_utf8( uri_unescape( $input_value ) ) }; if ($input_name eq 'idx') { - $scan_index_to_use = $input_value; # unless $scan_index_to_use; + # The form contains multiple fields, so take the first value as the scan index + $scan_index_to_use = $input_value unless $scan_index_to_use; } - if ($input_name eq 'q') { + if (!defined $scan_search_term_to_use && $input_name eq 'q') { $scan_search_term_to_use = Encode::decode_utf8( uri_unescape( $input_value )); } } @@ -584,8 +585,8 @@ for (my $i=0;$i<@servers;$i++) { $template->param( EnableSearchHistory => 1 ); } - ## If there's just one result, redirect to the detail page - if ($total == 1) { + ## If there's just one result, redirect to the detail page unless doing an index scan + if ($total == 1 && !$scan) { my $biblionumber = $newresults[0]->{biblionumber}; my $defaultview = C4::Context->preference('IntranetBiblioDefaultView'); my $views = { C4::Search::enabled_staff_search_views }; diff --git a/koha-tmpl/intranet-tmpl/prog/en/modules/catalogue/results.tt b/koha-tmpl/intranet-tmpl/prog/en/modules/catalogue/results.tt index d52245a71d..2f8d5dbaec 100644 --- a/koha-tmpl/intranet-tmpl/prog/en/modules/catalogue/results.tt +++ b/koha-tmpl/intranet-tmpl/prog/en/modules/catalogue/results.tt @@ -317,7 +317,7 @@ [% ELSE %][% END %] [% IF ( ms_ticommaphr ) %] [% ELSE %][% END %] - [% IF ( ms_aucommaphr ) %] + [% IF ( ms_au || ms_aucommaphr ) %] [% ELSE %][% END %] [% IF ( ms_su ) %] [% ELSE %][% END %] @@ -335,6 +335,7 @@ [% ELSE %][% END %] + diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t index 3a393d1c8c..08e823578a 100644 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t @@ -47,7 +47,8 @@ $se->mock( 'get_elasticsearch_mappings', sub { type => 'text' }, subject => { - type => 'text' + type => 'text', + facet => 1 }, itemnumber => { type => 'integer' @@ -192,7 +193,7 @@ subtest 'build_authorities_query_compat() tests' => sub { }; subtest 'build_query tests' => sub { - plan tests => 40; + plan tests => 48; my $qb; @@ -439,6 +440,43 @@ subtest 'build_query tests' => sub { ); is($query_cgi, 'idx=&q=title%3A%22donald%20duck%22', 'query cgi'); is($query_desc, 'title:"donald duck"', 'query desc ok'); + + # Scan queries + ( undef, $query, $simple_query, $query_cgi, $query_desc ) = $qb->build_query_compat( undef, ['new'], ['au'], undef, undef, 1 ); + is( + $query->{query}{query_string}{query}, + '*', + "scan query is properly formed" + ); + is_deeply( + $query->{aggregations}{'author'}{'terms'}, + { + field => 'author__facet', + order => { '_term' => 'asc' }, + include => '[nN][eE][wW].*' + }, + "scan aggregation request is properly formed" + ); + is($query_cgi, 'idx=au&q=new&scan=1', 'query cgi'); + is($query_desc, 'new', 'query desc ok'); + + ( undef, $query, $simple_query, $query_cgi, $query_desc ) = $qb->build_query_compat( undef, ['new'], [], undef, undef, 1 ); + is( + $query->{query}{query_string}{query}, + '*', + "scan query is properly formed" + ); + is_deeply( + $query->{aggregations}{'subject'}{'terms'}, + { + field => 'subject__facet', + order => { '_term' => 'asc' }, + include => '[nN][eE][wW].*' + }, + "scan aggregation request is properly formed" + ); + is($query_cgi, 'idx=&q=new&scan=1', 'query cgi'); + is($query_desc, 'new', 'query desc ok'); }; diff --git a/t/db_dependent/Koha/SearchEngine/Elasticsearch/Search.t b/t/db_dependent/Koha/SearchEngine/Elasticsearch/Search.t index 8d73f14058..19c33bf6ec 100644 --- a/t/db_dependent/Koha/SearchEngine/Elasticsearch/Search.t +++ b/t/db_dependent/Koha/SearchEngine/Elasticsearch/Search.t @@ -17,7 +17,7 @@ use Modern::Perl; -use Test::More tests => 11; +use Test::More tests => 13; use t::lib::Mocks; use Koha::SearchEngine::Elasticsearch::QueryBuilder; @@ -100,6 +100,16 @@ SKIP: { ok ($results = $searcher->search_compat( $query ), 'Test search_compat' ); + my ( undef, $scan_query ) = $builder->build_query_compat( undef, ['easy'], [], undef, undef, 1 ); + ok ((undef, $results) = $searcher->search_compat( $scan_query, undef, [], [], 20, 0, undef, undef, undef, 1 ), 'Test search_compat scan query' ); + my $expected = { + biblioserver => { + hits => 0, + RECORDS => [] + } + }; + is_deeply($results, $expected, 'Scan query results ok'); + ok (($results,$count) = $searcher->search_auth_compat ( $query ), 'Test search_auth_compat' ); is ( $count = $searcher->count_auth_use($searcher,1), 0, 'Testing count_auth_use'); -- 2.39.5