1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query and
28 produce something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new();
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Class::Accessor);
43 use List::MoreUtils qw/ each_array /;
47 use Data::Dumper; # TODO remove
51 my $simple_query = $builder->build_query("hello", %options)
53 This will build a query that can be issued to elasticsearch from the provided
54 string input. This expects a lucene style search form (see
55 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
58 It'll make an attempt to respect the various query options.
60 Additional options can be provided with the C<%options> hash.
66 This should be an arrayref of hashrefs, each containing a C<field> and a
67 C<direction> (optional, defaults to C<asc>.) The results will be sorted
68 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
75 my ( $self, $query, %options ) = @_;
# Read the query-related system preferences; a false or missing preference
# becomes 0. Of these, only $fuzzy_enabled is referenced in the lines of this
# sub that are visible here.
77 my $stemming = C4::Context->preference("QueryStemming") || 0;
78 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
79 my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
80 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query falls back to the wildcard string '*'.
82 $query = '*' unless defined $query;
# Fuzziness is 'auto' when the QueryFuzzy preference is on, otherwise '0'.
88 fuzziness => $fuzzy_enabled ? 'auto' : '0',
89 default_operator => "AND",
90 default_field => "_all",
# Optional sorting: each entry of $options{sort} is a hashref from which the
# 'field' and 'direction' values are taken with a hash slice.
94 if ( $options{sort} ) {
95 foreach my $sort ( @{ $options{sort} } ) {
96 my ( $f, $d ) = @$sort{qw/ field direction /};
# Any true direction other than 'asc' or 'desc' is fatal.
# NOTE(review): a missing direction passes this check and is then sent as
# order => undef below; the POD says it defaults to 'asc' -- confirm whether
# a default is applied somewhere not visible here.
97 die "Invalid sort direction, $d"
98 if $d && ( $d ne 'asc' && $d ne 'desc' );
101 # TODO account for fields that don't have a 'phrase' type
# The sort key is the field name with ".phrase" appended.
102 push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
106 # See _convert_facets in Search.pm for how these get turned into
107 # things that Koha can use.
# Each entry requests 'terms' on the matching *__facet field.
109 author => { terms => { field => "author__facet" } },
110 subject => { terms => { field => "subject__facet" } },
111 itype => { terms => { field => "itype__facet" } },
116 =head2 build_browse_query
118 my $browse_query = $builder->build_browse_query($field, $query);
120 This performs a "starts with" style query on a particular field. The field
121 to be searched must have been indexed with an appropriate mapping as a
126 sub build_browse_query {
# Builds a "starts with" style search structure for $field using
# match_phrase_prefix, sorted ascending on "$field.phrase". Parts of the
# returned structure are not visible in this view.
127 my ( $self, $field, $query ) = @_;
# Fuzzy matching follows the QueryFuzzy system preference (0 when unset).
129 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# With no query at all, return early with a bare wildcard structure.
131 return { query => '*' } if !defined $query;
133 # TODO this should come from Koha::Elasticsearch
134 my %field_whitelist = (
# Any field that is not a key of %field_whitelist is replaced by 'title'.
138 $field = 'title' if !exists $field_whitelist{$field};
142 match_phrase_prefix => {
# 'auto' fuzziness when QueryFuzzy is enabled, otherwise '0'.
146 fuzziness => $fuzzy_enabled ? 'auto' : '0',
150 sort => [ { "$field.phrase" => { order => "asc" } } ],
154 =head2 build_query_compat
157 $error, $query, $simple_query, $query_cgi,
158 $query_desc, $limit, $limit_cgi, $limit_desc,
159 $stopwords_removed, $query_type
161 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
162 \@limits, \@sort_by, $scan, $lang );
164 This handles a search using the same api as L<C4::Search::buildQuery> does.
166 A very simple query will go in with C<$operands> set to ['query'], and
167 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
168 C<$query> set to something that can perform the search, C<$simple_query>
169 set to just the search term, C<$query_cgi> set to something that can
170 reproduce this search, and C<$query_desc> set to the same value as C<$simple_query>.
174 sub build_query_compat {
# Compatibility wrapper around build_query: converts the zebra-style
# operators/operands/indexes/limits/sort arguments into one query string,
# builds the query from it, and returns a ten-element list.
175 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
179 #die Dumper ( $self, $operators, $operands, $indexes, $limits, $sort_by, $scan, $lang );
# Translate the sort specs, index names and limits into the forms used by
# the rest of this module.
180 my @sort_params = $self->_convert_sort_fields(@$sort_by);
181 my @index_params = $self->_convert_index_fields(@$indexes);
182 my $limits = $self->_fix_limit_special_cases($orig_limits);
184 # Merge the indexes in with the search terms and the operands so that
185 # each search thing is a handy unit.
# NOTE(review): this unshift modifies the caller's @$operators in place.
186 unshift @$operators, undef; # The first one can't have an op
# Walk operands, operators and converted indexes position by position.
# NOTE(review): _convert_index_fields drops indexes it cannot convert, so
# @index_params may be shorter than @$operands and positions may no longer
# line up -- confirm.
188 my $ea = each_array( @$operands, @$operators, @index_params );
189 while ( my ( $oand, $otor, $index ) = $ea->() ) {
# Operands that are undefined or empty are skipped.
190 next if ( !defined($oand) || $oand eq '' );
# Each unit carries the cleaned operand, the upper-cased operator (if any)
# and, when an index is present for this position, that index's key/value
# pairs flattened into the same hash.
191 push @search_params, {
192 operand => $self->_clean_search_term($oand), # the search terms
193 operator => defined($otor) ? uc $otor : undef, # AND and so on
194 $index ? %$index : (),
198 # We build a string query from limits and the queries. An alternative
199 # would be to pass them separately into build_query and let it build
200 # them into a structured ES query itself. Maybe later, though that'd be
# The search terms are joined with spaces; the converted limits are combined
# by _join_queries and ANDed onto them.
202 my $query_str = join( ' AND ',
203 join( ' ', $self->_create_query_string(@search_params) ),
204 $self->_join_queries( $self->_convert_index_strings(@$limits) ) );
206 # If there's no query on the left, let's remove the junk left behind
207 $query_str =~ s/^ AND //;
# Pass the converted sort parameters through to build_query.
209 $options{sort} = \@sort_params;
210 my $query = $self->build_query( $query_str, %options );
213 # We roughly emulate the CGI parameters of the zebra query builder
# NOTE(review): "my $x = ... if COND;" is documented in perlsyn as having
# undefined behaviour; consider declaring the variable first and assigning
# conditionally.
214 my $query_cgi = 'idx=kw&q=' . uri_escape( $operands->[0] ) if @$operands;
215 my $simple_query = $operands->[0] if @$operands == 1;
216 my $query_desc = $simple_query;
# NOTE(review): with no limits this is the string 'and ' -- confirm callers
# expect that.
217 my $limit = 'and ' . join( ' and ', @$limits );
219 '&limit=' . join( '&limit=', map { uri_escape($_) } @$orig_limits );
# $limit_desc is the converted limits interpolated into a single string.
220 my $limit_desc = "@$limits";
# The first and the last two positions of the returned list are always undef
# here; per the POD for this sub they are $error, $stopwords_removed and
# $query_type.
223 undef, $query, $simple_query, $query_cgi, $query_desc,
224 $limit, $limit_cgi, $limit_desc, undef, undef
228 =head2 _convert_sort_fields
230 my @sort_params = $self->_convert_sort_fields(@sort_by);
232 Converts the zebra-style sort index information into elasticsearch-style.
234 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
235 something that can be sent to L<build_query>.
239 sub _convert_sort_fields {
# Maps sort keys of the form "<field>_<order>" onto hashrefs with 'field' and
# 'direction' keys, dropping any whose field does not convert to a true value.
240 my ( $self, @sort_by ) = @_;
242 # Turn the sorting into something we care about.
243 my %sort_field_convert = (
244 acqdate => 'acqdate',
246 call_number => 'callnum',
247 popularity => 'issues',
248 relevance => undef, # default
250 pubdate => 'pubdate',
# Order suffixes and the direction each one translates to.
252 my %sort_order_convert =
253 ( qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
# 'relevance' maps to undef above, so the grep below removes it along with
# unknown fields.
# NOTE(review): split /_/ has no limit, so a key such as "call_number_asc"
# splits into three pieces and the field becomes just "call"; the
# call_number entry above therefore looks unreachable -- confirm.
255 # Convert the fields and orders, drop anything we don't know about.
256 grep { $_->{field} } map {
257 my ( $f, $d ) = split /_/;
259 field => $sort_field_convert{$f},
260 direction => $sort_order_convert{$d}
265 =head2 _convert_index_fields
267 my @index_params = $self->_convert_index_fields(@indexes);
269 Converts zebra-style search index notation into elasticsearch-style.
271 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
272 and it returns something that can be sent to L<build_query>.
274 B<TODO>: this will pull from the elasticsearch mappings table to figure out
279 our %index_field_convert = (
# Package-level lookup table: keys are the zebra-style index names, values
# are the field names they are rewritten to. Read by _convert_index_fields
# and _convert_index_strings_freeform. Only some entries are visible here.
285 'se' => 'title-series',
286 'callnum' => 'callnum',
287 'mc-itype' => 'itype',
289 'branch' => 'homebranch',
295 sub _convert_index_fields {
# Each index is "<name>" or "<name>,<type>"; the result is a list of hashrefs
# with 'field' and 'type' keys. Indexes whose name is not found in
# %index_field_convert are dropped, so the result can be shorter than the
# input.
296 my ( $self, @indexes ) = @_;
# Recognised type suffixes. A missing suffix uses the __default entry, and an
# unrecognised suffix finds no entry; both give an undef type.
298 my %index_type_convert =
299 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
301 # Convert according to our table, drop anything that doesn't convert
302 grep { $_->{field} } map {
303 my ( $f, $t ) = split /,/;
305 field => $index_field_convert{$f},
306 type => $index_type_convert{ $t // '__default' }
311 =head2 _convert_index_strings
313 my @searches = $self->_convert_index_strings(@searches);
315 Similar to L<_convert_index_fields>, this takes strings of the form
316 B<field:search term> and rewrites the field from zebra-style to
317 elasticsearch-style. Anything it doesn't understand is returned verbatim.
321 sub _convert_index_strings {
# Rewrites each "field:term" string so that the field is converted through
# _convert_index_fields and the term is adjusted for the index type.
322 my ( $self, @searches ) = @_;
325 foreach my $s (@searches) {
# Split at the first colon; the field part may contain only word characters,
# commas and hyphens, after optional leading whitespace.
327 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
# No "field:" prefix was found. The body of this branch is not visible here;
# the POD says such strings are returned verbatim.
328 unless ( defined($field) && defined($term) ) {
# Convert the field (with any ",type" suffix); only the first result is kept.
332 my ($conv) = $self->_convert_index_fields($field);
# The field did not convert. The body of this branch is not visible here.
333 unless ( defined($conv) ) {
# Emit "<converted field>:<term>", with the term modified by type.
337 push @res, $conv->{field} . ":"
338 . $self->_modify_string_by_type( %$conv, operand => $term );
343 =head2 _modify_string_by_type
345 my $str = $self->_modify_string_by_type(%index_field);
347 If you have a search term (operand) and a type (phrase, right-truncate), this
348 will rewrite the term in lucene query syntax so that it behaves as that type,
349 e.g. by wrapping quotes around it for a phrase.
353 sub _modify_string_by_type {
# Takes the index description as a flat hash; only the 'type' and 'operand'
# keys are read. A missing or false type is treated as the empty string.
354 my ( $self, %idx ) = @_;
356 my $type = $idx{type} || '';
357 my $str = $idx{operand};
# NOTE(review): this is a truthiness test, so the operand "0" is also
# returned untouched -- confirm that is intended.
358 return $str unless $str; # Empty or undef, we can't use it.
# 'right-truncate' appends a '*'; 'phrase' wraps the term in double quotes.
360 $str .= '*' if $type eq 'right-truncate';
361 $str = '"' . $str . '"' if $type eq 'phrase';
365 =head2 _convert_index_strings_freeform
367 my $search = $self->_convert_index_strings_freeform($search);
369 This is similar to L<_convert_index_strings>, however it'll search out the
370 things to change within the string. So it can handle strings such as
371 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
375 sub _convert_index_strings_freeform {
# For every entry of %index_field_convert, replaces each "<key>:" found at a
# word boundary anywhere in $search with "<value>:".
376 my ( $self, $search ) = @_;
# NOTE(review): $zeb is interpolated into the pattern without quoting, and
# replacements are applied in each()'s hash order -- confirm the table's keys
# contain no regex metacharacters and that the order does not matter.
378 while ( my ( $zeb, $es ) = each %index_field_convert ) {
379 $search =~ s/\b$zeb:/$es:/g;
386 my $query_str = $self->_join_queries(@query_parts);
388 This takes a list of query parts, that might be search terms on their own, or
389 booleaned together, or specifying fields, or whatever, wraps them in
390 parentheses, and ANDs them all together. Suitable for feeding to the ES
396 my ( $self, @parts ) = @_;
# Undefined and empty parts are discarded first.
398 @parts = grep { defined($_) && $_ ne '' } @parts;
# Nothing left: return an empty list. Exactly one part: return it as-is,
# without parentheses.
399 return () unless @parts;
400 return $parts[0] if @parts < 2;
# Otherwise wrap each part in parentheses and join them with ' AND '.
401 join ' AND ', map { "($_)" } @parts;
406 my @phrased_queries = $self->_make_phrases(@query_parts);
408 This takes the supplied queries and forces them to be phrases by wrapping
409 quotes around them. It understands field prefixes, e.g. 'subject:', and puts
410 the quotes after the prefix, around the search term only.
415 my ( $self, @parts ) = @_;
# The /r flag makes each substitution return a modified copy, so @parts
# itself is left untouched.
# NOTE(review): the pattern requires a "field:" prefix; a part without a
# colon does not match and is returned unquoted, which differs from the POD's
# "forces them to be phrases" -- confirm intent.
416 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
419 =head2 _create_query_string
421 my @query_strings = $self->_create_query_string(@queries);
423 Given a list of hashrefs, it will turn them into a lucene-style query string.
424 The hash should contain field, type (both for the indexes), operator, and
429 sub _create_query_string {
# Turns each query hashref into a string of the form
# "OPERATOR (field:operand)".
430 my ( $self, @queries ) = @_;
# The operator (followed by a space) and the field (followed by a colon) are
# each replaced by an empty string when absent or false.
433 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
434 my $field = $_->{field} ? $_->{field} . ':' : '';
# The operand is adjusted for the index type before being embedded.
436 my $oand = $self->_modify_string_by_type(%$_);
437 "$otor($field$oand)";
441 =head2 _clean_search_term
443 my $term = $self->_clean_search_term($term);
445 This cleans a search term by removing any funny characters that may upset
446 ES and give us an error. It also calls L<_convert_index_strings_freeform>
447 to ensure those parts are correct.
451 sub _clean_search_term {
452 my ( $self, $term ) = @_;
# Rewrite any known "index:" prefixes inside the term to their converted
# field names. Any further cleaning is not visible in this view.
454 $term = $self->_convert_index_strings_freeform($term);
459 =head2 _fix_limit_special_cases
461 my $limits = $self->_fix_limit_special_cases($limits);
463 This converts any special cases that the limit specifications have into things
464 that are more readily processable by the rest of the code.
466 The argument should be an arrayref, and it'll return an arrayref.
470 sub _fix_limit_special_cases {
471 my ( $self, $limits ) = @_;
474 foreach my $l (@$limits) {
476 # This is set up by opac-search.pl
477 if ( $l =~ /^yr,st-numeric,ge=/ ) {
478 my ( $start, $end ) =
479 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
480 next unless defined($start) && defined($end);
481 push @new_lim, "copydate:[$start TO $end]";
483 elsif ( $l =~ /^yr,st-numeric=/ ) {
484 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
485 next unless defined($date);
486 push @new_lim, "copydate:$date";
488 elsif ( $l =~ /^available$/ ) {
489 push @new_lim, 'onloan:false';