1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::ElasticSearch);
45 use List::MoreUtils qw/ each_array /;
49 use Data::Dumper; # TODO remove
53 my $simple_query = $builder->build_query("hello", %options)
55 This will build a query that can be issued to elasticsearch from the provided
56 string input. This expects a lucene style search form (see
57 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
60 It'll make an attempt to respect the various query options.
62 Additional options can be provided with the C<%options> hash.
68 This should be an arrayref of hashrefs, each containing a C<field> and an
69 C<direction> (optional, defaults to C<asc>.) The results will be sorted
70 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Body of build_query (the "sub" line is not visible in this excerpt).
# Arguments: the invocant, a lucene-style query string, and a hash of options.
77 my ( $self, $query, %options ) = @_;
# Read the query-related system preferences; each defaults to 0 when false/unset.
# NOTE(review): in the lines visible here only $fuzzy_enabled is used - confirm
# whether $stemming, $auto_truncation and $weight_fields are needed.
79 my $stemming = C4::Context->preference("QueryStemming") || 0;
80 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
81 my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
82 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query falls back to the wildcard '*'.
84 $query = '*' unless defined $query;
# Query settings: fuzziness follows the QueryFuzzy preference, terms are ANDed
# by default, the default field is '_all', and lenient is switched on.
90 fuzziness => $fuzzy_enabled ? 'auto' : '0',
91 default_operator => 'AND',
92 default_field => '_all',
93 lenient => JSON::true,
# Optional sorting: each entry of $options{sort} is a hashref with a 'field'
# and an optional 'direction'.
97 if ( $options{sort} ) {
98 foreach my $sort ( @{ $options{sort} } ) {
99 my ( $f, $d ) = @$sort{qw/ field direction /};
# Dies on any true direction other than 'asc' or 'desc'; a false or missing
# direction becomes 'asc'.
100 die "Invalid sort direction, $d"
101 if $d && ( $d ne 'asc' && $d ne 'desc' );
102 $d = 'asc' unless $d;
104 # TODO account for fields that don't have a 'phrase' type
# Resolve the sortable field name, then sort on its ".phrase" subfield.
106 $f = $self->_sort_field($f);
107 push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
111 # See _convert_facets in Search.pm for how these get turned into
112 # things that Koha can use.
# One 'terms' entry each for the author, subject and itype "__facet" fields.
114 author => { terms => { field => "author__facet" } },
115 subject => { terms => { field => "subject__facet" } },
116 itype => { terms => { field => "itype__facet" } },
121 =head2 build_browse_query
123 my $browse_query = $builder->build_browse_query($field, $query);
125 This performs a "starts with" style query on a particular field. The field
126 to be searched must have been indexed with an appropriate mapping as a
127 "phrase" subfield, which pretty much everything has.
130 # XXX this isn't really a browse query like we want in the end
# Builds a "starts with" style (match_phrase_prefix) query on a single field.
#   $field - field to search; any name not in %field_whitelist becomes 'title'
#   $query - the search text; when undefined, { query => '*' } is returned
131 sub build_browse_query {
132 my ( $self, $field, $query ) = @_;
# Fuzziness below follows the QueryFuzzy system preference (0 when unset).
134 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
136 return { query => '*' } if !defined $query;
138 # TODO this should come from Koha::Elasticsearch
139 my %field_whitelist = (
# Fall back to 'title' for fields that are not whitelisted, then resolve the
# name to sort on.
143 $field = 'title' if !exists $field_whitelist{$field};
144 my $sort = $self->_sort_field($field);
147 match_phrase_prefix => {
151 fuzziness => $fuzzy_enabled ? 'auto' : '0',
# Results are sorted ascending on the ".phrase" subfield of the sort field.
155 sort => [ { "$sort.phrase" => { order => "asc" } } ],
159 =head2 build_query_compat
162 $error, $query, $simple_query, $query_cgi,
163 $query_desc, $limit, $limit_cgi, $limit_desc,
164 $stopwords_removed, $query_type
166 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
167 \@limits, \@sort_by, $scan, $lang );
169 This handles a search using the same api as L<C4::Search::buildQuery> does.
171 A very simple query will go in with C<$operands> set to ['query'], and
172 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
173 C<$query> set to something that can perform the search, C<$simple_query>
174 set to just the search term, C<$query_cgi> set to something that can
175 reproduce this search, and C<$query_desc> set to something else.
# Compatibility entry point: takes zebra-style operators, operands, indexes,
# limits and sort specifications (all arrayrefs), converts them, and passes a
# single query string to build_query.
# Returns a list whose visible slots are: undef, the built query, the simple
# query, the CGI form of the query, the query description, the limit, its CGI
# form and its description, followed by two undefs.
179 sub build_query_compat {
180 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
184 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
# Translate the zebra-style sort, index and limit specifications.
185 my @sort_params = $self->_convert_sort_fields(@$sort_by);
186 my @index_params = $self->_convert_index_fields(@$indexes);
187 my $limits = $self->_fix_limit_special_cases($orig_limits);
189 # Merge the indexes in with the search terms and the operands so that
190 # each search thing is a handy unit.
# NOTE(review): this unshift modifies the caller's @$operators array in place.
191 unshift @$operators, undef; # The first one can't have an op
# Walk operands, operators and converted indexes in parallel; undefined or
# empty operands are skipped.
193 my $ea = each_array( @$operands, @$operators, @index_params );
194 while ( my ( $oand, $otor, $index ) = $ea->() ) {
195 next if ( !defined($oand) || $oand eq '' );
196 push @search_params, {
197 operand => $self->_clean_search_term($oand), # the search terms
198 operator => defined($otor) ? uc $otor : undef, # AND and so on
# Merge in the converted index's own keys when one exists for this position.
199 $index ? %$index : (),
203 # We build a string query from limits and the queries. An alternative
204 # would be to pass them separately into build_query and let it build
205 # them into a structured ES query itself. Maybe later, though that'd be
# "|| ()" drops a side that came out empty, so join only sees non-empty parts.
207 my $query_str = join( ' AND ',
208 join( ' ', $self->_create_query_string(@search_params) ) || (),
209 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
211 # If there's no query on the left, let's remove the junk left behind
212 $query_str =~ s/^ AND //;
214 $options{sort} = \@sort_params;
215 my $query = $self->build_query( $query_str, %options );
218 # We roughly emulate the CGI parameters of the zebra query builder
# NOTE(review): "my $x = ... if COND;" (used below for $query_cgi,
# $simple_query and $limit_desc) has undefined behaviour according to perlsyn;
# consider declaring first and assigning conditionally.
# Only the first operand is reflected in $query_cgi.
219 my $query_cgi = 'idx=kw&q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
220 my $simple_query = $operands->[0] if @$operands == 1;
221 my $query_desc = $simple_query;
222 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
# The CGI form repeats "&limit=" before each URI-escaped original limit.
224 '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits );
225 my $limit_desc = "$limit" if $limit;
227 undef, $query, $simple_query, $query_cgi, $query_desc,
228 $limit, $limit_cgi, $limit_desc, undef, undef
232 =head2 build_authorities_query
234 my $query = $builder->build_authorities_query(\%search);
236 This takes a nice description of an authority search and turns it into a black-box
237 query that can then be passed to the appropriate searcher.
239 The search description is a hashref that looks something like:
244 where => 'Heading', # search the main entry
245 operator => 'exact', # require an exact match
246 value => 'frogs', # the search string
249 where => '', # search all entries
250 operator => '', # default keyword, right truncation
258 authtypecode => 'TOPIC_TERM',
# Turns an authority search description (hashref) into a query structure.
# $search->{searches} is an arrayref of hashrefs with 'where', 'operator' and
# 'value' keys; $search->{sort}, when present, maps field names to an order.
# NOTE(review): $search->{sort} is overwritten with the ".phrase" version, so
# the caller's hashref is modified.
263 sub build_authorities_query {
264 my ( $self, $search ) = @_;
266 # Start by making the query parts
269 foreach my $s ( @{ $search->{searches} } ) {
270 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
# An empty 'where' means search the '_all' field.
# NOTE(review): an undefined 'where' or 'operator' would presumably trigger
# "uninitialized" warnings in the eq comparisons - confirm callers always set them.
271 $wh = '_all' if $wh eq '';
272 if ( $op eq 'is' || $op eq '=' ) {
274 # look for something that matches completely
275 # note, '=' is about numerical vals. May need special handling.
276 # _allphrase is a special field that only groups the exact
277 # matches. Also, we lowercase our search because the ES
278 # index lowercases its values, and term searches don't get the
279 # search analyzer applied to them.
280 push @filter_parts, { term => { "$wh.phrase" => lc $val } };
282 elsif ( $op eq 'exact' ) {
284 # left and right truncation, otherwise an exact phrase
285 push @query_parts, { match_phrase => { $wh => $val } };
287 elsif ( $op eq 'start' ) {
# "Starts with": a lowercased wildcard match on the ".phrase" subfield.
290 push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } };
293 # regular wordlist stuff
294 push @query_parts, { match => { $wh => $val } };
298 # Merge the query and filter parts appropriately
299 # 'should' behaves like 'or', if we want 'and', use 'must'
300 my $query_part = { bool => { should => \@query_parts } };
301 my $filter_part = { bool => { should => \@filter_parts } };
303 # We need to add '.phrase' to all the sort headings otherwise it'll sort
304 # based on the tokenised form.
305 if ( exists $search->{sort} ) {
307 foreach my $k ( keys %{ $search->{sort} } ) {
308 my $f = $self->_sort_field($k);
309 $s{"$f.phrase"} = $search->{sort}{$k};
311 $search->{sort} = \%s;
314 # extract the sort stuff
# NOTE(review): "my %sort = ... if COND;" has undefined behaviour according
# to perlsyn; consider declaring %sort first and filling it conditionally.
315 my %sort = ( sort => [ $search->{sort} ] ) if exists $search->{sort};
# NOTE(review): the 'filtered' query was deprecated in Elasticsearch 2.0 and
# removed in 5.0 - confirm the targeted ES version.
320 { filtered => { filter => $filter_part, query => $query_part } }
324 $query = { query => $query_part };
# Merge the sort specification (if any) into the top level of the query.
326 $query = { %$query, %sort };
331 =head2 build_authorities_query_compat
334 $builder->build_authorities_query_compat( \@marclist, \@and_or,
335 \@excluding, \@operator, \@value, $authtypecode, $orderby );
337 This builds a query for searching for authorities, in the style of
338 L<C4::AuthoritiesMarc::SearchAuthorities>.
346 An arrayref containing where the particular term should be searched for.
347 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
348 thesaurus. If left blank, any field is used.
352 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
360 What form of search to do. Options are: is (phrase, no truncation, whole field
361 must match), = (number exact match), exact (phrase, but with left and right
362 truncation). If left blank, then word list, right truncated, anywhere is used.
366 The actual user-provided string value to search for.
370 The authority type code to search within. If blank, then all will be searched.
374 The order to sort the results by. Options are Relevance, HeadingAsc,
375 HeadingDsc, AuthidAsc, AuthidDsc.
379 marclist, operator, and value must be the same length, and the values at
380 index /i/ all relate to each other.
382 This returns a query, which is a black box object that can be passed to the
383 appropriate search object.
# Adapts the old-style positional authority search arguments into the hash
# form taken by build_authorities_query, then delegates to it.
# $marclist, $operator and $value are arrayrefs read in parallel by index.
387 sub build_authorities_query_compat {
388 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
389 $authtypecode, $orderby )
392 # This turns the old-style many-options argument form into a more
393 # extensible hash form that is understood by L<build_authorities_query>.
# Maps the Koha-side names of search locations to index field names.
396 my %koha_to_index_name = (
397 mainmainentry => 'Heading-Main',
398 mainentry => 'Heading',
400 'match-heading' => 'Match-heading',
401 'see-from' => 'Match-heading-see-from',
402 thesaurus => 'Subject-heading-thesaurus',
406 # Make sure everything exists
# confess (with a stack trace) on any marclist entry missing from the map.
407 foreach my $m (@$marclist) {
408 confess "Invalid marclist field provided: $m" unless exists $koha_to_index_name{$m};
# One search entry per element of @$value, using the same index in
# @$marclist and @$operator.
410 for ( my $i = 0 ; $i < @$value ; $i++ ) {
413 where => $koha_to_index_name{$marclist->[$i]},
414 operator => $operator->[$i],
415 value => $value->[$i],
# The sort field comes from the prefix of $orderby (Heading... / Auth...);
# the order is 'asc' when $orderby ends in "Asc", otherwise 'desc'.
421 ( $orderby =~ /^Heading/ ) ? 'Heading'
422 : ( $orderby =~ /^Auth/ ) ? 'Local-Number'
425 my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
426 %sort = ( $sort_field => $sort_order, );
429 searches => \@searches,
430 authtypecode => $authtypecode,
# Only pass a sort specification along when one was built.
432 $search{sort} = \%sort if %sort;
433 my $query = $self->build_authorities_query( \%search );
437 =head2 _convert_sort_fields
439 my @sort_params = _convert_sort_fields(@sort_by)
441 Converts the zebra-style sort index information into elasticsearch-style.
443 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
444 something that can be sent to L<build_query>.
# Converts zebra-style sort strings of the form "<field>_<order>" into
# hashrefs with 'field' and 'direction' keys.
# Entries whose field converts to a false value are dropped; that includes
# 'relevance', which maps to undef.
448 sub _convert_sort_fields {
449 my ( $self, @sort_by ) = @_;
451 # Turn the sorting into something we care about.
452 my %sort_field_convert = (
453 acqdate => 'acqdate',
455 call_number => 'callnum',
456 popularity => 'issues',
457 relevance => undef, # default
459 pubdate => 'pubdate',
# Order suffixes: dsc/za mean descending, asc/az mean ascending.
461 my %sort_order_convert =
462 ( qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
464 # Convert the fields and orders, drop anything we don't know about.
465 grep { $_->{field} } map {
# split works on $_, the current sort string: field name, then order.
466 my ( $f, $d ) = split /_/;
468 field => $sort_field_convert{$f},
469 direction => $sort_order_convert{$d}
474 =head2 _convert_index_fields
476 my @index_params = $self->_convert_index_fields(@indexes);
478 Converts zebra-style search index notation into elasticsearch-style.
480 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
481 and it returns something that can be sent to L<build_query>.
483 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Package variable mapping zebra-style index codes to the field names used
# here; read by _convert_index_fields and _convert_index_strings_freeform.
488 our %index_field_convert = (
494 'se' => 'title-series',
495 'callnum' => 'callnum',
498 'branch' => 'homebranch',
502 'hi' => 'Host-Item-Number',
# Converts zebra-style index names ("<field>" or "<field>,<type>") into
# hashrefs with 'field' and 'type' keys, dropping entries whose field does
# not convert.
505 sub _convert_index_fields {
506 my ( $self, @indexes ) = @_;
# Known type suffixes; a missing suffix uses the '__default' entry (undef).
508 my %index_type_convert =
509 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
511 # Convert according to our table, drop anything that doesn't convert.
512 # If a field starts with mc- we save it as it's used (and removed) later
513 # when joining things, to indicate we make it an 'OR' join.
514 # (Sorry, this got a bit ugly after special cases were found.)
515 grep { $_->{field} } map {
# split works on $_, the current index string: field code, then optional type.
516 my ( $f, $t ) = split /,/;
523 field => $index_field_convert{$f},
524 type => $index_type_convert{ $t // '__default' }
# Put the saved "mc-" marker back in front of the converted field name.
526 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
531 =head2 _convert_index_strings
533 my @searches = $self->_convert_index_strings(@searches);
535 Similar to L<_convert_index_fields>, this takes strings of the form
536 B<field:search term> and rewrites the field from zebra-style to
537 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrites "field:term" strings so the field part uses the converted
# (elasticsearch-style) name.
541 sub _convert_index_strings {
542 my ( $self, @searches ) = @_;
544 foreach my $s (@searches) {
# Split at the first colon: the field may contain word characters, commas
# and hyphens; everything after the colon is the term.
546 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
547 unless ( defined($field) && defined($term) ) {
# Convert the field; $conv is undefined when there is no conversion for it.
551 my ($conv) = $self->_convert_index_fields($field);
552 unless ( defined($conv) ) {
# Emit "<converted field>:<term>", with the term adjusted for the index type.
556 push @res, $conv->{field} . ":"
557 . $self->_modify_string_by_type( %$conv, operand => $term );
562 =head2 _convert_index_strings_freeform
564 my $search = $self->_convert_index_strings_freeform($search);
566 This is similar to L<_convert_index_strings>, however it'll search out the
567 things to change within the string. So it can handle strings such as
568 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
570 If there is something of the form "su,complete-subfield" or something, the
571 second part is stripped off as we can't yet handle that. Making it work
572 will have to wait for a real query parser.
# Replaces every "<zebra index>:" prefix found anywhere in $search with its
# converted name, discarding any ",<qualifier>" between the index and the colon.
576 sub _convert_index_strings_freeform {
577 my ( $self, $search ) = @_;
# NOTE(review): hash iteration order is unspecified, and $zeb is interpolated
# into the pattern without quotemeta - fine only while the keys contain no
# regex metacharacters and no replacement creates a new match; confirm.
578 while ( my ( $zeb, $es ) = each %index_field_convert ) {
579 $search =~ s/\b$zeb(?:,[\w-]*)?:/$es:/g;
584 =head2 _modify_string_by_type
586 my $str = $self->_modify_string_by_type(%index_field);
588 If you have a search term (operand) and a type (phrase, right-truncated), this
589 will convert the string to have the function in lucene search terms, e.g.
590 wrapping quotes around it.
# Decorates a search term according to its index type: 'right-truncate'
# appends '*', 'phrase' wraps the term in double quotes.
# Takes a hash with 'operand' (the term) and an optional 'type'.
594 sub _modify_string_by_type {
595 my ( $self, %idx ) = @_;
597 my $type = $idx{type} || '';
598 my $str = $idx{operand};
# Any false operand is returned unchanged - undef, '' and also '0'.
599 return $str unless $str; # Empty or undef, we can't use it.
601 $str .= '*' if $type eq 'right-truncate';
602 $str = '"' . $str . '"' if $type eq 'phrase';
608 my $query_str = $self->_join_queries(@query_parts);
610 This takes a list of query parts, that might be search terms on their own, or
611 booleaned together, or specifying fields, or whatever, wraps them in
612 parentheses, and ANDs them all together. Suitable for feeding to the ES
615 Note: doesn't AND them together if they specify an index that starts with "mc"
616 as that was a special case in the original code for dealing with multiple
617 choice options (you can't search for something that has an itype of A and
618 an itype of B otherwise.)
# Body of _join_queries (the "sub" line is not visible in this excerpt).
623 my ( $self, @parts ) = @_;
# Split the defined, non-empty parts in two: ordinary parts, and parts
# starting with "mc-", whose prefix is stripped by the non-destructive s///r.
625 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
627 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
# Nothing usable gives an empty list; a single part is returned as it is,
# without parentheses.
628 return () unless @norm_parts + @mc_parts;
629 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
# The "mc-" parts are parenthesised, ORed together and wrapped as one group.
631 @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
633 # Handy trick: $x || () inside a join means that if $x ends up as an
634 # empty string, it gets replaced with (), which makes join ignore it.
635 # (bad effect: this'll also happen to '0', this hopefully doesn't matter
# The ordinary parts are parenthesised and ANDed together.
638 join( ' AND ', map { "($_)" } @norm_parts ) || (),
644 my @phrased_queries = $self->_make_phrases(@query_parts);
646 This takes the supplied queries and forces them to be phrases by wrapping
647 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
648 the quotes outside of them if they're there.
# Body of _make_phrases (the "sub" line is not visible in this excerpt).
653 my ( $self, @parts ) = @_;
# Returns copies in which the text after a leading "field:" prefix is wrapped
# in double quotes; s///r leaves the originals untouched.
# NOTE(review): the pattern requires a colon, so a part without any "field:"
# prefix is returned unquoted - confirm this is intended.
654 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
657 =head2 _create_query_string
659 my @query_strings = $self->_create_query_string(@queries);
661 Given a list of hashrefs, it will turn them into a lucene-style query string.
662 The hash should contain field, type (both for the indexes), operator, and
# Turns a list of search hashrefs (with operator, field, type and operand
# keys) into lucene-style fragments of the form "<OPERATOR> (<field>:<operand>)".
667 sub _create_query_string {
668 my ( $self, @queries ) = @_;
# The operator (followed by a space) and the field (followed by a colon) are
# each left out when false.
671 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
672 my $field = $_->{field} ? $_->{field} . ':' : '';
# Apply phrase quoting or right truncation to the operand based on its type.
674 my $oand = $self->_modify_string_by_type(%$_);
675 "$otor($field$oand)";
679 =head2 _clean_search_term
681 my $term = $self->_clean_search_term($term);
683 This cleans a search term by removing any funny characters that may upset
684 ES and give us an error. It also calls L<_convert_index_strings_freeform>
685 to ensure those parts are correct.
# Cleans a user-supplied search term before it is placed in a query string.
689 sub _clean_search_term {
690 my ( $self, $term ) = @_;
692 # Some hardcoded searches (like with authorities) produce things like
693 # 'an=123', when it ought to be 'an:123' for our purposes.
# Rewrite any zebra-style index prefixes inside the term.
695 $term = $self->_convert_index_strings_freeform($term);
700 =head2 _fix_limit_special_cases
702 my $limits = $self->_fix_limit_special_cases($limits);
704 This converts any special cases that the limit specifications have into things
705 that are more readily processable by the rest of the code.
707 The argument should be an arrayref, and it'll return an arrayref.
# Rewrites special-case limit strings into "field:value" forms that the rest
# of the code can process. Takes an arrayref of limit strings.
711 sub _fix_limit_special_cases {
712 my ( $self, $limits ) = @_;
715 foreach my $l (@$limits) {
717 # This is set up by opac-search.pl
# Year range "yr,st-numeric,ge=A and yr,st-numeric,le=B" becomes
# "copydate:[A TO B]"; it is skipped when both bounds cannot be captured.
718 if ( $l =~ /^yr,st-numeric,ge=/ ) {
719 my ( $start, $end ) =
720 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
721 next unless defined($start) && defined($end);
722 push @new_lim, "copydate:[$start TO $end]";
# Single year "yr,st-numeric=Y" becomes "copydate:Y".
724 elsif ( $l =~ /^yr,st-numeric=/ ) {
725 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
726 next unless defined($date);
727 push @new_lim, "copydate:$date";
# The bare limit "available" becomes "onloan:false".
729 elsif ( $l =~ /^available$/ ) {
730 push @new_lim, 'onloan:false';
741 my $field = $self->_sort_field($field);
743 Given a field name, this works out what the actual name of the version to sort
744 on should be. Often it's the same, sometimes it involves sticking "__sort" on
745 the end. Maybe it'll be something else in the future, who knows?
751 if ($self->sort_fields()->{$f}) {