1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::ElasticSearch);
45 use List::MoreUtils qw/ each_array /;
49 use Data::Dumper; # TODO remove
53 my $simple_query = $builder->build_query("hello", %options)
55 This will build a query that can be issued to elasticsearch from the provided
56 string input. This expects a lucene style search form (see
57 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
60 It'll make an attempt to respect the various query options.
62 Additional options can be provided with the C<%options> hash.
68 This should be an arrayref of hashrefs, each containing a C<field> and an
69 C<direction> (optional, defaults to C<asc>.) The results will be sorted
70 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Body of build_query: takes a query string plus an options hash and fills
# in $res with the query settings, any requested sort clauses and the facet
# definitions. The "sub" line and several structural lines are not visible
# in this listing.
77 my ( $self, $query, %options ) = @_;
# System preferences, each falling back to 0 when unset or false. Only
# $fuzzy_enabled is referenced again in the visible lines.
79 my $stemming = C4::Context->preference("QueryStemming") || 0;
80 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
81 my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
82 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# No query supplied: fall back to the '*' wildcard.
84 $query = '*' unless defined $query;
# Fuzzy matching is 'auto' only when the QueryFuzzy preference is on.
90 fuzziness => $fuzzy_enabled ? 'auto' : '0',
91 default_operator => 'AND',
92 default_field => '_all',
93 lenient => JSON::true,
# Turn each requested { field, direction } pair into a sort clause.
97 if ( $options{sort} ) {
98 foreach my $sort ( @{ $options{sort} } ) {
99 my ( $f, $d ) = @$sort{qw/ field direction /};
# Only 'asc' and 'desc' are accepted; a missing direction means 'asc'.
100 die "Invalid sort direction, $d"
101 if $d && ( $d ne 'asc' && $d ne 'desc' );
102 $d = 'asc' unless $d;
104 # TODO account for fields that don't have a 'phrase' type
# Resolve the sortable field name, then sort on its "phrase" subfield.
106 $f = $self->_sort_field($f);
107 push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
111 # See _convert_facets in Search.pm for how these get turned into
112 # things that Koha can use.
# Facet name on the left, the indexed "__facet" field it is built from on
# the right. Note that 'location' is fed from the homebranch facet field.
114 author => { terms => { field => "author__facet" } },
115 subject => { terms => { field => "subject__facet" } },
116 itype => { terms => { field => "itype__facet" } },
117 location => { terms => { field => "homebranch__facet" } },
118 'su-geo' => { terms => { field => "su-geo__facet" } },
119 se => { terms => { field => "se__facet" } },
124 =head2 build_browse_query
126 my $browse_query = $builder->build_browse_query($field, $query);
128 This performs a "starts with" style query on a particular field. The field
129 to be searched must have been indexed with an appropriate mapping as a
130 "phrase" subfield, which pretty much everything has.
133 # XXX this isn't really a browse query like we want in the end
# Builds a "starts with" style query for $query against $field, ordered
# ascending on the field's sortable "phrase" subfield.
134 sub build_browse_query {
135 my ( $self, $field, $query ) = @_;
137 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# Nothing to search for: return a bare wildcard query straight away.
139 return { query => '*' } if !defined $query;
141 # TODO this should come from Koha::Elasticsearch
142 my %field_whitelist = (
# Any field that is not whitelisted is replaced by 'title'.
146 $field = 'title' if !exists $field_whitelist{$field};
147 my $sort = $self->_sort_field($field);
150 match_phrase_prefix => {
# Fuzzy matching is 'auto' only when the QueryFuzzy preference is on.
154 fuzziness => $fuzzy_enabled ? 'auto' : '0',
158 sort => [ { "$sort.phrase" => { order => "asc" } } ],
162 =head2 build_query_compat
165 $error, $query, $simple_query, $query_cgi,
166 $query_desc, $limit, $limit_cgi, $limit_desc,
167 $stopwords_removed, $query_type
169 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
170 \@limits, \@sort_by, $scan, $lang );
172 This handles a search using the same api as L<C4::Search::buildQuery> does.
174 A very simple query will go in with C<$operands> set to ['query'], and
175 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
176 C<$query> set to something that can perform the search, C<$simple_query>
177 set to just the search term, C<$query_cgi> set to something that can
178 reproduce this search, and C<$query_desc> set to something else.
# Builds an elasticsearch query from arguments shaped like those of
# C4::Search::buildQuery.
#
# Parameters (all arrayrefs unless noted):
#   $operators   - boolean operators joining the operands; one fewer than
#                  the operands, since the first operand has none
#   $operands    - the search terms
#   $indexes     - zebra-style index names, positionally matching $operands
#   $orig_limits - zebra-style limit strings
#   $sort_by     - zebra-style sort specifications, e.g. 'pubdate_dsc'
#   $scan, $lang - accepted for API compatibility; not used here
#
# Returns the ten-element list
#   ( $error, $query, $simple_query, $query_cgi, $query_desc,
#     $limit, $limit_cgi, $limit_desc, $stopwords_removed, $query_type )
# in which $error, $stopwords_removed and $query_type are always undef.
sub build_query_compat {
    my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
        $lang )
      = @_;

    my @sort_params  = $self->_convert_sort_fields(@$sort_by);
    my @index_params = $self->_convert_index_fields(@$indexes);
    my $limits       = $self->_fix_limit_special_cases($orig_limits);

    # The first operand can't have an operator in front of it, so pad the
    # operator list with a leading undef to line it up with the operands.
    # This is done on a copy: the caller's array must not grow on each call.
    my @padded_operators = ( undef, @{ $operators // [] } );

    # Merge the indexes in with the search terms and the operators so that
    # each search thing is a handy unit.
    my @search_params;
    my $ea = each_array( @$operands, @padded_operators, @index_params );
    while ( my ( $oand, $otor, $index ) = $ea->() ) {
        next if ( !defined($oand) || $oand eq '' );
        push @search_params, {
            operand  => $self->_clean_search_term($oand),      # the search terms
            operator => defined($otor) ? uc $otor : undef,     # AND and so on
            $index ? %$index : (),
        };
    }

    # We build a string query from limits and the queries. An alternative
    # would be to pass them separately into build_query and let it build
    # them into a structured ES query itself.
    my $query_str = join( ' AND ',
        join( ' ', $self->_create_query_string(@search_params) ) || (),
        $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );

    # If there's no query on the left, let's remove the junk left behind
    $query_str =~ s/^ AND //;

    my %options;
    $options{sort} = \@sort_params;
    my $query = $self->build_query( $query_str, %options );

    # We roughly emulate the CGI parameters of the zebra query builder.
    # Plain ternaries are used rather than "my $x = ... if COND": a "my"
    # with a statement modifier has undefined behaviour in Perl.
    my $query_cgi =
      @$operands
      ? 'idx=kw&q=' . uri_escape_utf8( $operands->[0] )
      : undef;
    my $simple_query = @$operands == 1 ? $operands->[0] : undef;
    my $query_desc   = $simple_query;
    my $limit =
      $self->_join_queries( $self->_convert_index_strings(@$limits) );

    # NOTE(review): the "no limits" branch was not visible in the reviewed
    # listing; an empty string is assumed here - confirm.
    my $limit_cgi = ( $orig_limits and @$orig_limits )
      ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
      : '';
    my $limit_desc = $limit ? "$limit" : undef;

    return (
        undef,  $query,     $simple_query, $query_cgi, $query_desc,
        $limit, $limit_cgi, $limit_desc,   undef,      undef
    );
}
236 =head2 build_authorities_query
238 my $query = $builder->build_authorities_query(\%search);
240 This takes a nice description of an authority search and turns it into a black-box
241 query that can then be passed to the appropriate searcher.
243 The search description is a hashref that looks something like:
248 where => 'Heading', # search the main entry
249 operator => 'exact', # require an exact match
250 value => 'frogs', # the search string
253 where => '', # search all entries
254 operator => '', # default keyword, right truncation
262 authtypecode => 'TOPIC_TERM',
# Turns a description of an authority search into an elasticsearch query.
#
# $search is a hashref. The keys read here are:
#   searches - arrayref of { where, operator, value } hashrefs. An empty or
#              missing "where" searches '_all'. "operator" selects the match:
#                'is' or '=' - whole-field match (becomes a term filter)
#                'exact'     - exact phrase
#                'start'     - field begins with the value
#                other/none  - regular word match
#   sort     - optional hashref of field name => order
# NOTE(review): an 'authtypecode' key is documented for $search but is not
# read by this code - confirm whether that is intended.
#
# Returns a hashref with a 'query' key and, when a sort was requested, a
# 'sort' key. $search itself is left unmodified.
sub build_authorities_query {
    my ( $self, $search ) = @_;

    # Start by making the query parts
    my @query_parts;
    my @filter_parts;
    foreach my $s ( @{ $search->{searches} // [] } ) {
        my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};

        # Treat a missing where/operator like an empty one so that the
        # comparisons below never see undef.
        $wh = '_all' if !defined $wh || $wh eq '';
        $op //= '';

        if ( $op eq 'is' || $op eq '=' ) {

            # look for something that matches completely
            # note, '=' is about numerical vals. May need special handling.
            # Also, we lowercase our search because the ES index lowercases
            # its values, and term searches don't get the search analyzer
            # applied to them.
            push @filter_parts, { term => { "$wh.phrase" => lc $val } };
        }
        elsif ( $op eq 'exact' ) {

            # left and right truncation, otherwise an exact phrase
            push @query_parts, { match_phrase => { $wh => $val } };
        }
        elsif ( $op eq 'start' ) {

            # the field has to begin with the (lowercased) value
            push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } };
        }
        else {

            # regular wordlist stuff
            push @query_parts, { match => { $wh => $val } };
        }
    }

    # Merge the query and filter parts appropriately
    # 'should' behaves like 'or', if we want 'and', use 'must'
    my $query_part  = { bool => { should => \@query_parts } };
    my $filter_part = { bool => { should => \@filter_parts } };

    # We need to add '.phrase' to all the sort headings otherwise it'll sort
    # based on the tokenised form. The rewritten keys go into a fresh hash:
    # writing them back into $search would make a second call with the same
    # hashref append '.phrase' again.
    my %sort;
    if ( exists $search->{sort} ) {
        my %phrase_sort;
        foreach my $k ( keys %{ $search->{sort} } ) {
            my $f = $self->_sort_field($k);
            $phrase_sort{"$f.phrase"} = $search->{sort}{$k};
        }
        %sort = ( sort => [ \%phrase_sort ] );
    }

    # Only wrap the query in a filter when there is something to filter on.
    my $query =
      @filter_parts
      ? { query =>
          { filtered => { filter => $filter_part, query => $query_part } } }
      : { query => $query_part };

    return { %$query, %sort };
}
335 =head2 build_authorities_query_compat
338 $builder->build_authorities_query_compat( \@marclist, \@and_or,
339 \@excluding, \@operator, \@value, $authtypecode, $orderby );
341 This builds a query for searching for authorities, in the style of
342 L<C4::AuthoritiesMarc::SearchAuthorities>.
350 An arrayref containing where the particular term should be searched for.
351 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
352 thesaurus. If left blank, any field is used.
356 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
364 What form of search to do. Options are: is (phrase, no truncation, whole field
365 must match), = (number exact match), exact (phrase, but with left and right
366 truncation). If left blank, then word list, right truncated, anywhere is used.
370 The actual user-provided string value to search for.
374 The authority type code to search within. If blank, then all will be searched.
378 The order to sort the results by. Options are Relevance, HeadingAsc,
379 HeadingDsc, AuthidAsc, AuthidDsc.
383 marclist, operator, and value must be the same length, and the values at
384 index /i/ all relate to each other.
386 This returns a query, which is a black box object that can be passed to the
387 appropriate search object.
# Adapts the positional arguments into the hash form taken by
# build_authorities_query and passes it on to that method.
# $and_or and $excluding are accepted but not referenced in the visible lines.
391 sub build_authorities_query_compat {
392 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
393 $authtypecode, $orderby )
396 # This turns the old-style many-options argument form into a more
397 # extensible hash form that is understood by L<build_authorities_query>.
# Caller-facing search locations on the left, index field names on the right.
400 my %koha_to_index_name = (
401 mainmainentry => 'Heading-Main',
402 mainentry => 'Heading',
404 'match-heading' => 'Match-heading',
405 'see-from' => 'Match-heading-see-from',
406 thesaurus => 'Subject-heading-thesaurus',
410 # Make sure everything exists
# An unknown location is a programming error, hence confess (with backtrace).
411 foreach my $m (@$marclist) {
412 confess "Invalid marclist field provided: $m" unless exists $koha_to_index_name{$m};
# One search entry per value; $marclist, $operator and $value are read at
# the same index $i.
414 for ( my $i = 0 ; $i < @$value ; $i++ ) {
417 where => $koha_to_index_name{$marclist->[$i]},
418 operator => $operator->[$i],
419 value => $value->[$i],
# The sort field comes from the start of $orderby: "Heading..." sorts on
# 'Heading', "Auth..." on 'Local-Number'.
425 ( $orderby =~ /^Heading/ ) ? 'Heading'
426 : ( $orderby =~ /^Auth/ ) ? 'Local-Number'
# Anything that does not end in "Asc" sorts descending.
429 my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
430 %sort = ( $sort_field => $sort_order, );
433 searches => \@searches,
434 authtypecode => $authtypecode,
# Only attach a sort when one was derived above.
436 $search{sort} = \%sort if %sort;
437 my $query = $self->build_authorities_query( \%search );
441 =head2 _convert_sort_fields
443 my @sort_params = _convert_sort_fields(@sort_by)
445 Converts the zebra-style sort index information into elasticsearch-style.
447 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
448 something that can be sent to L<build_query>.
# Converts zebra-style sort specifications ("field_order" strings) into
# { field, direction } hashrefs, dropping those whose field is unknown.
452 sub _convert_sort_fields {
453 my ( $self, @sort_by ) = @_;
455 # Turn the sorting into something we care about.
# Zebra sort name on the left, elasticsearch field on the right. 'relevance'
# maps to undef, so it is removed by the grep below.
456 my %sort_field_convert = (
457 acqdate => 'acqdate',
459 call_number => 'callnum',
460 popularity => 'issues',
461 relevance => undef, # default
463 pubdate => 'pubdate',
# Zebra order suffixes mapped onto 'asc' / 'desc'.
465 my %sort_order_convert =
466 ( qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
468 # Convert the fields and orders, drop anything we don't know about.
# NOTE(review): split /_/ has no limit, so 'call_number_asc' splits into
# 'call', 'number', 'asc' and is looked up as field 'call' with order
# 'number'; the call_number entry above looks unreachable - confirm.
469 grep { $_->{field} } map {
470 my ( $f, $d ) = split /_/;
472 field => $sort_field_convert{$f},
473 direction => $sort_order_convert{$d}
478 =head2 _convert_index_fields
480 my @index_params = $self->_convert_index_fields(@indexes);
482 Converts zebra-style search index notation into elasticsearch-style.
484 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
485 and it returns something that can be sent to L<build_query>.
487 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Package-level table: zebra-style index name on the left, elasticsearch
# field name on the right. Read by _convert_index_fields and
# _convert_index_strings_freeform.
492 our %index_field_convert = (
498 'se' => 'title-series',
499 'callnum' => 'callnum',
502 'branch' => 'homebranch',
506 'hi' => 'Host-Item-Number',
# Converts zebra-style "index" or "index,type" strings into hashrefs with a
# 'field' and a 'type', dropping any whose field does not convert.
509 sub _convert_index_fields {
510 my ( $self, @indexes ) = @_;
# Type qualifier after the comma: 'phr' means phrase, 'rtrn' means right
# truncation; no qualifier looks up '__default', which is undef.
512 my %index_type_convert =
513 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
515 # Convert according to our table, drop anything that doesn't convert.
516 # If a field starts with mc- we save it as it's used (and removed) later
517 # when joining things, to indicate we make it an 'OR' join.
518 # (Sorry, this got a bit ugly after special cases were found.)
519 grep { $_->{field} } map {
# Split "index,type" into its two halves; $t is undef without a comma.
520 my ( $f, $t ) = split /,/;
527 field => $index_field_convert{$f},
528 type => $index_type_convert{ $t // '__default' }
# Put the saved prefix back in front of the converted field name.
# NOTE(review): the lines that set $mc and $r are not visible here.
530 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
535 =head2 _convert_index_strings
537 my @searches = $self->_convert_index_strings(@searches);
539 Similar to L<_convert_index_fields>, this takes strings of the form
540 B<field:search term> and rewrites the field from zebra-style to
541 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrites "field:search term" strings so that the field is the
# elasticsearch name and the term is decorated according to the field type.
545 sub _convert_index_strings {
546 my ( $self, @searches ) = @_;
548 foreach my $s (@searches) {
# Split at the first colon; the field part may only contain word
# characters, commas and hyphens.
550 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
# Not of the form "field:term".
# NOTE(review): the body of this branch is not visible here.
551 unless ( defined($field) && defined($term) ) {
# Take the first converted entry; $conv is undef when the field is unknown.
555 my ($conv) = $self->_convert_index_fields($field);
556 unless ( defined($conv) ) {
# Re-assemble as "es-field:term", with the term quoted or truncated to suit
# the type that came back with the field.
560 push @res, $conv->{field} . ":"
561 . $self->_modify_string_by_type( %$conv, operand => $term );
566 =head2 _convert_index_strings_freeform
568 my $search = $self->_convert_index_strings_freeform($search);
570 This is similar to L<_convert_index_strings>, however it'll search out the
571 things to change within the string. So it can handle strings such as
572 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
574 If there is something of the form "su,complete-subfield" or something, the
575 second part is stripped off as we can't yet handle that. Making it work
576 will have to wait for a real query parser.
# Rewrites every zebra-style index prefix found anywhere inside $search
# ("su:", "su,phr:", ...) to its elasticsearch field name, using the
# package-level %index_field_convert table. A ",qualifier" after the index
# name is dropped, as it can't be handled yet.
#
# Returns the rewritten string; an undefined or empty $search is returned
# as it was given.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;

    return $search unless defined $search && $search ne '';

    # One alternation of all the index names. quotemeta keeps any regex
    # metacharacters in a name literal; sorting makes the pattern the same
    # on every run. "keys" is used rather than "each" because "each" shares
    # one iterator per hash, and this hash is a package global.
    my $zebra_names = join '|',
      map { quotemeta } sort keys %index_field_convert;
    return $search if $zebra_names eq '';

    # A single pass over the string, so the text produced by one replacement
    # is never picked up and rewritten again by another.
    $search =~ s/\b($zebra_names)(?:,[\w-]*)?:/$index_field_convert{$1}:/g;

    return $search;
}
588 =head2 _modify_string_by_type
590 my $str = $self->_modify_string_by_type(%index_field);
592 If you have a search term (operand) and a type (phrase, right-truncated), this
593 will rewrite the term so that it expresses that type in lucene query syntax,
594 e.g. by wrapping quotes around a phrase.
# Decorates a search operand so that it expresses the given index type in
# lucene query-string syntax.
#
# Arguments after $self form a flat hash; the keys read here are:
#   operand - the search term
#   type    - 'phrase' wraps the term in double quotes, 'right-truncate'
#             appends '*'; anything else leaves the term unchanged
#
# Returns the decorated string. An undefined or empty operand is returned
# untouched, as there is nothing to decorate.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $type = $idx{type} // '';
    my $str  = $idx{operand};

    # Check for "nothing there" explicitly: a plain truth test would also
    # turn away the valid search term '0'.
    return $str unless defined $str && $str ne '';

    $str .= '*' if $type eq 'right-truncate';
    $str = '"' . $str . '"' if $type eq 'phrase';

    return $str;
}
612 my $query_str = $self->_join_queries(@query_parts);
614 This takes a list of query parts, that might be search terms on their own, or
615 booleaned together, or specifying fields, or whatever, wraps them in
616 parentheses, and ANDs them all together. Suitable for feeding to the ES
619 Note: doesn't AND them together if they specify an index that starts with "mc"
620 as that was a special case in the original code for dealing with multiple
621 choice options (you can't search for something that has an itype of A and
622 an itype of B otherwise.)
# Body of _join_queries: combines query parts into one string, ANDing the
# ordinary parts and ORing the "mc-" prefixed ones. The "sub" line is not
# visible in this listing.
627 my ( $self, @parts ) = @_;
# Ordinary parts: defined, non-empty and not starting with "mc-".
629 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
# Multiple-choice parts: those starting with "mc-", with that prefix removed.
631 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
# Nothing usable at all: return the empty list.
632 return () unless @norm_parts + @mc_parts;
# A single part is returned as it is, without wrapping parentheses.
633 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
# The multiple-choice parts become one parenthesised group joined with OR.
635 @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
637 # Handy trick: $x || () inside a join means that if $x ends up as an
638 # empty string, it gets replaced with (), which makes join ignore it.
639 # (bad effect: this'll also happen to '0', this hopefully doesn't matter
# Each ordinary part is parenthesised, and they are joined with AND.
642 join( ' AND ', map { "($_)" } @norm_parts ) || (),
648 my @phrased_queries = $self->_make_phrases(@query_parts);
650 This takes the supplied queries and forces them to be phrases by wrapping
651 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
652 the quotes outside of them if they're there.
# Body of _make_phrases: the "sub" line is not visible in this listing.
657 my ( $self, @parts ) = @_;
# For a part of the form "field:rest", wrap the rest in double quotes and
# leave the field prefix outside them. The /r flag returns a modified copy,
# so @parts is not changed; a part without a colon comes back unmodified.
658 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
661 =head2 _create_query_string
663 my @query_strings = $self->_create_query_string(@queries);
665 Given a list of hashrefs, it will turn them into a lucene-style query string.
666 The hash should contain field, type (both for the indexes), operator, and
# Turns each query hashref into a lucene-style string of the form
# "OPERATOR (field:operand)".
671 sub _create_query_string {
672 my ( $self, @queries ) = @_;
# The operator (followed by a space) and the "field:" prefix are only
# emitted when the corresponding value is set.
675 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
676 my $field = $_->{field} ? $_->{field} . ':' : '';
# The operand is quoted or truncated according to the 'type' in the hash.
678 my $oand = $self->_modify_string_by_type(%$_);
679 "$otor($field$oand)";
683 =head2 _clean_search_term
685 my $term = $self->_clean_search_term($term);
687 This cleans a search term by removing any funny characters that may upset
688 ES and give us an error. It also calls L<_convert_index_strings_freeform>
689 to ensure those parts are correct.
# Tidies up a single search term before it is put into a query string.
693 sub _clean_search_term {
694 my ( $self, $term ) = @_;
696 # Some hardcoded searches (like with authorities) produce things like
697 # 'an=123', when it ought to be 'an:123' for our purposes.
# Rewrite any zebra-style index prefixes inside the term to their
# elasticsearch field names.
699 $term = $self->_convert_index_strings_freeform($term);
704 =head2 _fix_limit_special_cases
706 my $limits = $self->_fix_limit_special_cases($limits);
708 This converts any special cases that the limit specifications have into things
709 that are more readily processable by the rest of the code.
711 The argument should be an arrayref, and it'll return an arrayref.
# Rewrites the limit strings that need special treatment into forms that the
# later conversion steps can work with. $limits is an arrayref of strings.
715 sub _fix_limit_special_cases {
716 my ( $self, $limits ) = @_;
719 foreach my $l (@$limits) {
721 # This is set up by opac-search.pl
# A year range ("...ge=START and ...le=END") becomes a lucene range query
# on copydate.
722 if ( $l =~ /^yr,st-numeric,ge=/ ) {
723 my ( $start, $end ) =
724 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
# A "ge=" limit without the matching "le=" half is skipped.
725 next unless defined($start) && defined($end);
726 push @new_lim, "copydate:[$start TO $end]";
# A single year becomes a plain copydate term.
728 elsif ( $l =~ /^yr,st-numeric=/ ) {
729 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
730 next unless defined($date);
731 push @new_lim, "copydate:$date";
# The bare word "available" is rewritten to "onloan:false".
733 elsif ( $l =~ /^available$/ ) {
734 push @new_lim, 'onloan:false';
745 my $field = $self->_sort_field($field);
747 Given a field name, this works out what the actual name of the version to sort
748 on should be. Often it's the same, sometimes it involves sticking "__sort" on
749 the end. Maybe it'll be something else in the future, who knows?
755 if ($self->sort_fields()->{$f}) {