1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
45 use List::MoreUtils qw/ each_array /;
54 my $simple_query = $builder->build_query("hello", %options)
56 This will build a query that can be issued to elasticsearch from the provided
57 string input. This expects a lucene style search form (see
58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
61 It'll make an attempt to respect the various query options.
63 Additional options can be provided with the C<%options> hash.
69 This should be an arrayref of hashrefs, each containing a C<field> and an
70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Body of build_query: takes a lucene-style query string plus %options and
# fills in $res with the query settings, an optional sort list and the facet
# aggregations. Options read in the visible lines: fields, sort and
# expanded_facet.
78 my ( $self, $query, %options ) = @_;
# System preferences that tune the query.
# NOTE(review): only $fuzzy_enabled is referenced in the lines visible in this
# section -- confirm whether the other three are still needed.
80 my $stemming = C4::Context->preference("QueryStemming") || 0;
81 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
82 my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
83 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# No query supplied: fall back to the '*' wildcard.
85 $query = '*' unless defined $query;
# Fuzziness is 'auto' only when the QueryFuzzy preference is on; terms are
# ANDed by default and 'fields' is passed through from the caller untouched.
91 fuzziness => $fuzzy_enabled ? 'auto' : '0',
92 default_operator => 'AND',
93 default_field => '_all',
94 lenient => JSON::true,
95 fields => $options{fields},
# Optional sorting: each entry is a hashref with 'field' and 'direction'.
99 if ( $options{sort} ) {
100 foreach my $sort ( @{ $options{sort} } ) {
101 my ( $f, $d ) = @$sort{qw/ field direction /};
# Only 'asc' and 'desc' are accepted; a missing direction means 'asc'.
102 die "Invalid sort direction, $d"
103 if $d && ( $d ne 'asc' && $d ne 'desc' );
104 $d = 'asc' unless $d;
106 # TODO account for fields that don't have a 'phrase' type
# Resolve the field through _sort_field, then sort on its 'phrase' subfield.
108 $f = $self->_sort_field($f);
109 push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
113 # See _convert_facets in Search.pm for how these get turned into
114 # things that Koha can use.
# One terms aggregation per facet, each reading the matching '<name>__facet'
# field.
115 $res->{aggregations} = {
116 author => { terms => { field => "author__facet" } },
117 subject => { terms => { field => "subject__facet" } },
118 itype => { terms => { field => "itype__facet" } },
119 location => { terms => { field => "location__facet" } },
120 'su-geo' => { terms => { field => "su-geo__facet" } },
121 se => { terms => { field => "se__facet" } },
122 ccode => { terms => { field => "ccode__facet" } },
# Library facets depend on the DisplayLibraryFacets preference: 'home' and
# 'holding' add one aggregation each, 'both' adds the two of them.
# NOTE(review): the 'eq' comparisons assume the preference is defined; an
# unset preference would raise "uninitialized" warnings -- confirm.
125 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
126 if ( $display_library_facets eq 'both'
127 or $display_library_facets eq 'home' ) {
128 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
130 if ( $display_library_facets eq 'both'
131 or $display_library_facets eq 'holding' ) {
132 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
# The facet named in 'expanded_facet' gets its bucket count set from the
# FacetMaxCount preference.
134 if ( my $ef = $options{expanded_facet} ) {
135 $res->{aggregations}{$ef}{terms}{size} = C4::Context->preference('FacetMaxCount');
140 =head2 build_browse_query
142 my $browse_query = $builder->build_browse_query($field, $query);
144 This performs a "starts with" style query on a particular field. The field
145 to be searched must have been indexed with an appropriate mapping as a
146 "phrase" subfield, which pretty much everything has.
150 # XXX this isn't really a browse query like we want in the end
# Builds a "starts with" query on one field.
#   $field - field to browse; anything not in %field_whitelist becomes 'title'
#   $query - the prefix to look for
# Results are sorted ascending on the 'phrase' subfield of the sortable
# variant of the field.
151 sub build_browse_query {
152 my ( $self, $field, $query ) = @_;
154 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# Without a query there is nothing to prefix-match: return the bare wildcard.
156 return { query => '*' } if !defined $query;
158 # TODO this should come from Koha::SearchEngine::Elasticsearch
159 my %field_whitelist = (
# Unknown fields silently fall back to 'title' instead of raising an error.
163 $field = 'title' if !exists $field_whitelist{$field};
164 my $sort = $self->_sort_field($field);
167 match_phrase_prefix => {
# Fuzzy matching is only enabled when the QueryFuzzy preference is on.
171 fuzziness => $fuzzy_enabled ? 'auto' : '0',
175 sort => [ { "$sort.phrase" => { order => "asc" } } ],
179 =head2 build_query_compat
182 $error, $query, $simple_query, $query_cgi,
183 $query_desc, $limit, $limit_cgi, $limit_desc,
184 $stopwords_removed, $query_type
186 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
187 \@limits, \@sort_by, $scan, $lang );
189 This handles a search using the same api as L<C4::Search::buildQuery> does.
191 A very simple query will go in with C<$operands> set to ['query'], and
192 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
193 C<$query> set to something that can perform the search, C<$simple_query>
194 set to just the search term, C<$query_cgi> set to something that can
195 reproduce this search, and C<$query_desc> set to something else.
# Zebra-compatible entry point: converts the parallel operator / operand /
# index arrays, the limits and the sort list into one query string, hands that
# to build_query, and returns the ten-element list documented in the POD.
# NOTE(review): the caller's @$operators is modified in place (an undef is
# unshifted onto it), so re-using the same arrayref for a second call would
# shift the operators -- confirm that callers do not rely on it afterwards.
199 sub build_query_compat {
200 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
204 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
205 my @sort_params = $self->_convert_sort_fields(@$sort_by);
206 my @index_params = $self->_convert_index_fields(@$indexes);
207 my $limits = $self->_fix_limit_special_cases($orig_limits);
# The 'suppress' parameter adds a limit that only keeps records with
# suppress:0.
208 if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
210 # Merge the indexes in with the search terms and the operands so that
211 # each search thing is a handy unit.
212 unshift @$operators, undef; # The first one can't have an op
# Walk the three lists in parallel; operands that are undef or empty are
# skipped together with their operator and index.
214 my $ea = each_array( @$operands, @$operators, @index_params );
215 while ( my ( $oand, $otor, $index ) = $ea->() ) {
216 next if ( !defined($oand) || $oand eq '' );
217 push @search_params, {
218 operand => $self->_clean_search_term($oand), # the search terms
219 operator => defined($otor) ? uc $otor : undef, # AND and so on
220 $index ? %$index : (),
224 # We build a string query from limits and the queries. An alternative
225 # would be to pass them separately into build_query and let it build
226 # them into a structured ES query itself. Maybe later, though that'd be
228 my $query_str = join( ' AND ',
229 join( ' ', $self->_create_query_string(@search_params) ) || (),
230 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
# With 'weighted_fields' set, every weighted search field is added to the
# field list in the "name^weight" boost notation.
233 if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
234 push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
237 # If there's no query on the left, let's remove the junk left behind
238 $query_str =~ s/^ AND //;
240 $options{fields} = \@fields;
241 $options{sort} = \@sort_params;
242 $options{expanded_facet} = $params->{expanded_facet};
243 my $query = $self->build_query( $query_str, %options );
246 # We roughly emulate the CGI parameters of the zebra query builder
# Only the first operand is carried into the CGI string, and $simple_query is
# only set when there is exactly one operand.
248 $query_cgi = 'q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
250 $simple_query = $operands->[0] if @$operands == 1;
251 my $query_desc = $simple_query;
252 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
# The CGI form of the limits is rebuilt from the caller's original,
# unconverted limit strings.
253 my $limit_cgi = ( $orig_limits and @$orig_limits )
254 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
257 $limit_desc = "$limit" if $limit;
# Return list: the error, stopwords_removed and query_type slots are always
# undef.
259 undef, $query, $simple_query, $query_cgi, $query_desc,
260 $limit, $limit_cgi, $limit_desc, undef, undef
264 =head2 build_authorities_query
266 my $query = $builder->build_authorities_query(\%search);
268 This takes a nice description of an authority search and turns it into a black-box
269 query that can then be passed to the appropriate searcher.
271 The search description is a hashref that looks something like:
276 where => 'Heading', # search the main entry
277 operator => 'exact', # require an exact match
278 value => 'frogs', # the search string
281 where => '', # search all entries
282 operator => '', # default keyword, right truncation
290 authtypecode => 'TOPIC_TERM',
# Turns the search description hashref ($search->{searches}, optional
# $search->{sort}) into an Elasticsearch query in which every generated part
# must match.
# NOTE(review): $search->{sort} is replaced in place with the rewritten sort
# hash, so the caller's hashref is modified.
# NOTE(review): $search->{authtypecode} is not referenced in the lines visible
# here -- confirm whether it is applied elsewhere.
295 sub build_authorities_query {
296 my ( $self, $search ) = @_;
298 # Start by making the query parts
301 foreach my $s ( @{ $search->{searches} } ) {
302 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
# An empty 'where' means "search everywhere".
303 $wh = '_all' if $wh eq '';
304 if ( $op eq 'is' || $op eq '=' ) {
306 # look for something that matches a term completely
307 # note, '=' is about numerical vals. May need special handling.
308 # Also, we lowercase our search because the ES
309 # index lowercases its values, and term searches don't get the
310 # search analyzer applied to them.
311 push @query_parts, { term => {"$wh.phrase" => lc $val} };
313 elsif ( $op eq 'exact' ) {
314 # left and right truncation, otherwise an exact phrase
315 push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
317 elsif ( $op eq 'start' ) {
318 # startswith search, uses lowercase untokenized version of heading
319 push @query_parts, { prefix => {"$wh.lc_raw" => lc $val} };
322 # regular wordlist stuff
323 # push @query_parts, { match => {$wh => { query => $val, operator => 'and' }} };
# The value is split on whitespace and every word becomes its own lowercased
# "*word*" wildcard part.
324 my @values = split(' ',$val);
325 foreach my $v (@values) {
326 push @query_parts, { wildcard => { "$wh.phrase" => "*" . lc $v . "*" } };
331 # Merge the query parts appropriately
332 # 'should' behaves like 'or'
333 # 'must' behaves like 'and'
334 # Zebra results seem to match must so using that here
335 my $query = { query=>
337 { must => \@query_parts }
341 # We need to add '.phrase' to all the sort headings otherwise it'll sort
342 # based on the tokenised form.
# Each sort key is resolved through _sort_field and suffixed with '.phrase';
# the sort direction stored under the key is carried over unchanged.
344 if ( exists $search->{sort} ) {
345 foreach my $k ( keys %{ $search->{sort} } ) {
346 my $f = $self->_sort_field($k);
347 $s{"$f.phrase"} = $search->{sort}{$k};
349 $search->{sort} = \%s;
353 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
359 =head2 build_authorities_query_compat
362 $builder->build_authorities_query_compat( \@marclist, \@and_or,
363 \@excluding, \@operator, \@value, $authtypecode, $orderby );
365 This builds a query for searching for authorities, in the style of
366 L<C4::AuthoritiesMarc::SearchAuthorities>.
374 An arrayref containing where the particular term should be searched for.
375 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
376 thesaurus. If left blank, any field is used.
380 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
388 What form of search to do. Options are: is (phrase, no truncation, whole field
389 must match), = (number exact match), exact (phrase, but with left and right
390 truncation). If left blank, then word list, right truncated, anywhere is used.
394 The actual user-provided string value to search for.
398 The authority type code to search within. If blank, then all will be searched.
402 The order to sort the results by. Options are Relevance, HeadingAsc,
403 HeadingDsc, AuthidAsc, AuthidDsc.
407 marclist, operator, and value must be the same length, and the values at
408 index /i/ all relate to each other.
410 This returns a query, which is a black box object that can be passed to the
411 appropriate search object.
# Maps the legacy "marclist" names accepted by build_authorities_query_compat
# to the index names placed in the 'where' slot of each search. A marclist
# value that is not a key of this table is rejected by that sub.
415 our $koha_to_index_name = {
416 mainmainentry => 'Heading-Main',
417 mainentry => 'Heading',
419 'match-heading' => 'Match-heading',
420 'see-from' => 'Match-heading-see-from',
421 thesaurus => 'Subject-heading-thesaurus',
# Adapter from the old positional authority-search arguments to the hashref
# form taken by build_authorities_query.
# Throws Koha::Exceptions::WrongParameter when a marclist entry is not a key
# of $koha_to_index_name.
# NOTE(review): $and_or and $excluding are not referenced in the lines visible
# here.
425 sub build_authorities_query_compat {
426 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
427 $authtypecode, $orderby )
430 # This turns the old-style many-options argument form into a more
431 # extensible hash form that is understood by L<build_authorities_query>.
434 # Make sure everything exists
435 foreach my $m (@$marclist) {
436 Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
437 unless exists $koha_to_index_name->{$m};
# The arrays are parallel; the loop runs over @$value and picks the matching
# marclist and operator entries by index. Any false value is skipped, which
# also drops a search for the literal string "0".
439 for ( my $i = 0 ; $i < @$value ; $i++ ) {
440 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
443 where => $koha_to_index_name->{$marclist->[$i]},
444 operator => $operator->[$i],
445 value => $value->[$i],
# Sort field is chosen from the prefix of $orderby: "Heading..." sorts on
# 'Heading__sort', "Auth..." on 'Local-Number'.
451 ( $orderby =~ /^Heading/ ) ? 'Heading__sort'
452 : ( $orderby =~ /^Auth/ ) ? 'Local-Number'
# Direction comes from the suffix: "...Asc" is ascending, anything else
# descending.
455 my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
456 %sort = ( $sort_field => $sort_order, );
459 searches => \@searches,
460 authtypecode => $authtypecode,
# The sort key is only added when a sort was actually worked out.
462 $search{sort} = \%sort if %sort;
463 my $query = $self->build_authorities_query( \%search );
467 =head2 _convert_sort_fields
469 my @sort_params = $self->_convert_sort_fields(@sort_by);
471 Converts the zebra-style sort index information into elasticsearch-style.
473 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
474 something that can be sent to L<build_query>.
# Converts zebra-style sort strings of the form "<field>_<order>" (for example
# "pubdate_dsc") into hashrefs with 'field' and 'direction' keys.
# Entries whose field converts to a false value are dropped from the result;
# that covers unknown fields as well as 'relevance', which maps to undef.
478 sub _convert_sort_fields {
479 my ( $self, @sort_by ) = @_;
481 # Turn the sorting into something we care about.
482 my %sort_field_convert = (
483 acqdate => 'acqdate',
485 call_number => 'callnum',
486 popularity => 'issues',
487 relevance => undef, # default
489 pubdate => 'pubdate',
# Several spellings are accepted for each direction (dsc/desc/za, asc/az).
491 my %sort_order_convert =
492 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
494 # Convert the fields and orders, drop anything we don't know about.
495 grep { $_->{field} } map {
# The first group is greedy, so the split happens at the LAST underscore:
# "call_number_asc" gives field "call_number" and order "asc".
496 my ( $f, $d ) = /(.+)_(.+)/;
498 field => $sort_field_convert{$f},
499 direction => $sort_order_convert{$d}
504 =head2 _convert_index_fields
506 my @index_params = $self->_convert_index_fields(@indexes);
508 Converts zebra-style search index notation into elasticsearch-style.
510 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
511 and it returns something that can be sent to L<build_query>.
513 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Zebra-style index abbreviations mapped to the Elasticsearch field names used
# in their place. Looked up per field by _convert_index_fields and applied
# across whole query strings by _convert_index_strings_freeform.
518 our %index_field_convert = (
524 'se' => 'title-series',
525 'callnum' => 'callnum',
528 'branch' => 'homebranch',
532 'hi' => 'Host-Item-Number',
# Converts zebra-style index specifications ("field" or "field,type") into
# hashrefs with 'field' and 'type' keys. Entries whose field does not convert
# are dropped from the returned list.
535 sub _convert_index_fields {
536 my ( $self, @indexes ) = @_;
# Known type qualifiers; a missing qualifier falls back to '__default', which
# (like any qualifier not listed here) yields an undef type.
538 my %index_type_convert =
539 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
541 # Convert according to our table, drop anything that doesn't convert.
542 # If a field starts with mc- we save it as it's used (and removed) later
543 # when joining things, to indicate we make it an 'OR' join.
544 # (Sorry, this got a bit ugly after special cases were found.)
545 grep { $_->{field} } map {
# Part before the first comma is the index name, the part after it the type.
546 my ( $f, $t ) = split /,/;
553 field => $index_field_convert{$f},
554 type => $index_type_convert{ $t // '__default' }
# Put the saved prefix back in front of the converted name; $mc is set in
# lines not visible in this section (presumably 'mc-' -- confirm).
556 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
561 =head2 _convert_index_strings
563 my @searches = $self->_convert_index_strings(@searches);
565 Similar to L<_convert_index_fields>, this takes strings of the form
566 B<field:search term> and rewrites the field from zebra-style to
567 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrites "field:term" strings so that the field is an Elasticsearch field
# name and the term is decorated according to the index type.
571 sub _convert_index_strings {
572 my ( $self, @searches ) = @_;
574 foreach my $s (@searches) {
# Field is the shortest run of word characters, commas and hyphens before the
# first colon (leading whitespace ignored); everything after the colon is the
# term.
576 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
# No "field:" prefix found.
577 unless ( defined($field) && defined($term) ) {
581 my ($conv) = $self->_convert_index_fields($field);
# The field is not one the conversion table knows about.
582 unless ( defined($conv) ) {
# Converted field name, a colon, then the term with quotes or a trailing '*'
# added as the index type requires.
586 push @res, $conv->{field} . ":"
587 . $self->_modify_string_by_type( %$conv, operand => $term );
592 =head2 _convert_index_strings_freeform
594 my $search = $self->_convert_index_strings_freeform($search);
596 This is similar to L<_convert_index_strings>, however it'll search out the
597 things to change within the string. So it can handle strings such as
598 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
600 If there is something of the form "su,complete-subfield" or something, the
601 second part is stripped off as we can't yet handle that. Making it work
602 will have to wait for a real query parser.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;

    # Replace every zebra-style "index:" prefix found anywhere in the string
    # with its Elasticsearch field name. An optional ",qualifier" after the
    # index name (e.g. "su,complete-subfield:") is dropped with it.
    #
    # keys() is used instead of each(): each() keeps its position on the
    # shared package hash between calls, so a loop that was left early
    # anywhere else would make this one silently skip entries.
    foreach my $zeb ( keys %index_field_convert ) {
        my $es = $index_field_convert{$zeb};

        # \Q..\E makes sure the index name is matched literally even if a
        # name containing a regex metacharacter is ever added to the table.
        $search =~ s/\b\Q$zeb\E(?:,[\w\-]*)?:/$es:/g;
    }

    return $search;
}
614 =head2 _modify_string_by_type
616 my $str = $self->_modify_string_by_type(%index_field);
618 If you have a search term (operand) and a type (phrase, right-truncate), this
619 will rewrite the term into the matching lucene syntax, e.g. by wrapping
620 quotes around it or appending a C<*>.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    # %idx carries at least 'operand' (the search term) and optionally 'type'
    # ('phrase' or 'right-truncate'); any other keys are ignored.
    my $type = $idx{type} || '';
    my $str  = $idx{operand};

    # Undef or the empty string cannot be decorated, hand it back untouched.
    # Definedness and length are tested rather than truth so that a search
    # for the literal term "0" still gets its quotes or wildcard.
    return $str unless defined $str && length $str;

    $str .= '*' if $type eq 'right-truncate';
    $str = '"' . $str . '"' if $type eq 'phrase';

    return $str;
}
638 my $query_str = $self->_join_queries(@query_parts);
640 This takes a list of query parts, that might be search terms on their own, or
641 booleaned together, or specifying fields, or whatever, wraps them in
642 parentheses, and ANDs them all together. Suitable for feeding to the ES
645 Note: doesn't AND them together if they specify an index that starts with "mc"
646 as that was a special case in the original code for dealing with multiple
647 choice options (you can't search for something that has an itype of A and
648 an itype of B otherwise.)
# Body of _join_queries: combines query parts into one string. Parts starting
# with "mc-" are ORed with each other, all remaining parts are ANDed.
653 my ( $self, @parts ) = @_;
# Split the defined, non-empty parts into ordinary ones and "mc-" ones; the
# "mc-" marker is stripped from the latter.
655 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
657 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
# Nothing usable: empty list. Exactly one usable part: it is returned as is,
# without the parentheses added in the general case.
658 return () unless @norm_parts + @mc_parts;
659 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
# The multiple-choice parts become a single parenthesised OR group.
661 @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
663 # Handy trick: $x || () inside a join means that if $x ends up as an
664 # empty string, it gets replaced with (), which makes join ignore it.
665 # (bad effect: this'll also happen to '0', this hopefully doesn't matter
668 join( ' AND ', map { "($_)" } @norm_parts ) || (),
674 my @phrased_queries = $self->_make_phrases(@query_parts);
676 This takes the supplied queries and forces them to be phrases by wrapping
677 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
678 the quotes outside of them if they're there.
# Body of _make_phrases: returns a copy of each part with the text after a
# leading "field:" prefix wrapped in double quotes. The inputs are left
# untouched because s///r works on a copy.
# NOTE(review): the pattern requires a colon, so a part without a field prefix
# comes back unquoted -- confirm that this is the intended behaviour.
683 my ( $self, @parts ) = @_;
684 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
687 =head2 _create_query_string
689 my @query_strings = $self->_create_query_string(@queries);
691 Given a list of hashrefs, it will turn them into a lucene-style query string.
692 The hash should contain field, type (both for the indexes), operator, and
sub _create_query_string {
    my ( $self, @queries ) = @_;

    # One lucene fragment per search unit, shaped "<OPERATOR> (<field>:<term>)".
    # The operator and the field prefix are each left out when not set.
    my @fragments;
    foreach my $unit (@queries) {
        my $prefix = '';
        $prefix = $unit->{operator} . ' ' if $unit->{operator};

        my $scope = '';
        $scope = $unit->{field} . ':' if $unit->{field};

        # The operand gets its quotes or trailing '*' according to the type.
        my $operand = $self->_modify_string_by_type(%$unit);

        push @fragments, "$prefix($scope$operand)";
    }

    return @fragments;
}
709 =head2 _clean_search_term
711 my $term = $self->_clean_search_term($term);
713 This cleans a search term by removing any funny characters that may upset
714 ES and give us an error. It also calls L<_convert_index_strings_freeform>
715 to ensure those parts are correct.
# Prepares a user-supplied search term before it is placed in the query
# string: zebra index prefixes inside the term are converted and, when the
# QueryAutoTruncate preference is on, a wildcard is appended to its words.
719 sub _clean_search_term {
720 my ( $self, $term ) = @_;
722 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
724 # Some hardcoded searches (like with authorities) produce things like
725 # 'an=123', when it ought to be 'an:123' for our purposes.
# Rewrite any zebra-style "index:" prefixes to Elasticsearch field names.
727 $term = $self->_convert_index_strings_freeform($term);
# Truncation runs after the index conversion and only when enabled.
729 $term = $self->_truncate_terms($term) if ($auto_truncation);
733 =head2 _fix_limit_special_cases
735 my $limits = $self->_fix_limit_special_cases($limits);
737 This converts any special cases that the limit specifications have into things
738 that are more readily processable by the rest of the code.
740 The argument should be an arrayref, and it'll return an arrayref.
# Rewrites the limit strings that use a special syntax into plain
# "field:value" limits, collecting the results in @new_lim.
744 sub _fix_limit_special_cases {
745 my ( $self, $limits ) = @_;
748 foreach my $l (@$limits) {
750 # This is set up by opac-search.pl
# Year range, "ge=<start> and le=<end>", becomes a copydate range. A limit
# that starts like a range but does not have both ends is dropped.
751 if ( $l =~ /^yr,st-numeric,ge=/ ) {
752 my ( $start, $end ) =
753 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
754 next unless defined($start) && defined($end);
755 push @new_lim, "copydate:[$start TO $end]";
# A single year becomes a plain copydate limit.
757 elsif ( $l =~ /^yr,st-numeric=/ ) {
758 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
759 next unless defined($date);
760 push @new_lim, "copydate:$date";
# "available" is expressed as "not on loan".
762 elsif ( $l =~ /^available$/ ) {
763 push @new_lim, 'onloan:0';
774 my $field = $self->_sort_field($field);
776 Given a field name, this works out what the actual name of the version to sort
777 on should be. Often it's the same, sometimes it involves sticking "__sort" on
778 the end. Maybe it'll be something else in the future, who knows?
# Fields with a true entry in sort_fields() take this branch; the branch body
# is not visible in this section (presumably where "__sort" is appended, as
# the POD describes -- confirm).
784 if ($self->sort_fields()->{$f}) {
790 =head2 _truncate_terms
792 my $query = $self->_truncate_terms($query);
794 Given a string query this function appends a '*' wildcard to all terms except
795 boolean operators, double quoted strings and terms already ending in '*'.
sub _truncate_terms {
    my ( $self, $query ) = @_;

    # Boolean keywords never receive a wildcard (compared case-insensitively).
    my %is_keyword = map { $_ => 1 } qw/and or not/;

    # The separator pattern is captured, so split() hands back the quoted
    # phrases (with their optional "field:" prefix) as pieces of their own:
    # '"donald duck" title:"the mouse" and peter' comes back as
    # ('', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ',
    #  'and', ' ', 'peter').
    my @pieces = split /((?:[\w\-.]+:)?"[^"]+"|\s+)/, $query;

    my @terms;
    foreach my $piece (@pieces) {

        # Skip the empty and whitespace-only pieces produced by the split.
        next if $piece =~ /^\s*$/;

        # Quoted phrases, already truncated words and keywords stay as they
        # are; everything else gets a trailing '*'.
        if ( $piece =~ /"$/ || $piece =~ /\*$/ || $is_keyword{ lc $piece } ) {
            push @terms, $piece;
        }
        else {
            push @terms, "$piece*";
        }
    }

    return join ' ', @terms;
}