1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::ElasticSearch);
45 use List::MoreUtils qw/ each_array /;
50 use Data::Dumper; # TODO remove
54 my $simple_query = $builder->build_query("hello", %options)
56 This will build a query that can be issued to elasticsearch from the provided
57 string input. This expects a lucene style search form (see
58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
61 It'll make an attempt to respect the various query options.
63 Additional options can be provided with the C<%options> hash.
69 This should be an arrayref of hashrefs, each containing a C<field> and an
70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
78 my ( $self, $query, %options ) = @_;
80 my $stemming = C4::Context->preference("QueryStemming") || 0;
81 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
82 my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
83 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
85 $query = '*' unless defined $query;
91 fuzziness => $fuzzy_enabled ? 'auto' : '0',
92 default_operator => 'AND',
93 default_field => '_all',
94 lenient => JSON::true,
98 if ( $options{sort} ) {
99 foreach my $sort ( @{ $options{sort} } ) {
100 my ( $f, $d ) = @$sort{qw/ field direction /};
101 die "Invalid sort direction, $d"
102 if $d && ( $d ne 'asc' && $d ne 'desc' );
103 $d = 'asc' unless $d;
105 # TODO account for fields that don't have a 'phrase' type
107 $f = $self->_sort_field($f);
108 push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
112 # See _convert_facets in Search.pm for how these get turned into
113 # things that Koha can use.
115 author => { terms => { field => "author__facet" } },
116 subject => { terms => { field => "subject__facet" } },
117 itype => { terms => { field => "itype__facet" } },
118 location => { terms => { field => "homebranch__facet" } },
119 'su-geo' => { terms => { field => "su-geo__facet" } },
120 se => { terms => { field => "se__facet" } },
122 if ( my $ef = $options{expanded_facet} ) {
123 $res->{facets}{$ef}{terms}{size} = C4::Context->preference('FacetMaxCount');
128 =head2 build_browse_query
130 my $browse_query = $builder->build_browse_query($field, $query);
132 This performs a "starts with" style query on a particular field. The field
133 to be searched must have been indexed with an appropriate mapping as a
134 "phrase" subfield, which pretty much everything has.
137 # XXX this isn't really a browse query like we want in the end
138 sub build_browse_query {
139 my ( $self, $field, $query ) = @_;
141 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
143 return { query => '*' } if !defined $query;
145 # TODO this should come from Koha::Elasticsearch
146 my %field_whitelist = (
150 $field = 'title' if !exists $field_whitelist{$field};
151 my $sort = $self->_sort_field($field);
154 match_phrase_prefix => {
158 fuzziness => $fuzzy_enabled ? 'auto' : '0',
162 sort => [ { "$sort.phrase" => { order => "asc" } } ],
166 =head2 build_query_compat
169 $error, $query, $simple_query, $query_cgi,
170 $query_desc, $limit, $limit_cgi, $limit_desc,
171 $stopwords_removed, $query_type
173 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
174 \@limits, \@sort_by, $scan, $lang );
176 This handles a search using the same api as L<C4::Search::buildQuery> does.
178 A very simple query will go in with C<$operands> set to ['query'], and
179 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
180 C<$query> set to something that can perform the search, C<$simple_query>
181 set to just the search term, C<$query_cgi> set to something that can
182 reproduce this search, and C<$query_desc> set to something else.
186 sub build_query_compat {
187 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
191 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
192 my @sort_params = $self->_convert_sort_fields(@$sort_by);
193 my @index_params = $self->_convert_index_fields(@$indexes);
194 my $limits = $self->_fix_limit_special_cases($orig_limits);
196 # Merge the indexes in with the search terms and the operands so that
197 # each search thing is a handy unit.
198 unshift @$operators, undef; # The first one can't have an op
200 my $ea = each_array( @$operands, @$operators, @index_params );
201 while ( my ( $oand, $otor, $index ) = $ea->() ) {
202 next if ( !defined($oand) || $oand eq '' );
203 push @search_params, {
204 operand => $self->_clean_search_term($oand), # the search terms
205 operator => defined($otor) ? uc $otor : undef, # AND and so on
206 $index ? %$index : (),
210 # We build a string query from limits and the queries. An alternative
211 # would be to pass them separately into build_query and let it build
212 # them into a structured ES query itself. Maybe later, though that'd be
214 my $query_str = join( ' AND ',
215 join( ' ', $self->_create_query_string(@search_params) ) || (),
216 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
218 # If there's no query on the left, let's remove the junk left behind
219 $query_str =~ s/^ AND //;
221 $options{sort} = \@sort_params;
222 $options{expanded_facet} = $params->{expanded_facet};
223 my $query = $self->build_query( $query_str, %options );
226 # We roughly emulate the CGI parameters of the zebra query builder
227 my $query_cgi = 'idx=kw&q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
228 my $simple_query = $operands->[0] if @$operands == 1;
229 my $query_desc = $simple_query;
230 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
231 my $limit_cgi = ( $orig_limits and @$orig_limits )
232 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
234 my $limit_desc = "$limit" if $limit;
236 undef, $query, $simple_query, $query_cgi, $query_desc,
237 $limit, $limit_cgi, $limit_desc, undef, undef
241 =head2 build_authorities_query
243 my $query = $builder->build_authorities_query(\%search);
245 This takes a nice description of an authority search and turns it into a black-box
246 query that can then be passed to the appropriate searcher.
248 The search description is a hashref that looks something like:
253 where => 'Heading', # search the main entry
254 operator => 'exact', # require an exact match
255 value => 'frogs', # the search string
258 where => '', # search all entries
259 operator => '', # default keyword, right truncation
267 authtypecode => 'TOPIC_TERM',
272 sub build_authorities_query {
273 my ( $self, $search ) = @_;
275 # Start by making the query parts
278 foreach my $s ( @{ $search->{searches} } ) {
279 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
280 $wh = '_all' if $wh eq '';
281 if ( $op eq 'is' || $op eq '=' ) {
283 # look for something that matches completely
284 # note, '=' is about numerical vals. May need special handling.
285 # _allphrase is a special field that only groups the exact
286 # matches. Also, we lowercase our search because the ES
287 # index lowercases its values, and term searches don't get the
288 # search analyzer applied to them.
289 push @filter_parts, { term => { "$wh.phrase" => lc $val } };
291 elsif ( $op eq 'exact' ) {
293 # left and right truncation, otherwise an exact phrase
294 push @query_parts, { match_phrase => { $wh => $val } };
296 elsif ( $op eq 'start' ) {
299 push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } };
302 # regular wordlist stuff
303 push @query_parts, { match => { $wh => $val } };
307 # Merge the query and filter parts appropriately
308 # 'should' behaves like 'or', if we want 'and', use 'must'
309 my $query_part = { bool => { should => \@query_parts } };
310 my $filter_part = { bool => { should => \@filter_parts } };
312 # We need to add '.phrase' to all the sort headings otherwise it'll sort
313 # based on the tokenised form.
314 if ( exists $search->{sort} ) {
316 foreach my $k ( keys %{ $search->{sort} } ) {
317 my $f = $self->_sort_field($k);
318 $s{"$f.phrase"} = $search->{sort}{$k};
320 $search->{sort} = \%s;
323 # extract the sort stuff
324 my %sort = ( sort => [ $search->{sort} ] ) if exists $search->{sort};
329 { filtered => { filter => $filter_part, query => $query_part } }
333 $query = { query => $query_part };
335 $query = { %$query, %sort };
340 =head2 build_authorities_query_compat
343 $builder->build_authorities_query_compat( \@marclist, \@and_or,
344 \@excluding, \@operator, \@value, $authtypecode, $orderby );
346 This builds a query for searching for authorities, in the style of
347 L<C4::AuthoritiesMarc::SearchAuthorities>.
355 An arrayref containing where the particular term should be searched for.
356 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
357 thesaurus. If left blank, any field is used.
361 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
369 What form of search to do. Options are: is (phrase, no truncation, whole field
370 must match), = (number exact match), exact (phrase, but with left and right
371 truncation). If left blank, then word list, right truncated, anywhere is used.
375 The actual user-provided string value to search for.
379 The authority type code to search within. If blank, then all will be searched.
383 The order to sort the results by. Options are Relevance, HeadingAsc,
384 HeadingDsc, AuthidAsc, AuthidDsc.
388 marclist, operator, and value must be the same length, and the values at
389 index /i/ all relate to each other.
391 This returns a query, which is a black box object that can be passed to the
392 appropriate search object.
396 sub build_authorities_query_compat {
397 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
398 $authtypecode, $orderby )
401 # This turns the old-style many-options argument form into a more
402 # extensible hash form that is understood by L<build_authorities_query>.
405 my %koha_to_index_name = (
406 mainmainentry => 'Heading-Main',
407 mainentry => 'Heading',
409 'match-heading' => 'Match-heading',
410 'see-from' => 'Match-heading-see-from',
411 thesaurus => 'Subject-heading-thesaurus',
415 # Make sure everything exists
416 foreach my $m (@$marclist) {
417 confess "Invalid marclist field provided: $m" unless exists $koha_to_index_name{$m};
419 for ( my $i = 0 ; $i < @$value ; $i++ ) {
422 where => $koha_to_index_name{$marclist->[$i]},
423 operator => $operator->[$i],
424 value => $value->[$i],
430 ( $orderby =~ /^Heading/ ) ? 'Heading'
431 : ( $orderby =~ /^Auth/ ) ? 'Local-Number'
434 my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
435 %sort = ( $sort_field => $sort_order, );
438 searches => \@searches,
439 authtypecode => $authtypecode,
441 $search{sort} = \%sort if %sort;
442 my $query = $self->build_authorities_query( \%search );
446 =head2 _convert_sort_fields
448 my @sort_params = _convert_sort_fields(@sort_by)
450 Converts the zebra-style sort index information into elasticsearch-style.
452 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
453 something that can be sent to L<build_query>.
457 sub _convert_sort_fields {
458 my ( $self, @sort_by ) = @_;
460 # Turn the sorting into something we care about.
461 my %sort_field_convert = (
462 acqdate => 'acqdate',
464 call_number => 'callnum',
465 popularity => 'issues',
466 relevance => undef, # default
468 pubdate => 'pubdate',
470 my %sort_order_convert =
471 ( qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
473 # Convert the fields and orders, drop anything we don't know about.
474 grep { $_->{field} } map {
475 my ( $f, $d ) = split /_/;
477 field => $sort_field_convert{$f},
478 direction => $sort_order_convert{$d}
483 =head2 _convert_index_fields
485 my @index_params = $self->_convert_index_fields(@indexes);
487 Converts zebra-style search index notation into elasticsearch-style.
489 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
490 and it returns something that can be sent to L<build_query>.
492 B<TODO>: this will pull from the elasticsearch mappings table to figure out
497 our %index_field_convert = (
503 'se' => 'title-series',
504 'callnum' => 'callnum',
507 'branch' => 'homebranch',
511 'hi' => 'Host-Item-Number',
514 sub _convert_index_fields {
515 my ( $self, @indexes ) = @_;
517 my %index_type_convert =
518 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
520 # Convert according to our table, drop anything that doesn't convert.
521 # If a field starts with mc- we save it as it's used (and removed) later
522 # when joining things, to indicate we make it an 'OR' join.
523 # (Sorry, this got a bit ugly after special cases were found.)
524 grep { $_->{field} } map {
525 my ( $f, $t ) = split /,/;
532 field => $index_field_convert{$f},
533 type => $index_type_convert{ $t // '__default' }
535 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
540 =head2 _convert_index_strings
542 my @searches = $self->_convert_index_strings(@searches);
544 Similar to L<_convert_index_fields>, this takes strings of the form
545 B<field:search term> and rewrites the field from zebra-style to
546 elasticsearch-style. Anything it doesn't understand is returned verbatim.
550 sub _convert_index_strings {
551 my ( $self, @searches ) = @_;
553 foreach my $s (@searches) {
555 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
556 unless ( defined($field) && defined($term) ) {
560 my ($conv) = $self->_convert_index_fields($field);
561 unless ( defined($conv) ) {
565 push @res, $conv->{field} . ":"
566 . $self->_modify_string_by_type( %$conv, operand => $term );
571 =head2 _convert_index_strings_freeform
573 my $search = $self->_convert_index_strings_freeform($search);
575 This is similar to L<_convert_index_strings>, however it'll search out the
576 things to change within the string. So it can handle strings such as
577 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
579 If there is something of the form "su,complete-subfield" or something, the
580 second part is stripped off as we can't yet handle that. Making it work
581 will have to wait for a real query parser.
585 sub _convert_index_strings_freeform {
586 my ( $self, $search ) = @_;
587 while ( my ( $zeb, $es ) = each %index_field_convert ) {
588 $search =~ s/\b$zeb(?:,[\w-]*)?:/$es:/g;
593 =head2 _modify_string_by_type
595 my $str = $self->_modify_string_by_type(%index_field);
597 If you have a search term (operand) and a type (phrase, right-truncated), this
598 will convert the string to have the function in lucene search terms, e.g.
599 wrapping quotes around it.
603 sub _modify_string_by_type {
604 my ( $self, %idx ) = @_;
606 my $type = $idx{type} || '';
607 my $str = $idx{operand};
608 return $str unless $str; # Empty or undef, we can't use it.
610 $str .= '*' if $type eq 'right-truncate';
611 $str = '"' . $str . '"' if $type eq 'phrase';
617 my $query_str = $self->_join_queries(@query_parts);
619 This takes a list of query parts, that might be search terms on their own, or
620 booleaned together, or specifying fields, or whatever, wraps them in
621 parentheses, and ANDs them all together. Suitable for feeding to the ES
624 Note: doesn't AND them together if they specify an index that starts with "mc"
625 as that was a special case in the original code for dealing with multiple
626 choice options (you can't search for something that has an itype of A and
627 an itype of B otherwise.)
632 my ( $self, @parts ) = @_;
634 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
636 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
637 return () unless @norm_parts + @mc_parts;
638 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
640 @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
642 # Handy trick: $x || () inside a join means that if $x ends up as an
643 # empty string, it gets replaced with (), which makes join ignore it.
644 # (bad effect: this'll also happen to '0', this hopefully doesn't matter
647 join( ' AND ', map { "($_)" } @norm_parts ) || (),
653 my @phrased_queries = $self->_make_phrases(@query_parts);
655 This takes the supplied queries and forces them to be phrases by wrapping
656 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
657 the quotes outside of them if they're there.
662 my ( $self, @parts ) = @_;
663 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
666 =head2 _create_query_string
668 my @query_strings = $self->_create_query_string(@queries);
670 Given a list of hashrefs, it will turn them into a lucene-style query string.
671 The hash should contain field, type (both for the indexes), operator, and
676 sub _create_query_string {
677 my ( $self, @queries ) = @_;
680 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
681 my $field = $_->{field} ? $_->{field} . ':' : '';
683 my $oand = $self->_modify_string_by_type(%$_);
684 "$otor($field$oand)";
688 =head2 _clean_search_term
690 my $term = $self->_clean_search_term($term);
692 This cleans a search term by removing any funny characters that may upset
693 ES and give us an error. It also calls L<_convert_index_strings_freeform>
694 to ensure those parts are correct.
698 sub _clean_search_term {
699 my ( $self, $term ) = @_;
701 # Some hardcoded searches (like with authorities) produce things like
702 # 'an=123', when it ought to be 'an:123' for our purposes.
704 $term = $self->_convert_index_strings_freeform($term);
709 =head2 _fix_limit_special_cases
711 my $limits = $self->_fix_limit_special_cases($limits);
713 This converts any special cases that the limit specifications have into things
714 that are more readily processable by the rest of the code.
716 The argument should be an arrayref, and it'll return an arrayref.
720 sub _fix_limit_special_cases {
721 my ( $self, $limits ) = @_;
724 foreach my $l (@$limits) {
726 # This is set up by opac-search.pl
727 if ( $l =~ /^yr,st-numeric,ge=/ ) {
728 my ( $start, $end ) =
729 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
730 next unless defined($start) && defined($end);
731 push @new_lim, "copydate:[$start TO $end]";
733 elsif ( $l =~ /^yr,st-numeric=/ ) {
734 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
735 next unless defined($date);
736 push @new_lim, "copydate:$date";
738 elsif ( $l =~ /^available$/ ) {
739 push @new_lim, 'onloan:false';
750 my $field = $self->_sort_field($field);
752 Given a field name, this works out what the actual name of the version to sort
753 on should be. Often it's the same, sometimes it involves sticking "__sort" on
754 the end. Maybe it'll be something else in the future, who knows?
760 if ($self->sort_fields()->{$f}) {