1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch;
33 $builder = Koha::SearchEngine::Elasticsearch->new();
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Class::Accessor);
44 use List::MoreUtils qw/ each_array /;
48 use Data::Dumper; # TODO remove
52 my $simple_query = $builder->build_query("hello", %options)
54 This will build a query that can be issued to elasticsearch from the provided
55 string input. This expects a lucene style search form (see
56 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
59 It'll make an attempt to respect the various query options.
61 Additional options can be provided with the C<%options> hash.
67 This should be an arrayref of hashrefs, each containing a C<field> and an
68 C<direction> (optional, defaults to C<asc>.) The results will be sorted
69 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Body of build_query: unpack the invocant, the user's query string and the
# option hash.
76 my ( $self, $query, %options ) = @_;
# Read the search-related system preferences, defaulting each to 0. Of these,
# only $fuzzy_enabled is referenced in the lines visible below.
78 my $stemming = C4::Context->preference("QueryStemming") || 0;
79 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
80 my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
81 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query is replaced by the wildcard '*'.
83 $query = '*' unless defined $query;
# Query settings: fuzziness follows the QueryFuzzy preference, the default
# operator is AND, and the default field is "_all".
89 fuzziness => $fuzzy_enabled ? 'auto' : '0',
90 default_operator => "AND",
91 default_field => "_all",
# Optional sorting: every entry of $options{sort} is a hashref carrying a
# 'field' and an optional 'direction'.
95 if ( $options{sort} ) {
96 foreach my $sort ( @{ $options{sort} } ) {
97 my ( $f, $d ) = @$sort{qw/ field direction /};
# Any direction other than 'asc' or 'desc' is fatal; a missing one means 'asc'.
98 die "Invalid sort direction, $d"
99 if $d && ( $d ne 'asc' && $d ne 'desc' );
100 $d = 'asc' unless $d;
# Sorting is done on the field's ".phrase" sub-field.
102 # TODO account for fields that don't have a 'phrase' type
103 push @{ $res->{sort} }, { "$f.phrase" => { order => $d } };
107 # See _convert_facets in Search.pm for how these get turned into
108 # things that Koha can use.
# One "terms" entry per facet, each reading the matching "__facet" field.
110 author => { terms => { field => "author__facet" } },
111 subject => { terms => { field => "subject__facet" } },
112 itype => { terms => { field => "itype__facet" } },
117 =head2 build_browse_query
119 my $browse_query = $builder->build_browse_query($field, $query);
121 This performs a "starts with" style query on a particular field. The field
122 to be searched must have been indexed with an appropriate mapping as a
123 "phrase" subfield, which pretty much everything has.
126 # XXX this isn't really a browse query like we want in the end
# Build a "starts with" style query for $query against a single $field.
# Arguments: the invocant, the field name, and the text to match.
127 sub build_browse_query {
128 my ( $self, $field, $query ) = @_;
# Fuzziness is controlled by the QueryFuzzy system preference (default 0).
130 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# With no query text, a hashref holding only the wildcard is returned.
# NOTE(review): this has a different shape from the structure built below
# (a bare string under 'query') -- confirm callers accept it.
132 return { query => '*' } if !defined $query;
134 # TODO this should come from Koha::Elasticsearch
135 my %field_whitelist = (
# Any field name that is not a key of the whitelist falls back to 'title'.
139 $field = 'title' if !exists $field_whitelist{$field};
# Prefix-phrase matching, with fuzziness 'auto' when QueryFuzzy is on.
143 match_phrase_prefix => {
147 fuzziness => $fuzzy_enabled ? 'auto' : '0',
# Results are ordered ascending on the field's ".phrase" sub-field.
151 sort => [ { "$field.phrase" => { order => "asc" } } ],
155 =head2 build_query_compat
158 $error, $query, $simple_query, $query_cgi,
159 $query_desc, $limit, $limit_cgi, $limit_desc,
160 $stopwords_removed, $query_type
162 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
163 \@limits, \@sort_by, $scan, $lang );
165 This handles a search using the same api as L<C4::Search::buildQuery> does.
167 A very simple query will go in with C<$operands> set to ['query'], and
168 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
169 C<$query> set to something that can perform the search, C<$simple_query>
170 set to just the search term, C<$query_cgi> set to something that can
171 reproduce this search, and C<$query_desc> set to a description of the search (currently just a copy of C<$simple_query>).
# Build an Elasticsearch query through the same interface as
# C4::Search::buildQuery.
#
# Arguments (all arrayrefs unless noted; a missing one is treated as empty):
#   $operators   - boolean operators joining consecutive operands
#   $operands    - the user's search terms
#   $indexes     - zebra-style index names, one per operand
#   $orig_limits - zebra-style limit strings
#   $sort_by     - zebra-style sort specifications
#   $scan, $lang - scalars, accepted for API compatibility and not used here
#
# Returns the ten-element list
#   ( $error, $query, $simple_query, $query_cgi, $query_desc,
#     $limit, $limit_cgi, $limit_desc, $stopwords_removed, $query_type )
# in which $error, $stopwords_removed and $query_type are always undef.
sub build_query_compat {
    my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
        $lang )
      = @_;

    # Normalise absent arguments so the dereferences below cannot die.
    $_ //= [] for $operators, $operands, $indexes, $orig_limits, $sort_by;

    my @sort_params  = $self->_convert_sort_fields(@$sort_by);
    my @index_params = $self->_convert_index_fields(@$indexes);
    my $limits       = $self->_fix_limit_special_cases($orig_limits);

    # Merge the indexes in with the search terms and the operators so that
    # each search thing is a handy unit. The first operand can't have an
    # operator, so the operator list is padded at the front. A private copy
    # is padded: the caller's array must not grow on every call.
    my @padded_operators = ( undef, @$operators );
    my @search_params;
    my $ea = each_array( @$operands, @padded_operators, @index_params );
    while ( my ( $oand, $otor, $index ) = $ea->() ) {
        next if ( !defined($oand) || $oand eq '' );
        push @search_params, {
            operand  => $self->_clean_search_term($oand),     # the search terms
            operator => defined($otor) ? uc $otor : undef,    # AND and so on
            $index ? %$index : (),
        };
    }

    # The limits, converted and joined into one query fragment. It is used
    # both inside the query string and as the returned $limit.
    my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits) );

    # We build a string query from limits and the queries. An alternative
    # would be to pass them separately into build_query and let it build
    # them into a structured ES query itself.
    # "$x || ()" drops an empty or undefined fragment from the join.
    my $query_str = join( ' AND ',
        join( ' ', $self->_create_query_string(@search_params) ) || (),
        $limit || () );

    # If there's no query on the left, let's remove the junk left behind
    $query_str =~ s/^ AND //;

    my %options;
    $options{sort} = \@sort_params;
    my $query = $self->build_query( $query_str, %options );

    # We roughly emulate the CGI parameters of the zebra query builder.
    # Plain conditionals are used here: "my $x = ... if COND" has undefined
    # behaviour in Perl.
    my $query_cgi =
      @$operands ? 'idx=kw&q=' . uri_escape( $operands->[0] // '' ) : undef;
    my $simple_query = @$operands == 1 ? $operands->[0] : undef;
    my $query_desc   = $simple_query;
    my $limit_cgi =
      '&limit=' . join( '&limit=', map { uri_escape($_) } @$orig_limits );
    my $limit_desc = $limit // '';

    return (
        undef,  $query,     $simple_query, $query_cgi, $query_desc,
        $limit, $limit_cgi, $limit_desc,   undef,      undef
    );
}
228 =head2 build_authorities_query
230 my $query = $builder->build_authorities_query(\%search);
232 This takes a nice description of an authority search and turns it into a black-box
233 query that can then be passed to the appropriate searcher.
235 The search description is a hashref that looks something like:
240 where => 'Heading', # search the main entry
241 operator => 'exact', # require an exact match
242 value => 'frogs', # the search string
245 where => '', # search all entries
246 operator => '', # default keyword, right truncation
254 authtypecode => 'TOPIC_TERM',
# Turn a description of an authority search into an Elasticsearch query.
#
# $search is a hashref; the keys read here are:
#   searches - arrayref of { where, operator, value } hashrefs
#   sort     - optional hashref of field => order
#
# Returns a hashref with a 'query' key and, when a sort was requested, a
# 'sort' key. The caller's $search structure is never modified.
sub build_authorities_query {
    my ( $self, $search ) = @_;

    # Start by making the query parts
    my @query_parts;
    my @filter_parts;
    foreach my $s ( @{ $search->{searches} // [] } ) {
        my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};

        # A missing field means "search everything"; a missing operator
        # means the default word-list search.
        $wh = '_all' if !defined $wh || $wh eq '';
        $op //= '';

        if ( $op eq 'is' || $op eq '=' ) {

            # look for something that matches completely
            # note, '=' is about numerical vals. May need special handling.
            # Also, we lowercase our search because the ES index lowercases
            # its values, and term searches don't get the search analyzer
            # applied to them.
            push @filter_parts, { term => { "$wh.phrase" => lc $val } };
        }
        elsif ( $op eq 'exact' ) {

            # left and right truncation, otherwise an exact phrase
            push @query_parts, { match_phrase => { $wh => $val } };
        }
        elsif ( $op eq 'start' ) {

            # "starts with": wildcard against the lowercased phrase field
            push @query_parts, { wildcard => { "$wh.phrase" => lc "$val*" } };
        }
        else {
            # regular wordlist stuff
            push @query_parts, { match => { $wh => $val } };
        }
    }

    # Merge the query and filter parts appropriately
    # 'should' behaves like 'or', if we want 'and', use 'must'
    my $query_part  = { bool => { should => \@query_parts } };
    my $filter_part = { bool => { should => \@filter_parts } };

    # We need to add '.phrase' to all the sort headings otherwise it'll sort
    # based on the tokenised form. This is built in a fresh hash so that a
    # $search structure that gets reused does not pick up '.phrase.phrase'.
    my %sort;
    if ( exists $search->{sort} ) {
        my %phrase_sort =
          map { ( "$_.phrase" => $search->{sort}{$_} ) }
          keys %{ $search->{sort} };
        %sort = ( sort => [ \%phrase_sort ] );
    }

    # Only wrap the query in a filter when there is something to filter on.
    my $query =
      @filter_parts
      ? { query =>
          { filtered => { filter => $filter_part, query => $query_part } }
      }
      : { query => $query_part };

    return { %$query, %sort };
}
326 =head2 build_authorities_query_compat
329 $builder->build_authorities_query_compat( \@marclist, \@and_or,
330 \@excluding, \@operator, \@value, $authtypecode, $orderby );
332 This builds a query for searching for authorities, in the style of
333 L<C4::AuthoritiesMarc::SearchAuthorities>.
341 An arrayref containing where the particular term should be searched for.
342 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
343 thesaurus. If left blank, any field is used.
347 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
355 What form of search to do. Options are: is (phrase, no truncation, whole field
356 must match), = (number exact match), exact (phrase, but with left and right
357 truncation). If left blank, then word list, right truncated, anywhere is used.
361 The actual user-provided string value to search for.
365 The authority type code to search within. If blank, then all will be searched.
369 The order to sort the results by. Options are Relevance, HeadingAsc,
370 HeadingDsc, AuthidAsc, AuthidDsc.
374 marclist, operator, and value must be the same length, and the values at
375 index /i/ all relate to each other.
377 This returns a query, which is a black box object that can be passed to the
378 appropriate search object.
# Adapter from the positional-argument authority search interface to the
# hash-based description consumed by build_authorities_query.
# $marclist, $operator and $value are read as parallel arrayrefs; $and_or and
# $excluding are not referenced in the lines visible here.
382 sub build_authorities_query_compat {
383 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
384 $authtypecode, $orderby )
387 # This turns the old-style many-options argument form into a more
388 # extensible hash form that is understood by L<build_authorities_query>.
# Maps the caller-facing "where" names onto index field names.
391 my %koha_to_index_name = (
392 mainmainentry => 'Heading-Main',
393 mainentry => 'Heading',
395 'match-heading' => 'Match-heading',
396 'see-from' => 'Match-heading-see-from',
397 thesaurus => 'Subject-heading-thesaurus',
401 # Make sure everything exists
# An unknown marclist entry is fatal (confess dies with a stack trace).
402 foreach my $m (@$marclist) {
403 confess "Invalid marclist field provided: $m" unless exists $koha_to_index_name{$m};
# One search entry per element of @$value, using the marclist and operator
# entries at the same index.
# NOTE(review): the loop bound is @$value only; a shorter @$marclist would
# look up an undef key here -- confirm callers always pass equal lengths.
405 for ( my $i = 0 ; $i < @$value ; $i++ ) {
408 where => $koha_to_index_name{$marclist->[$i]},
409 operator => $operator->[$i],
410 value => $value->[$i],
# $orderby starting with "Heading" sorts on 'Heading'; starting with "Auth"
# sorts on 'Local-Number'.
# NOTE(review): an undefined $orderby would warn in these matches.
416 ( $orderby =~ /^Heading/ ) ? 'Heading'
417 : ( $orderby =~ /^Auth/ ) ? 'Local-Number'
# A trailing "Asc" selects ascending order; anything else is descending.
420 my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
421 %sort = ( $sort_field => $sort_order, );
424 searches => \@searches,
425 authtypecode => $authtypecode,
# The sort key is only added when a sort field was selected above.
427 $search{sort} = \%sort if %sort;
428 my $query = $self->build_authorities_query( \%search );
432 =head2 _convert_sort_fields
434 my @sort_params = _convert_sort_fields(@sort_by)
436 Converts the zebra-style sort index information into elasticsearch-style.
438 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
439 something that can be sent to L<build_query>.
# Translate zebra-style sort specifications ("field_direction" strings) into
# hashes with 'field' and 'direction' keys.
443 sub _convert_sort_fields {
444 my ( $self, @sort_by ) = @_;
446 # Turn the sorting into something we care about.
# Zebra sort field name => elasticsearch field name; undef means "no explicit
# sort", and such entries are filtered out by the grep below.
447 my %sort_field_convert = (
448 acqdate => 'acqdate',
450 call_number => 'callnum',
451 popularity => 'issues',
452 relevance => undef, # default
454 pubdate => 'pubdate',
# Zebra direction suffix => 'asc' / 'desc'.
456 my %sort_order_convert =
457 ( qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
459 # Convert the fields and orders, drop anything we don't know about.
# NOTE(review): split /_/ cuts at every underscore, so "call_number_asc"
# yields field "call" and direction "number"; the call_number entry above
# looks unreachable unless callers use a different spelling -- confirm.
460 grep { $_->{field} } map {
461 my ( $f, $d ) = split /_/;
463 field => $sort_field_convert{$f},
464 direction => $sort_order_convert{$d}
469 =head2 _convert_index_fields
471 my @index_params = $self->_convert_index_fields(@indexes);
473 Converts zebra-style search index notation into elasticsearch-style.
475 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
476 and it returns something that can be sent to L<build_query>.
478 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Package-level table mapping zebra-style index names to elasticsearch field
# names. It is read by _convert_index_fields and
# _convert_index_strings_freeform.
483 our %index_field_convert = (
489 'se' => 'title-series',
490 'callnum' => 'callnum',
493 'branch' => 'homebranch',
# Convert zebra-style index specifications ("field" or "field,type") into
# hashes with 'field' and 'type' keys, dropping entries whose field is unknown.
499 sub _convert_index_fields {
500 my ( $self, @indexes ) = @_;
# Zebra type suffix => internal type name; no suffix maps to undef.
502 my %index_type_convert =
503 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
505 # Convert according to our table, drop anything that doesn't convert.
506 # If a field starts with mc- we save it as it's used (and removed) later
507 # when joining things, to indicate we make it an 'OR' join.
508 # (Sorry, this got a bit ugly after special cases were found.)
# NOTE(review): because unknown entries are dropped, the returned list can be
# shorter than @indexes; callers that pair it positionally with other lists
# should confirm that is acceptable.
509 grep { $_->{field} } map {
510 my ( $f, $t ) = split /,/;
517 field => $index_field_convert{$f},
518 type => $index_type_convert{ $t // '__default' }
# Put the mc- marker back on the converted field name when one was present.
520 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
525 =head2 _convert_index_strings
527 my @searches = $self->_convert_index_strings(@searches);
529 Similar to L<_convert_index_fields>, this takes strings of the form
530 B<field:search term> and rewrites the field from zebra-style to
531 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrite "field:term" strings so that the field uses the elasticsearch name
# and the term is adjusted for the index type.
535 sub _convert_index_strings {
536 my ( $self, @searches ) = @_;
538 foreach my $s (@searches) {
# Split off a leading field specification (word characters, commas and
# hyphens) from the rest of the string at the first colon.
540 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
541 unless ( defined($field) && defined($term) ) {
# Only the first converted entry is used; it is undef when the field is not
# recognised by _convert_index_fields.
545 my ($conv) = $self->_convert_index_fields($field);
546 unless ( defined($conv) ) {
# Emit "<es field>:<term>", with the term modified according to the type.
550 push @res, $conv->{field} . ":"
551 . $self->_modify_string_by_type( %$conv, operand => $term );
556 =head2 _convert_index_strings_freeform
558 my $search = $self->_convert_index_strings_freeform($search);
560 This is similar to L<_convert_index_strings>, however it'll search out the
561 things to change within the string. So it can handle strings such as
562 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
564 If there is something of the form "su,complete-subfield" or something, the
565 second part is stripped off as we can't yet handle that. Making it work
566 will have to wait for a real query parser.
# Rewrite every zebra-style index prefix found anywhere inside $search (for
# example "(su:foo) AND (su:bar)") to its elasticsearch field name, using the
# package-level %index_field_convert table. A ",qualifier" following the index
# name is stripped. Returns the rewritten string; an undefined $search is
# returned unchanged.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;
    return $search unless defined $search && %index_field_convert;

    # All known index names go into a single alternation, so the string is
    # rewritten in one pass. This avoids "each" on the shared hash, whose
    # iterator is not reset between callers. It also means text produced by
    # one replacement is never scanned again, so the result no longer depends
    # on hash ordering. Longest names come first and every name is escaped.
    my $zebra_names = join '|',
      map  { quotemeta }
      sort { length($b) <=> length($a) || $a cmp $b }
      keys %index_field_convert;

    $search =~ s/\b($zebra_names)(?:,[\w-]*)?:/$index_field_convert{$1}:/g;
    return $search;
}
578 =head2 _modify_string_by_type
580 my $str = $self->_modify_string_by_type(%index_field);
582 If you have a search term (operand) and a type (phrase, right-truncate), this
583 will rewrite the term so that it behaves that way in lucene query syntax, e.g.
584 by wrapping quotes around it or appending a C<*>.
# Adjust a search term according to its index type.
#
# %idx keys read here:
#   operand - the search term
#   type    - 'phrase' (wrap in double quotes), 'right-truncate' (append '*'),
#             or anything else / absent (leave the term untouched)
#
# Returns the adjusted term. An undefined or empty operand is returned as-is.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $type = $idx{type} || '';
    my $str  = $idx{operand};

    # Empty or undef, we can't use it. The test is on definedness and length
    # rather than truth so that the perfectly valid term "0" is still handled.
    return $str unless defined $str && length $str;

    $str .= '*' if $type eq 'right-truncate';
    $str = '"' . $str . '"' if $type eq 'phrase';
    return $str;
}
602 my $query_str = $self->_join_queries(@query_parts);
604 This takes a list of query parts, that might be search terms on their own, or
605 booleaned together, or specifying fields, or whatever, wraps them in
606 parentheses, and ANDs them all together. Suitable for feeding to the ES
609 Note: doesn't AND them together if they specify an index that starts with "mc"
610 as that was a special case in the original code for dealing with multiple
611 choice options (you can't search for something that has an itype of A and
612 an itype of B otherwise.)
# Combine query fragments into a single query string.
#
# Undefined and empty fragments are ignored. Fragments beginning with "mc-"
# (multiple choice) have that marker removed and are ORed together as one
# group; all remaining fragments are ANDed, and the OR group is ANDed onto
# them. A lone fragment is returned without any wrapping, and no usable
# fragments at all yields an empty list.
sub _join_queries {
    my ( $self, @parts ) = @_;

    # Sort the usable fragments into plain ones and multiple-choice ones.
    my ( @plain, @multi_choice );
    for my $part (@parts) {
        next unless defined $part && $part ne '';
        if ( $part =~ /^mc-/ ) {
            push @multi_choice, $part =~ s/^mc-//r;
        }
        else {
            push @plain, $part;
        }
    }

    my $total = @plain + @multi_choice;
    return () if $total == 0;
    return ( @plain, @multi_choice )[0] if $total == 1;

    # Each clause is only added when it has content, so the final join never
    # sees an empty piece.
    my @clauses;
    push @clauses, join( ' AND ', map { "($_)" } @plain ) if @plain;
    push @clauses, '(' . join( ' OR ', map { "($_)" } @multi_choice ) . ')'
      if @multi_choice;

    return join ' AND ', @clauses;
}
638 my @phrased_queries = $self->_make_phrases(@query_parts);
640 This takes the supplied queries and forces them to be phrases by wrapping
641 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
642 the quotes outside of them if they're there.
# Force field-qualified queries to be phrases: for each part of the form
# "field:terms" the terms are wrapped in double quotes ("field:"terms"").
# Leading whitespace before the field name is dropped. Parts without a field
# prefix are returned unchanged.
#
# Field names may contain hyphens (e.g. "title-series"), matching the field
# names used elsewhere in this module.
sub _make_phrases {
    my ( $self, @parts ) = @_;
    return map { s/^\s*([\w-]*?:)(.*)$/$1"$2"/r } @parts;
}
651 =head2 _create_query_string
653 my @query_strings = $self->_create_query_string(@queries);
655 Given a list of hashrefs, it will turn them into a lucene-style query string.
656 The hash should contain field, type (both for the indexes), operator, and
# Render each search description as one lucene-style fragment of the form
# "OPERATOR (field:term)". The operator and the field prefix are left out when
# the corresponding hash entry is false; the term is passed through
# _modify_string_by_type first. Returns one string per input hashref.
sub _create_query_string {
    my ( $self, @queries ) = @_;

    my @fragments;
    foreach my $query (@queries) {
        my $prefix = $query->{operator} ? "$query->{operator} " : '';
        my $scope  = $query->{field}    ? "$query->{field}:"    : '';
        my $term   = $self->_modify_string_by_type(%$query);
        push @fragments, $prefix . '(' . $scope . $term . ')';
    }
    return @fragments;
}
673 =head2 _clean_search_term
675 my $term = $self->_clean_search_term($term);
677 This cleans a search term by removing any funny characters that may upset
678 ES and give us an error. It also calls L<_convert_index_strings_freeform>
679 to ensure those parts are correct.
# Tidy a single search term before it is placed into a query string.
683 sub _clean_search_term {
684 my ( $self, $term ) = @_;
# Rewrite any zebra-style index prefixes inside the term to their
# elasticsearch field names.
686 $term = $self->_convert_index_strings_freeform($term);
688 # Some hardcoded searches (like with authorities) produce things like
689 # 'an=123', when it ought to be 'an:123'.
694 =head2 _fix_limit_special_cases
696 my $limits = $self->_fix_limit_special_cases($limits);
698 This converts any special cases that the limit specifications have into things
699 that are more readily processable by the rest of the code.
701 The argument should be an arrayref, and it'll return an arrayref.
705 sub _fix_limit_special_cases {
706 my ( $self, $limits ) = @_;
709 foreach my $l (@$limits) {
711 # This is set up by opac-search.pl
712 if ( $l =~ /^yr,st-numeric,ge=/ ) {
713 my ( $start, $end ) =
714 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
715 next unless defined($start) && defined($end);
716 push @new_lim, "copydate:[$start TO $end]";
718 elsif ( $l =~ /^yr,st-numeric=/ ) {
719 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
720 next unless defined($date);
721 push @new_lim, "copydate:$date";
723 elsif ( $l =~ /^available$/ ) {
724 push @new_lim, 'onloan:false';