1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
45 use List::MoreUtils qw/ each_array /;
# Package-level lookup table from zebra-style index names and aliases (keys)
# to elasticsearch field names (values). It is consulted by
# _convert_index_fields and _convert_index_strings_freeform; a name without an
# entry here is used unchanged. Several aliases may share one target
# (e.g. 'lcn' and 'callnum').
53 our %index_field_convert = (
57 'lcn' => 'local-classification',
58 'callnum' => 'local-classification',
59 'record-type' => 'rtype',
60 'mc-rtype' => 'rtype',
62 'lc-card' => 'lc-card-number',
63 'sn' => 'local-number',
64 'biblionumber' => 'local-number',
65 'yr' => 'date-of-publication',
66 'pubdate' => 'date-of-publication',
67 'acqdate' => 'date-of-acquisition',
68 'date/time-last-modified' => 'date-time-last-modified',
69 'dtlm' => 'date-time-last-modified',
70 'diss' => 'dissertation-information',
73 'music-number' => 'identifier-publisher-for-music',
74 'number-music-publisher' => 'identifier-publisher-for-music',
75 'music' => 'identifier-publisher-for-music',
76 'ident' => 'identifier-standard',
77 'cpn' => 'corporate-name',
78 'cfn' => 'conference-name',
79 'pn' => 'personal-name',
84 'rcn' => 'record-control-number',
87 #'su-geo' => 'subject',
90 'se' => 'title-series',
91 'ut' => 'title-uniform',
92 'an' => 'koha-auth-number',
93 'authority-number' => 'koha-auth-number',
96 'rank' => 'relevance',
98 'wrdl' => 'st-word-list',
99 'rt' => 'right-truncation',
100 'rtrn' => 'right-truncation',
101 'ltrn' => 'left-truncation',
102 'rltrn' => 'left-and-right',
103 'mc-itemtype' => 'itemtype',
104 'mc-ccode' => 'ccode',
105 'branch' => 'homebranch',
106 'mc-loc' => 'location',
108 'stocknumber' => 'number-local-acquisition',
109 'inv' => 'number-local-acquisition',
111 'mc-itype' => 'itype',
112 'aub' => 'author-personal-bibliography',
113 'auo' => 'author-in-order',
117 'frequency-code' => 'ff8-18',
118 'illustration-code' => 'ff8-18-21',
119 'regularity-code' => 'ff8-19',
120 'type-of-serial' => 'ff8-21',
121 'format' => 'ff8-23',
122 'conference-code' => 'ff8-29',
123 'festschrift-indicator' => 'ff8-30',
124 'index-indicator' => 'ff8-31',
127 'literature-code' => 'lf',
128 'biography' => 'bio',
130 'biography-code' => 'bio',
131 'l-format' => 'ff7-01-02',
132 'lex' => 'lexile-number',
133 'hi' => 'host-item-number',
134 'itu' => 'index-term-uncontrolled',
135 'itg' => 'index-term-genre',
# Regex fragment for a single field name: one or more word characters or
# hyphens.
137 my $field_name_pattern = '[\w\-]+';
# Regex fragment for zero or more ".name" suffixes following a field name
# (for example the ".ci_raw" part of "heading.ci_raw").
138 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
140 =head2 get_index_field_convert
142 my $index_field_convert = Koha::SearchEngine::Elasticsearch::QueryBuilder->get_index_field_convert();
144 Returns a hashref that maps zebra-style search index names to elasticsearch-style ones.
146 The same table is applied to the index names presented to L<build_query_compat>
147 before they are sent to L<build_query>.
149 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Returns a reference to the package-level %index_field_convert table itself
# (not a copy), so any change a caller makes through it is visible everywhere.
# NOTE(review): the empty prototype "()" is ignored when the sub is invoked as
# a method, which is how the synopsis calls it; consider dropping it.
154 sub get_index_field_convert() {
155 return \%index_field_convert;
160 my $simple_query = $builder->build_query("hello", %options)
162 This will build a query that can be issued to elasticsearch from the provided
163 string input. This expects a lucene style search form (see
164 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
167 It'll make an attempt to respect the various query options.
169 Additional options can be provided with the C<%options> hash.
175 This should be an arrayref of hashrefs, each containing a C<field> and an
176 C<direction> (optional, defaults to C<asc>.) The results will be sorted
177 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Arguments: the invocant, a lucene-style query string and a flat option hash
# (sort, is_opac, weighted_fields, whole_record are read below).
184 my ( $self, $query, %options ) = @_;
# Search-related system preferences; each falls back to 0 when unset.
# NOTE(review): in the lines visible here only $fuzzy_enabled is read again.
186 my $stemming = C4::Context->preference("QueryStemming") || 0;
187 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
188 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query is replaced by the match-everything wildcard.
190 $query = '*' unless defined $query;
# Field list to search, as produced by _search_fields for these options.
193 my $fields = $self->_search_fields({
194 is_opac => $options{is_opac},
195 weighted_fields => $options{weighted_fields},
# whole_record adds the 'marc_data_array.*' wildcard to that field list.
197 if ($options{whole_record}) {
198 push @$fields, 'marc_data_array.*';
# Fuzziness is 'auto' only when the QueryFuzzy preference is enabled.
203 fuzziness => $fuzzy_enabled ? 'auto' : '0',
204 default_operator => 'AND',
206 lenient => JSON::true,
207 analyze_wildcard => JSON::true,
# The ElasticsearchCrossFields preference switches the query_string type.
210 $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
# Each sort spec is a hashref with 'field' and optional 'direction'. A
# direction other than 'asc' or 'desc' is fatal; a missing one means 'asc'.
212 if ( $options{sort} ) {
213 foreach my $sort ( @{ $options{sort} } ) {
214 my ( $f, $d ) = @$sort{qw/ field direction /};
215 die "Invalid sort direction, $d"
216 if $d && ( $d ne 'asc' && $d ne 'desc' );
217 $d = 'asc' unless $d;
# Map the logical field name to the concrete field used for sorting.
219 $f = $self->_sort_field($f);
220 push @{ $res->{sort} }, { $f => { order => $d } };
224 # See _convert_facets in Search.pm for how these get turned into
225 # things that Koha can use.
# Every facet aggregation below uses the FacetMaxCount preference as its size.
226 my $size = C4::Context->preference('FacetMaxCount');
227 $res->{aggregations} = {
228 author => { terms => { field => "author__facet" , size => $size } },
229 subject => { terms => { field => "subject__facet", size => $size } },
230 itype => { terms => { field => "itype__facet", size => $size} },
231 location => { terms => { field => "location__facet", size => $size } },
232 'su-geo' => { terms => { field => "su-geo__facet", size => $size} },
233 'title-series' => { terms => { field => "title-series__facet", size => $size } },
234 ccode => { terms => { field => "ccode__facet", size => $size } },
235 ln => { terms => { field => "ln__facet", size => $size } },
# Library facets depend on DisplayLibraryFacets: 'home' adds homebranch,
# 'holding' adds holdingbranch and 'both' adds the two of them.
# NOTE(review): the string comparisons below would warn if this preference is
# ever undefined - confirm it always has a value.
238 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
239 if ( $display_library_facets eq 'both'
240 or $display_library_facets eq 'home' ) {
241 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
243 if ( $display_library_facets eq 'both'
244 or $display_library_facets eq 'holding' ) {
245 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
250 =head2 build_query_compat
253 $error, $query, $simple_query, $query_cgi,
254 $query_desc, $limit, $limit_cgi, $limit_desc,
255 $stopwords_removed, $query_type
257 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
258 \@limits, \@sort_by, $scan, $lang, $params );
260 This handles a search using the same api as L<C4::Search::buildQuery> does.
262 A very simple query will go in with C<$operands> set to ['query'], and
263 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
264 C<$query> set to something that can perform the search, C<$simple_query>
265 set to just the search term, C<$query_cgi> set to something that can
266 reproduce this search, and C<$query_desc> set to a textual description of the query (the search term itself in this simple case).
# Builds a search from the argument layout used by C4::Search::buildQuery
# (parallel arrays of operators, operands and indexes plus limits and sort
# keys) and returns a ten element list in the same positions as that API.
270 sub build_query_compat {
271 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
277 my $search_param_query_str = '';
# _build_scan_query returns both the query and its string form; the string is
# reused as the description string.
# NOTE(review): presumably this is only reached for scan searches - confirm.
280 ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
281 $search_param_query_str = $query_str;
# Translate zebra-style sort keys, index names and limits.
283 my @sort_params = $self->_convert_sort_fields(@$sort_by);
284 my @index_params = $self->_convert_index_fields(@$indexes);
285 $limits = $self->_fix_limit_special_cases($orig_limits);
# The suppress parameter adds a limit excluding suppressed records.
286 if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
287 # Merge the indexes in with the search terms and the operands so that
288 # each search thing is a handy unit.
289 unshift @$operators, undef; # The first one can't have an op
291 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
# Walk operands, operators and converted indexes in parallel; undefined or
# empty operands are skipped entirely.
292 my $ea = each_array( @$operands, @$operators, @index_params );
293 while ( my ( $oand, $otor, $index ) = $ea->() ) {
294 next if ( !defined($oand) || $oand eq '' );
295 $oand = $self->_clean_search_term($oand);
296 $oand = $self->_truncate_terms($oand) if ($truncate);
297 push @search_params, {
298 operand => $oand, # the search terms
299 operator => defined($otor) ? uc $otor : undef, # AND and so on
# Merge in the keys of the converted index hashref, if there is one.
300 $index ? %$index : (),
304 # We build a string query from limits and the queries. An alternative
305 # would be to pass them separately into build_query and let it build
306 # them into a structured ES query itself. Maybe later, though that'd be
308 $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
# Empty parts are dropped from the list before joining (the "|| ()" idiom).
309 $query_str = join( ' AND ',
310 $search_param_query_str || (),
311 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
313 # If there's no query on the left, let's remove the junk left behind
314 $query_str =~ s/^ AND //;
# Options handed on to build_query.
316 $options{sort} = \@sort_params;
317 $options{is_opac} = $params->{is_opac};
318 $options{weighted_fields} = $params->{weighted_fields};
319 $options{whole_record} = $params->{whole_record};
320 $query = $self->build_query( $query_str, %options );
323 # We roughly emulate the CGI parameters of the zebra query builder
# NOTE(review): if the unshift above is skipped for scan searches, this shift
# would drop a real operator from the caller's array - confirm.
325 shift @$operators; # Shift out the one we unshifted before
# This loop uses the original (unconverted) index names.
# NOTE(review): operands may be undef (see the defined check in the loop
# above); uri_escape_utf8 would then be given undef - confirm this is benign.
326 my $ea = each_array( @$operands, @$operators, @$indexes );
327 while ( my ( $oand, $otor, $index ) = $ea->() ) {
328 $query_cgi .= '&' if $query_cgi;
329 $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
330 $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
332 $query_cgi .= '&scan=1' if ( $scan );
# A search with exactly one operand is a "simple" query.
335 $simple_query = $operands->[0] if @$operands == 1;
# The description is the simple query when there is one, otherwise the query
# string built from the search parameters.
337 if ( $simple_query ) {
338 $query_desc = $simple_query;
340 $query_desc = $search_param_query_str;
# Limits are returned three ways: as a query string, as CGI parameters built
# from the original limits, and as a description.
342 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
343 my $limit_cgi = ( $orig_limits and @$orig_limits )
344 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
347 $limit_desc = "$limit" if $limit;
# The first and the last two positions of the returned list are always undef.
350 undef, $query, $simple_query, $query_cgi, $query_desc,
351 $limit, $limit_cgi, $limit_desc, undef, undef
355 =head2 build_authorities_query
357 my $query = $builder->build_authorities_query(\%search);
359 This takes a nice description of an authority search and turns it into a black-box
360 query that can then be passed to the appropriate searcher.
362 The search description is a hashref that looks something like:
367 where => 'Heading', # search the main entry
368 operator => 'exact', # require an exact match
369 value => 'frogs', # the search string
372 where => '', # search all entries
373 operator => '', # default keyword, right truncation
381 authtypecode => 'TOPIC_TERM',
# Turns an authority search description (a hashref with 'searches' and
# optional 'authtypecode' and 'sort') into an elasticsearch query structure.
# Each entry of 'searches' contributes one element to a bool/must list.
386 sub build_authorities_query {
387 my ( $self, $search ) = @_;
389 # Start by making the query parts
392 foreach my $s ( @{ $search->{searches} } ) {
# $wh is the field to search, $op the match mode and $val the search text.
393 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
394 if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
396 # Match the whole field, case insensitive, UTF normalized.
397 push @query_parts, { term => { "$wh.ci_raw" => $val } };
400 # Match the whole field for all searchable fields, case insensitive,
402 # Given that field data is "The quick brown fox"
403 # "The quick brown fox" and "the quick brown fox" will match
404 # but not "quick brown fox".
408 fields => $self->_search_fields({ subfield => 'ci_raw' }),
413 elsif ( defined $op && $op eq 'start') {
414 # Match the prefix within a field for all searchable fields.
415 # Given that field data is "The quick brown fox"
416 # "The quick bro" will match, but not "quick bro"
418 # There does not seem to be a multi-field prefix query,
419 # so we need to create one
421 # Match prefix of the field.
422 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
# One prefix clause per searchable field; at least one of them must match.
426 foreach my $field (@{$self->_search_fields()}) {
427 push @prefix_queries, {
428 prefix => { "$field.ci_raw" => $val }
433 'should' => \@prefix_queries,
434 'minimum_should_match' => 1
440 # Query all searchable fields.
441 # Given that field data is "The quick brown fox"
442 # a search containing any of the words will match, regardless
# $token aliases the array element, so @tokens is rewritten in place with
# the cleaned and truncated form of each token.
445 my @tokens = $self->_split_query( $val );
446 foreach my $token ( @tokens ) {
447 $token = $self->_truncate_terms(
448 $self->_clean_search_term( $token )
451 my $query = $self->_join_queries( @tokens );
454 push @query_parts, { query_string => {
455 default_field => $wh,
456 analyze_wildcard => JSON::true,
463 analyze_wildcard => JSON::true,
465 fields => $self->_search_fields(),
472 # Merge the query parts appropriately
473 # 'should' behaves like 'or'
474 # 'must' behaves like 'and'
475 # Zebra behaviour seems to match 'must', so that is used here
476 my $elastic_query = {};
477 $elastic_query->{bool}->{must} = \@query_parts;
479 # Filter by authtypecode if set
480 if ($search->{authtypecode}) {
481 $elastic_query->{bool}->{filter} = {
483 "authtype.raw" => $search->{authtypecode}
489 query => $elastic_query
# Sorting is optional; the single sort spec is wrapped in an arrayref.
493 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
498 =head2 build_authorities_query_compat
501 $builder->build_authorities_query_compat( \@marclist, \@and_or,
502 \@excluding, \@operator, \@value, $authtypecode, $orderby );
504 This builds a query for searching for authorities, in the style of
505 L<C4::AuthoritiesMarc::SearchAuthorities>.
513 An arrayref containing where the particular term should be searched for.
514 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
515 thesaurus. If left blank, any field is used.
519 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
527 What form of search to do. Options are: is (phrase, no truncation, whole field
528 must match), = (number exact match), exact (phrase, no truncation, whole field
529 must match). If left blank, then word list, right truncated, anywhere is used.
533 The actual user-provided string value to search for.
537 The authority type code to search within. If blank, then all will be searched.
541 The order to sort the results by. Options are Relevance, HeadingAsc,
542 HeadingDsc, AuthidAsc, AuthidDsc.
546 marclist, operator, and value must be the same length, and the values at
547 index /i/ all relate to each other.
549 This returns a query, which is a black box object that can be passed to the
550 appropriate search object.
# Maps the 'marclist' names accepted by build_authorities_query_compat to
# index names. Names without an entry are passed through unchanged there.
554 our $koha_to_index_name = {
555 mainmainentry => 'heading-main',
556 mainentry => 'heading',
558 'match-heading' => 'match-heading',
559 'see-from' => 'match-heading-see-from',
560 thesaurus => 'subject-heading-thesaurus',
# Adapter from the positional argument style of
# C4::AuthoritiesMarc::SearchAuthorities to the hashref understood by
# build_authorities_query, whose result it builds.
565 sub build_authorities_query_compat {
566 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
567 $authtypecode, $orderby )
570 # This turns the old-style many-options argument form into a more
571 # extensible hash form that is understood by L<build_authorities_query>.
573 my $mappings = $self->get_elasticsearch_mappings();
575 # Convert to lower case
# A new arrayref is built here, so the caller's list is left untouched.
# NOTE(review): lc on an undefined $orderby would warn - confirm callers
# always supply it.
576 $marclist = [map(lc, @{$marclist})];
577 $orderby = lc $orderby;
580 # Make sure everything exists
581 foreach my $m (@$marclist) {
# Translate known Koha names; anything else is kept as given.
583 $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
# Unknown fields only produce a warning; they are not rejected.
585 warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
# NOTE(review): the truth test below also skips a value of "0" - confirm
# that is intended.
587 for ( my $i = 0 ; $i < @$value ; $i++ ) {
588 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
591 where => $indexes[$i],
592 operator => $operator->[$i],
593 value => $value->[$i],
# Sort field is chosen from the start of $orderby ('heading...' or 'auth...').
599 ( $orderby =~ /^heading/ ) ? 'heading__sort'
600 : ( $orderby =~ /^auth/ ) ? 'local-number__sort'
# Any $orderby that does not end in 'asc' sorts descending.
603 my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
604 %sort = ( $sort_field => $sort_order, );
607 searches => \@searches,
608 authtypecode => $authtypecode,
# The sort key is only added when a sort was actually selected.
610 $search{sort} = \%sort if %sort;
611 my $query = $self->build_authorities_query( \%search );
615 =head2 _build_scan_query
617 my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
619 This will build an aggregation scan query that can be issued to elasticsearch from
620 the provided string input.
# Index aliases recognised by _build_scan_query; a name without an entry is
# used as given.
624 our %scan_field_convert = (
628 'se' => 'title-series',
# Builds an aggregation ("scan") query on the facet version of one index and
# returns it together with the search term. Only the first operand and the
# first index are considered.
632 sub _build_scan_query {
633 my ( $self, $operands, $indexes ) = @_;
# Defaults: an empty term and the 'subject' index.
635 my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
636 my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
# The index may carry a ",qualifier" suffix; only the part before the comma
# selects the field.
638 my ( $f, $d ) = split( /,/, $index);
639 $index = $scan_field_convert{$f} || $f;
647 $res->{aggregations} = {
650 field => $index . '__facet',
# NOTE(review): newer Elasticsearch releases name this order key '_key' -
# confirm against the supported ES version.
651 order => { '_term' => 'asc' },
# Only buckets that start with the cleaned term (matched via the filter built
# by _create_regex_filter) are included.
652 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
# The term returned is the raw first operand, not the cleaned one.
656 return ($res, $term);
659 =head2 _create_regex_filter
661 my $filter = $builder->_create_regex_filter('term')
663 This will create a regex filter that can be used with an aggregation query.
# Builds a regular expression string from a term, one character at a time.
667 sub _create_regex_filter {
668 my ($self, $term) = @_;
# The term is escaped with quotemeta first, then split into characters.
671 foreach my $c (split(//, quotemeta($term))) {
# When $lc and $uc differ the character becomes a two-character class,
# otherwise it is copied unchanged. ($lc and $uc are assigned on lines not
# shown here - presumably the lower and upper case forms of $c.)
674 $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
679 =head2 _convert_sort_fields
681 my @sort_params = $self->_convert_sort_fields(@sort_by)
683 Converts the zebra-style sort index information into elasticsearch-style.
685 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
686 something that can be sent to L<build_query>.
# Converts zebra-style sort keys of the form "<field>_<order>" (for example
# 'pubdate_dsc') into hashrefs with 'field' and 'direction'.
690 sub _convert_sort_fields {
691 my ( $self, @sort_by ) = @_;
693 # Turn the sorting into something we care about.
694 my %sort_field_convert = (
695 acqdate => 'date-of-acquisition',
697 call_number => 'cn-sort',
698 popularity => 'issues',
699 relevance => undef, # default
701 pubdate => 'date-of-publication',
# Accepted order spellings and the direction each one stands for.
703 my %sort_order_convert =
704 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
706 # Convert the fields and orders, drop anything we don't know about.
# Entries whose field converts to a false value (unknown fields, and
# 'relevance' which maps to undef) are removed by the grep.
707 grep { $_->{field} } map {
# The first group is greedy, so the key is split at its last underscore
# ('call_number_asc' gives 'call_number' and 'asc').
# NOTE(review): a key without an underscore leaves $f and $d undefined, and
# the hash lookups below would then warn - confirm inputs are well formed.
708 my ( $f, $d ) = /(.+)_(.+)/;
710 field => $sort_field_convert{$f},
711 direction => $sort_order_convert{$d}
# Converts zebra-style index specifications of the form "<field>[,<type>]"
# into hashrefs with 'field' and 'type', using %index_field_convert for the
# field name and the table below for the type.
716 sub _convert_index_fields {
717 my ( $self, @indexes ) = @_;
# Type qualifiers that are understood; any other qualifier yields undef.
719 my %index_type_convert =
720 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
722 # Convert according to our table, drop anything that doesn't convert.
723 # If a field starts with mc- we save it as it's used (and removed) later
724 # when joining things, to indicate we make it an 'OR' join.
725 # (Sorry, this got a bit ugly after special cases were found.)
727 # Lower case all field names
728 my ( $f, $t ) = map(lc, split /,/);
# Known aliases are translated; unknown field names are kept as they are.
735 field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
736 type => $index_type_convert{ $t // '__default' }
# Put the saved prefix back in front of the converted field name. ($mc is
# set on lines not shown here.)
738 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
# Yield the hashref only when a field or a type was resolved, else undef.
739 $r->{field} || $r->{type} ? $r : undef;
743 =head2 _convert_index_strings
745 my @searches = $self->_convert_index_strings(@searches);
747 Similar to L<_convert_index_fields>, this takes strings of the form
748 B<field:search term> and rewrites the field from zebra-style to
749 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrites strings of the form "field:search term" so that the field part is
# converted by _convert_index_fields and the term is adjusted for the index
# type by _modify_string_by_type.
753 sub _convert_index_strings {
754 my ( $self, @searches ) = @_;
756 foreach my $s (@searches) {
# The field is the shortest run of word characters, commas and hyphens in
# front of the first colon; everything after that colon is the term.
758 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
# Strings that do not have that shape are handled on lines not shown here.
759 unless ( defined($field) && defined($term) ) {
763 my ($conv) = $self->_convert_index_fields($field);
764 unless ( defined($conv) ) {
# The "field:" prefix is emitted only when a field name was resolved.
768 push @res, ($conv->{field} ? $conv->{field} . ':' : '')
769 . $self->_modify_string_by_type( %$conv, operand => $term );
774 =head2 _convert_index_strings_freeform
776 my $search = $self->_convert_index_strings_freeform($search);
778 This is similar to L<_convert_index_strings>, however it'll search out the
779 things to change within the string. So it can handle strings such as
780 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
782 If there is something of the form "su,complete-subfield" or something, the
783 second part is stripped off as we can't yet handle that. Making it work
784 will have to wait for a real query parser.
# Rewrites every "field:" prefix found anywhere in a free-form search string:
# field names are lower-cased, a ",qualifier" suffix is dropped and aliases
# are resolved through %index_field_convert.
788 sub _convert_index_strings_freeform {
789 my ( $self, $search ) = @_;
790 # @TODO: Currently this will also alter fields contained within quotes:
791 # `searching for "stuff cn:123"` for example will become
792 # `searching for "stuff local-number:123"`
794 # Fixing this is tricky, one possibility:
795 # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
796 # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
798 # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
799 # them back when processing is done.
801 # Lower case field names
# The optional ",qualifier" group is matched but not captured, so it is
# removed from the result.
802 $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
803 # Resolve possible field aliases
804 $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
808 =head2 _modify_string_by_type
810 my $str = $self->_modify_string_by_type(%index_field);
812 If you have a search term (operand) and a type (phrase, right-truncated), this
813 will convert the string to have the function in lucene search terms, e.g.
814 wrapping quotes around it.
# Adjusts a search term ($idx{operand}) according to the index type
# ($idx{type}): 'right-truncate' appends a wildcard, 'phrase' adds double
# quotes and 'st-year' turns "from-until" into a range expression.
818 sub _modify_string_by_type {
819 my ( $self, %idx ) = @_;
821 my $type = $idx{type} || '';
822 my $str = $idx{operand};
# NOTE(review): this truth test also returns a term of "0" unmodified.
823 return $str unless $str; # Empty or undef, we can't use it.
825 $str .= '*' if $type eq 'right-truncate';
# Quotes are only added when the term is not already fully quoted.
826 $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
827 if ($type eq 'st-year') {
# The greedy first group splits the string at its last hyphen. A missing
# bound becomes '*'; a term without a hyphen is left as it is.
# NOTE(review): a bound of "0" is also false and would become '*'.
828 if ($str =~ /^(.*)-(.*)$/) {
829 my $from = $1 || '*';
830 my $until = $2 || '*';
831 $str = "[$from TO $until]";
839 my $query_str = $self->_join_queries(@query_parts);
841 This takes a list of query parts, that might be search terms on their own, or
842 booleaned together, or specifying fields, or whatever, wraps them in
843 parentheses, and ANDs them all together. Suitable for feeding to the ES
846 Note: doesn't AND them together if they specify an index that starts with "mc"
847 as that was a special case in the original code for dealing with multiple
848 choice options (you can't search for something that has an itype of A and
849 an itype of B otherwise.)
# Joins query parts into one query string. Ordinary parts are wrapped in
# parentheses and ANDed; parts prefixed with 'mc-' are grouped by field and
# ORed inside each group.
854 my ( $self, @parts ) = @_;
# Ordinary parts: defined, non-empty and without the 'mc-' prefix.
856 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
# 'mc-' parts, with that prefix stripped.
858 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
# Nothing usable gives an empty list; a single part is returned as it is,
# without parentheses.
859 return () unless @norm_parts + @mc_parts;
860 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
862 # Group limits by field, so they can be OR'ed together
864 foreach my $mc_part (@mc_parts) {
# Split at the first colon only, so the value may itself contain colons.
865 my ($field, $value) = split /:/, $mc_part, 2;
866 $mc_limits{$field} //= [];
867 push @{ $mc_limits{$field} }, $value;
# One "field:(a OR b)" clause per field, in sorted field order.
871 sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
872 } sort keys %mc_limits;
874 @norm_parts = map { "($_)" } @norm_parts;
876 return join( ' AND ', @norm_parts, @mc_parts);
881 my @phrased_queries = $self->_make_phrases(@query_parts);
883 This takes the supplied queries and forces them to be phrases by wrapping
884 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
885 the quotes outside of them if they're there.
# Returns a copy of each part in which the text after a leading "field:"
# prefix is wrapped in double quotes; the input list is not modified.
# NOTE(review): a part without such a prefix is returned unchanged, and the
# prefix pattern does not allow hyphens in the field name - confirm that both
# are intended.
890 my ( $self, @parts ) = @_;
891 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
894 =head2 _create_query_string
896 my @query_strings = $self->_create_query_string(@queries);
898 Given a list of hashrefs, it will turn them into a lucene-style query string.
899 The hash should contain field, type (both for the indexes), operator, and
# Turns a list of hashrefs (field, type, operator, operand) into lucene-style
# query fragments of the form "OPERATOR (field:operand)".
904 sub _create_query_string {
905 my ( $self, @queries ) = @_;
# Operator and field prefixes are empty strings when not supplied.
908 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
909 my $field = $_->{field} ? $_->{field} . ':' : '';
911 my $oand = $self->_modify_string_by_type(%$_);
# A multi-word operand on an explicit field gets its own parentheses. The
# 'st-year' type is excluded from this.
912 $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
913 "$otor($field$oand)";
917 =head2 _clean_search_term
919 my $term = $self->_clean_search_term($term);
921 This cleans a search term by removing any funny characters that may upset
922 ES and give us an error. It also calls L<_convert_index_strings_freeform>
923 to ensure those parts are correct.
# Cleans up a user-supplied search term before it is handed to
# elasticsearch: converts field prefixes, deals with unbalanced quotes and
# stray colons, and applies the configured slash escaping.
927 sub _clean_search_term {
928 my ( $self, $term ) = @_;
930 # Lookahead that only succeeds when an even number of double quotes
# follows, i.e. when the current position is outside a quoted string
931 my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
933 # Some hardcoded searches (like with authorities) produce things like
934 # 'an=123', when it ought to be 'an:123' for our purposes.
937 $term = $self->_convert_index_strings_freeform($term);
940 # Remove unbalanced quotes
# tr returns how many double quotes it found; it only modifies the copy.
941 my $unquoted = $term;
942 my $count = ($unquoted =~ tr/"/ /);
943 if ($count % 2 == 1) {
947 # Remove unquoted colons that have whitespace on either side of them
948 $term =~ s/(:+)(\s+)$lookahead/$2/g;
949 $term =~ s/(\s+)(:+)$lookahead/$1/g;
# Finally apply the slash handling chosen by QueryRegexEscapeOptions.
952 $term = $self->_query_regex_escape_process($term);
957 =head2 _query_regex_escape_process
959 my $query = $self->_query_regex_escape_process($query);
961 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
# Escapes or unescapes forward slashes in a query according to the
# QueryRegexEscapeOptions preference ('dont_escape', 'escape' or
# 'unescape_escaped').
965 sub _query_regex_escape_process {
966 my ($self, $query) = @_;
# NOTE(review): the string comparisons below would warn if this preference
# is undefined - confirm it always has a value.
967 my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
968 if ($regex_escape_options ne 'dont_escape') {
969 if ($regex_escape_options eq 'escape') {
970 # Will escape unescaped slashes (/) while preserving
971 # unescaped slashes within quotes
972 # @TODO: assumes quotes are always balanced and will
973 # not handle escaped quotes properly, should perhaps be
974 # replaced with a more general parser solution
975 # so that this function is ever only provided with unquoted
977 $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
979 elsif($regex_escape_options eq 'unescape_escaped') {
980 # Will unescape escaped slashes (\/) and escape
981 # unescaped slashes (/) while preserving slashes within quotes
982 # The same limitations as above apply for handling of quotes
983 $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
989 =head2 _fix_limit_special_cases
991 my $limits = $self->_fix_limit_special_cases($limits);
993 This converts any special cases that the limit specifications have into things
994 that are more readily processable by the rest of the code.
996 The argument should be an arrayref, and it'll return an arrayref.
# Rewrites limit strings that need special treatment (year ranges, the
# 'available' flag, generic "field:term" limits) into forms the rest of the
# query building code can process.
1000 sub _fix_limit_special_cases {
1001 my ( $self, $limits ) = @_;
1004 foreach my $l (@$limits) {
1006 # This is set up by opac-search.pl
# A "ge ... and ... le" pair becomes a copydate range; a malformed pair is
# skipped.
1007 if ( $l =~ /^yr,st-numeric,ge=/ ) {
1008 my ( $start, $end ) =
1009 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
1010 next unless defined($start) && defined($end);
1011 push @new_lim, "copydate:[$start TO $end]";
# A single year value is normalised through the 'st-year' type handling.
1013 elsif ( $l =~ /^yr,st-numeric=/ ) {
1014 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1015 next unless defined($date);
1016 $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1017 push @new_lim, "copydate:$date";
# 'available' is expressed as items that are not on loan.
1019 elsif ( $l =~ /^available$/ ) {
1020 push @new_lim, 'onloan:false';
# Generic "field:term" limits are quoted as phrases.
# NOTE(review): the substitution on $field runs before the defined check, so
# a limit without a colon triggers an "uninitialized value" warning; moving
# it inside the if block would avoid that.
1023 my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1024 $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1025 if ( defined($field) && defined($term) ) {
1026 push @new_lim, "$field:(\"$term\")";
1038 my $field = $self->_sort_field($field);
1040 Given a field name, this works out what the actual name of the field to sort
1041 on should be. A '__sort' suffix is added for fields with a sort version, and
1042 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
1043 to avoid sorting on a tokenized value.
# Works out the concrete field name to sort on for the logical field $f.
1048 my ($self, $f) = @_;
1050 my $mappings = $self->get_elasticsearch_mappings();
# True when the mappings declare this field with type 'text'.
# NOTE(review): looking up {type} this way autovivifies an empty entry for $f
# in the mappings structure when the field is unknown - confirm that is
# harmless if the mappings are shared or cached.
1051 my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
# Fields with no entry in sort_fields, or with a true one, take this branch
# (its body is on lines not shown here - presumably the '__sort' suffix).
1052 if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
1055 # We need to add '.raw' to text fields without a sort field,
1056 # otherwise it'll sort based on the tokenised form.
1057 $f .= '.raw' if $textField;
1062 =head2 _truncate_terms
1064 my $query = $self->_truncate_terms($query);
1066 Given a string query this function appends '*' wildcard to all terms except
1067 operators (and, or, not) and double quoted strings.
# Appends the '*' wildcard to the individual words of a query string and
# returns the words joined with single spaces.
1071 sub _truncate_terms {
1072 my ( $self, $query ) = @_;
1074 my @tokens = $self->_split_query( $query );
1076 # Filter out empty tokens
1077 my @words = grep { $_ !~ /^\s*$/ } @tokens;
1079 # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
# Tokens ending in a non-word character (a closing quote, an existing '*')
# and the keywords and/or/not in any letter case are left unchanged.
1082 (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
1085 return join ' ', @terms;
1090 my @token = $self->_split_query($query_str);
1092 Given a string query this function splits it to tokens taking into account
1093 any field prefixes and quoted strings.
# Separator pattern used by _split_query. It matches, and captures, either a
# double-quoted string with an optional "field:" or "field.sub:" prefix, or
# a run of whitespace. Because the separator is captured, split returns the
# separators as well.
1097 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
# Splits a query string into tokens, keeping quoted strings (with any field
# prefix) together as single tokens.
1100 my ( $self, $query ) = @_;
1102 # '"donald duck" title:"the mouse" and peter' gets split into
1103 # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
1104 my @tokens = split $tokenize_split_re, $query;
1106 # Filter out empty and whitespace-only values
1107 @tokens = grep( /\S/, @tokens );
1112 =head2 _search_fields
1113 my $weighted_fields = $self->_search_fields({
1115 weighted_fields => 1,
1119 Generate a list of searchable fields to be used for Elasticsearch queries
1120 applied to multiple fields.
1122 Returns an arrayref of field names for either OPAC or staff interface, with
1123 possible weights and subfield appended to each field name depending on the
1130 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1131 fields for OPAC or staff interface should be retrieved. If C<weighted_fields> is set
1132 field weights will be applied on returned fields. C<subfield> can be used to
1133 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
# Returns an arrayref of searchable field names for the OPAC or the staff
# interface, optionally with a ".subfield" suffix and/or a "^weight" suffix.
# The field list itself is read from the database once and then cached.
1139 sub _search_fields {
1140 my ($self, $params) = @_;
1143 weighted_fields => 0,
1145 # This is a hack for authorities (build_authorities_query); it
1146 # can hopefully be removed in the future
1149 my $cache = Koha::Caches->get_instance();
# The cache key is specific to the interface (OPAC or staff client) and to
# the index.
1150 my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
1151 my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
1152 if (!$search_fields) {
1153 # The reason we don't use Koha::SearchFields->search here is we don't
1154 # want or need resultset wrapped as Koha::SearchField object.
1155 # It does not make any sense in this context and would cause
1156 # unnecessary overhead since we are only querying for data
1157 # Also would not work, or produce strange results, with the "columns"
1159 my $schema = Koha::Database->schema;
1160 my $result = $schema->resultset('SearchField')->search(
1162 $params->{is_opac} ? (
# Boolean fields are excluded, and only fields mapped as searchable for this
# index and MARC flavour are selected.
1167 'type' => { '!=' => 'boolean' },
1168 'search_marc_map.index_name' => $self->index,
1169 'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
1170 'search_marc_to_fields.search' => 1,
1173 columns => [qw/name weight/],
1175 join => {search_marc_to_fields => 'search_marc_map'},
# Rows are stored as [name] or [name, weight]; a false weight is left out.
1179 while (my $search_field = $result->next) {
1180 push @search_fields, [
1181 lc $search_field->name,
1182 $search_field->weight ? $search_field->weight : ()
1185 $search_fields = \@search_fields;
1186 $cache->set_in_cache($cache_key, $search_fields);
1188 if ($params->{subfield}) {
1189 my $subfield = $params->{subfield};
1192 # Copy values to avoid mutating cached
1193 # data (since unsafe is used)
# NOTE(review): unlike the cached rows, this copy always has two elements,
# so a field without a weight carries undef; combined with weighted_fields
# the join below would then produce "field.sub^" and a warning - confirm the
# two options are never used together, or drop the undef.
1194 my ($field, $weight) = @{$_};
1195 ["${field}.${subfield}", $weight];
# Weighted form: "name^weight", or just "name" for a row without a weight.
1199 if ($params->{weighted_fields}) {
1200 return [map { join('^', @{$_}) } @{$search_fields}];
1203 # Exclude weight from field
1204 return [map { $_->[0] } @{$search_fields}];