1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
44 use List::MoreUtils qw( each_array );
46 use URI::Escape qw( uri_escape_utf8 );
# Alias table: each key is a zebra-style index name and each value is the
# elasticsearch field name it is rewritten to. It is consulted by
# _convert_index_fields and _convert_index_strings_freeform, and a reference
# to it is handed out by get_index_field_convert. Names that have no entry
# here are used unchanged by those callers.
52 our %index_field_convert = (
56 'lcn' => 'local-classification',
57 'callnum' => 'local-classification',
58 'record-type' => 'rtype',
59 'mc-rtype' => 'rtype',
61 'lc-card' => 'lc-card-number',
62 'sn' => 'local-number',
63 'biblionumber' => 'local-number',
64 'yr' => 'date-of-publication',
65 'pubdate' => 'date-of-publication',
66 'acqdate' => 'date-of-acquisition',
67 'date/time-last-modified' => 'date-time-last-modified',
68 'dtlm' => 'date-time-last-modified',
69 'diss' => 'dissertation-information',
72 'music-number' => 'identifier-publisher-for-music',
73 'number-music-publisher' => 'identifier-publisher-for-music',
74 'music' => 'identifier-publisher-for-music',
75 'ident' => 'identifier-standard',
76 'cpn' => 'corporate-name',
77 'cfn' => 'conference-name',
78 'pn' => 'personal-name',
83 'rcn' => 'record-control-number',
86 #'su-geo' => 'subject',
89 'se' => 'title-series',
90 'ut' => 'title-uniform',
91 'an' => 'koha-auth-number',
92 'authority-number' => 'koha-auth-number',
95 'rank' => 'relevance',
97 'wrdl' => 'st-word-list',
98 'rt' => 'right-truncation',
99 'rtrn' => 'right-truncation',
100 'ltrn' => 'left-truncation',
101 'rltrn' => 'left-and-right',
102 'mc-itemtype' => 'itemtype',
103 'mc-ccode' => 'ccode',
104 'branch' => 'homebranch',
105 'mc-loc' => 'location',
107 'stocknumber' => 'number-local-acquisition',
108 'inv' => 'number-local-acquisition',
110 'mc-itype' => 'itype',
111 'aub' => 'author-personal-bibliography',
112 'auo' => 'author-in-order',
116 'frequency-code' => 'ff8-18',
117 'illustration-code' => 'ff8-18-21',
118 'regularity-code' => 'ff8-19',
119 'type-of-serial' => 'ff8-21',
120 'format' => 'ff8-23',
121 'conference-code' => 'ff8-29',
122 'festschrift-indicator' => 'ff8-30',
123 'index-indicator' => 'ff8-31',
126 'literature-code' => 'lf',
127 'biography' => 'bio',
129 'biography-code' => 'bio',
130 'l-format' => 'ff7-01-02',
131 'lex' => 'lexile-number',
132 'hi' => 'host-item-number',
133 'itu' => 'index-term-uncontrolled',
134 'itg' => 'index-term-genre',
# Regex fragments, kept as plain strings so they can be interpolated into the
# larger patterns used by _convert_index_strings_freeform and the tokenizer
# regex of _split_query.
# A field name: one or more word characters or hyphens.
136 my $field_name_pattern = '[\w\-]+';
# Zero or more ".name" suffixes that may follow a field name.
137 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
139 =head2 get_index_field_convert
141 my $index_field_convert = Koha::SearchEngine::Elasticsearch::QueryBuilder->get_index_field_convert();
143 Returns a reference to the table used to convert zebra-style search index
145 names into elasticsearch-style field names. The keys are zebra-style index names, as
146 presented to L<build_query_compat>; the values are the field names that can be sent to L<build_query>.
148 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Returns a reference to the package-level %index_field_convert alias table.
# The reference points at the shared hash itself, not a copy, so a caller
# that modifies the result modifies the table used by this module.
# NOTE(review): the empty prototype "()" has no effect when this is invoked
# as a method, which is how the POD shows it being called.
153 sub get_index_field_convert() {
154 return \%index_field_convert;
159 my $simple_query = $builder->build_query("hello", %options)
161 This will build a query that can be issued to elasticsearch from the provided
162 string input. This expects a lucene style search form (see
163 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
166 It'll make an attempt to respect the various query options.
168 Additional options can be provided with the C<%options> hash.
174 This should be an arrayref of hashrefs, each containing a C<field> and an
175 C<direction> (optional, defaults to C<asc>.) The results will be sorted
176 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Body of the query builder documented above as build_query (the sub header
# itself is not visible in this listing). Takes a lucene-style query string
# and an %options hash (keys read here: is_opac, weighted_fields,
# whole_record, sort) and fills $res with the query, the optional sort
# clauses and the facet aggregations.
183 my ( $self, $query, %options ) = @_;
# System preferences that tune the query; each falls back to 0 when unset.
# NOTE(review): $stemming and $auto_truncation are not read again in the
# lines visible here; only $fuzzy_enabled is used below.
185 my $stemming = C4::Context->preference("QueryStemming") || 0;
186 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
187 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query is replaced by the match-everything wildcard.
189 $query = '*' unless defined $query;
# Fields to search, chosen per interface (OPAC or staff) and optionally
# carrying weights.
192 my $fields = $self->_search_fields({
193 is_opac => $options{is_opac},
194 weighted_fields => $options{weighted_fields},
# Optionally search the whole MARC record too. _search_fields returns a newly
# built arrayref, so pushing onto it does not alter its cached data.
196 if ($options{whole_record}) {
197 push @$fields, 'marc_data_array.*';
# query_string settings: fuzziness follows the QueryFuzzy preference and
# terms are combined with AND unless the query says otherwise.
202 fuzziness => $fuzzy_enabled ? 'auto' : '0',
203 default_operator => 'AND',
205 lenient => JSON::true,
206 analyze_wildcard => JSON::true,
# Switch the query type to cross_fields only when the preference is enabled.
209 $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
# Turn each requested sort ({ field, direction }) into a sort clause.
# A missing direction defaults to 'asc'; any value other than 'asc' or
# 'desc' is fatal.
211 if ( $options{sort} ) {
212 foreach my $sort ( @{ $options{sort} } ) {
213 my ( $f, $d ) = @$sort{qw/ field direction /};
214 die "Invalid sort direction, $d"
215 if $d && ( $d ne 'asc' && $d ne 'desc' );
216 $d = 'asc' unless $d;
# Resolve the field name to the name that is actually sorted on.
218 $f = $self->_sort_field($f);
219 push @{ $res->{sort} }, { $f => { order => $d } };
223 # See _convert_facets in Search.pm for how these get turned into
224 # things that Koha can use.
# Every facet is a terms aggregation on the matching "__facet" field, limited
# to FacetMaxCount entries.
225 my $size = C4::Context->preference('FacetMaxCount');
226 $res->{aggregations} = {
227 author => { terms => { field => "author__facet" , size => $size } },
228 subject => { terms => { field => "subject__facet", size => $size } },
229 itype => { terms => { field => "itype__facet", size => $size} },
230 location => { terms => { field => "location__facet", size => $size } },
231 'su-geo' => { terms => { field => "su-geo__facet", size => $size} },
232 'title-series' => { terms => { field => "title-series__facet", size => $size } },
233 ccode => { terms => { field => "ccode__facet", size => $size } },
234 ln => { terms => { field => "ln__facet", size => $size } },
# Library facets depend on DisplayLibraryFacets: 'home' adds homebranch,
# 'holding' adds holdingbranch and 'both' adds the two of them.
237 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
238 if ( $display_library_facets eq 'both'
239 or $display_library_facets eq 'home' ) {
240 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
242 if ( $display_library_facets eq 'both'
243 or $display_library_facets eq 'holding' ) {
244 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
249 =head2 build_query_compat
252 $error, $query, $simple_query, $query_cgi,
253 $query_desc, $limit, $limit_cgi, $limit_desc,
254 $stopwords_removed, $query_type
256 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
257 \@limits, \@sort_by, $scan, $lang, $params );
259 This handles a search using the same api as L<C4::Search::buildQuery> does.
261 A very simple query will go in with C<$operands> set to ['query'], and
262 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
263 C<$query> set to something that can perform the search, C<$simple_query>
264 set to just the search term, C<$query_cgi> set to something that can
265 reproduce this search, and C<$query_desc> set to something else.
# Builds a search from zebra-style arguments (parallel arrayrefs of
# operators, operands and indexes, plus limits and sort specifications) and
# returns the ten-element list described in the POD above. Only the query,
# the CGI strings and the descriptions are filled in; the first and the last
# two elements are returned as undef.
269 sub build_query_compat {
270 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
276 my $search_param_query_str = '';
# Scan mode: the aggregation query and its term come from _build_scan_query.
# NOTE(review): the condition selecting this branch is not visible in this
# listing; presumably it tests $scan.
279 ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
280 $search_param_query_str = $query_str;
# Normal mode: convert sort fields, index names and limits to their
# elasticsearch-style forms.
282 my @sort_params = $self->_convert_sort_fields(@$sort_by);
283 my @index_params = $self->_convert_index_fields(@$indexes);
284 $limits = $self->_fix_limit_special_cases($orig_limits);
285 if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
286 # Merge the indexes in with the search terms and the operands so that
287 # each search thing is a handy unit.
# This modifies the caller's @$operators; the extra element is removed again
# further down before the CGI string is built.
288 unshift @$operators, undef; # The first one can't have an op
290 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
# Walk the three lists in parallel; empty or undefined operands are skipped.
291 my $ea = each_array( @$operands, @$operators, @index_params );
292 while ( my ( $oand, $otor, $index ) = $ea->() ) {
293 next if ( !defined($oand) || $oand eq '' );
294 $oand = $self->_clean_search_term($oand);
295 $oand = $self->_truncate_terms($oand) if ($truncate);
296 push @search_params, {
297 operand => $oand, # the search terms
298 operator => defined($otor) ? uc $otor : undef, # AND and so on
# Merge in the converted index (field/type) when there is one.
299 $index ? %$index : (),
303 # We build a string query from limits and the queries. An alternative
304 # would be to pass them separately into build_query and let it build
305 # them into a structured ES query itself. Maybe later, though that'd be
307 $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
# "|| ()" drops an empty part from the list so that no dangling ' AND ' is
# produced for it.
308 $query_str = join( ' AND ',
309 $search_param_query_str || (),
310 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
312 # If there's no query on the left, let's remove the junk left behind
313 $query_str =~ s/^ AND //;
# Options forwarded to build_query.
315 $options{sort} = \@sort_params;
316 $options{is_opac} = $params->{is_opac};
317 $options{weighted_fields} = $params->{weighted_fields};
318 $options{whole_record} = $params->{whole_record};
319 $query = $self->build_query( $query_str, %options );
322 # We roughly emulate the CGI parameters of the zebra query builder
324 shift @$operators; # Shift out the one we unshifted before
# Rebuild "idx=...&q=...[&op=...]" for every operand, URI-escaping each value.
325 my $ea = each_array( @$operands, @$operators, @$indexes );
326 while ( my ( $oand, $otor, $index ) = $ea->() ) {
327 $query_cgi .= '&' if $query_cgi;
328 $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
329 $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
331 $query_cgi .= '&scan=1' if ( $scan );
# A search counts as "simple" only when exactly one operand was supplied.
334 $simple_query = $operands->[0] if @$operands == 1;
# Describe the search by the simple query when there is one, otherwise by
# the query string built from the search parameters.
336 if ( $simple_query ) {
337 $query_desc = $simple_query;
339 $query_desc = $search_param_query_str;
# Limits are returned as a query string, as CGI parameters built from the
# original (unconverted) limits, and as a description.
341 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
342 my $limit_cgi = ( $orig_limits and @$orig_limits )
343 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
346 $limit_desc = "$limit" if $limit;
349 undef, $query, $simple_query, $query_cgi, $query_desc,
350 $limit, $limit_cgi, $limit_desc, undef, undef
354 =head2 build_authorities_query
356 my $query = $builder->build_authorities_query(\%search);
358 This takes a nice description of an authority search and turns it into a black-box
359 query that can then be passed to the appropriate searcher.
361 The search description is a hashref that looks something like:
366 where => 'Heading', # search the main entry
367 operator => 'exact', # require an exact match
368 value => 'frogs', # the search string
371 where => '', # search all entries
372 operator => '', # default keyword, right truncation
380 authtypecode => 'TOPIC_TERM',
# Turns an authority search description (hashref with 'searches', and
# optionally 'authtypecode' and 'sort') into an elasticsearch query. One query
# part is built per entry of $search->{searches}, selected by its operator;
# the parts are then combined under bool/must.
385 sub build_authorities_query {
386 my ( $self, $search ) = @_;
388 # Start by making the query parts
391 foreach my $s ( @{ $search->{searches} } ) {
392 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
# 'is', '=' and 'exact' all request a whole-field match.
393 if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
395 # Match the whole field, case insensitive, UTF normalized.
396 push @query_parts, { term => { "$wh.ci_raw" => $val } };
399 # Match the whole field for all searchable fields, case insensitive,
401 # Given that field data is "The quick brown fox"
402 # "The quick brown fox" and "the quick brown fox" will match
403 # but not "quick brown fox".
407 fields => $self->_search_fields({ subfield => 'ci_raw' }),
412 elsif ( defined $op && $op eq 'start') {
413 # Match the prefix within a field for all searchable fields.
414 # Given that field data is "The quick brown fox"
415 # "The quick bro" will match, but not "quick bro"
417 # There does not seem to be a multi prefix query
418 # so we need to create one
420 # Match prefix of the field.
421 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
# No specific field: one prefix query per searchable field, of which at least
# one must match.
425 foreach my $field (@{$self->_search_fields()}) {
426 push @prefix_queries, {
427 prefix => { "$field.ci_raw" => $val }
432 'should' => \@prefix_queries,
433 'minimum_should_match' => 1
439 # Query all searchable fields.
440 # Given that field data is "The quick brown fox"
441 # a search containing any of the words will match, regardless
# Each token is cleaned and truncated individually, then the tokens are
# joined back together into one query string.
444 my @tokens = $self->_split_query( $val );
445 foreach my $token ( @tokens ) {
446 $token = $self->_truncate_terms(
447 $self->_clean_search_term( $token )
450 my $query = $self->_join_queries( @tokens );
453 lenient => JSON::true,
454 analyze_wildcard => JSON::true,
# Restrict to one field when given, otherwise use every searchable field.
# NOTE(review): the condition between these two assignments is not visible
# in this listing.
457 $query_string->{default_field} = $wh;
460 $query_string->{fields} = $self->_search_fields();
462 push @query_parts, { query_string => $query_string };
466 # Merge the query parts appropriately
467 # 'should' behaves like 'or'
468 # 'must' behaves like 'and'
469 # Zebra behaviour seem to match must so using that here
470 my $elastic_query = {};
471 $elastic_query->{bool}->{must} = \@query_parts;
473 # Filter by authtypecode if set
474 if ($search->{authtypecode}) {
475 $elastic_query->{bool}->{filter} = {
477 "authtype.raw" => $search->{authtypecode}
483 query => $elastic_query
# Pass the caller's sort specification through, wrapped in an arrayref.
487 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
492 =head2 build_authorities_query_compat
495 $builder->build_authorities_query_compat( \@marclist, \@and_or,
496 \@excluding, \@operator, \@value, $authtypecode, $orderby );
498 This builds a query for searching for authorities, in the style of
499 L<C4::AuthoritiesMarc::SearchAuthorities>.
507 An arrayref containing where the particular term should be searched for.
508 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
509 thesaurus. If left blank, any field is used.
513 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
521 What form of search to do. Options are: is (phrase, no truncation, whole field
522 must match), = (number exact match), exact (phrase, no truncation, whole field
523 must match). If left blank, then word list, right truncated, anywhere is used.
527 The actual user-provided string value to search for.
531 The authority type code to search within. If blank, then all will be searched.
535 The order to sort the results by. Options are Relevance, HeadingAsc,
536 HeadingDsc, AuthidAsc, AuthidDsc.
540 marclist, operator, and value must be the same length, and the values at
541 index /i/ all relate to each other.
543 This returns a query, which is a black box object that can be passed to the
544 appropriate search object.
# Maps the authority search field names accepted in the marclist argument of
# build_authorities_query_compat to the index names they are replaced with.
# Names without an entry are kept as they are by that sub.
548 our $koha_to_index_name = {
549 mainmainentry => 'heading-main',
550 mainentry => 'heading',
552 'match-heading' => 'match-heading',
553 'see-from' => 'match-heading-see-from',
554 thesaurus => 'subject-heading-thesaurus',
# Adapter from the old-style parallel-array arguments to the hash form taken
# by build_authorities_query. $marclist, $operator and $value are read by
# index; $and_or and $excluding are accepted but not used in the lines
# visible here.
559 sub build_authorities_query_compat {
560 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
561 $authtypecode, $orderby )
564 # This turns the old-style many-options argument form into a more
565 # extensible hash form that is understood by L<build_authorities_query>.
567 my $mappings = $self->get_elasticsearch_mappings();
569 # Convert to lower case
# $marclist is replaced by a lower-cased copy; the caller's array is left
# untouched.
# NOTE(review): lc on an undefined $orderby emits an "uninitialized" warning.
570 $marclist = [map(lc, @{$marclist})];
571 $orderby = lc $orderby;
574 # Make sure everything exists
575 foreach my $m (@$marclist) {
# Translate known Koha names; unknown names are kept and only warned about.
577 $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
579 warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
# NOTE(review): the test below is a truthiness test, so a value of "0" is
# skipped along with empty and undefined values.
581 for ( my $i = 0 ; $i < @$value ; $i++ ) {
582 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
# NOTE(review): @indexes is filled by lines not visible in this listing;
# presumably it holds the converted marclist names.
585 where => $indexes[$i],
586 operator => $operator->[$i],
587 value => $value->[$i],
# Sort field: "heading..." sorts on the heading, "auth..." on the record
# number. The direction is 'asc' only when $orderby ends in "asc".
593 ( $orderby =~ /^heading/ ) ? 'heading__sort'
594 : ( $orderby =~ /^auth/ ) ? 'local-number__sort'
597 my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
598 %sort = ( $sort_field => $sort_order, );
601 searches => \@searches,
602 authtypecode => $authtypecode,
# Only pass a sort when one was determined.
604 $search{sort} = \%sort if %sort;
605 my $query = $self->build_authorities_query( \%search );
609 =head2 _build_scan_query
611 my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
613 This will build an aggregation scan query that can be issued to elasticsearch from
614 the provided string input.
# Index aliases applied by _build_scan_query; an index without an entry is
# used under its own name.
618 our %scan_field_convert = (
622 'se' => 'title-series',
# Builds an aggregation ("scan") query from the first operand and the first
# index only. Returns the query structure and the raw search term.
626 sub _build_scan_query {
627 my ( $self, $operands, $indexes ) = @_;
# Defaults: an empty term and the 'subject' index.
629 my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
630 my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
# Keep only the part before the first comma, then resolve scan aliases.
632 my ( $f, $d ) = split( /,/, $index);
633 $index = $scan_field_convert{$f} || $f;
# Aggregate on the index's "__facet" field, ordered by term, keeping only the
# values that start with the (case-insensitively matched) search term.
641 $res->{aggregations} = {
644 field => $index . '__facet',
645 order => { '_term' => 'asc' },
646 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
650 return ($res, $term);
653 =head2 _create_regex_filter
655 my $filter = $builder->_create_regex_filter('term')
657 This will create a regex filter that can be used with an aggregation query.
# Builds a regex string from $term for use as an aggregation filter. The term
# is run through quotemeta and processed one character at a time: a character
# whose lower- and upper-case forms differ becomes a two-character class
# holding both forms, any other character is copied as is.
# NOTE(review): the lines that set $lc and $uc are not visible in this
# listing; presumably they hold lc($c) and uc($c).
661 sub _create_regex_filter {
662 my ($self, $term) = @_;
665 foreach my $c (split(//, quotemeta($term))) {
668 $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
673 =head2 _convert_sort_fields
675 my @sort_params = _convert_sort_fields(@sort_by)
677 Converts the zebra-style sort index information into elasticsearch-style.
679 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
680 something that can be sent to L<build_query>.
# Converts zebra-style sort strings of the form "<field>_<order>" into
# { field, direction } hashrefs; entries whose field does not convert to a
# true value are dropped by the grep.
684 sub _convert_sort_fields {
685 my ( $self, @sort_by ) = @_;
687 # Turn the sorting into something we care about.
688 my %sort_field_convert = (
689 acqdate => 'date-of-acquisition',
691 call_number => 'cn-sort',
692 popularity => 'issues',
# undef means no explicit sort field, so relevance entries are dropped by the
# grep below.
693 relevance => undef, # default
695 pubdate => 'date-of-publication',
# Accepted order spellings and the direction each one stands for.
697 my %sort_order_convert =
698 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
700 # Convert the fields and orders, drop anything we don't know about.
701 grep { $_->{field} } map {
# Greedy (.+) splits at the last underscore, so "call_number_asc" yields the
# field "call_number" and the order "asc".
702 my ( $f, $d ) = /(.+)_(.+)/;
704 field => $sort_field_convert{$f},
705 direction => $sort_order_convert{$d}
# Converts zebra-style index specifications ("field" or "field,type") into
# hashrefs with 'field' and 'type' keys. An index that yields neither a field
# nor a type converts to undef.
710 sub _convert_index_fields {
711 my ( $self, @indexes ) = @_;
# Recognised type suffixes; a missing suffix maps to undef through the
# '__default' key.
713 my %index_type_convert =
714 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
716 # Convert according to our table, drop anything that doesn't convert.
717 # If a field starts with mc- we save it as it's used (and removed) later
718 # when joining things, to indicate we make it an 'OR' join.
719 # (Sorry, this got a bit ugly after special cases were found.)
721 # Lower case all field names
722 my ( $f, $t ) = map(lc, split /,/);
# Field names are resolved through %index_field_convert; unknown names pass
# through unchanged.
729 field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
730 type => $index_type_convert{ $t // '__default' }
# NOTE(review): $mc is set by lines not visible in this listing; going by the
# comment above it holds the saved "mc-" prefix.
732 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
733 $r->{field} || $r->{type} ? $r : undef;
737 =head2 _convert_index_strings
739 my @searches = $self->_convert_index_strings(@searches);
741 Similar to L<_convert_index_fields>, this takes strings of the form
742 B<field:search term> and rewrites the field from zebra-style to
743 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrites each "field:term" string so that the field uses its
# elasticsearch-style name and the term is modified according to the index
# type (see _modify_string_by_type).
747 sub _convert_index_strings {
748 my ( $self, @searches ) = @_;
750 foreach my $s (@searches) {
# Split at the first colon; the field part may contain word characters,
# commas and hyphens only.
752 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
753 unless ( defined($field) && defined($term) ) {
757 my ($conv) = $self->_convert_index_fields($field);
758 unless ( defined($conv) ) {
# Re-attach the converted field name (when there is one) in front of the
# type-adjusted term.
762 push @res, ($conv->{field} ? $conv->{field} . ':' : '')
763 . $self->_modify_string_by_type( %$conv, operand => $term );
768 =head2 _convert_index_strings_freeform
770 my $search = $self->_convert_index_strings_freeform($search);
772 This is similar to L<_convert_index_strings>, however it'll search out the
773 things to change within the string. So it can handle strings such as
774 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
776 If there is something of the form "su,complete-subfield" or something, the
777 second part is stripped off as we can't yet handle that. Making it work
778 will have to wait for a real query parser.
# Rewrites every "field:" occurrence inside a free-form query string: the
# field name is lower-cased, a ",type" suffix is stripped, and aliases are
# resolved through %index_field_convert.
782 sub _convert_index_strings_freeform {
783 my ( $self, $search ) = @_;
784 # @TODO: Currently will alter also fields contained within quotes:
785 # `searching for "stuff cn:123"` for example will become
786 # `searching for "stuff local-number:123"
788 # Fixing this is tricky, one possibility:
789 # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
790 # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
792 # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
793 # them back when processing is done.
795 # Lower case field names
# The optional ",type" group is matched but not captured, so it is dropped
# from the result.
796 $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
797 # Resolve possible field aliases
798 $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
802 =head2 _modify_string_by_type
804 my $str = $self->_modify_string_by_type(%index_field);
806 If you have a search term (operand) and a type (phrase, right-truncated), this
807 will convert the string to have the function in lucene search terms, e.g.
808 wrapping quotes around it.
# Adjusts the search term $idx{operand} according to $idx{type}:
#   right-truncate  appends '*'
#   phrase          wraps the term in double quotes unless already quoted
#   st-year         turns "from-until" into a "[from TO until]" range
# Any other type leaves the term as it is.
812 sub _modify_string_by_type {
813 my ( $self, %idx ) = @_;
815 my $type = $idx{type} || '';
816 my $str = $idx{operand};
# Truthiness test: undef, '' and "0" are all returned unchanged here.
817 return $str unless $str; # Empty or undef, we can't use it.
819 $str .= '*' if $type eq 'right-truncate';
820 $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
821 if ($type eq 'st-year') {
# The greedy first group splits at the last hyphen; an empty side of the
# range becomes the '*' wildcard.
822 if ($str =~ /^(.*)-(.*)$/) {
823 my $from = $1 || '*';
824 my $until = $2 || '*';
825 $str = "[$from TO $until]";
833 my $query_str = $self->_join_queries(@query_parts);
835 This takes a list of query parts, that might be search terms on their own, or
836 booleaned together, or specifying fields, or whatever, wraps them in
837 parentheses, and ANDs them all together. Suitable for feeding to the ES
840 Note: doesn't AND them together if they specify an index that starts with "mc"
841 as that was a special case in the original code for dealing with multiple
842 choice options (you can't search for something that has an itype of A and
843 an itype of B otherwise.)
# Body of the joiner documented above as _join_queries (the sub header is not
# visible in this listing). Parts that start with "mc-" are grouped by field
# and OR'ed together; all other parts are parenthesised; everything is then
# joined with ' AND '.
848 my ( $self, @parts ) = @_;
# Split the non-empty parts into normal ones and "mc-" ones, stripping the
# "mc-" prefix from the latter.
850 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
852 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
# Nothing to join returns an empty list; a single part is returned as it is,
# without parentheses.
853 return () unless @norm_parts + @mc_parts;
854 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
856 # Group limits by field, so they can be OR'ed together
858 foreach my $mc_part (@mc_parts) {
# The split limit of 2 keeps any further colons inside the value.
859 my ($field, $value) = split /:/, $mc_part, 2;
860 $mc_limits{$field} //= [];
861 push @{ $mc_limits{$field} }, $value;
# One "field:(a OR b ...)" group per field, in sorted field order.
865 sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
866 } sort keys %mc_limits;
868 @norm_parts = map { "($_)" } @norm_parts;
870 return join( ' AND ', @norm_parts, @mc_parts);
875 my @phrased_queries = $self->_make_phrases(@query_parts);
877 This takes the supplied queries and forces them to be phrases by wrapping
878 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
879 the quotes outside of them if they're there.
# Body of the helper documented above as _make_phrases (the sub header is not
# visible in this listing). For every part of the form "field:rest" it
# returns a copy with the rest wrapped in double quotes; the /r modifier
# leaves the input strings unmodified. A part without a colon does not match
# the pattern and is returned unchanged.
884 my ( $self, @parts ) = @_;
885 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
888 =head2 _create_query_string
890 my @query_strings = $self->_create_query_string(@queries);
892 Given a list of hashrefs, it will turn them into a lucene-style query string.
893 The hash should contain field, type (both for the indexes), operator, and
# Turns each query hashref (keys: operator, field, type, operand) into one
# lucene-style string of the form "OPERATOR (field:operand)".
898 sub _create_query_string {
899 my ( $self, @queries ) = @_;
# Operator and field prefixes are empty strings when not set.
902 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
903 my $field = $_->{field} ? $_->{field} . ':' : '';
905 my $oand = $self->_modify_string_by_type(%$_);
# A multi-word operand searched in a specific field gets its own parentheses
# so that the field applies to every word; st-year operands are excluded.
906 $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
907 "$otor($field$oand)";
911 =head2 _clean_search_term
913 my $term = $self->_clean_search_term($term);
915 This cleans a search term by removing any funny characters that may upset
916 ES and give us an error. It also calls L<_convert_index_strings_freeform>
917 to ensure those parts are correct.
# Cleans up a user-supplied search term: converts index names via
# _convert_index_strings_freeform, deals with unbalanced double quotes,
# removes colons that touch whitespace, and applies the configured slash
# escaping via _query_regex_escape_process.
921 sub _clean_search_term {
922 my ( $self, $term ) = @_;
924 # Lookahead for checking if we are inside quotes
# It succeeds only when an even number of double quotes follows the current
# position.
925 my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
927 # Some hardcoded searches (like with authorities) produce things like
928 # 'an=123', when it ought to be 'an:123' for our purposes.
931 $term = $self->_convert_index_strings_freeform($term);
934 # Remove unbalanced quotes
# tr returns the number of double quotes and replaces them with spaces in the
# copy; an odd count means the quotes are unbalanced.
935 my $unquoted = $term;
936 my $count = ($unquoted =~ tr/"/ /);
937 if ($count % 2 == 1) {
941 # Remove unquoted colons that have whitespace on either side of them
942 $term =~ s/(:+)(\s+)$lookahead/$2/g;
943 $term =~ s/(\s+)(:+)$lookahead/$1/g;
946 $term = $self->_query_regex_escape_process($term);
951 =head2 _query_regex_escape_process
953 my $query = $self->_query_regex_escape_process($query);
955 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
# Applies the slash handling selected by the QueryRegexEscapeOptions
# preference: 'escape' and 'unescape_escaped' each run one substitution,
# 'dont_escape' skips both.
# NOTE(review): the comparisons below emit an "uninitialized" warning if the
# preference is undefined.
959 sub _query_regex_escape_process {
960 my ($self, $query) = @_;
961 my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
962 if ($regex_escape_options ne 'dont_escape') {
963 if ($regex_escape_options eq 'escape') {
964 # Will escape unescaped slashes (/) while preserving
965 # unescaped slashes within quotes
966 # @TODO: assumes quotes are always balanced and will
967 # not handle escaped quotes properly, should perhaps be
968 # replaced with a more general parser solution
969 # so that this function is ever only provided with unquoted
971 $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
973 elsif($regex_escape_options eq 'unescape_escaped') {
974 # Will unescape escaped slashes (\/) and escape
975 # unescaped slashes (/) while preserving slashes within quotes
976 # The same limitations as above apply for handling of quotes
# $1 is an odd run of backslashes (an escaped slash): one backslash is removed.
# $2 is an even, possibly empty, run (an unescaped slash): one is added.
977 $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
983 =head2 _fix_limit_special_cases
985 my $limits = $self->_fix_limit_special_cases($limits);
987 This converts any special cases that the limit specifications have into things
988 that are more readily processable by the rest of the code.
990 The argument should be an arrayref, and it'll return an arrayref.
# Rewrites the special limit notations into plain "field:term" limits:
#   yr,st-numeric,ge=A and yr,st-numeric,le=B  ->  copydate:[A TO B]
#   yr,st-numeric=X                            ->  copydate:<X as a year/range>
#   available                                  ->  onloan:false
#   field:term                                 ->  field:("term")
# Takes an arrayref of limits; the results are collected in @new_lim.
994 sub _fix_limit_special_cases {
995 my ( $self, $limits ) = @_;
998 foreach my $l (@$limits) {
1000 # This is set up by opac-search.pl
1001 if ( $l =~ /^yr,st-numeric,ge=/ ) {
1002 my ( $start, $end ) =
1003 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
# A range limit that lacks its upper bound is silently dropped.
1004 next unless defined($start) && defined($end);
1005 push @new_lim, "copydate:[$start TO $end]";
1007 elsif ( $l =~ /^yr,st-numeric=/ ) {
1008 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1009 next unless defined($date);
# "from-until" values become a range; a plain year stays as it is.
1010 $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1011 push @new_lim, "copydate:$date";
1013 elsif ( $l =~ /^available$/ ) {
1014 push @new_lim, 'onloan:false';
1017 my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
# NOTE(review): the substitution below runs before the defined() check, so a
# limit without a colon triggers an "uninitialized value" warning here.
1018 $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1019 if ( defined($field) && defined($term) ) {
1020 push @new_lim, "$field:(\"$term\")";
1032 my $field = $self->_sort_field($field);
1034 Given a field name, this works out what the actual name of the field to sort
1035 on should be. A '__sort' suffix is added for fields with a sort version, and
1036 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
1037 to avoid sorting on a tokenized value.
# Body of the helper documented above as _sort_field (the sub header is not
# visible in this listing). Works out the concrete field name to sort on for
# the logical field $f.
1042 my ($self, $f) = @_;
1044 my $mappings = $self->get_elasticsearch_mappings();
# True when the mappings declare $f as a 'text' field.
1045 my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
# This branch is taken when $f has no entry in sort_fields() or a true one.
# NOTE(review): the statements inside the branch are not visible in this
# listing; per the POD a '__sort' suffix is involved.
1046 if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
1049 # We need to add '.raw' to text fields without a sort field,
1050 # otherwise it'll sort based on the tokenised form.
1051 $f .= '.raw' if $textField;
1056 =head2 _truncate_terms
1058 my $query = $self->_truncate_terms($query);
1060 Given a string query this function appends '*' wildcard to all terms except
1061 operands and double quoted strings.
# Appends the '*' wildcard to every token of $query that ends in a word
# character and is not one of the boolean keywords and/or/not (compared
# case-insensitively). Returns the tokens joined by single spaces.
1065 sub _truncate_terms {
1066 my ( $self, $query ) = @_;
1068 my @tokens = $self->_split_query( $query );
1070 # Filter out empty tokens
1071 my @words = grep { $_ !~ /^\s*$/ } @tokens;
1073 # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
# NOTE(review): $w is set by a line not visible in this listing; presumably it
# is a copy of the current token, needed because the inner grep rebinds $_.
1076 (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
1079 return join ' ', @terms;
1084 my @token = $self->_split_query($query_str);
1086 Given a string query this function splits it to tokens taking into account
1087 any field prefixes and quoted strings.
# Separator pattern for the tokenizer: either a double-quoted string with an
# optional "field:" prefix, or a run of whitespace. The whole pattern is one
# capture group, so split returns the separators themselves as list elements.
1091 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
# Body of the tokenizer documented above as _split_query (the sub header is
# not visible in this listing).
1094 my ( $self, $query ) = @_;
1096 # '"donald duck" title:"the mouse" and peter' gets split into
1097 # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
1098 my @tokens = split $tokenize_split_re, $query;
1100 # Filter out empty values
# Keeps only the tokens that contain a non-whitespace character.
1101 @tokens = grep( /\S/, @tokens );
1106 =head2 _search_fields
1107 my $weighted_fields = $self->_search_fields({
1109 weighted_fields => 1,
1113 Generate a list of searchable fields to be used for Elasticsearch queries
1114 applied to multiple fields.
1116 Returns an arrayref of field names for either OPAC or staff interface, with
1117 possible weights and subfield appended to each field name depending on the
1124 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1125 fields for OPAC or staff interface should be retrieved. If C<weighted_fields> is set
1126 fields weights will be applied on returned fields. C<subfield> can be used to
1127 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
# Returns an arrayref of searchable field names. Options in $params:
#   is_opac          selects the OPAC or the staff interface field list
#   weighted_fields  when true, each name is returned as "name^weight" for
#                    fields that have a weight
#   subfield         when set, ".subfield" is appended to every name
# The field list is read from the database once and then cached per
# interface and index.
1133 sub _search_fields {
1134 my ($self, $params) = @_;
1137 weighted_fields => 0,
1139 # This is a hack for authorities build_authorities_query
1140 # can hopefully be removed in the future
1143 my $cache = Koha::Caches->get_instance();
1144 my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
# The cached value is fetched with unsafe => 1 and must therefore not be
# modified by the code below.
1145 my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
1146 if (!$search_fields) {
1147 # The reason we don't use Koha::SearchFields->search here is we don't
1148 # want or need resultset wrapped as Koha::SearchField object.
1149 # It does not make any sense in this context and would cause
1150 # unnecessary overhead since we are only querying for data
1151 # Also would not work, or produce strange results, with the "columns"
1153 my $schema = Koha::Database->schema;
1154 my $result = $schema->resultset('SearchField')->search(
1156 $params->{is_opac} ? (
# Boolean fields are excluded; only mappings of this index and MARC flavour
# that are flagged for searching are considered.
1161 'type' => { '!=' => 'boolean' },
1162 'search_marc_map.index_name' => $self->index,
1163 'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
1164 'search_marc_to_fields.search' => 1,
1167 columns => [qw/name weight/],
1169 join => {search_marc_to_fields => 'search_marc_map'},
# Each entry is [ name ] or [ name, weight ]; the weight element is left out
# when the field has no weight.
1173 while (my $search_field = $result->next) {
1174 push @search_fields, [
1175 lc $search_field->name,
1176 $search_field->weight ? $search_field->weight : ()
1179 $search_fields = \@search_fields;
1180 $cache->set_in_cache($cache_key, $search_fields);
1182 if ($params->{subfield}) {
1183 my $subfield = $params->{subfield};
1186 # Copy values to avoid mutating cached
1187 # data (since unsafe is used)
1188 my ($field, $weight) = @{$_};
1189 ["${field}.${subfield}", $weight];
# Weighted form: the elements of each entry are joined with '^'.
1193 if ($params->{weighted_fields}) {
1194 return [map { join('^', @{$_}) } @{$search_fields}];
1197 # Exclude weight from field
1198 return [map { $_->[0] } @{$search_fields}];