1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
44 use List::MoreUtils qw( each_array );
46 use URI::Escape qw( uri_escape_utf8 );
# Aliases for search index names: each key is an alternative name or
# abbreviation, each value is the field name used by this module.
# Consulted by _convert_index_fields and _convert_index_strings_freeform;
# a name that is not listed here is used unchanged. A reference to this
# table is handed out by get_index_field_convert.
52 our %index_field_convert = (
56 'lcn' => 'local-classification',
57 'callnum' => 'local-classification',
58 'record-type' => 'rtype',
59 'mc-rtype' => 'rtype',
61 'lc-card' => 'lc-card-number',
62 'sn' => 'local-number',
63 'biblionumber' => 'local-number',
64 'yr' => 'date-of-publication',
65 'pubdate' => 'date-of-publication',
66 'acqdate' => 'date-of-acquisition',
67 'date/time-last-modified' => 'date-time-last-modified',
68 'dtlm' => 'date-time-last-modified',
69 'diss' => 'dissertation-information',
72 'music-number' => 'identifier-publisher-for-music',
73 'number-music-publisher' => 'identifier-publisher-for-music',
74 'music' => 'identifier-publisher-for-music',
75 'ident' => 'identifier-standard',
76 'cpn' => 'corporate-name',
77 'cfn' => 'conference-name',
78 'pn' => 'personal-name',
83 'rcn' => 'record-control-number',
84 'cni' => 'control-number-identifier',
87 #'su-geo' => 'subject',
90 'se' => 'title-series',
91 'ut' => 'title-uniform',
92 'an' => 'koha-auth-number',
93 'authority-number' => 'koha-auth-number',
96 'rank' => 'relevance',
98 'wrdl' => 'st-word-list',
99 'rt' => 'right-truncation',
100 'rtrn' => 'right-truncation',
101 'ltrn' => 'left-truncation',
102 'rltrn' => 'left-and-right',
103 'mc-itemtype' => 'itemtype',
104 'mc-ccode' => 'ccode',
105 'branch' => 'homebranch',
106 'mc-loc' => 'location',
108 'stocknumber' => 'number-local-acquisition',
109 'inv' => 'number-local-acquisition',
111 'mc-itype' => 'itype',
112 'aub' => 'author-personal-bibliography',
113 'auo' => 'author-in-order',
117 'frequency-code' => 'ff8-18',
118 'illustration-code' => 'ff8-18-21',
119 'regularity-code' => 'ff8-19',
120 'type-of-serial' => 'ff8-21',
121 'format' => 'ff8-23',
122 'conference-code' => 'ff8-29',
123 'festschrift-indicator' => 'ff8-30',
124 'index-indicator' => 'ff8-31',
127 'literature-code' => 'lf',
128 'biography' => 'bio',
130 'biography-code' => 'bio',
131 'l-format' => 'ff7-01-02',
132 'lex' => 'lexile-number',
133 'hi' => 'host-item-number',
134 'itu' => 'index-term-uncontrolled',
135 'itg' => 'index-term-genre',
# Regex fragments interpolated by _convert_index_strings_freeform.
# A field name is one or more word characters or hyphens ...
137 my $field_name_pattern = '[\w\-]+';
# ... optionally followed by any number of ".name" suffixes.
138 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
140 =head2 get_index_field_convert
142 my $index_field_convert = Koha::SearchEngine::Elasticsearch::QueryBuilder->get_index_field_convert();
144 Converts zebra-style search index notation into elasticsearch-style.
146 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
147 and it returns something that can be sent to L<build_query>.
149 B<TODO>: this will pull from the elasticsearch mappings table to figure out
154 sub get_index_field_convert() {
# Returns a reference to the package-level %index_field_convert table itself
# (not a copy), so changes made through the reference alter the shared table.
# NOTE(review): the empty prototype "()" has no effect when this is invoked
# as a method; consider dropping it.
155 return \%index_field_convert;
160 my $simple_query = $builder->build_query("hello", %options)
162 This will build a query that can be issued to elasticsearch from the provided
163 string input. This expects a lucene style search form (see
164 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
167 It'll make an attempt to respect the various query options.
169 Additional options can be provided with the C<%options> hash.
175 This should be an arrayref of hashrefs, each containing a C<field> and a
176 C<direction> (optional, defaults to C<asc>.) The results will be sorted
177 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Arguments: the invocant, the query string and an options hash. The option
# keys read below are is_opac, weighted_fields, whole_record and sort.
184 my ( $self, $query, %options ) = @_;
# NOTE(review): of these three preferences only $fuzzy_enabled is referenced
# again in the code below; $stemming and $auto_truncation look unused - confirm.
186 my $stemming = C4::Context->preference("QueryStemming") || 0;
187 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
188 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query is replaced by '*'.
190 $query = '*' unless defined $query;
# Fields to search, as selected by _search_fields for these options.
193 my $fields = $self->_search_fields({
194 is_opac => $options{is_opac},
195 weighted_fields => $options{weighted_fields},
# whole_record additionally searches everything under marc_data_array.
197 if ($options{whole_record}) {
198 push @$fields, 'marc_data_array.*';
# Fuzziness is 'auto' only when QueryFuzzy is enabled, otherwise '0';
# the default operator between terms is AND.
203 fuzziness => $fuzzy_enabled ? 'auto' : '0',
204 default_operator => 'AND',
206 lenient => JSON::true,
207 analyze_wildcard => JSON::true,
# The ElasticsearchCrossFields preference switches the query_string type.
210 $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
# Each sort entry is a hashref with 'field' and an optional 'direction'.
# A direction other than 'asc' or 'desc' is fatal; a missing one means 'asc'.
212 if ( $options{sort} ) {
213 foreach my $sort ( @{ $options{sort} } ) {
214 my ( $f, $d ) = @$sort{qw/ field direction /};
215 die "Invalid sort direction, $d"
216 if $d && ( $d ne 'asc' && $d ne 'desc' );
217 $d = 'asc' unless $d;
# Translate the field name with _sort_field before adding it to the sort list.
219 $f = $self->_sort_field($f);
220 push @{ $res->{sort} }, { $f => { order => $d } };
224 # See _convert_facets in Search.pm for how these get turned into
225 # things that Koha can use.
# One terms aggregation per facet, each limited to FacetMaxCount entries.
226 my $size = C4::Context->preference('FacetMaxCount');
227 $res->{aggregations} = {
228 author => { terms => { field => "author__facet" , size => $size } },
229 subject => { terms => { field => "subject__facet", size => $size } },
230 itype => { terms => { field => "itype__facet", size => $size} },
231 location => { terms => { field => "location__facet", size => $size } },
232 'su-geo' => { terms => { field => "su-geo__facet", size => $size} },
233 'title-series' => { terms => { field => "title-series__facet", size => $size } },
234 ccode => { terms => { field => "ccode__facet", size => $size } },
235 ln => { terms => { field => "ln__facet", size => $size } },
# Library facets depend on DisplayLibraryFacets: 'home' adds homebranch,
# 'holding' adds holdingbranch, 'both' adds both.
238 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
239 if ( $display_library_facets eq 'both'
240 or $display_library_facets eq 'home' ) {
241 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
243 if ( $display_library_facets eq 'both'
244 or $display_library_facets eq 'holding' ) {
245 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
250 =head2 build_query_compat
253 $error, $query, $simple_query, $query_cgi,
254 $query_desc, $limit, $limit_cgi, $limit_desc,
255 $stopwords_removed, $query_type
257 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
258 \@limits, \@sort_by, $scan, $lang, $params );
260 This handles a search using the same api as L<C4::Search::buildQuery> does.
262 A very simple query will go in with C<$operands> set to ['query'], and
263 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
264 C<$query> set to something that can perform the search, C<$simple_query>
265 set to just the search term, C<$query_cgi> set to something that can
266 reproduce this search, and C<$query_desc> set to something else.
270 sub build_query_compat {
# Builds a query, plus the CGI and description strings that go with it, from
# parallel arrayrefs of operators, operands and indexes and an arrayref of
# limits. Note that @$operators is temporarily modified (unshift, later
# shift) while the search parameters are assembled.
271 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
277 my $search_param_query_str = '';
# Scan query: built by _build_scan_query from the operands and indexes; the
# string it returns doubles as the search-parameter string.
280 ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
281 $search_param_query_str = $query_str;
# Regular search: convert sort and index names and normalise the limits.
283 my @sort_params = $self->_convert_sort_fields(@$sort_by);
284 my @index_params = $self->_convert_index_fields(@$indexes);
285 $limits = $self->_fix_limit_special_cases($orig_limits);
# $params->{suppress} adds the limit "suppress:false".
286 if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
287 # Merge the indexes in with the search terms and the operands so that
288 # each search thing is a handy unit.
289 unshift @$operators, undef; # The first one can't have an op
# Empty operands are skipped. The others are cleaned, passed through
# _truncate_terms when QueryAutoTruncate is on, and stored together with the
# upper-cased operator and the converted index (field/type) if there is one.
291 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
292 my $ea = each_array( @$operands, @$operators, @index_params );
293 while ( my ( $oand, $otor, $index ) = $ea->() ) {
294 next if ( !defined($oand) || $oand eq '' );
295 $oand = $self->clean_search_term($oand);
296 $oand = $self->_truncate_terms($oand) if ($truncate);
297 push @search_params, {
298 operand => $oand, # the search terms
299 operator => defined($otor) ? uc $otor : undef, # AND and so on
300 $index ? %$index : (),
304 # We build a string query from limits and the queries. An alternative
305 # would be to pass them separately into build_query and let it build
306 # them into a structured ES query itself. Maybe later, though that'd be
308 my @search_param_query_array = $self->_create_query_string(@search_params);
309 $search_param_query_str = join( ' ', @search_param_query_array );
310 my $search_param_limit_str =
311 $self->_join_queries( $self->_convert_index_strings(@$limits) );
# With several query parts and at least one limit, the query part is
# parenthesised before the limits are ANDed on.
312 if ( @search_param_query_array > 1 && $search_param_limit_str ) {
313 $search_param_query_str = "($search_param_query_str)";
# Empty pieces are left out of the join.
315 $query_str = join( ' AND ',
316 $search_param_query_str || (),
317 $search_param_limit_str || () );
319 # If there's no query on the left, let's remove the junk left behind
320 $query_str =~ s/^ AND //;
# Options forwarded to build_query.
322 $options{sort} = \@sort_params;
323 $options{is_opac} = $params->{is_opac};
324 $options{weighted_fields} = $params->{weighted_fields};
325 $options{whole_record} = $params->{whole_record};
326 $query = $self->build_query( $query_str, %options );
329 # We roughly emulate the CGI parameters of the zebra query builder
331 shift @$operators; # Shift out the one we unshifted before
# One "idx=...&q=..." pair per operand (original, unconverted index names),
# followed by "&op=..." when an operator is present.
332 my $ea = each_array( @$operands, @$operators, @$indexes );
333 while ( my ( $oand, $otor, $index ) = $ea->() ) {
334 $query_cgi .= '&' if $query_cgi;
335 $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
336 $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
338 $query_cgi .= '&scan=1' if ( $scan );
# $simple_query is only set when there is exactly one operand.
341 $simple_query = $operands->[0] if @$operands == 1;
# The description is the simple query when there is one, otherwise the
# assembled search-parameter string.
343 if ( $simple_query ) {
344 $query_desc = $simple_query;
346 $query_desc = $search_param_query_str;
# Limit strings: the converted/joined form, and the CGI form built from the
# original (unconverted) limits.
348 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
349 my $limit_cgi = ( $orig_limits and @$orig_limits )
350 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
353 $limit_desc = "$limit" if $limit;
# The first element and the last two elements of the result are always undef.
356 undef, $query, $simple_query, $query_cgi, $query_desc,
357 $limit, $limit_cgi, $limit_desc, undef, undef
361 =head2 build_authorities_query
363 my $query = $builder->build_authorities_query(\%search);
365 This takes a nice description of an authority search and turns it into a black-box
366 query that can then be passed to the appropriate searcher.
368 The search description is a hashref that looks something like:
373 where => 'Heading', # search the main entry
374 operator => 'exact', # require an exact match
375 value => 'frogs', # the search string
378 where => '', # search all entries
379 operator => '', # default keyword, right truncation
387 authtypecode => 'TOPIC_TERM',
392 sub build_authorities_query {
# Turns a search description hashref into a query structure. Every entry of
# $search->{searches} supplies where/operator/value and contributes one
# element to @query_parts; these are combined under bool => must. A true
# authtypecode is added as a bool filter on "authtype.raw", and
# $search->{sort}, when the key exists, is attached as the sort.
393 my ( $self, $search ) = @_;
395 # Start by making the query parts
398 foreach my $s ( @{ $search->{searches} } ) {
399 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
# Operators 'is', '=' and 'exact' all request a whole-field match.
400 if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
402 # Match the whole field, case insensitive, UTF normalized.
403 push @query_parts, { term => { "$wh.ci_raw" => $val } };
406 # Match the whole field for all searchable fields, case insensitive,
408 # Given that field data is "The quick brown fox"
409 # "The quick brown fox" and "the quick brown fox" will match
410 # but not "quick brown fox".
414 fields => $self->_search_fields({ subfield => 'ci_raw' }),
# Operator 'start' requests a prefix match.
419 elsif ( defined $op && $op eq 'start') {
420 # Match the prefix within a field for all searchable fields.
421 # Given that field data is "The quick brown fox"
422 # "The quick bro" will match, but not "quick bro"
424 # There does not seem to be a multi prefix query
425 # so we need to create one
427 # Match prefix of the field.
428 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
# One prefix query per searchable field; at least one of them must match.
432 foreach my $field (@{$self->_search_fields()}) {
433 push @prefix_queries, {
434 prefix => { "$field.ci_raw" => $val }
439 'should' => \@prefix_queries,
440 'minimum_should_match' => 1
446 # Query all searchable fields.
447 # Given that field data is "The quick brown fox"
448 # a search containing any of the words will match, regardless
# Each token is passed through clean_search_term and _truncate_terms in place
# ($token aliases the array element), then the tokens are joined into a
# single query string.
451 my @tokens = $self->_split_query( $val );
452 foreach my $token ( @tokens ) {
453 $token = $self->_truncate_terms(
454 $self->clean_search_term( $token )
457 my $query = $self->_join_queries( @tokens );
460 lenient => JSON::true,
461 analyze_wildcard => JSON::true,
# The query_string targets either the single field $wh or the list returned
# by _search_fields.
464 $query_string->{default_field} = $wh;
467 $query_string->{fields} = $self->_search_fields();
469 push @query_parts, { query_string => $query_string };
473 # Merge the query parts appropriately
474 # 'should' behaves like 'or'
475 # 'must' behaves like 'and'
476 # Zebra behaviour seems to match 'must', so that is used here
477 my $elastic_query = {};
478 $elastic_query->{bool}->{must} = \@query_parts;
480 # Filter by authtypecode if set
481 if ($search->{authtypecode}) {
482 $elastic_query->{bool}->{filter} = {
484 "authtype.raw" => $search->{authtypecode}
490 query => $elastic_query
# The caller's sort specification is wrapped in a one-element array.
494 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
499 =head2 build_authorities_query_compat
502 $builder->build_authorities_query_compat( \@marclist, \@and_or,
503 \@excluding, \@operator, \@value, $authtypecode, $orderby );
505 This builds a query for searching for authorities, in the style of
506 L<C4::AuthoritiesMarc::SearchAuthorities>.
514 An arrayref containing where the particular term should be searched for.
515 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
516 thesaurus. If left blank, any field is used.
520 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
528 What form of search to do. Options are: is (phrase, no truncation, whole field
529 must match), = (number exact match), exact (phrase, no truncation, whole field
530 must match). If left blank, then word list, right truncated, anywhere is used.
534 The actual user-provided string value to search for.
538 The authority type code to search within. If blank, then all will be searched.
542 The order to sort the results by. Options are Relevance, HeadingAsc,
543 HeadingDsc, AuthidAsc, AuthidDsc.
547 marclist, operator, and value must be the same length, and the values at
548 index /i/ all relate to each other.
550 This returns a query, which is a black box object that can be passed to the
551 appropriate search object.
# Maps the marclist names accepted by build_authorities_query_compat to index
# names. A name that is not listed here is used as given.
555 our $koha_to_index_name = {
556 mainmainentry => 'heading-main',
557 mainentry => 'heading',
559 'match-heading' => 'match-heading',
560 'see-from' => 'match-heading-see-from',
561 thesaurus => 'subject-heading-thesaurus',
562 'thesaurus-conventions' => 'subject-heading-thesaurus-conventions',
# Looked up by build_authorities_query_compat to rewrite a search value when
# the field being searched is 'subject-heading-thesaurus'.
567 # Note that sears and aat map to 008/11 values here
568 # but don't appear in C4/Heading/MARC21 thesaurus
569 # because they don't have values in controlled field indicators
570 # https://www.loc.gov/marc/authority/ad008.html
571 our $thesaurus_to_value = {
583 sub build_authorities_query_compat {
# Converts the positional, parallel-array form of an authority search into
# the hash form taken by build_authorities_query and returns that method's
# query. NOTE(review): $and_or and $excluding are not referenced in the code
# below - confirm they are intentionally ignored.
584 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
585 $authtypecode, $orderby )
588 # This turns the old-style many-options argument form into a more
589 # extensible hash form that is understood by L<build_authorities_query>.
591 my $mappings = $self->get_elasticsearch_mappings();
593 # Convert to lower case
# $marclist is replaced by a lower-cased copy, so the caller's array is not
# modified. NOTE(review): "lc $orderby" warns under 'use warnings' if
# $orderby is undef - confirm callers always supply it.
594 $marclist = [map(lc, @{$marclist})];
595 $orderby = lc $orderby;
598 # Make sure everything exists
# Known Koha names are translated through $koha_to_index_name; $m aliases the
# element of the (copied) list, so the translation is stored in place.
599 foreach my $m (@$marclist) {
601 $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
# Only a warning: an unknown field does not abort the search. The empty name
# and 'match-heading' are accepted without a mapping.
603 warn "Unknown search field $m in marclist" unless (defined $mappings->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
# One search entry per value. False values are skipped (this includes the
# string '0'). For the subject-heading-thesaurus field a value found in
# $thesaurus_to_value is replaced by its mapped value.
605 for ( my $i = 0 ; $i < @$value ; $i++ ) {
606 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
607 $value->[$i] = $thesaurus_to_value->{ $value->[$i] }
608 if( defined $thesaurus_to_value->{ $value->[$i] } && $indexes[$i] eq 'subject-heading-thesaurus' );
611 where => $indexes[$i],
612 operator => $operator->[$i],
613 value => $value->[$i],
# Sort field: "heading..." sorts on heading__sort, "auth..." on
# local-number__sort.
619 ( $orderby =~ /^heading/ ) ? 'heading__sort'
620 : ( $orderby =~ /^auth/ ) ? 'local-number__sort'
# Ascending only when $orderby ends in "asc"; anything else sorts descending.
623 my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
624 %sort = ( $sort_field => $sort_order, );
627 searches => \@searches,
628 authtypecode => $authtypecode,
# The sort key is only added when a sort was determined.
630 $search{sort} = \%sort if %sort;
631 my $query = $self->build_authorities_query( \%search );
635 =head2 _build_scan_query
637 my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
639 This will build an aggregation scan query that can be issued to elasticsearch from
640 the provided string input.
# Index-name aliases applied by _build_scan_query before the "__facet" suffix
# is appended; names that are not listed are used unchanged.
644 our %scan_field_convert = (
648 'se' => 'title-series',
652 sub _build_scan_query {
# Returns a two-element list: the query structure and the (uncleaned) term.
653 my ( $self, $operands, $indexes ) = @_;
# Only the first operand and the first index are used. With no operands the
# term is '', with no indexes the index is 'subject'.
655 my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
656 my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
# Only the part of the index name before the first comma is looked up.
658 my ( $f, $d ) = split( /,/, $index);
659 $index = $scan_field_convert{$f} || $f;
# Aggregate on "<index>__facet", ordered by key ascending, including only
# values matching the pattern built from the cleaned term followed by '.*'.
667 $res->{aggregations} = {
670 field => $index . '__facet',
671 order => { '_key' => 'asc' },
672 include => $self->_create_regex_filter($self->clean_search_term($term)) . '.*'
676 return ($res, $term);
679 =head2 _create_regex_filter
681 my $filter = $builder->_create_regex_filter('term')
683 This will create a regex filter that can be used with an aggregation query.
687 sub _create_regex_filter {
688 my ($self, $term) = @_;
# The term is escaped with quotemeta and then processed one character at a
# time (this includes the backslashes quotemeta inserted).
691 foreach my $c (split(//, quotemeta($term))) {
# When $lc and $uc differ the character is replaced by a two-character class
# such as "[aA]"; otherwise it is copied unchanged.
# (presumably $lc/$uc are the lower/upper-case forms of $c - confirm)
694 $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
699 =head2 _convert_sort_fields
701 my @sort_params = $self->_convert_sort_fields(@sort_by)
703 Converts the zebra-style sort index information into elasticsearch-style.
705 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
706 something that can be sent to L<build_query>.
710 sub _convert_sort_fields {
# Returns a list of { field, direction } hashrefs, one per recognised entry
# of @sort_by.
711 my ( $self, @sort_by ) = @_;
713 # Turn the sorting into something we care about.
714 my %sort_field_convert = (
715 acqdate => 'date-of-acquisition',
717 call_number => 'cn-sort',
718 popularity => 'issues',
719 relevance => undef, # default
721 pubdate => 'date-of-publication',
722 biblionumber => 'local-number',
# Accepted order suffixes and the direction each one stands for.
724 my %sort_order_convert =
725 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
727 # Convert the fields and orders, drop anything we don't know about.
# Entries whose field converts to a false value are removed by the grep;
# that includes 'relevance', which maps to undef.
728 grep { $_->{field} } map {
# Each entry has the form "<field>_<order>". The first group is greedy, so
# the split happens at the last underscore (e.g. "call_number_asc").
729 my ( $f, $d ) = /(.+)_(.+)/;
731 field => $sort_field_convert{$f},
732 direction => $sort_order_convert{$d}
737 sub _convert_index_fields {
# Converts index specifications of the form "field" or "field,type" into
# hashrefs with 'field' and 'type' keys.
738 my ( $self, @indexes ) = @_;
# Recognised type qualifiers; a missing qualifier selects __default (undef).
740 my %index_type_convert =
741 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
743 @indexes = grep { $_ ne q{} } @indexes; # Remove any blank indexes, i.e. keyword
745 # Convert according to our table, drop anything that doesn't convert.
746 # If a field starts with mc- we save it as it's used (and removed) later
747 # when joining things, to indicate we make it an 'OR' join.
748 # (Sorry, this got a bit ugly after special cases were found.)
750 # Lower case all field names
751 my ( $f, $t ) = map(lc, split /,/);
# Field names without an entry in %index_field_convert are kept as they are;
# a qualifier that is not in %index_type_convert yields an undef type.
758 field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
759 type => $index_type_convert{ $t // '__default' }
# Re-attach the saved prefix held in $mc, if there is one.
761 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
# An entry with neither a field nor a type becomes undef.
762 $r->{field} || $r->{type} ? $r : undef;
766 =head2 _convert_index_strings
768 my @searches = $self->_convert_index_strings(@searches);
770 Similar to L<_convert_index_fields>, this takes strings of the form
771 B<field:search term> and rewrites the field from zebra-style to
772 elasticsearch-style. Anything it doesn't understand is returned verbatim.
776 sub _convert_index_strings {
777 my ( $self, @searches ) = @_;
779 foreach my $s (@searches) {
# Split at the first colon: the field part may only contain word characters,
# commas and hyphens (and may be empty); the term is the rest of the string.
781 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
# Strings without such a "field:" prefix are handled separately here.
782 unless ( defined($field) && defined($term) ) {
# Convert the field part; an undefined result is handled separately.
786 my ($conv) = $self->_convert_index_fields($field);
787 unless ( defined($conv) ) {
# Emit "field:term" (or just the term when no field name resulted), with the
# term adjusted for the index type by _modify_string_by_type.
791 push @res, ($conv->{field} ? $conv->{field} . ':' : '')
792 . $self->_modify_string_by_type( %$conv, operand => $term );
797 =head2 _convert_index_strings_freeform
799 my $search = $self->_convert_index_strings_freeform($search);
801 This is similar to L<_convert_index_strings>, however it'll search out the
802 things to change within the string. So it can handle strings such as
803 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
805 If there is something of the form "su,complete-subfield" or something, the
806 second part is stripped off as we can't yet handle that. Making it work
807 will have to wait for a real query parser.
811 sub _convert_index_strings_freeform {
812 my ( $self, $search ) = @_;
813 # @TODO: Currently will alter also fields contained within quotes:
814 # `searching for "stuff cn:123"` for example will become
815 # `searching for "stuff local-number:123"
817 # Fixing this is tricky, one possibility:
818 # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
819 # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
821 # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
822 # them back when processing is done.
824 # Lower case field names
# "Field,qualifier.sub:" becomes "field.sub:": the field name is lower-cased,
# an optional ",qualifier" is dropped and any ".sub" suffixes are kept as is.
825 $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
826 # Resolve possible field aliases
# Names found in %index_field_convert are replaced by their mapped value,
# other names are kept. For the field 'kw' the colon is not re-appended.
827 $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1).($1 eq 'kw' ? "$2" : "$2:")/oge;
831 =head2 _modify_string_by_type
833 my $str = $self->_modify_string_by_type(%index_field);
835 If you have a search term (operand) and a type (phrase, right-truncated), this
836 will convert the string to have the function in lucene search terms, e.g.
837 wrapping quotes around it.
841 sub _modify_string_by_type {
# Takes a hash with 'operand' (the search term) and optionally 'type'; other
# keys are ignored here.
842 my ( $self, %idx ) = @_;
844 my $type = $idx{type} || '';
845 my $str = $idx{operand};
# Any false operand (undef, '' or '0') is returned unchanged.
846 return $str unless $str; # Empty or undef, we can't use it.
# right-truncate: append a wildcard.
848 $str .= '*' if $type eq 'right-truncate';
# phrase: wrap in double quotes unless the string is already quoted.
849 $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
# st-year: "A-B" becomes the range "[A TO B]". A missing (or false, e.g. '0')
# side becomes '*'. The first group is greedy, so the split is at the last
# hyphen. A value without a hyphen is left as it is.
850 if ($type eq 'st-year') {
851 if ($str =~ /^(.*)-(.*)$/) {
852 my $from = $1 || '*';
853 my $until = $2 || '*';
854 $str = "[$from TO $until]";
862 my $query_str = $self->_join_queries(@query_parts);
864 This takes a list of query parts, that might be search terms on their own, or
865 booleaned together, or specifying fields, or whatever, wraps them in
866 parentheses, and ANDs them all together. Suitable for feeding to the ES
869 Note: doesn't AND them together if they specify an index that starts with "mc"
870 as that was a special case in the original code for dealing with multiple
871 choice options (you can't search for something that has an itype of A and
872 an itype of B otherwise.)
# Joins query parts into one string; returns an empty list when there is
# nothing to join.
877 my ( $self, @parts ) = @_;
# Undefined and empty parts are ignored. The remaining parts are split into
# ordinary ones and those starting with "mc-" (the prefix is stripped).
879 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
881 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
# No parts: empty list. Exactly one part: returned as is, without parentheses.
882 return () unless @norm_parts + @mc_parts;
883 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
885 # Group limits by field, so they can be OR'ed together
# Each "field:value" is split at the first colon only.
887 foreach my $mc_part (@mc_parts) {
888 my ($field, $value) = split /:/, $mc_part, 2;
889 $mc_limits{$field} //= [];
890 push @{ $mc_limits{$field} }, $value;
# One "field:(v1 OR v2 ...)" string per field, in sorted field order.
894 sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
895 } sort keys %mc_limits;
# Ordinary parts are each wrapped in parentheses, then everything is ANDed.
897 @norm_parts = map { "($_)" } @norm_parts;
899 return join( ' AND ', @norm_parts, @mc_parts);
904 my @phrased_queries = $self->_make_phrases(@query_parts);
906 This takes the supplied queries and forces them to be phrases by wrapping
907 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
908 the quotes outside of them if they're there.
913 my ( $self, @parts ) = @_;
# For a part with a leading "field:" prefix, everything after the prefix is
# wrapped in double quotes. Parts without such a prefix do not match and are
# returned unchanged. s///r works on a copy, so @parts is not modified.
914 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
917 =head2 _create_query_string
919 my @query_strings = $self->_create_query_string(@queries);
921 Given a list of hashrefs, it will turn them into a lucene-style query string.
922 The hash should contain field, type (both for the indexes), operator, and
927 sub _create_query_string {
928 my ( $self, @queries ) = @_;
# Operator (followed by a space) and field (followed by a colon) are only
# emitted when present.
931 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
932 my $field = $_->{field} ? $_->{field} . ':' : '';
# The operand is adjusted for its index type first.
934 my $oand = $self->_modify_string_by_type(%$_);
# A multi-word operand with a field is parenthesised so the field applies to
# every word; st-year operands are exempt.
935 $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
# Result for this entry: "OPERATOR (field:operand)".
936 "$otor($field$oand)";
940 =head2 clean_search_term
942 my $term = $self->clean_search_term($term);
944 This cleans a search term by removing any funny characters that may upset
945 ES and give us an error. It also calls L<_convert_index_strings_freeform>
946 to ensure those parts are correct.
950 sub clean_search_term {
951 my ( $self, $term ) = @_;
953 # Lookahead that succeeds only when an even number of double quotes follows,
# i.e. when the current position is outside a quoted section
954 my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
956 # Some hardcoded searches (like with authorities) produce things like
957 # 'an=123', when it ought to be 'an:123' for our purposes.
# Lower-case field names and resolve field aliases.
960 $term = $self->_convert_index_strings_freeform($term);
962 # Remove unbalanced quotes
# tr/// returns the number of double quotes while blanking them in the copy;
# an odd count means the quotes are unbalanced.
963 my $unquoted = $term;
964 my $count = ($unquoted =~ tr/"/ /);
965 if ($count % 2 == 1) {
# Apply the slash-escaping policy from QueryRegexEscapeOptions.
968 $term = $self->_query_regex_escape_process($term);
970 # because of _truncate_terms and if QueryAutoTruncate enabled
971 # we will have any special operators ruined by _truncate_terms:
972 # for ex. search for "test [6 TO 7]" will be converted to "test* [6* TO* 7]"
973 # so no reason to keep ranges in QueryAutoTruncate==true case:
974 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
976 # replace all ranges with any square/curly brackets combinations to temporary substitutions (ex: "{a TO b]"" -> "~~LC~~a TO b~~RS~~")
977 # (where L is for left and C is for Curly and so on)
980 (?<backslashes>(?:[\\]{2})*)
981 (?<leftbracket>\{|\[)
983 [^\s\[\]\{\}]+\ TO\ [^\s\[\]\{\}]+
987 (?<rightbracket>\}|\])
988 /$+{backslashes}.'~~L'.($+{leftbracket} eq '[' ? 'S':'C').'~~'.$+{ranges}.'~~R'.($+{rightbracket} eq ']' ? 'S':'C').'~~'/gex;
990 # save all regex contents away before escaping brackets:
991 # (same trick as with brackets above, just RE for 'RegularExpression')
997 (?:[^/]+|(?<=\\)(?:[\\]{2})*/)+
999 )$lookahead@~~RE$rgx_i~~@x
# NOTE(review): the next line assigns a scalar through a one-element array
# slice; $saved_regexes[$rgx_i++] would be the conventional form.
1001 @saved_regexes[$rgx_i++] = $1;
1004 # remove leading and trailing colons mixed with optional slashes and spaces
1005 $term =~ s/^([\s\\]*:\s*)+//;
1006 $term =~ s/([\s\\]*:\s*)+$//;
1007 # remove unquoted colons that have whitespace on either side of them
1008 $term =~ s/([\s\\]*:\s*)+(\s+)$lookahead/$2/g;
1009 $term =~ s/(\s+)([\s\\]*:\s*)+$lookahead/$1/g;
1010 # replace with spaces all repeated colons no matter how they surrounded with spaces and slashes
1011 $term =~ s/([\s\\]*:\s*){2,}$lookahead/ /g;
1012 # screen all followups for colons after first colon,
1013 # and correctly ignore unevenly backslashed:
1014 $term =~ s/((?<!\\)(?:[\\]{2})*:[^:\s]+(?<!\\)(?:[\\]{2})*)(?=:)/$1\\/g;
1016 # screen all exclamation signs that either are the last symbol or have white space after them
1017 # or are followed by close parentheses
1018 $term =~ s/(?:[\s\\]*!\s*)+(\s|$|\))/$1/g;
1020 # screen all brackets with backslash
1021 $term =~ s/(?<!\\)(?:[\\]{2})*([\{\}\[\]])$lookahead/\\$1/g;
1023 # restore all regex contents after escaping brackets:
# Each ~~REn~~ placeholder is replaced once with the text saved for it.
1024 for (my $i = 0; $i < @saved_regexes; $i++) {
1025 $term =~ s/~~RE$i~~/$saved_regexes[$i]/;
1028 # restore temporary weird substitutions back to normal brackets
1029 $term =~ s/~~L(C|S)~~([^\s\[\]\{\}]+ TO [^\s\[\]\{\}]+)~~R(C|S)~~/($1 eq 'S' ? '[':'{').$2.($3 eq 'S' ? ']':'}')/ge;
1034 =head2 _query_regex_escape_process
1036 my $query = $self->_query_regex_escape_process($query);
1038 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
1042 sub _query_regex_escape_process {
# Applies the QueryRegexEscapeOptions preference to forward slashes:
#   'dont_escape'      - the query is not modified
#   'escape'           - a backslash is added before each unescaped slash
#   'unescape_escaped' - escaped slashes lose their backslash and unescaped
#                        slashes gain one
# Other values leave the query untouched by the code shown here.
1043 my ($self, $query) = @_;
1044 my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
1045 if ($regex_escape_options ne 'dont_escape') {
1046 if ($regex_escape_options eq 'escape') {
1047 # Will escape unescaped slashes (/) while preserving
1048 # unescaped slashes within quotes
1049 # @TODO: assumes quotes are always balanced and will
1050 # not handle escaped quotes properly, should perhaps be
1051 # replaced with a more general parser solution
1052 # so that this function is ever only provided with unquoted
1054 $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
1056 elsif($regex_escape_options eq 'unescape_escaped') {
1057 # Will unescape escaped slashes (\/) and escape
1058 # unescaped slashes (/) while preserving slashes within quotes
1059 # The same limitations as above apply for handling of quotes
# $1 is an odd run of backslashes (an escaped slash): drop its last backslash.
# $2 is an even, possibly empty, run (an unescaped slash): append a backslash.
1060 $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
1066 =head2 _fix_limit_special_cases
1068 my $limits = $self->_fix_limit_special_cases($limits);
1070 This converts any special cases that the limit specifications have into things
1071 that are more readily processable by the rest of the code.
1073 The argument should be an arrayref, and it'll return an arrayref.
1077 sub _fix_limit_special_cases {
# Rewrites each limit string into a query fragment collected in @new_lim.
# The branches below are tried in order; the first pattern that matches wins.
1078 my ( $self, $limits ) = @_;
1081 foreach my $l (@$limits) {
1083 # This is set up by opac-search.pl
# Year range "yr,st-numeric,ge=A and yr,st-numeric,le=B" becomes
# "date-of-publication:[A TO B]"; a limit without the second half is dropped.
1084 if ( $l =~ /^yr,st-numeric,ge[=:]/ ) {
1085 my ( $start, $end ) =
1086 ( $l =~ /^yr,st-numeric,ge[=:](.*) and yr,st-numeric,le[=:](.*)$/ );
1087 next unless defined($start) && defined($end);
1088 push @new_lim, "date-of-publication:[$start TO $end]";
# Saved search filter, referenced by id; an unknown id drops the limit.
1090 elsif( $l =~ /^search_filter:/ ){
1091 # Here we are going to get the query as a string, clean it, and take care of the part of the limit
1092 # Calling build_query_compat here is avoided because we generate more complex query structures
1093 my ($filter_id) = ( $l =~ /^search_filter:(.*)$/ );
1094 my $search_filter = Koha::SearchFilters->find( $filter_id );
1095 next unless $search_filter;
1096 my ($expanded_lim,$query_lim) = $search_filter->expand_filter;
1097 # In the case of nested filters we need to expand them all
1098 foreach my $el ( @{$self->_fix_limit_special_cases($expanded_lim)} ){
1101 # We need to clean the query part as we have built a string from the original search
1102 push @new_lim, $self->clean_search_term( $query_lim );
# Single year or year range, formatted by the 'st-year' type handling.
1104 elsif ( $l =~ /^yr,st-numeric[=:]/ ) {
1105 my ($date) = ( $l =~ /^yr,st-numeric[=:](.*)$/ );
1106 next unless defined($date);
1107 $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1108 push @new_lim, "date-of-publication:$date";
# Library limits: a library group ("multibranchlimit") or a single branch.
# SearchLimitLibrary decides which branch field(s) the codes are matched on.
1110 elsif ( $l =~ 'multibranchlimit|^branch' ) {
1111 my $branchfield = C4::Context->preference('SearchLimitLibrary');
1113 if( $l =~ 'multibranchlimit' ) {
1114 my ($group_id) = ( $l =~ /^multibranchlimit:(.*)$/ );
# NOTE(review): the result of find() is used without a check; if no group
# matches $group_id the all_libraries call below is made on undef and dies.
1115 my $search_group = Koha::Library::Groups->find( $group_id );
1116 @branchcodes = map { $_->branchcode } $search_group->all_libraries;
1117 @branchcodes = sort { $a cmp $b } @branchcodes;
1119 @branchcodes = ( $l =~ /^branch:(.*)$/ );
1123 # We quote the branchcodes here to prevent issues when codes are reserved words in ES, e.g. OR, AND, NOT, etc.
1124 if ( $branchfield eq "homebranch" ) {
1125 push @new_lim, sprintf "(%s)", join " OR ", map { 'homebranch: "' . $_ . '"' } @branchcodes;
1127 elsif ( $branchfield eq "holdingbranch" ) {
1128 push @new_lim, sprintf "(%s)", join " OR ", map { 'holdingbranch: "' . $_ . '"' } @branchcodes;
# Remaining case: match on either the home or the holding branch.
1131 push @new_lim, sprintf "(%s OR %s)",
1132 join( " OR ", map { 'homebranch: "' . $_ . '"' } @branchcodes ),
1133 join( " OR ", map { 'holdingbranch: "' . $_ . '"' } @branchcodes );
# The bare word "available" becomes "available:true".
1137 elsif ( $l =~ /^available$/ ) {
1138 push @new_lim, 'available:true';
# Keyword limits drop the "kw" field name; with a ",phr" qualifier the term
# is quoted as a phrase, otherwise it is used as is.
1140 elsif ( $l =~ /^\s*(kw\b[\w,-]*?):(.*)/) {
1141 my ( $field, $term ) = ($1, $2);
1142 if ( defined($field) && defined($term) && $field =~ /,phr$/) {
1143 push @new_lim, "(\"$term\")";
1146 push @new_lim, $term;
# Any other "field:term" limit is emitted as field:("term").
# NOTE(review): the substitution on $field runs before the defined() check,
# so a limit without a colon would raise an "uninitialized" warning first.
1150 my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1151 $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1152 if ( defined($field) && defined($term) ) {
1153 push @new_lim, "$field:(\"$term\")";
=head2 _sort_field

    my $field = $self->_sort_field($field);

Given a field name, this works out what the actual name of the field to sort
on should be. A '__sort' suffix is added for fields with a sort version (or
with no sort setting at all); text fields without a sort version get '.raw'
appended to avoid sorting on a tokenized value. Any other field name is
returned unchanged.

=cut

sub _sort_field {
    my ( $self, $f ) = @_;

    # Walk the mappings one level at a time: a chained lookup such as
    # $mappings->{properties}{$f}{type} would autovivify an empty entry for
    # every unknown field inside the mappings structure we were handed.
    my $mappings   = $self->get_elasticsearch_mappings();
    my $properties = ref($mappings) eq 'HASH' ? $mappings->{properties} : undef;
    my $field_def  = ref($properties) eq 'HASH' ? $properties->{$f} : undef;
    my $is_text =
         ref($field_def) eq 'HASH'
      && defined $field_def->{type}
      && $field_def->{type} eq 'text';

    # undef (no setting) and any true value both select the sort version.
    my $sortable = $self->sort_fields()->{$f};
    if ( !defined $sortable || $sortable ) {
        $f .= '__sort';
    }
    elsif ($is_text) {

        # We need to add '.raw' to text fields without a sort field,
        # otherwise it'll sort based on the tokenised form.
        $f .= '.raw';
    }

    return $f;
}
=head2 _truncate_terms

    my $query = $self->_truncate_terms($query);

Given a string query this function appends the '*' wildcard to all terms
except the boolean operators (and, or, not, in any case) and terms that end
in a non-word character, such as double quoted strings. The resulting terms
are joined with single spaces.

=cut

sub _truncate_terms {
    my ( $self, $query ) = @_;

    my @tokens = $self->_split_query($query);

    # Filter out empty tokens
    my @words = grep { $_ !~ /^\s*$/ } @tokens;

    # Append '*' to words if needed, ie. if it ends in a word character and
    # is not a keyword
    my %is_operator = map { $_ => 1 } qw/and or not/;
    my @terms =
      map { ( /\W$/ || $is_operator{ lc $_ } ) ? $_ : "$_*" } @words;

    return join ' ', @terms;
}
=head2 _split_query

    my @token = $self->_split_query($query_str);

Given a string query this function splits it to tokens taking into account
any field prefixes and quoted strings. Tokens that are empty or consist of
whitespace only are not returned.

=cut

# Separator pattern: an optionally field-prefixed quoted string, or a run of
# whitespace. The capturing group makes split() return the separators too, so
# quoted strings survive as tokens of their own.
my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # An undefined query has no tokens; avoid the warning split() would raise.
    $query = '' unless defined $query;

    # '"donald duck" title:"the mouse" and peter' gets split into
    # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
    my @tokens = split $tokenize_split_re, $query;

    # Filter out empty values
    @tokens = grep { /\S/ } @tokens;

    return @tokens;
}
1239 =head2 _search_fields
1240 my $weighted_fields = $self->_search_fields({
1242 weighted_fields => 1,
1246 Generate a list of searchable fields to be used for Elasticsearch queries
1247 applied to multiple fields.
1249 Returns an arrayref of field names for either the OPAC or the staff interface, with
1250 optional weights and a subfield appended to each field name, depending on the options given.
1257 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1258 fields for the OPAC or the staff interface should be retrieved. If C<weighted_fields> is set,
1259 field weights will be applied to the returned fields. C<subfield> can be used to
1260 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
1266 sub _search_fields {
1267 my ($self, $params) = @_;
1270 weighted_fields => 0,
1272 # This is a hack for authorities build_authorities_query
1273 # can hopefully be removed in the future
1276 my $cache = Koha::Caches->get_instance();
1277 my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
1278 my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
1279 if (!$search_fields) {
1280 # The reason we don't use Koha::SearchFields->search here is that we don't
1281 # want or need the resultset wrapped as Koha::SearchField objects.
1282 # It does not make any sense in this context and would cause
1283 # unnecessary overhead since we are only querying for data.
1284 # Also would not work, or produce strange results, with the "columns"
1286 my $schema = Koha::Database->schema;
1287 my $result = $schema->resultset('SearchField')->search(
1289 $params->{is_opac} ? (
1294 'type' => { '!=' => 'boolean' },
1295 'search_marc_map.index_name' => $self->index,
1296 'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
1297 'search_marc_to_fields.search' => 1,
1300 columns => [qw/name weight/],
1302 join => {search_marc_to_fields => 'search_marc_map'},
1306 while (my $search_field = $result->next) {
1307 push @search_fields, [
1308 lc $search_field->name,
1309 $search_field->weight ? $search_field->weight : ()
1312 $search_fields = \@search_fields;
1313 $cache->set_in_cache($cache_key, $search_fields);
1315 if ($params->{subfield}) {
1316 my $subfield = $params->{subfield};
1319 # Copy values to avoid mutating cached
1320 # data (since unsafe is used)
1321 my ($field, $weight) = @{$_};
1322 ["${field}.${subfield}", $weight];
1326 if ($params->{weighted_fields}) {
1327 return [map { join('^', @{$_}) } @{$search_fields}];
1330 # Exclude weight from field
1331 return [map { $_->[0] } @{$search_fields}];