1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
45 use List::MoreUtils qw/ each_array /;
55 my $simple_query = $builder->build_query("hello", %options)
57 This will build a query that can be issued to elasticsearch from the provided
58 string input. This expects a lucene style search form (see
59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
62 It'll make an attempt to respect the various query options.
64 Additional options can be provided with the C<%options> hash.
70 This should be an arrayref of hashrefs, each containing a C<field> and a
71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# build_query: unpack the invocant, the query string and the named options.
79 my ( $self, $query, %options ) = @_;
# Read the query-related system preferences, defaulting each to 0 when unset.
# NOTE(review): $stemming and $auto_truncation are not referenced by any other
# line visible here -- confirm whether they are still needed.
81 my $stemming = C4::Context->preference("QueryStemming") || 0;
82 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
83 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
# An undefined query falls back to the '*' wildcard.
85 $query = '*' unless defined $query;
# Ask _search_fields for the fields to search, passing through the is_opac
# and weighted_fields options.
88 my $fields = $self->_search_fields({
89 is_opac => $options{is_opac},
90 weighted_fields => $options{weighted_fields},
# The whole_record option adds every 'marc_data_array.*' field to the list.
92 if ($options{whole_record}) {
93 push @$fields, 'marc_data_array.*';
# query_string settings: fuzziness is 'auto' only when QueryFuzzy is enabled,
# and the default operator is AND.
98 fuzziness => $fuzzy_enabled ? 'auto' : '0',
99 default_operator => 'AND',
101 lenient => JSON::true,
102 analyze_wildcard => JSON::true,
# The ElasticsearchCrossFields preference switches the query type to 'cross_fields'.
105 $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
# Convert each requested sort into a sort clause. The direction defaults to
# 'asc'; any value other than 'asc' or 'desc' is fatal.
107 if ( $options{sort} ) {
108 foreach my $sort ( @{ $options{sort} } ) {
109 my ( $f, $d ) = @$sort{qw/ field direction /};
110 die "Invalid sort direction, $d"
111 if $d && ( $d ne 'asc' && $d ne 'desc' );
112 $d = 'asc' unless $d;
# Map the logical field name to the concrete field name used for sorting.
114 $f = $self->_sort_field($f);
115 push @{ $res->{sort} }, { $f => { order => $d } };
119 # See _convert_facets in Search.pm for how these get turned into
120 # things that Koha can use.
# FacetMaxCount is passed as the size of every terms aggregation below.
121 my $size = C4::Context->preference('FacetMaxCount');
122 $res->{aggregations} = {
123 author => { terms => { field => "author__facet" , size => $size } },
124 subject => { terms => { field => "subject__facet", size => $size } },
125 itype => { terms => { field => "itype__facet", size => $size} },
126 location => { terms => { field => "location__facet", size => $size } },
127 'su-geo' => { terms => { field => "su-geo__facet", size => $size} },
128 'title-series' => { terms => { field => "title-series__facet", size => $size } },
129 ccode => { terms => { field => "ccode__facet", size => $size } },
130 ln => { terms => { field => "ln__facet", size => $size } },
# DisplayLibraryFacets ('both', 'home' or 'holding') decides which of the
# homebranch / holdingbranch aggregations are added.
# NOTE(review): the string comparisons below assume the preference is defined.
133 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
134 if ( $display_library_facets eq 'both'
135 or $display_library_facets eq 'home' ) {
136 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
138 if ( $display_library_facets eq 'both'
139 or $display_library_facets eq 'holding' ) {
140 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
145 =head2 build_query_compat
148 $error, $query, $simple_query, $query_cgi,
149 $query_desc, $limit, $limit_cgi, $limit_desc,
150 $stopwords_removed, $query_type
152 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
153 \@limits, \@sort_by, $scan, $lang, $params );
155 This handles a search using the same api as L<C4::Search::buildQuery> does.
157 A very simple query will go in with C<$operands> set to ['query'], and
158 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
159 C<$query> set to something that can perform the search, C<$simple_query>
160 set to just the search term, C<$query_cgi> set to something that can
161 reproduce this search, and C<$query_desc> set to something else.
# Builds a search query from the C4::Search::buildQuery-style argument list
# and returns the ten-element list described in the POD above.
165 sub build_query_compat {
166 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
172 my $search_param_query_str = '';
# Scan mode: the query and its display string come from _build_scan_query.
175 ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
176 $search_param_query_str = $query_str;
# Normal search: convert the zebra-style sort, index and limit notation.
178 my @sort_params = $self->_convert_sort_fields(@$sort_by);
179 my @index_params = $self->_convert_index_fields(@$indexes);
180 $limits = $self->_fix_limit_special_cases($orig_limits);
# The suppress parameter adds a limit excluding suppressed records.
181 if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
182 # Merge the indexes in with the search terms and the operands so that
183 # each search thing is a handy unit.
# NOTE: this temporarily modifies the caller's @$operators; the extra element
# is shifted off again further down.
184 unshift @$operators, undef; # The first one can't have an op
186 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
# Walk operands, operators and converted indexes in parallel.
187 my $ea = each_array( @$operands, @$operators, @index_params );
188 while ( my ( $oand, $otor, $index ) = $ea->() ) {
# Skip empty search terms entirely.
189 next if ( !defined($oand) || $oand eq '' );
190 $oand = $self->_clean_search_term($oand);
191 $oand = $self->_truncate_terms($oand) if ($truncate);
192 push @search_params, {
193 operand => $oand, # the search terms
194 operator => defined($otor) ? uc $otor : undef, # AND and so on
# Flatten the converted index hash (if any) into this search unit.
195 $index ? %$index : (),
199 # We build a string query from limits and the queries. An alternative
200 # would be to pass them separately into build_query and let it build
201 # them into a structured ES query itself. Maybe later, though that'd be
203 $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
# AND the search terms with the joined limits; empty pieces are left out.
204 $query_str = join( ' AND ',
205 $search_param_query_str || (),
206 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
208 # If there's no query on the left, let's remove the junk left behind
209 $query_str =~ s/^ AND //;
# Options handed through to build_query.
211 $options{sort} = \@sort_params;
212 $options{is_opac} = $params->{is_opac};
213 $options{weighted_fields} = $params->{weighted_fields};
214 $options{whole_record} = $params->{whole_record};
215 $query = $self->build_query( $query_str, %options );
218 # We roughly emulate the CGI parameters of the zebra query builder
220 shift @$operators; # Shift out the one we unshifted before
# Build the idx=...&q=...&op=... parameter string from the original arguments.
# NOTE(review): $index is guarded with // '' but $oand is not; confirm that
# operands can never be undef here.
221 my $ea = each_array( @$operands, @$operators, @$indexes );
222 while ( my ( $oand, $otor, $index ) = $ea->() ) {
223 $query_cgi .= '&' if $query_cgi;
224 $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
225 $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
227 $query_cgi .= '&scan=1' if ( $scan );
# A single operand doubles as the "simple query".
230 $simple_query = $operands->[0] if @$operands == 1;
# The description is the simple query when there is one, otherwise the
# generated search string.
232 if ( $simple_query ) {
233 $query_desc = $simple_query;
235 $query_desc = $search_param_query_str;
# Limits are reported as a query string, as CGI parameters and as a description.
237 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
238 my $limit_cgi = ( $orig_limits and @$orig_limits )
239 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
242 $limit_desc = "$limit" if $limit;
# The first and the last two return slots are always undef.
245 undef, $query, $simple_query, $query_cgi, $query_desc,
246 $limit, $limit_cgi, $limit_desc, undef, undef
250 =head2 build_authorities_query
252 my $query = $builder->build_authorities_query(\%search);
254 This takes a nice description of an authority search and turns it into a black-box
255 query that can then be passed to the appropriate searcher.
257 The search description is a hashref that looks something like:
262 where => 'Heading', # search the main entry
263 operator => 'exact', # require an exact match
264 value => 'frogs', # the search string
267 where => '', # search all entries
268 operator => '', # default keyword, right truncation
276 authtypecode => 'TOPIC_TERM',
# Turns the authority search description ($search, a hashref with 'searches'
# and optional 'authtypecode' and 'sort' keys) into a query structure.
281 sub build_authorities_query {
282 my ( $self, $search ) = @_;
284 # Start by making the query parts
# Each search carries a field ('where'), an 'operator' and a 'value'.
287 foreach my $s ( @{ $search->{searches} } ) {
288 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
# 'is', '=' and 'exact' all request a whole-field match.
289 if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
291 # Match the whole field, case insensitive, UTF normalized.
292 push @query_parts, { term => { "$wh.ci_raw" => $val } };
295 # Match the whole field for all searchable fields, case insensitive,
297 # Given that field data is "The quick brown fox"
298 # "The quick brown fox" and "the quick brown fox" will match
299 # but not "quick brown fox".
303 fields => $self->_search_fields({ subfield => 'ci_raw' }),
308 elsif ( defined $op && $op eq 'start') {
309 # Match the prefix within a field for all searchable fields.
310 # Given that field data is "The quick brown fox"
311 # "The quick bro" will match, but not "quick bro"
313 # There does not seem to be a multi-field prefix query,
314 # so we need to create one
316 # Match prefix of the field.
317 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
# One prefix clause per searchable field, combined below with 'should'.
321 foreach my $field (@{$self->_search_fields()}) {
322 push @prefix_queries, {
323 prefix => { "$field.ci_raw" => $val }
328 'should' => \@prefix_queries,
329 'minimum_should_match' => 1
335 # Query all searchable fields.
336 # Given that field data is "The quick brown fox"
337 # a search containing any of the words will match, regardless
# Clean and truncate every token of the value, then join them back together.
340 my @tokens = $self->_split_query( $val );
341 foreach my $token ( @tokens ) {
342 $token = $self->_truncate_terms(
343 $self->_clean_search_term( $token )
346 my $query = $self->_join_queries( @tokens );
349 push @query_parts, { query_string => {
350 default_field => $wh,
351 analyze_wildcard => JSON::true,
358 analyze_wildcard => JSON::true,
360 fields => $self->_search_fields(),
367 # Merge the query parts appropriately
368 # 'should' behaves like 'or'
369 # 'must' behaves like 'and'
370 # Zebra behaviour seems to match 'must', so that is used here
371 my $elastic_query = {};
372 $elastic_query->{bool}->{must} = \@query_parts;
374 # Filter by authtypecode if set
375 if ($search->{authtypecode}) {
376 $elastic_query->{bool}->{filter} = {
378 "authtype.raw" => $search->{authtypecode}
384 query => $elastic_query
# The sort specification, when present, is wrapped in an arrayref.
388 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
393 =head2 build_authorities_query_compat
396 $builder->build_authorities_query_compat( \@marclist, \@and_or,
397 \@excluding, \@operator, \@value, $authtypecode, $orderby );
399 This builds a query for searching for authorities, in the style of
400 L<C4::AuthoritiesMarc::SearchAuthorities>.
408 An arrayref containing where the particular term should be searched for.
409 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
410 thesaurus. If left blank, any field is used.
414 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
422 What form of search to do. Options are: is (phrase, no truncation, whole field
423 must match), = (number exact match), exact (phrase, no truncation, whole field
424 must match). If left blank, then word list, right truncated, anywhere is used.
428 The actual user-provided string value to search for.
432 The authority type code to search within. If blank, then all will be searched.
436 The order to sort the results by. Options are Relevance, HeadingAsc,
437 HeadingDsc, AuthidAsc, AuthidDsc.
441 marclist, operator, and value must be the same length, and the values at
442 index /i/ all relate to each other.
444 This returns a query, which is a black box object that can be passed to the
445 appropriate search object.
# Maps the names accepted in the marclist argument of
# build_authorities_query_compat to index field names. Names without an entry
# are used unchanged by that sub.
449 our $koha_to_index_name = {
450 mainmainentry => 'heading-main',
451 mainentry => 'heading',
453 'match-heading' => 'match-heading',
454 'see-from' => 'match-heading-see-from',
455 thesaurus => 'subject-heading-thesaurus',
# Adapter from the positional SearchAuthorities-style arguments to the hashref
# form taken by build_authorities_query, whose result it obtains at the end.
460 sub build_authorities_query_compat {
461 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
462 $authtypecode, $orderby )
465 # This turns the old-style many-options argument form into a more
466 # extensible hash form that is understood by L<build_authorities_query>.
468 my $mappings = $self->get_elasticsearch_mappings();
470 # Convert to lower case
# NOTE(review): lc on an undefined $orderby emits a warning; confirm callers
# always pass a value.
471 $marclist = [map(lc, @{$marclist})];
472 $orderby = lc $orderby;
475 # Make sure everything exists
476 foreach my $m (@$marclist) {
# Translate Koha-side names; unknown names pass through unchanged.
478 $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
# Unknown fields only produce a warning; '' and 'match-heading' are exempt.
480 warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
# Build one search entry per value, pairing it with the index and operator at
# the same position.
# NOTE(review): the truthiness test below also skips a literal "0" value.
482 for ( my $i = 0 ; $i < @$value ; $i++ ) {
483 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
486 where => $indexes[$i],
487 operator => $operator->[$i],
488 value => $value->[$i],
# Pick the sort field from the start of $orderby and the direction from its end.
494 ( $orderby =~ /^heading/ ) ? 'heading__sort'
495 : ( $orderby =~ /^auth/ ) ? 'local-number__sort'
# Anything not ending in 'asc' sorts descending.
498 my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
499 %sort = ( $sort_field => $sort_order, );
502 searches => \@searches,
503 authtypecode => $authtypecode,
# Only pass a sort when one was determined.
505 $search{sort} = \%sort if %sort;
506 my $query = $self->build_authorities_query( \%search );
510 =head2 _build_scan_query
512 my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
514 This will build an aggregation scan query that can be issued to elasticsearch from
515 the provided string input.
# Translates the index names accepted by _build_scan_query into field names;
# names without an entry are used as given.
519 our %scan_field_convert = (
523 'se' => 'title-series',
# Builds an aggregation-based scan query from the first operand and the first
# index, and returns the query structure together with the scanned term.
527 sub _build_scan_query {
528 my ( $self, $operands, $indexes ) = @_;
# Defaults: an empty term and the 'subject' index.
530 my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
531 my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
# Only the part of the index before the first comma is used for the lookup.
533 my ( $f, $d ) = split( /,/, $index);
534 $index = $scan_field_convert{$f} || $f;
# Aggregate on the field's '__facet' variant, ordered by term ascending and
# restricted to values matching the regex built from the cleaned term.
542 $res->{aggregations} = {
545 field => $index . '__facet',
546 order => { '_term' => 'asc' },
547 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
551 return ($res, $term);
554 =head2 _create_regex_filter
556 my $filter = $builder->_create_regex_filter('term')
558 This will create a regex filter that can be used with an aggregation query.
562 sub _create_regex_filter {
563 my ($self, $term) = @_;
566 foreach my $c (split(//, quotemeta($term))) {
569 $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
574 =head2 _convert_sort_fields
576 my @sort_params = _convert_sort_fields(@sort_by)
578 Converts the zebra-style sort index information into elasticsearch-style.
580 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
581 something that can be sent to L<build_query>.
# Converts zebra-style sort specifications of the form "<field>_<order>" into
# hashrefs with 'field' and 'direction' keys.
585 sub _convert_sort_fields {
586 my ( $self, @sort_by ) = @_;
588 # Turn the sorting into something we care about.
589 my %sort_field_convert = (
590 acqdate => 'date-of-acquisition',
592 call_number => 'local-classification',
593 popularity => 'issues',
594 relevance => undef, # default
596 pubdate => 'date-of-publication',
# Order suffixes, written as key/value pairs: dsc and za mean descending,
# az means ascending.
598 my %sort_order_convert =
599 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
601 # Convert the fields and orders, drop anything we don't know about.
# The grep drops entries whose field did not convert, which includes
# 'relevance' because it maps to undef.
602 grep { $_->{field} } map {
# The greedy (.+) makes the split happen at the last underscore.
603 my ( $f, $d ) = /(.+)_(.+)/;
605 field => $sort_field_convert{$f},
606 direction => $sort_order_convert{$d}
611 =head2 _convert_index_fields
613 my @index_params = $self->_convert_index_fields(@indexes);
615 Converts zebra-style search index notation into elasticsearch-style.
617 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
618 and it returns something that can be sent to L<build_query>.
620 B<TODO>: this will pull from the elasticsearch mappings table to figure out
# Alias table: zebra-style index names (keys) and the field names they are
# rewritten to (values). Consulted by _convert_index_fields and
# _convert_index_strings_freeform; names without an entry are left as they are.
625 our %index_field_convert = (
629 'lcn' => 'local-classification',
630 'callnum' => 'local-classification',
631 'record-type' => 'rtype',
632 'mc-rtype' => 'rtype',
634 'lc-card' => 'lc-card-number',
635 'sn' => 'local-number',
636 'biblionumber' => 'local-number',
637 'yr' => 'date-of-publication',
638 'pubdate' => 'date-of-publication',
639 'acqdate' => 'date-of-acquisition',
640 'date/time-last-modified' => 'date-time-last-modified',
641 'dtlm' => 'date-time-last-modified',
642 'diss' => 'dissertation-information',
645 'music-number' => 'identifier-publisher-for-music',
646 'number-music-publisher' => 'identifier-publisher-for-music',
647 'music' => 'identifier-publisher-for-music',
648 'ident' => 'identifier-standard',
649 'cpn' => 'corporate-name',
650 'cfn' => 'conference-name',
651 'pn' => 'personal-name',
656 'rcn' => 'record-control-number',
658 'su-to' => 'subject',
659 #'su-geo' => 'subject',
660 'su-ut' => 'subject',
662 'se' => 'title-series',
663 'ut' => 'title-uniform',
664 'an' => 'koha-auth-number',
665 'authority-number' => 'koha-auth-number',
668 'rank' => 'relevance',
669 'phr' => 'st-phrase',
670 'wrdl' => 'st-word-list',
671 'rt' => 'right-truncation',
672 'rtrn' => 'right-truncation',
673 'ltrn' => 'left-truncation',
674 'rltrn' => 'left-and-right',
675 'mc-itemtype' => 'itemtype',
676 'mc-ccode' => 'ccode',
677 'branch' => 'homebranch',
678 'mc-loc' => 'location',
680 'stocknumber' => 'number-local-acquisition',
681 'inv' => 'number-local-acquisition',
683 'mc-itype' => 'itype',
684 'aub' => 'author-personal-bibliography',
685 'auo' => 'author-in-order',
689 'frequency-code' => 'ff8-18',
690 'illustration-code' => 'ff8-18-21',
691 'regularity-code' => 'ff8-19',
692 'type-of-serial' => 'ff8-21',
693 'format' => 'ff8-23',
694 'conference-code' => 'ff8-29',
695 'festschrift-indicator' => 'ff8-30',
696 'index-indicator' => 'ff8-31',
699 'literature-code' => 'lf',
700 'biography' => 'bio',
702 'biography-code' => 'bio',
703 'l-format' => 'ff7-01-02',
704 'lex' => 'lexile-number',
705 'hi' => 'host-item-number',
706 'itu' => 'index-term-uncontrolled',
707 'itg' => 'index-term-genre',
# Regex source strings shared by the freeform converter and the tokenizer:
# a field name is a run of word characters and hyphens, optionally followed
# by any number of ".name" sub-field suffixes.
709 my $field_name_pattern = '[\w\-]+';
710 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
# Converts zebra-style index specifications ("field" or "field,type") into
# hashrefs with 'field' and 'type' keys.
712 sub _convert_index_fields {
713 my ( $self, @indexes ) = @_;
# Recognised type suffixes; a missing suffix maps to undef via '__default'.
715 my %index_type_convert =
716 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
718 # Convert according to our table, drop anything that doesn't convert.
719 # If a field starts with mc- we save it as it's used (and removed) later
720 # when joining things, to indicate we make it an 'OR' join.
721 # (Sorry, this got a bit ugly after special cases were found.)
723 # Lower case all field names
# Splits the current element ($_) at commas into field and optional type.
724 my ( $f, $t ) = map(lc, split /,/);
# Names without an entry in %index_field_convert are kept as given.
731 field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
732 type => $index_type_convert{ $t // '__default' }
# Re-attach the saved prefix held in $mc, then yield undef for entries that
# ended up without a field.
734 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
735 $r->{field} ? $r : undef;
739 =head2 _convert_index_strings
741 my @searches = $self->_convert_index_strings(@searches);
743 Similar to L<_convert_index_fields>, this takes strings of the form
744 B<field:search term> and rewrites the field from zebra-style to
745 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrites "field:term" strings so that the field part uses the converted
# field names; see the POD above for the contract.
749 sub _convert_index_strings {
750 my ( $self, @searches ) = @_;
752 foreach my $s (@searches) {
# Split at the first colon; the field part may contain word characters,
# commas and hyphens only.
754 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
# No "field:term" shape was found in this string.
755 unless ( defined($field) && defined($term) ) {
759 my ($conv) = $self->_convert_index_fields($field);
# The field could not be converted.
760 unless ( defined($conv) ) {
# Emit "<converted field>:<term adjusted for the index type>".
764 push @res, ($conv->{field} ? $conv->{field} . ':' : '')
765 . $self->_modify_string_by_type( %$conv, operand => $term );
770 =head2 _convert_index_strings_freeform
772 my $search = $self->_convert_index_strings_freeform($search);
774 This is similar to L<_convert_index_strings>, however it'll search out the
775 things to change within the string. So it can handle strings such as
776 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
778 If there is something of the form "su,complete-subfield" or something, the
779 second part is stripped off as we can't yet handle that. Making it work
780 will have to wait for a real query parser.
# Rewrites every "field:" prefix found anywhere inside a free-form search
# string: field names are lower-cased, a ",type" qualifier is dropped, and
# aliases are resolved through %index_field_convert.
784 sub _convert_index_strings_freeform {
785 my ( $self, $search ) = @_;
786 # @TODO: Currently this will also alter fields contained within quotes:
787 # `searching for "stuff cn:123"` for example will become
788 # `searching for "stuff local-number:123"`
790 # Fixing this is tricky, one possibility:
791 # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
792 # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
794 # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
795 # them back when processing is done.
797 # Lower case field names
# The optional ",qualifier" group is matched but not captured, so it is
# removed; any ".subfield" suffix ($2) is kept as written.
798 $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
799 # Resolve possible field aliases
800 $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
804 =head2 _modify_string_by_type
806 my $str = $self->_modify_string_by_type(%index_field);
808 If you have a search term (operand) and a type (phrase, right-truncated), this
809 will convert the string to have the function in lucene search terms, e.g.
810 wrapping quotes around it.
# Adjust a search operand according to its index type.
#
# Arguments (after the invocant) are a flat hash; the keys used are:
#   operand - the search term to adjust
#   type    - optional: 'right-truncate', 'phrase' or 'st-year'
#
# Returns the adjusted operand:
#   right-truncate  appends '*'
#   phrase          wraps the operand in double quotes unless already quoted
#   st-year         turns "from-until" into "[from TO until]", using '*' for
#                   an omitted bound; operands without a hyphen are unchanged
# An undefined or empty operand is returned untouched.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $type = $idx{type} || '';
    my $str  = $idx{operand};

    # Test for "no operand" by definedness and length rather than truthiness,
    # so that a literal "0" is still treated as a real search term.
    return $str unless defined $str && length $str;

    $str .= '*' if $type eq 'right-truncate';
    $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;

    if ( $type eq 'st-year' ) {
        if ( $str =~ /^(.*)-(.*)$/ ) {

            # Only a missing bound becomes the open-ended '*'; a bound of
            # "0" is kept as given.
            my $from  = length $1 ? $1 : '*';
            my $until = length $2 ? $2 : '*';
            $str = "[$from TO $until]";
        }
    }

    return $str;
}
835 my $query_str = $self->_join_queries(@query_parts);
837 This takes a list of query parts, that might be search terms on their own, or
838 booleaned together, or specifying fields, or whatever, wraps them in
839 parentheses, and ANDs them all together. Suitable for feeding to the ES
842 Note: doesn't AND them together if they specify an index that starts with "mc"
843 as that was a special case in the original code for dealing with multiple
844 choice options (you can't search for something that has an itype of A and
845 an itype of B otherwise.)
# Combine query parts into a single query string.
#
# Undefined and empty parts are ignored. Parts starting with "mc-" are
# multiple-choice limits: the marker is stripped and all values for the same
# field are OR'ed together as "field:(a OR b)". Every other part is wrapped
# in parentheses. The results are AND'ed, ordinary parts first, followed by
# the multiple-choice groups sorted by field name.
#
# Returns an empty list when nothing usable was supplied, the single part
# as-is (minus any "mc-" marker) when there is exactly one, and the joined
# string otherwise.
sub _join_queries {
    my ( $self, @parts ) = @_;

    my ( @plain, @multi_choice );
    foreach my $part (@parts) {
        next unless defined($part) && $part ne '';
        if ( $part =~ /^mc-/ ) {
            push @multi_choice, $part =~ s/^mc-//r;
        }
        else {
            push @plain, $part;
        }
    }

    my $total = @plain + @multi_choice;
    return () if $total == 0;
    if ( $total == 1 ) {
        return @plain ? $plain[0] : $multi_choice[0];
    }

    # Collect the values of each multiple-choice field so they can be OR'ed
    my %values_for;
    foreach my $limit (@multi_choice) {
        my ( $field, $value ) = split /:/, $limit, 2;
        push @{ $values_for{$field} }, $value;
    }

    my @grouped;
    foreach my $field ( sort keys %values_for ) {
        push @grouped,
          sprintf( '%s:(%s)', $field, join( ' OR ', @{ $values_for{$field} } ) );
    }

    my @wrapped = map { "($_)" } @plain;

    return join( ' AND ', @wrapped, @grouped );
}
877 my @phrased_queries = $self->_make_phrases(@query_parts);
879 This takes the supplied queries and forces them to be phrases by wrapping
880 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
881 the quotes outside of them if they're there.
# Force each query part to be a phrase by wrapping it in double quotes.
#
# A leading field prefix ("subject:", "title-series:", "title.phrase:") is
# kept outside the quotes, so 'subject:cats and dogs' becomes
# 'subject:"cats and dogs"'. Parts without a prefix are quoted as a whole.
# Leading whitespace is dropped, and a part whose text is already enclosed
# in double quotes is not quoted a second time.
#
# Returns the list of phrased parts, in the order given.
sub _make_phrases {
    my ( $self, @parts ) = @_;

    return map {

        # The prefix group is optional and accepts hyphens and dots, matching
        # the field names used elsewhere in this module.
        my ( $prefix, $text ) = /\A\s*((?:[\w\-.]*?:)?)(.*)\z/s;
        $text =~ /\A".*"\z/s ? $prefix . $text : $prefix . '"' . $text . '"';
    } @parts;
}
890 =head2 _create_query_string
892 my @query_strings = $self->_create_query_string(@queries);
894 Given a list of hashrefs, it will turn them into a lucene-style query string.
895 The hash should contain field, type (both for the indexes), operator, and
# Turn a list of search hashrefs into lucene-style query string fragments.
#
# Each hashref may carry 'operator', 'field', 'type' and 'operand'. The
# operand is first adjusted for its type by _modify_string_by_type. When a
# field is given and the operand consists of more than one
# whitespace-separated word, the operand gets its own parentheses so that
# the field applies to all of the words; 'st-year' operands are exempt.
#
# Returns one "<OPERATOR >(<field:>operand)" string per input hashref.
sub _create_query_string {
    my ( $self, @queries ) = @_;

    my @fragments;
    foreach my $search (@queries) {
        my $operator = $search->{operator} ? $search->{operator} . ' ' : '';
        my $prefix   = $search->{field}    ? $search->{field} . ':'    : '';

        my $operand = $self->_modify_string_by_type(%$search);

        my $is_year_range = defined $search->{type} && $search->{type} eq 'st-year';
        if ( $prefix && scalar( split( /\s+/, $operand ) ) > 1 && !$is_year_range ) {
            $operand = "($operand)";
        }

        push @fragments, "$operator($prefix$operand)";
    }

    return @fragments;
}
913 =head2 _clean_search_term
915 my $term = $self->_clean_search_term($term);
917 This cleans a search term by removing any funny characters that may upset
918 ES and give us an error. It also calls L<_convert_index_strings_freeform>
919 to ensure those parts are correct.
# Normalises a raw search term before it is turned into a query string; see
# the POD above for the intent.
923 sub _clean_search_term {
924 my ( $self, $term ) = @_;
926 # Lookahead for checking if we are inside quotes
# (it succeeds only where an even number of double quotes follows)
927 my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
929 # Some hardcoded searches (like with authorities) produce things like
930 # 'an=123', when it ought to be 'an:123' for our purposes.
# Rewrite field prefixes to the names used by this module.
933 $term = $self->_convert_index_strings_freeform($term);
936 # Remove unbalanced quotes
# tr returns the number of double quotes while blanking them out in the copy.
937 my $unquoted = $term;
938 my $count = ($unquoted =~ tr/"/ /);
939 if ($count % 2 == 1) {
943 # Remove unquoted colons that have whitespace on either side of them
944 $term =~ s/(:+)(\s+)$lookahead/$2/g;
945 $term =~ s/(\s+)(:+)$lookahead/$1/g;
# Apply the slash-escaping policy selected by QueryRegexEscapeOptions.
947 $term = $self->_query_regex_escape_process($term);
952 =head2 _query_regex_escape_process
954 my $query = $self->_query_regex_escape_process($query);
956 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
# Escapes or unescapes forward slashes in $query according to the
# QueryRegexEscapeOptions system preference. The value 'dont_escape' leaves
# the query untouched; 'escape' and 'unescape_escaped' are handled below.
# NOTE(review): the comparisons assume the preference is defined.
960 sub _query_regex_escape_process {
961 my ($self, $query) = @_;
962 my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
963 if ($regex_escape_options ne 'dont_escape') {
964 if ($regex_escape_options eq 'escape') {
965 # Will escape unescaped slashes (/) while preserving
966 # unescaped slashes within quotes
967 # @TODO: assumes quotes are always balanced and will
968 # not handle escaped quotes properly, should perhaps be
969 # replaced with a more general parser solution
970 # so that this function is ever only provided with unquoted
# A slash counts as unescaped when it is preceded by an even number of
# backslashes; the negative lookahead skips slashes inside quotes.
972 $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
974 elsif($regex_escape_options eq 'unescape_escaped') {
975 # Will unescape escaped slashes (\/) and escape
976 # unescaped slashes (/) while preserving slashes within quotes
977 # The same limitations as above apply for handling of quotes
# $1 holds an odd run of backslashes (slash is escaped: drop one backslash),
# $2 an even run (slash is unescaped: add one backslash).
978 $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
984 =head2 _fix_limit_special_cases
986 my $limits = $self->_fix_limit_special_cases($limits);
988 This converts any special cases that the limit specifications have into things
989 that are more readily processable by the rest of the code.
991 The argument should be an arrayref, and it'll return an arrayref.
# Rewrites the special-case limit strings found in the $limits arrayref into
# plain "field:value" limits collected in @new_lim.
995 sub _fix_limit_special_cases {
996 my ( $self, $limits ) = @_;
999 foreach my $l (@$limits) {
1001 # This is set up by opac-search.pl
# Year range "yr,st-numeric,ge=A and yr,st-numeric,le=B" becomes a copydate
# range; limits that do not match the full shape are skipped.
1002 if ( $l =~ /^yr,st-numeric,ge=/ ) {
1003 my ( $start, $end ) =
1004 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
1005 next unless defined($start) && defined($end);
1006 push @new_lim, "copydate:[$start TO $end]";
# Single year or hyphenated range, formatted through the 'st-year' handling
# of _modify_string_by_type.
1008 elsif ( $l =~ /^yr,st-numeric=/ ) {
1009 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1010 next unless defined($date);
1011 $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1012 push @new_lim, "copydate:$date";
# The bare keyword 'available' limits to records that are not on loan.
1014 elsif ( $l =~ /^available$/ ) {
1015 push @new_lim, 'onloan:false';
# Generic "field:term" limits are quoted as phrases.
# NOTE(review): the substitution below runs before the defined() check, so a
# limit without a colon leaves $field undef and triggers an "uninitialized
# value" warning -- consider moving it inside the check.
1018 my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1019 $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1020 if ( defined($field) && defined($term) ) {
1021 push @new_lim, "$field:(\"$term\")";
1033 my $field = $self->_sort_field($field);
1035 Given a field name, this works out what the actual name of the field to sort
1036 on should be. A '__sort' suffix is added for fields with a sort version, and
1037 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
1038 to avoid sorting on a tokenized value.
# Work out the concrete field name to sort on for the logical field $f.
#
# A '__sort' suffix is appended when the field has a sort version; this is
# also assumed when sort_fields() has no entry for the field at all. A field
# explicitly marked as having no sort version gets '.raw' appended when it is
# a text field, so that sorting does not use the tokenised form; any other
# field is returned unchanged.
#
# Returns the field name to use in the sort clause.
sub _sort_field {
    my ( $self, $f ) = @_;

    # Walk the mappings one level at a time. A chained rvalue lookup such as
    # $mappings->{data}{properties}{$f}{type} would autovivify empty hashes
    # for unknown fields inside the mappings structure returned to us.
    my $mappings   = $self->get_elasticsearch_mappings();
    my $data       = ref $mappings eq 'HASH'   ? $mappings->{data}    : undef;
    my $properties = ref $data eq 'HASH'       ? $data->{properties}  : undef;
    my $field_map  = ref $properties eq 'HASH' ? $properties->{$f}    : undef;

    my $is_text_field =
         ref $field_map eq 'HASH'
      && defined $field_map->{type}
      && $field_map->{type} eq 'text';

    # Look the sort flag up once instead of calling sort_fields() twice.
    my $has_sort_version = $self->sort_fields()->{$f};

    if ( !defined $has_sort_version || $has_sort_version ) {
        $f .= '__sort';
    }
    else {
        # We need to add '.raw' to text fields without a sort field,
        # otherwise it'll sort based on the tokenised form.
        $f .= '.raw' if $is_text_field;
    }

    return $f;
}
1057 =head2 _truncate_terms
1059 my $query = $self->_truncate_terms($query);
1061 Given a string query this function appends '*' wildcard to all terms except
1062 operators and double quoted strings.
# Append the '*' wildcard to the terms of a query string.
#
# The query is tokenised with _split_query and whitespace-only tokens are
# discarded. A token is left alone when it ends in a non-word character
# (for instance a closing quote or an existing '*') or when it is one of the
# boolean keywords and/or/not in any letter case; every other token gets a
# trailing '*'.
#
# Returns the tokens joined with single spaces.
sub _truncate_terms {
    my ( $self, $query ) = @_;

    my %is_keyword = map { $_ => 1 } qw/and or not/;

    my @terms;
    foreach my $token ( $self->_split_query($query) ) {

        # Filter out empty tokens
        next if $token =~ /^\s*$/;

        my $leave_alone = $token =~ /\W$/ || $is_keyword{ lc($token) };
        push @terms, $leave_alone ? $token : "$token*";
    }

    return join ' ', @terms;
}
1085 my @token = $self->_split_query($query_str);
1087 Given a string query this function splits it to tokens taking into account
1088 any field prefixes and quoted strings.
# Separator pattern for the tokenizer: either a double-quoted string with an
# optional "field:" prefix, or a run of whitespace. The capture group makes
# split return the separators as well, so quoted strings are kept as tokens.
1092 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
# _split_query: split the query string into tokens.
1095 my ( $self, $query ) = @_;
1097 # '"donald duck" title:"the mouse" and peter' gets split into
1098 # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
1099 my @tokens = split $tokenize_split_re, $query;
1101 # Filter out empty values
# Keeps only tokens that contain at least one non-whitespace character.
1102 @tokens = grep( /\S/, @tokens );
1107 =head2 _search_fields
1108 my $weighted_fields = $self->_search_fields({
1110 weighted_fields => 1,
1114 Generate a list of searchable fields to be used for Elasticsearch queries
1115 applied to multiple fields.
1117 Returns an arrayref of field names for either OPAC or staff interface, with
1118 possible weights and subfield appended to each field name depending on the
1125 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1126 fields for OPAC or staff interface should be retrieved. If C<weighted_fields> is set,
1127 field weights will be applied to the returned fields. C<subfield> can be used to
1128 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
# Returns an arrayref of searchable field names, optionally with weights
# ("name^weight") and/or a subfield suffix ("name.subfield"), depending on
# the is_opac, weighted_fields and subfield keys of $params.
1134 sub _search_fields {
1135 my ($self, $params) = @_;
1138 weighted_fields => 0,
1140 # This is a hack for authorities build_authorities_query
1141 # can hopefully be removed in the future
# The field list is cached under a key that depends on the interface
# (OPAC or staff client) and on the index name.
1144 my $cache = Koha::Caches->get_instance();
1145 my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
1146 my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
# Cache miss: load the fields from the database.
1147 if (!$search_fields) {
1148 # The reason we don't use Koha::SearchFields->search here is we don't
1149 # want or need resultset wrapped as Koha::SearchField object.
1150 # It does not make any sense in this context and would cause
1151 # unnecessary overhead since we are only querying for data
1152 # Also would not work, or produce strange results, with the "columns"
1154 my $schema = Koha::Database->schema;
1155 my $result = $schema->resultset('SearchField')->search(
1157 $params->{is_opac} ? (
# Boolean fields are excluded; only mappings for this index and the
# configured MARC flavour that are flagged for search are considered.
1162 'type' => { '!=' => 'boolean' },
1163 'search_marc_map.index_name' => $self->index,
1164 'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
1165 'search_marc_to_fields.search' => 1,
1168 columns => [qw/name weight/],
1170 join => {search_marc_to_fields => 'search_marc_map'},
# Each entry is [ lower-cased name ] or [ lower-cased name, weight ]; the
# weight element is omitted when the weight is false.
1174 while (my $search_field = $result->next) {
1175 push @search_fields, [
1176 lc $search_field->name,
1177 $search_field->weight ? $search_field->weight : ()
1180 $search_fields = \@search_fields;
1181 $cache->set_in_cache($cache_key, $search_fields);
# Optionally append ".<subfield>" to every field name.
# NOTE(review): the copy built below always has two elements, so a field
# without a weight gets an undef weight; combined with weighted_fields this
# would make the join('^', ...) further down see an undef value -- confirm.
1183 if ($params->{subfield}) {
1184 my $subfield = $params->{subfield};
1187 # Copy values to avoid mutating cached
1188 # data (since unsafe is used)
1189 my ($field, $weight) = @{$_};
1190 ["${field}.${subfield}", $weight];
# With weights requested, each entry is rendered as "name^weight".
1194 if ($params->{weighted_fields}) {
1195 return [map { join('^', @{$_}) } @{$search_fields}];
1198 # Exclude weight from field
1199 return [map { $_->[0] } @{$search_fields}];