1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
45 use List::MoreUtils qw/ each_array /;
55 my $simple_query = $builder->build_query("hello", %options)
57 This will build a query that can be issued to elasticsearch from the provided
58 string input. This expects a lucene style search form (see
59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
62 It'll make an attempt to respect the various query options.
64 Additional options can be provided with the C<%options> hash.
70 This should be an arrayref of hashrefs, each containing a C<field> and an
71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
79 my ( $self, $query, %options ) = @_;
81 my $stemming = C4::Context->preference("QueryStemming") || 0;
82 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
83 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
85 $query = '*' unless defined $query;
88 my $fields = $self->_search_fields({
89 is_opac => $options{is_opac},
90 weighted_fields => $options{weighted_fields},
92 if ($options{whole_record}) {
93 push @$fields, 'marc_data_array.*';
98 fuzziness => $fuzzy_enabled ? 'auto' : '0',
99 default_operator => 'AND',
101 lenient => JSON::true,
102 analyze_wildcard => JSON::true,
106 if ( $options{sort} ) {
107 foreach my $sort ( @{ $options{sort} } ) {
108 my ( $f, $d ) = @$sort{qw/ field direction /};
109 die "Invalid sort direction, $d"
110 if $d && ( $d ne 'asc' && $d ne 'desc' );
111 $d = 'asc' unless $d;
113 $f = $self->_sort_field($f);
114 push @{ $res->{sort} }, { $f => { order => $d } };
118 # See _convert_facets in Search.pm for how these get turned into
119 # things that Koha can use.
120 my $size = C4::Context->preference('FacetMaxCount');
121 $res->{aggregations} = {
122 author => { terms => { field => "author__facet" , size => $size } },
123 subject => { terms => { field => "subject__facet", size => $size } },
124 itype => { terms => { field => "itype__facet", size => $size} },
125 location => { terms => { field => "location__facet", size => $size } },
126 'su-geo' => { terms => { field => "su-geo__facet", size => $size} },
127 'title-series' => { terms => { field => "title-series__facet", size => $size } },
128 ccode => { terms => { field => "ccode__facet", size => $size } },
129 ln => { terms => { field => "ln__facet", size => $size } },
132 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
133 if ( $display_library_facets eq 'both'
134 or $display_library_facets eq 'home' ) {
135 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
137 if ( $display_library_facets eq 'both'
138 or $display_library_facets eq 'holding' ) {
139 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
144 =head2 build_browse_query
146 my $browse_query = $builder->build_browse_query($field, $query);
148 This performs a "starts with" style query on a particular field. The field
149 to be searched must have been indexed with an appropriate mapping as a
150 "phrase" subfield, which pretty much everything has.
154 # XXX this isn't really a browse query like we want in the end
155 sub build_browse_query {
156 my ( $self, $field, $query ) = @_;
158 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
160 return { query => '*' } if !defined $query;
162 # TODO this should come from Koha::SearchEngine::Elasticsearch
163 my %field_whitelist = (
167 $field = 'title' if !exists $field_whitelist{$field};
168 my $sort = $self->_sort_field($field);
171 match_phrase_prefix => {
175 fuzziness => $fuzzy_enabled ? 'auto' : '0',
179 sort => [ { $sort => { order => "asc" } } ],
183 =head2 build_query_compat
186 $error, $query, $simple_query, $query_cgi,
187 $query_desc, $limit, $limit_cgi, $limit_desc,
188 $stopwords_removed, $query_type
190 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
191 \@limits, \@sort_by, $scan, $lang, $params );
193 This handles a search using the same api as L<C4::Search::buildQuery> does.
195 A very simple query will go in with C<$operands> set to ['query'], and
196 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
197 C<$query> set to something that can perform the search, C<$simple_query>
198 set to just the search term, C<$query_cgi> set to something that can
199 reproduce this search, and C<$query_desc> set to a human-readable description of the query (the search term itself in this simple case).
203 sub build_query_compat {
204 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
208 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
209 my @sort_params = $self->_convert_sort_fields(@$sort_by);
210 my @index_params = $self->_convert_index_fields(@$indexes);
211 my $limits = $self->_fix_limit_special_cases($orig_limits);
212 if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
213 # Merge the indexes in with the search terms and the operands so that
214 # each search thing is a handy unit.
215 unshift @$operators, undef; # The first one can't have an op
217 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
218 my $ea = each_array( @$operands, @$operators, @index_params );
219 while ( my ( $oand, $otor, $index ) = $ea->() ) {
220 next if ( !defined($oand) || $oand eq '' );
221 $oand = $self->_clean_search_term($oand);
222 $oand = $self->_truncate_terms($oand) if ($truncate);
223 push @search_params, {
224 operand => $oand, # the search terms
225 operator => defined($otor) ? uc $otor : undef, # AND and so on
226 $index ? %$index : (),
230 # We build a string query from limits and the queries. An alternative
231 # would be to pass them separately into build_query and let it build
232 # them into a structured ES query itself. Maybe later, though that'd be
234 my $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
235 my $query_str = join( ' AND ',
236 $search_param_query_str || (),
237 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
239 # If there's no query on the left, let's remove the junk left behind
240 $query_str =~ s/^ AND //;
242 $options{sort} = \@sort_params;
243 $options{is_opac} = $params->{is_opac};
244 $options{weighted_fields} = $params->{weighted_fields};
245 $options{whole_record} = $params->{whole_record};
246 my $query = $self->build_query( $query_str, %options );
248 # We roughly emulate the CGI parameters of the zebra query builder
250 shift @$operators; # Shift out the one we unshifted before
251 $ea = each_array( @$operands, @$operators, @$indexes );
252 while ( my ( $oand, $otor, $index ) = $ea->() ) {
253 $query_cgi .= '&' if $query_cgi;
254 $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
255 $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
257 $query_cgi .= '&scan=1' if ( $scan );
260 $simple_query = $operands->[0] if @$operands == 1;
262 if ( $simple_query ) {
263 $query_desc = $simple_query;
265 $query_desc = $search_param_query_str;
267 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
268 my $limit_cgi = ( $orig_limits and @$orig_limits )
269 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
272 $limit_desc = "$limit" if $limit;
275 undef, $query, $simple_query, $query_cgi, $query_desc,
276 $limit, $limit_cgi, $limit_desc, undef, undef
280 =head2 build_authorities_query
282 my $query = $builder->build_authorities_query(\%search);
284 This takes a nice description of an authority search and turns it into a black-box
285 query that can then be passed to the appropriate searcher.
287 The search description is a hashref that looks something like:
292 where => 'Heading', # search the main entry
293 operator => 'exact', # require an exact match
294 value => 'frogs', # the search string
297 where => '', # search all entries
298 operator => '', # default keyword, right truncation
306 authtypecode => 'TOPIC_TERM',
311 sub build_authorities_query {
312 my ( $self, $search ) = @_;
314 # Start by making the query parts
317 foreach my $s ( @{ $search->{searches} } ) {
318 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
319 if ( $op eq 'is' || $op eq '=' || $op eq 'exact') {
321 # Match the whole field, case insensitive, UTF normalized.
322 push @query_parts, { term => { "$wh.ci_raw" => $val } };
325 # Match the whole field for all searchable fields, case insensitive,
327 # Given that field data is "The quick brown fox"
328 # "The quick brown fox" and "the quick brown fox" will match
329 # but not "quick brown fox".
333 fields => $self->_search_fields({ subfield => 'ci_raw' }),
338 elsif ( $op eq 'start') {
339 # Match the prefix within a field for all searchable fields.
340 # Given that field data is "The quick brown fox"
341 # "The quick bro" will match, but not "quick bro"
343 # There does not seem to be a multi-prefix query,
344 # so we need to create one
346 # Match prefix of the field.
347 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
351 foreach my $field (@{$self->_search_fields()}) {
352 push @prefix_queries, {
353 prefix => { "$field.ci_raw" => $val }
358 'should' => \@prefix_queries,
359 'minimum_should_match' => 1
365 # Query all searchable fields.
366 # Given that field data is "The quick brown fox"
367 # a search containing any of the words will match, regardless
370 my @tokens = $self->_split_query( $val );
371 foreach my $token ( @tokens ) {
372 $token = $self->_truncate_terms(
373 $self->_clean_search_term( $token )
376 my $query = $self->_join_queries( @tokens );
379 push @query_parts, { query_string => {
380 default_field => $wh,
381 analyze_wildcard => JSON::true,
388 analyze_wildcard => JSON::true,
390 fields => $self->_search_fields(),
397 # Merge the query parts appropriately
398 # 'should' behaves like 'or'
399 # 'must' behaves like 'and'
400 # Zebra's behaviour seems to match 'must', so that is what is used here
401 my $elastic_query = {};
402 $elastic_query->{bool}->{must} = \@query_parts;
404 # Filter by authtypecode if set
405 if ($search->{authtypecode}) {
406 $elastic_query->{bool}->{filter} = {
408 "authtype.raw" => $search->{authtypecode}
414 query => $elastic_query
418 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
423 =head2 build_authorities_query_compat
426 $builder->build_authorities_query_compat( \@marclist, \@and_or,
427 \@excluding, \@operator, \@value, $authtypecode, $orderby );
429 This builds a query for searching for authorities, in the style of
430 L<C4::AuthoritiesMarc::SearchAuthorities>.
438 An arrayref containing where the particular term should be searched for.
439 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
440 thesaurus. If left blank, any field is used.
444 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
452 What form of search to do. Options are: is (phrase, no truncation, whole field
453 must match), = (number exact match), exact (phrase, no truncation, whole field
454 must match). If left blank, then word list, right truncated, anywhere is used.
458 The actual user-provided string value to search for.
462 The authority type code to search within. If blank, then all will be searched.
466 The order to sort the results by. Options are Relevance, HeadingAsc,
467 HeadingDsc, AuthidAsc, AuthidDsc.
471 marclist, operator, and value must be the same length, and the values at
472 index /i/ all relate to each other.
474 This returns a query, which is a black box object that can be passed to the
475 appropriate search object.
479 our $koha_to_index_name = {
480 mainmainentry => 'heading-main',
481 mainentry => 'heading',
483 'match-heading' => 'match-heading',
484 'see-from' => 'match-heading-see-from',
485 thesaurus => 'subject-heading-thesaurus',
490 sub build_authorities_query_compat {
491 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
492 $authtypecode, $orderby )
495 # This turns the old-style many-options argument form into a more
496 # extensible hash form that is understood by L<build_authorities_query>.
499 # Convert to lower case
500 $marclist = [map(lc, @{$marclist})];
501 $orderby = lc $orderby;
503 # Make sure everything exists
504 foreach my $m (@$marclist) {
505 Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
506 unless exists $koha_to_index_name->{$m};
508 for ( my $i = 0 ; $i < @$value ; $i++ ) {
509 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
512 where => $koha_to_index_name->{$marclist->[$i]},
513 operator => $operator->[$i],
514 value => $value->[$i],
520 ( $orderby =~ /^heading/ ) ? 'heading__sort'
521 : ( $orderby =~ /^auth/ ) ? 'local-number__sort'
524 my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
525 %sort = ( $sort_field => $sort_order, );
528 searches => \@searches,
529 authtypecode => $authtypecode,
531 $search{sort} = \%sort if %sort;
532 my $query = $self->build_authorities_query( \%search );
536 =head2 _convert_sort_fields
538 my @sort_params = _convert_sort_fields(@sort_by)
540 Converts the zebra-style sort index information into elasticsearch-style.
542 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
543 something that can be sent to L<build_query>.
547 sub _convert_sort_fields {
548 my ( $self, @sort_by ) = @_;
550 # Turn the sorting into something we care about.
551 my %sort_field_convert = (
552 acqdate => 'date-of-acquisition',
554 call_number => 'local-classification',
555 popularity => 'issues',
556 relevance => undef, # default
558 pubdate => 'date-of-publication',
560 my %sort_order_convert =
561 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
563 # Convert the fields and orders, drop anything we don't know about.
564 grep { $_->{field} } map {
565 my ( $f, $d ) = /(.+)_(.+)/;
567 field => $sort_field_convert{$f},
568 direction => $sort_order_convert{$d}
573 =head2 _convert_index_fields
575 my @index_params = $self->_convert_index_fields(@indexes);
577 Converts zebra-style search index notation into elasticsearch-style.
579 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
580 and it returns something that can be sent to L<build_query>.
582 B<TODO>: this will pull from the elasticsearch mappings table to figure out
587 our %index_field_convert = (
591 'lcn' => 'local-classification',
592 'callnum' => 'local-classification',
593 'record-type' => 'rtype',
594 'mc-rtype' => 'rtype',
596 'lc-card' => 'lc-card-number',
597 'sn' => 'local-number',
598 'yr' => 'date-of-publication',
599 'pubdate' => 'date-of-publication',
600 'acqdate' => 'date-of-acquisition',
601 'date/time-last-modified' => 'date-time-last-modified',
602 'dtlm' => 'date-time-last-modified',
603 'diss' => 'dissertation-information',
606 'music-number' => 'identifier-publisher-for-music',
607 'number-music-publisher' => 'identifier-publisher-for-music',
608 'music' => 'identifier-publisher-for-music',
609 'ident' => 'identifier-standard',
610 'cpn' => 'corporate-name',
611 'cfn' => 'conference-name',
612 'pn' => 'personal-name',
617 'rcn' => 'record-control-number',
619 'su-to' => 'subject',
620 #'su-geo' => 'subject',
621 'su-ut' => 'subject',
623 'se' => 'title-series',
624 'ut' => 'title-uniform',
625 'an' => 'koha-auth-number',
626 'authority-number' => 'koha-auth-number',
629 'rank' => 'relevance',
630 'phr' => 'st-phrase',
631 'wrdl' => 'st-word-list',
632 'rt' => 'right-truncation',
633 'rtrn' => 'right-truncation',
634 'ltrn' => 'left-truncation',
635 'rltrn' => 'left-and-right',
636 'mc-itemtype' => 'itemtype',
637 'mc-ccode' => 'ccode',
638 'branch' => 'homebranch',
639 'mc-loc' => 'location',
640 'stocknumber' => 'number-local-acquisition',
641 'inv' => 'number-local-acquisition',
643 'mc-itype' => 'itype',
644 'aub' => 'author-personal-bibliography',
645 'auo' => 'author-in-order',
649 'frequency-code' => 'ff8-18',
650 'illustration-code' => 'ff8-18-21',
651 'regularity-code' => 'ff8-19',
652 'type-of-serial' => 'ff8-21',
653 'format' => 'ff8-23',
654 'conference-code' => 'ff8-29',
655 'festschrift-indicator' => 'ff8-30',
656 'index-indicator' => 'ff8-31',
659 'literature-code' => 'lf',
660 'biography' => 'bio',
662 'biography-code' => 'bio',
663 'l-format' => 'ff7-01-02',
664 'lex' => 'lexile-number',
665 'hi' => 'host-item-number',
666 'itu' => 'index-term-uncontrolled',
667 'itg' => 'index-term-genre',
669 my $field_name_pattern = '[\w\-]+';
670 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
672 sub _convert_index_fields {
673 my ( $self, @indexes ) = @_;
675 my %index_type_convert =
676 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
678 # Convert according to our table, drop anything that doesn't convert.
679 # If a field starts with mc- we save it as it's used (and removed) later
680 # when joining things, to indicate we make it an 'OR' join.
681 # (Sorry, this got a bit ugly after special cases were found.)
683 # Lower case all field names
684 my ( $f, $t ) = map(lc, split /,/);
691 field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
692 type => $index_type_convert{ $t // '__default' }
694 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
695 $r->{field} ? $r : undef;
699 =head2 _convert_index_strings
701 my @searches = $self->_convert_index_strings(@searches);
703 Similar to L<_convert_index_fields>, this takes strings of the form
704 B<field:search term> and rewrites the field from zebra-style to
705 elasticsearch-style. Anything it doesn't understand is returned verbatim.
709 sub _convert_index_strings {
710 my ( $self, @searches ) = @_;
712 foreach my $s (@searches) {
714 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
715 unless ( defined($field) && defined($term) ) {
719 my ($conv) = $self->_convert_index_fields($field);
720 unless ( defined($conv) ) {
724 push @res, ($conv->{field} ? $conv->{field} . ':' : '')
725 . $self->_modify_string_by_type( %$conv, operand => $term );
730 =head2 _convert_index_strings_freeform
732 my $search = $self->_convert_index_strings_freeform($search);
734 This is similar to L<_convert_index_strings>, however it'll search out the
735 things to change within the string. So it can handle strings such as
736 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
738 If there is something of the form "su,complete-subfield" or something, the
739 second part is stripped off as we can't yet handle that. Making it work
740 will have to wait for a real query parser.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;

    # Nothing to rewrite; hand undef straight back instead of warning on it.
    return $search unless defined $search;

    # Field prefixes are only meaningful outside of double-quoted phrases, so
    # the query is cut into alternating unquoted / quoted segments and only the
    # unquoted ones are rewritten. This keeps
    # `searching for "stuff cn:123"` intact instead of turning it into
    # `searching for "stuff local-number:123"`, while a prefix that sits in
    # front of a phrase (`su:"foo bar"`) is still converted because the prefix
    # itself lies outside the quotes.
    #
    # Because the split pattern contains a capturing group, the quoted
    # segments are returned too, always at the odd indexes of the list.
    #
    # Limitations: escaped quotes within quotes are not recognised, and the
    # text following an unbalanced quote is treated as unquoted. A real query
    # parser would be needed to handle those cases.
    my @segments = split /("[^"]*")/, $search;

    for my $i ( grep { $_ % 2 == 0 } 0 .. $#segments ) {

        # Lower case field names, dropping any ",qualifier" part (such as
        # "su,complete-subfield") as that can't be handled yet
        $segments[$i] =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/g;

        # Resolve possible field aliases
        $segments[$i] =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/ge;
    }

    return join '', @segments;
}
764 =head2 _modify_string_by_type
766 my $str = $self->_modify_string_by_type(%index_field);
768 If you have a search term (operand) and a type (phrase, right-truncate, st-year),
769 this will rewrite the term so that it has that function in Lucene query syntax,
770 e.g. by wrapping quotes around it.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    # %idx carries the search term in 'operand' and, optionally, how it should
    # be applied in 'type' ('right-truncate', 'phrase' or 'st-year'). Any other
    # type leaves the term untouched.
    my $type = $idx{type} // '';
    my $str  = $idx{operand};

    # Empty or undef, we can't use it. Definedness and length are tested
    # rather than truth so that a literal '0' still counts as a search term.
    return $str unless defined $str && length $str;

    if ( $type eq 'right-truncate' ) {
        $str .= '*';
    }
    elsif ( $type eq 'phrase' ) {
        $str = '"' . $str . '"';
    }
    elsif ( $type eq 'st-year' ) {

        # "from-until" becomes a range query; a missing end of the range is
        # left open with '*'. A value without a dash is used as it is.
        if ( $str =~ /^(.*)-(.*)$/ ) {
            my ( $from, $until ) = ( $1, $2 );
            $from  = '*' unless length $from;
            $until = '*' unless length $until;
            $str   = "[$from TO $until]";
        }
    }

    return $str;
}
795 my $query_str = $self->_join_queries(@query_parts);
797 This takes a list of query parts, that might be search terms on their own, or
798 booleaned together, or specifying fields, or whatever, wraps them in
799 parentheses, and ANDs them all together. Suitable for feeding to the ES
802 Note: doesn't AND them together if they specify an index that starts with "mc"
803 as that was a special case in the original code for dealing with multiple
804 choice options (you can't search for something that has an itype of A and
805 an itype of B otherwise.)
810 my ( $self, @parts ) = @_;
812 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
814 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
815 return () unless @norm_parts + @mc_parts;
816 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
818 @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
820 # Handy trick: $x || () inside a join means that if $x ends up as an
821 # empty string, it gets replaced with (), which makes join ignore it.
822 # (bad effect: this'll also happen to '0', this hopefully doesn't matter
825 join( ' AND ', map { "($_)" } @norm_parts ) || (),
831 my @phrased_queries = $self->_make_phrases(@query_parts);
833 This takes the supplied queries and forces them to be phrases by wrapping
834 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
835 the quotes outside of them if they're there.
840 my ( $self, @parts ) = @_;
841 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
844 =head2 _create_query_string
846 my @query_strings = $self->_create_query_string(@queries);
848 Given a list of hashrefs, it will turn them into a lucene-style query string.
849 The hash should contain field, type (both for the indexes), operator, and
854 sub _create_query_string {
855 my ( $self, @queries ) = @_;
858 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
859 my $field = $_->{field} ? $_->{field} . ':' : '';
861 my $oand = $self->_modify_string_by_type(%$_);
862 $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
863 "$otor($field$oand)";
867 =head2 _clean_search_term
869 my $term = $self->_clean_search_term($term);
871 This cleans a search term by removing any funny characters that may upset
872 ES and give us an error. It also calls L<_convert_index_strings_freeform>
873 to ensure those parts are correct.
877 sub _clean_search_term {
878 my ( $self, $term ) = @_;
880 # Lookahead for checking if we are inside quotes
881 my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
883 # Some hardcoded searches (like with authorities) produce things like
884 # 'an=123', when it ought to be 'an:123' for our purposes.
887 $term = $self->_convert_index_strings_freeform($term);
890 # Remove unbalanced quotes
891 my $unquoted = $term;
892 my $count = ($unquoted =~ tr/"/ /);
893 if ($count % 2 == 1) {
897 # Remove unquoted colons that have whitespace on either side of them
898 $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
900 $term = $self->_query_regex_escape_process($term);
905 =head2 _query_regex_escape_process
907 my $query = $self->_query_regex_escape_process($query);
909 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
sub _query_regex_escape_process {
    my ( $self, $query ) = @_;

    # Nothing to escape in an undefined query.
    return $query unless defined $query;

    # An unset preference behaves like 'dont_escape' (the query is returned
    # untouched); defaulting it here avoids comparing against undef.
    my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions") // 'dont_escape';

    if ( $regex_escape_options eq 'escape' ) {

        # Will escape unescaped slashes (/) while preserving
        # unescaped slashes within quotes
        # @TODO: assumes quotes are always balanced and will
        # not handle escaped quotes properly, should perhaps be
        # replaced with a more general parser solution
        # so that this function is only ever provided with unquoted
        # content
        $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
    }
    elsif ( $regex_escape_options eq 'unescape_escaped' ) {

        # Will unescape escaped slashes (\/) and escape
        # unescaped slashes (/) while preserving slashes within quotes
        # The same limitations as above apply for handling of quotes
        $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
    }

    # Any other setting, including 'dont_escape', leaves the query as it is.
    return $query;
}
937 =head2 _fix_limit_special_cases
939 my $limits = $self->_fix_limit_special_cases($limits);
941 This converts any special cases that the limit specifications have into things
942 that are more readily processable by the rest of the code.
944 The argument should be an arrayref, and it'll return an arrayref.
948 sub _fix_limit_special_cases {
949 my ( $self, $limits ) = @_;
952 foreach my $l (@$limits) {
954 # This is set up by opac-search.pl
955 if ( $l =~ /^yr,st-numeric,ge=/ ) {
956 my ( $start, $end ) =
957 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
958 next unless defined($start) && defined($end);
959 push @new_lim, "copydate:[$start TO $end]";
961 elsif ( $l =~ /^yr,st-numeric=/ ) {
962 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
963 next unless defined($date);
964 $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
965 push @new_lim, "copydate:$date";
967 elsif ( $l =~ /^available$/ ) {
968 push @new_lim, 'onloan:false';
979 my $field = $self->_sort_field($field);
981 Given a field name, this works out what the actual name of the field to sort
982 on should be. A '__sort' suffix is added for fields with a sort version, and
983 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
984 to avoid sorting on a tokenized value.
991 my $mappings = $self->get_elasticsearch_mappings();
992 my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
993 if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
995 # We need to add '.phrase' to text fields, otherwise it'll sort
996 # based on the tokenised form.
997 $f .= '.phrase' if $textField;
999 # We need to add '.raw' to text fields without a sort field,
1000 # otherwise it'll sort based on the tokenised form.
1001 $f .= '.raw' if $textField;
1006 =head2 _truncate_terms
1008 my $query = $self->_truncate_terms($query);
1010 Given a string query this function appends the '*' wildcard to all terms except
1011 boolean operators (and, or, not) and terms that end in a non-word character, such as double quoted strings.
1015 sub _truncate_terms {
1016 my ( $self, $query ) = @_;
1018 my @tokens = $self->_split_query( $query );
1020 # Filter out empty tokens
1021 my @words = grep { $_ !~ /^\s*$/ } @tokens;
1023 # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
1026 (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
1029 return join ' ', @terms;
1034 my @tokens = $self->_split_query($query_str);
1036 Given a string query this function splits it to tokens taking into account
1037 any field prefixes and quoted strings.
1041 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
1044 my ( $self, $query ) = @_;
1046 # '"donald duck" title:"the mouse" and peter' gets split into
1047 # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
1048 my @tokens = split $tokenize_split_re, $query;
1050 # Filter out empty values
1051 @tokens = grep( /\S/, @tokens );
1056 =head2 _search_fields
1057 my $weighted_fields = $self->_search_fields({
1059 weighted_fields => 1,
1063 Generate a list of searchable fields to be used for Elasticsearch queries
1064 applied to multiple fields.
1066 Returns an arrayref of field names for either OPAC or Staff client, with
1067 possible weights and subfield appended to each field name depending on the
1074 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1075 fields for OPAC or Staff client should be retrieved. If C<weighted_fields> is set,
1076 field weights will be applied to the returned fields. C<subfield> can be used to
1077 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
1083 sub _search_fields {
1084 my ($self, $params) = @_;
1087 weighted_fields => 0,
1089 # This is a hack for authorities build_authorities_query
1090 # can hopefully be removed in the future
1093 my $cache = Koha::Caches->get_instance();
1094 my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client');
1095 my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
1096 if (!$search_fields) {
1097 # The reason we don't use Koha::SearchFields->search here is we don't
1098 # want or need resultset wrapped as Koha::SearchField object.
1099 # It does not make any sense in this context and would cause
1100 # unnecessary overhead since we are only querying for data
1101 # Also would not work, or produce strange results, with the "columns"
1103 my $schema = Koha::Database->schema;
1104 my $result = $schema->resultset('SearchField')->search(
1106 $params->{is_opac} ? (
1111 'type' => { '!=' => 'boolean' },
1112 'search_marc_map.index_name' => $self->index,
1113 'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
1114 'search_marc_to_fields.search' => 1,
1117 columns => [qw/name weight/],
1119 join => {search_marc_to_fields => 'search_marc_map'},
1123 while (my $search_field = $result->next) {
1124 push @search_fields, [
1125 $search_field->name,
1126 $search_field->weight ? $search_field->weight : ()
1129 $search_fields = \@search_fields;
1130 $cache->set_in_cache($cache_key, $search_fields);
1132 if ($params->{subfield}) {
1133 my $subfield = $params->{subfield};
1136 # Copy values to avoid mutating cached
1137 # data (since unsafe is used)
1138 my ($field, $weight) = @{$_};
1139 ["${field}.${subfield}", $weight];
1143 if ($params->{weighted_fields}) {
1144 return [map { join('^', @{$_}) } @{$search_fields}];
1147 # Exclude weight from field
1148 return [map { $_->[0] } @{$search_fields}];