1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
27 This provides the functions that take a user-supplied search query, and
28 provides something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
33 $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
45 use List::MoreUtils qw/ each_array /;
55 my $simple_query = $builder->build_query("hello", %options)
57 This will build a query that can be issued to elasticsearch from the provided
58 string input. This expects a lucene style search form (see
59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
62 It'll make an attempt to respect the various query options.
64 Additional options can be provided with the C<%options> hash.
70 This should be an arrayref of hashrefs, each containing a C<field> and an
71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
79 my ( $self, $query, %options ) = @_;
81 my $stemming = C4::Context->preference("QueryStemming") || 0;
82 my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
83 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
85 $query = '*' unless defined $query;
88 my $fields = $self->_search_fields({
89 is_opac => $options{is_opac},
90 weighted_fields => $options{weighted_fields},
92 if ($options{whole_record}) {
93 push @$fields, 'marc_data_array.*';
98 fuzziness => $fuzzy_enabled ? 'auto' : '0',
99 default_operator => 'AND',
101 lenient => JSON::true,
102 analyze_wildcard => JSON::true,
106 if ( $options{sort} ) {
107 foreach my $sort ( @{ $options{sort} } ) {
108 my ( $f, $d ) = @$sort{qw/ field direction /};
109 die "Invalid sort direction, $d"
110 if $d && ( $d ne 'asc' && $d ne 'desc' );
111 $d = 'asc' unless $d;
113 $f = $self->_sort_field($f);
114 push @{ $res->{sort} }, { $f => { order => $d } };
118 # See _convert_facets in Search.pm for how these get turned into
119 # things that Koha can use.
120 my $size = C4::Context->preference('FacetMaxCount');
121 $res->{aggregations} = {
122 author => { terms => { field => "author__facet" , size => $size } },
123 subject => { terms => { field => "subject__facet", size => $size } },
124 itype => { terms => { field => "itype__facet", size => $size} },
125 location => { terms => { field => "location__facet", size => $size } },
126 'su-geo' => { terms => { field => "su-geo__facet", size => $size} },
127 'title-series' => { terms => { field => "title-series__facet", size => $size } },
128 ccode => { terms => { field => "ccode__facet", size => $size } },
129 ln => { terms => { field => "ln__facet", size => $size } },
132 my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
133 if ( $display_library_facets eq 'both'
134 or $display_library_facets eq 'home' ) {
135 $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
137 if ( $display_library_facets eq 'both'
138 or $display_library_facets eq 'holding' ) {
139 $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
144 =head2 build_query_compat
147 $error, $query, $simple_query, $query_cgi,
148 $query_desc, $limit, $limit_cgi, $limit_desc,
149 $stopwords_removed, $query_type
151 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
152 \@limits, \@sort_by, $scan, $lang, $params );
154 This handles a search using the same api as L<C4::Search::buildQuery> does.
156 A very simple query will go in with C<$operands> set to ['query'], and
157 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
158 C<$query> set to something that can perform the search, C<$simple_query>
159 set to just the search term, C<$query_cgi> set to something that can
160 reproduce this search, and C<$query_desc> set to something else.
164 sub build_query_compat {
165 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
171 my $search_param_query_str = '';
174 ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
175 $search_param_query_str = $query_str;
177 my @sort_params = $self->_convert_sort_fields(@$sort_by);
178 my @index_params = $self->_convert_index_fields(@$indexes);
179 my $limits = $self->_fix_limit_special_cases($orig_limits);
180 if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
181 # Merge the indexes in with the search terms and the operands so that
182 # each search thing is a handy unit.
183 unshift @$operators, undef; # The first one can't have an op
185 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
186 my $ea = each_array( @$operands, @$operators, @index_params );
187 while ( my ( $oand, $otor, $index ) = $ea->() ) {
188 next if ( !defined($oand) || $oand eq '' );
189 $oand = $self->_clean_search_term($oand);
190 $oand = $self->_truncate_terms($oand) if ($truncate);
191 push @search_params, {
192 operand => $oand, # the search terms
193 operator => defined($otor) ? uc $otor : undef, # AND and so on
194 $index ? %$index : (),
198 # We build a string query from limits and the queries. An alternative
199 # would be to pass them separately into build_query and let it build
200 # them into a structured ES query itself. Maybe later, though that'd be
202 $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
203 $query_str = join( ' AND ',
204 $search_param_query_str || (),
205 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
207 # If there's no query on the left, let's remove the junk left behind
208 $query_str =~ s/^ AND //;
210 $options{sort} = \@sort_params;
211 $options{is_opac} = $params->{is_opac};
212 $options{weighted_fields} = $params->{weighted_fields};
213 $options{whole_record} = $params->{whole_record};
214 $query = $self->build_query( $query_str, %options );
217 # We roughly emulate the CGI parameters of the zebra query builder
219 shift @$operators; # Shift out the one we unshifted before
220 my $ea = each_array( @$operands, @$operators, @$indexes );
221 while ( my ( $oand, $otor, $index ) = $ea->() ) {
222 $query_cgi .= '&' if $query_cgi;
223 $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
224 $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
226 $query_cgi .= '&scan=1' if ( $scan );
229 $simple_query = $operands->[0] if @$operands == 1;
231 if ( $simple_query ) {
232 $query_desc = $simple_query;
234 $query_desc = $search_param_query_str;
236 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
237 my $limit_cgi = ( $orig_limits and @$orig_limits )
238 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
241 $limit_desc = "$limit" if $limit;
244 undef, $query, $simple_query, $query_cgi, $query_desc,
245 $limit, $limit_cgi, $limit_desc, undef, undef
249 =head2 build_authorities_query
251 my $query = $builder->build_authorities_query(\%search);
253 This takes a nice description of an authority search and turns it into a black-box
254 query that can then be passed to the appropriate searcher.
256 The search description is a hashref that looks something like:
261 where => 'Heading', # search the main entry
262 operator => 'exact', # require an exact match
263 value => 'frogs', # the search string
266 where => '', # search all entries
267 operator => '', # default keyword, right truncation
275 authtypecode => 'TOPIC_TERM',
280 sub build_authorities_query {
281 my ( $self, $search ) = @_;
283 # Start by making the query parts
286 foreach my $s ( @{ $search->{searches} } ) {
287 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
288 if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
290 # Match the whole field, case insensitive, UTF normalized.
291 push @query_parts, { term => { "$wh.ci_raw" => $val } };
294 # Match the whole field for all searchable fields, case insensitive,
296 # Given that field data is "The quick brown fox"
297 # "The quick brown fox" and "the quick brown fox" will match
298 # but not "quick brown fox".
302 fields => $self->_search_fields({ subfield => 'ci_raw' }),
307 elsif ( defined $op && $op eq 'start') {
308 # Match the prefix within a field for all searchable fields.
309 # Given that field data is "The quick brown fox"
310 # "The quick bro" will match, but not "quick bro"
312 # There does not seem to be a multi-prefix query,
313 # so we need to create one
315 # Match prefix of the field.
316 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
320 foreach my $field (@{$self->_search_fields()}) {
321 push @prefix_queries, {
322 prefix => { "$field.ci_raw" => $val }
327 'should' => \@prefix_queries,
328 'minimum_should_match' => 1
334 # Query all searchable fields.
335 # Given that field data is "The quick brown fox"
336 # a search containing any of the words will match, regardless
339 my @tokens = $self->_split_query( $val );
340 foreach my $token ( @tokens ) {
341 $token = $self->_truncate_terms(
342 $self->_clean_search_term( $token )
345 my $query = $self->_join_queries( @tokens );
348 push @query_parts, { query_string => {
349 default_field => $wh,
350 analyze_wildcard => JSON::true,
357 analyze_wildcard => JSON::true,
359 fields => $self->_search_fields(),
366 # Merge the query parts appropriately
367 # 'should' behaves like 'or'
368 # 'must' behaves like 'and'
369 # Zebra behaviour seems to match 'must', so that is what is used here
370 my $elastic_query = {};
371 $elastic_query->{bool}->{must} = \@query_parts;
373 # Filter by authtypecode if set
374 if ($search->{authtypecode}) {
375 $elastic_query->{bool}->{filter} = {
377 "authtype.raw" => $search->{authtypecode}
383 query => $elastic_query
387 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
392 =head2 build_authorities_query_compat
395 $builder->build_authorities_query_compat( \@marclist, \@and_or,
396 \@excluding, \@operator, \@value, $authtypecode, $orderby );
398 This builds a query for searching for authorities, in the style of
399 L<C4::AuthoritiesMarc::SearchAuthorities>.
407 An arrayref containing where the particular term should be searched for.
408 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
409 thesaurus. If left blank, any field is used.
413 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
421 What form of search to do. Options are: is (phrase, no truncation, whole field
422 must match), = (number exact match), exact (phrase, no truncation, whole field
423 must match). If left blank, then word list, right truncated, anywhere is used.
427 The actual user-provided string value to search for.
431 The authority type code to search within. If blank, then all will be searched.
435 The order to sort the results by. Options are Relevance, HeadingAsc,
436 HeadingDsc, AuthidAsc, AuthidDsc.
440 marclist, operator, and value must be the same length, and the values at
441 index /i/ all relate to each other.
443 This returns a query, which is a black box object that can be passed to the
444 appropriate search object.
448 our $koha_to_index_name = {
449 mainmainentry => 'heading-main',
450 mainentry => 'heading',
452 'match-heading' => 'match-heading',
453 'see-from' => 'match-heading-see-from',
454 thesaurus => 'subject-heading-thesaurus',
459 sub build_authorities_query_compat {
460 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
461 $authtypecode, $orderby )
464 # This turns the old-style many-options argument form into a more
465 # extensible hash form that is understood by L<build_authorities_query>.
467 my $mappings = $self->get_elasticsearch_mappings();
469 # Convert to lower case
470 $marclist = [map(lc, @{$marclist})];
471 $orderby = lc $orderby;
474 # Make sure everything exists
475 foreach my $m (@$marclist) {
477 $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
479 warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '');
481 for ( my $i = 0 ; $i < @$value ; $i++ ) {
482 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
485 where => $indexes[$i],
486 operator => $operator->[$i],
487 value => $value->[$i],
493 ( $orderby =~ /^heading/ ) ? 'heading__sort'
494 : ( $orderby =~ /^auth/ ) ? 'local-number__sort'
497 my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
498 %sort = ( $sort_field => $sort_order, );
501 searches => \@searches,
502 authtypecode => $authtypecode,
504 $search{sort} = \%sort if %sort;
505 my $query = $self->build_authorities_query( \%search );
509 =head2 _build_scan_query
511 my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
513 This will build an aggregation scan query that can be issued to elasticsearch from
514 the provided string input.
518 our %scan_field_convert = (
522 'se' => 'title-series',
# Build the aggregation ("scan") request for a single operand/index pair.
# $operands and $indexes are arrayrefs; only element 0 of each is read.
# An empty operand list gives the term '' and an empty index list falls
# back to the 'subject' index. Returns the list ($res, $term).
526 sub _build_scan_query {
527 my ( $self, $operands, $indexes ) = @_;
529 my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
530 my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
# Keep only the part of the index name before the first comma, then map
# it through %scan_field_convert; names without an entry pass through.
532 my ( $f, $d ) = split( /,/, $index);
533 $index = $scan_field_convert{$f} || $f;
541 $res->{aggregations} = {
# Aggregate on the "__facet" variant of the field in ascending term order,
# keeping only values matched by the filter built from the cleaned term
# followed by '.*'.
544 field => $index . '__facet',
545 order => { '_term' => 'asc' },
546 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
550 return ($res, $term);
553 =head2 _create_regex_filter
555 my $filter = $builder->_create_regex_filter('term')
557 This will create a regex filter that can be used with an aggregation query.
561 sub _create_regex_filter {
562 my ($self, $term) = @_;
565 foreach my $c (split(//, quotemeta($term))) {
568 $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
573 =head2 _convert_sort_fields
575 my @sort_params = _convert_sort_fields(@sort_by)
577 Converts the zebra-style sort index information into elasticsearch-style.
579 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
580 something that can be sent to L<build_query>.
# Translate zebra-style sort keys of the form "<field>_<order>" into
# hashes with 'field' and 'direction' keys, using the two lookup tables
# declared below.
584 sub _convert_sort_fields {
585 my ( $self, @sort_by ) = @_;
587 # Turn the sorting into something we care about.
588 my %sort_field_convert = (
589 acqdate => 'date-of-acquisition',
591 call_number => 'local-classification',
592 popularity => 'issues',
593 relevance => undef, # default
595 pubdate => 'date-of-publication',
597 my %sort_order_convert =
598 ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
600 # Convert the fields and orders, drop anything we don't know about.
# The grep discards every entry whose 'field' is false: names that are
# missing from %sort_field_convert and 'relevance', which maps to undef.
601 grep { $_->{field} } map {
# Both capture groups are greedy, so the split happens at the LAST
# underscore: "call_number_asc" gives field "call_number", order "asc".
602 my ( $f, $d ) = /(.+)_(.+)/;
604 field => $sort_field_convert{$f},
605 direction => $sort_order_convert{$d}
610 =head2 _convert_index_fields
612 my @index_params = $self->_convert_index_fields(@indexes);
614 Converts zebra-style search index notation into elasticsearch-style.
616 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
617 and it returns something that can be sent to L<build_query>.
619 B<TODO>: this will pull from the elasticsearch mappings table to figure out
624 our %index_field_convert = (
628 'lcn' => 'local-classification',
629 'callnum' => 'local-classification',
630 'record-type' => 'rtype',
631 'mc-rtype' => 'rtype',
633 'lc-card' => 'lc-card-number',
634 'sn' => 'local-number',
635 'biblionumber' => 'local-number',
636 'yr' => 'date-of-publication',
637 'pubdate' => 'date-of-publication',
638 'acqdate' => 'date-of-acquisition',
639 'date/time-last-modified' => 'date-time-last-modified',
640 'dtlm' => 'date-time-last-modified',
641 'diss' => 'dissertation-information',
644 'music-number' => 'identifier-publisher-for-music',
645 'number-music-publisher' => 'identifier-publisher-for-music',
646 'music' => 'identifier-publisher-for-music',
647 'ident' => 'identifier-standard',
648 'cpn' => 'corporate-name',
649 'cfn' => 'conference-name',
650 'pn' => 'personal-name',
655 'rcn' => 'record-control-number',
657 'su-to' => 'subject',
658 #'su-geo' => 'subject',
659 'su-ut' => 'subject',
661 'se' => 'title-series',
662 'ut' => 'title-uniform',
663 'an' => 'koha-auth-number',
664 'authority-number' => 'koha-auth-number',
667 'rank' => 'relevance',
668 'phr' => 'st-phrase',
669 'wrdl' => 'st-word-list',
670 'rt' => 'right-truncation',
671 'rtrn' => 'right-truncation',
672 'ltrn' => 'left-truncation',
673 'rltrn' => 'left-and-right',
674 'mc-itemtype' => 'itemtype',
675 'mc-ccode' => 'ccode',
676 'branch' => 'homebranch',
677 'mc-loc' => 'location',
679 'stocknumber' => 'number-local-acquisition',
680 'inv' => 'number-local-acquisition',
682 'mc-itype' => 'itype',
683 'aub' => 'author-personal-bibliography',
684 'auo' => 'author-in-order',
688 'frequency-code' => 'ff8-18',
689 'illustration-code' => 'ff8-18-21',
690 'regularity-code' => 'ff8-19',
691 'type-of-serial' => 'ff8-21',
692 'format' => 'ff8-23',
693 'conference-code' => 'ff8-29',
694 'festschrift-indicator' => 'ff8-30',
695 'index-indicator' => 'ff8-31',
698 'literature-code' => 'lf',
699 'biography' => 'bio',
701 'biography-code' => 'bio',
702 'l-format' => 'ff7-01-02',
703 'lex' => 'lexile-number',
704 'hi' => 'host-item-number',
705 'itu' => 'index-term-uncontrolled',
706 'itg' => 'index-term-genre',
708 my $field_name_pattern = '[\w\-]+';
709 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
711 sub _convert_index_fields {
712 my ( $self, @indexes ) = @_;
714 my %index_type_convert =
715 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
717 # Convert according to our table, drop anything that doesn't convert.
718 # If a field starts with mc- we save it as it's used (and removed) later
719 # when joining things, to indicate we make it an 'OR' join.
720 # (Sorry, this got a bit ugly after special cases were found.)
722 # Lower case all field names
723 my ( $f, $t ) = map(lc, split /,/);
730 field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
731 type => $index_type_convert{ $t // '__default' }
733 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
734 $r->{field} ? $r : undef;
738 =head2 _convert_index_strings
740 my @searches = $self->_convert_index_strings(@searches);
742 Similar to L<_convert_index_fields>, this takes strings of the form
743 B<field:search term> and rewrites the field from zebra-style to
744 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrite the field part of each "field:term" string in @searches using
# _convert_index_fields and adjust the term for the resulting index type.
748 sub _convert_index_strings {
749 my ( $self, @searches ) = @_;
751 foreach my $s (@searches) {
# Split at the first colon. The field part may only contain word
# characters, commas and hyphens, and may be empty.
753 my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
754 unless ( defined($field) && defined($term) ) {
# Only the first element returned by _convert_index_fields is used.
758 my ($conv) = $self->_convert_index_fields($field);
759 unless ( defined($conv) ) {
# Emit "<converted field>:<term modified for the index type>"; the field
# prefix is left out when the converted field is empty.
763 push @res, ($conv->{field} ? $conv->{field} . ':' : '')
764 . $self->_modify_string_by_type( %$conv, operand => $term );
769 =head2 _convert_index_strings_freeform
771 my $search = $self->_convert_index_strings_freeform($search);
773 This is similar to L<_convert_index_strings>, however it'll search out the
774 things to change within the string. So it can handle strings such as
775 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
777 If there is something of the form "su,complete-subfield" or something, the
778 second part is stripped off as we can't yet handle that. Making it work
779 will have to wait for a real query parser.
# Rewrite every "field:" prefix found anywhere in $search: the field name
# is lower-cased, a ",qualifier" suffix is dropped, and known aliases are
# replaced using %index_field_convert.
783 sub _convert_index_strings_freeform {
784 my ( $self, $search ) = @_;
785 # @TODO: Currently this will also alter fields contained within quotes:
786 # `searching for "stuff cn:123"` for example will become
787 # `searching for "stuff local-number:123"`
789 # Fixing this is tricky, one possibility:
790 # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
791 # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
793 # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
794 # them back when processing is done.
796 # Lower case field names
# Only the leading field name ($1) is lower-cased. The optional
# ",qualifier" group is matched but not captured, so it is removed, and a
# dotted multi-field suffix ($2) is kept exactly as written.
797 $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
798 # Resolve possible field aliases
# Names without an entry in %index_field_convert are left unchanged.
799 $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
803 =head2 _modify_string_by_type
805 my $str = $self->_modify_string_by_type(%index_field);
807 If you have a search term (operand) and a type (phrase, right-truncated), this
808 will convert the string to have the function in lucene search terms, e.g.
809 wrapping quotes around it.
# Adapt a search operand to its index type.
#   %idx - reads the keys 'type' (optional) and 'operand'.
# Types handled here are 'right-truncate', 'phrase' and 'st-year'; any
# other type leaves the operand as it is.
813 sub _modify_string_by_type {
814 my ( $self, %idx ) = @_;
816 my $type = $idx{type} || '';
817 my $str = $idx{operand};
# NOTE: the string '0' is false as well, so it is also returned untouched.
818 return $str unless $str; # Empty or undef, we can't use it.
# Right truncation: append the wildcard.
820 $str .= '*' if $type eq 'right-truncate';
# Phrase: wrap in double quotes unless the operand is already wrapped.
821 $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
822 if ($type eq 'st-year') {
# "A-B" becomes the range "[A TO B]". The first group is greedy, so the
# split is made at the last hyphen, and an empty (or '0') side becomes '*'.
823 if ($str =~ /^(.*)-(.*)$/) {
824 my $from = $1 || '*';
825 my $until = $2 || '*';
826 $str = "[$from TO $until]";
834 my $query_str = $self->_join_queries(@query_parts);
836 This takes a list of query parts, that might be search terms on their own, or
837 booleaned together, or specifying fields, or whatever, wraps them in
838 parentheses, and ANDs them all together. Suitable for feeding to the ES
841 Note: doesn't AND them together if they specify an index that starts with "mc"
842 as that was a special case in the original code for dealing with multiple
843 choice options (you can't search for something that has an itype of A and
844 an itype of B otherwise.)
849 my ( $self, @parts ) = @_;
851 my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
853 map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
854 return () unless @norm_parts + @mc_parts;
855 return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
857 # Group limits by field, so they can be OR'ed together
859 foreach my $mc_part (@mc_parts) {
860 my ($field, $value) = split /:/, $mc_part, 2;
861 $mc_limits{$field} //= [];
862 push @{ $mc_limits{$field} }, $value;
866 sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
867 } sort keys %mc_limits;
869 @norm_parts = map { "($_)" } @norm_parts;
871 return join( ' AND ', @norm_parts, @mc_parts);
876 my @phrased_queries = $self->_make_phrases(@query_parts);
878 This takes the supplied queries and forces them to be phrases by wrapping
879 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
880 the quotes outside of them if they're there.
885 my ( $self, @parts ) = @_;
886 map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
889 =head2 _create_query_string
891 my @query_strings = $self->_create_query_string(@queries);
893 Given a list of hashrefs, it will turn them into a lucene-style query string.
894 The hash should contain field, type (both for the indexes), operator, and
# Turn each query hashref ($_) into one parenthesised lucene clause of the
# form "OPERATOR (field:operand)". The operator and the field prefix are
# left out when the corresponding hash entry is false.
899 sub _create_query_string {
900 my ( $self, @queries ) = @_;
903 my $otor = $_->{operator} ? $_->{operator} . ' ' : '';
904 my $field = $_->{field} ? $_->{field} . ':' : '';
# The whole hash is passed on, so 'type' and 'operand' are read from it.
906 my $oand = $self->_modify_string_by_type(%$_);
# A fielded operand that contains whitespace gets an extra pair of
# parentheses, presumably so that the field prefix covers every word.
# 'st-year' operands are exempt from this wrapping.
907 $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
908 "$otor($field$oand)";
912 =head2 _clean_search_term
914 my $term = $self->_clean_search_term($term);
916 This cleans a search term by removing any funny characters that may upset
917 ES and give us an error. It also calls L<_convert_index_strings_freeform>
918 to ensure those parts are correct.
# Clean up a user-supplied search term before it is placed in a query
# string: convert index prefixes, check for unbalanced double quotes,
# strip stray colons outside quotes, and apply slash escaping.
922 sub _clean_search_term {
923 my ( $self, $term ) = @_;
925 # Lookahead for checking that we are outside quotes: it succeeds only
# when an even number of double quotes remains before the end of the string.
926 my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
928 # Some hardcoded searches (like with authorities) produce things like
929 # 'an=123', when it ought to be 'an:123' for our purposes.
# Lower-case the field prefixes and resolve zebra-style aliases.
932 $term = $self->_convert_index_strings_freeform($term);
935 # Remove unbalanced quotes
# tr/// returns the number of double quotes it replaced. It runs on a
# copy, so counting does not modify $term itself.
936 my $unquoted = $term;
937 my $count = ($unquoted =~ tr/"/ /);
938 if ($count % 2 == 1) {
942 # Remove unquoted colons that have whitespace on either side of them
943 $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
# Escape or unescape slashes according to the QueryRegexEscapeOptions
# system preference.
945 $term = $self->_query_regex_escape_process($term);
950 =head2 _query_regex_escape_process
952 my $query = $self->_query_regex_escape_process($query);
954 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
# Escape or unescape forward slashes in $query according to the
# "QueryRegexEscapeOptions" system preference. The visible branches handle
# the values 'escape' and 'unescape_escaped'; 'dont_escape' skips both.
958 sub _query_regex_escape_process {
959 my ($self, $query) = @_;
960 my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
# NOTE(review): an unset preference would make these string comparisons
# warn about an undefined value - confirm a default is always present.
961 if ($regex_escape_options ne 'dont_escape') {
962 if ($regex_escape_options eq 'escape') {
963 # Will escape unescaped slashes (/) while preserving
964 # unescaped slashes within quotes
965 # @TODO: assumes quotes are always balanced and will
966 # not handle escaped quotes properly, should perhaps be
967 # replaced with a more general parser solution
968 # so that this function is only ever provided with unquoted
# The trailing negative lookahead rejects positions that are followed by
# an odd number of double quotes, i.e. positions inside a quoted section.
970 $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
972 elsif($regex_escape_options eq 'unescape_escaped') {
973 # Will unescape escaped slashes (\/) and escape
974 # unescaped slashes (/) while preserving slashes within quotes
975 # The same limitations as above apply for handling of quotes
# $1 holds an odd run of backslashes (the slash is escaped), so one
# backslash is removed; otherwise $2 holds an even run (the slash is
# unescaped) and one backslash is appended.
976 $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
982 =head2 _fix_limit_special_cases
984 my $limits = $self->_fix_limit_special_cases($limits);
986 This converts any special cases that the limit specifications have into things
987 that are more readily processable by the rest of the code.
989 The argument should be an arrayref, and it'll return an arrayref.
# Rewrite the limit strings in the arrayref $limits into query-string
# fragments that are collected in @new_lim. Limits that start like a year
# limit but fail to parse are skipped.
993 sub _fix_limit_special_cases {
994 my ( $self, $limits ) = @_;
997 foreach my $l (@$limits) {
999 # This is set up by opac-search.pl
# Year range: "yr,st-numeric,ge=A and yr,st-numeric,le=B" becomes
# "copydate:[A TO B]".
1000 if ( $l =~ /^yr,st-numeric,ge=/ ) {
1001 my ( $start, $end ) =
1002 ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
1003 next unless defined($start) && defined($end);
1004 push @new_lim, "copydate:[$start TO $end]";
# Single year or "from-until" value: formatted with the 'st-year' rules of
# _modify_string_by_type and searched in copydate.
1006 elsif ( $l =~ /^yr,st-numeric=/ ) {
1007 my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1008 next unless defined($date);
1009 $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1010 push @new_lim, "copydate:$date";
# The bare keyword "available" is translated to "onloan:false".
1012 elsif ( $l =~ /^available$/ ) {
1013 push @new_lim, 'onloan:false';
# "field:term" limit: the term is wrapped as ("term").
# NOTE(review): $term is interpolated without escaping embedded double
# quotes - confirm that callers never pass them.
1016 my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1017 if ( defined($field) && defined($term) ) {
1018 push @new_lim, "$field:(\"$term\")";
1030 my $field = $self->_sort_field($field);
1032 Given a field name, this works out what the actual name of the field to sort
1033 on should be. A '__sort' suffix is added for fields with a sort version, and
1034 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
1035 to avoid sorting on a tokenized value.
1040 my ($self, $f) = @_;
1042 my $mappings = $self->get_elasticsearch_mappings();
1043 my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
1044 if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
1047 # We need to add '.raw' to text fields without a sort field,
1048 # otherwise it'll sort based on the tokenised form.
1049 $f .= '.raw' if $textField;
1054 =head2 _truncate_terms
1056 my $query = $self->_truncate_terms($query);
1058 Given a string query this function appends '*' wildcard to all terms except
1059 operators and double quoted strings.
# Append a '*' wildcard to the plain words of $query and return the
# resulting tokens joined with single spaces.
1063 sub _truncate_terms {
1064 my ( $self, $query ) = @_;
# Tokenise with _split_query.
1066 my @tokens = $self->_split_query( $query );
1068 # Filter out empty tokens
1069 my @words = grep { $_ !~ /^\s*$/ } @tokens;
1071 # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
# Tokens ending in a non-word character (a closing quote, an existing '*',
# a parenthesis, ...) and the keywords and/or/not (compared in lower case)
# are left unchanged.
1074 (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
1077 return join ' ', @terms;
1082 my @token = $self->_split_query($query_str);
1084 Given a string query this function splits it to tokens taking into account
1085 any field prefixes and quoted strings.
1089 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
1092 my ( $self, $query ) = @_;
1094 # '"donald duck" title:"the mouse" and peter' gets split into
1095 # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
1096 my @tokens = split $tokenize_split_re, $query;
1098 # Filter out empty values
1099 @tokens = grep( /\S/, @tokens );
1104 =head2 _search_fields
1105 my $weighted_fields = $self->_search_fields({
1107 weighted_fields => 1,
1111 Generate a list of searchable fields to be used for Elasticsearch queries
1112 applied to multiple fields.
1114 Returns an arrayref of field names for either OPAC or Staff client, with
1115 possible weights and subfield appended to each field name depending on the
1122 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1123 fields for OPAC or Staff client should be retrieved. If C<weighted_fields> is set
1124 fields weights will be applied on returned fields. C<subfield> can be used to
1125 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
1131 sub _search_fields {
1132 my ($self, $params) = @_;
1135 weighted_fields => 0,
1137 # This is a hack for authorities build_authorities_query
1138 # can hopefully be removed in the future
1141 my $cache = Koha::Caches->get_instance();
1142 my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
1143 my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
1144 if (!$search_fields) {
1145 # The reason we don't use Koha::SearchFields->search here is we don't
1146 # want or need resultset wrapped as Koha::SearchField object.
1147 # It does not make any sense in this context and would cause
1148 # unnecessary overhead since we are only querying for data
1149 # Also would not work, or produce strange results, with the "columns"
1151 my $schema = Koha::Database->schema;
1152 my $result = $schema->resultset('SearchField')->search(
1154 $params->{is_opac} ? (
1159 'type' => { '!=' => 'boolean' },
1160 'search_marc_map.index_name' => $self->index,
1161 'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
1162 'search_marc_to_fields.search' => 1,
1165 columns => [qw/name weight/],
1167 join => {search_marc_to_fields => 'search_marc_map'},
1171 while (my $search_field = $result->next) {
1172 push @search_fields, [
1173 lc $search_field->name,
1174 $search_field->weight ? $search_field->weight : ()
1177 $search_fields = \@search_fields;
1178 $cache->set_in_cache($cache_key, $search_fields);
1180 if ($params->{subfield}) {
1181 my $subfield = $params->{subfield};
1184 # Copy values to avoid mutating cached
1185 # data (since unsafe is used)
1186 my ($field, $weight) = @{$_};
1187 ["${field}.${subfield}", $weight];
1191 if ($params->{weighted_fields}) {
1192 return [map { join('^', @{$_}) } @{$search_fields}];
1195 # Exclude weight from field
1196 return [map { $_->[0] } @{$search_fields}];