1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
3 # This file is part of Koha.
5 # Copyright 2014 Catalyst IT Ltd.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
23 query objects from user-supplied queries
This provides the functions that take a user-supplied search query and
turn it into something that can be given to elasticsearch to get answers.
32 use Koha::SearchEngine::Elasticsearch::QueryBuilder;
    my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
34 my $simple_query = $builder->build_query("hello");
35 # This is currently undocumented because the original code is undocumented
36 my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
42 use base qw(Koha::SearchEngine::Elasticsearch);
45 use List::MoreUtils qw/ each_array /;
54 my $simple_query = $builder->build_query("hello", %options)
56 This will build a query that can be issued to elasticsearch from the provided
57 string input. This expects a lucene style search form (see
58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
61 It'll make an attempt to respect the various query options.
63 Additional options can be provided with the C<%options> hash.
69 This should be an arrayref of hashrefs, each containing a C<field> and an
70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
# Build the Elasticsearch request body for a lucene-style query string.
#
#   $query   - the query text; undef means "match everything" ('*').
#   %options - fields: arrayref of fields passed to query_string
#              (defaults to []); sort: arrayref of hashrefs holding a
#              'field' and an optional 'direction' ('asc' or 'desc',
#              defaulting to 'asc').
#
# Returns a hashref with 'query' and 'aggregations' keys, plus 'sort' when
# sort options were supplied. Dies if a sort direction is neither 'asc'
# nor 'desc'.
sub build_query {
    my ( $self, $query, %options ) = @_;

    # Only the fuzzy preference influences the generated query; the
    # stemming, auto-truncation and weighting preferences were read here
    # but never used, so they are no longer fetched.
    my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

    $query = '*' unless defined $query;

    my $res;
    $res->{query} = {
        query_string => {
            query            => $query,
            fuzziness        => $fuzzy_enabled ? 'auto' : '0',
            default_operator => 'AND',
            default_field    => '_all',
            lenient          => JSON::true,
            analyze_wildcard => JSON::true,
            fields           => $options{fields} || [],
        }
    };

    if ( $options{sort} ) {
        foreach my $sort ( @{ $options{sort} } ) {
            my ( $f, $d ) = @$sort{qw/ field direction /};
            die "Invalid sort direction, $d"
              if $d && ( $d ne 'asc' && $d ne 'desc' );
            $d = 'asc' unless $d;

            # Translate the logical field name into the sortable ES field.
            $f = $self->_sort_field($f);
            push @{ $res->{sort} }, { $f => { order => $d } };
        }
    }

    # See _convert_facets in Search.pm for how these get turned into
    # things that Koha can use.
    my $size   = C4::Context->preference('FacetMaxCount');
    my @facets = qw( author subject itype location su-geo title-series ccode ln );

    # The preference may be unset; treat that as "no library facets"
    # instead of comparing undef with eq.
    my $display_library_facets =
      C4::Context->preference('DisplayLibraryFacets') // '';
    push @facets, 'homebranch'
      if $display_library_facets eq 'both'
      || $display_library_facets eq 'home';
    push @facets, 'holdingbranch'
      if $display_library_facets eq 'both'
      || $display_library_facets eq 'holding';

    # Every facet, including the branch ones, is capped by FacetMaxCount
    # so the library facets behave like the rest.
    $res->{aggregations} = {
        map {
            ( $_ => { terms => { field => $_ . '__facet', size => $size } } )
        } @facets
    };

    return $res;
}
138 =head2 build_browse_query
140 my $browse_query = $builder->build_browse_query($field, $query);
142 This performs a "starts with" style query on a particular field. The field
143 to be searched must have been indexed with an appropriate mapping as a
144 "phrase" subfield, which pretty much everything has.
148 # XXX this isn't really a browse query like we want in the end
149 sub build_browse_query {
150 my ( $self, $field, $query ) = @_;
152 my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
154 return { query => '*' } if !defined $query;
156 # TODO this should come from Koha::SearchEngine::Elasticsearch
157 my %field_whitelist = (
161 $field = 'title' if !exists $field_whitelist{$field};
162 my $sort = $self->_sort_field($field);
165 match_phrase_prefix => {
169 fuzziness => $fuzzy_enabled ? 'auto' : '0',
173 sort => [ { $sort => { order => "asc" } } ],
177 =head2 build_query_compat
180 $error, $query, $simple_query, $query_cgi,
181 $query_desc, $limit, $limit_cgi, $limit_desc,
182 $stopwords_removed, $query_type
184 = $builder->build_query_compat( \@operators, \@operands, \@indexes,
185 \@limits, \@sort_by, $scan, $lang );
187 This handles a search using the same api as L<C4::Search::buildQuery> does.
189 A very simple query will go in with C<$operands> set to ['query'], and
190 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
191 C<$query> set to something that can perform the search, C<$simple_query>
192 set to just the search term, C<$query_cgi> set to something that can
193 reproduce this search, and C<$query_desc> set to something else.
197 sub build_query_compat {
198 my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
202 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
203 my @sort_params = $self->_convert_sort_fields(@$sort_by);
204 my @index_params = $self->_convert_index_fields(@$indexes);
205 my $limits = $self->_fix_limit_special_cases($orig_limits);
206 if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
208 # Merge the indexes in with the search terms and the operands so that
209 # each search thing is a handy unit.
210 unshift @$operators, undef; # The first one can't have an op
212 my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
213 my $ea = each_array( @$operands, @$operators, @index_params );
214 while ( my ( $oand, $otor, $index ) = $ea->() ) {
215 next if ( !defined($oand) || $oand eq '' );
216 $oand = $self->_clean_search_term($oand);
217 $oand = $self->_truncate_terms($oand) if ($truncate);
218 push @search_params, {
219 operand => $oand, # the search terms
220 operator => defined($otor) ? uc $otor : undef, # AND and so on
221 $index ? %$index : (),
225 # We build a string query from limits and the queries. An alternative
226 # would be to pass them separately into build_query and let it build
227 # them into a structured ES query itself. Maybe later, though that'd be
229 my $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
230 my $query_str = join( ' AND ',
231 $search_param_query_str || (),
232 $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
235 if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
236 push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
239 # If there's no query on the left, let's remove the junk left behind
240 $query_str =~ s/^ AND //;
242 $options{fields} = \@fields;
243 $options{sort} = \@sort_params;
244 my $query = $self->build_query( $query_str, %options );
246 # We roughly emulate the CGI parameters of the zebra query builder
248 shift @$operators; # Shift out the one we unshifted before
249 $ea = each_array( @$operands, @$operators, @$indexes );
250 while ( my ( $oand, $otor, $index ) = $ea->() ) {
251 $query_cgi .= '&' if $query_cgi;
252 $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
253 $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
255 $query_cgi .= '&scan=1' if ( $scan );
258 $simple_query = $operands->[0] if @$operands == 1;
260 if ( $simple_query ) {
261 $query_desc = $simple_query;
263 $query_desc = $search_param_query_str;
265 my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
266 my $limit_cgi = ( $orig_limits and @$orig_limits )
267 ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
270 $limit_desc = "$limit" if $limit;
273 undef, $query, $simple_query, $query_cgi, $query_desc,
274 $limit, $limit_cgi, $limit_desc, undef, undef
278 =head2 build_authorities_query
280 my $query = $builder->build_authorities_query(\%search);
282 This takes a nice description of an authority search and turns it into a black-box
283 query that can then be passed to the appropriate searcher.
285 The search description is a hashref that looks something like:
290 where => 'Heading', # search the main entry
291 operator => 'exact', # require an exact match
292 value => 'frogs', # the search string
295 where => '', # search all entries
296 operator => '', # default keyword, right truncation
304 authtypecode => 'TOPIC_TERM',
309 sub build_authorities_query {
310 my ( $self, $search ) = @_;
312 # Start by making the query parts
315 foreach my $s ( @{ $search->{searches} } ) {
316 my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
317 $wh = '_all' if $wh eq '';
318 if ( $op eq 'is' || $op eq '=' || $op eq 'exact' ) {
320 # look for something that matches a term completely
321 # note, '=' is about numerical vals. May need special handling.
322 # Also, we lowercase our search because the ES
323 # index lowercases its values, and term searches don't get the
324 # search analyzer applied to them.
325 push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
327 elsif ( $op eq 'start' ) {
328 # startswith search, uses lowercase untokenized version of heading
329 push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
332 # regular wordlist stuff
333 my @tokens = $self->_split_query( $val );
334 foreach my $token ( @tokens ) {
335 $token = $self->_truncate_terms(
336 $self->_clean_search_term( $token )
339 my $query = $self->_join_queries( @tokens );
340 push @query_parts, { query_string => {
341 default_field => $wh,
342 analyze_wildcard => JSON::true,
348 # Merge the query parts appropriately
349 # 'should' behaves like 'or'
350 # 'must' behaves like 'and'
351 # Zebra results seem to match must so using that here
352 my $query = { query =>
354 { must => \@query_parts }
357 if ( $search->{authtypecode} ) {
358 $query->{query}->{bool}->{filter} = { term => { 'authtype.raw' => $search->{authtypecode} } };
362 if ( exists $search->{sort} ) {
363 foreach my $k ( keys %{ $search->{sort} } ) {
364 my $f = $self->_sort_field($k);
365 $s{$f} = $search->{sort}{$k};
367 $search->{sort} = \%s;
371 $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
377 =head2 build_authorities_query_compat
380 $builder->build_authorities_query_compat( \@marclist, \@and_or,
381 \@excluding, \@operator, \@value, $authtypecode, $orderby );
383 This builds a query for searching for authorities, in the style of
384 L<C4::AuthoritiesMarc::SearchAuthorities>.
392 An arrayref containing where the particular term should be searched for.
393 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
394 thesaurus. If left blank, any field is used.
398 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
406 What form of search to do. Options are: is (phrase, no truncation, whole field
407 must match), = (number exact match), exact (phrase, no truncation, whole field
408 must match). If left blank, then word list, right truncated, anywhere is used.
412 The actual user-provided string value to search for.
416 The authority type code to search within. If blank, then all will be searched.
420 The order to sort the results by. Options are Relevance, HeadingAsc,
421 HeadingDsc, AuthidAsc, AuthidDsc.
425 marclist, operator, and value must be the same length, and the values at
426 index /i/ all relate to each other.
428 This returns a query, which is a black box object that can be passed to the
429 appropriate search object.
433 our $koha_to_index_name = {
434 mainmainentry => 'heading-main',
435 mainentry => 'heading',
437 'match-heading' => 'match-heading',
438 'see-from' => 'match-heading-see-from',
439 thesaurus => 'subject-heading-thesaurus',
444 sub build_authorities_query_compat {
445 my ( $self, $marclist, $and_or, $excluding, $operator, $value,
446 $authtypecode, $orderby )
449 # This turns the old-style many-options argument form into a more
450 # extensible hash form that is understood by L<build_authorities_query>.
453 # Convert to lower case
454 $marclist = [map(lc, @{$marclist})];
455 $orderby = lc $orderby;
457 # Make sure everything exists
458 foreach my $m (@$marclist) {
459 Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
460 unless exists $koha_to_index_name->{$m};
462 for ( my $i = 0 ; $i < @$value ; $i++ ) {
463 next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
466 where => $koha_to_index_name->{$marclist->[$i]},
467 operator => $operator->[$i],
468 value => $value->[$i],
474 ( $orderby =~ /^heading/ ) ? 'heading'
475 : ( $orderby =~ /^auth/ ) ? 'local-number'
478 my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
479 %sort = ( $sort_field => $sort_order, );
482 searches => \@searches,
483 authtypecode => $authtypecode,
485 $search{sort} = \%sort if %sort;
486 my $query = $self->build_authorities_query( \%search );
490 =head2 _convert_sort_fields
492 my @sort_params = _convert_sort_fields(@sort_by)
494 Converts the zebra-style sort index information into elasticsearch-style.
496 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
497 something that can be sent to L<build_query>.
# Convert zebra-style sort specifications ("pubdate_dsc", "title_az", ...)
# into the { field => ..., direction => ... } hashrefs that build_query
# accepts in its 'sort' option.
#
#   @sort_by - strings of the form "<field>_<order>".
#
# Returns the list of converted hashrefs. Entries that are undefined, that
# do not have the "<field>_<order>" shape (e.g. a bare "relevance"), or
# whose field has no Elasticsearch sort field are dropped.
sub _convert_sort_fields {
    my ( $self, @sort_by ) = @_;

    # Turn the sorting into something we care about.
    my %sort_field_convert = (
        acqdate     => 'date-of-acquisition',
        author      => 'author',
        call_number => 'local-classification',
        popularity  => 'issues',
        relevance   => undef,                    # default
        title       => 'title',
        pubdate     => 'date-of-publication',
    );
    my %sort_order_convert = (
        desc => 'desc',
        dsc  => 'desc',
        asc  => 'asc',
        az   => 'asc',
        za   => 'desc',
    );

    # Convert the fields and orders, drop anything we don't know about.
    my @converted;
    for my $spec (@sort_by) {
        next unless defined $spec;

        # Greedy match: everything before the last underscore is the
        # field, so "call_number_asc" splits as call_number / asc.
        my ( $f, $d ) = $spec =~ /(.+)_(.+)/;

        # No underscore means there is nothing to look up; skip it rather
        # than indexing the tables with undef.
        next unless defined $f;

        my $field = $sort_field_convert{$f};
        next unless $field;

        push @converted,
          { field => $field, direction => $sort_order_convert{$d} };
    }

    return @converted;
}
527 =head2 _convert_index_fields
529 my @index_params = $self->_convert_index_fields(@indexes);
531 Converts zebra-style search index notation into elasticsearch-style.
533 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
534 and it returns something that can be sent to L<build_query>.
536 B<TODO>: this will pull from the elasticsearch mappings table to figure out
541 our %index_field_convert = (
545 'lcn' => 'local-classification',
546 'callnum' => 'local-classification',
547 'record-type' => 'rtype',
548 'mc-rtype' => 'rtype',
550 'lc-card' => 'lc-card-number',
551 'sn' => 'local-number',
552 'biblionumber' => 'local-number',
553 'yr' => 'date-of-publication',
554 'pubdate' => 'date-of-publication',
555 'acqdate' => 'date-of-acquisition',
556 'date/time-last-modified' => 'date-time-last-modified',
557 'dtlm' => 'date-time-last-modified',
558 'diss' => 'dissertation-information',
561 'music-number' => 'identifier-publisher-for-music',
562 'number-music-publisher' => 'identifier-publisher-for-music',
563 'music' => 'identifier-publisher-for-music',
564 'ident' => 'identifier-standard',
565 'cpn' => 'corporate-name',
566 'cfn' => 'conference-name',
567 'pn' => 'personal-name',
572 'rcn' => 'record-control-number',
574 'su-to' => 'subject',
575 #'su-geo' => 'subject',
576 'su-ut' => 'subject',
578 'se' => 'title-series',
579 'ut' => 'title-uniform',
580 'an' => 'koha-auth-number',
581 'authority-number' => 'koha-auth-number',
584 'rank' => 'relevance',
585 'phr' => 'st-phrase',
586 'wrdl' => 'st-word-list',
587 'rt' => 'right-truncation',
588 'rtrn' => 'right-truncation',
589 'ltrn' => 'left-truncation',
590 'rltrn' => 'left-and-right',
591 'mc-itemtype' => 'itemtype',
592 'mc-ccode' => 'ccode',
593 'branch' => 'homebranch',
594 'mc-loc' => 'location',
595 'stocknumber' => 'number-local-acquisition',
596 'inv' => 'number-local-acquisition',
598 'mc-itype' => 'itype',
599 'aub' => 'author-personal-bibliography',
600 'auo' => 'author-in-order',
604 'frequency-code' => 'ff8-18',
605 'illustration-code' => 'ff8-18-21',
606 'regularity-code' => 'ff8-19',
607 'type-of-serial' => 'ff8-21',
608 'format' => 'ff8-23',
609 'conference-code' => 'ff8-29',
610 'festschrift-indicator' => 'ff8-30',
611 'index-indicator' => 'ff8-31',
614 'literature-code' => 'lf',
615 'biography' => 'bio',
617 'biography-code' => 'bio',
618 'l-format' => 'ff7-01-02',
619 'lex' => 'lexile-number',
620 'hi' => 'host-item-number',
621 'itu' => 'index-term-uncontrolled',
622 'itg' => 'index-term-genre',
624 my $field_name_pattern = '[\w\-]+';
625 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
627 sub _convert_index_fields {
628 my ( $self, @indexes ) = @_;
630 my %index_type_convert =
631 ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
633 # Convert according to our table, drop anything that doesn't convert.
634 # If a field starts with mc- we save it as it's used (and removed) later
635 # when joining things, to indicate we make it an 'OR' join.
636 # (Sorry, this got a bit ugly after special cases were found.)
637 grep { $_->{field} } map {
638 # Lower case all field names
639 my ( $f, $t ) = map(lc, split /,/);
646 field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
647 type => $index_type_convert{ $t // '__default' }
649 $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
654 =head2 _convert_index_strings
656 my @searches = $self->_convert_index_strings(@searches);
658 Similar to L<_convert_index_fields>, this takes strings of the form
659 B<field:search term> and rewrites the field from zebra-style to
660 elasticsearch-style. Anything it doesn't understand is returned verbatim.
# Rewrite "field:search term" strings from zebra-style field names to
# elasticsearch-style ones.
#
#   @searches - strings, usually of the form "field:term" or
#               "field,type:term".
#
# Returns the rewritten strings in their original order. A string with no
# "field:" prefix, or whose field _convert_index_fields does not convert,
# is returned verbatim. Undefined and empty entries are skipped.
sub _convert_index_strings {
    my ( $self, @searches ) = @_;

    my @res;
    foreach my $s (@searches) {

        # Nothing to convert, and matching against undef would warn.
        next unless defined $s && $s ne '';

        my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
        unless ( defined($field) && defined($term) ) {
            push @res, $s;
            next;
        }

        # _convert_index_fields returns a list; only the first hashref
        # matters because a single field was passed in.
        my ($conv) = $self->_convert_index_fields($field);
        unless ( defined($conv) ) {
            push @res, $s;
            next;
        }

        push @res, $conv->{field} . ":"
          . $self->_modify_string_by_type( %$conv, operand => $term );
    }

    return @res;
}
685 =head2 _convert_index_strings_freeform
687 my $search = $self->_convert_index_strings_freeform($search);
689 This is similar to L<_convert_index_strings>, however it'll search out the
690 things to change within the string. So it can handle strings such as
691 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
693 If there is something of the form "su,complete-subfield" or something, the
694 second part is stripped off as we can't yet handle that. Making it work
695 will have to wait for a real query parser.
699 sub _convert_index_strings_freeform {
700 my ( $self, $search ) = @_;
    # @TODO: Currently this will also alter fields contained within quotes:
    # `searching for "stuff cn:123"` for example will become
    # `searching for "stuff local-number:123"`
705 # Fixing this is tricky, one possibility:
706 # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
707 # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
709 # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
710 # them back when processing is done.
712 # Lower case field names
713 $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
714 # Resolve possible field aliases
715 $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
719 =head2 _modify_string_by_type
721 my $str = $self->_modify_string_by_type(%index_field);
723 If you have a search term (operand) and a type (phrase, right-truncated), this
724 will convert the string to have the function in lucene search terms, e.g.
725 wrapping quotes around it.
# Decorate a search term according to its index type, using lucene
# query-string syntax.
#
#   %idx - 'operand' is the search term, 'type' (optional) is one of:
#            right-truncate  append '*'
#            phrase          wrap in double quotes
#            st-year         turn "from-until" into "[from TO until]",
#                            with '*' standing in for a missing end
#
# Returns the decorated term. An undefined or empty operand is returned
# unchanged, as is any operand with an unknown or missing type.
sub _modify_string_by_type {
    my ( $self, %idx ) = @_;

    my $str = $idx{operand};

    # Empty or undef, we can't use it. Test the length rather than the
    # truth value so that a literal '0' is still treated as a term.
    return $str unless defined $str && length $str;

    my $type = $idx{type} || '';

    return $str . '*'       if $type eq 'right-truncate';
    return '"' . $str . '"' if $type eq 'phrase';

    # A year without a dash is not a range and is passed through as is.
    if ( $type eq 'st-year' && $str =~ /^(.*)-(.*)$/ ) {
        my $from  = $1 || '*';
        my $until = $2 || '*';
        return "[$from TO $until]";
    }

    return $str;
}
750 my $query_str = $self->_join_queries(@query_parts);
752 This takes a list of query parts, that might be search terms on their own, or
753 booleaned together, or specifying fields, or whatever, wraps them in
754 parentheses, and ANDs them all together. Suitable for feeding to the ES
757 Note: doesn't AND them together if they specify an index that starts with "mc"
758 as that was a special case in the original code for dealing with multiple
choice options (you can't search for something that has an itype of A
and an itype of B otherwise.)
# Combine query fragments into a single lucene query string.
#
#   @parts - query fragments; undefined and empty ones are ignored.
#            Fragments starting with "mc-" are "multiple choice" limits:
#            the prefix is stripped and they are ORed with each other.
#
# Returns an empty list when nothing usable was passed, the lone fragment
# as is (no parentheses) when there is exactly one, and otherwise the
# parenthesised fragments ANDed together, with the multiple-choice group
# appended as one parenthesised OR clause.
sub _join_queries {
    my ( $self, @parts ) = @_;

    # Sort the usable fragments into ordinary and multiple-choice ones.
    my ( @plain, @multi );
    for my $part (@parts) {
        next unless defined($part) && $part ne '';
        if ( $part =~ /^mc-/ ) {
            push @multi, ( $part =~ s/^mc-//r );
        }
        else {
            push @plain, $part;
        }
    }

    my @usable = ( @plain, @multi );
    return unless @usable;
    return $usable[0] if @usable == 1;

    # With at least two fragments every clause below is non-empty, so the
    # clauses can simply be collected and joined.
    my @clauses;
    push @clauses, join( ' AND ', map { "($_)" } @plain ) if @plain;
    push @clauses, '(' . join( ' OR ', map { "($_)" } @multi ) . ')'
      if @multi;

    return join( ' AND ', @clauses );
}
786 my @phrased_queries = $self->_make_phrases(@query_parts);
788 This takes the supplied queries and forces them to be phrases by wrapping
789 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
790 the quotes outside of them if they're there.
# Turn field-prefixed queries into phrase queries.
#
#   @parts - query strings such as 'subject:some words'.
#
# Returns one string per input. When a string starts with a "field:"
# prefix (optionally preceded by whitespace, which is dropped), the text
# after the prefix is wrapped in double quotes; any other string comes
# back unchanged.
sub _make_phrases {
    my ( $self, @parts ) = @_;

    my @phrased;
    for my $part (@parts) {

        # /r leaves the caller's string untouched and yields the copy.
        push @phrased, ( $part =~ s/^\s*(\w*?:)(.*)$/$1"$2"/r );
    }

    return @phrased;
}
799 =head2 _create_query_string
801 my @query_strings = $self->_create_query_string(@queries);
803 Given a list of hashrefs, it will turn them into a lucene-style query string.
804 The hash should contain field, type (both for the indexes), operator, and
# Render search descriptions as lucene-style query fragments.
#
#   @queries - hashrefs with 'operand' plus optional 'field', 'type' and
#              'operator' keys.
#
# Returns one string per hashref, shaped "OPERATOR (field:term)". The
# operator and field parts are left out when not set, and a multi-word
# term on a named field gets its own parentheses unless the type is
# 'st-year'.
sub _create_query_string {
    my ( $self, @queries ) = @_;

    my @fragments;
    for my $q (@queries) {
        my $prefix = '';
        $prefix = $q->{operator} . ' ' if $q->{operator};

        my $field = '';
        $field = $q->{field} . ':' if $q->{field};

        my $term = $self->_modify_string_by_type(%$q);

        # Year ranges already carry their own brackets.
        my $is_year = defined $q->{type} && $q->{type} eq 'st-year';

        # Group multi-word terms so the field applies to every word.
        if ( $field && scalar( split( /\s+/, $term ) ) > 1 && !$is_year ) {
            $term = "($term)";
        }

        push @fragments, "$prefix($field$term)";
    }

    return @fragments;
}
822 =head2 _clean_search_term
824 my $term = $self->_clean_search_term($term);
826 This cleans a search term by removing any funny characters that may upset
827 ES and give us an error. It also calls L<_convert_index_strings_freeform>
828 to ensure those parts are correct.
832 sub _clean_search_term {
833 my ( $self, $term ) = @_;
835 # Lookahead for checking if we are inside quotes
836 my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
838 # Some hardcoded searches (like with authorities) produce things like
839 # 'an=123', when it ought to be 'an:123' for our purposes.
842 $term = $self->_convert_index_strings_freeform($term);
845 # Remove unbalanced quotes
846 my $unquoted = $term;
847 my $count = ($unquoted =~ tr/"/ /);
848 if ($count % 2 == 1) {
852 # Remove unquoted colons that have whitespace on either side of them
853 $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
858 =head2 _fix_limit_special_cases
860 my $limits = $self->_fix_limit_special_cases($limits);
862 This converts any special cases that the limit specifications have into things
863 that are more readily processable by the rest of the code.
865 The argument should be an arrayref, and it'll return an arrayref.
# Rewrite the special-case limit strings into ordinary "field:value"
# limits that the rest of the query builder understands.
#
#   $limits - arrayref of limit strings.
#
# Returns a new arrayref; the input is not modified. Conversions:
#   "yr,st-numeric,ge=A and yr,st-numeric,le=B"  ->  "copydate:[A TO B]"
#   "yr,st-numeric=Y"                            ->  "copydate:" followed
#                                by Y run through _modify_string_by_type
#   "available"                                  ->  "onloan:0"
# A year limit that does not fully match its expected form is dropped;
# every other limit is passed through untouched.
sub _fix_limit_special_cases {
    my ( $self, $limits ) = @_;

    my @converted;
  LIMIT:
    for my $limit (@$limits) {

        # Year range; this form is set up by opac-search.pl
        if ( $limit =~ /^yr,st-numeric,ge=/ ) {
            if ( $limit =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ ) {
                push @converted, "copydate:[$1 TO $2]";
            }
            next LIMIT;
        }

        # Single year, or a dash-separated range handled by the helper.
        if ( $limit =~ /^yr,st-numeric=/ ) {
            my ($year) = $limit =~ /^yr,st-numeric=(.*)$/;
            if ( defined $year ) {
                my $date = $self->_modify_string_by_type(
                    type    => 'st-year',
                    operand => $year
                );
                push @converted, "copydate:$date";
            }
            next LIMIT;
        }

        push @converted, ( $limit =~ /^available$/ ) ? 'onloan:0' : $limit;
    }

    return \@converted;
}
900 my $field = $self->_sort_field($field);
902 Given a field name, this works out what the actual name of the field to sort
903 on should be. A '__sort' suffix is added for fields with a sort version, and
904 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
905 to avoid sorting on a tokenized value.
# Work out the name of the Elasticsearch field to sort on.
#
#   $f - the logical field name.
#
# Returns $f with '__sort' appended when sort_fields() has no entry for
# the field or a true one. When sort_fields() holds a defined but false
# entry, '.raw' is appended if the mappings give the field the type
# 'text' (so the sort is not done on the tokenised form); otherwise the
# name is returned unchanged.
sub _sort_field {
    my ( $self, $f ) = @_;

    my $mappings = $self->get_elasticsearch_mappings();

    # Walk the mappings one level at a time: dereferencing the whole chain
    # in one expression would autovivify an entry for every unknown field.
    my $field_type;
    if (   $mappings
        && $mappings->{data}
        && $mappings->{data}{properties}
        && $mappings->{data}{properties}{$f} )
    {
        $field_type = $mappings->{data}{properties}{$f}{type};
    }

    my $sort_fields = $self->sort_fields();
    if ( !defined $sort_fields->{$f} || $sort_fields->{$f} ) {
        $f .= '__sort';
    }
    else {
        # We need to add '.raw' to text fields without a sort field,
        # otherwise it'll sort based on the tokenised form.
        $f .= '.raw' if defined $field_type && $field_type eq 'text';
    }

    return $f;
}
924 =head2 _truncate_terms
926 my $query = $self->_truncate_terms($query);
Given a string query this function appends the '*' wildcard to all terms
except boolean operators (and, or, not) and double quoted strings.
# Add right truncation to the words of a query.
#
#   $query - the query string; it is tokenised with _split_query.
#
# Returns the tokens joined with single spaces. A '*' is appended to each
# token unless it ends in a non-word character (a closing quote, an
# existing '*', ...) or is one of the keywords and/or/not in any case.
# Whitespace-only tokens are discarded.
sub _truncate_terms {
    my ( $self, $query ) = @_;

    my @terms;
    for my $token ( $self->_split_query($query) ) {

        # Filter out empty tokens
        next if $token =~ /^\s*$/;

        my $is_keyword = grep { lc($token) eq $_ } qw/and or not/;

        # Append '*' only to plain words that are not keywords.
        push @terms, ( $token =~ /\W$/ || $is_keyword ) ? $token : "$token*";
    }

    return join ' ', @terms;
}
952 my @token = $self->_split_query($query_str);
954 Given a string query this function splits it to tokens taking into account
955 any field prefixes and quoted strings.
959 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
962 my ( $self, $query ) = @_;
    # '"donald duck" title:"the mouse" and peter' gets split into
    # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
966 my @tokens = split $tokenize_split_re, $query;
968 # Filter out empty values
969 @tokens = grep( /\S/, @tokens );