Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
  33     $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use JSON;
  44 use List::MoreUtils qw( each_array );
  45 use Modern::Perl;
  46 use URI::Escape qw( uri_escape_utf8 );
  47
  48 use C4::Context;
  49 use Koha::Exceptions;
  50 use Koha::Caches;
  51
  52 our %index_field_convert = (
  53     'kw' => '',
  54     'ab' => 'abstract',
  55     'au' => 'author',
  56     'lcn' => 'local-classification',
  57     'callnum' => 'local-classification',
  58     'record-type' => 'rtype',
  59     'mc-rtype' => 'rtype',
  60     'mus' => 'rtype',
  61     'lc-card' => 'lc-card-number',
  62     'sn' => 'local-number',
  63     'biblionumber' => 'local-number',
  64     'yr' => 'date-of-publication',
  65     'pubdate' => 'date-of-publication',
  66     'acqdate' => 'date-of-acquisition',
  67     'date/time-last-modified' => 'date-time-last-modified',
  68     'dtlm' => 'date-time-last-modified',
  69     'diss' => 'dissertation-information',
  70     'nb' => 'isbn',
  71     'ns' => 'issn',
  72     'music-number' => 'identifier-publisher-for-music',
  73     'number-music-publisher' => 'identifier-publisher-for-music',
  74     'music' => 'identifier-publisher-for-music',
  75     'ident' => 'identifier-standard',
  76     'cpn' => 'corporate-name',
  77     'cfn' => 'conference-name',
  78     'pn' => 'personal-name',
  79     'pb' => 'publisher',
  80     'pv' => 'provider',
  81     'nt' => 'note',
  82     'notes' => 'note',
  83     'rcn' => 'record-control-number',
  84     'su' => 'subject',
  85     'su-to' => 'subject',
  86     #'su-geo' => 'subject',
  87     'su-ut' => 'subject',
  88     'ti' => 'title',
  89     'se' => 'title-series',
  90     'ut' => 'title-uniform',
  91     'an' => 'koha-auth-number',
  92     'authority-number' => 'koha-auth-number',
  93     'at' => 'authtype',
  94     'he' => 'heading',
  95     'rank' => 'relevance',
  96     'phr' => 'st-phrase',
  97     'wrdl' => 'st-word-list',
  98     'rt' => 'right-truncation',
  99     'rtrn' => 'right-truncation',
 100     'ltrn' => 'left-truncation',
 101     'rltrn' => 'left-and-right',
 102     'mc-itemtype' => 'itemtype',
 103     'mc-ccode' => 'ccode',
 104     'branch' => 'homebranch',
 105     'mc-loc' => 'location',
 106     'loc' => 'location',
 107     'stocknumber' => 'number-local-acquisition',
 108     'inv' => 'number-local-acquisition',
 109     'bc' => 'barcode',
 110     'mc-itype' => 'itype',
 111     'aub' => 'author-personal-bibliography',
 112     'auo' => 'author-in-order',
 113     'ff8-22' => 'ta',
 114     'aud' => 'ta',
 115     'audience' => 'ta',
 116     'frequency-code' => 'ff8-18',
 117     'illustration-code' => 'ff8-18-21',
 118     'regularity-code' => 'ff8-19',
 119     'type-of-serial' => 'ff8-21',
 120     'format' => 'ff8-23',
 121     'conference-code' => 'ff8-29',
 122     'festschrift-indicator' => 'ff8-30',
 123     'index-indicator' => 'ff8-31',
 124     'fiction' => 'lf',
 125     'fic' => 'lf',
 126     'literature-code' => 'lf',
 127     'biography' => 'bio',
 128     'ff8-34' => 'bio',
 129     'biography-code' => 'bio',
 130     'l-format' => 'ff7-01-02',
 131     'lex' => 'lexile-number',
 132     'hi' => 'host-item-number',
 133     'itu' => 'index-term-uncontrolled',
 134     'itg' => 'index-term-genre',
 135 );
 136 my $field_name_pattern = '[\w\-]+';
 137 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 138
 139 =head2 get_index_field_convert
 140
 141     my $index_field_convert = Koha::SearchEngine::Elasticsearch::QueryBuilder->get_index_field_convert();
 142
 143 Returns the table used to convert zebra-style search index notation into
 144 elasticsearch-style.
 145
 146 The return value is a hashref mapping each zebra-style index name (or alias) to the elasticsearch field name that replaces it.
 147
 148 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 149 types.
 150
 151 =cut
 152
 153 sub get_index_field_convert() {
 154     return \%index_field_convert;
 155 }
 156
 157 =head2 build_query
 158
 159     my $simple_query = $builder->build_query("hello", %options)
 160
 161 This will build a query that can be issued to elasticsearch from the provided
 162 string input. This expects a lucene style search form (see
 163 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
 164 for details.)
 165
 166 It'll make an attempt to respect the various query options.
 167
 168 Additional options can be provided with the C<%options> hash.
 169
 170 =over 4
 171
 172 =item sort
 173
 174 This should be an arrayref of hashrefs, each containing a C<field> and a
 175 C<direction> (optional, defaults to C<asc>.) The results will be sorted
 176 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
 177
 178 =back
 179
 180 =cut
 181
 182 sub build_query {
 183     my ( $self, $query, %options ) = @_;
 184
 185     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
 186     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
 187     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
 188
 189     $query = '*' unless defined $query;
 190
 191     my $res;
 192     my $fields = $self->_search_fields({
 193         is_opac => $options{is_opac},
 194         weighted_fields => $options{weighted_fields},
 195     });
 196     if ($options{whole_record}) {
 197         push @$fields, 'marc_data_array.*';
 198     }
 199     $res->{query} = {
 200         query_string => {
 201             query            => $query,
 202             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
 203             default_operator => 'AND',
 204             fields           => $fields,
 205             lenient          => JSON::true,
 206             analyze_wildcard => JSON::true,
 207         }
 208     };
 209     $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
 210
 211     if ( $options{sort} ) {
 212         foreach my $sort ( @{ $options{sort} } ) {
 213             my ( $f, $d ) = @$sort{qw/ field direction /};
 214             die "Invalid sort direction, $d"
 215               if $d && ( $d ne 'asc' && $d ne 'desc' );
 216             $d = 'asc' unless $d;
 217
 218             $f = $self->_sort_field($f);
 219             push @{ $res->{sort} }, { $f => { order => $d } };
 220         }
 221     }
 222
 223     # See _convert_facets in Search.pm for how these get turned into
 224     # things that Koha can use.
 225     my $size = C4::Context->preference('FacetMaxCount');
 226     $res->{aggregations} = {
 227         author         => { terms => { field => "author__facet" , size => $size } },
 228         subject        => { terms => { field => "subject__facet", size => $size } },
 229         itype          => { terms => { field => "itype__facet", size => $size} },
 230         location       => { terms => { field => "location__facet", size => $size } },
 231         'su-geo'       => { terms => { field => "su-geo__facet", size => $size} },
 232         'title-series' => { terms => { field => "title-series__facet", size => $size } },
 233         ccode          => { terms => { field => "ccode__facet", size => $size } },
 234         ln             => { terms => { field => "ln__facet", size => $size } },
 235     };
 236
 237     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 238     if (   $display_library_facets eq 'both'
 239         or $display_library_facets eq 'home' ) {
 240         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
 241     }
 242     if (   $display_library_facets eq 'both'
 243         or $display_library_facets eq 'holding' ) {
 244         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
 245     }
 246     return $res;
 247 }
 248
 249 =head2 build_query_compat
 250
 251     my (
 252         $error,             $query, $simple_query, $query_cgi,
 253         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 254         $stopwords_removed, $query_type
 255       )
 256       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 257         \@limits, \@sort_by, $scan, $lang, $params );
 258
 259 This handles a search using the same api as L<C4::Search::buildQuery> does.
 260
 261 A very simple query will go in with C<$operands> set to ['query'], and
 262 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 263 C<$query> set to something that can perform the search, C<$simple_query>
 264 set to just the search term, C<$query_cgi> set to something that can
 265 reproduce this search, and C<$query_desc> set to something else.
 266
 267 =cut
 268
 269 sub build_query_compat {
 270     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 271         $lang, $params )
 272       = @_;
 273
 274     my $query;
 275     my $query_str = '';
 276     my $search_param_query_str = '';
 277     my $limits = ();
 278     if ( $scan ) {
 279         ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
 280         $search_param_query_str = $query_str;
 281     } else {
 282         my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 283         my @index_params = $self->_convert_index_fields(@$indexes);
 284         $limits       = $self->_fix_limit_special_cases($orig_limits);
 285         if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
 286         # Merge the indexes in with the search terms and the operands so that
 287         # each search thing is a handy unit.
 288         unshift @$operators, undef;    # The first one can't have an op
 289         my @search_params;
 290         my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 291         my $ea = each_array( @$operands, @$operators, @index_params );
 292         while ( my ( $oand, $otor, $index ) = $ea->() ) {
 293             next if ( !defined($oand) || $oand eq '' );
 294             $oand = $self->_clean_search_term($oand);
 295             $oand = $self->_truncate_terms($oand) if ($truncate);
 296             push @search_params, {
 297                 operand => $oand,      # the search terms
 298                 operator => defined($otor) ? uc $otor : undef,    # AND and so on
 299                 $index ? %$index : (),
 300             };
 301         }
 302
 303         # We build a string query from limits and the queries. An alternative
 304         # would be to pass them separately into build_query and let it build
 305         # them into a structured ES query itself. Maybe later, though that'd be
 306         # more robust.
 307         $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
 308         $query_str = join( ' AND ',
 309             $search_param_query_str || (),
 310             $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 311
 312         # If there's no query on the left, let's remove the junk left behind
 313         $query_str =~ s/^ AND //;
 314         my %options;
 315         $options{sort} = \@sort_params;
 316         $options{is_opac} = $params->{is_opac};
 317         $options{weighted_fields} = $params->{weighted_fields};
 318         $options{whole_record} = $params->{whole_record};
 319         $query = $self->build_query( $query_str, %options );
 320     }
 321
 322     # We roughly emulate the CGI parameters of the zebra query builder
 323     my $query_cgi = '';
 324     shift @$operators; # Shift out the one we unshifted before
 325     my $ea = each_array( @$operands, @$operators, @$indexes );
 326     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 327         $query_cgi .= '&' if $query_cgi;
 328         $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
 329         $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
 330     }
 331     $query_cgi .= '&scan=1' if ( $scan );
 332
 333     my $simple_query;
 334     $simple_query = $operands->[0] if @$operands == 1;
 335     my $query_desc;
 336     if ( $simple_query ) {
 337         $query_desc = $simple_query;
 338     } else {
 339         $query_desc = $search_param_query_str;
 340     }
 341     my $limit     = $self->_join_queries( $self->_convert_index_strings(@$limits));
 342     my $limit_cgi = ( $orig_limits and @$orig_limits )
 343       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 344       : '';
 345     my $limit_desc;
 346     $limit_desc = "$limit" if $limit;
 347
 348     return (
 349         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 350         $limit, $limit_cgi, $limit_desc,   undef,      undef
 351     );
 352 }
 353
 354 =head2 build_authorities_query
 355
 356     my $query = $builder->build_authorities_query(\%search);
 357
 358 This takes a nice description of an authority search and turns it into a black-box
 359 query that can then be passed to the appropriate searcher.
 360
 361 The search description is a hashref that looks something like:
 362
 363     {
 364         searches => [
 365             {
 366                 where    => 'Heading',    # search the main entry
 367                 operator => 'exact',        # require an exact match
 368                 value    => 'frogs',        # the search string
 369             },
 370             {
 371                 where    => '',             # search all entries
 372                 operator => '',             # default keyword, right truncation
 373                 value    => 'pond',
 374             },
 375         ],
 376         sort => {
 377             field => 'Heading',
 378             order => 'desc',
 379         },
 380         authtypecode => 'TOPIC_TERM',
 381     }
 382
 383 =cut
 384
 385 sub build_authorities_query {
 386     my ( $self, $search ) = @_;
 387
 388     # Start by making the query parts
 389     my @query_parts;
 390
 391     foreach my $s ( @{ $search->{searches} } ) {
 392         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 393         if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
 394             if ($wh) {
 395                 # Match the whole field, case insensitive, UTF normalized.
 396                 push @query_parts, { term => { "$wh.ci_raw" => $val } };
 397             }
 398             else {
 399                 # Match the whole field for all searchable fields, case insensitive,
 400                 # UTF normalized.
 401                 # Given that field data is "The quick brown fox"
 402                 # "The quick brown fox" and "the quick brown fox" will match
 403                 # but not "quick brown fox".
 404                 push @query_parts, {
 405                     multi_match => {
 406                         query => $val,
 407                         fields => $self->_search_fields({ subfield => 'ci_raw' }),
 408                     }
 409                 };
 410             }
 411         }
 412         elsif ( defined $op && $op eq 'start') {
 413             # Match the prefix within a field for all searchable fields.
 414             # Given that field data is "The quick brown fox"
 415             # "The quick bro" will match, but not "quick bro"
 416
 417             # Does not seems to be a multi prefix query
 418             # so we need to create one
 419             if ($wh) {
 420                 # Match prefix of the field.
 421                 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
 422             }
 423             else {
 424                 my @prefix_queries;
 425                 foreach my $field (@{$self->_search_fields()}) {
 426                     push @prefix_queries, {
 427                         prefix => { "$field.ci_raw" => $val }
 428                     };
 429                 }
 430                 push @query_parts, {
 431                     'bool' => {
 432                         'should' => \@prefix_queries,
 433                         'minimum_should_match' => 1
 434                     }
 435                 };
 436             }
 437         }
 438         else {
 439             # Query all searchable fields.
 440             # Given that field data is "The quick brown fox"
 441             # a search containing any of the words will match, regardless
 442             # of order.
 443
 444             my @tokens = $self->_split_query( $val );
 445             foreach my $token ( @tokens ) {
 446                 $token = $self->_truncate_terms(
 447                     $self->_clean_search_term( $token )
 448                 );
 449             }
 450             my $query = $self->_join_queries( @tokens );
 451             my $query_string = {
 452                 query            => $query,
 453                 lenient          => JSON::true,
 454                 analyze_wildcard => JSON::true,
 455             };
 456             if ($wh) {
 457                 $query_string->{default_field} = $wh;
 458             }
 459             else {
 460                 $query_string->{fields} = $self->_search_fields();
 461             }
 462             push @query_parts, { query_string => $query_string };
 463         }
 464     }
 465
 466     # Merge the query parts appropriately
 467     # 'should' behaves like 'or'
 468     # 'must' behaves like 'and'
 469     # Zebra behaviour seem to match must so using that here
 470     my $elastic_query = {};
 471     $elastic_query->{bool}->{must} = \@query_parts;
 472
 473     # Filter by authtypecode if set
 474     if ($search->{authtypecode}) {
 475         $elastic_query->{bool}->{filter} = {
 476             term => {
 477                 "authtype.raw" => $search->{authtypecode}
 478             }
 479         };
 480     }
 481
 482     my $query = {
 483         query => $elastic_query
 484     };
 485
 486     # Add the sort stuff
 487     $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
 488
 489     return $query;
 490 }
 491
 492 =head2 build_authorities_query_compat
 493
 494     my ($query) =
 495       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 496         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 497
 498 This builds a query for searching for authorities, in the style of
 499 L<C4::AuthoritiesMarc::SearchAuthorities>.
 500
 501 Arguments:
 502
 503 =over 4
 504
 505 =item marclist
 506
 507 An arrayref containing where the particular term should be searched for.
 508 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 509 thesaurus. If left blank, any field is used.
 510
 511 =item and_or
 512
 513 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 514
 515 =item excluding
 516
 517 Also ignored.
 518
 519 =item operator
 520
 521 What form of search to do. Options are: is (phrase, no truncation, whole field
 522 must match), = (number exact match), exact (phrase, no truncation, whole field
 523 must match). If left blank, then word list, right truncated, anywhere is used.
 524
 525 =item value
 526
 527 The actual user-provided string value to search for.
 528
 529 =item authtypecode
 530
 531 The authority type code to search within. If blank, then all will be searched.
 532
 533 =item orderby
 534
 535 The order to sort the results by. Options are Relevance, HeadingAsc,
 536 HeadingDsc, AuthidAsc, AuthidDsc.
 537
 538 =back
 539
 540 marclist, operator, and value must be the same length, and the values at
 541 index /i/ all relate to each other.
 542
 543 This returns a query, which is a black box object that can be passed to the
 544 appropriate search object.
 545
 546 =cut
 547
 548 our $koha_to_index_name = {
 549     mainmainentry   => 'heading-main',
 550     mainentry       => 'heading',
 551     match           => 'match',
 552     'match-heading' => 'match-heading',
 553     'see-from'      => 'match-heading-see-from',
 554     thesaurus       => 'subject-heading-thesaurus',
 555     any             => '',
 556     all             => ''
 557 };
 558
 559 sub build_authorities_query_compat {
 560     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 561         $authtypecode, $orderby )
 562       = @_;
 563
 564     # This turns the old-style many-options argument form into a more
 565     # extensible hash form that is understood by L<build_authorities_query>.
 566     my @searches;
 567     my $mappings = $self->get_elasticsearch_mappings();
 568
 569     # Convert to lower case
 570     $marclist = [map(lc, @{$marclist})];
 571     $orderby  = lc $orderby;
 572
 573     my @indexes;
 574     # Make sure everything exists
 575     foreach my $m (@$marclist) {
 576
 577         $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
 578         push @indexes, $m;
 579         warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
 580     }
 581     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 582         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 583         push @searches,
 584           {
 585             where    => $indexes[$i],
 586             operator => $operator->[$i],
 587             value    => $value->[$i],
 588           };
 589     }
 590
 591     my %sort;
 592     my $sort_field =
 593         ( $orderby =~ /^heading/ ) ? 'heading__sort'
 594       : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
 595       :                              undef;
 596     if ($sort_field) {
 597         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 598         %sort = ( $sort_field => $sort_order, );
 599     }
 600     my %search = (
 601         searches     => \@searches,
 602         authtypecode => $authtypecode,
 603     );
 604     $search{sort} = \%sort if %sort;
 605     my $query = $self->build_authorities_query( \%search );
 606     return $query;
 607 }
 608
 609 =head2 _build_scan_query
 610
 611     my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
 612
 613 This will build an aggregation scan query that can be issued to elasticsearch from
 614 the provided string input.
 615
 616 =cut
 617
 618 our %scan_field_convert = (
 619     'ti' => 'title',
 620     'au' => 'author',
 621     'su' => 'subject',
 622     'se' => 'title-series',
 623     'pb' => 'publisher',
 624 );
 625
 626 sub _build_scan_query {
 627     my ( $self, $operands, $indexes ) = @_;
 628
 629     my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
 630     my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
 631
 632     my ( $f, $d ) = split( /,/, $index);
 633     $index = $scan_field_convert{$f} || $f;
 634
 635     my $res;
 636     $res->{query} = {
 637         query_string => {
 638             query => '*'
 639         }
 640     };
 641     $res->{aggregations} = {
 642         $index => {
 643             terms => {
 644                 field => $index . '__facet',
 645                 order => { '_term' => 'asc' },
 646                 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
 647             }
 648         }
 649     };
 650     return ($res, $term);
 651 }
 652
 653 =head2 _create_regex_filter
 654
 655     my $filter = $builder->_create_regex_filter('term')
 656
 657 This will create a regex filter that can be used with an aggregation query.
 658
 659 =cut
 660
 661 sub _create_regex_filter {
 662     my ($self, $term) = @_;
 663
 664     my $result = '';
 665     foreach my $c (split(//, quotemeta($term))) {
 666         my $lc = lc($c);
 667         my $uc = uc($c);
 668         $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
 669     }
 670     return $result;
 671 }
 672
 673 =head2 _convert_sort_fields
 674
 675     my @sort_params = _convert_sort_fields(@sort_by)
 676
 677 Converts the zebra-style sort index information into elasticsearch-style.
 678
 679 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 680 something that can be sent to L<build_query>.
 681
 682 =cut
 683
 684 sub _convert_sort_fields {
 685     my ( $self, @sort_by ) = @_;
 686
 687     # Turn the sorting into something we care about.
 688     my %sort_field_convert = (
 689         acqdate     => 'date-of-acquisition',
 690         author      => 'author',
 691         call_number => 'cn-sort',
 692         popularity  => 'issues',
 693         relevance   => undef,       # default
 694         title       => 'title',
 695         pubdate     => 'date-of-publication',
 696     );
 697     my %sort_order_convert =
 698       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 699
 700     # Convert the fields and orders, drop anything we don't know about.
 701     grep { $_->{field} } map {
 702         my ( $f, $d ) = /(.+)_(.+)/;
 703         {
 704             field     => $sort_field_convert{$f},
 705             direction => $sort_order_convert{$d}
 706         }
 707     } @sort_by;
 708 }
 709
 710 sub _convert_index_fields {
 711     my ( $self, @indexes ) = @_;
 712
 713     my %index_type_convert =
 714       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
 715
 716     # Convert according to our table, drop anything that doesn't convert.
 717     # If a field starts with mc- we save it as it's used (and removed) later
 718     # when joining things, to indicate we make it an 'OR' join.
 719     # (Sorry, this got a bit ugly after special cases were found.)
 720     map {
 721         # Lower case all field names
 722         my ( $f, $t ) = map(lc, split /,/);
 723         my $mc = '';
 724         if ($f =~ /^mc-/) {
 725             $mc = 'mc-';
 726             $f =~ s/^mc-//;
 727         }
 728         my $r = {
 729             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 730             type  => $index_type_convert{ $t // '__default' }
 731         };
 732         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 733         $r->{field} || $r->{type} ? $r : undef;
 734     } @indexes;
 735 }
 736
 737 =head2 _convert_index_strings
 738
 739     my @searches = $self->_convert_index_strings(@searches);
 740
 741 Similar to L<_convert_index_fields>, this takes strings of the form
 742 B<field:search term> and rewrites the field from zebra-style to
 743 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 744
 745 =cut
 746
 747 sub _convert_index_strings {
 748     my ( $self, @searches ) = @_;
 749     my @res;
 750     foreach my $s (@searches) {
 751         next if $s eq '';
 752         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 753         unless ( defined($field) && defined($term) ) {
 754             push @res, $s;
 755             next;
 756         }
 757         my ($conv) = $self->_convert_index_fields($field);
 758         unless ( defined($conv) ) {
 759             push @res, $s;
 760             next;
 761         }
 762         push @res, ($conv->{field} ? $conv->{field} . ':' : '')
 763             . $self->_modify_string_by_type( %$conv, operand => $term );
 764     }
 765     return @res;
 766 }
 767
 768 =head2 _convert_index_strings_freeform
 769
 770     my $search = $self->_convert_index_strings_freeform($search);
 771
 772 This is similar to L<_convert_index_strings>, however it'll search out the
 773 things to change within the string. So it can handle strings such as
 774 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 775
 776 If there is something of the form "su,complete-subfield" or something, the
 777 second part is stripped off as we can't yet handle that. Making it work
 778 will have to wait for a real query parser.
 779
 780 =cut
 781
 782 sub _convert_index_strings_freeform {
 783     my ( $self, $search ) = @_;
 784     # @TODO: Currenty will alter also fields contained within quotes:
 785     # `searching for "stuff cn:123"` for example will become
 786     # `searching for "stuff local-number:123"
 787     #
 788     # Fixing this is tricky, one possibility:
 789     # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
 790     # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
 791     #
 792     # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
 793     # them back when processing is done.
 794
 795     # Lower case field names
 796     $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
 797     # Resolve possible field aliases
 798     $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
 799     return $search;
 800 }
 801
 802 =head2 _modify_string_by_type
 803
 804     my $str = $self->_modify_string_by_type(%index_field);
 805
 806 If you have a search term (operand) and a type (phrase, right-truncated), this
 807 will convert the string to have the function in lucene search terms, e.g.
 808 wrapping quotes around it.
 809
 810 =cut
 811
 812 sub _modify_string_by_type {
 813     my ( $self, %idx ) = @_;
 814
 815     my $type = $idx{type} || '';
 816     my $str = $idx{operand};
 817     return $str unless $str;    # Empty or undef, we can't use it.
 818
 819     $str .= '*' if $type eq 'right-truncate';
 820     $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
 821     if ($type eq 'st-year') {
 822         if ($str =~ /^(.*)-(.*)$/) {
 823             my $from = $1 || '*';
 824             my $until = $2 || '*';
 825             $str = "[$from TO $until]";
 826         }
 827     }
 828     return $str;
 829 }
 830
 831 =head2 _join_queries
 832
 833     my $query_str = $self->_join_queries(@query_parts);
 834
 835 This takes a list of query parts, that might be search terms on their own, or
 836 booleaned together, or specifying fields, or whatever, wraps them in
 837 parentheses, and ANDs them all together. Suitable for feeding to the ES
 838 query string query.
 839
 840 Note: doesn't AND them together if they specify an index that starts with "mc"
 841 as that was a special case in the original code for dealing with multiple
 842 choice options (you can't search for something that has an itype of A and
 843 and itype of B otherwise.)
 844
 845 =cut
 846
 847 sub _join_queries {
 848     my ( $self, @parts ) = @_;
 849
 850     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 851     my @mc_parts =
 852       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 853     return () unless @norm_parts + @mc_parts;
 854     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 855
 856     # Group limits by field, so they can be OR'ed together
 857     my %mc_limits;
 858     foreach my $mc_part (@mc_parts) {
 859         my ($field, $value) = split /:/, $mc_part, 2;
 860         $mc_limits{$field} //= [];
 861         push @{ $mc_limits{$field} }, $value;
 862     }
 863
 864     @mc_parts = map {
 865         sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
 866     } sort keys %mc_limits;
 867
 868     @norm_parts = map { "($_)" } @norm_parts;
 869
 870     return join( ' AND ', @norm_parts, @mc_parts);
 871 }
 872
 873 =head2 _make_phrases
 874
 875     my @phrased_queries = $self->_make_phrases(@query_parts);
 876
 877 This takes the supplied queries and forces them to be phrases by wrapping
 878 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
 879 the quotes outside of them if they're there.
 880
 881 =cut
 882
 883 sub _make_phrases {
 884     my ( $self, @parts ) = @_;
 885     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 886 }
 887
 888 =head2 _create_query_string
 889
 890     my @query_strings = $self->_create_query_string(@queries);
 891
 892 Given a list of hashrefs, it will turn them into a lucene-style query string.
 893 The hash should contain field, type (both for the indexes), operator, and
 894 operand.
 895
 896 =cut
 897
 898 sub _create_query_string {
 899     my ( $self, @queries ) = @_;
 900
 901     map {
 902         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 903         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 904
 905         my $oand = $self->_modify_string_by_type(%$_);
 906         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
 907         "$otor($field$oand)";
 908     } @queries;
 909 }
 910
 911 =head2 _clean_search_term
 912
 913     my $term = $self->_clean_search_term($term);
 914
 915 This cleans a search term by removing any funny characters that may upset
 916 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 917 to ensure those parts are correct.
 918
 919 =cut
 920
 921 sub _clean_search_term {
 922     my ( $self, $term ) = @_;
 923
 924     # Lookahead for checking if we are inside quotes
 925     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 926
 927     # Some hardcoded searches (like with authorities) produce things like
 928     # 'an=123', when it ought to be 'an:123' for our purposes.
 929     $term =~ s/=/:/g;
 930
 931     $term = $self->_convert_index_strings_freeform($term);
 932     $term =~ s/[{}]/"/g;
 933
 934     # Remove unbalanced quotes
 935     my $unquoted = $term;
 936     my $count = ($unquoted =~ tr/"/ /);
 937     if ($count % 2 == 1) {
 938         $term = $unquoted;
 939     }
 940
 941     # Remove unquoted colons that have whitespace on either side of them
 942     $term =~ s/(:+)(\s+)$lookahead/$2/g;
 943     $term =~ s/(\s+)(:+)$lookahead/$1/g;
 944     $term =~ s/^://;
 945
 946     $term = $self->_query_regex_escape_process($term);
 947
 948     return $term;
 949 }
 950
 951 =head2 _query_regex_escape_process
 952
 953     my $query = $self->_query_regex_escape_process($query);
 954
 955 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
 956
 957 =cut
 958
 959 sub _query_regex_escape_process {
 960     my ($self, $query) = @_;
 961     my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
 962     if ($regex_escape_options ne 'dont_escape') {
 963         if ($regex_escape_options eq 'escape') {
 964             # Will escape unescaped slashes (/) while preserving
 965             # unescaped slashes within quotes
 966             # @TODO: assumes quotes are always balanced and will
 967             # not handle escaped qoutes properly, should perhaps be
 968             # replaced with a more general parser solution
 969             # so that this function is ever only provided with unqouted
 970             # query parts
 971             $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
 972         }
 973         elsif($regex_escape_options eq 'unescape_escaped') {
 974             # Will unescape escaped slashes (\/) and escape
 975             # unescaped slashes (/) while preserving slashes within quotes
 976             # The same limitatations as above apply for handling of quotes
 977             $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
 978         }
 979     }
 980     return $query;
 981 }
 982
 983 =head2 _fix_limit_special_cases
 984
 985     my $limits = $self->_fix_limit_special_cases($limits);
 986
 987 This converts any special cases that the limit specifications have into things
 988 that are more readily processable by the rest of the code.
 989
 990 The argument should be an arrayref, and it'll return an arrayref.
 991
 992 =cut
 993
 994 sub _fix_limit_special_cases {
 995     my ( $self, $limits ) = @_;
 996
 997     my @new_lim;
 998     foreach my $l (@$limits) {
 999
1000         # This is set up by opac-search.pl
1001         if ( $l =~ /^yr,st-numeric,ge=/ ) {
1002             my ( $start, $end ) =
1003               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
1004             next unless defined($start) && defined($end);
1005             push @new_lim, "copydate:[$start TO $end]";
1006         }
1007         elsif ( $l =~ /^yr,st-numeric=/ ) {
1008             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1009             next unless defined($date);
1010             $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1011             push @new_lim, "copydate:$date";
1012         }
1013         elsif ( $l =~ /^available$/ ) {
1014             push @new_lim, 'onloan:false';
1015         }
1016         else {
1017             my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1018             $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1019             if ( defined($field) && defined($term) ) {
1020                 push @new_lim, "$field:(\"$term\")";
1021             }
1022             else {
1023                 push @new_lim, $l;
1024             }
1025         }
1026     }
1027     return \@new_lim;
1028 }
1029
=head2 _sort_field

    my $field = $self->_sort_field($field);

Given a field name, this works out what the actual name of the field to sort
on should be. A '__sort' suffix is added for fields that have a sort version
(or that are unknown to C<sort_fields>). Fields without a sort version are
returned as they are, except for text fields, where '.raw' is appended to
avoid sorting on a tokenized value.

The mappings returned by C<get_elasticsearch_mappings> are only read, never
modified.

=cut

sub _sort_field {
    my ( $self, $f ) = @_;

    my $mappings = $self->get_elasticsearch_mappings();

    # Look the field up one level at a time. A chained lookup such as
    # $mappings->{data}{properties}{$f}{type} would autovivify an empty
    # entry for $f inside the mappings when the field is not mapped.
    my $properties =
      ( $mappings && $mappings->{data} )
      ? $mappings->{data}{properties}
      : undef;
    my $field_mapping =
      ( $properties && exists $properties->{$f} )
      ? $properties->{$f}
      : undef;
    my $is_text_field =
         $field_mapping
      && defined $field_mapping->{type}
      && $field_mapping->{type} eq 'text';

    my $has_sort_version = $self->sort_fields()->{$f};
    if ( !defined $has_sort_version || $has_sort_version ) {
        $f .= '__sort';
    }
    else {
        # We need to add '.raw' to text fields without a sort field,
        # otherwise it'll sort based on the tokenised form.
        $f .= '.raw' if $is_text_field;
    }
    return $f;
}
1055
=head2 _truncate_terms

    my $query = $self->_truncate_terms($query);

Given a string query this function appends the '*' wildcard to all terms
except the operators (and, or, not - in any letter case) and terms ending in
a non-word character, such as double quoted strings. The query is split with
L</_split_query> and the resulting terms are joined with single spaces.

=cut

sub _truncate_terms {
    my ( $self, $query ) = @_;

    my %is_operator = map { $_ => 1 } qw/and or not/;

    my @terms;
    for my $token ( $self->_split_query($query) ) {

        # Skip empty tokens
        next if $token =~ /^\s*$/;

        # Only a term that ends in a word character and is not a keyword
        # gets the wildcard
        my $wants_wildcard =
          $token !~ /\W$/ && !$is_operator{ lc($token) };

        push @terms, $wants_wildcard ? "$token*" : $token;
    }

    return join ' ', @terms;
}
1081
=head2 _split_query

    my @token = $self->_split_query($query_str);

Given a string query this function splits it to tokens taking into account
any field prefixes and quoted strings. Tokens that are empty or consist of
whitespace only are not returned.

=cut

# Separator used for tokenizing: either a double quoted string, optionally
# preceded by a field prefix, or a run of whitespace. The capture group makes
# split return the separators too, so the quoted strings are not lost.
my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # '"donald duck" title:"the mouse" and peter' gets split into
    # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
    # and only the values containing a non-whitespace character are kept.
    return grep { /\S/ } split( $tokenize_split_re, $query );
}
1105
=head2 _search_fields

    my $weighted_fields = $self->_search_fields({
        is_opac => 0,
        weighted_fields => 1,
        subfield => 'raw'
    });

Generate a list of searchable fields to be used for Elasticsearch queries
applied to multiple fields.

Returns an arrayref of field names for either OPAC or staff interface, with
possible weights and subfield appended to each field name depending on the
options provided.

The list of fields (and their weights) is read from the search field
configuration and cached per interface and index.

=over 4

=item C<$params>

Hashref with options. The parameter C<is_opac> indicates whether the searchable
fields for OPAC or staff interface should be retrieved. If C<weighted_fields> is set
fields weights will be applied on returned fields, as "C<field_name>^C<weight>";
fields without a weight are returned without the "^" part. C<subfield> can be
used to provide a subfield that will be appended to fields as
"C<field_name>.C<subfield>".

=back

=cut

sub _search_fields {
    my ($self, $params) = @_;
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };
    my $cache = Koha::Caches->get_instance();
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );

        # Each entry is [ name ] or [ name, weight ]; the weight is only
        # stored when it is set.
        my @search_fields;
        while (my $search_field = $result->next) {
            push @search_fields, [
                lc $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }
    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used)
                my ($field, $weight) = @{$_};

                # Keep entries without a weight one element long, so that
                # no undefined weight leaks into the result below
                [ "${field}.${subfield}", defined $weight ? $weight : () ];
            } @{$search_fields}
        ];
    }
    if ($params->{weighted_fields}) {
        return [
            map {
                my ($field, $weight) = @{$_};
                defined $weight ? join('^', $field, $weight) : $field;
            } @{$search_fields}
        ];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}
1201
1202 1;