Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
  33     $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51 use Koha::Caches;
  52
  53 =head2 build_query
  54
  55     my $simple_query = $builder->build_query("hello", %options)
  56
  57 This will build a query that can be issued to elasticsearch from the provided
  58 string input. This expects a lucene style search form (see
  59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  60 for details.)
  61
  62 It'll make an attempt to respect the various query options.
  63
  64 Additional options can be provided with the C<%options> hash.
  65
  66 =over 4
  67
  68 =item sort
  69
  70 This should be an arrayref of hashrefs, each containing a C<field> and a
  71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  73
  74 =back
  75
  76 =cut
  77
  sub build_query {
      my ( $self, $query, %options ) = @_;

      # Builds the request body for a bibliographic search: a query_string
      # query over the configured search fields, optional sort clauses and the
      # facet aggregations.
      #
      # $query   - lucene-style query string; undef means "match everything"
      # %options - sort (arrayref of { field, direction }), is_opac,
      #            weighted_fields, whole_record
      #
      # Returns a hashref with the keys query, aggregations and, when sorting
      # was requested, sort. Dies on a sort direction other than asc/desc.

      my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

      $query = '*' unless defined $query;

      my $fields = $self->_search_fields({
          is_opac         => $options{is_opac},
          weighted_fields => $options{weighted_fields},
      });

      # whole_record additionally searches everything under marc_data_array.
      push @$fields, 'marc_data_array.*' if $options{whole_record};

      my $res = {
          query => {
              query_string => {
                  query            => $query,
                  fuzziness        => $fuzzy_enabled ? 'auto' : '0',
                  default_operator => 'AND',
                  fields           => $fields,
                  lenient          => JSON::true,
                  analyze_wildcard => JSON::true,
              }
          }
      };

      foreach my $sort ( @{ $options{sort} || [] } ) {
          my ( $f, $d ) = @$sort{qw/ field direction /};
          die "Invalid sort direction, $d"
            if $d && ( $d ne 'asc' && $d ne 'desc' );
          $d = 'asc' unless $d;

          $f = $self->_sort_field($f);
          push @{ $res->{sort} }, { $f => { order => $d } };
      }

      # See _convert_facets in Search.pm for how these get turned into
      # things that Koha can use.
      my @facets = qw( author subject itype location su-geo title-series ccode ln );

      # An unset preference is treated as "no library facets" instead of
      # triggering "uninitialized value" warnings in the comparisons below.
      my $display_library_facets =
        C4::Context->preference('DisplayLibraryFacets') // '';
      push @facets, 'homebranch'
        if $display_library_facets eq 'both'
        || $display_library_facets eq 'home';
      push @facets, 'holdingbranch'
        if $display_library_facets eq 'both'
        || $display_library_facets eq 'holding';

      # Every facet, the library ones included, is capped by FacetMaxCount.
      my $size = C4::Context->preference('FacetMaxCount');
      my %aggregations;
      foreach my $facet (@facets) {
          $aggregations{$facet} =
            { terms => { field => "${facet}__facet", size => $size } };
      }
      $res->{aggregations} = \%aggregations;

      return $res;
  }
 143
 144 =head2 build_browse_query
 145
 146     my $browse_query = $builder->build_browse_query($field, $query);
 147
 148 This performs a "starts with" style query on a particular field. The field
 149 to be searched must have been indexed with an appropriate mapping as a
 150 "phrase" subfield, which pretty much everything has.
 151
 152 =cut
 153
 154 # XXX this isn't really a browse query like we want in the end
 sub build_browse_query {
     my ( $self, $field, $query ) = @_;

     # Builds a "starts with" query against "$field.phrase", sorted ascending
     # on the sort field that belongs to $field.
     #
     # $field - 'title' or 'author'; anything else (including undef) falls
     #          back to 'title'
     # $query - the prefix to look for; undef yields { query => '*' }
     #
     # Returns a hashref with the keys query and sort.

     my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;

     return { query => '*' } if !defined $query;

     # TODO this should come from Koha::SearchEngine::Elasticsearch
     my %field_whitelist = (
         title  => 1,
         author => 1,
     );

     # The defined() check keeps an omitted field from warning in the lookup.
     $field = 'title'
       if !defined $field || !exists $field_whitelist{$field};

     my $sort = $self->_sort_field($field);

     return {
         query => {
             match_phrase_prefix => {
                 "$field.phrase" => {
                     query     => $query,
                     operator  => 'or',
                     fuzziness => $fuzzy_enabled ? 'auto' : '0',
                 }
             }
         },
         sort => [ { $sort => { order => "asc" } } ],
     };
 }
 182
 183 =head2 build_query_compat
 184
 185     my (
 186         $error,             $query, $simple_query, $query_cgi,
 187         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 188         $stopwords_removed, $query_type
 189       )
 190       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 191         \@limits, \@sort_by, $scan, $lang, $params );
 192
 193 This handles a search using the same api as L<C4::Search::buildQuery> does.
 194
 195 A very simple query will go in with C<$operands> set to ['query'], and
 196 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 197 C<$query> set to something that can perform the search, C<$simple_query>
 198 set to just the search term, C<$query_cgi> set to something that can
 199 reproduce this search, and C<$query_desc> set to something else.
 200
 201 =cut
 202
 sub build_query_compat {
     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
         $lang, $params )
       = @_;

     # Builds a search from the argument lists used by the zebra query
     # builder and returns the same ten-element list of results.
     #
     # $operators/$operands/$indexes - parallel arrayrefs describing each
     #     search box; operator N joins operand N+1 to what precedes it
     # $orig_limits - arrayref of "field:value" limit strings
     # $sort_by     - arrayref of zebra-style sort names, e.g. 'pubdate_dsc'
     # $scan        - true adds scan=1 to the CGI string
     # $lang        - accepted for interface compatibility, not used here
     # $params      - hashref: suppress, is_opac, weighted_fields, whole_record

     my @sort_params  = $self->_convert_sort_fields(@$sort_by);
     my @index_params = $self->_convert_index_fields(@$indexes);
     my $limits       = $self->_fix_limit_special_cases($orig_limits);
     if ( $params->{suppress} ) { push @$limits, "suppress:0"; }

     # Work on copies so the caller's operator list is never modified, even
     # temporarily. The first search term can't have an operator, so the
     # padded list starts with undef to line operators up with operands.
     my @operators        = @{ $operators // [] };
     my @padded_operators = ( undef, @operators );

     # Merge the indexes in with the search terms and the operands so that
     # each search thing is a handy unit.
     my @search_params;
     my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
     my $ea = each_array( @$operands, @padded_operators, @index_params );
     while ( my ( $oand, $otor, $index ) = $ea->() ) {
         next if ( !defined($oand) || $oand eq '' );
         $oand = $self->_clean_search_term($oand);
         $oand = $self->_truncate_terms($oand) if ($truncate);
         push @search_params, {
             operand  => $oand,                                # the search terms
             operator => defined($otor) ? uc $otor : undef,    # AND and so on
             $index ? %$index : (),
         };
     }

     # We build a string query from limits and the queries. An alternative
     # would be to pass them separately into build_query and let it build
     # them into a structured ES query itself. Maybe later, though that'd be
     # more robust.
     my $search_param_query_str =
       join( ' ', $self->_create_query_string(@search_params) );

     # The limit string is needed both inside the query and as a return
     # value, so it is built once.
     my $limit =
       $self->_join_queries( $self->_convert_index_strings(@$limits) );

     # "$x || ()" drops an empty side so join doesn't emit a dangling AND.
     my $query_str =
       join( ' AND ', $search_param_query_str || (), $limit || () );

     # If there's no query on the left, let's remove the junk left behind
     $query_str =~ s/^ AND //;

     my %options = (
         sort            => \@sort_params,
         is_opac         => $params->{is_opac},
         weighted_fields => $params->{weighted_fields},
         whole_record    => $params->{whole_record},
     );
     my $query = $self->build_query( $query_str, %options );

     # We roughly emulate the CGI parameters of the zebra query builder
     my @cgi_parts;
     my $cgi_iter = each_array( @$operands, @operators, @$indexes );
     while ( my ( $oand, $otor, $index ) = $cgi_iter->() ) {

         # each_array pads the shorter lists with undef, hence the defaults.
         my $part = 'idx=' . uri_escape_utf8( $index // '' )
                  . '&q='  . uri_escape_utf8( $oand  // '' );
         $part .= '&op=' . uri_escape_utf8($otor) if $otor;
         push @cgi_parts, $part;
     }
     my $query_cgi = join( '&', @cgi_parts );
     $query_cgi .= '&scan=1' if ($scan);

     # A single operand is reported back as the "simple" query.
     my $simple_query = @$operands == 1 ? $operands->[0] : undef;
     my $query_desc   = $simple_query ? $simple_query : $search_param_query_str;

     my $limit_cgi = ( $orig_limits and @$orig_limits )
       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
       : '';
     my $limit_desc = $limit ? "$limit" : undef;

     return (
         undef,  $query,     $simple_query, $query_cgi, $query_desc,
         $limit, $limit_cgi, $limit_desc,   undef,      undef
     );
 }
 279
 280 =head2 build_authorities_query
 281
 282     my $query = $builder->build_authorities_query(\%search);
 283
 284 This takes a nice description of an authority search and turns it into a black-box
 285 query that can then be passed to the appropriate searcher.
 286
 287 The search description is a hashref that looks something like:
 288
 289     {
 290         searches => [
 291             {
 292                 where    => 'Heading',    # search the main entry
 293                 operator => 'exact',        # require an exact match
 294                 value    => 'frogs',        # the search string
 295             },
 296             {
 297                 where    => '',             # search all entries
 298                 operator => '',             # default keyword, right truncation
 299                 value    => 'pond',
 300             },
 301         ],
 302         sort => {
 303             field => 'Heading',
 304             order => 'desc',
 305         },
 306         authtypecode => 'TOPIC_TERM',
 307     }
 308
 309 =cut
 310
 sub build_authorities_query {
     my ( $self, $search ) = @_;

     # Turns a search description into an Elasticsearch query.
     #
     # $search - hashref with:
     #     searches     - arrayref of { where, operator, value }
     #     sort         - optional sort clause, passed through unchanged
     #     authtypecode - optional; restricts results via authtype.raw
     #
     # Returns { query => { bool => ... } }, plus a sort key when a sort
     # clause was supplied.

     # Start by making the query parts
     my @query_parts;

     foreach my $s ( @{ $search->{searches} } ) {
         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};

         # A missing operator selects the default keyword search, the same
         # as an empty one, without warning in the comparisons below.
         $op //= '';

         if ( $op eq 'is' || $op eq '=' || $op eq 'exact' ) {
             if ($wh) {
                 # Match the whole field, case insensitive, UTF normalized.
                 push @query_parts, { term => { "$wh.ci_raw" => $val } };
             }
             else {
                 # Match the whole field for all searchable fields, case
                 # insensitive, UTF normalized.
                 # Given that field data is "The quick brown fox"
                 # "The quick brown fox" and "the quick brown fox" will match
                 # but not "quick brown fox".
                 push @query_parts, {
                     multi_match => {
                         query  => $val,
                         fields => $self->_search_fields({ subfield => 'ci_raw' }),
                     }
                 };
             }
         }
         elsif ( $op eq 'start' ) {
             # Match the prefix within a field for all searchable fields.
             # Given that field data is "The quick brown fox"
             # "The quick bro" will match, but not "quick bro"
             if ($wh) {
                 # Match prefix of the field.
                 push @query_parts, { prefix => { "$wh.ci_raw" => $val } };
             }
             else {
                 # There does not seem to be a multi-field prefix query, so
                 # one prefix query per field is OR-ed together.
                 my @prefix_queries;
                 foreach my $field ( @{ $self->_search_fields() } ) {
                     push @prefix_queries,
                       { prefix => { "$field.ci_raw" => $val } };
                 }
                 push @query_parts, {
                     'bool' => {
                         'should'               => \@prefix_queries,
                         'minimum_should_match' => 1
                     }
                 };
             }
         }
         else {
             # Query all searchable fields.
             # Given that field data is "The quick brown fox"
             # a search containing any of the words will match, regardless
             # of order.
             my @tokens;
             foreach my $token ( $self->_split_query($val) ) {
                 push @tokens,
                   $self->_truncate_terms( $self->_clean_search_term($token) );
             }
             my $query = $self->_join_queries(@tokens);

             if ($wh) {
                 push @query_parts, {
                     query_string => {
                         default_field    => $wh,
                         analyze_wildcard => JSON::true,
                         query            => $query
                     }
                 };
             }
             else {
                 push @query_parts, {
                     query_string => {
                         analyze_wildcard => JSON::true,
                         query            => $query,
                         fields           => $self->_search_fields(),
                     }
                 };
             }
         }
     }

     # Merge the query parts appropriately
     # 'should' behaves like 'or'
     # 'must' behaves like 'and'
     # Zebra behaviour seems to match 'must', so that is used here
     my $elastic_query = { bool => { must => \@query_parts } };

     # Filter by authtypecode if set
     if ( $search->{authtypecode} ) {
         $elastic_query->{bool}->{filter} = {
             term => {
                 "authtype.raw" => $search->{authtypecode}
             }
         };
     }

     my $query = { query => $elastic_query };

     # Add the sort stuff
     $query->{sort} = [ $search->{sort} ] if exists $search->{sort};

     return $query;
 }
 422
 423 =head2 build_authorities_query_compat
 424
 425     my ($query) =
 426       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 427         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 428
 429 This builds a query for searching for authorities, in the style of
 430 L<C4::AuthoritiesMarc::SearchAuthorities>.
 431
 432 Arguments:
 433
 434 =over 4
 435
 436 =item marclist
 437
 438 An arrayref containing where the particular term should be searched for.
 439 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 440 thesaurus. If left blank, any field is used.
 441
 442 =item and_or
 443
 444 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 445
 446 =item excluding
 447
 448 Also ignored.
 449
 450 =item operator
 451
 452 What form of search to do. Options are: is (phrase, no truncation, whole field
 453 must match), = (number exact match), exact (phrase, no truncation, whole field
 454 must match). If left blank, then word list, right truncated, anywhere is used.
 455
 456 =item value
 457
 458 The actual user-provided string value to search for.
 459
 460 =item authtypecode
 461
 462 The authority type code to search within. If blank, then all will be searched.
 463
 464 =item orderby
 465
 466 The order to sort the results by. Options are Relevance, HeadingAsc,
 467 HeadingDsc, AuthidAsc, AuthidDsc.
 468
 469 =back
 470
 471 marclist, operator, and value must be the same length, and the values at
 472 index /i/ all relate to each other.
 473
 474 This returns a query, which is a black box object that can be passed to the
 475 appropriate search object.
 476
 477 =cut
 478
 # Maps the lower-cased "marclist" names accepted by
 # build_authorities_query_compat to the index searched for each of them.
 # An empty index name means no particular field is targeted.
 our $koha_to_index_name = {
     'mainmainentry' => 'heading-main',
     'mainentry'     => 'heading',
     'match'         => 'match',
     'match-heading' => 'match-heading',
     'see-from'      => 'match-heading-see-from',
     'thesaurus'     => 'subject-heading-thesaurus',
     'any'           => q{},
     'all'           => q{},
 };
 489
 sub build_authorities_query_compat {
     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
         $authtypecode, $orderby )
       = @_;

     # This turns the old-style many-options argument form into a more
     # extensible hash form that is understood by L<build_authorities_query>.
     #
     # $marclist, $operator and $value are parallel arrayrefs; $and_or and
     # $excluding are accepted but not used. Throws
     # Koha::Exceptions::WrongParameter when a marclist entry is not a key
     # of $koha_to_index_name.
     my @searches;

     # Convert to lower case; missing values count as empty strings so they
     # don't warn here.
     $marclist = [ map { lc( $_ // '' ) } @{$marclist} ];
     $orderby  = lc( $orderby // '' );

     # Make sure everything exists
     foreach my $m (@$marclist) {
         Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
             unless exists $koha_to_index_name->{$m};
     }

     foreach my $i ( 0 .. $#{$value} ) {
         my $term = $value->[$i];

         # Clean empty form values, ES doesn't like undefined searches.
         # Tested on definedness and length so a search for "0" is kept.
         next unless defined $term && $term ne '';

         push @searches,
           {
             where    => $koha_to_index_name->{ $marclist->[$i] // '' },
             operator => $operator->[$i],
             value    => $term,
           };
     }

     # Only orderings starting with "heading" or "auth" produce a sort
     # clause; anything else leaves the query unsorted.
     my %sort;
     my $sort_field =
         ( $orderby =~ /^heading/ ) ? 'heading__sort'
       : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
       :                              undef;
     if ($sort_field) {
         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
         %sort = ( $sort_field => $sort_order, );
     }

     my %search = (
         searches     => \@searches,
         authtypecode => $authtypecode,
     );
     $search{sort} = \%sort if %sort;

     return $self->build_authorities_query( \%search );
 }
 535
 536 =head2 _convert_sort_fields
 537
 538     my @sort_params = _convert_sort_fields(@sort_by)
 539
 540 Converts the zebra-style sort index information into elasticsearch-style.
 541
 542 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 543 something that can be sent to L<build_query>.
 544
 545 =cut
 546
 sub _convert_sort_fields {
     my ( $self, @sort_by ) = @_;

     # Converts zebra-style sort names of the form "<field>_<order>" (for
     # example 'pubdate_dsc') into { field => ..., direction => ... }
     # hashrefs. Entries that are undefined, have no "_<order>" part, or
     # name a field without a sort index are dropped. An unrecognised order
     # gives an undefined direction.

     # Turn the sorting into something we care about.
     my %sort_field_convert = (
         acqdate     => 'date-of-acquisition',
         author      => 'author',
         call_number => 'local-classification',
         popularity  => 'issues',
         relevance   => undef,       # default
         title       => 'title',
         pubdate     => 'date-of-publication',
     );
     my %sort_order_convert = (
         desc => 'desc',
         dsc  => 'desc',
         asc  => 'asc',
         az   => 'asc',
         za   => 'desc',
     );

     my @converted;
     foreach my $spec (@sort_by) {
         next unless defined $spec;

         # The greedy first group keeps underscores inside the field name,
         # so 'call_number_asc' splits into 'call_number' and 'asc'.
         my ( $name, $order ) = $spec =~ /(.+)_(.+)/;
         next unless defined $name;

         my $field = $sort_field_convert{$name};
         next unless $field;

         push @converted,
           { field => $field, direction => $sort_order_convert{$order} };
     }

     return @converted;
 }
 572
 573 =head2 _convert_index_fields
 574
 575     my @index_params = $self->_convert_index_fields(@indexes);
 576
 577 Converts zebra-style search index notation into elasticsearch-style.
 578
 579 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 580 and it returns something that can be sent to L<build_query>.
 581
 582 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 583 types.
 584
 585 =cut
 586
 # Aliases accepted in queries, mapped to the field name each one is
 # rewritten to. Used by _convert_index_fields and
 # _convert_index_strings_freeform.
 our %index_field_convert = (
     'kw'                      => '',
     'ab'                      => 'abstract',
     'au'                      => 'author',
     'lcn'                     => 'local-classification',
     'callnum'                 => 'local-classification',
     'record-type'             => 'rtype',
     'mc-rtype'                => 'rtype',
     'mus'                     => 'rtype',
     'lc-card'                 => 'lc-card-number',
     'sn'                      => 'local-number',
     'yr'                      => 'date-of-publication',
     'pubdate'                 => 'date-of-publication',
     'acqdate'                 => 'date-of-acquisition',
     'date/time-last-modified' => 'date-time-last-modified',
     'dtlm'                    => 'date-time-last-modified',
     'diss'                    => 'dissertation-information',
     'nb'                      => 'isbn',
     'ns'                      => 'issn',
     'music-number'            => 'identifier-publisher-for-music',
     'number-music-publisher'  => 'identifier-publisher-for-music',
     'music'                   => 'identifier-publisher-for-music',
     'ident'                   => 'identifier-standard',
     'cpn'                     => 'corporate-name',
     'cfn'                     => 'conference-name',
     'pn'                      => 'personal-name',
     'pb'                      => 'publisher',
     'pv'                      => 'provider',
     'nt'                      => 'note',
     'notes'                   => 'note',
     'rcn'                     => 'record-control-number',
     'su'                      => 'subject',
     'su-to'                   => 'subject',
     #'su-geo'                 => 'subject',
     'su-ut'                   => 'subject',
     'ti'                      => 'title',
     'se'                      => 'title-series',
     'ut'                      => 'title-uniform',
     'an'                      => 'koha-auth-number',
     'authority-number'        => 'koha-auth-number',
     'at'                      => 'authtype',
     'he'                      => 'heading',
     'rank'                    => 'relevance',
     'phr'                     => 'st-phrase',
     'wrdl'                    => 'st-word-list',
     'rt'                      => 'right-truncation',
     'rtrn'                    => 'right-truncation',
     'ltrn'                    => 'left-truncation',
     'rltrn'                   => 'left-and-right',
     'mc-itemtype'             => 'itemtype',
     'mc-ccode'                => 'ccode',
     'branch'                  => 'homebranch',
     'mc-loc'                  => 'location',
     'stocknumber'             => 'number-local-acquisition',
     'inv'                     => 'number-local-acquisition',
     'bc'                      => 'barcode',
     'mc-itype'                => 'itype',
     'aub'                     => 'author-personal-bibliography',
     'auo'                     => 'author-in-order',
     'ff8-22'                  => 'ta',
     'aud'                     => 'ta',
     'audience'                => 'ta',
     'frequency-code'          => 'ff8-18',
     'illustration-code'       => 'ff8-18-21',
     'regularity-code'         => 'ff8-19',
     'type-of-serial'          => 'ff8-21',
     'format'                  => 'ff8-23',
     'conference-code'         => 'ff8-29',
     'festschrift-indicator'   => 'ff8-30',
     'index-indicator'         => 'ff8-31',
     'fiction'                 => 'lf',
     'fic'                     => 'lf',
     'literature-code'         => 'lf',
     'biography'               => 'bio',
     'ff8-34'                  => 'bio',
     'biography-code'          => 'bio',
     'l-format'                => 'ff7-01-02',
     'lex'                     => 'lexile-number',
     'hi'                      => 'host-item-number',
     'itu'                     => 'index-term-uncontrolled',
     'itg'                     => 'index-term-genre',
 );

 # Regex source for a field name: word characters and hyphens.
 my $field_name_pattern = q{[\w\-]+};

 # Regex source for zero or more ".name" suffixes following a field name.
 my $multi_field_pattern = '(?:\.' . $field_name_pattern . ')*';

 671
 sub _convert_index_fields {
     my ( $self, @indexes ) = @_;

     # Converts zebra-style index names ("field" or "field,type") into
     # { field => ..., type => ... } hashrefs.
     #
     # Returns one value per input, in the same order: a hashref, or undef
     # when the entry converts to an empty field name. Callers that pair the
     # result with other lists rely on that one-to-one correspondence.

     my %index_type_convert = (
         __default => undef,
         phr       => 'phrase',
         rtrn      => 'right-truncate',
         'st-year' => 'st-year',
     );

     my @converted;
     foreach my $index (@indexes) {

         # Lower case all field names. An undefined entry is treated like an
         # empty one so that it doesn't warn.
         my ( $f, $t ) = map { lc } split /,/, $index // '';
         if ( !defined $f || $f eq '' ) {
             push @converted, undef;
             next;
         }

         # If a field starts with mc- we save it as it's used (and removed)
         # later when joining things, to indicate we make it an 'OR' join.
         my $mc = ( $f =~ s/^mc-// ) ? 'mc-' : '';

         # Convert according to our table; unknown names pass through as-is.
         my $field =
           exists $index_field_convert{$f} ? $index_field_convert{$f} : $f;
         $field = $mc . $field if $mc && $field;

         push @converted,
           $field
           ? { field => $field, type => $index_type_convert{ $t // '__default' } }
           : undef;
     }

     return @converted;
 }
 698
 699 =head2 _convert_index_strings
 700
 701     my @searches = $self->_convert_index_strings(@searches);
 702
 703 Similar to L<_convert_index_fields>, this takes strings of the form
 704 B<field:search term> and rewrites the field from zebra-style to
 705 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 706
 707 =cut
 708
 sub _convert_index_strings {
     my ( $self, @searches ) = @_;

     # Rewrites each "field:term" string using _convert_index_fields for the
     # field part and _modify_string_by_type for the term. Empty strings are
     # skipped; strings that don't parse or don't convert are passed through
     # unchanged.
     my @converted;

   SEARCH:
     for my $search (@searches) {
         next SEARCH if $search eq '';

         my ( $field, $term ) = $search =~ /^\s*([\w,-]*?):(.*)/;

         my $conv;
         ($conv) = $self->_convert_index_fields($field)
           if defined($field) && defined($term);

         if ( !defined($conv) ) {
             push @converted, $search;
             next SEARCH;
         }

         my $prefix = $conv->{field} ? $conv->{field} . ':' : '';
         push @converted,
           $prefix . $self->_modify_string_by_type( %$conv, operand => $term );
     }

     return @converted;
 }
 729
 730 =head2 _convert_index_strings_freeform
 731
 732     my $search = $self->_convert_index_strings_freeform($search);
 733
 734 This is similar to L<_convert_index_strings>, however it'll search out the
 735 things to change within the string. So it can handle strings such as
 736 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 737
 738 If there is something of the form "su,complete-subfield" or something, the
 739 second part is stripped off as we can't yet handle that. Making it work
 740 will have to wait for a real query parser.
 741
 742 =cut
 743
 sub _convert_index_strings_freeform {
     my ( $self, $search ) = @_;

     # @TODO: Currently this also alters fields contained within quotes:
     # `searching for "stuff cn:123"` for example will become
     # `searching for "stuff local-number:123"
     #
     # Fixing this is tricky, one possibility:
     # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
     # Still not perfect, and will not handle escaped quotes within quotes and
     # assumes balanced quotes.
     #
     # Another, not so elegant, solution could be to replace all quoted
     # content with placeholders, and put them back when processing is done.

     # Pass 1: lower-case every field name and drop a ",type" suffix that
     # follows it, keeping any ".subfield" parts.
     $search =~ s{($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):}{\L$1\E$2:}og;

     # Pass 2: replace field names that are known aliases.
     $search =~ s{($field_name_pattern)($multi_field_pattern):}{
         ( exists $index_field_convert{$1} ? $index_field_convert{$1} : $1 ) . "$2:"
     }oge;

     return $search;
 }
 763
 764 =head2 _modify_string_by_type
 765
 766     my $str = $self->_modify_string_by_type(%index_field);
 767
 768 If you have a search term (operand) and a type (phrase, right-truncated), this
 769 will convert the string to have the function in lucene search terms, e.g.
 770 wrapping quotes around it.
 771
 772 =cut
 773
 sub _modify_string_by_type {
     my ( $self, %idx ) = @_;

     # Applies the index type to a search term:
     #   right-truncate - appends '*'
     #   phrase         - wraps the term in double quotes
     #   st-year        - turns "from-until" into "[from TO until]", with a
     #                    missing bound replaced by '*'
     # Any other type returns the operand untouched.
     #
     # %idx - type and operand, as produced by _convert_index_fields plus
     #        the search term
     my $type = $idx{type} || '';
     my $str  = $idx{operand};

     # Empty or undef, we can't use it. Tested on definedness and length
     # rather than truth so that the term "0" is still processed.
     return $str unless defined $str && $str ne '';

     return $str . '*'     if $type eq 'right-truncate';
     return '"' . $str . '"' if $type eq 'phrase';

     if ( $type eq 'st-year' && $str =~ /^(.*)-(.*)$/ ) {
         my $from  = length($1) ? $1 : '*';
         my $until = length($2) ? $2 : '*';
         return "[$from TO $until]";
     }

     return $str;
 }
 792
 793 =head2 _join_queries
 794
 795     my $query_str = $self->_join_queries(@query_parts);
 796
 797 This takes a list of query parts, that might be search terms on their own, or
 798 booleaned together, or specifying fields, or whatever, wraps them in
 799 parentheses, and ANDs them all together. Suitable for feeding to the ES
 800 query string query.
 801
 802 Note: doesn't AND them together if they specify an index that starts with "mc"
 803 as that was a special case in the original code for dealing with multiple
 804 choice options (you can't search for something that has an itype of A and
 805 an itype of B otherwise.)
 806
 807 =cut
 808
 sub _join_queries {
     my ( $self, @parts ) = @_;

     # Sort the usable parts into ordinary ones and "mc-" (multiple choice)
     # ones; the mc- marker itself is stripped. Undefined and empty parts
     # are ignored.
     my ( @plain, @multi_choice );
     for my $part (@parts) {
         next if !defined($part) || $part eq '';
         if ( $part =~ /^mc-/ ) {
             push @multi_choice, $part =~ s/^mc-//r;
         }
         else {
             push @plain, $part;
         }
     }

     my $total = @plain + @multi_choice;
     return () if $total == 0;

     # A lone part is handed back as it is, without parentheses.
     return ( @plain ? $plain[0] : $multi_choice[0] ) if $total == 1;

     # Ordinary parts are ANDed; the multiple choice parts are ORed inside a
     # single group which is then ANDed with the rest.
     my @clauses;
     push @clauses, join( ' AND ', map { "($_)" } @plain ) if @plain;
     push @clauses, '(' . join( ' OR ', map { "($_)" } @multi_choice ) . ')'
       if @multi_choice;

     return join( ' AND ', @clauses );
 }
 828
 829 =head2 _make_phrases
 830
 831     my @phrased_queries = $self->_make_phrases(@query_parts);
 832
 833 This takes the supplied queries and forces them to be phrases by wrapping
 834 quotes around them. It understands field prefixes, e.g. 'subject:', and puts
 835 the quotes after the prefix, around the rest of the query.
 836
 837 =cut
 838
 839 sub _make_phrases {
 840     my ( $self, @parts ) = @_;
 841     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 842 }
 843
 844 =head2 _create_query_string
 845
 846     my @query_strings = $self->_create_query_string(@queries);
 847
 848 Given a list of hashrefs, it will turn them into a lucene-style query string.
 849 The hash should contain field, type (both for the indexes), operator, and
 850 operand.
 851
 852 =cut
 853
 854 sub _create_query_string {
 855     my ( $self, @queries ) = @_;
 856
 857     map {
 858         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 859         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 860
 861         my $oand = $self->_modify_string_by_type(%$_);
 862         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
 863         "$otor($field$oand)";
 864     } @queries;
 865 }
 866
 867 =head2 _clean_search_term
 868
 869     my $term = $self->_clean_search_term($term);
 870
 871 This cleans a search term by removing any funny characters that may upset
 872 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 873 to ensure those parts are correct.
 874
 875 =cut
 876
 877 sub _clean_search_term {
 878     my ( $self, $term ) = @_;
 879
 880     # Lookahead for checking if we are inside quotes
 881     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 882
 883     # Some hardcoded searches (like with authorities) produce things like
 884     # 'an=123', when it ought to be 'an:123' for our purposes.
 885     $term =~ s/=/:/g;
 886
 887     $term = $self->_convert_index_strings_freeform($term);
 888     $term =~ s/[{}]/"/g;
 889
 890     # Remove unbalanced quotes
 891     my $unquoted = $term;
 892     my $count = ($unquoted =~ tr/"/ /);
 893     if ($count % 2 == 1) {
 894         $term = $unquoted;
 895     }
 896
 897     # Remove unquoted colons that have whitespace on either side of them
 898     $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
 899
 900     $term = $self->_query_regex_escape_process($term);
 901
 902     return $term;
 903 }
 904
 905 =head2 _query_regex_escape_process
 906
 907     my $query = $self->_query_regex_escape_process($query);
 908
 909 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
 910
 911 =cut
 912
 913 sub _query_regex_escape_process {
 914     my ($self, $query) = @_;
 915     my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
 916     if ($regex_escape_options ne 'dont_escape') {
 917         if ($regex_escape_options eq 'escape') {
 918             # Will escape unescaped slashes (/) while preserving
 919             # unescaped slashes within quotes
 920             # @TODO: assumes quotes are always balanced and will
 921             # not handle escaped qoutes properly, should perhaps be
 922             # replaced with a more general parser solution
 923             # so that this function is ever only provided with unqouted
 924             # query parts
 925             $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
 926         }
 927         elsif($regex_escape_options eq 'unescape_escaped') {
 928             # Will unescape escaped slashes (\/) and escape
 929             # unescaped slashes (/) while preserving slashes within quotes
 930             # The same limitatations as above apply for handling of quotes
 931             $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
 932         }
 933     }
 934     return $query;
 935 }
 936
 937 =head2 _fix_limit_special_cases
 938
 939     my $limits = $self->_fix_limit_special_cases($limits);
 940
 941 This converts any special cases that the limit specifications have into things
 942 that are more readily processable by the rest of the code.
 943
 944 The argument should be an arrayref, and it'll return an arrayref.
 945
 946 =cut
 947
 948 sub _fix_limit_special_cases {
 949     my ( $self, $limits ) = @_;
 950
 951     my @new_lim;
 952     foreach my $l (@$limits) {
 953
 954         # This is set up by opac-search.pl
 955         if ( $l =~ /^yr,st-numeric,ge=/ ) {
 956             my ( $start, $end ) =
 957               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
 958             next unless defined($start) && defined($end);
 959             push @new_lim, "copydate:[$start TO $end]";
 960         }
 961         elsif ( $l =~ /^yr,st-numeric=/ ) {
 962             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
 963             next unless defined($date);
 964             $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
 965             push @new_lim, "copydate:$date";
 966         }
 967         elsif ( $l =~ /^available$/ ) {
 968             push @new_lim, 'onloan:false';
 969         }
 970         else {
 971             push @new_lim, $l;
 972         }
 973     }
 974     return \@new_lim;
 975 }
 976
 977 =head2 _sort_field
 978
 979     my $field = $self->_sort_field($field);
 980
 981 Given a field name, this works out what the actual name of the field to sort
 982 on should be. A '__sort' suffix is added for fields with a sort version, and
 983 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
 984 to avoid sorting on a tokenized value.
 985
 986 =cut
 987
 988 sub _sort_field {
 989     my ($self, $f) = @_;
 990
 991     my $mappings = $self->get_elasticsearch_mappings();
 992     my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
 993     if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
 994         $f .= '__sort';
 995         # We need to add '.phrase' to text fields, otherwise it'll sort
 996         # based on the tokenised form.
 997         $f .= '.phrase' if $textField;
 998     } else {
 999         # We need to add '.raw' to text fields without a sort field,
1000         # otherwise it'll sort based on the tokenised form.
1001         $f .= '.raw' if $textField;
1002     }
1003     return $f;
1004 }
1005
1006 =head2 _truncate_terms
1007
1008     my $query = $self->_truncate_terms($query);
1009
1010 Given a string query this function appends a '*' wildcard to all terms except
1011 boolean operators (and, or, not) and double-quoted strings.
1012
1013 =cut
1014
sub _truncate_terms {
    my ( $self, $query ) = @_;

    # Boolean keywords are never truncated, whatever their case.
    my %is_boolean_operator = map { $_ => 1 } qw/and or not/;

    my @terms;
    for my $token ( $self->_split_query($query) ) {

        # Skip tokens that hold nothing but whitespace.
        next unless $token =~ /\S/;

        # A token ending in a non-word character (a closing quote, for
        # example) is kept as it is, and so is a boolean keyword. Anything
        # else gets the '*' wildcard appended.
        my $keep_as_is = $token =~ /\W$/ || $is_boolean_operator{ lc $token };
        push @terms, $keep_as_is ? $token : "$token*";
    }

    return join ' ', @terms;
}
1031
1032 =head2 _split_query
1033
1034     my @token = $self->_split_query($query_str);
1035
1036 Given a string query this function splits it to tokens taking into account
1037 any field prefixes and quoted strings.
1038
1039 =cut
1040
# Separator pattern for _split_query: a double-quoted string, optionally
# preceded by a field name (with optional multi-field part) and a colon, or a
# run of whitespace. The capture group makes split() return the separators too.
my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # '"donald duck" title:"the mouse" and peter' is first split into
    # ('', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter')
    # and the empty and whitespace-only pieces are then dropped.
    return grep { /\S/ } split $tokenize_split_re, $query;
}
1055
1056 =head2 _search_fields

1057     my $weighted_fields = $self->_search_fields({
1058         is_opac => 0,
1059         weighted_fields => 1,
1060         subfield => 'raw'
1061     });
1062
1063 Generate a list of searchable fields to be used for Elasticsearch queries
1064 applied to multiple fields.
1065
1066 Returns an arrayref of field names for either OPAC or Staff client, with
1067 possible weights and subfield appended to each field name depending on the
1068 options provided.
1069
1070 =over 4
1071
1072 =item C<$params>
1073
1074 Hashref with options. The parameter C<is_opac> indicates whether the searchable
1075 fields for OPAC or Staff client should be retrieved. If C<weighted_fields> is set
1076 fields weights will be applied on returned fields. C<subfield> can be used to
1077 provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
1078
1079 =back
1080
1081 =cut
1082
sub _search_fields {
    my ($self, $params) = @_;

    # Defaults apply only when no options hashref is passed at all.
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };

    # The list of [name] / [name, weight] pairs is cached per interface
    # (OPAC or staff client).
    # NOTE(review): the cache key does not vary with $self->index or the
    # marcflavour preference although the query below filters on both;
    # confirm that builders for different indexes cannot share a cache.
    my $cache = Koha::Caches->get_instance();
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client');
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );
        my @search_fields;
        while (my $search_field = $result->next) {
            # Fields without a (true) weight are stored as one-element
            # arrays, so no weight is appended for them later on.
            push @search_fields, [
                $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }
    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used)
                my ($field, $weight) = @{$_};

                # Keep unweighted fields as one-element arrays: carrying
                # an undef weight along would make the join below produce
                # "field.subfield^" with a dangling caret.
                [ "${field}.${subfield}", ( defined $weight ? $weight : () ) ];
            } @{$search_fields}
        ];
    }
    if ($params->{weighted_fields}) {
        # "name^weight" for weighted fields, plain "name" otherwise.
        return [map { join('^', @{$_}) } @{$search_fields}];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}
1151
1152 1;