Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
  33     my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51 use Koha::Caches;
  52
  53 =head2 build_query
  54
  55     my $simple_query = $builder->build_query("hello", %options)
  56
  57 This will build a query that can be issued to elasticsearch from the provided
  58 string input. This expects a lucene style search form (see
  59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  60 for details.)
  61
  62 It'll make an attempt to respect the various query options.
  63
  64 Additional options can be provided with the C<%options> hash.
  65
  66 =over 4
  67
  68 =item sort
  69
  70 This should be an arrayref of hashrefs, each containing a C<field> and an
  71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  73
  74 =back
  75
  76 =cut
  77
  78 sub build_query {
  79     my ( $self, $query, %options ) = @_;
  80
  81     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
  82     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
  83     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
  84
  85     $query = '*' unless defined $query;
  86
  87     my $res;
  88     my $fields = $self->_search_fields({
  89         is_opac => $options{is_opac},
  90         weighted_fields => $options{weighted_fields},
  91     });
  92     if ($options{whole_record}) {
  93         push @$fields, 'marc_data_array.*';
  94     }
  95     $res->{query} = {
  96         query_string => {
  97             query            => $query,
  98             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
  99             default_operator => 'AND',
 100             fields           => $fields,
 101             lenient          => JSON::true,
 102             analyze_wildcard => JSON::true,
 103         }
 104     };
 105
 106     if ( $options{sort} ) {
 107         foreach my $sort ( @{ $options{sort} } ) {
 108             my ( $f, $d ) = @$sort{qw/ field direction /};
 109             die "Invalid sort direction, $d"
 110               if $d && ( $d ne 'asc' && $d ne 'desc' );
 111             $d = 'asc' unless $d;
 112
 113             $f = $self->_sort_field($f);
 114             push @{ $res->{sort} }, { $f => { order => $d } };
 115         }
 116     }
 117
 118     # See _convert_facets in Search.pm for how these get turned into
 119     # things that Koha can use.
 120     my $size = C4::Context->preference('FacetMaxCount');
 121     $res->{aggregations} = {
 122         author         => { terms => { field => "author__facet" , size => $size } },
 123         subject        => { terms => { field => "subject__facet", size => $size } },
 124         itype          => { terms => { field => "itype__facet", size => $size} },
 125         location       => { terms => { field => "location__facet", size => $size } },
 126         'su-geo'       => { terms => { field => "su-geo__facet", size => $size} },
 127         'title-series' => { terms => { field => "title-series__facet", size => $size } },
 128         ccode          => { terms => { field => "ccode__facet", size => $size } },
 129         ln             => { terms => { field => "ln__facet", size => $size } },
 130     };
 131
 132     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 133     if (   $display_library_facets eq 'both'
 134         or $display_library_facets eq 'home' ) {
 135         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
 136     }
 137     if (   $display_library_facets eq 'both'
 138         or $display_library_facets eq 'holding' ) {
 139         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
 140     }
 141     return $res;
 142 }
 143
 144 =head2 build_browse_query
 145
 146     my $browse_query = $builder->build_browse_query($field, $query);
 147
 148 This performs a "starts with" style query on a particular field. The field
 149 to be searched must have been indexed with an appropriate mapping as a
 150 "phrase" subfield, which pretty much everything has.
 151
 152 =cut
 153
 154 # XXX this isn't really a browse query like we want in the end
 155 sub build_browse_query {
 156     my ( $self, $field, $query ) = @_;
 157
 158     my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
 159
 160     return { query => '*' } if !defined $query;
 161
 162     # TODO this should come from Koha::SearchEngine::Elasticsearch
 163     my %field_whitelist = (
 164         title  => 1,
 165         author => 1,
 166     );
 167     $field = 'title' if !exists $field_whitelist{$field};
 168     my $sort = $self->_sort_field($field);
 169     my $res = {
 170         query => {
 171             match_phrase_prefix => {
 172                 "$field.phrase" => {
 173                     query     => $query,
 174                     operator  => 'or',
 175                     fuzziness => $fuzzy_enabled ? 'auto' : '0',
 176                 }
 177             }
 178         },
 179         sort => [ { $sort => { order => "asc" } } ],
 180     };
 181 }
 182
 183 =head2 build_query_compat
 184
 185     my (
 186         $error,             $query, $simple_query, $query_cgi,
 187         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 188         $stopwords_removed, $query_type
 189       )
 190       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 191         \@limits, \@sort_by, $scan, $lang, $params );
 192
 193 This handles a search using the same api as L<C4::Search::buildQuery> does.
 194
 195 A very simple query will go in with C<$operands> set to ['query'], and
 196 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 197 C<$query> set to something that can perform the search, C<$simple_query>
 198 set to just the search term, C<$query_cgi> set to something that can
 199 reproduce this search, and C<$query_desc> set to something else.
 200
 201 =cut
 202
 203 sub build_query_compat {
 204     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 205         $lang, $params )
 206       = @_;
 207
 208 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
 209     my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 210     my @index_params = $self->_convert_index_fields(@$indexes);
 211     my $limits       = $self->_fix_limit_special_cases($orig_limits);
 212     if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
 213     # Merge the indexes in with the search terms and the operands so that
 214     # each search thing is a handy unit.
 215     unshift @$operators, undef;    # The first one can't have an op
 216     my @search_params;
 217     my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 218     my $ea = each_array( @$operands, @$operators, @index_params );
 219     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 220         next if ( !defined($oand) || $oand eq '' );
 221         $oand = $self->_clean_search_term($oand);
 222         $oand = $self->_truncate_terms($oand) if ($truncate);
 223         push @search_params, {
 224             operand => $oand,      # the search terms
 225             operator => defined($otor) ? uc $otor : undef,    # AND and so on
 226             $index ? %$index : (),
 227         };
 228     }
 229
 230     # We build a string query from limits and the queries. An alternative
 231     # would be to pass them separately into build_query and let it build
 232     # them into a structured ES query itself. Maybe later, though that'd be
 233     # more robust.
 234     my $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
 235     my $query_str = join( ' AND ',
 236         $search_param_query_str || (),
 237         $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 238
 239     # If there's no query on the left, let's remove the junk left behind
 240     $query_str =~ s/^ AND //;
 241     my %options;
 242     $options{sort} = \@sort_params;
 243     $options{is_opac} = $params->{is_opac};
 244     $options{weighted_fields} = $params->{weighted_fields};
 245     $options{whole_record} = $params->{whole_record};
 246     my $query = $self->build_query( $query_str, %options );
 247
 248     # We roughly emulate the CGI parameters of the zebra query builder
 249     my $query_cgi = '';
 250     shift @$operators; # Shift out the one we unshifted before
 251     $ea = each_array( @$operands, @$operators, @$indexes );
 252     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 253         $query_cgi .= '&' if $query_cgi;
 254         $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
 255         $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
 256     }
 257     $query_cgi .= '&scan=1' if ( $scan );
 258
 259     my $simple_query;
 260     $simple_query = $operands->[0] if @$operands == 1;
 261     my $query_desc;
 262     if ( $simple_query ) {
 263         $query_desc = $simple_query;
 264     } else {
 265         $query_desc = $search_param_query_str;
 266     }
 267     my $limit     = $self->_join_queries( $self->_convert_index_strings(@$limits));
 268     my $limit_cgi = ( $orig_limits and @$orig_limits )
 269       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 270       : '';
 271     my $limit_desc;
 272     $limit_desc = "$limit" if $limit;
 273
 274     return (
 275         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 276         $limit, $limit_cgi, $limit_desc,   undef,      undef
 277     );
 278 }
 279
 280 =head2 build_authorities_query
 281
 282     my $query = $builder->build_authorities_query(\%search);
 283
 284 This takes a nice description of an authority search and turns it into a black-box
 285 query that can then be passed to the appropriate searcher.
 286
 287 The search description is a hashref that looks something like:
 288
 289     {
 290         searches => [
 291             {
 292                 where    => 'Heading',    # search the main entry
 293                 operator => 'exact',        # require an exact match
 294                 value    => 'frogs',        # the search string
 295             },
 296             {
 297                 where    => '',             # search all entries
 298                 operator => '',             # default keyword, right truncation
 299                 value    => 'pond',
 300             },
 301         ],
 302         sort => {
 303             field => 'Heading',
 304             order => 'desc',
 305         },
 306         authtypecode => 'TOPIC_TERM',
 307     }
 308
 309 =cut
 310
 311 sub build_authorities_query {
 312     my ( $self, $search ) = @_;
 313
 314     # Start by making the query parts
 315     my @query_parts;
 316
 317     foreach my $s ( @{ $search->{searches} } ) {
 318         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 319         if ( $op eq 'is' || $op eq '=' || $op eq 'exact') {
 320             if ($wh) {
 321                 # Match the whole field, case insensitive, UTF normalized.
 322                 push @query_parts, { term => { "$wh.ci_raw" => $val } };
 323             }
 324             else {
 325                 # Match the whole field for all searchable fields, case insensitive,
 326                 # UTF normalized.
 327                 # Given that field data is "The quick brown fox"
 328                 # "The quick brown fox" and "the quick brown fox" will match
 329                 # but not "quick brown fox".
 330                 push @query_parts, {
 331                     multi_match => {
 332                         query => $val,
 333                         fields => $self->_search_fields({ subfield => 'ci_raw' }),
 334                     }
 335                 };
 336             }
 337         }
 338         elsif ( $op eq 'start') {
 339             # Match the prefix within a field for all searchable fields.
 340             # Given that field data is "The quick brown fox"
 341             # "The quick bro" will match, but not "quick bro"
 342
 343             # Does not seems to be a multi prefix query
 344             # so we need to create one
 345             if ($wh) {
 346                 # Match prefix of the field.
 347                 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
 348             }
 349             else {
 350                 my @prefix_queries;
 351                 foreach my $field (@{$self->_search_fields()}) {
 352                     push @prefix_queries, {
 353                         prefix => { "$field.ci_raw" => $val }
 354                     };
 355                 }
 356                 push @query_parts, {
 357                     'bool' => {
 358                         'should' => \@prefix_queries,
 359                         'minimum_should_match' => 1
 360                     }
 361                 };
 362             }
 363         }
 364         else {
 365             # Query all searchable fields.
 366             # Given that field data is "The quick brown fox"
 367             # a search containing any of the words will match, regardless
 368             # of order.
 369
 370             my @tokens = $self->_split_query( $val );
 371             foreach my $token ( @tokens ) {
 372                 $token = $self->_truncate_terms(
 373                     $self->_clean_search_term( $token )
 374                 );
 375             }
 376             my $query = $self->_join_queries( @tokens );
 377
 378             if ($wh) {
 379                 push @query_parts, { query_string => { default_field => $wh, query => $query } };
 380             }
 381             else {
 382                 push @query_parts, {
 383                     query_string => {
 384                         query => $query,
 385                         fields => $self->_search_fields(),
 386                     }
 387                 };
 388             }
 389         }
 390     }
 391
 392     # Merge the query parts appropriately
 393     # 'should' behaves like 'or'
 394     # 'must' behaves like 'and'
 395     # Zebra behaviour seem to match must so using that here
 396     my $elastic_query = {};
 397     $elastic_query->{bool}->{must} = \@query_parts;
 398
 399     # Filter by authtypecode if set
 400     if ($search->{authtypecode}) {
 401         $elastic_query->{bool}->{filter} = {
 402             term => {
 403                 "authtype.raw" => $search->{authtypecode}
 404             }
 405         };
 406     }
 407
 408     my $query = {
 409         query => $elastic_query
 410     };
 411
 412     # Add the sort stuff
 413     $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
 414
 415     return $query;
 416 }
 417
 418 =head2 build_authorities_query_compat
 419
 420     my ($query) =
 421       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 422         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 423
 424 This builds a query for searching for authorities, in the style of
 425 L<C4::AuthoritiesMarc::SearchAuthorities>.
 426
 427 Arguments:
 428
 429 =over 4
 430
 431 =item marclist
 432
 433 An arrayref containing where the particular term should be searched for.
 434 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 435 thesaurus. If left blank, any field is used.
 436
 437 =item and_or
 438
 439 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 440
 441 =item excluding
 442
 443 Also ignored.
 444
 445 =item operator
 446
 447 What form of search to do. Options are: is (phrase, no truncation, whole field
 448 must match), = (number exact match), exact (phrase, no truncation, whole field
 449 must match). If left blank, then word list, right truncated, anywhere is used.
 450
 451 =item value
 452
 453 The actual user-provided string value to search for.
 454
 455 =item authtypecode
 456
 457 The authority type code to search within. If blank, then all will be searched.
 458
 459 =item orderby
 460
 461 The order to sort the results by. Options are Relevance, HeadingAsc,
 462 HeadingDsc, AuthidAsc, AuthidDsc.
 463
 464 =back
 465
 466 marclist, operator, and value must be the same length, and the values at
 467 index /i/ all relate to each other.
 468
 469 This returns a query, which is a black box object that can be passed to the
 470 appropriate search object.
 471
 472 =cut
 473
 474 our $koha_to_index_name = {
 475     mainmainentry   => 'heading-main',
 476     mainentry       => 'heading',
 477     match           => 'match',
 478     'match-heading' => 'match-heading',
 479     'see-from'      => 'match-heading-see-from',
 480     thesaurus       => 'subject-heading-thesaurus',
 481     any             => '',
 482     all             => ''
 483 };
 484
 485 sub build_authorities_query_compat {
 486     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 487         $authtypecode, $orderby )
 488       = @_;
 489
 490     # This turns the old-style many-options argument form into a more
 491     # extensible hash form that is understood by L<build_authorities_query>.
 492     my @searches;
 493
 494     # Convert to lower case
 495     $marclist = [map(lc, @{$marclist})];
 496     $orderby  = lc $orderby;
 497
 498     # Make sure everything exists
 499     foreach my $m (@$marclist) {
 500         Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
 501             unless exists $koha_to_index_name->{$m};
 502     }
 503     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 504         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 505         push @searches,
 506           {
 507             where    => $koha_to_index_name->{$marclist->[$i]},
 508             operator => $operator->[$i],
 509             value    => $value->[$i],
 510           };
 511     }
 512
 513     my %sort;
 514     my $sort_field =
 515         ( $orderby =~ /^heading/ ) ? 'heading__sort'
 516       : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
 517       :                              undef;
 518     if ($sort_field) {
 519         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 520         %sort = ( $sort_field => $sort_order, );
 521     }
 522     my %search = (
 523         searches     => \@searches,
 524         authtypecode => $authtypecode,
 525     );
 526     $search{sort} = \%sort if %sort;
 527     my $query = $self->build_authorities_query( \%search );
 528     return $query;
 529 }
 530
 531 =head2 _convert_sort_fields
 532
 533     my @sort_params = _convert_sort_fields(@sort_by)
 534
 535 Converts the zebra-style sort index information into elasticsearch-style.
 536
 537 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 538 something that can be sent to L<build_query>.
 539
 540 =cut
 541
 542 sub _convert_sort_fields {
 543     my ( $self, @sort_by ) = @_;
 544
 545     # Turn the sorting into something we care about.
 546     my %sort_field_convert = (
 547         acqdate     => 'date-of-acquisition',
 548         author      => 'author',
 549         call_number => 'local-classification',
 550         popularity  => 'issues',
 551         relevance   => undef,       # default
 552         title       => 'title',
 553         pubdate     => 'date-of-publication',
 554     );
 555     my %sort_order_convert =
 556       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 557
 558     # Convert the fields and orders, drop anything we don't know about.
 559     grep { $_->{field} } map {
 560         my ( $f, $d ) = /(.+)_(.+)/;
 561         {
 562             field     => $sort_field_convert{$f},
 563             direction => $sort_order_convert{$d}
 564         }
 565     } @sort_by;
 566 }
 567
 568 =head2 _convert_index_fields
 569
 570     my @index_params = $self->_convert_index_fields(@indexes);
 571
 572 Converts zebra-style search index notation into elasticsearch-style.
 573
 574 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 575 and it returns something that can be sent to L<build_query>.
 576
 577 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 578 types.
 579
 580 =cut
 581
 582 our %index_field_convert = (
 583     'kw' => '',
 584     'ab' => 'abstract',
 585     'au' => 'author',
 586     'lcn' => 'local-classification',
 587     'callnum' => 'local-classification',
 588     'record-type' => 'rtype',
 589     'mc-rtype' => 'rtype',
 590     'mus' => 'rtype',
 591     'lc-card' => 'lc-card-number',
 592     'sn' => 'local-number',
 593     'yr' => 'date-of-publication',
 594     'pubdate' => 'date-of-publication',
 595     'acqdate' => 'date-of-acquisition',
 596     'date/time-last-modified' => 'date-time-last-modified',
 597     'dtlm' => 'date-time-last-modified',
 598     'diss' => 'dissertation-information',
 599     'nb' => 'isbn',
 600     'ns' => 'issn',
 601     'music-number' => 'identifier-publisher-for-music',
 602     'number-music-publisher' => 'identifier-publisher-for-music',
 603     'music' => 'identifier-publisher-for-music',
 604     'ident' => 'identifier-standard',
 605     'cpn' => 'corporate-name',
 606     'cfn' => 'conference-name',
 607     'pn' => 'personal-name',
 608     'pb' => 'publisher',
 609     'pv' => 'provider',
 610     'nt' => 'note',
 611     'notes' => 'note',
 612     'rcn' => 'record-control-number',
 613     'su' => 'subject',
 614     'su-to' => 'subject',
 615     #'su-geo' => 'subject',
 616     'su-ut' => 'subject',
 617     'ti' => 'title',
 618     'se' => 'title-series',
 619     'ut' => 'title-uniform',
 620     'an' => 'koha-auth-number',
 621     'authority-number' => 'koha-auth-number',
 622     'at' => 'authtype',
 623     'he' => 'heading',
 624     'rank' => 'relevance',
 625     'phr' => 'st-phrase',
 626     'wrdl' => 'st-word-list',
 627     'rt' => 'right-truncation',
 628     'rtrn' => 'right-truncation',
 629     'ltrn' => 'left-truncation',
 630     'rltrn' => 'left-and-right',
 631     'mc-itemtype' => 'itemtype',
 632     'mc-ccode' => 'ccode',
 633     'branch' => 'homebranch',
 634     'mc-loc' => 'location',
 635     'stocknumber' => 'number-local-acquisition',
 636     'inv' => 'number-local-acquisition',
 637     'bc' => 'barcode',
 638     'mc-itype' => 'itype',
 639     'aub' => 'author-personal-bibliography',
 640     'auo' => 'author-in-order',
 641     'ff8-22' => 'ta',
 642     'aud' => 'ta',
 643     'audience' => 'ta',
 644     'frequency-code' => 'ff8-18',
 645     'illustration-code' => 'ff8-18-21',
 646     'regularity-code' => 'ff8-19',
 647     'type-of-serial' => 'ff8-21',
 648     'format' => 'ff8-23',
 649     'conference-code' => 'ff8-29',
 650     'festschrift-indicator' => 'ff8-30',
 651     'index-indicator' => 'ff8-31',
 652     'fiction' => 'lf',
 653     'fic' => 'lf',
 654     'literature-code' => 'lf',
 655     'biography' => 'bio',
 656     'ff8-34' => 'bio',
 657     'biography-code' => 'bio',
 658     'l-format' => 'ff7-01-02',
 659     'lex' => 'lexile-number',
 660     'hi' => 'host-item-number',
 661     'itu' => 'index-term-uncontrolled',
 662     'itg' => 'index-term-genre',
 663 );
 664 my $field_name_pattern = '[\w\-]+';
 665 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 666
 667 sub _convert_index_fields {
 668     my ( $self, @indexes ) = @_;
 669
 670     my %index_type_convert =
 671       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
 672
 673     # Convert according to our table, drop anything that doesn't convert.
 674     # If a field starts with mc- we save it as it's used (and removed) later
 675     # when joining things, to indicate we make it an 'OR' join.
 676     # (Sorry, this got a bit ugly after special cases were found.)
 677     map {
 678         # Lower case all field names
 679         my ( $f, $t ) = map(lc, split /,/);
 680         my $mc = '';
 681         if ($f =~ /^mc-/) {
 682             $mc = 'mc-';
 683             $f =~ s/^mc-//;
 684         }
 685         my $r = {
 686             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 687             type  => $index_type_convert{ $t // '__default' }
 688         };
 689         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 690         $r->{field} ? $r : undef;
 691     } @indexes;
 692 }
 693
 694 =head2 _convert_index_strings
 695
 696     my @searches = $self->_convert_index_strings(@searches);
 697
 698 Similar to L<_convert_index_fields>, this takes strings of the form
 699 B<field:search term> and rewrites the field from zebra-style to
 700 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 701
 702 =cut
 703
 704 sub _convert_index_strings {
 705     my ( $self, @searches ) = @_;
 706     my @res;
 707     foreach my $s (@searches) {
 708         next if $s eq '';
 709         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 710         unless ( defined($field) && defined($term) ) {
 711             push @res, $s;
 712             next;
 713         }
 714         my ($conv) = $self->_convert_index_fields($field);
 715         unless ( defined($conv) ) {
 716             push @res, $s;
 717             next;
 718         }
 719         push @res, ($conv->{field} ? $conv->{field} . ':' : '')
 720             . $self->_modify_string_by_type( %$conv, operand => $term );
 721     }
 722     return @res;
 723 }
 724
 725 =head2 _convert_index_strings_freeform
 726
 727     my $search = $self->_convert_index_strings_freeform($search);
 728
 729 This is similar to L<_convert_index_strings>, however it'll search out the
 730 things to change within the string. So it can handle strings such as
 731 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 732
 733 If there is something of the form "su,complete-subfield" or something, the
 734 second part is stripped off as we can't yet handle that. Making it work
 735 will have to wait for a real query parser.
 736
 737 =cut
 738
 739 sub _convert_index_strings_freeform {
 740     my ( $self, $search ) = @_;
 741     # @TODO: Currenty will alter also fields contained within quotes:
 742     # `searching for "stuff cn:123"` for example will become
 743     # `searching for "stuff local-number:123"
 744     #
 745     # Fixing this is tricky, one possibility:
 746     # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
 747     # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
 748     #
 749     # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
 750     # them back when processing is done.
 751
 752     # Lower case field names
 753     $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
 754     # Resolve possible field aliases
 755     $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
 756     return $search;
 757 }
 758
 759 =head2 _modify_string_by_type
 760
 761     my $str = $self->_modify_string_by_type(%index_field);
 762
 763 If you have a search term (operand) and a type (phrase, right-truncated), this
 764 will convert the string to have the function in lucene search terms, e.g.
 765 wrapping quotes around it.
 766
 767 =cut
 768
 769 sub _modify_string_by_type {
 770     my ( $self, %idx ) = @_;
 771
 772     my $type = $idx{type} || '';
 773     my $str = $idx{operand};
 774     return $str unless $str;    # Empty or undef, we can't use it.
 775
 776     $str .= '*' if $type eq 'right-truncate';
 777     $str = '"' . $str . '"' if $type eq 'phrase';
 778     if ($type eq 'st-year') {
 779         if ($str =~ /^(.*)-(.*)$/) {
 780             my $from = $1 || '*';
 781             my $until = $2 || '*';
 782             $str = "[$from TO $until]";
 783         }
 784     }
 785     return $str;
 786 }
 787
 788 =head2 _join_queries
 789
 790     my $query_str = $self->_join_queries(@query_parts);
 791
 792 This takes a list of query parts, that might be search terms on their own, or
 793 booleaned together, or specifying fields, or whatever, wraps them in
 794 parentheses, and ANDs them all together. Suitable for feeding to the ES
 795 query string query.
 796
 797 Note: doesn't AND them together if they specify an index that starts with "mc"
 798 as that was a special case in the original code for dealing with multiple
 799 choice options (you can't search for something that has an itype of A and
 800 an itype of B otherwise.)
 801
 802 =cut
 803
 804 sub _join_queries {
 805     my ( $self, @parts ) = @_;
 806
 807     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 808     my @mc_parts =
 809       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 810     return () unless @norm_parts + @mc_parts;
 811     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 812     my $grouped_mc =
 813       @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
 814
 815     # Handy trick: $x || () inside a join means that if $x ends up as an
 816     # empty string, it gets replaced with (), which makes join ignore it.
 817     # (bad effect: this'll also happen to '0', this hopefully doesn't matter
 818     # in this case.)
 819     join( ' AND ',
 820         join( ' AND ', map { "($_)" } @norm_parts ) || (),
 821         $grouped_mc || () );
 822 }
 823
 824 =head2 _make_phrases
 825
 826     my @phrased_queries = $self->_make_phrases(@query_parts);
 827
 828 This takes the supplied queries and forces them to be phrases by wrapping
 829 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
 830 the quotes outside of them if they're there.
 831
 832 =cut
 833
 834 sub _make_phrases {
 835     my ( $self, @parts ) = @_;
 836     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 837 }
 838
 839 =head2 _create_query_string
 840
 841     my @query_strings = $self->_create_query_string(@queries);
 842
 843 Given a list of hashrefs, it will turn them into a lucene-style query string.
 844 The hash should contain field, type (both for the indexes), operator, and
 845 operand.
 846
 847 =cut
 848
 849 sub _create_query_string {
 850     my ( $self, @queries ) = @_;
 851
 852     map {
 853         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 854         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 855
 856         my $oand = $self->_modify_string_by_type(%$_);
 857         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
 858         "$otor($field$oand)";
 859     } @queries;
 860 }
 861
 862 =head2 _clean_search_term
 863
 864     my $term = $self->_clean_search_term($term);
 865
 866 This cleans a search term by removing any funny characters that may upset
 867 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 868 to ensure those parts are correct.
 869
 870 =cut
 871
 872 sub _clean_search_term {
 873     my ( $self, $term ) = @_;
 874
 875     # Lookahead for checking if we are inside quotes
 876     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 877
 878     # Some hardcoded searches (like with authorities) produce things like
 879     # 'an=123', when it ought to be 'an:123' for our purposes.
 880     $term =~ s/=/:/g;
 881
 882     $term = $self->_convert_index_strings_freeform($term);
 883     $term =~ s/[{}]/"/g;
 884
 885     # Remove unbalanced quotes
 886     my $unquoted = $term;
 887     my $count = ($unquoted =~ tr/"/ /);
 888     if ($count % 2 == 1) {
 889         $term = $unquoted;
 890     }
 891
 892     # Remove unquoted colons that have whitespace on either side of them
 893     $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
 894
 895     $term = $self->_query_regex_escape_process($term);
 896
 897     return $term;
 898 }
 899
 900 =head2 _query_regex_escape_process
 901
 902     my $query = $self->_query_regex_escape_process($query);
 903
 904 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
 905
 906 =cut
 907
 908 sub _query_regex_escape_process {
 909     my ($self, $query) = @_;
 910     my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
 911     if ($regex_escape_options ne 'dont_escape') {
 912         if ($regex_escape_options eq 'escape') {
 913             # Will escape unescaped slashes (/) while preserving
 914             # unescaped slashes within quotes
 915             # @TODO: assumes quotes are always balanced and will
 916             # not handle escaped qoutes properly, should perhaps be
 917             # replaced with a more general parser solution
 918             # so that this function is ever only provided with unqouted
 919             # query parts
 920             $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
 921         }
 922         elsif($regex_escape_options eq 'unescape_escaped') {
 923             # Will unescape escaped slashes (\/) and escape
 924             # unescaped slashes (/) while preserving slashes within quotes
 925             # The same limitatations as above apply for handling of quotes
 926             $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
 927         }
 928     }
 929     return $query;
 930 }
 931
 932 =head2 _fix_limit_special_cases
 933
 934     my $limits = $self->_fix_limit_special_cases($limits);
 935
 936 This converts any special cases that the limit specifications have into things
 937 that are more readily processable by the rest of the code.
 938
 939 The argument should be an arrayref, and it'll return an arrayref.
 940
 941 =cut
 942
 943 sub _fix_limit_special_cases {
 944     my ( $self, $limits ) = @_;
 945
 946     my @new_lim;
 947     foreach my $l (@$limits) {
 948
 949         # This is set up by opac-search.pl
 950         if ( $l =~ /^yr,st-numeric,ge=/ ) {
 951             my ( $start, $end ) =
 952               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
 953             next unless defined($start) && defined($end);
 954             push @new_lim, "copydate:[$start TO $end]";
 955         }
 956         elsif ( $l =~ /^yr,st-numeric=/ ) {
 957             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
 958             next unless defined($date);
 959             $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
 960             push @new_lim, "copydate:$date";
 961         }
 962         elsif ( $l =~ /^available$/ ) {
 963             push @new_lim, 'onloan:0';
 964         }
 965         else {
 966             push @new_lim, $l;
 967         }
 968     }
 969     return \@new_lim;
 970 }
 971
 972 =head2 _sort_field
 973
 974     my $field = $self->_sort_field($field);
 975
 976 Given a field name, this works out what the actual name of the field to sort
 977 on should be. A '__sort' suffix is added for fields with a sort version, and
 978 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
 979 to avoid sorting on a tokenized value.
 980
 981 =cut
 982
 983 sub _sort_field {
 984     my ($self, $f) = @_;
 985
 986     my $mappings = $self->get_elasticsearch_mappings();
 987     my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
 988     if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
 989         $f .= '__sort';
 990         # We need to add '.phrase' to text fields, otherwise it'll sort
 991         # based on the tokenised form.
 992         $f .= '.phrase' if $textField;
 993     } else {
 994         # We need to add '.raw' to text fields without a sort field,
 995         # otherwise it'll sort based on the tokenised form.
 996         $f .= '.raw' if $textField;
 997     }
 998     return $f;
 999 }
1000
=head2 _truncate_terms

    my $query = $self->_truncate_terms($query);

Given a string query this function appends the '*' wildcard to every term,
except for the boolean operators (and, or, not, in any letter case) and for
terms that end in a non-word character, such as double quoted strings.
The query is tokenized with L<_split_query> and the resulting terms are
joined with single spaces.

=cut

sub _truncate_terms {
    my ( $self, $query ) = @_;

    my %is_operator = map { $_ => 1 } qw/and or not/;

    my @terms;
    foreach my $token ( $self->_split_query($query) ) {

        # Skip empty and whitespace-only tokens
        next if $token =~ /^\s*$/;

        # Leave operators and anything not ending in a word character alone
        if ( $token =~ /\W$/ || $is_operator{ lc($token) } ) {
            push @terms, $token;
        }
        else {
            push @terms, "$token*";
        }
    }

    return join ' ', @terms;
}
1026
=head2 _split_query

    my @token = $self->_split_query($query_str);

Given a string query this function splits it into tokens, taking into account
any field prefixes and quoted strings: a double quoted string, optionally
preceded by a field prefix, is kept together as one token, and everything
else is split on whitespace. Tokens without any non-whitespace character are
discarded.

=cut

my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # The separators are captured, so split returns them along with the
    # (often empty) text between them, e.g.
    # '"donald duck" title:"the mouse" and peter' gets split into
    # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
    # The empty and whitespace-only values are then filtered out.
    return grep { /\S/ } split $tokenize_split_re, $query;
}
1050
=head2 _search_fields

    my $weighted_fields = $self->_search_fields({
        is_opac => 0,
        weighted_fields => 1,
        subfield => 'raw'
    });

Generate a list of searchable fields to be used for Elasticsearch queries
applied to multiple fields.

Returns an arrayref of field names for either OPAC or Staff client, with
possible weights and subfield appended to each field name depending on the
options provided.

The field list is read from the database once and then kept in the cache,
under a separate key for OPAC and Staff client.

=over 4

=item C<$params>

Hashref with options. The parameter C<is_opac> indicates whether the searchable
fields for OPAC or Staff client should be retrieved. If C<weighted_fields> is set
fields weights will be applied on returned fields, as "C<field_name>^C<weight>";
fields without a weight are returned as the plain field name. C<subfield> can be
used to provide a subfield that will be appended to fields as
"C<field_name>.C<subfield>".

=back

=cut

sub _search_fields {
    my ($self, $params) = @_;
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };
    my $cache = Koha::Caches->get_instance();

    # NOTE(review): the key distinguishes OPAC from staff client only, while
    # the query below also filters on $self->index and marcflavour. Confirm
    # whether different indexes can end up sharing a cache entry. The key is
    # left as is here since other code may clear the cache by this name.
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client');
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );

        # Each entry is [name] or [name, weight]; the weight is left out
        # entirely when it is not set.
        my @search_fields;
        while (my $search_field = $result->next) {
            push @search_fields, [
                $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }
    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used)
                my ($field, $weight) = @{$_};

                # Keep weightless entries one element long, otherwise the
                # join below would produce a dangling "field.subfield^".
                [ "${field}.${subfield}", defined $weight ? $weight : () ];
            } @{$search_fields}
        ];
    }
    if ($params->{weighted_fields}) {
        return [map { join('^', @{$_}) } @{$search_fields}];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}
1146
1147 1;