Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
   33     my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51 use Koha::Caches;
  52
  53 =head2 build_query
  54
  55     my $simple_query = $builder->build_query("hello", %options)
  56
  57 This will build a query that can be issued to elasticsearch from the provided
  58 string input. This expects a lucene style search form (see
  59 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  60 for details.)
  61
  62 It'll make an attempt to respect the various query options.
  63
  64 Additional options can be provided with the C<%options> hash.
  65
  66 =over 4
  67
  68 =item sort
  69
  70 This should be an arrayref of hashrefs, each containing a C<field> and an
  71 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  72 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  73
  74 =back
  75
  76 =cut
  77
  78 sub build_query {
  79     my ( $self, $query, %options ) = @_;
  80
  81     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
  82     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
  83     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
  84
  85     $query = '*' unless defined $query;
  86
  87     my $res;
  88     my $fields = $self->_search_fields({
  89         is_opac => $options{is_opac},
  90         weighted_fields => $options{weighted_fields},
  91     });
  92     if ($options{whole_record}) {
  93         push @$fields, 'marc_data_array.*';
  94     }
  95     $res->{query} = {
  96         query_string => {
  97             query            => $query,
  98             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
  99             default_operator => 'AND',
 100             fields           => $fields,
 101             lenient          => JSON::true,
 102             analyze_wildcard => JSON::true,
 103         }
 104     };
 105     $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
 106
 107     if ( $options{sort} ) {
 108         foreach my $sort ( @{ $options{sort} } ) {
 109             my ( $f, $d ) = @$sort{qw/ field direction /};
 110             die "Invalid sort direction, $d"
 111               if $d && ( $d ne 'asc' && $d ne 'desc' );
 112             $d = 'asc' unless $d;
 113
 114             $f = $self->_sort_field($f);
 115             push @{ $res->{sort} }, { $f => { order => $d } };
 116         }
 117     }
 118
 119     # See _convert_facets in Search.pm for how these get turned into
 120     # things that Koha can use.
 121     my $size = C4::Context->preference('FacetMaxCount');
 122     $res->{aggregations} = {
 123         author         => { terms => { field => "author__facet" , size => $size } },
 124         subject        => { terms => { field => "subject__facet", size => $size } },
 125         itype          => { terms => { field => "itype__facet", size => $size} },
 126         location       => { terms => { field => "location__facet", size => $size } },
 127         'su-geo'       => { terms => { field => "su-geo__facet", size => $size} },
 128         'title-series' => { terms => { field => "title-series__facet", size => $size } },
 129         ccode          => { terms => { field => "ccode__facet", size => $size } },
 130         ln             => { terms => { field => "ln__facet", size => $size } },
 131     };
 132
 133     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 134     if (   $display_library_facets eq 'both'
 135         or $display_library_facets eq 'home' ) {
 136         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
 137     }
 138     if (   $display_library_facets eq 'both'
 139         or $display_library_facets eq 'holding' ) {
 140         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
 141     }
 142     return $res;
 143 }
 144
 145 =head2 build_query_compat
 146
 147     my (
 148         $error,             $query, $simple_query, $query_cgi,
 149         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 150         $stopwords_removed, $query_type
 151       )
 152       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 153         \@limits, \@sort_by, $scan, $lang, $params );
 154
 155 This handles a search using the same api as L<C4::Search::buildQuery> does.
 156
 157 A very simple query will go in with C<$operands> set to ['query'], and
 158 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 159 C<$query> set to something that can perform the search, C<$simple_query>
 160 set to just the search term, C<$query_cgi> set to something that can
 161 reproduce this search, and C<$query_desc> set to something else.
 162
 163 =cut
 164
 165 sub build_query_compat {
 166     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 167         $lang, $params )
 168       = @_;
 169
 170     my $query;
 171     my $query_str = '';
 172     my $search_param_query_str = '';
 173     my $limits = ();
 174     if ( $scan ) {
 175         ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
 176         $search_param_query_str = $query_str;
 177     } else {
 178         my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 179         my @index_params = $self->_convert_index_fields(@$indexes);
 180         $limits       = $self->_fix_limit_special_cases($orig_limits);
 181         if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
 182         # Merge the indexes in with the search terms and the operands so that
 183         # each search thing is a handy unit.
 184         unshift @$operators, undef;    # The first one can't have an op
 185         my @search_params;
 186         my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 187         my $ea = each_array( @$operands, @$operators, @index_params );
 188         while ( my ( $oand, $otor, $index ) = $ea->() ) {
 189             next if ( !defined($oand) || $oand eq '' );
 190             $oand = $self->_clean_search_term($oand);
 191             $oand = $self->_truncate_terms($oand) if ($truncate);
 192             push @search_params, {
 193                 operand => $oand,      # the search terms
 194                 operator => defined($otor) ? uc $otor : undef,    # AND and so on
 195                 $index ? %$index : (),
 196             };
 197         }
 198
 199         # We build a string query from limits and the queries. An alternative
 200         # would be to pass them separately into build_query and let it build
 201         # them into a structured ES query itself. Maybe later, though that'd be
 202         # more robust.
 203         $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
 204         $query_str = join( ' AND ',
 205             $search_param_query_str || (),
 206             $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 207
 208         # If there's no query on the left, let's remove the junk left behind
 209         $query_str =~ s/^ AND //;
 210         my %options;
 211         $options{sort} = \@sort_params;
 212         $options{is_opac} = $params->{is_opac};
 213         $options{weighted_fields} = $params->{weighted_fields};
 214         $options{whole_record} = $params->{whole_record};
 215         $query = $self->build_query( $query_str, %options );
 216     }
 217
 218     # We roughly emulate the CGI parameters of the zebra query builder
 219     my $query_cgi = '';
 220     shift @$operators; # Shift out the one we unshifted before
 221     my $ea = each_array( @$operands, @$operators, @$indexes );
 222     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 223         $query_cgi .= '&' if $query_cgi;
 224         $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
 225         $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
 226     }
 227     $query_cgi .= '&scan=1' if ( $scan );
 228
 229     my $simple_query;
 230     $simple_query = $operands->[0] if @$operands == 1;
 231     my $query_desc;
 232     if ( $simple_query ) {
 233         $query_desc = $simple_query;
 234     } else {
 235         $query_desc = $search_param_query_str;
 236     }
 237     my $limit     = $self->_join_queries( $self->_convert_index_strings(@$limits));
 238     my $limit_cgi = ( $orig_limits and @$orig_limits )
 239       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 240       : '';
 241     my $limit_desc;
 242     $limit_desc = "$limit" if $limit;
 243
 244     return (
 245         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 246         $limit, $limit_cgi, $limit_desc,   undef,      undef
 247     );
 248 }
 249
 250 =head2 build_authorities_query
 251
 252     my $query = $builder->build_authorities_query(\%search);
 253
 254 This takes a nice description of an authority search and turns it into a black-box
 255 query that can then be passed to the appropriate searcher.
 256
 257 The search description is a hashref that looks something like:
 258
 259     {
 260         searches => [
 261             {
 262                 where    => 'Heading',    # search the main entry
 263                 operator => 'exact',        # require an exact match
 264                 value    => 'frogs',        # the search string
 265             },
 266             {
 267                 where    => '',             # search all entries
 268                 operator => '',             # default keyword, right truncation
 269                 value    => 'pond',
 270             },
 271         ],
 272         sort => {
 273             field => 'Heading',
 274             order => 'desc',
 275         },
 276         authtypecode => 'TOPIC_TERM',
 277     }
 278
 279 =cut
 280
 281 sub build_authorities_query {
 282     my ( $self, $search ) = @_;
 283
 284     # Start by making the query parts
 285     my @query_parts;
 286
 287     foreach my $s ( @{ $search->{searches} } ) {
 288         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 289         if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
 290             if ($wh) {
 291                 # Match the whole field, case insensitive, UTF normalized.
 292                 push @query_parts, { term => { "$wh.ci_raw" => $val } };
 293             }
 294             else {
 295                 # Match the whole field for all searchable fields, case insensitive,
 296                 # UTF normalized.
 297                 # Given that field data is "The quick brown fox"
 298                 # "The quick brown fox" and "the quick brown fox" will match
 299                 # but not "quick brown fox".
 300                 push @query_parts, {
 301                     multi_match => {
 302                         query => $val,
 303                         fields => $self->_search_fields({ subfield => 'ci_raw' }),
 304                     }
 305                 };
 306             }
 307         }
 308         elsif ( defined $op && $op eq 'start') {
 309             # Match the prefix within a field for all searchable fields.
 310             # Given that field data is "The quick brown fox"
 311             # "The quick bro" will match, but not "quick bro"
 312
 313             # Does not seems to be a multi prefix query
 314             # so we need to create one
 315             if ($wh) {
 316                 # Match prefix of the field.
 317                 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
 318             }
 319             else {
 320                 my @prefix_queries;
 321                 foreach my $field (@{$self->_search_fields()}) {
 322                     push @prefix_queries, {
 323                         prefix => { "$field.ci_raw" => $val }
 324                     };
 325                 }
 326                 push @query_parts, {
 327                     'bool' => {
 328                         'should' => \@prefix_queries,
 329                         'minimum_should_match' => 1
 330                     }
 331                 };
 332             }
 333         }
 334         else {
 335             # Query all searchable fields.
 336             # Given that field data is "The quick brown fox"
 337             # a search containing any of the words will match, regardless
 338             # of order.
 339
 340             my @tokens = $self->_split_query( $val );
 341             foreach my $token ( @tokens ) {
 342                 $token = $self->_truncate_terms(
 343                     $self->_clean_search_term( $token )
 344                 );
 345             }
 346             my $query = $self->_join_queries( @tokens );
 347
 348             if ($wh) {
 349                 push @query_parts, { query_string => {
 350                     default_field => $wh,
 351                     analyze_wildcard => JSON::true,
 352                     query => $query
 353                 } };
 354             }
 355             else {
 356                 push @query_parts, {
 357                     query_string => {
 358                         analyze_wildcard => JSON::true,
 359                         query => $query,
 360                         fields => $self->_search_fields(),
 361                     }
 362                 };
 363             }
 364         }
 365     }
 366
 367     # Merge the query parts appropriately
 368     # 'should' behaves like 'or'
 369     # 'must' behaves like 'and'
 370     # Zebra behaviour seem to match must so using that here
 371     my $elastic_query = {};
 372     $elastic_query->{bool}->{must} = \@query_parts;
 373
 374     # Filter by authtypecode if set
 375     if ($search->{authtypecode}) {
 376         $elastic_query->{bool}->{filter} = {
 377             term => {
 378                 "authtype.raw" => $search->{authtypecode}
 379             }
 380         };
 381     }
 382
 383     my $query = {
 384         query => $elastic_query
 385     };
 386
 387     # Add the sort stuff
 388     $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
 389
 390     return $query;
 391 }
 392
 393 =head2 build_authorities_query_compat
 394
 395     my ($query) =
 396       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 397         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 398
 399 This builds a query for searching for authorities, in the style of
 400 L<C4::AuthoritiesMarc::SearchAuthorities>.
 401
 402 Arguments:
 403
 404 =over 4
 405
 406 =item marclist
 407
 408 An arrayref containing where the particular term should be searched for.
 409 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 410 thesaurus. If left blank, any field is used.
 411
 412 =item and_or
 413
 414 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 415
 416 =item excluding
 417
 418 Also ignored.
 419
 420 =item operator
 421
 422 What form of search to do. Options are: is (phrase, no truncation, whole field
 423 must match), = (number exact match), exact (phrase, no truncation, whole field
 424 must match). If left blank, then word list, right truncated, anywhere is used.
 425
 426 =item value
 427
 428 The actual user-provided string value to search for.
 429
 430 =item authtypecode
 431
 432 The authority type code to search within. If blank, then all will be searched.
 433
 434 =item orderby
 435
 436 The order to sort the results by. Options are Relevance, HeadingAsc,
 437 HeadingDsc, AuthidAsc, AuthidDsc.
 438
 439 =back
 440
 441 marclist, operator, and value must be the same length, and the values at
 442 index /i/ all relate to each other.
 443
 444 This returns a query, which is a black box object that can be passed to the
 445 appropriate search object.
 446
 447 =cut
 448
 449 our $koha_to_index_name = {
 450     mainmainentry   => 'heading-main',
 451     mainentry       => 'heading',
 452     match           => 'match',
 453     'match-heading' => 'match-heading',
 454     'see-from'      => 'match-heading-see-from',
 455     thesaurus       => 'subject-heading-thesaurus',
 456     any             => '',
 457     all             => ''
 458 };
 459
 460 sub build_authorities_query_compat {
 461     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 462         $authtypecode, $orderby )
 463       = @_;
 464
 465     # This turns the old-style many-options argument form into a more
 466     # extensible hash form that is understood by L<build_authorities_query>.
 467     my @searches;
 468     my $mappings = $self->get_elasticsearch_mappings();
 469
 470     # Convert to lower case
 471     $marclist = [map(lc, @{$marclist})];
 472     $orderby  = lc $orderby;
 473
 474     my @indexes;
 475     # Make sure everything exists
 476     foreach my $m (@$marclist) {
 477
 478         $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
 479         push @indexes, $m;
 480         warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
 481     }
 482     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 483         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 484         push @searches,
 485           {
 486             where    => $indexes[$i],
 487             operator => $operator->[$i],
 488             value    => $value->[$i],
 489           };
 490     }
 491
 492     my %sort;
 493     my $sort_field =
 494         ( $orderby =~ /^heading/ ) ? 'heading__sort'
 495       : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
 496       :                              undef;
 497     if ($sort_field) {
 498         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 499         %sort = ( $sort_field => $sort_order, );
 500     }
 501     my %search = (
 502         searches     => \@searches,
 503         authtypecode => $authtypecode,
 504     );
 505     $search{sort} = \%sort if %sort;
 506     my $query = $self->build_authorities_query( \%search );
 507     return $query;
 508 }
 509
 510 =head2 _build_scan_query
 511
 512     my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
 513
 514 This will build an aggregation scan query that can be issued to elasticsearch from
 515 the provided string input.
 516
 517 =cut
 518
 519 our %scan_field_convert = (
 520     'ti' => 'title',
 521     'au' => 'author',
 522     'su' => 'subject',
 523     'se' => 'title-series',
 524     'pb' => 'publisher',
 525 );
 526
 527 sub _build_scan_query {
 528     my ( $self, $operands, $indexes ) = @_;
 529
 530     my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
 531     my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
 532
 533     my ( $f, $d ) = split( /,/, $index);
 534     $index = $scan_field_convert{$f} || $f;
 535
 536     my $res;
 537     $res->{query} = {
 538         query_string => {
 539             query => '*'
 540         }
 541     };
 542     $res->{aggregations} = {
 543         $index => {
 544             terms => {
 545                 field => $index . '__facet',
 546                 order => { '_term' => 'asc' },
 547                 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
 548             }
 549         }
 550     };
 551     return ($res, $term);
 552 }
 553
 554 =head2 _create_regex_filter
 555
 556     my $filter = $builder->_create_regex_filter('term')
 557
 558 This will create a regex filter that can be used with an aggregation query.
 559
 560 =cut
 561
 562 sub _create_regex_filter {
 563     my ($self, $term) = @_;
 564
 565     my $result = '';
 566     foreach my $c (split(//, quotemeta($term))) {
 567         my $lc = lc($c);
 568         my $uc = uc($c);
 569         $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
 570     }
 571     return $result;
 572 }
 573
 574 =head2 _convert_sort_fields
 575
 576     my @sort_params = _convert_sort_fields(@sort_by)
 577
 578 Converts the zebra-style sort index information into elasticsearch-style.
 579
 580 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 581 something that can be sent to L<build_query>.
 582
 583 =cut
 584
 585 sub _convert_sort_fields {
 586     my ( $self, @sort_by ) = @_;
 587
 588     # Turn the sorting into something we care about.
 589     my %sort_field_convert = (
 590         acqdate     => 'date-of-acquisition',
 591         author      => 'author',
 592         call_number => 'local-classification',
 593         popularity  => 'issues',
 594         relevance   => undef,       # default
 595         title       => 'title',
 596         pubdate     => 'date-of-publication',
 597     );
 598     my %sort_order_convert =
 599       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 600
 601     # Convert the fields and orders, drop anything we don't know about.
 602     grep { $_->{field} } map {
 603         my ( $f, $d ) = /(.+)_(.+)/;
 604         {
 605             field     => $sort_field_convert{$f},
 606             direction => $sort_order_convert{$d}
 607         }
 608     } @sort_by;
 609 }
 610
 611 =head2 _convert_index_fields
 612
 613     my @index_params = $self->_convert_index_fields(@indexes);
 614
 615 Converts zebra-style search index notation into elasticsearch-style.
 616
 617 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 618 and it returns something that can be sent to L<build_query>.
 619
 620 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 621 types.
 622
 623 =cut
 624
 625 our %index_field_convert = (
 626     'kw' => '',
 627     'ab' => 'abstract',
 628     'au' => 'author',
 629     'lcn' => 'local-classification',
 630     'callnum' => 'local-classification',
 631     'record-type' => 'rtype',
 632     'mc-rtype' => 'rtype',
 633     'mus' => 'rtype',
 634     'lc-card' => 'lc-card-number',
 635     'sn' => 'local-number',
 636     'biblionumber' => 'local-number',
 637     'yr' => 'date-of-publication',
 638     'pubdate' => 'date-of-publication',
 639     'acqdate' => 'date-of-acquisition',
 640     'date/time-last-modified' => 'date-time-last-modified',
 641     'dtlm' => 'date-time-last-modified',
 642     'diss' => 'dissertation-information',
 643     'nb' => 'isbn',
 644     'ns' => 'issn',
 645     'music-number' => 'identifier-publisher-for-music',
 646     'number-music-publisher' => 'identifier-publisher-for-music',
 647     'music' => 'identifier-publisher-for-music',
 648     'ident' => 'identifier-standard',
 649     'cpn' => 'corporate-name',
 650     'cfn' => 'conference-name',
 651     'pn' => 'personal-name',
 652     'pb' => 'publisher',
 653     'pv' => 'provider',
 654     'nt' => 'note',
 655     'notes' => 'note',
 656     'rcn' => 'record-control-number',
 657     'su' => 'subject',
 658     'su-to' => 'subject',
 659     #'su-geo' => 'subject',
 660     'su-ut' => 'subject',
 661     'ti' => 'title',
 662     'se' => 'title-series',
 663     'ut' => 'title-uniform',
 664     'an' => 'koha-auth-number',
 665     'authority-number' => 'koha-auth-number',
 666     'at' => 'authtype',
 667     'he' => 'heading',
 668     'rank' => 'relevance',
 669     'phr' => 'st-phrase',
 670     'wrdl' => 'st-word-list',
 671     'rt' => 'right-truncation',
 672     'rtrn' => 'right-truncation',
 673     'ltrn' => 'left-truncation',
 674     'rltrn' => 'left-and-right',
 675     'mc-itemtype' => 'itemtype',
 676     'mc-ccode' => 'ccode',
 677     'branch' => 'homebranch',
 678     'mc-loc' => 'location',
 679     'loc' => 'location',
 680     'stocknumber' => 'number-local-acquisition',
 681     'inv' => 'number-local-acquisition',
 682     'bc' => 'barcode',
 683     'mc-itype' => 'itype',
 684     'aub' => 'author-personal-bibliography',
 685     'auo' => 'author-in-order',
 686     'ff8-22' => 'ta',
 687     'aud' => 'ta',
 688     'audience' => 'ta',
 689     'frequency-code' => 'ff8-18',
 690     'illustration-code' => 'ff8-18-21',
 691     'regularity-code' => 'ff8-19',
 692     'type-of-serial' => 'ff8-21',
 693     'format' => 'ff8-23',
 694     'conference-code' => 'ff8-29',
 695     'festschrift-indicator' => 'ff8-30',
 696     'index-indicator' => 'ff8-31',
 697     'fiction' => 'lf',
 698     'fic' => 'lf',
 699     'literature-code' => 'lf',
 700     'biography' => 'bio',
 701     'ff8-34' => 'bio',
 702     'biography-code' => 'bio',
 703     'l-format' => 'ff7-01-02',
 704     'lex' => 'lexile-number',
 705     'hi' => 'host-item-number',
 706     'itu' => 'index-term-uncontrolled',
 707     'itg' => 'index-term-genre',
 708 );
 709 my $field_name_pattern = '[\w\-]+';
 710 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 711
 712 sub _convert_index_fields {
 713     my ( $self, @indexes ) = @_;
 714
 715     my %index_type_convert =
 716       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
 717
 718     # Convert according to our table, drop anything that doesn't convert.
 719     # If a field starts with mc- we save it as it's used (and removed) later
 720     # when joining things, to indicate we make it an 'OR' join.
 721     # (Sorry, this got a bit ugly after special cases were found.)
 722     map {
 723         # Lower case all field names
 724         my ( $f, $t ) = map(lc, split /,/);
 725         my $mc = '';
 726         if ($f =~ /^mc-/) {
 727             $mc = 'mc-';
 728             $f =~ s/^mc-//;
 729         }
 730         my $r = {
 731             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 732             type  => $index_type_convert{ $t // '__default' }
 733         };
 734         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 735         $r->{field} ? $r : undef;
 736     } @indexes;
 737 }
 738
 739 =head2 _convert_index_strings
 740
 741     my @searches = $self->_convert_index_strings(@searches);
 742
 743 Similar to L<_convert_index_fields>, this takes strings of the form
 744 B<field:search term> and rewrites the field from zebra-style to
 745 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 746
 747 =cut
 748
 749 sub _convert_index_strings {
 750     my ( $self, @searches ) = @_;
 751     my @res;
 752     foreach my $s (@searches) {
 753         next if $s eq '';
 754         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 755         unless ( defined($field) && defined($term) ) {
 756             push @res, $s;
 757             next;
 758         }
 759         my ($conv) = $self->_convert_index_fields($field);
 760         unless ( defined($conv) ) {
 761             push @res, $s;
 762             next;
 763         }
 764         push @res, ($conv->{field} ? $conv->{field} . ':' : '')
 765             . $self->_modify_string_by_type( %$conv, operand => $term );
 766     }
 767     return @res;
 768 }
 769
 770 =head2 _convert_index_strings_freeform
 771
 772     my $search = $self->_convert_index_strings_freeform($search);
 773
 774 This is similar to L<_convert_index_strings>, however it'll search out the
 775 things to change within the string. So it can handle strings such as
 776 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 777
 778 If there is something of the form "su,complete-subfield" or something, the
 779 second part is stripped off as we can't yet handle that. Making it work
 780 will have to wait for a real query parser.
 781
 782 =cut
 783
 784 sub _convert_index_strings_freeform {
 785     my ( $self, $search ) = @_;
 786     # @TODO: Currenty will alter also fields contained within quotes:
 787     # `searching for "stuff cn:123"` for example will become
 788     # `searching for "stuff local-number:123"
 789     #
 790     # Fixing this is tricky, one possibility:
 791     # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
 792     # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
 793     #
 794     # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
 795     # them back when processing is done.
 796
 797     # Lower case field names
 798     $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
 799     # Resolve possible field aliases
 800     $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
 801     return $search;
 802 }
 803
 804 =head2 _modify_string_by_type
 805
 806     my $str = $self->_modify_string_by_type(%index_field);
 807
 808 If you have a search term (operand) and a type (phrase, right-truncated), this
 809 will convert the string to have the function in lucene search terms, e.g.
 810 wrapping quotes around it.
 811
 812 =cut
 813
 814 sub _modify_string_by_type {
 815     my ( $self, %idx ) = @_;
 816
 817     my $type = $idx{type} || '';
 818     my $str = $idx{operand};
 819     return $str unless $str;    # Empty or undef, we can't use it.
 820
 821     $str .= '*' if $type eq 'right-truncate';
 822     $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
 823     if ($type eq 'st-year') {
 824         if ($str =~ /^(.*)-(.*)$/) {
 825             my $from = $1 || '*';
 826             my $until = $2 || '*';
 827             $str = "[$from TO $until]";
 828         }
 829     }
 830     return $str;
 831 }
 832
 833 =head2 _join_queries
 834
 835     my $query_str = $self->_join_queries(@query_parts);
 836
 837 This takes a list of query parts, that might be search terms on their own, or
 838 booleaned together, or specifying fields, or whatever, wraps them in
 839 parentheses, and ANDs them all together. Suitable for feeding to the ES
 840 query string query.
 841
 842 Note: doesn't AND them together if they specify an index that starts with "mc"
 843 as that was a special case in the original code for dealing with multiple
 844 choice options (you can't search for something that has an itype of A and
 845 and itype of B otherwise.)
 846
 847 =cut
 848
 849 sub _join_queries {
 850     my ( $self, @parts ) = @_;
 851
 852     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 853     my @mc_parts =
 854       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 855     return () unless @norm_parts + @mc_parts;
 856     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 857
 858     # Group limits by field, so they can be OR'ed together
 859     my %mc_limits;
 860     foreach my $mc_part (@mc_parts) {
 861         my ($field, $value) = split /:/, $mc_part, 2;
 862         $mc_limits{$field} //= [];
 863         push @{ $mc_limits{$field} }, $value;
 864     }
 865
 866     @mc_parts = map {
 867         sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
 868     } sort keys %mc_limits;
 869
 870     @norm_parts = map { "($_)" } @norm_parts;
 871
 872     return join( ' AND ', @norm_parts, @mc_parts);
 873 }
 874
 875 =head2 _make_phrases
 876
 877     my @phrased_queries = $self->_make_phrases(@query_parts);
 878
 879 This takes the supplied queries and forces them to be phrases by wrapping
 880 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
 881 the quotes outside of them if they're there.
 882
 883 =cut
 884
 885 sub _make_phrases {
 886     my ( $self, @parts ) = @_;
 887     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 888 }
 889
 890 =head2 _create_query_string
 891
 892     my @query_strings = $self->_create_query_string(@queries);
 893
 894 Given a list of hashrefs, it will turn them into a lucene-style query string.
 895 The hash should contain field, type (both for the indexes), operator, and
 896 operand.
 897
 898 =cut
 899
 900 sub _create_query_string {
 901     my ( $self, @queries ) = @_;
 902
 903     map {
 904         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 905         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 906
 907         my $oand = $self->_modify_string_by_type(%$_);
 908         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
 909         "$otor($field$oand)";
 910     } @queries;
 911 }
 912
 913 =head2 _clean_search_term
 914
 915     my $term = $self->_clean_search_term($term);
 916
 917 This cleans a search term by removing any funny characters that may upset
 918 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 919 to ensure those parts are correct.
 920
 921 =cut
 922
 923 sub _clean_search_term {
 924     my ( $self, $term ) = @_;
 925
 926     # Lookahead for checking if we are inside quotes
 927     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 928
 929     # Some hardcoded searches (like with authorities) produce things like
 930     # 'an=123', when it ought to be 'an:123' for our purposes.
 931     $term =~ s/=/:/g;
 932
 933     $term = $self->_convert_index_strings_freeform($term);
 934     $term =~ s/[{}]/"/g;
 935
 936     # Remove unbalanced quotes
 937     my $unquoted = $term;
 938     my $count = ($unquoted =~ tr/"/ /);
 939     if ($count % 2 == 1) {
 940         $term = $unquoted;
 941     }
 942
 943     # Remove unquoted colons that have whitespace on either side of them
 944     $term =~ s/(:+)(\s+)$lookahead/$2/g;
 945     $term =~ s/(\s+)(:+)$lookahead/$1/g;
 946
 947     $term = $self->_query_regex_escape_process($term);
 948
 949     return $term;
 950 }
 951
 952 =head2 _query_regex_escape_process
 953
 954     my $query = $self->_query_regex_escape_process($query);
 955
 956 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
 957
 958 =cut
 959
 960 sub _query_regex_escape_process {
 961     my ($self, $query) = @_;
 962     my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
 963     if ($regex_escape_options ne 'dont_escape') {
 964         if ($regex_escape_options eq 'escape') {
 965             # Will escape unescaped slashes (/) while preserving
 966             # unescaped slashes within quotes
 967             # @TODO: assumes quotes are always balanced and will
 968             # not handle escaped qoutes properly, should perhaps be
 969             # replaced with a more general parser solution
 970             # so that this function is ever only provided with unqouted
 971             # query parts
 972             $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
 973         }
 974         elsif($regex_escape_options eq 'unescape_escaped') {
 975             # Will unescape escaped slashes (\/) and escape
 976             # unescaped slashes (/) while preserving slashes within quotes
 977             # The same limitatations as above apply for handling of quotes
 978             $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
 979         }
 980     }
 981     return $query;
 982 }
 983
 984 =head2 _fix_limit_special_cases
 985
 986     my $limits = $self->_fix_limit_special_cases($limits);
 987
 988 This converts any special cases that the limit specifications have into things
 989 that are more readily processable by the rest of the code.
 990
 991 The argument should be an arrayref, and it'll return an arrayref.
 992
 993 =cut
 994
 995 sub _fix_limit_special_cases {
 996     my ( $self, $limits ) = @_;
 997
 998     my @new_lim;
 999     foreach my $l (@$limits) {
1000
1001         # This is set up by opac-search.pl
1002         if ( $l =~ /^yr,st-numeric,ge=/ ) {
1003             my ( $start, $end ) =
1004               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
1005             next unless defined($start) && defined($end);
1006             push @new_lim, "copydate:[$start TO $end]";
1007         }
1008         elsif ( $l =~ /^yr,st-numeric=/ ) {
1009             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1010             next unless defined($date);
1011             $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1012             push @new_lim, "copydate:$date";
1013         }
1014         elsif ( $l =~ /^available$/ ) {
1015             push @new_lim, 'onloan:false';
1016         }
1017         else {
1018             my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1019             $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1020             if ( defined($field) && defined($term) ) {
1021                 push @new_lim, "$field:(\"$term\")";
1022             }
1023             else {
1024                 push @new_lim, $l;
1025             }
1026         }
1027     }
1028     return \@new_lim;
1029 }
1030
=head2 _sort_field

    my $field = $self->_sort_field($field);

Given a field name, this works out what the actual name of the field to sort
on should be.

A C<__sort> suffix is added if the field has a sort version, which is assumed
when C<sort_fields> has no entry for the field or a true one. Otherwise
C<.raw> is appended if the Elasticsearch mappings declare the field as type
C<text>, to avoid sorting on a tokenized value. All other fields are returned
unchanged.

=cut

sub _sort_field {
    my ( $self, $f ) = @_;

    my $mappings = $self->get_elasticsearch_mappings();

    # Walk down the mappings one level at a time. Looking up
    # $mappings->{data}{properties}{$f}{type} directly would autovivify an
    # empty entry for $f in the mappings structure handed to us.
    my $data       = ref($mappings) eq 'HASH'   ? $mappings->{data}       : undef;
    my $properties = ref($data) eq 'HASH'       ? $data->{properties}     : undef;
    my $definition = ref($properties) eq 'HASH' ? $properties->{$f}       : undef;
    my $is_text =
         ref($definition) eq 'HASH'
      && defined $definition->{type}
      && $definition->{type} eq 'text';

    my $has_sort_version = $self->sort_fields()->{$f};
    if ( !defined $has_sort_version || $has_sort_version ) {
        $f .= '__sort';
    }
    elsif ($is_text) {

        # We need to add '.raw' to text fields without a sort field,
        # otherwise it'll sort based on the tokenised form.
        $f .= '.raw';
    }
    return $f;
}
1056
=head2 _truncate_terms

    my $query = $self->_truncate_terms($query);

Given a string query this function appends the '*' wildcard to its terms. The
query is tokenized with L<_split_query> and the tokens are joined with single
spaces again. A token is left as it is when it ends in a non-word character
(like the closing quote of a double quoted string) or when it is one of the
boolean operators C<and>, C<or> and C<not> in any case.

=cut

sub _truncate_terms {
    my ( $self, $query ) = @_;

    my %is_operator = map { $_ => 1 } qw/and or not/;

    my @terms;
    for my $token ( $self->_split_query($query) ) {

        # Skip empty tokens
        next if $token =~ /^\s*$/;

        if ( $token =~ /\W$/ || $is_operator{ lc($token) } ) {
            push @terms, $token;
        }
        else {
            push @terms, "$token*";
        }
    }

    return join ' ', @terms;
}
1082
=head2 _split_query

    my @token = $self->_split_query($query_str);

Given a string query this function splits it into tokens. Double quoted
strings, optionally preceded by a field prefix, are kept together as one
token, everything else is split on whitespace. Empty and whitespace-only
tokens are not returned.

=cut

my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # The split pattern captures what it matches, so the quoted strings and
    # the whitespace it splits on are part of the split result, next to the
    # (possibly empty) pieces in between. Only the pieces containing a
    # non-whitespace character are kept, e.g. for
    #   "donald duck" title:"the mouse" and peter
    # these are '"donald duck"', 'title:"the mouse"', 'and' and 'peter'
    # (provided that 'title' is matched by the field name pattern).
    return grep { /\S/ } split $tokenize_split_re, $query;
}
1106
=head2 _search_fields

    my $weighted_fields = $self->_search_fields({
        is_opac => 0,
        weighted_fields => 1,
        subfield => 'raw'
    });

Generate a list of searchable fields to be used for Elasticsearch queries
applied to multiple fields.

Returns an arrayref of field names for either OPAC or staff interface, with
possible weights and subfield appended to each field name depending on the
options provided.

The field names and weights are read from the search field configuration of
the current index and marc flavour, and are cached per interface and index.

=over 4

=item C<$params>

Hashref with options. The parameter C<is_opac> indicates whether the searchable
fields for OPAC or staff interface should be retrieved. If C<weighted_fields> is set
fields weights will be applied on returned fields as "C<field_name>^C<weight>";
fields without a weight are returned as the plain field name. C<subfield> can
be used to provide a subfield that will be appended to fields as
"C<field_name>.C<subfield>". If no hashref is passed, none of the options is set.

=back

=cut

sub _search_fields {
    my ($self, $params) = @_;
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };
    my $cache = Koha::Caches->get_instance();
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );

        # Each entry is [ name ] or, if the field has a weight, [ name, weight ]
        my @search_fields;
        while (my $search_field = $result->next) {
            push @search_fields, [
                lc $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }
    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used)
                my ($field, $weight) = @{$_};

                # Carry the weight over only if there is one, an undefined
                # weight would otherwise end up as a dangling "^" below
                ["${field}.${subfield}", defined $weight ? $weight : ()];
            } @{$search_fields}
        ];
    }
    if ($params->{weighted_fields}) {
        return [map { join('^', grep { defined } @{$_}) } @{$search_fields}];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}
1202
1203 1;