Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
  33     $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51 use Koha::Caches;
  52
  53 our %index_field_convert = (
  54     'kw' => '',
  55     'ab' => 'abstract',
  56     'au' => 'author',
  57     'lcn' => 'local-classification',
  58     'callnum' => 'local-classification',
  59     'record-type' => 'rtype',
  60     'mc-rtype' => 'rtype',
  61     'mus' => 'rtype',
  62     'lc-card' => 'lc-card-number',
  63     'sn' => 'local-number',
  64     'biblionumber' => 'local-number',
  65     'yr' => 'date-of-publication',
  66     'pubdate' => 'date-of-publication',
  67     'acqdate' => 'date-of-acquisition',
  68     'date/time-last-modified' => 'date-time-last-modified',
  69     'dtlm' => 'date-time-last-modified',
  70     'diss' => 'dissertation-information',
  71     'nb' => 'isbn',
  72     'ns' => 'issn',
  73     'music-number' => 'identifier-publisher-for-music',
  74     'number-music-publisher' => 'identifier-publisher-for-music',
  75     'music' => 'identifier-publisher-for-music',
  76     'ident' => 'identifier-standard',
  77     'cpn' => 'corporate-name',
  78     'cfn' => 'conference-name',
  79     'pn' => 'personal-name',
  80     'pb' => 'publisher',
  81     'pv' => 'provider',
  82     'nt' => 'note',
  83     'notes' => 'note',
  84     'rcn' => 'record-control-number',
  85     'su' => 'subject',
  86     'su-to' => 'subject',
  87     #'su-geo' => 'subject',
  88     'su-ut' => 'subject',
  89     'ti' => 'title',
  90     'se' => 'title-series',
  91     'ut' => 'title-uniform',
  92     'an' => 'koha-auth-number',
  93     'authority-number' => 'koha-auth-number',
  94     'at' => 'authtype',
  95     'he' => 'heading',
  96     'rank' => 'relevance',
  97     'phr' => 'st-phrase',
  98     'wrdl' => 'st-word-list',
  99     'rt' => 'right-truncation',
 100     'rtrn' => 'right-truncation',
 101     'ltrn' => 'left-truncation',
 102     'rltrn' => 'left-and-right',
 103     'mc-itemtype' => 'itemtype',
 104     'mc-ccode' => 'ccode',
 105     'branch' => 'homebranch',
 106     'mc-loc' => 'location',
 107     'loc' => 'location',
 108     'stocknumber' => 'number-local-acquisition',
 109     'inv' => 'number-local-acquisition',
 110     'bc' => 'barcode',
 111     'mc-itype' => 'itype',
 112     'aub' => 'author-personal-bibliography',
 113     'auo' => 'author-in-order',
 114     'ff8-22' => 'ta',
 115     'aud' => 'ta',
 116     'audience' => 'ta',
 117     'frequency-code' => 'ff8-18',
 118     'illustration-code' => 'ff8-18-21',
 119     'regularity-code' => 'ff8-19',
 120     'type-of-serial' => 'ff8-21',
 121     'format' => 'ff8-23',
 122     'conference-code' => 'ff8-29',
 123     'festschrift-indicator' => 'ff8-30',
 124     'index-indicator' => 'ff8-31',
 125     'fiction' => 'lf',
 126     'fic' => 'lf',
 127     'literature-code' => 'lf',
 128     'biography' => 'bio',
 129     'ff8-34' => 'bio',
 130     'biography-code' => 'bio',
 131     'l-format' => 'ff7-01-02',
 132     'lex' => 'lexile-number',
 133     'hi' => 'host-item-number',
 134     'itu' => 'index-term-uncontrolled',
 135     'itg' => 'index-term-genre',
 136 );
 137 my $field_name_pattern = '[\w\-]+';
 138 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 139
 140 =head2 get_index_field_convert
 141
 142     my $index_field_convert = Koha::SearchEngine::Elasticsearch::QueryBuilder->get_index_field_convert();
 143
 144 Returns a reference to the table used to convert zebra-style search index
 145 names (the keys, e.g. C<au>) into elasticsearch-style field names (the
 146 values, e.g. C<author>). The table itself is returned, not a copy, so any
 147 change made through the reference is seen by every later conversion.
 148
 149 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 150 types.
 151
 152 =cut
 153
 154 sub get_index_field_convert() {
 155     return \%index_field_convert;
 156 }
 157
 158 =head2 build_query
 159
 160     my $simple_query = $builder->build_query("hello", %options)
 161
 162 This will build a query that can be issued to elasticsearch from the provided
 163 string input. This expects a lucene style search form (see
 164 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
 165 for details.)
 166
 167 It'll make an attempt to respect the various query options.
 168
 169 Additional options can be provided with the C<%options> hash.
 170
 171 =over 4
 172
 173 =item sort
 174
 175 This should be an arrayref of hashrefs, each containing a C<field> and a
 176 C<direction> (optional, defaults to C<asc>.) The results will be sorted
 177 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
 178
 179 =back
 180
 181 =cut
 182
 183 sub build_query {
 184     my ( $self, $query, %options ) = @_;
 185
 186     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
 187     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
 188     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
 189
 190     $query = '*' unless defined $query;
 191
 192     my $res;
 193     my $fields = $self->_search_fields({
 194         is_opac => $options{is_opac},
 195         weighted_fields => $options{weighted_fields},
 196     });
 197     if ($options{whole_record}) {
 198         push @$fields, 'marc_data_array.*';
 199     }
 200     $res->{query} = {
 201         query_string => {
 202             query            => $query,
 203             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
 204             default_operator => 'AND',
 205             fields           => $fields,
 206             lenient          => JSON::true,
 207             analyze_wildcard => JSON::true,
 208         }
 209     };
 210     $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
 211
 212     if ( $options{sort} ) {
 213         foreach my $sort ( @{ $options{sort} } ) {
 214             my ( $f, $d ) = @$sort{qw/ field direction /};
 215             die "Invalid sort direction, $d"
 216               if $d && ( $d ne 'asc' && $d ne 'desc' );
 217             $d = 'asc' unless $d;
 218
 219             $f = $self->_sort_field($f);
 220             push @{ $res->{sort} }, { $f => { order => $d } };
 221         }
 222     }
 223
 224     # See _convert_facets in Search.pm for how these get turned into
 225     # things that Koha can use.
 226     my $size = C4::Context->preference('FacetMaxCount');
 227     $res->{aggregations} = {
 228         author         => { terms => { field => "author__facet" , size => $size } },
 229         subject        => { terms => { field => "subject__facet", size => $size } },
 230         itype          => { terms => { field => "itype__facet", size => $size} },
 231         location       => { terms => { field => "location__facet", size => $size } },
 232         'su-geo'       => { terms => { field => "su-geo__facet", size => $size} },
 233         'title-series' => { terms => { field => "title-series__facet", size => $size } },
 234         ccode          => { terms => { field => "ccode__facet", size => $size } },
 235         ln             => { terms => { field => "ln__facet", size => $size } },
 236     };
 237
 238     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 239     if (   $display_library_facets eq 'both'
 240         or $display_library_facets eq 'home' ) {
 241         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
 242     }
 243     if (   $display_library_facets eq 'both'
 244         or $display_library_facets eq 'holding' ) {
 245         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
 246     }
 247     return $res;
 248 }
 249
 250 =head2 build_query_compat
 251
 252     my (
 253         $error,             $query, $simple_query, $query_cgi,
 254         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 255         $stopwords_removed, $query_type
 256       )
 257       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 258         \@limits, \@sort_by, $scan, $lang, $params );
 259
 260 This handles a search using the same api as L<C4::Search::buildQuery> does.
 261
 262 A very simple query will go in with C<$operands> set to ['query'], and
 263 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 264 C<$query> set to something that can perform the search, C<$simple_query>
 265 set to just the search term, C<$query_cgi> set to something that can
 266 reproduce this search, and C<$query_desc> set to something else.
 267
 268 =cut
 269
 270 sub build_query_compat {
 271     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 272         $lang, $params )
 273       = @_;
 274
 275     my $query;
 276     my $query_str = '';
 277     my $search_param_query_str = '';
 278     my $limits = ();
 279     if ( $scan ) {
 280         ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
 281         $search_param_query_str = $query_str;
 282     } else {
 283         my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 284         my @index_params = $self->_convert_index_fields(@$indexes);
 285         $limits       = $self->_fix_limit_special_cases($orig_limits);
 286         if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
 287         # Merge the indexes in with the search terms and the operands so that
 288         # each search thing is a handy unit.
 289         unshift @$operators, undef;    # The first one can't have an op
 290         my @search_params;
 291         my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 292         my $ea = each_array( @$operands, @$operators, @index_params );
 293         while ( my ( $oand, $otor, $index ) = $ea->() ) {
 294             next if ( !defined($oand) || $oand eq '' );
 295             $oand = $self->_clean_search_term($oand);
 296             $oand = $self->_truncate_terms($oand) if ($truncate);
 297             push @search_params, {
 298                 operand => $oand,      # the search terms
 299                 operator => defined($otor) ? uc $otor : undef,    # AND and so on
 300                 $index ? %$index : (),
 301             };
 302         }
 303
 304         # We build a string query from limits and the queries. An alternative
 305         # would be to pass them separately into build_query and let it build
 306         # them into a structured ES query itself. Maybe later, though that'd be
 307         # more robust.
 308         $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
 309         $query_str = join( ' AND ',
 310             $search_param_query_str || (),
 311             $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 312
 313         # If there's no query on the left, let's remove the junk left behind
 314         $query_str =~ s/^ AND //;
 315         my %options;
 316         $options{sort} = \@sort_params;
 317         $options{is_opac} = $params->{is_opac};
 318         $options{weighted_fields} = $params->{weighted_fields};
 319         $options{whole_record} = $params->{whole_record};
 320         $query = $self->build_query( $query_str, %options );
 321     }
 322
 323     # We roughly emulate the CGI parameters of the zebra query builder
 324     my $query_cgi = '';
 325     shift @$operators; # Shift out the one we unshifted before
 326     my $ea = each_array( @$operands, @$operators, @$indexes );
 327     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 328         $query_cgi .= '&' if $query_cgi;
 329         $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
 330         $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
 331     }
 332     $query_cgi .= '&scan=1' if ( $scan );
 333
 334     my $simple_query;
 335     $simple_query = $operands->[0] if @$operands == 1;
 336     my $query_desc;
 337     if ( $simple_query ) {
 338         $query_desc = $simple_query;
 339     } else {
 340         $query_desc = $search_param_query_str;
 341     }
 342     my $limit     = $self->_join_queries( $self->_convert_index_strings(@$limits));
 343     my $limit_cgi = ( $orig_limits and @$orig_limits )
 344       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 345       : '';
 346     my $limit_desc;
 347     $limit_desc = "$limit" if $limit;
 348
 349     return (
 350         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 351         $limit, $limit_cgi, $limit_desc,   undef,      undef
 352     );
 353 }
 354
 355 =head2 build_authorities_query
 356
 357     my $query = $builder->build_authorities_query(\%search);
 358
 359 This takes a nice description of an authority search and turns it into a black-box
 360 query that can then be passed to the appropriate searcher.
 361
 362 The search description is a hashref that looks something like:
 363
 364     {
 365         searches => [
 366             {
 367                 where    => 'Heading',    # search the main entry
 368                 operator => 'exact',        # require an exact match
 369                 value    => 'frogs',        # the search string
 370             },
 371             {
 372                 where    => '',             # search all entries
 373                 operator => '',             # default keyword, right truncation
 374                 value    => 'pond',
 375             },
 376         ],
 377         sort => {
 378             field => 'Heading',
 379             order => 'desc',
 380         },
 381         authtypecode => 'TOPIC_TERM',
 382     }
 383
 384 =cut
 385
 386 sub build_authorities_query {
 387     my ( $self, $search ) = @_;
 388
 389     # Start by making the query parts
 390     my @query_parts;
 391
 392     foreach my $s ( @{ $search->{searches} } ) {
 393         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 394         if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
 395             if ($wh) {
 396                 # Match the whole field, case insensitive, UTF normalized.
 397                 push @query_parts, { term => { "$wh.ci_raw" => $val } };
 398             }
 399             else {
 400                 # Match the whole field for all searchable fields, case insensitive,
 401                 # UTF normalized.
 402                 # Given that field data is "The quick brown fox"
 403                 # "The quick brown fox" and "the quick brown fox" will match
 404                 # but not "quick brown fox".
 405                 push @query_parts, {
 406                     multi_match => {
 407                         query => $val,
 408                         fields => $self->_search_fields({ subfield => 'ci_raw' }),
 409                     }
 410                 };
 411             }
 412         }
 413         elsif ( defined $op && $op eq 'start') {
 414             # Match the prefix within a field for all searchable fields.
 415             # Given that field data is "The quick brown fox"
 416             # "The quick bro" will match, but not "quick bro"
 417
 418             # Does not seems to be a multi prefix query
 419             # so we need to create one
 420             if ($wh) {
 421                 # Match prefix of the field.
 422                 push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
 423             }
 424             else {
 425                 my @prefix_queries;
 426                 foreach my $field (@{$self->_search_fields()}) {
 427                     push @prefix_queries, {
 428                         prefix => { "$field.ci_raw" => $val }
 429                     };
 430                 }
 431                 push @query_parts, {
 432                     'bool' => {
 433                         'should' => \@prefix_queries,
 434                         'minimum_should_match' => 1
 435                     }
 436                 };
 437             }
 438         }
 439         else {
 440             # Query all searchable fields.
 441             # Given that field data is "The quick brown fox"
 442             # a search containing any of the words will match, regardless
 443             # of order.
 444
 445             my @tokens = $self->_split_query( $val );
 446             foreach my $token ( @tokens ) {
 447                 $token = $self->_truncate_terms(
 448                     $self->_clean_search_term( $token )
 449                 );
 450             }
 451             my $query = $self->_join_queries( @tokens );
 452
 453             if ($wh) {
 454                 push @query_parts, { query_string => {
 455                     default_field => $wh,
 456                     analyze_wildcard => JSON::true,
 457                     query => $query
 458                 } };
 459             }
 460             else {
 461                 push @query_parts, {
 462                     query_string => {
 463                         analyze_wildcard => JSON::true,
 464                         query => $query,
 465                         fields => $self->_search_fields(),
 466                     }
 467                 };
 468             }
 469         }
 470     }
 471
 472     # Merge the query parts appropriately
 473     # 'should' behaves like 'or'
 474     # 'must' behaves like 'and'
 475     # Zebra behaviour seem to match must so using that here
 476     my $elastic_query = {};
 477     $elastic_query->{bool}->{must} = \@query_parts;
 478
 479     # Filter by authtypecode if set
 480     if ($search->{authtypecode}) {
 481         $elastic_query->{bool}->{filter} = {
 482             term => {
 483                 "authtype.raw" => $search->{authtypecode}
 484             }
 485         };
 486     }
 487
 488     my $query = {
 489         query => $elastic_query
 490     };
 491
 492     # Add the sort stuff
 493     $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
 494
 495     return $query;
 496 }
 497
 498 =head2 build_authorities_query_compat
 499
 500     my ($query) =
 501       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 502         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 503
 504 This builds a query for searching for authorities, in the style of
 505 L<C4::AuthoritiesMarc::SearchAuthorities>.
 506
 507 Arguments:
 508
 509 =over 4
 510
 511 =item marclist
 512
 513 An arrayref containing where the particular term should be searched for.
 514 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 515 thesaurus. If left blank, any field is used.
 516
 517 =item and_or
 518
 519 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 520
 521 =item excluding
 522
 523 Also ignored.
 524
 525 =item operator
 526
 527 What form of search to do. Options are: is (phrase, no truncation, whole field
 528 must match), = (number exact match), exact (phrase, no truncation, whole field
 529 must match). If left blank, then word list, right truncated, anywhere is used.
 530
 531 =item value
 532
 533 The actual user-provided string value to search for.
 534
 535 =item authtypecode
 536
 537 The authority type code to search within. If blank, then all will be searched.
 538
 539 =item orderby
 540
 541 The order to sort the results by. Options are Relevance, HeadingAsc,
 542 HeadingDsc, AuthidAsc, AuthidDsc.
 543
 544 =back
 545
 546 marclist, operator, and value must be the same length, and the values at
 547 index /i/ all relate to each other.
 548
 549 This returns a query, which is a black box object that can be passed to the
 550 appropriate search object.
 551
 552 =cut
 553
 554 our $koha_to_index_name = {
 555     mainmainentry   => 'heading-main',
 556     mainentry       => 'heading',
 557     match           => 'match',
 558     'match-heading' => 'match-heading',
 559     'see-from'      => 'match-heading-see-from',
 560     thesaurus       => 'subject-heading-thesaurus',
 561     any             => '',
 562     all             => ''
 563 };
 564
 565 sub build_authorities_query_compat {
 566     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 567         $authtypecode, $orderby )
 568       = @_;
 569
 570     # This turns the old-style many-options argument form into a more
 571     # extensible hash form that is understood by L<build_authorities_query>.
 572     my @searches;
 573     my $mappings = $self->get_elasticsearch_mappings();
 574
 575     # Convert to lower case
 576     $marclist = [map(lc, @{$marclist})];
 577     $orderby  = lc $orderby;
 578
 579     my @indexes;
 580     # Make sure everything exists
 581     foreach my $m (@$marclist) {
 582
 583         $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
 584         push @indexes, $m;
 585         warn "Unknown search field $m in marclist" unless (defined $mappings->{data}->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
 586     }
 587     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 588         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 589         push @searches,
 590           {
 591             where    => $indexes[$i],
 592             operator => $operator->[$i],
 593             value    => $value->[$i],
 594           };
 595     }
 596
 597     my %sort;
 598     my $sort_field =
 599         ( $orderby =~ /^heading/ ) ? 'heading__sort'
 600       : ( $orderby =~ /^auth/ )    ? 'local-number__sort'
 601       :                              undef;
 602     if ($sort_field) {
 603         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 604         %sort = ( $sort_field => $sort_order, );
 605     }
 606     my %search = (
 607         searches     => \@searches,
 608         authtypecode => $authtypecode,
 609     );
 610     $search{sort} = \%sort if %sort;
 611     my $query = $self->build_authorities_query( \%search );
 612     return $query;
 613 }
 614
 615 =head2 _build_scan_query
 616
 617     my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
 618
 619 This will build an aggregation scan query that can be issued to elasticsearch from
 620 the provided string input.
 621
 622 =cut
 623
 624 our %scan_field_convert = (
 625     'ti' => 'title',
 626     'au' => 'author',
 627     'su' => 'subject',
 628     'se' => 'title-series',
 629     'pb' => 'publisher',
 630 );
 631
 632 sub _build_scan_query {
 633     my ( $self, $operands, $indexes ) = @_;
 634
 635     my $term = scalar( @$operands ) == 0 ? '' : $operands->[0];
 636     my $index = scalar( @$indexes ) == 0 ? 'subject' : $indexes->[0];
 637
 638     my ( $f, $d ) = split( /,/, $index);
 639     $index = $scan_field_convert{$f} || $f;
 640
 641     my $res;
 642     $res->{query} = {
 643         query_string => {
 644             query => '*'
 645         }
 646     };
 647     $res->{aggregations} = {
 648         $index => {
 649             terms => {
 650                 field => $index . '__facet',
 651                 order => { '_term' => 'asc' },
 652                 include => $self->_create_regex_filter($self->_clean_search_term($term)) . '.*'
 653             }
 654         }
 655     };
 656     return ($res, $term);
 657 }
 658
 659 =head2 _create_regex_filter
 660
 661     my $filter = $builder->_create_regex_filter('term')
 662
 663 This will create a regex filter that can be used with an aggregation query.
 664
 665 =cut
 666
 667 sub _create_regex_filter {
 668     my ($self, $term) = @_;
 669
 670     my $result = '';
 671     foreach my $c (split(//, quotemeta($term))) {
 672         my $lc = lc($c);
 673         my $uc = uc($c);
 674         $result .= $lc ne $uc ? '[' . $lc . $uc . ']' : $c;
 675     }
 676     return $result;
 677 }
 678
 679 =head2 _convert_sort_fields
 680
 681     my @sort_params = _convert_sort_fields(@sort_by)
 682
 683 Converts the zebra-style sort index information into elasticsearch-style.
 684
 685 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 686 something that can be sent to L<build_query>.
 687
 688 =cut
 689
 690 sub _convert_sort_fields {
 691     my ( $self, @sort_by ) = @_;
 692
 693     # Turn the sorting into something we care about.
 694     my %sort_field_convert = (
 695         acqdate     => 'date-of-acquisition',
 696         author      => 'author',
 697         call_number => 'cn-sort',
 698         popularity  => 'issues',
 699         relevance   => undef,       # default
 700         title       => 'title',
 701         pubdate     => 'date-of-publication',
 702     );
 703     my %sort_order_convert =
 704       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 705
 706     # Convert the fields and orders, drop anything we don't know about.
 707     grep { $_->{field} } map {
 708         my ( $f, $d ) = /(.+)_(.+)/;
 709         {
 710             field     => $sort_field_convert{$f},
 711             direction => $sort_order_convert{$d}
 712         }
 713     } @sort_by;
 714 }
 715
 716 sub _convert_index_fields {
 717     my ( $self, @indexes ) = @_;
 718
 719     my %index_type_convert =
 720       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
 721
 722     # Convert according to our table, drop anything that doesn't convert.
 723     # If a field starts with mc- we save it as it's used (and removed) later
 724     # when joining things, to indicate we make it an 'OR' join.
 725     # (Sorry, this got a bit ugly after special cases were found.)
 726     map {
 727         # Lower case all field names
 728         my ( $f, $t ) = map(lc, split /,/);
 729         my $mc = '';
 730         if ($f =~ /^mc-/) {
 731             $mc = 'mc-';
 732             $f =~ s/^mc-//;
 733         }
 734         my $r = {
 735             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 736             type  => $index_type_convert{ $t // '__default' }
 737         };
 738         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 739         $r->{field} || $r->{type} ? $r : undef;
 740     } @indexes;
 741 }
 742
 743 =head2 _convert_index_strings
 744
 745     my @searches = $self->_convert_index_strings(@searches);
 746
 747 Similar to L<_convert_index_fields>, this takes strings of the form
 748 B<field:search term> and rewrites the field from zebra-style to
 749 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 750
 751 =cut
 752
 753 sub _convert_index_strings {
 754     my ( $self, @searches ) = @_;
 755     my @res;
 756     foreach my $s (@searches) {
 757         next if $s eq '';
 758         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 759         unless ( defined($field) && defined($term) ) {
 760             push @res, $s;
 761             next;
 762         }
 763         my ($conv) = $self->_convert_index_fields($field);
 764         unless ( defined($conv) ) {
 765             push @res, $s;
 766             next;
 767         }
 768         push @res, ($conv->{field} ? $conv->{field} . ':' : '')
 769             . $self->_modify_string_by_type( %$conv, operand => $term );
 770     }
 771     return @res;
 772 }
 773
 774 =head2 _convert_index_strings_freeform
 775
 776     my $search = $self->_convert_index_strings_freeform($search);
 777
 778 This is similar to L<_convert_index_strings>, however it'll search out the
 779 things to change within the string. So it can handle strings such as
 780 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 781
 782 If there is something of the form "su,complete-subfield" or something, the
 783 second part is stripped off as we can't yet handle that. Making it work
 784 will have to wait for a real query parser.
 785
 786 =cut
 787
# Rewrite every zebra-style "field:" prefix found inside a free-form query
# string and return the rewritten string.
sub _convert_index_strings_freeform {
    my ( $self, $search ) = @_;
    # @TODO: Currently this will also alter fields contained within quotes:
    # `searching for "stuff sn:123"` for example will become
    # `searching for "stuff local-number:123"`
    #
    # Fixing this is tricky, one possibility:
    # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
    # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
    #
    # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
    # them back when processing is done.

    # Lower case field names. An optional ",qualifier" directly after the
    # name is matched outside the captures and therefore dropped, while any
    # ".subfield" suffixes are kept as written.
    $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
    # Resolve possible field aliases. Names without an entry in
    # %index_field_convert are left unchanged.
    $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
    return $search;
}
 807
 808 =head2 _modify_string_by_type
 809
 810     my $str = $self->_modify_string_by_type(%index_field);
 811
 812 If you have a search term (operand) and a type (phrase, right-truncated), this
 813 will convert the string to have the function in lucene search terms, e.g.
 814 wrapping quotes around it.
 815
 816 =cut
 817
 818 sub _modify_string_by_type {
 819     my ( $self, %idx ) = @_;
 820
 821     my $type = $idx{type} || '';
 822     my $str = $idx{operand};
 823     return $str unless $str;    # Empty or undef, we can't use it.
 824
 825     $str .= '*' if $type eq 'right-truncate';
 826     $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
 827     if ($type eq 'st-year') {
 828         if ($str =~ /^(.*)-(.*)$/) {
 829             my $from = $1 || '*';
 830             my $until = $2 || '*';
 831             $str = "[$from TO $until]";
 832         }
 833     }
 834     return $str;
 835 }
 836
 837 =head2 _join_queries
 838
 839     my $query_str = $self->_join_queries(@query_parts);
 840
 841 This takes a list of query parts, that might be search terms on their own, or
 842 booleaned together, or specifying fields, or whatever, wraps them in
 843 parentheses, and ANDs them all together. Suitable for feeding to the ES
 844 query string query.
 845
 846 Note: doesn't AND them together if they specify an index that starts with "mc"
 847 as that was a special case in the original code for dealing with multiple
 848 choice options (you can't search for something that has an itype of A and
 849 and itype of B otherwise.)
 850
 851 =cut
 852
 853 sub _join_queries {
 854     my ( $self, @parts ) = @_;
 855
 856     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 857     my @mc_parts =
 858       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 859     return () unless @norm_parts + @mc_parts;
 860     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 861
 862     # Group limits by field, so they can be OR'ed together
 863     my %mc_limits;
 864     foreach my $mc_part (@mc_parts) {
 865         my ($field, $value) = split /:/, $mc_part, 2;
 866         $mc_limits{$field} //= [];
 867         push @{ $mc_limits{$field} }, $value;
 868     }
 869
 870     @mc_parts = map {
 871         sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
 872     } sort keys %mc_limits;
 873
 874     @norm_parts = map { "($_)" } @norm_parts;
 875
 876     return join( ' AND ', @norm_parts, @mc_parts);
 877 }
 878
 879 =head2 _make_phrases
 880
 881     my @phrased_queries = $self->_make_phrases(@query_parts);
 882
 883 This takes the supplied queries and forces them to be phrases by wrapping
 884 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
 885 the quotes outside of them if they're there.
 886
 887 =cut
 888
 889 sub _make_phrases {
 890     my ( $self, @parts ) = @_;
 891     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 892 }
 893
 894 =head2 _create_query_string
 895
 896     my @query_strings = $self->_create_query_string(@queries);
 897
 898 Given a list of hashrefs, it will turn them into a lucene-style query string.
 899 The hash should contain field, type (both for the indexes), operator, and
 900 operand.
 901
 902 =cut
 903
 904 sub _create_query_string {
 905     my ( $self, @queries ) = @_;
 906
 907     map {
 908         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 909         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 910
 911         my $oand = $self->_modify_string_by_type(%$_);
 912         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
 913         "$otor($field$oand)";
 914     } @queries;
 915 }
 916
 917 =head2 _clean_search_term
 918
 919     my $term = $self->_clean_search_term($term);
 920
 921 This cleans a search term by removing any funny characters that may upset
 922 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 923 to ensure those parts are correct.
 924
 925 =cut
 926
 927 sub _clean_search_term {
 928     my ( $self, $term ) = @_;
 929
 930     # Lookahead for checking if we are inside quotes
 931     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 932
 933     # Some hardcoded searches (like with authorities) produce things like
 934     # 'an=123', when it ought to be 'an:123' for our purposes.
 935     $term =~ s/=/:/g;
 936
 937     $term = $self->_convert_index_strings_freeform($term);
 938     $term =~ s/[{}]/"/g;
 939
 940     # Remove unbalanced quotes
 941     my $unquoted = $term;
 942     my $count = ($unquoted =~ tr/"/ /);
 943     if ($count % 2 == 1) {
 944         $term = $unquoted;
 945     }
 946
 947     # Remove unquoted colons that have whitespace on either side of them
 948     $term =~ s/(:+)(\s+)$lookahead/$2/g;
 949     $term =~ s/(\s+)(:+)$lookahead/$1/g;
 950     $term =~ s/^://;
 951
 952     $term = $self->_query_regex_escape_process($term);
 953
 954     return $term;
 955 }
 956
 957 =head2 _query_regex_escape_process
 958
 959     my $query = $self->_query_regex_escape_process($query);
 960
 961 Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
 962
 963 =cut
 964
 965 sub _query_regex_escape_process {
 966     my ($self, $query) = @_;
 967     my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions");
 968     if ($regex_escape_options ne 'dont_escape') {
 969         if ($regex_escape_options eq 'escape') {
 970             # Will escape unescaped slashes (/) while preserving
 971             # unescaped slashes within quotes
 972             # @TODO: assumes quotes are always balanced and will
 973             # not handle escaped qoutes properly, should perhaps be
 974             # replaced with a more general parser solution
 975             # so that this function is ever only provided with unqouted
 976             # query parts
 977             $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
 978         }
 979         elsif($regex_escape_options eq 'unescape_escaped') {
 980             # Will unescape escaped slashes (\/) and escape
 981             # unescaped slashes (/) while preserving slashes within quotes
 982             # The same limitatations as above apply for handling of quotes
 983             $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
 984         }
 985     }
 986     return $query;
 987 }
 988
 989 =head2 _fix_limit_special_cases
 990
 991     my $limits = $self->_fix_limit_special_cases($limits);
 992
 993 This converts any special cases that the limit specifications have into things
 994 that are more readily processable by the rest of the code.
 995
 996 The argument should be an arrayref, and it'll return an arrayref.
 997
 998 =cut
 999
1000 sub _fix_limit_special_cases {
1001     my ( $self, $limits ) = @_;
1002
1003     my @new_lim;
1004     foreach my $l (@$limits) {
1005
1006         # This is set up by opac-search.pl
1007         if ( $l =~ /^yr,st-numeric,ge=/ ) {
1008             my ( $start, $end ) =
1009               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
1010             next unless defined($start) && defined($end);
1011             push @new_lim, "copydate:[$start TO $end]";
1012         }
1013         elsif ( $l =~ /^yr,st-numeric=/ ) {
1014             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
1015             next unless defined($date);
1016             $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
1017             push @new_lim, "copydate:$date";
1018         }
1019         elsif ( $l =~ /^available$/ ) {
1020             push @new_lim, 'onloan:false';
1021         }
1022         else {
1023             my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
1024             $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
1025             if ( defined($field) && defined($term) ) {
1026                 push @new_lim, "$field:(\"$term\")";
1027             }
1028             else {
1029                 push @new_lim, $l;
1030             }
1031         }
1032     }
1033     return \@new_lim;
1034 }
1035
=head2 _sort_field

    my $field = $self->_sort_field($field);

Given a field name, this works out what the actual name of the field to sort
on should be. A '__sort' suffix is added when the sort fields either have no
entry for the field or a true one. Otherwise '.raw' is appended if the field
is mapped with the type 'text', to avoid sorting on a tokenized value, and
the name is returned unchanged for all other fields.

The mappings are only read; looking up a field that has no mapping does not
add an entry for it.

=cut

sub _sort_field {
    my ( $self, $f ) = @_;

    # Fetch the mappings before consulting sort_fields(), as the original
    # code does. NOTE(review): presumably fetching the mappings is what fills
    # in sort_fields - confirm before reordering.
    my $mappings = $self->get_elasticsearch_mappings();

    my $has_sort_version = $self->sort_fields()->{$f};
    return $f . '__sort' if !defined $has_sort_version || $has_sort_version;

    # Walk the structure one level at a time: a chained lookup like
    # ...{properties}{$f}{type} would autovivify an empty mapping for
    # unknown fields in the shared mappings structure.
    my $data          = ref($mappings) eq 'HASH'      ? $mappings->{data}  : undef;
    my $properties    = ref($data) eq 'HASH'          ? $data->{properties} : undef;
    my $field_mapping = ref($properties) eq 'HASH'    ? $properties->{$f}  : undef;
    my $type          = ref($field_mapping) eq 'HASH' ? $field_mapping->{type} : undef;

    # We need to add '.raw' to text fields without a sort field,
    # otherwise it'll sort based on the tokenised form.
    $f .= '.raw' if defined $type && $type eq 'text';

    return $f;
}
1061
=head2 _truncate_terms

    my $query = $self->_truncate_terms($query);

Given a string query this function appends the '*' wildcard to all terms
except the operators (and, or, not - in any letter case) and terms that end
in a non-word character, such as double quoted strings. The query is split
into terms with L<_split_query> and the processed terms are joined with
single spaces.

=cut

sub _truncate_terms {
    my ( $self, $query ) = @_;

    my %is_operator = map { $_ => 1 } qw/and or not/;

    my @terms;
    for my $token ( $self->_split_query($query) ) {

        # Skip empty tokens
        next if $token =~ /^\s*$/;

        # Only terms that end in a word character and are not an operator
        # get the wildcard.
        if ( $token =~ /\W$/ || $is_operator{ lc($token) } ) {
            push @terms, $token;
        }
        else {
            push @terms, "$token*";
        }
    }

    return join ' ', @terms;
}
1087
=head2 _split_query

    my @token = $self->_split_query($query_str);

Given a string query this function splits it to tokens taking into account
any field prefixes and quoted strings: a double quoted string, together with
a field prefix directly in front of it, is kept as one token, and everything
else is split on whitespace. Tokens that contain nothing but whitespace are
not returned.

=cut

my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;

sub _split_query {
    my ( $self, $query ) = @_;

    # The separator pattern captures, so split also hands back what it
    # matched. '"donald duck" title:"the mouse" and peter' gets split into
    # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'peter']
    my @pieces = split $tokenize_split_re, $query;

    # Keep only the pieces that are more than whitespace
    my @tokens;
    for my $piece (@pieces) {
        push @tokens, $piece if $piece =~ /\S/;
    }

    return @tokens;
}
1111
=head2 _search_fields

    my $weighted_fields = $self->_search_fields({
        is_opac => 0,
        weighted_fields => 1,
        subfield => 'raw'
    });

Generate a list of searchable fields to be used for Elasticsearch queries
applied to multiple fields.

Returns an arrayref of field names for either OPAC or staff interface, with
possible weights and subfield appended to each field name depending on the
options provided. With C<weighted_fields> set a field that has a weight is
returned as "C<field_name>^C<weight>"; a field without one is returned as the
plain field name.

The field list is looked up in the cache first, under a key made up of the
interface and the index, and is only read from the database (and then stored
in the cache) when it is not found there.

=over 4

=item C<$params>

Hashref with options. The parameter C<is_opac> indicates whether the searchable
fields for OPAC or staff interface should be retrieved. If C<weighted_fields> is set
fields weights will be applied on returned fields. C<subfield> can be used to
provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".

=back

=cut

sub _search_fields {
    my ($self, $params) = @_;
    $params //= {
        is_opac => 0,
        weighted_fields => 0,
        whole_record => 0,
        # This is a hack for authorities build_authorities_query
        # can hopefully be removed in the future
        subfield => undef,
    };
    my $cache = Koha::Caches->get_instance();
    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
    if (!$search_fields) {
        # The reason we don't use Koha::SearchFields->search here is we don't
        # want or need resultset wrapped as Koha::SearchField object.
        # It does not make any sense in this context and would cause
        # unnecessary overhead since we are only querying for data
        # Also would not work, or produce strange results, with the "columns"
        # option.
        my $schema = Koha::Database->schema;
        my $result = $schema->resultset('SearchField')->search(
            {
                $params->{is_opac} ? (
                    'opac' => 1,
                ) : (
                    'staff_client' => 1
                ),
                'type' => { '!=' => 'boolean' },
                'search_marc_map.index_name' => $self->index,
                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
                'search_marc_to_fields.search' => 1,
            },
            {
                columns => [qw/name weight/],
                collapse => 1,
                join => {search_marc_to_fields => 'search_marc_map'},
            }
        );
        my @search_fields;
        while (my $search_field = $result->next) {
            # Each entry is [name] or [name, weight]; the weight is left
            # out altogether when the field has none.
            push @search_fields, [
                lc $search_field->name,
                $search_field->weight ? $search_field->weight : ()
            ];
        }
        $search_fields = \@search_fields;
        $cache->set_in_cache($cache_key, $search_fields);
    }
    if ($params->{subfield}) {
        my $subfield = $params->{subfield};
        $search_fields = [
            map {
                # Copy values to avoid mutating cached
                # data (since unsafe is used)
                my ($field, $weight) = @{$_};
                # Keep unweighted entries one element long, an undefined
                # weight must not end up in the copy.
                defined $weight
                    ? ["${field}.${subfield}", $weight]
                    : ["${field}.${subfield}"];
            } @{$search_fields}
        ];
    }
    if ($params->{weighted_fields}) {
        return [
            map {
                my ($field, $weight) = @{$_};
                # No weight, no '^': "field^" is not what we want to send
                defined $weight ? "$field^$weight" : $field;
            } @{$search_fields}
        ];
    }
    else {
        # Exclude weight from field
        return [map { $_->[0] } @{$search_fields}];
    }
}
1207
1208 1;