Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
  33     my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51
  52 =head2 build_query
  53
  54     my $simple_query = $builder->build_query("hello", %options)
  55
  56 This will build a query that can be issued to elasticsearch from the provided
  57 string input. This expects a lucene style search form (see
  58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  59 for details.)
  60
  61 It'll make an attempt to respect the various query options.
  62
  63 Additional options can be provided with the C<%options> hash.
  64
  65 =over 4
  66
  67 =item sort
  68
  69 This should be an arrayref of hashrefs, each containing a C<field> and an
  70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  72
  73 =back
  74
  75 =cut
  76
  77 sub build_query {
  78     my ( $self, $query, %options ) = @_;
  79
  80     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
  81     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
  82     my $weight_fields    = C4::Context->preference("QueryWeightFields")    || 0;
  83     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
  84
  85     $query = '*' unless defined $query;
  86
  87     my $res;
  88     $res->{query} = {
  89         query_string => {
  90             query            => $query,
  91             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
  92             default_operator => 'AND',
  93             default_field    => '_all',
  94             lenient          => JSON::true,
  95             analyze_wildcard => JSON::true,
  96             fields           => $options{fields} || [],
  97         }
  98     };
  99
 100     if ( $options{sort} ) {
 101         foreach my $sort ( @{ $options{sort} } ) {
 102             my ( $f, $d ) = @$sort{qw/ field direction /};
 103             die "Invalid sort direction, $d"
 104               if $d && ( $d ne 'asc' && $d ne 'desc' );
 105             $d = 'asc' unless $d;
 106
 107             $f = $self->_sort_field($f);
 108             push @{ $res->{sort} }, { $f => { order => $d } };
 109         }
 110     }
 111
 112     # See _convert_facets in Search.pm for how these get turned into
 113     # things that Koha can use.
 114     my $size = C4::Context->preference('FacetMaxCount');
 115     $res->{aggregations} = {
 116         author         => { terms => { field => "author__facet" , size => $size } },
 117         subject        => { terms => { field => "subject__facet", size => $size } },
 118         itype          => { terms => { field => "itype__facet", size => $size} },
 119         location       => { terms => { field => "location__facet", size => $size } },
 120         'su-geo'       => { terms => { field => "su-geo__facet", size => $size} },
 121         'title-series' => { terms => { field => "title-series__facet", size => $size } },
 122         ccode          => { terms => { field => "ccode__facet", size => $size } },
 123         ln             => { terms => { field => "ln__facet", size => $size } },
 124     };
 125
 126     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 127     if (   $display_library_facets eq 'both'
 128         or $display_library_facets eq 'home' ) {
 129         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
 130     }
 131     if (   $display_library_facets eq 'both'
 132         or $display_library_facets eq 'holding' ) {
 133         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
 134     }
 135     return $res;
 136 }
 137
 138 =head2 build_browse_query
 139
 140     my $browse_query = $builder->build_browse_query($field, $query);
 141
 142 This performs a "starts with" style query on a particular field. The field
 143 to be searched must have been indexed with an appropriate mapping as a
 144 "phrase" subfield, which pretty much everything has.
 145
 146 =cut
 147
 148 # XXX this isn't really a browse query like we want in the end
 149 sub build_browse_query {
 150     my ( $self, $field, $query ) = @_;
 151
 152     my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
 153
 154     return { query => '*' } if !defined $query;
 155
 156     # TODO this should come from Koha::SearchEngine::Elasticsearch
 157     my %field_whitelist = (
 158         title  => 1,
 159         author => 1,
 160     );
 161     $field = 'title' if !exists $field_whitelist{$field};
 162     my $sort = $self->_sort_field($field);
 163     my $res = {
 164         query => {
 165             match_phrase_prefix => {
 166                 "$field.phrase" => {
 167                     query     => $query,
 168                     operator  => 'or',
 169                     fuzziness => $fuzzy_enabled ? 'auto' : '0',
 170                 }
 171             }
 172         },
 173         sort => [ { $sort => { order => "asc" } } ],
 174     };
 175 }
 176
 177 =head2 build_query_compat
 178
 179     my (
 180         $error,             $query, $simple_query, $query_cgi,
 181         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 182         $stopwords_removed, $query_type
 183       )
 184       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 185         \@limits, \@sort_by, $scan, $lang );
 186
 187 This handles a search using the same api as L<C4::Search::buildQuery> does.
 188
 189 A very simple query will go in with C<$operands> set to ['query'], and
 190 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 191 C<$query> set to something that can perform the search, C<$simple_query>
 192 set to just the search term, C<$query_cgi> set to something that can
 193 reproduce this search, and C<$query_desc> set to something else.
 194
 195 =cut
 196
 197 sub build_query_compat {
 198     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 199         $lang, $params )
 200       = @_;
 201
 202 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
 203     my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 204     my @index_params = $self->_convert_index_fields(@$indexes);
 205     my $limits       = $self->_fix_limit_special_cases($orig_limits);
 206     if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
 207
 208     # Merge the indexes in with the search terms and the operands so that
 209     # each search thing is a handy unit.
 210     unshift @$operators, undef;    # The first one can't have an op
 211     my @search_params;
 212     my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 213     my $ea = each_array( @$operands, @$operators, @index_params );
 214     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 215         next if ( !defined($oand) || $oand eq '' );
 216         $oand = $self->_clean_search_term($oand);
 217         $oand = $self->_truncate_terms($oand) if ($truncate);
 218         push @search_params, {
 219             operand => $oand,      # the search terms
 220             operator => defined($otor) ? uc $otor : undef,    # AND and so on
 221             $index ? %$index : (),
 222         };
 223     }
 224
 225     # We build a string query from limits and the queries. An alternative
 226     # would be to pass them separately into build_query and let it build
 227     # them into a structured ES query itself. Maybe later, though that'd be
 228     # more robust.
 229     my $search_param_query_str = join( ' ', $self->_create_query_string(@search_params) );
 230     my $query_str = join( ' AND ',
 231         $search_param_query_str || (),
 232         $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 233
 234     my @fields = '_all';
 235     if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
 236         push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
 237     }
 238
 239     # If there's no query on the left, let's remove the junk left behind
 240     $query_str =~ s/^ AND //;
 241     my %options;
 242     $options{fields} = \@fields;
 243     $options{sort} = \@sort_params;
 244     my $query = $self->build_query( $query_str, %options );
 245
 246     # We roughly emulate the CGI parameters of the zebra query builder
 247     my $query_cgi = '';
 248     shift @$operators; # Shift out the one we unshifted before
 249     $ea = each_array( @$operands, @$operators, @$indexes );
 250     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 251         $query_cgi .= '&' if $query_cgi;
 252         $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
 253         $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
 254     }
 255     $query_cgi .= '&scan=1' if ( $scan );
 256
 257     my $simple_query;
 258     $simple_query = $operands->[0] if @$operands == 1;
 259     my $query_desc;
 260     if ( $simple_query ) {
 261         $query_desc = $simple_query;
 262     } else {
 263         $query_desc = $search_param_query_str;
 264     }
 265     my $limit     = $self->_join_queries( $self->_convert_index_strings(@$limits));
 266     my $limit_cgi = ( $orig_limits and @$orig_limits )
 267       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 268       : '';
 269     my $limit_desc;
 270     $limit_desc = "$limit" if $limit;
 271
 272     return (
 273         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 274         $limit, $limit_cgi, $limit_desc,   undef,      undef
 275     );
 276 }
 277
 278 =head2 build_authorities_query
 279
 280     my $query = $builder->build_authorities_query(\%search);
 281
 282 This takes a nice description of an authority search and turns it into a black-box
 283 query that can then be passed to the appropriate searcher.
 284
 285 The search description is a hashref that looks something like:
 286
 287     {
 288         searches => [
 289             {
 290                 where    => 'Heading',    # search the main entry
 291                 operator => 'exact',        # require an exact match
 292                 value    => 'frogs',        # the search string
 293             },
 294             {
 295                 where    => '',             # search all entries
 296                 operator => '',             # default keyword, right truncation
 297                 value    => 'pond',
 298             },
 299         ],
 300         sort => {
 301             field => 'Heading',
 302             order => 'desc',
 303         },
 304         authtypecode => 'TOPIC_TERM',
 305     }
 306
 307 =cut
 308
 309 sub build_authorities_query {
 310     my ( $self, $search ) = @_;
 311
 312     # Start by making the query parts
 313     my @query_parts;
 314
 315     foreach my $s ( @{ $search->{searches} } ) {
 316         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 317         $wh = '_all' if $wh eq '';
 318         if ( $op eq 'is' || $op eq '='  || $op eq 'exact' ) {
 319
 320             # look for something that matches a term completely
 321             # note, '=' is about numerical vals. May need special handling.
 322             # Also, we lowercase our search because the ES
 323             # index lowercases its values, and term searches don't get the
 324             # search analyzer applied to them.
 325             push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
 326         }
 327         elsif ( $op eq 'start' ) {
 328             # startswith search, uses lowercase untokenized version of heading
 329             push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
 330         }
 331         else {
 332             # regular wordlist stuff
 333             my @tokens = $self->_split_query( $val );
 334             foreach my $token ( @tokens ) {
 335                 $token = $self->_truncate_terms(
 336                     $self->_clean_search_term( $token )
 337                 );
 338             }
 339             my $query = $self->_join_queries( @tokens );
 340             push @query_parts, { query_string => { default_field => $wh, query => $query } };
 341         }
 342     }
 343
 344     # Merge the query parts appropriately
 345     # 'should' behaves like 'or'
 346     # 'must' behaves like 'and'
 347     # Zebra results seem to match must so using that here
 348     my $query = { query =>
 349                  { bool =>
 350                      { must => \@query_parts  }
 351                  }
 352              };
 353
 354     my %s;
 355     if ( exists $search->{sort} ) {
 356         foreach my $k ( keys %{ $search->{sort} } ) {
 357             my $f = $self->_sort_field($k);
 358             $s{$f} = $search->{sort}{$k};
 359         }
 360         $search->{sort} = \%s;
 361     }
 362
 363     # add the sort stuff
 364     $query->{sort} = [ $search->{sort} ]  if exists $search->{sort};
 365
 366     return $query;
 367 }
 368
 369
 370 =head2 build_authorities_query_compat
 371
 372     my ($query) =
 373       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 374         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 375
 376 This builds a query for searching for authorities, in the style of
 377 L<C4::AuthoritiesMarc::SearchAuthorities>.
 378
 379 Arguments:
 380
 381 =over 4
 382
 383 =item marclist
 384
 385 An arrayref containing where the particular term should be searched for.
 386 Options are: mainmainentry, mainentry, match, match-heading, see-from,
 387 thesaurus, any, and all. C<any> and C<all> search every field; any other
 387 value raises a C<Koha::Exceptions::WrongParameter> exception.
 388
 389 =item and_or
 390
 391 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 392
 393 =item excluding
 394
 395 Also ignored.
 396
 397 =item operator
 398
 399 What form of search to do. Options are: is (phrase, no truncation, whole field
 400 must match), = (number exact match), exact (phrase, no truncation, whole field
 401 must match). If left blank, then word list, right truncated, anywhere is used.
 402
 403 =item value
 404
 405 The actual user-provided string value to search for.
 406
 407 =item authtypecode
 408
 409 The authority type code to search within. If blank, then all will be searched.
 410
 411 =item orderby
 412
 413 The order to sort the results by. Options are Relevance, HeadingAsc,
 414 HeadingDsc, AuthidAsc, AuthidDsc.
 415
 416 =back
 417
 418 marclist, operator, and value must be the same length, and the values at
 419 index /i/ all relate to each other.
 420
 421 This returns a query, which is a black box object that can be passed to the
 422 appropriate search object.
 423
 424 =cut
 425
 426 our $koha_to_index_name = {
 427     mainmainentry   => 'heading-main',
 428     mainentry       => 'heading',
 429     match           => 'match',
 430     'match-heading' => 'match-heading',
 431     'see-from'      => 'match-heading-see-from',
 432     thesaurus       => 'subject-heading-thesaurus',
 433     any             => '',
 434     all             => ''
 435 };
 436
 437 sub build_authorities_query_compat {
 438     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 439         $authtypecode, $orderby )
 440       = @_;
 441
 442     # This turns the old-style many-options argument form into a more
 443     # extensible hash form that is understood by L<build_authorities_query>.
 444     my @searches;
 445
 446     # Convert to lower case
 447     $marclist = [map(lc, @{$marclist})];
 448     $orderby  = lc $orderby;
 449
 450     # Make sure everything exists
 451     foreach my $m (@$marclist) {
 452         Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
 453             unless exists $koha_to_index_name->{$m};
 454     }
 455     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 456         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 457         push @searches,
 458           {
 459             where    => $koha_to_index_name->{$marclist->[$i]},
 460             operator => $operator->[$i],
 461             value    => $value->[$i],
 462           };
 463     }
 464
 465     my %sort;
 466     my $sort_field =
 467         ( $orderby =~ /^heading/ ) ? 'heading'
 468       : ( $orderby =~ /^auth/ )    ? 'local-number'
 469       :                              undef;
 470     if ($sort_field) {
 471         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 472         %sort = ( $sort_field => $sort_order, );
 473     }
 474     my %search = (
 475         searches     => \@searches,
 476         authtypecode => $authtypecode,
 477     );
 478     $search{sort} = \%sort if %sort;
 479     my $query = $self->build_authorities_query( \%search );
 480     return $query;
 481 }
 482
 483 =head2 _convert_sort_fields
 484
 485     my @sort_params = _convert_sort_fields(@sort_by)
 486
 487 Converts the zebra-style sort index information into elasticsearch-style.
 488
 489 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 490 something that can be sent to L<build_query>.
 491
 492 =cut
 493
 494 sub _convert_sort_fields {
 495     my ( $self, @sort_by ) = @_;
 496
 497     # Turn the sorting into something we care about.
 498     my %sort_field_convert = (
 499         acqdate     => 'date-of-acquisition',
 500         author      => 'author',
 501         call_number => 'local-classification',
 502         popularity  => 'issues',
 503         relevance   => undef,       # default
 504         title       => 'title',
 505         pubdate     => 'date-of-publication',
 506     );
 507     my %sort_order_convert =
 508       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 509
 510     # Convert the fields and orders, drop anything we don't know about.
 511     grep { $_->{field} } map {
 512         my ( $f, $d ) = /(.+)_(.+)/;
 513         {
 514             field     => $sort_field_convert{$f},
 515             direction => $sort_order_convert{$d}
 516         }
 517     } @sort_by;
 518 }
 519
 520 =head2 _convert_index_fields
 521
 522     my @index_params = $self->_convert_index_fields(@indexes);
 523
 524 Converts zebra-style search index notation into elasticsearch-style.
 525
 526 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 527 and it returns something that can be sent to L<build_query>.
 528
 529 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 530 types.
 531
 532 =cut
 533
 534 our %index_field_convert = (
 535     'kw' => '_all',
 536     'ab' => 'abstract',
 537     'au' => 'author',
 538     'lcn' => 'local-classification',
 539     'callnum' => 'local-classification',
 540     'record-type' => 'rtype',
 541     'mc-rtype' => 'rtype',
 542     'mus' => 'rtype',
 543     'lc-card' => 'lc-card-number',
 544     'sn' => 'local-number',
 545     'yr' => 'date-of-publication',
 546     'pubdate' => 'date-of-publication',
 547     'acqdate' => 'date-of-acquisition',
 548     'date/time-last-modified' => 'date-time-last-modified',
 549     'dtlm' => 'date-time-last-modified',
 550     'diss' => 'dissertation-information',
 551     'nb' => 'isbn',
 552     'ns' => 'issn',
 553     'music-number' => 'identifier-publisher-for-music',
 554     'number-music-publisher' => 'identifier-publisher-for-music',
 555     'music' => 'identifier-publisher-for-music',
 556     'ident' => 'identifier-standard',
 557     'cpn' => 'corporate-name',
 558     'cfn' => 'conference-name',
 559     'pn' => 'personal-name',
 560     'pb' => 'publisher',
 561     'pv' => 'provider',
 562     'nt' => 'note',
 563     'notes' => 'note',
 564     'rcn' => 'record-control-number',
 565     'su' => 'subject',
 566     'su-to' => 'subject',
 567     #'su-geo' => 'subject',
 568     'su-ut' => 'subject',
 569     'ti' => 'title',
 570     'se' => 'title-series',
 571     'ut' => 'title-uniform',
 572     'an' => 'koha-auth-number',
 573     'authority-number' => 'koha-auth-number',
 574     'at' => 'authtype',
 575     'he' => 'heading',
 576     'rank' => 'relevance',
 577     'phr' => 'st-phrase',
 578     'wrdl' => 'st-word-list',
 579     'rt' => 'right-truncation',
 580     'rtrn' => 'right-truncation',
 581     'ltrn' => 'left-truncation',
 582     'rltrn' => 'left-and-right',
 583     'mc-itemtype' => 'itemtype',
 584     'mc-ccode' => 'ccode',
 585     'branch' => 'homebranch',
 586     'mc-loc' => 'location',
 587     'stocknumber' => 'number-local-acquisition',
 588     'inv' => 'number-local-acquisition',
 589     'bc' => 'barcode',
 590     'mc-itype' => 'itype',
 591     'aub' => 'author-personal-bibliography',
 592     'auo' => 'author-in-order',
 593     'ff8-22' => 'ta',
 594     'aud' => 'ta',
 595     'audience' => 'ta',
 596     'frequency-code' => 'ff8-18',
 597     'illustration-code' => 'ff8-18-21',
 598     'regularity-code' => 'ff8-19',
 599     'type-of-serial' => 'ff8-21',
 600     'format' => 'ff8-23',
 601     'conference-code' => 'ff8-29',
 602     'festschrift-indicator' => 'ff8-30',
 603     'index-indicator' => 'ff8-31',
 604     'fiction' => 'lf',
 605     'fic' => 'lf',
 606     'literature-code' => 'lf',
 607     'biography' => 'bio',
 608     'ff8-34' => 'bio',
 609     'biography-code' => 'bio',
 610     'l-format' => 'ff7-01-02',
 611     'lex' => 'lexile-number',
 612     'hi' => 'host-item-number',
 613     'itu' => 'index-term-uncontrolled',
 614     'itg' => 'index-term-genre',
 615 );
 616 my $field_name_pattern = '[\w\-]+';
 617 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 618
 619 sub _convert_index_fields {
 620     my ( $self, @indexes ) = @_;
 621
 622     my %index_type_convert =
 623       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
 624
 625     # Convert according to our table, drop anything that doesn't convert.
 626     # If a field starts with mc- we save it as it's used (and removed) later
 627     # when joining things, to indicate we make it an 'OR' join.
 628     # (Sorry, this got a bit ugly after special cases were found.)
 629     grep { $_->{field} } map {
 630         # Lower case all field names
 631         my ( $f, $t ) = map(lc, split /,/);
 632         my $mc = '';
 633         if ($f =~ /^mc-/) {
 634             $mc = 'mc-';
 635             $f =~ s/^mc-//;
 636         }
 637         my $r = {
 638             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 639             type  => $index_type_convert{ $t // '__default' }
 640         };
 641         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 642         $r;
 643     } @indexes;
 644 }
 645
 646 =head2 _convert_index_strings
 647
 648     my @searches = $self->_convert_index_strings(@searches);
 649
 650 Similar to L<_convert_index_fields>, this takes strings of the form
 651 B<field:search term> and rewrites the field from zebra-style to
 652 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 653
 654 =cut
 655
 656 sub _convert_index_strings {
 657     my ( $self, @searches ) = @_;
 658     my @res;
 659     foreach my $s (@searches) {
 660         next if $s eq '';
 661         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 662         unless ( defined($field) && defined($term) ) {
 663             push @res, $s;
 664             next;
 665         }
 666         my ($conv) = $self->_convert_index_fields($field);
 667         unless ( defined($conv) ) {
 668             push @res, $s;
 669             next;
 670         }
 671         push @res, $conv->{field} . ":"
 672           . $self->_modify_string_by_type( %$conv, operand => $term );
 673     }
 674     return @res;
 675 }
 676
 677 =head2 _convert_index_strings_freeform
 678
 679     my $search = $self->_convert_index_strings_freeform($search);
 680
 681 This is similar to L<_convert_index_strings>, however it'll search out the
 682 things to change within the string. So it can handle strings such as
 683 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 684
 685 If there is something of the form "su,complete-subfield" or something, the
 686 second part is stripped off as we can't yet handle that. Making it work
 687 will have to wait for a real query parser.
 688
 689 =cut
 690
 691 sub _convert_index_strings_freeform {
 692     my ( $self, $search ) = @_;
 693     # @TODO: Currenty will alter also fields contained within quotes:
 694     # `searching for "stuff cn:123"` for example will become
 695     # `searching for "stuff local-number:123"
 696     #
 697     # Fixing this is tricky, one possibility:
 698     # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
 699     # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
 700     #
 701     # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
 702     # them back when processing is done.
 703
 704     # Lower case field names
 705     $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
 706     # Resolve possible field aliases
 707     $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
 708     return $search;
 709 }
 710
 711 =head2 _modify_string_by_type
 712
 713     my $str = $self->_modify_string_by_type(%index_field);
 714
 715 If you have a search term (operand) and a type (phrase, right-truncated), this
 716 will convert the string to have the function in lucene search terms, e.g.
 717 wrapping quotes around it.
 718
 719 =cut
 720
 721 sub _modify_string_by_type {
 722     my ( $self, %idx ) = @_;
 723
 724     my $type = $idx{type} || '';
 725     my $str = $idx{operand};
 726     return $str unless $str;    # Empty or undef, we can't use it.
 727
 728     $str .= '*' if $type eq 'right-truncate';
 729     $str = '"' . $str . '"' if $type eq 'phrase';
 730     if ($type eq 'st-year') {
 731         if ($str =~ /^(.*)-(.*)$/) {
 732             my $from = $1 || '*';
 733             my $until = $2 || '*';
 734             $str = "[$from TO $until]";
 735         }
 736     }
 737     return $str;
 738 }
 739
 740 =head2 _join_queries
 741
 742     my $query_str = $self->_join_queries(@query_parts);
 743
 744 This takes a list of query parts, that might be search terms on their own, or
 745 booleaned together, or specifying fields, or whatever, wraps them in
 746 parentheses, and ANDs them all together. Suitable for feeding to the ES
 747 query string query.
 748
 749 Note: doesn't AND them together if they specify an index that starts with "mc"
 750 as that was a special case in the original code for dealing with multiple
 751 choice options (you can't search for something that has an itype of A and
 752 an itype of B otherwise.)
 753
 754 =cut
 755
 756 sub _join_queries {
 757     my ( $self, @parts ) = @_;
 758
 759     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 760     my @mc_parts =
 761       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 762     return () unless @norm_parts + @mc_parts;
 763     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 764     my $grouped_mc =
 765       @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
 766
 767     # Handy trick: $x || () inside a join means that if $x ends up as an
 768     # empty string, it gets replaced with (), which makes join ignore it.
 769     # (bad effect: this'll also happen to '0', this hopefully doesn't matter
 770     # in this case.)
 771     join( ' AND ',
 772         join( ' AND ', map { "($_)" } @norm_parts ) || (),
 773         $grouped_mc || () );
 774 }
 775
 776 =head2 _make_phrases
 777
 778     my @phrased_queries = $self->_make_phrases(@query_parts);
 779
 780 This takes the supplied queries and forces them to be phrases by wrapping
 781 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
 782 the quotes outside of them if they're there.
 783
 784 =cut
 785
 786 sub _make_phrases {
 787     my ( $self, @parts ) = @_;
 788     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 789 }
 790
 791 =head2 _create_query_string
 792
 793     my @query_strings = $self->_create_query_string(@queries);
 794
 795 Given a list of hashrefs, it will turn them into a lucene-style query string.
 796 The hash should contain field, type (both for the indexes), operator, and
 797 operand.
 798
 799 =cut
 800
 801 sub _create_query_string {
 802     my ( $self, @queries ) = @_;
 803
 804     map {
 805         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 806         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 807
 808         my $oand = $self->_modify_string_by_type(%$_);
 809         $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
 810         "$otor($field$oand)";
 811     } @queries;
 812 }
 813
 814 =head2 _clean_search_term
 815
 816     my $term = $self->_clean_search_term($term);
 817
 818 This cleans a search term by removing any funny characters that may upset
 819 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 820 to ensure those parts are correct.
 821
 822 =cut
 823
 824 sub _clean_search_term {
 825     my ( $self, $term ) = @_;
 826
 827     # Lookahead for checking if we are inside quotes
 828     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 829
 830     # Some hardcoded searches (like with authorities) produce things like
 831     # 'an=123', when it ought to be 'an:123' for our purposes.
 832     $term =~ s/=/:/g;
 833
 834     $term = $self->_convert_index_strings_freeform($term);
 835     $term =~ s/[{}]/"/g;
 836
 837     # Remove unbalanced quotes
 838     my $unquoted = $term;
 839     my $count = ($unquoted =~ tr/"/ /);
 840     if ($count % 2 == 1) {
 841         $term = $unquoted;
 842     }
 843
 844     # Remove unquoted colons that have whitespace on either side of them
 845     $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
 846
 847     return $term;
 848 }
 849
 850 =head2 _fix_limit_special_cases
 851
 852     my $limits = $self->_fix_limit_special_cases($limits);
 853
 854 This converts any special cases that the limit specifications have into things
 855 that are more readily processable by the rest of the code.
 856
 857 The argument should be an arrayref, and it'll return an arrayref.
 858
 859 =cut
 860
 861 sub _fix_limit_special_cases {
 862     my ( $self, $limits ) = @_;
 863
 864     my @new_lim;
 865     foreach my $l (@$limits) {
 866
 867         # This is set up by opac-search.pl
 868         if ( $l =~ /^yr,st-numeric,ge=/ ) {
 869             my ( $start, $end ) =
 870               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
 871             next unless defined($start) && defined($end);
 872             push @new_lim, "copydate:[$start TO $end]";
 873         }
 874         elsif ( $l =~ /^yr,st-numeric=/ ) {
 875             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
 876             next unless defined($date);
 877             $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
 878             push @new_lim, "copydate:$date";
 879         }
 880         elsif ( $l =~ /^available$/ ) {
 881             push @new_lim, 'onloan:0';
 882         }
 883         else {
 884             push @new_lim, $l;
 885         }
 886     }
 887     return \@new_lim;
 888 }
 889
 890 =head2 _sort_field
 891
 892     my $field = $self->_sort_field($field);
 893
 894 Given a field name, this works out what the actual name of the field to sort
 895 on should be. A '__sort' suffix is added for fields with a sort version, and
 896 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
 897 to avoid sorting on a tokenized value.
 898
 899 =cut
 900
 901 sub _sort_field {
 902     my ($self, $f) = @_;
 903
 904     my $mappings = $self->get_elasticsearch_mappings();
 905     my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
 906     if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
 907         $f .= '__sort';
 908         # We need to add '.phrase' to text fields, otherwise it'll sort
 909         # based on the tokenised form.
 910         $f .= '.phrase' if $textField;
 911     } else {
 912         # We need to add '.raw' to text fields without a sort field,
 913         # otherwise it'll sort based on the tokenised form.
 914         $f .= '.raw' if $textField;
 915     }
 916     return $f;
 917 }
 918
 919 =head2 _truncate_terms
 920
 921     my $query = $self->_truncate_terms($query);
 922
 923 Given a string query this function appends a '*' wildcard to all terms except
 924 operators and double quoted strings.
 925
 926 =cut
 927
 928 sub _truncate_terms {
 929     my ( $self, $query ) = @_;
 930
 931     my @tokens = $self->_split_query( $query );
 932
 933     # Filter out empty tokens
 934     my @words = grep { $_ !~ /^\s*$/ } @tokens;
 935
 936     # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
 937     my @terms = map {
 938         my $w = $_;
 939         (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
 940     } @words;
 941
 942     return join ' ', @terms;
 943 }
 944
 945 =head2 _split_query
 946
 947     my @token = $self->_split_query($query_str);
 948
 949 Given a string query this function splits it to tokens taking into account
 950 any field prefixes and quoted strings.
 951
 952 =cut
 953
 954 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
 955
 956 sub _split_query {
 957     my ( $self, $query ) = @_;
 958
 959     # '"donald duck" title:"the mouse" and peter" get split into
 960     # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'pete']
 961     my @tokens = split $tokenize_split_re, $query;
 962
 963     # Filter out empty values
 964     @tokens = grep( /\S/, @tokens );
 965
 966     return @tokens;
 967 }
 968
 969 1;