Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
   33     $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51
  52 =head2 build_query
  53
  54     my $simple_query = $builder->build_query("hello", %options)
  55
  56 This will build a query that can be issued to elasticsearch from the provided
  57 string input. This expects a lucene style search form (see
  58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  59 for details.)
  60
  61 It'll make an attempt to respect the various query options.
  62
  63 Additional options can be provided with the C<%options> hash.
  64
  65 =over 4
  66
  67 =item sort
  68
  69 This should be an arrayref of hashrefs, each containing a C<field> and an
  70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  72
  73 =back
  74
  75 =cut
  76
  77 sub build_query {
  78     my ( $self, $query, %options ) = @_;
  79
  80     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
  81     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
  82     my $weight_fields    = C4::Context->preference("QueryWeightFields")    || 0;
  83     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
  84
  85     $query = '*' unless defined $query;
  86
  87     my $res;
  88     $res->{query} = {
  89         query_string => {
  90             query            => $query,
  91             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
  92             default_operator => 'AND',
  93             default_field    => '_all',
  94             lenient          => JSON::true,
  95             fields           => $options{fields} || [],
  96         }
  97     };
  98
  99     if ( $options{sort} ) {
 100         foreach my $sort ( @{ $options{sort} } ) {
 101             my ( $f, $d ) = @$sort{qw/ field direction /};
 102             die "Invalid sort direction, $d"
 103               if $d && ( $d ne 'asc' && $d ne 'desc' );
 104             $d = 'asc' unless $d;
 105
 106             $f = $self->_sort_field($f);
 107             push @{ $res->{sort} }, { $f => { order => $d } };
 108         }
 109     }
 110
 111     # See _convert_facets in Search.pm for how these get turned into
 112     # things that Koha can use.
 113     $res->{aggregations} = {
 114         author => { terms => { field => "author__facet" } },
 115         subject => { terms => { field => "subject__facet" } },
 116         itype => { terms => { field => "itype__facet" } },
 117         location => { terms => { field => "location__facet" } },
 118         'su-geo' => { terms => { field => "su-geo__facet" } },
 119         'title-series' => { terms => { field => "title-series__facet" } },
 120         ccode => { terms => { field => "ccode__facet" } },
 121     };
 122
 123     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 124     if (   $display_library_facets eq 'both'
 125         or $display_library_facets eq 'home' ) {
 126         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
 127     }
 128     if (   $display_library_facets eq 'both'
 129         or $display_library_facets eq 'holding' ) {
 130         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
 131     }
 132     if ( my $ef = $options{expanded_facet} ) {
 133         $res->{aggregations}{$ef}{terms}{size} = C4::Context->preference('FacetMaxCount');
 134     };
 135     return $res;
 136 }
 137
 138 =head2 build_browse_query
 139
 140     my $browse_query = $builder->build_browse_query($field, $query);
 141
 142 This performs a "starts with" style query on a particular field. The field
 143 to be searched must have been indexed with an appropriate mapping as a
 144 "phrase" subfield, which pretty much everything has.
 145
 146 =cut
 147
 148 # XXX this isn't really a browse query like we want in the end
 149 sub build_browse_query {
 150     my ( $self, $field, $query ) = @_;
 151
 152     my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
 153
 154     return { query => '*' } if !defined $query;
 155
 156     # TODO this should come from Koha::SearchEngine::Elasticsearch
 157     my %field_whitelist = (
 158         title  => 1,
 159         author => 1,
 160     );
 161     $field = 'title' if !exists $field_whitelist{$field};
 162     my $sort = $self->_sort_field($field);
 163     my $res = {
 164         query => {
 165             match_phrase_prefix => {
 166                 "$field.phrase" => {
 167                     query     => $query,
 168                     operator  => 'or',
 169                     fuzziness => $fuzzy_enabled ? 'auto' : '0',
 170                 }
 171             }
 172         },
 173         sort => [ { $sort => { order => "asc" } } ],
 174     };
 175 }
 176
 177 =head2 build_query_compat
 178
 179     my (
 180         $error,             $query, $simple_query, $query_cgi,
 181         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 182         $stopwords_removed, $query_type
 183       )
 184       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
 185         \@limits, \@sort_by, $scan, $lang );
 186
 187 This handles a search using the same api as L<C4::Search::buildQuery> does.
 188
 189 A very simple query will go in with C<$operands> set to ['query'], and
 190 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 191 C<$query> set to something that can perform the search, C<$simple_query>
 192 set to just the search term, C<$query_cgi> set to something that can
 193 reproduce this search, and C<$query_desc> set to something else.
 194
 195 =cut
 196
 197 sub build_query_compat {
 198     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 199         $lang, $params )
 200       = @_;
 201
 202 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
 203     my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 204     my @index_params = $self->_convert_index_fields(@$indexes);
 205     my $limits       = $self->_fix_limit_special_cases($orig_limits);
 206     if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
 207
 208     # Merge the indexes in with the search terms and the operands so that
 209     # each search thing is a handy unit.
 210     unshift @$operators, undef;    # The first one can't have an op
 211     my @search_params;
 212     my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 213     my $ea = each_array( @$operands, @$operators, @index_params );
 214     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 215         next if ( !defined($oand) || $oand eq '' );
 216         $oand = $self->_clean_search_term($oand);
 217         $oand = $self->_truncate_terms($oand) if ($truncate);
 218         push @search_params, {
 219             operand => $oand,      # the search terms
 220             operator => defined($otor) ? uc $otor : undef,    # AND and so on
 221             $index ? %$index : (),
 222         };
 223     }
 224
 225     # We build a string query from limits and the queries. An alternative
 226     # would be to pass them separately into build_query and let it build
 227     # them into a structured ES query itself. Maybe later, though that'd be
 228     # more robust.
 229     my $query_str = join( ' AND ',
 230         join( ' ', $self->_create_query_string(@search_params) ) || (),
 231         $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 232
 233     my @fields = '_all';
 234     if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
 235         push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
 236     }
 237
 238     # If there's no query on the left, let's remove the junk left behind
 239     $query_str =~ s/^ AND //;
 240     my %options;
 241     $options{fields} = \@fields;
 242     $options{sort} = \@sort_params;
 243     $options{expanded_facet} = $params->{expanded_facet};
 244     my $query = $self->build_query( $query_str, %options );
 245
 246     #die Dumper($query);
 247     # We roughly emulate the CGI parameters of the zebra query builder
 248     my $query_cgi;
 249     $query_cgi = 'q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
 250     my $simple_query;
 251     $simple_query = $operands->[0] if @$operands == 1;
 252     my $query_desc   = $simple_query;
 253     my $limit        = $self->_join_queries( $self->_convert_index_strings(@$limits));
 254     my $limit_cgi = ( $orig_limits and @$orig_limits )
 255       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 256       : '';
 257     my $limit_desc;
 258     $limit_desc = "$limit" if $limit;
 259     return (
 260         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 261         $limit, $limit_cgi, $limit_desc,   undef,      undef
 262     );
 263 }
 264
 265 =head2 build_authorities_query
 266
 267     my $query = $builder->build_authorities_query(\%search);
 268
 269 This takes a nice description of an authority search and turns it into a black-box
 270 query that can then be passed to the appropriate searcher.
 271
 272 The search description is a hashref that looks something like:
 273
 274     {
 275         searches => [
 276             {
 277                 where    => 'Heading',    # search the main entry
 278                 operator => 'exact',        # require an exact match
 279                 value    => 'frogs',        # the search string
 280             },
 281             {
 282                 where    => '',             # search all entries
 283                 operator => '',             # default keyword, right truncation
 284                 value    => 'pond',
 285             },
 286         ],
 287         sort => {
 288             field => 'Heading',
 289             order => 'desc',
 290         },
 291         authtypecode => 'TOPIC_TERM',
 292     }
 293
 294 =cut
 295
 296 sub build_authorities_query {
 297     my ( $self, $search ) = @_;
 298
 299     # Start by making the query parts
 300     my @query_parts;
 301
 302     foreach my $s ( @{ $search->{searches} } ) {
 303         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 304         $wh = '_all' if $wh eq '';
 305         if ( $op eq 'is' || $op eq '='  || $op eq 'exact' ) {
 306
 307             # look for something that matches a term completely
 308             # note, '=' is about numerical vals. May need special handling.
 309             # Also, we lowercase our search because the ES
 310             # index lowercases its values, and term searches don't get the
 311             # search analyzer applied to them.
 312             push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
 313         }
 314         elsif ( $op eq 'start' ) {
 315             # startswith search, uses lowercase untokenized version of heading
 316             push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
 317         }
 318         else {
 319             # regular wordlist stuff
 320             my @tokens = $self->_split_query( $val );
 321             foreach my $token ( @tokens ) {
 322                 $token = $self->_truncate_terms(
 323                     $self->_clean_search_term( $token )
 324                 );
 325             }
 326             my $query = $self->_join_queries( @tokens );
 327             push @query_parts, { query_string => { default_field => $wh, query => $query } };
 328         }
 329     }
 330
 331     # Merge the query parts appropriately
 332     # 'should' behaves like 'or'
 333     # 'must' behaves like 'and'
 334     # Zebra results seem to match must so using that here
 335     my $query = { query =>
 336                  { bool =>
 337                      { must => \@query_parts  }
 338                  }
 339              };
 340
 341     my %s;
 342     if ( exists $search->{sort} ) {
 343         foreach my $k ( keys %{ $search->{sort} } ) {
 344             my $f = $self->_sort_field($k);
 345             $s{$f} = $search->{sort}{$k};
 346         }
 347         $search->{sort} = \%s;
 348     }
 349
 350     # add the sort stuff
 351     $query->{sort} = [ $search->{sort} ]  if exists $search->{sort};
 352
 353     return $query;
 354 }
 355
 356
 357 =head2 build_authorities_query_compat
 358
 359     my ($query) =
 360       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 361         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 362
 363 This builds a query for searching for authorities, in the style of
 364 L<C4::AuthoritiesMarc::SearchAuthorities>.
 365
 366 Arguments:
 367
 368 =over 4
 369
 370 =item marclist
 371
 372 An arrayref containing where the particular term should be searched for.
 373 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 374 thesaurus. If left blank, any field is used.
 375
 376 =item and_or
 377
 378 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 379
 380 =item excluding
 381
 382 Also ignored.
 383
 384 =item operator
 385
 386 What form of search to do. Options are: is (phrase, no truncation, whole field
 387 must match), = (number exact match), exact (phrase, no truncation, whole field
 388 must match). If left blank, then word list, right truncated, anywhere is used.
 389
 390 =item value
 391
 392 The actual user-provided string value to search for.
 393
 394 =item authtypecode
 395
 396 The authority type code to search within. If blank, then all will be searched.
 397
 398 =item orderby
 399
 400 The order to sort the results by. Options are Relevance, HeadingAsc,
 401 HeadingDsc, AuthidAsc, AuthidDsc.
 402
 403 =back
 404
 405 marclist, operator, and value must be the same length, and the values at
 406 index /i/ all relate to each other.
 407
 408 This returns a query, which is a black box object that can be passed to the
 409 appropriate search object.
 410
 411 =cut
 412
 413 our $koha_to_index_name = {
 414     mainmainentry   => 'heading-main',
 415     mainentry       => 'heading',
 416     match           => 'match',
 417     'match-heading' => 'match-heading',
 418     'see-from'      => 'match-heading-see-from',
 419     thesaurus       => 'subject-heading-thesaurus',
 420     any             => '',
 421     all             => ''
 422 };
 423
 424 sub build_authorities_query_compat {
 425     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 426         $authtypecode, $orderby )
 427       = @_;
 428
 429     # This turns the old-style many-options argument form into a more
 430     # extensible hash form that is understood by L<build_authorities_query>.
 431     my @searches;
 432
 433     # Convert to lower case
 434     $marclist = [map(lc, @{$marclist})];
 435     $orderby  = lc $orderby;
 436
 437     # Make sure everything exists
 438     foreach my $m (@$marclist) {
 439         Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
 440             unless exists $koha_to_index_name->{$m};
 441     }
 442     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 443         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 444         push @searches,
 445           {
 446             where    => $koha_to_index_name->{$marclist->[$i]},
 447             operator => $operator->[$i],
 448             value    => $value->[$i],
 449           };
 450     }
 451
 452     my %sort;
 453     my $sort_field =
 454         ( $orderby =~ /^heading/ ) ? 'heading'
 455       : ( $orderby =~ /^auth/ )    ? 'local-number'
 456       :                              undef;
 457     if ($sort_field) {
 458         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 459         %sort = ( $sort_field => $sort_order, );
 460     }
 461     my %search = (
 462         searches     => \@searches,
 463         authtypecode => $authtypecode,
 464     );
 465     $search{sort} = \%sort if %sort;
 466     my $query = $self->build_authorities_query( \%search );
 467     return $query;
 468 }
 469
 470 =head2 _convert_sort_fields
 471
 472     my @sort_params = _convert_sort_fields(@sort_by)
 473
 474 Converts the zebra-style sort index information into elasticsearch-style.
 475
 476 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 477 something that can be sent to L<build_query>.
 478
 479 =cut
 480
 481 sub _convert_sort_fields {
 482     my ( $self, @sort_by ) = @_;
 483
 484     # Turn the sorting into something we care about.
 485     my %sort_field_convert = (
 486         acqdate     => 'date-of-acquisition',
 487         author      => 'author',
 488         call_number => 'local-classification',
 489         popularity  => 'issues',
 490         relevance   => undef,       # default
 491         title       => 'title',
 492         pubdate     => 'date-of-publication',
 493     );
 494     my %sort_order_convert =
 495       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 496
 497     # Convert the fields and orders, drop anything we don't know about.
 498     grep { $_->{field} } map {
 499         my ( $f, $d ) = /(.+)_(.+)/;
 500         {
 501             field     => $sort_field_convert{$f},
 502             direction => $sort_order_convert{$d}
 503         }
 504     } @sort_by;
 505 }
 506
 507 =head2 _convert_index_fields
 508
 509     my @index_params = $self->_convert_index_fields(@indexes);
 510
 511 Converts zebra-style search index notation into elasticsearch-style.
 512
 513 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 514 and it returns something that can be sent to L<build_query>.
 515
 516 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 517 types.
 518
 519 =cut
 520
 521 our %index_field_convert = (
 522     'kw' => '_all',
 523     'ab' => 'abstract',
 524     'au' => 'author',
 525     'lcn' => 'local-classification',
 526     'callnum' => 'local-classification',
 527     'record-type' => 'rtype',
 528     'mc-rtype' => 'rtype',
 529     'mus' => 'rtype',
 530     'lc-card' => 'lc-card-number',
 531     'sn' => 'local-number',
 532     'yr' => 'date-of-publication',
 533     'pubdate' => 'date-of-publication',
 534     'acqdate' => 'date-of-acquisition',
 535     'date/time-last-modified' => 'date-time-last-modified',
 536     'dtlm' => 'date/time-last-modified',
 537     'diss' => 'dissertation-information',
 538     'nb' => 'isbn',
 539     'ns' => 'issn',
 540     'music-number' => 'identifier-publisher-for-music',
 541     'number-music-publisher' => 'identifier-publisher-for-music',
 542     'music' => 'identifier-publisher-for-music',
 543     'ident' => 'identifier-standard',
 544     'cpn' => 'corporate-name',
 545     'cfn' => 'conference-name',
 546     'pn' => 'personal-name',
 547     'pb' => 'publisher',
 548     'pv' => 'provider',
 549     'nt' => 'note',
 550     'notes' => 'note',
 551     'rcn' => 'record-control-number',
 552     'su' => 'subject',
 553     'su-to' => 'subject',
 554     #'su-geo' => 'subject',
 555     'su-ut' => 'subject',
 556     'ti' => 'title',
 557     'se' => 'title-series',
 558     'ut' => 'title-uniform',
 559     'an' => 'koha-auth-number',
 560     'authority-number' => 'koha-auth-number',
 561     'at' => 'authtype',
 562     'he' => 'heading',
 563     'rank' => 'relevance',
 564     'phr' => 'st-phrase',
 565     'wrdl' => 'st-word-list',
 566     'rt' => 'right-truncation',
 567     'rtrn' => 'right-truncation',
 568     'ltrn' => 'left-truncation',
 569     'rltrn' => 'left-and-right',
 570     'mc-itemtype' => 'itemtype',
 571     'mc-ccode' => 'ccode',
 572     'branch' => 'homebranch',
 573     'mc-loc' => 'location',
 574     'stocknumber' => 'number-local-acquisition',
 575     'inv' => 'number-local-acquisition',
 576     'bc' => 'barcode',
 577     'mc-itype' => 'itype',
 578     'aub' => 'author-personal-bibliography',
 579     'auo' => 'author-in-order',
 580     'ff8-22' => 'ta',
 581     'aud' => 'ta',
 582     'audience' => 'ta',
 583     'frequency-code' => 'ff8-18',
 584     'illustration-code' => 'ff8-18-21',
 585     'regularity-code' => 'ff8-19',
 586     'type-of-serial' => 'ff8-21',
 587     'format' => 'ff8-23',
 588     'conference-code' => 'ff8-29',
 589     'festschrift-indicator' => 'ff8-30',
 590     'index-indicator' => 'ff8-31',
 591     'fiction' => 'lf',
 592     'fic' => 'lf',
 593     'literature-code' => 'lf',
 594     'biography' => 'bio',
 595     'ff8-34' => 'bio',
 596     'biography-code' => 'bio',
 597     'l-format' => 'ff7-01-02',
 598     'lex' => 'lexile-number',
 599     'hi' => 'host-item-number',
 600     'itu' => 'index-term-uncontrolled',
 601     'itg' => 'index-term-genre',
 602 );
 603 my $field_name_pattern = '[\w\-]+';
 604 my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 605
 606 sub _convert_index_fields {
 607     my ( $self, @indexes ) = @_;
 608
 609     my %index_type_convert =
 610       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
 611
 612     # Convert according to our table, drop anything that doesn't convert.
 613     # If a field starts with mc- we save it as it's used (and removed) later
 614     # when joining things, to indicate we make it an 'OR' join.
 615     # (Sorry, this got a bit ugly after special cases were found.)
 616     grep { $_->{field} } map {
 617         # Lower case all field names
 618         my ( $f, $t ) = map(lc, split /,/);
 619         my $mc = '';
 620         if ($f =~ /^mc-/) {
 621             $mc = 'mc-';
 622             $f =~ s/^mc-//;
 623         }
 624         my $r = {
 625             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 626             type  => $index_type_convert{ $t // '__default' }
 627         };
 628         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 629         $r;
 630     } @indexes;
 631 }
 632
 633 =head2 _convert_index_strings
 634
 635     my @searches = $self->_convert_index_strings(@searches);
 636
 637 Similar to L<_convert_index_fields>, this takes strings of the form
 638 B<field:search term> and rewrites the field from zebra-style to
 639 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 640
 641 =cut
 642
 643 sub _convert_index_strings {
 644     my ( $self, @searches ) = @_;
 645     my @res;
 646     foreach my $s (@searches) {
 647         next if $s eq '';
 648         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 649         unless ( defined($field) && defined($term) ) {
 650             push @res, $s;
 651             next;
 652         }
 653         my ($conv) = $self->_convert_index_fields($field);
 654         unless ( defined($conv) ) {
 655             push @res, $s;
 656             next;
 657         }
 658         push @res, $conv->{field} . ":"
 659           . $self->_modify_string_by_type( %$conv, operand => $term );
 660     }
 661     return @res;
 662 }
 663
 664 =head2 _convert_index_strings_freeform
 665
 666     my $search = $self->_convert_index_strings_freeform($search);
 667
 668 This is similar to L<_convert_index_strings>, however it'll search out the
 669 things to change within the string. So it can handle strings such as
 670 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 671
 672 If there is something of the form "su,complete-subfield" or something, the
 673 second part is stripped off as we can't yet handle that. Making it work
 674 will have to wait for a real query parser.
 675
 676 =cut
 677
 678 sub _convert_index_strings_freeform {
 679     my ( $self, $search ) = @_;
 680     # @TODO: Currenty will alter also fields contained within quotes:
 681     # `searching for "stuff cn:123"` for example will become
 682     # `searching for "stuff local-number:123"
 683     #
 684     # Fixing this is tricky, one possibility:
 685     # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
 686     # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
 687     #
 688     # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
 689     # them back when processing is done.
 690
 691     # Lower case field names
 692     $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
 693     # Resolve possible field aliases
 694     $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
 695     return $search;
 696 }
 697
 698 =head2 _modify_string_by_type
 699
 700     my $str = $self->_modify_string_by_type(%index_field);
 701
 702 If you have a search term (operand) and a type (phrase, right-truncated), this
 703 will convert the string to have the function in lucene search terms, e.g.
 704 wrapping quotes around it.
 705
 706 =cut
 707
 708 sub _modify_string_by_type {
 709     my ( $self, %idx ) = @_;
 710
 711     my $type = $idx{type} || '';
 712     my $str = $idx{operand};
 713     return $str unless $str;    # Empty or undef, we can't use it.
 714
 715     $str .= '*' if $type eq 'right-truncate';
 716     $str = '"' . $str . '"' if $type eq 'phrase';
 717     return $str;
 718 }
 719
 720 =head2 _join_queries
 721
 722     my $query_str = $self->_join_queries(@query_parts);
 723
 724 This takes a list of query parts, that might be search terms on their own, or
 725 booleaned together, or specifying fields, or whatever, wraps them in
 726 parentheses, and ANDs them all together. Suitable for feeding to the ES
 727 query string query.
 728
 729 Note: doesn't AND them together if they specify an index that starts with "mc"
 730 as that was a special case in the original code for dealing with multiple
 731 choice options (you can't search for something that has an itype of A and
  732 an itype of B otherwise.)
 733
 734 =cut
 735
 736 sub _join_queries {
 737     my ( $self, @parts ) = @_;
 738
 739     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 740     my @mc_parts =
 741       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 742     return () unless @norm_parts + @mc_parts;
 743     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 744     my $grouped_mc =
 745       @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
 746
 747     # Handy trick: $x || () inside a join means that if $x ends up as an
 748     # empty string, it gets replaced with (), which makes join ignore it.
 749     # (bad effect: this'll also happen to '0', this hopefully doesn't matter
 750     # in this case.)
 751     join( ' AND ',
 752         join( ' AND ', map { "($_)" } @norm_parts ) || (),
 753         $grouped_mc || () );
 754 }
 755
 756 =head2 _make_phrases
 757
 758     my @phrased_queries = $self->_make_phrases(@query_parts);
 759
 760 This takes the supplied queries and forces them to be phrases by wrapping
 761 quotes around them. It understands field prefixes, e.g. 'subject:' and puts
 762 the quotes outside of them if they're there.
 763
 764 =cut
 765
 766 sub _make_phrases {
 767     my ( $self, @parts ) = @_;
 768     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 769 }
 770
 771 =head2 _create_query_string
 772
 773     my @query_strings = $self->_create_query_string(@queries);
 774
 775 Given a list of hashrefs, it will turn them into a lucene-style query string.
 776 The hash should contain field, type (both for the indexes), operator, and
 777 operand.
 778
 779 =cut
 780
 781 sub _create_query_string {
 782     my ( $self, @queries ) = @_;
 783
 784     map {
 785         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 786         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 787
 788         my $oand = $self->_modify_string_by_type(%$_);
 789         "$otor($field$oand)";
 790     } @queries;
 791 }
 792
 793 =head2 _clean_search_term
 794
 795     my $term = $self->_clean_search_term($term);
 796
 797 This cleans a search term by removing any funny characters that may upset
 798 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 799 to ensure those parts are correct.
 800
 801 =cut
 802
 803 sub _clean_search_term {
 804     my ( $self, $term ) = @_;
 805
 806     # Some hardcoded searches (like with authorities) produce things like
 807     # 'an=123', when it ought to be 'an:123' for our purposes.
 808     $term =~ s/=/:/g;
 809     $term = $self->_convert_index_strings_freeform($term);
 810     $term =~ s/[{}]/"/g;
 811     return $term;
 812 }
 813
 814 =head2 _fix_limit_special_cases
 815
 816     my $limits = $self->_fix_limit_special_cases($limits);
 817
 818 This converts any special cases that the limit specifications have into things
 819 that are more readily processable by the rest of the code.
 820
 821 The argument should be an arrayref, and it'll return an arrayref.
 822
 823 =cut
 824
 825 sub _fix_limit_special_cases {
 826     my ( $self, $limits ) = @_;
 827
 828     my @new_lim;
 829     foreach my $l (@$limits) {
 830
 831         # This is set up by opac-search.pl
 832         if ( $l =~ /^yr,st-numeric,ge=/ ) {
 833             my ( $start, $end ) =
 834               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
 835             next unless defined($start) && defined($end);
 836             push @new_lim, "copydate:[$start TO $end]";
 837         }
 838         elsif ( $l =~ /^yr,st-numeric=/ ) {
 839             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
 840             next unless defined($date);
 841             push @new_lim, "copydate:$date";
 842         }
 843         elsif ( $l =~ /^available$/ ) {
 844             push @new_lim, 'onloan:0';
 845         }
 846         else {
 847             push @new_lim, $l;
 848         }
 849     }
 850     return \@new_lim;
 851 }
 852
 853 =head2 _sort_field
 854
 855     my $field = $self->_sort_field($field);
 856
 857 Given a field name, this works out what the actual name of the field to sort
 858 on should be. A '__sort' suffix is added for fields with a sort version, and
 859 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
 860 to avoid sorting on a tokenized value.
 861
 862 =cut
 863
 864 sub _sort_field {
 865     my ($self, $f) = @_;
 866
 867     my $mappings = $self->get_elasticsearch_mappings();
 868     my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
 869     if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
 870         $f .= '__sort';
 871         # We need to add '.phrase' to text fields, otherwise it'll sort
 872         # based on the tokenised form.
 873         $f .= '.phrase' if $textField;
 874     } else {
 875         # We need to add '.raw' to text fields without a sort field,
 876         # otherwise it'll sort based on the tokenised form.
 877         $f .= '.raw' if $textField;
 878     }
 879     return $f;
 880 }
 881
 882 =head2 _truncate_terms
 883
 884     my $query = $self->_truncate_terms($query);
 885
  886 Given a string query, this function appends a '*' wildcard to all terms except
  887 boolean operators and double-quoted strings.
 888
 889 =cut
 890
 891 sub _truncate_terms {
 892     my ( $self, $query ) = @_;
 893
 894     my @tokens = $self->_split_query( $query );
 895
 896     # Filter out empty tokens
 897     my @words = grep { $_ !~ /^\s*$/ } @tokens;
 898
 899     # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
 900     my @terms = map {
 901         my $w = $_;
 902         (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
 903     } @words;
 904
 905     return join ' ', @terms;
 906 }
 907
 908 =head2 _split_query
 909
 910     my @token = $self->_split_query($query_str);
 911
 912 Given a string query this function splits it to tokens taking into account
 913 any field prefixes and quoted strings.
 914
 915 =cut
 916
 917 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
 918
 919 sub _split_query {
 920     my ( $self, $query ) = @_;
 921
 922     # '"donald duck" title:"the mouse" and peter" get split into
 923     # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'pete']
 924     my @tokens = split $tokenize_split_re, $query;
 925
 926     # Filter out empty values
 927     @tokens = grep( /\S/, @tokens );
 928
 929     return @tokens;
 930 }
 931
 932 1;