Koha/SearchEngine/Elasticsearch/QueryBuilder.pm

   1 package Koha::SearchEngine::Elasticsearch::QueryBuilder;
   2
   3 # This file is part of Koha.
   4 #
   5 # Copyright 2014 Catalyst IT Ltd.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 =head1 NAME
  21
  22 Koha::SearchEngine::Elasticsearch::QueryBuilder - constructs elasticsearch
  23 query objects from user-supplied queries
  24
  25 =head1 DESCRIPTION
  26
  27 This provides the functions that take a user-supplied search query, and
  28 provides something that can be given to elasticsearch to get answers.
  29
  30 =head1 SYNOPSIS
  31
  32     use Koha::SearchEngine::Elasticsearch::QueryBuilder;
   33     $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new({ index => $index });
  34     my $simple_query = $builder->build_query("hello");
  35     # This is currently undocumented because the original code is undocumented
  36     my $adv_query = $builder->build_advanced_query($indexes, $operands, $operators);
  37
  38 =head1 METHODS
  39
  40 =cut
  41
  42 use base qw(Koha::SearchEngine::Elasticsearch);
  43 use Carp;
  44 use JSON;
  45 use List::MoreUtils qw/ each_array /;
  46 use Modern::Perl;
  47 use URI::Escape;
  48
  49 use C4::Context;
  50 use Koha::Exceptions;
  51
  52 =head2 build_query
  53
  54     my $simple_query = $builder->build_query("hello", %options)
  55
  56 This will build a query that can be issued to elasticsearch from the provided
  57 string input. This expects a lucene style search form (see
  58 L<http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax>
  59 for details.)
  60
  61 It'll make an attempt to respect the various query options.
  62
  63 Additional options can be provided with the C<%options> hash.
  64
  65 =over 4
  66
  67 =item sort
  68
  69 This should be an arrayref of hashrefs, each containing a C<field> and an
  70 C<direction> (optional, defaults to C<asc>.) The results will be sorted
  71 according to these values. Valid values for C<direction> are 'asc' and 'desc'.
  72
  73 =back
  74
  75 =cut
  76
  77 sub build_query {
  78     my ( $self, $query, %options ) = @_;
  79
  80     my $stemming         = C4::Context->preference("QueryStemming")        || 0;
  81     my $auto_truncation  = C4::Context->preference("QueryAutoTruncate")    || 0;
  82     my $weight_fields    = C4::Context->preference("QueryWeightFields")    || 0;
  83     my $fuzzy_enabled    = C4::Context->preference("QueryFuzzy")           || 0;
  84
  85     $query = '*' unless defined $query;
  86
  87     my $res;
  88     $res->{query} = {
  89         query_string => {
  90             query            => $query,
  91             fuzziness        => $fuzzy_enabled ? 'auto' : '0',
  92             default_operator => 'AND',
  93             default_field    => '_all',
  94             lenient          => JSON::true,
  95             fields           => $options{fields} || [],
  96         }
  97     };
  98
  99     if ( $options{sort} ) {
 100         foreach my $sort ( @{ $options{sort} } ) {
 101             my ( $f, $d ) = @$sort{qw/ field direction /};
 102             die "Invalid sort direction, $d"
 103               if $d && ( $d ne 'asc' && $d ne 'desc' );
 104             $d = 'asc' unless $d;
 105
 106             $f = $self->_sort_field($f);
 107             push @{ $res->{sort} }, { $f => { order => $d } };
 108         }
 109     }
 110
 111     # See _convert_facets in Search.pm for how these get turned into
 112     # things that Koha can use.
 113     $res->{aggregations} = {
 114         author         => { terms => { field => "author__facet" } },
 115         subject        => { terms => { field => "subject__facet" } },
 116         itype          => { terms => { field => "itype__facet" } },
 117         location       => { terms => { field => "location__facet" } },
 118         'su-geo'       => { terms => { field => "su-geo__facet" } },
 119         'title-series' => { terms => { field => "title-series__facet" } },
 120         ccode          => { terms => { field => "ccode__facet" } },
 121         ln             => { terms => { field => "ln__facet" } },
 122     };
 123
 124     my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
 125     if (   $display_library_facets eq 'both'
 126         or $display_library_facets eq 'home' ) {
 127         $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
 128     }
 129     if (   $display_library_facets eq 'both'
 130         or $display_library_facets eq 'holding' ) {
 131         $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
 132     }
 133     if ( my $ef = $options{expanded_facet} ) {
 134         $res->{aggregations}{$ef}{terms}{size} = C4::Context->preference('FacetMaxCount');
 135     };
 136     return $res;
 137 }
 138
 139 =head2 build_browse_query
 140
 141     my $browse_query = $builder->build_browse_query($field, $query);
 142
 143 This performs a "starts with" style query on a particular field. The field
 144 to be searched must have been indexed with an appropriate mapping as a
 145 "phrase" subfield, which pretty much everything has.
 146
 147 =cut
 148
 149 # XXX this isn't really a browse query like we want in the end
 150 sub build_browse_query {
 151     my ( $self, $field, $query ) = @_;
 152
 153     my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
 154
 155     return { query => '*' } if !defined $query;
 156
 157     # TODO this should come from Koha::SearchEngine::Elasticsearch
 158     my %field_whitelist = (
 159         title  => 1,
 160         author => 1,
 161     );
 162     $field = 'title' if !exists $field_whitelist{$field};
 163     my $sort = $self->_sort_field($field);
 164     my $res = {
 165         query => {
 166             match_phrase_prefix => {
 167                 "$field.phrase" => {
 168                     query     => $query,
 169                     operator  => 'or',
 170                     fuzziness => $fuzzy_enabled ? 'auto' : '0',
 171                 }
 172             }
 173         },
 174         sort => [ { $sort => { order => "asc" } } ],
 175     };
 176 }
 177
 178 =head2 build_query_compat
 179
 180     my (
 181         $error,             $query, $simple_query, $query_cgi,
 182         $query_desc,        $limit, $limit_cgi,    $limit_desc,
 183         $stopwords_removed, $query_type
 184       )
 185       = $builder->build_query_compat( \@operators, \@operands, \@indexes,
  186         \@limits, \@sort_by, $scan, $lang, \%params );
 187
 188 This handles a search using the same api as L<C4::Search::buildQuery> does.
 189
 190 A very simple query will go in with C<$operands> set to ['query'], and
 191 C<$sort_by> set to ['pubdate_dsc']. This simple case will return with
 192 C<$query> set to something that can perform the search, C<$simple_query>
 193 set to just the search term, C<$query_cgi> set to something that can
  194 reproduce this search, and C<$query_desc> set to the same value as C<$simple_query>.
 195
 196 =cut
 197
 198 sub build_query_compat {
 199     my ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan,
 200         $lang, $params )
 201       = @_;
 202
 203 #die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
 204     my @sort_params  = $self->_convert_sort_fields(@$sort_by);
 205     my @index_params = $self->_convert_index_fields(@$indexes);
 206     my $limits       = $self->_fix_limit_special_cases($orig_limits);
 207     if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
 208
 209     # Merge the indexes in with the search terms and the operands so that
 210     # each search thing is a handy unit.
 211     unshift @$operators, undef;    # The first one can't have an op
 212     my @search_params;
 213     my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
 214     my $ea = each_array( @$operands, @$operators, @index_params );
 215     while ( my ( $oand, $otor, $index ) = $ea->() ) {
 216         next if ( !defined($oand) || $oand eq '' );
 217         $oand = $self->_clean_search_term($oand);
 218         $oand = $self->_truncate_terms($oand) if ($truncate);
 219         push @search_params, {
 220             operand => $oand,      # the search terms
 221             operator => defined($otor) ? uc $otor : undef,    # AND and so on
 222             $index ? %$index : (),
 223         };
 224     }
 225
 226     # We build a string query from limits and the queries. An alternative
 227     # would be to pass them separately into build_query and let it build
 228     # them into a structured ES query itself. Maybe later, though that'd be
 229     # more robust.
 230     my $query_str = join( ' AND ',
 231         join( ' ', $self->_create_query_string(@search_params) ) || (),
 232         $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
 233
 234     my @fields = '_all';
 235     if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
 236         push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
 237     }
 238
 239     # If there's no query on the left, let's remove the junk left behind
 240     $query_str =~ s/^ AND //;
 241     my %options;
 242     $options{fields} = \@fields;
 243     $options{sort} = \@sort_params;
 244     $options{expanded_facet} = $params->{expanded_facet};
 245     my $query = $self->build_query( $query_str, %options );
 246
 247     #die Dumper($query);
 248     # We roughly emulate the CGI parameters of the zebra query builder
 249     my $query_cgi;
 250     $query_cgi = 'q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
 251     my $simple_query;
 252     $simple_query = $operands->[0] if @$operands == 1;
 253     my $query_desc   = $simple_query;
 254     my $limit        = $self->_join_queries( $self->_convert_index_strings(@$limits));
 255     my $limit_cgi = ( $orig_limits and @$orig_limits )
 256       ? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
 257       : '';
 258     my $limit_desc;
 259     $limit_desc = "$limit" if $limit;
 260     return (
 261         undef,  $query,     $simple_query, $query_cgi, $query_desc,
 262         $limit, $limit_cgi, $limit_desc,   undef,      undef
 263     );
 264 }
 265
 266 =head2 build_authorities_query
 267
 268     my $query = $builder->build_authorities_query(\%search);
 269
 270 This takes a nice description of an authority search and turns it into a black-box
 271 query that can then be passed to the appropriate searcher.
 272
 273 The search description is a hashref that looks something like:
 274
 275     {
 276         searches => [
 277             {
 278                 where    => 'Heading',    # search the main entry
 279                 operator => 'exact',        # require an exact match
 280                 value    => 'frogs',        # the search string
 281             },
 282             {
 283                 where    => '',             # search all entries
 284                 operator => '',             # default keyword, right truncation
 285                 value    => 'pond',
 286             },
 287         ],
 288         sort => {
 289             field => 'Heading',
 290             order => 'desc',
 291         },
 292         authtypecode => 'TOPIC_TERM',
 293     }
 294
 295 =cut
 296
 297 sub build_authorities_query {
 298     my ( $self, $search ) = @_;
 299
 300     # Start by making the query parts
 301     my @query_parts;
 302
 303     foreach my $s ( @{ $search->{searches} } ) {
 304         my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
 305         $wh = '_all' if $wh eq '';
 306         if ( $op eq 'is' || $op eq '='  || $op eq 'exact' ) {
 307
 308             # look for something that matches a term completely
 309             # note, '=' is about numerical vals. May need special handling.
 310             # Also, we lowercase our search because the ES
 311             # index lowercases its values, and term searches don't get the
 312             # search analyzer applied to them.
 313             push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
 314         }
 315         elsif ( $op eq 'start' ) {
 316             # startswith search, uses lowercase untokenized version of heading
 317             push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
 318         }
 319         else {
 320             # regular wordlist stuff
 321             my @tokens = $self->_split_query( $val );
 322             foreach my $token ( @tokens ) {
 323                 $token = $self->_truncate_terms(
 324                     $self->_clean_search_term( $token )
 325                 );
 326             }
 327             my $query = $self->_join_queries( @tokens );
 328             push @query_parts, { query_string => { default_field => $wh, query => $query } };
 329         }
 330     }
 331
 332     # Merge the query parts appropriately
 333     # 'should' behaves like 'or'
 334     # 'must' behaves like 'and'
 335     # Zebra results seem to match must so using that here
 336     my $query = { query =>
 337                  { bool =>
 338                      { must => \@query_parts  }
 339                  }
 340              };
 341
 342     my %s;
 343     if ( exists $search->{sort} ) {
 344         foreach my $k ( keys %{ $search->{sort} } ) {
 345             my $f = $self->_sort_field($k);
 346             $s{$f} = $search->{sort}{$k};
 347         }
 348         $search->{sort} = \%s;
 349     }
 350
 351     # add the sort stuff
 352     $query->{sort} = [ $search->{sort} ]  if exists $search->{sort};
 353
 354     return $query;
 355 }
 356
 357
 358 =head2 build_authorities_query_compat
 359
 360     my ($query) =
 361       $builder->build_authorities_query_compat( \@marclist, \@and_or,
 362         \@excluding, \@operator, \@value, $authtypecode, $orderby );
 363
 364 This builds a query for searching for authorities, in the style of
 365 L<C4::AuthoritiesMarc::SearchAuthorities>.
 366
 367 Arguments:
 368
 369 =over 4
 370
 371 =item marclist
 372
 373 An arrayref containing where the particular term should be searched for.
 374 Options are: mainmainentry, mainentry, match, match-heading, see-from, and
 375 thesaurus. If left blank, any field is used.
 376
 377 =item and_or
 378
 379 Totally ignored. It is never used in L<C4::AuthoritiesMarc::SearchAuthorities>.
 380
 381 =item excluding
 382
 383 Also ignored.
 384
 385 =item operator
 386
 387 What form of search to do. Options are: is (phrase, no truncation, whole field
 388 must match), = (number exact match), exact (phrase, no truncation, whole field
 389 must match). If left blank, then word list, right truncated, anywhere is used.
 390
 391 =item value
 392
 393 The actual user-provided string value to search for.
 394
 395 =item authtypecode
 396
 397 The authority type code to search within. If blank, then all will be searched.
 398
 399 =item orderby
 400
 401 The order to sort the results by. Options are Relevance, HeadingAsc,
 402 HeadingDsc, AuthidAsc, AuthidDsc.
 403
 404 =back
 405
 406 marclist, operator, and value must be the same length, and the values at
 407 index /i/ all relate to each other.
 408
 409 This returns a query, which is a black box object that can be passed to the
 410 appropriate search object.
 411
 412 =cut
 413
  # Maps the (lower-cased) "marclist" names accepted by
  # build_authorities_query_compat to index field names. 'any' and 'all' map
  # to the empty string, which build_authorities_query replaces with '_all'.
  our $koha_to_index_name = {
      mainmainentry   => 'heading-main',
      mainentry       => 'heading',
      match           => 'match',
      'match-heading' => 'match-heading',
      'see-from'      => 'match-heading-see-from',
      thesaurus       => 'subject-heading-thesaurus',
      any             => '',
      all             => ''
  };
 424
 425 sub build_authorities_query_compat {
 426     my ( $self, $marclist, $and_or, $excluding, $operator, $value,
 427         $authtypecode, $orderby )
 428       = @_;
 429
 430     # This turns the old-style many-options argument form into a more
 431     # extensible hash form that is understood by L<build_authorities_query>.
 432     my @searches;
 433
 434     # Convert to lower case
 435     $marclist = [map(lc, @{$marclist})];
 436     $orderby  = lc $orderby;
 437
 438     # Make sure everything exists
 439     foreach my $m (@$marclist) {
 440         Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
 441             unless exists $koha_to_index_name->{$m};
 442     }
 443     for ( my $i = 0 ; $i < @$value ; $i++ ) {
 444         next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
 445         push @searches,
 446           {
 447             where    => $koha_to_index_name->{$marclist->[$i]},
 448             operator => $operator->[$i],
 449             value    => $value->[$i],
 450           };
 451     }
 452
 453     my %sort;
 454     my $sort_field =
 455         ( $orderby =~ /^heading/ ) ? 'heading'
 456       : ( $orderby =~ /^auth/ )    ? 'local-number'
 457       :                              undef;
 458     if ($sort_field) {
 459         my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
 460         %sort = ( $sort_field => $sort_order, );
 461     }
 462     my %search = (
 463         searches     => \@searches,
 464         authtypecode => $authtypecode,
 465     );
 466     $search{sort} = \%sort if %sort;
 467     my $query = $self->build_authorities_query( \%search );
 468     return $query;
 469 }
 470
 471 =head2 _convert_sort_fields
 472
 473     my @sort_params = _convert_sort_fields(@sort_by)
 474
 475 Converts the zebra-style sort index information into elasticsearch-style.
 476
 477 C<@sort_by> is the same as presented to L<build_query_compat>, and it returns
 478 something that can be sent to L<build_query>.
 479
 480 =cut
 481
 482 sub _convert_sort_fields {
 483     my ( $self, @sort_by ) = @_;
 484
 485     # Turn the sorting into something we care about.
 486     my %sort_field_convert = (
 487         acqdate     => 'date-of-acquisition',
 488         author      => 'author',
 489         call_number => 'local-classification',
 490         popularity  => 'issues',
 491         relevance   => undef,       # default
 492         title       => 'title',
 493         pubdate     => 'date-of-publication',
 494     );
 495     my %sort_order_convert =
 496       ( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
 497
 498     # Convert the fields and orders, drop anything we don't know about.
 499     grep { $_->{field} } map {
 500         my ( $f, $d ) = /(.+)_(.+)/;
 501         {
 502             field     => $sort_field_convert{$f},
 503             direction => $sort_order_convert{$d}
 504         }
 505     } @sort_by;
 506 }
 507
 508 =head2 _convert_index_fields
 509
 510     my @index_params = $self->_convert_index_fields(@indexes);
 511
 512 Converts zebra-style search index notation into elasticsearch-style.
 513
 514 C<@indexes> is an array of index names, as presented to L<build_query_compat>,
 515 and it returns something that can be sent to L<build_query>.
 516
 517 B<TODO>: this will pull from the elasticsearch mappings table to figure out
 518 types.
 519
 520 =cut
 521
 522 our %index_field_convert = (
 523     'kw' => '_all',
 524     'ab' => 'abstract',
 525     'au' => 'author',
 526     'lcn' => 'local-classification',
 527     'callnum' => 'local-classification',
 528     'record-type' => 'rtype',
 529     'mc-rtype' => 'rtype',
 530     'mus' => 'rtype',
 531     'lc-card' => 'lc-card-number',
 532     'sn' => 'local-number',
 533     'yr' => 'date-of-publication',
 534     'pubdate' => 'date-of-publication',
 535     'acqdate' => 'date-of-acquisition',
 536     'date/time-last-modified' => 'date-time-last-modified',
 537     'dtlm' => 'date/time-last-modified',
 538     'diss' => 'dissertation-information',
 539     'nb' => 'isbn',
 540     'ns' => 'issn',
 541     'music-number' => 'identifier-publisher-for-music',
 542     'number-music-publisher' => 'identifier-publisher-for-music',
 543     'music' => 'identifier-publisher-for-music',
 544     'ident' => 'identifier-standard',
 545     'cpn' => 'corporate-name',
 546     'cfn' => 'conference-name',
 547     'pn' => 'personal-name',
 548     'pb' => 'publisher',
 549     'pv' => 'provider',
 550     'nt' => 'note',
 551     'notes' => 'note',
 552     'rcn' => 'record-control-number',
 553     'su' => 'subject',
 554     'su-to' => 'subject',
 555     #'su-geo' => 'subject',
 556     'su-ut' => 'subject',
 557     'ti' => 'title',
 558     'se' => 'title-series',
 559     'ut' => 'title-uniform',
 560     'an' => 'koha-auth-number',
 561     'authority-number' => 'koha-auth-number',
 562     'at' => 'authtype',
 563     'he' => 'heading',
 564     'rank' => 'relevance',
 565     'phr' => 'st-phrase',
 566     'wrdl' => 'st-word-list',
 567     'rt' => 'right-truncation',
 568     'rtrn' => 'right-truncation',
 569     'ltrn' => 'left-truncation',
 570     'rltrn' => 'left-and-right',
 571     'mc-itemtype' => 'itemtype',
 572     'mc-ccode' => 'ccode',
 573     'branch' => 'homebranch',
 574     'mc-loc' => 'location',
 575     'stocknumber' => 'number-local-acquisition',
 576     'inv' => 'number-local-acquisition',
 577     'bc' => 'barcode',
 578     'mc-itype' => 'itype',
 579     'aub' => 'author-personal-bibliography',
 580     'auo' => 'author-in-order',
 581     'ff8-22' => 'ta',
 582     'aud' => 'ta',
 583     'audience' => 'ta',
 584     'frequency-code' => 'ff8-18',
 585     'illustration-code' => 'ff8-18-21',
 586     'regularity-code' => 'ff8-19',
 587     'type-of-serial' => 'ff8-21',
 588     'format' => 'ff8-23',
 589     'conference-code' => 'ff8-29',
 590     'festschrift-indicator' => 'ff8-30',
 591     'index-indicator' => 'ff8-31',
 592     'fiction' => 'lf',
 593     'fic' => 'lf',
 594     'literature-code' => 'lf',
 595     'biography' => 'bio',
 596     'ff8-34' => 'bio',
 597     'biography-code' => 'bio',
 598     'l-format' => 'ff7-01-02',
 599     'lex' => 'lexile-number',
 600     'hi' => 'host-item-number',
 601     'itu' => 'index-term-uncontrolled',
 602     'itg' => 'index-term-genre',
 603 );
  # Regex source fragments, interpolated into larger patterns further down:
  # a field name is a run of word characters and hyphens, and may be followed
  # by any number of dotted parts built from the same characters.
  my $field_name_pattern = '[\w\-]+';
  my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
 606
 607 sub _convert_index_fields {
 608     my ( $self, @indexes ) = @_;
 609
 610     my %index_type_convert =
 611       ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
 612
 613     # Convert according to our table, drop anything that doesn't convert.
 614     # If a field starts with mc- we save it as it's used (and removed) later
 615     # when joining things, to indicate we make it an 'OR' join.
 616     # (Sorry, this got a bit ugly after special cases were found.)
 617     grep { $_->{field} } map {
 618         # Lower case all field names
 619         my ( $f, $t ) = map(lc, split /,/);
 620         my $mc = '';
 621         if ($f =~ /^mc-/) {
 622             $mc = 'mc-';
 623             $f =~ s/^mc-//;
 624         }
 625         my $r = {
 626             field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
 627             type  => $index_type_convert{ $t // '__default' }
 628         };
 629         $r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
 630         $r;
 631     } @indexes;
 632 }
 633
 634 =head2 _convert_index_strings
 635
 636     my @searches = $self->_convert_index_strings(@searches);
 637
 638 Similar to L<_convert_index_fields>, this takes strings of the form
 639 B<field:search term> and rewrites the field from zebra-style to
 640 elasticsearch-style. Anything it doesn't understand is returned verbatim.
 641
 642 =cut
 643
 644 sub _convert_index_strings {
 645     my ( $self, @searches ) = @_;
 646     my @res;
 647     foreach my $s (@searches) {
 648         next if $s eq '';
 649         my ( $field, $term ) = $s =~ /^\s*([\w,-]*?):(.*)/;
 650         unless ( defined($field) && defined($term) ) {
 651             push @res, $s;
 652             next;
 653         }
 654         my ($conv) = $self->_convert_index_fields($field);
 655         unless ( defined($conv) ) {
 656             push @res, $s;
 657             next;
 658         }
 659         push @res, $conv->{field} . ":"
 660           . $self->_modify_string_by_type( %$conv, operand => $term );
 661     }
 662     return @res;
 663 }
 664
 665 =head2 _convert_index_strings_freeform
 666
 667     my $search = $self->_convert_index_strings_freeform($search);
 668
 669 This is similar to L<_convert_index_strings>, however it'll search out the
 670 things to change within the string. So it can handle strings such as
 671 C<(su:foo) AND (su:bar)>, converting the C<su> appropriately.
 672
 673 If there is something of the form "su,complete-subfield" or something, the
 674 second part is stripped off as we can't yet handle that. Making it work
 675 will have to wait for a real query parser.
 676
 677 =cut
 678
  sub _convert_index_strings_freeform {
      my ( $self, $search ) = @_;
      # Rewrites every "field:" prefix found anywhere in $search and returns
      # the modified string.
      #
      # @TODO: Currently this also alters fields contained within quotes:
      # `searching for "stuff sn:123"` for example will become
      # `searching for "stuff local-number:123"`
      #
      # Fixing this is tricky, one possibility:
      # https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
      # Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
      #
      # Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
      # them back when processing is done.

      # Lower case field names. An optional ",modifier" after the field name
      # is dropped; any dotted suffix is kept as written.
      $search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
      # Resolve possible field aliases (single lookup in %index_field_convert;
      # names without an entry are left alone)
      $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
      return $search;
  }
 698
 699 =head2 _modify_string_by_type
 700
 701     my $str = $self->_modify_string_by_type(%index_field);
 702
 703 If you have a search term (operand) and a type (phrase, right-truncated), this
 704 will convert the string to have the function in lucene search terms, e.g.
 705 wrapping quotes around it.
 706
 707 =cut
 708
 709 sub _modify_string_by_type {
 710     my ( $self, %idx ) = @_;
 711
 712     my $type = $idx{type} || '';
 713     my $str = $idx{operand};
 714     return $str unless $str;    # Empty or undef, we can't use it.
 715
 716     $str .= '*' if $type eq 'right-truncate';
 717     $str = '"' . $str . '"' if $type eq 'phrase';
 718     return $str;
 719 }
 720
 721 =head2 _join_queries
 722
 723     my $query_str = $self->_join_queries(@query_parts);
 724
 725 This takes a list of query parts, that might be search terms on their own, or
 726 booleaned together, or specifying fields, or whatever, wraps them in
 727 parentheses, and ANDs them all together. Suitable for feeding to the ES
 728 query string query.
 729
 730 Note: doesn't AND them together if they specify an index that starts with "mc"
 731 as that was a special case in the original code for dealing with multiple
 732 choice options (you can't search for something that has an itype of A and
  733 an itype of B otherwise.)
 734
 735 =cut
 736
 737 sub _join_queries {
 738     my ( $self, @parts ) = @_;
 739
 740     my @norm_parts = grep { defined($_) && $_ ne '' && $_ !~ /^mc-/ } @parts;
 741     my @mc_parts =
 742       map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
 743     return () unless @norm_parts + @mc_parts;
 744     return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
 745     my $grouped_mc =
 746       @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
 747
 748     # Handy trick: $x || () inside a join means that if $x ends up as an
 749     # empty string, it gets replaced with (), which makes join ignore it.
 750     # (bad effect: this'll also happen to '0', this hopefully doesn't matter
 751     # in this case.)
 752     join( ' AND ',
 753         join( ' AND ', map { "($_)" } @norm_parts ) || (),
 754         $grouped_mc || () );
 755 }
 756
 757 =head2 _make_phrases
 758
 759     my @phrased_queries = $self->_make_phrases(@query_parts);
 760
  761 This takes the supplied queries and turns the ones that carry a field prefix
  762 into phrases by wrapping quotes around the part after the prefix, e.g.
  763 C<subject:frogs> becomes C<subject:"frogs">. Other queries are returned unchanged.
 764
 765 =cut
 766
 767 sub _make_phrases {
 768     my ( $self, @parts ) = @_;
 769     map { s/^\s*(\w*?:)(.*)$/$1"$2"/r } @parts;
 770 }
 771
 772 =head2 _create_query_string
 773
 774     my @query_strings = $self->_create_query_string(@queries);
 775
 776 Given a list of hashrefs, it will turn them into a lucene-style query string.
 777 The hash should contain field, type (both for the indexes), operator, and
 778 operand.
 779
 780 =cut
 781
 782 sub _create_query_string {
 783     my ( $self, @queries ) = @_;
 784
 785     map {
 786         my $otor  = $_->{operator} ? $_->{operator} . ' ' : '';
 787         my $field = $_->{field}    ? $_->{field} . ':'    : '';
 788
 789         my $oand = $self->_modify_string_by_type(%$_);
 790         "$otor($field$oand)";
 791     } @queries;
 792 }
 793
 794 =head2 _clean_search_term
 795
 796     my $term = $self->_clean_search_term($term);
 797
 798 This cleans a search term by removing any funny characters that may upset
 799 ES and give us an error. It also calls L<_convert_index_strings_freeform>
 800 to ensure those parts are correct.
 801
 802 =cut
 803
 804 sub _clean_search_term {
 805     my ( $self, $term ) = @_;
 806
 807     # Lookahead for checking if we are inside quotes
 808     my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
 809
 810     # Some hardcoded searches (like with authorities) produce things like
 811     # 'an=123', when it ought to be 'an:123' for our purposes.
 812     $term =~ s/=/:/g;
 813
 814     $term = $self->_convert_index_strings_freeform($term);
 815     $term =~ s/[{}]/"/g;
 816
 817     # Remove unbalanced quotes
 818     my $unquoted = $term;
 819     my $count = ($unquoted =~ tr/"/ /);
 820     if ($count % 2 == 1) {
 821         $term = $unquoted;
 822     }
 823
 824     # Remove unquoted colons that have whitespace on either side of them
 825     $term =~ s/(\:[:\s]+|[:\s]+:)$lookahead//g;
 826
 827     return $term;
 828 }
 829
 830 =head2 _fix_limit_special_cases
 831
 832     my $limits = $self->_fix_limit_special_cases($limits);
 833
 834 This converts any special cases that the limit specifications have into things
 835 that are more readily processable by the rest of the code.
 836
 837 The argument should be an arrayref, and it'll return an arrayref.
 838
 839 =cut
 840
 841 sub _fix_limit_special_cases {
 842     my ( $self, $limits ) = @_;
 843
 844     my @new_lim;
 845     foreach my $l (@$limits) {
 846
 847         # This is set up by opac-search.pl
 848         if ( $l =~ /^yr,st-numeric,ge=/ ) {
 849             my ( $start, $end ) =
 850               ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
 851             next unless defined($start) && defined($end);
 852             push @new_lim, "copydate:[$start TO $end]";
 853         }
 854         elsif ( $l =~ /^yr,st-numeric=/ ) {
 855             my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
 856             next unless defined($date);
 857             push @new_lim, "copydate:$date";
 858         }
 859         elsif ( $l =~ /^available$/ ) {
 860             push @new_lim, 'onloan:0';
 861         }
 862         else {
 863             push @new_lim, $l;
 864         }
 865     }
 866     return \@new_lim;
 867 }
 868
 869 =head2 _sort_field
 870
 871     my $field = $self->_sort_field($field);
 872
 873 Given a field name, this works out what the actual name of the field to sort
 874 on should be. A '__sort' suffix is added for fields with a sort version, and
 875 for text fields either '.phrase' (for sortable versions) or '.raw' is appended
 876 to avoid sorting on a tokenized value.
 877
 878 =cut
 879
 880 sub _sort_field {
 881     my ($self, $f) = @_;
 882
 883     my $mappings = $self->get_elasticsearch_mappings();
 884     my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
 885     if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
 886         $f .= '__sort';
 887         # We need to add '.phrase' to text fields, otherwise it'll sort
 888         # based on the tokenised form.
 889         $f .= '.phrase' if $textField;
 890     } else {
 891         # We need to add '.raw' to text fields without a sort field,
 892         # otherwise it'll sort based on the tokenised form.
 893         $f .= '.raw' if $textField;
 894     }
 895     return $f;
 896 }
 897
 898 =head2 _truncate_terms
 899
 900     my $query = $self->_truncate_terms($query);
 901
  902 Given a string query this function appends a '*' wildcard to all terms except
  903 boolean operators (and, or, not) and terms ending in a non-word character (e.g. double quoted strings).
 904
 905 =cut
 906
 907 sub _truncate_terms {
 908     my ( $self, $query ) = @_;
 909
 910     my @tokens = $self->_split_query( $query );
 911
 912     # Filter out empty tokens
 913     my @words = grep { $_ !~ /^\s*$/ } @tokens;
 914
 915     # Append '*' to words if needed, ie. if it ends in a word character and is not a keyword
 916     my @terms = map {
 917         my $w = $_;
 918         (/\W$/ or grep {lc($w) eq $_} qw/and or not/) ? $_ : "$_*";
 919     } @words;
 920
 921     return join ' ', @terms;
 922 }
 923
 924 =head2 _split_query
 925
 926     my @token = $self->_split_query($query_str);
 927
 928 Given a string query this function splits it to tokens taking into account
 929 any field prefixes and quoted strings.
 930
 931 =cut
 932
 933 my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
 934
 935 sub _split_query {
 936     my ( $self, $query ) = @_;
 937
 938     # '"donald duck" title:"the mouse" and peter" get split into
 939     # ['', '"donald duck"', '', ' ', '', 'title:"the mouse"', '', ' ', 'and', ' ', 'pete']
 940     my @tokens = split $tokenize_split_re, $query;
 941
 942     # Filter out empty values
 943     @tokens = grep( /\S/, @tokens );
 944
 945     return @tokens;
 946 }
 947
 948 1;