From be2bdfd5113d00699c581e897d1f764158a860b0 Mon Sep 17 00:00:00 2001 From: Ere Maijala Date: Fri, 14 Sep 2018 06:40:24 +0300 Subject: [PATCH] Bug 19365: Fix several issues with the Elasticsearch code Also optimize it so it's actually usable. Test plan: 1. To test it properly you need biblio and authority data. You might get away with enabling AutoCreateAuthorities and BiblioAddsAuthorities so that authorities are created in the process. Another option would be to import authorities first e.g. from LoC. 2. Make sure the authority index has been properly created with "misc/search_tools/rebuild_elastic_search.pl -a -d" 3. Run "misc/link_bibs_to_authorities.pl -v -l" twice and observe the results. Sponsored-by: National Library of Finland Signed-off-by: Brendan Gallagher Signed-off-by: Martin Renvoize Signed-off-by: Nick Clemens --- C4/Heading.pm | 20 ++++- C4/Matcher.pm | 12 ++- Koha/SearchEngine/Elasticsearch/Indexer.pm | 3 +- .../Elasticsearch/QueryBuilder.pm | 52 +++++++------ Koha/SearchEngine/Elasticsearch/Search.pm | 77 ++++++++++--------- Koha/SearchEngine/Zebra/Search.pm | 4 +- misc/link_bibs_to_authorities.pl | 15 ++-- 7 files changed, 109 insertions(+), 74 deletions(-) diff --git a/C4/Heading.pm b/C4/Heading.pm index e323b5bcd2..193be162b4 100644 --- a/C4/Heading.pm +++ b/C4/Heading.pm @@ -200,12 +200,24 @@ sub _search { # push @operator, 'is'; # push @value, $self->{'thesaurus'}; # } - require C4::AuthoritiesMarc; - return C4::AuthoritiesMarc::SearchAuthorities( + + require Koha::SearchEngine::QueryBuilder; + require Koha::SearchEngine::Search; + + # Use state variables to avoid recreating the objects every time. + # With Elasticsearch this also avoids creating a massive amount of + # ES connectors that would eventually run out of file descriptors. + state $builder = Koha::SearchEngine::QueryBuilder->new( + { index => $Koha::SearchEngine::AUTHORITIES_INDEX } ); + state $searcher = Koha::SearchEngine::Search->new( + {index => $Koha::SearchEngine::AUTHORITIES_INDEX} ); + + my $search_query = $builder->build_authorities_query_compat( \@marclist, \@and_or, \@excluding, \@operator, - \@value, 0, 20, $self->{'auth_type'}, - 'AuthidAsc', $skipmetadata + \@value, $self->{'auth_type'}, + 'AuthidAsc' ); + return $searcher->search_auth_compat( $search_query, 0, 20, $skipmetadata ); } =head1 INTERNAL FUNCTIONS diff --git a/C4/Matcher.pm b/C4/Matcher.pm index d19dd64b2f..dc848048c1 100644 --- a/C4/Matcher.pm +++ b/C4/Matcher.pm @@ -664,7 +664,10 @@ sub get_matches { #NOTE: double-quote the values so you don't get a "Embedded truncation not supported" error when a term has a ? in it. } - my $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX}); + # Use state variables to avoid recreating the objects every time. + # With Elasticsearch this also avoids creating a massive amount of + # ES connectors that would eventually run out of file descriptors. + state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX}); ( $error, $searchresults, $total_hits ) = $searcher->simple_search_compat( $query, 0, $max_matches, undef, skip_normalize => 1 ); @@ -703,8 +706,11 @@ sub get_matches { push @operator, 'exact'; push @value, $key; } - my $builder = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX}); - my $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX}); + # Use state variables to avoid recreating the objects every time. + # With Elasticsearch this also avoids creating a massive amount of + # ES connectors that would eventually run out of file descriptors. + state $builder = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX}); + state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX}); my $search_query = $builder->build_authorities_query_compat( \@marclist, \@and_or, \@excluding, \@operator, \@value, undef, 'AuthidAsc' diff --git a/Koha/SearchEngine/Elasticsearch/Indexer.pm b/Koha/SearchEngine/Elasticsearch/Indexer.pm index 4774d78019..703f39f3ca 100644 --- a/Koha/SearchEngine/Elasticsearch/Indexer.pm +++ b/Koha/SearchEngine/Elasticsearch/Indexer.pm @@ -398,7 +398,8 @@ sub _sanitise_records { # tears in rain... if ( $rec ) { $rec->delete_fields($rec->field('999')); - $rec->append_fields(MARC::Field->new('999','','','c' => $bibnum, 'd' => $bibnum)); + # Make sure biblionumber is a string. Elasticsearch would consider int and string different IDs. + $rec->append_fields(MARC::Field->new('999','','','c' => "" . $bibnum, 'd' => "" . $bibnum)); } } } diff --git a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm index 57836e9d8c..254fba2c9e 100644 --- a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm +++ b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm @@ -103,10 +103,8 @@ sub build_query { if $d && ( $d ne 'asc' && $d ne 'desc' ); $d = 'asc' unless $d; - # TODO account for fields that don't have a 'phrase' type - $f = $self->_sort_field($f); - push @{ $res->{sort} }, { "$f.phrase" => { order => $d } }; + push @{ $res->{sort} }, { $f => { order => $d } }; } } @@ -172,7 +170,7 @@ sub build_browse_query { } } }, - sort => [ { "$sort.phrase" => { order => "asc" } } ], + sort => [ { $sort => { order => "asc" } } ], }; } @@ -301,22 +299,18 @@ sub build_authorities_query { foreach my $s ( @{ $search->{searches} } ) { my ( $wh, $op, $val ) = @{$s}{qw(where operator value)}; $wh = '_all' if $wh eq ''; - if ( $op eq 'is' || $op eq '=' ) { + if ( $op eq 'is' || $op eq '=' || $op eq 'exact' ) { # look for something that matches a term completely # note, '=' is about numerical vals. May need special handling. # Also, we lowercase our search because the ES # index lowercases its values, and term searches don't get the # search analyzer applied to them. - push @query_parts, { term => {"$wh.phrase" => lc $val} }; - } - elsif ( $op eq 'exact' ) { - # left and right truncation, otherwise an exact phrase push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} }; } elsif ( $op eq 'start' ) { # startswith search, uses lowercase untokenized version of heading - push @query_parts, { prefix => {"$wh.lc_raw" => lc $val} }; + push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} }; } else { # regular wordlist stuff @@ -332,19 +326,17 @@ sub build_authorities_query { # 'should' behaves like 'or' # 'must' behaves like 'and' # Zebra results seem to match must so using that here - my $query = { query=> + my $query = { query => { bool => { must => \@query_parts } } }; - # We need to add '.phrase' to all the sort headings otherwise it'll sort - # based on the tokenised form. my %s; if ( exists $search->{sort} ) { foreach my $k ( keys %{ $search->{sort} } ) { my $f = $self->_sort_field($k); - $s{"$f.phrase"} = $search->{sort}{$k}; + $s{$f} = $search->{sort}{$k}; } $search->{sort} = \%s; } @@ -385,9 +377,9 @@ Also ignored. =item operator -What form of search to do. Options are: is (phrase, no trunction, whole field -must match), = (number exact match), exact (phrase, but with left and right -truncation). If left blank, then word list, right truncted, anywhere is used. +What form of search to do. Options are: is (phrase, no truncation, whole field +must match), = (number exact match), exact (phrase, no truncation, whole field +must match). If left blank, then word list, right truncated, anywhere is used. =item value @@ -419,7 +411,8 @@ our $koha_to_index_name = { 'match-heading' => 'Match-heading', 'see-from' => 'Match-heading-see-from', thesaurus => 'Subject-heading-thesaurus', - all => '' + any => '', + all => '' }; sub build_authorities_query_compat { @@ -448,8 +441,8 @@ sub build_authorities_query_compat { my %sort; my $sort_field = - ( $orderby =~ /^Heading/ ) ? 'Heading__sort' - : ( $orderby =~ /^Auth/ ) ? 'Local-Number' + ( $orderby =~ /^Heading/ ) ? 'Heading' + : ( $orderby =~ /^Auth/ ) ? 'Local-number' : undef; if ($sort_field) { my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc'; @@ -773,16 +766,27 @@ sub _fix_limit_special_cases { my $field = $self->_sort_field($field); -Given a field name, this works out what the actual name of the version to sort -on should be. Often it's the same, sometimes it involves sticking "__sort" on -the end. Maybe it'll be something else in the future, who knows? +Given a field name, this works out what the actual name of the field to sort +on should be. A '__sort' suffix is added for fields with a sort version, and +for text fields either '.phrase' (for sortable versions) or '.raw' is appended +to avoid sorting on a tokenized value. =cut sub _sort_field { my ($self, $f) = @_; - if ($self->sort_fields()->{$f}) { + + my $mappings = $self->get_elasticsearch_mappings(); + my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text'; + if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) { $f .= '__sort'; + # We need to add '.phrase' to text fields, otherwise it'll sort + # based on the tokenised form. + $f .= '.phrase' if $textField; + } else { + # We need to add '.raw' to text fields without a sort field, + # otherwise it'll sort based on the tokenised form. + $f .= '.raw' if $textField; } return $f; } diff --git a/Koha/SearchEngine/Elasticsearch/Search.pm b/Koha/SearchEngine/Elasticsearch/Search.pm index 6917f197df..b7f139f1c1 100644 --- a/Koha/SearchEngine/Elasticsearch/Search.pm +++ b/Koha/SearchEngine/Elasticsearch/Search.pm @@ -84,23 +84,21 @@ sub search { my ($self, $query, $page, $count, %options) = @_; my $params = $self->get_elasticsearch_params(); - my %paging; # 20 is the default number of results per page - $paging{limit} = $count || 20; - # ES/Catmandu doesn't want pages, it wants a record to start from. + $query->{size} = $count || 20; + # ES doesn't want pages, it wants a record to start from. if (exists $options{offset}) { - $paging{start} = $options{offset}; + $query->{from} = $options{offset}; } else { $page = (!defined($page) || ($page <= 0)) ? 0 : $page - 1; - $paging{start} = $page * $paging{limit}; + $query->{from} = $page * $query->{size}; } - $self->store( - Catmandu::Store::ElasticSearch->new( - %$params, - ) - ) unless $self->store; + my $elasticsearch = $self->get_elasticsearch(); my $results = eval { - $self->store->bag->search( %$query, %paging ); + $elasticsearch->search( + index => $params->{index_name}, + body => $query + ); }; if ($@) { die $self->process_error($@); @@ -163,13 +161,15 @@ sub search_compat { # opac-search expects results to be put in the # right place in the array, according to $offset my $index = $offset; - $results->each(sub { - $records[$index++] = $self->decode_record_from_result(@_); - }); + my $hits = $results->{'hits'}; + foreach my $es_record (@{$hits->{'hits'}}) { + $records[$index++] = $self->decode_record_from_result($es_record->{'_source'}); + } + # consumers of this expect a name-spaced result, we provide the default # configuration. my %result; - $result{biblioserver}{hits} = $results->total; + $result{biblioserver}{hits} = $hits->{'total'}; $result{biblioserver}{RECORDS} = \@records; return (undef, \%result, $self->_convert_facets($results->{aggregations}, $expanded_facet)); } @@ -177,7 +177,7 @@ sub search_compat { =head2 search_auth_compat my ( $results, $total ) = - $searcher->search_auth_compat( $query, $page, $count, %options ); + $searcher->search_auth_compat( $query, $offset, $count, $skipmetadata, %options ); This has a similar calling convention to L, however it returns its results in a form the same as L. @@ -185,32 +185,39 @@ results in a form the same as L. =cut sub search_auth_compat { - my $self = shift; + my ($self, $query, $offset, $count, $skipmetadata, %options) = @_; - # TODO handle paging + if ( !defined $offset or $offset <= 0 ) { + $offset = 1; + } + # Uh, authority search uses 1-based offset.. + $options{offset} = $offset - 1; my $database = Koha::Database->new(); my $schema = $database->schema(); - my $res = $self->search(@_); + my $res = $self->search($query, undef, $count, %options); + my $bib_searcher = Koha::SearchEngine::Elasticsearch::Search->new({index => 'biblios'}); my @records; - $res->each( - sub { - my %result; + my $hits = $res->{'hits'}; + foreach my $es_record (@{$hits->{'hits'}}) { + my $record = $es_record->{'_source'}; + my %result; - # I wonder if these should be real values defined in the mapping - # rather than hard-coded conversions. - my $record = $_[0]; - # Handle legacy nested arrays indexed with splitting enabled. - my $authid = $record->{ 'Local-number' }[0]; - $authid = @$authid[0] if (ref $authid eq 'ARRAY'); + # I wonder if these should be real values defined in the mapping + # rather than hard-coded conversions. + #my $record = $_[0]; + # Handle legacy nested arrays indexed with splitting enabled. + my $authid = $record->{ 'Local-number' }[0]; + $authid = @$authid[0] if (ref $authid eq 'ARRAY'); - $result{authid} = $authid; + $result{authid} = $authid; + if (!defined $skipmetadata || !$skipmetadata) { # TODO put all this info into the record at index time so we # don't have to go and sort it all out now. my $authtypecode = $record->{authtype}; my $rs = $schema->resultset('AuthType') - ->search( { authtypecode => $authtypecode } ); + ->search( { authtypecode => $authtypecode } ); # FIXME there's an assumption here that we will get a result. # the original code also makes an assumption that some provided @@ -219,7 +226,7 @@ sub search_auth_compat { # it's not reproduced here yet. my $authtype = $rs->single; my $auth_tag_to_report = $authtype ? $authtype->auth_tag_to_report : ""; - my $marc = $self->decode_record_from_result(@_); + my $marc = $self->decode_record_from_result($record); my $mainentry = $marc->field($auth_tag_to_report); my $reported_tag; if ($mainentry) { @@ -233,13 +240,13 @@ sub search_auth_compat { # Reimplementing BuildSummary is out of scope because it'll be hard $result{summary} = - C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid}, + C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid}, $authtypecode ); $result{used} = $self->count_auth_use($bib_searcher, $authid); - push @records, \%result; } - ); - return ( \@records, $res->total ); + push @records, \%result; + } + return ( \@records, $hits->{'total'} ); } =head2 count_auth_use diff --git a/Koha/SearchEngine/Zebra/Search.pm b/Koha/SearchEngine/Zebra/Search.pm index 576b9f59c2..850469ec5d 100644 --- a/Koha/SearchEngine/Zebra/Search.pm +++ b/Koha/SearchEngine/Zebra/Search.pm @@ -100,12 +100,12 @@ This passes the search query on to C4::AuthoritiesMarc::SearchAuthorities =cut sub search_auth_compat { - my ( $self, $q, $startfrom, $resperpage ) = @_; + my ( $self, $q, $startfrom, $resperpage, $skipmetadata ) = @_; my @params = ( @{$q}{ 'marclist', 'and_or', 'excluding', 'operator', 'value' }, $startfrom - 1, - $resperpage, @{$q}{ 'authtypecode', 'orderby' } + $resperpage, @{$q}{ 'authtypecode', 'orderby' }, $skipmetadata ); C4::AuthoritiesMarc::SearchAuthorities(@params); } diff --git a/misc/link_bibs_to_authorities.pl b/misc/link_bibs_to_authorities.pl index b139a7666b..2711d16537 100755 --- a/misc/link_bibs_to_authorities.pl +++ b/misc/link_bibs_to_authorities.pl @@ -195,9 +195,10 @@ sub process_bib { return; } + my $frameworkcode = GetFrameworkCode($biblionumber); + my ( $headings_changed, $results ) = - LinkBibHeadingsToAuthorities( $linker, $bib, - GetFrameworkCode($biblionumber) ); + LinkBibHeadingsToAuthorities( $linker, $bib, $frameworkcode ); foreach my $key ( keys %{ $results->{'unlinked'} } ) { $unlinked_headings{$key} += $results->{'unlinked'}->{$key}; } @@ -211,11 +212,15 @@ sub process_bib { if ($headings_changed) { if ($verbose) { my $title = substr( $bib->title, 0, 20 ); - print -"Bib $biblionumber ($title): $headings_changed headings changed\n"; + printf( + "Bib %12d (%-20s): %3d headings changed\n", + $biblionumber, + $title, + $headings_changed + ); } if ( not $test_only ) { - ModBiblio( $bib, $biblionumber, GetFrameworkCode($biblionumber) ); + ModBiblio( $bib, $biblionumber, $frameworkcode ); $num_bibs_modified++; } } -- 2.39.5