From 5a47503aca5ef58fac90dafa0de8ae26a25c22da Mon Sep 17 00:00:00 2001 From: Janusz Kaczmarek Date: Tue, 24 Jan 2023 10:45:39 +0100 Subject: [PATCH] Bug 32707: ElasticSearch should not auto truncate (even if QueryAutoTruncate = 1) for identifiers (and some other fields) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Koha with Zebra prevented auto truncation in some circumstances -- see the first return for ccl inside C4::Search:: buildQuery, before applying auto truncation, and setting $auto_truncation = 0 for some search fields. Koha with ElasticSearch applies auto truncation for all search fields, not paying attention to these special cases when it should not be done. This leads to various problems as described in bug 26508, 26608, etc. The solution would be to prevent auto truncation for certain search fields, above all – the identifiers. In addition, under no circumstances should a search field other than of text type be truncated (an attempt to truncate would generate an exception from ElasticSearch, e.g. number_format_exception for integer). Test plan ========= 0. Use the test data sample provided in the bug report. Comment 16 in the ticket. Scenario A (authority) ---------------------- 1. Enable Elasticsearch engine. 2. Have like 100+ bibliographic records and properly linked authority records. 3. Reindex ElasticSearch if needed. 4. Enable QueryAutoTruncate systempreference. 5. Find the authority record #1 and note the number of linked biblio records. 6. Show the detail of authority #1 and compare the number of linked biblio records. 7. If in the database there are authority records with ids =~ /^1/ (i.e. 10, 11, 12, ..., 100, 101, ...) linked to the biblio records you get two different numbers of linked records. 8. Also, as lists of linked biblio records (via link: Used in N record(s) from results view and detail view) you will get two different sets of biblio records. In particular, on the list generated from detail view (authorities/detail.pl?authid=1) you will get biblio records that are in fact not linked to the auth #1 (the list is generated with &q=an:1). 8.99. Skip to scenario B and come back here after finishing to not have to unapply the patches and restart services for nothing. 9. Apply this patch. 10. Counts and list of linked biblios should be ok. Scenario B (analytics) ---------------------- 1. Enable Elasticsearch engine. 2. Have three monographic bibliographic records with 001 = 1, 10, 100 (i.e. =~ /^1/). 3. Have an analytical record with 773 $w = 1 (in the test data set - biblio 896). 4. Enable QueryAutoTruncate and UseControlNumber systempreference. 6. Find the analytical record and click on the link generated from 773, i.e. (In: ... --> catalogue/search.pl?q=Control-number:1). You should see a list of 100+ records (001 = 1, 10, 100) instead of one (001 = 1). 7. From the biblio # 1 try to go to the analytice records (with Show analytics link) - you should get 60+ records with from different host records (773 $t) -- instead of one. 8. Apply this patch. Make sure that control-number and record-control-number are included in ESPreventAutoTruncate syspref. 9. Repeat steps 6 and 7. You should arrive at the right host record in p. 6 and at one analytical records when looking for analytical records in p. 7 (or more than one, but right, if you modified the test data set). Signed-off-by: Victor Grousset/tuxayo Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer (cherry picked from commit 4a0713c80b022b6e71872072192ecf2397a28fed) Signed-off-by: Fridolin Somers --- .../Elasticsearch/QueryBuilder.pm | 71 ++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm index 35ab4b7f91..edde18cc09 100644 --- a/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm +++ b/Koha/SearchEngine/Elasticsearch/QueryBuilder.pm @@ -294,7 +294,8 @@ sub build_query_compat { while ( my ( $oand, $otor, $index ) = $ea->() ) { next if ( !defined($oand) || $oand eq '' ); $oand = $self->clean_search_term($oand); - $oand = $self->_truncate_terms($oand) if ($truncate); + $oand = $self->_truncate_terms($oand) + if $truncate && $self->_is_safe_to_auto_truncate( $index->{field}, $oand ); push @search_params, { operand => $oand, # the search terms operator => defined($otor) ? uc $otor : undef, # AND and so on @@ -1338,4 +1339,72 @@ sub _search_fields { } } + +=pod + +=head2 _is_safe_to_auto_truncate + +_is_safe_to_auto_truncate($index_field, $oand); + +Checks if it is safe to auto truncate a search term within a given search field. + +The search term should not be auto truncated when searching for identifiers, e.g. +koha-auth-number, record-control-number, local-number etc. Also, non-text fields +must not be auto truncated (doing so would generate ES exception). + +=cut + +sub _is_safe_to_auto_truncate { + my ( $self, $index_field, $oand ) = @_; + + # Do not auto truncate fields that should not be auto truncated, + # primarily various types of identifiers, above all record identifiers. + # Other search fields that should not be auto truncated can be defined + # with ESPreventAutoTruncate syspref. + my %do_not_autotruncate_fields; + my $cache = Koha::Caches->get_instance(); + my $cache_key = 'elasticsearch_search_do_not_autotruncate'; + my $do_not_autotruncate_fields_ref = $cache->get_from_cache( $cache_key, { unsafe => 1 } ); + %do_not_autotruncate_fields = %$do_not_autotruncate_fields_ref if $do_not_autotruncate_fields_ref; + if ( !scalar( keys %do_not_autotruncate_fields ) ) { + %do_not_autotruncate_fields = + map { $_ => 1 } qw / biblioitemnumber host-item-number itemnumber koha-auth-number local-number /; + + # In addition, under no circumstances should non-text fields + # be auto truncated. + my $schema = Koha::Database->new()->schema(); + my $sfs = + $schema->resultset('SearchField') + ->search( + { '-or' => [ { type => 'boolean' }, { type => 'number' }, { type => 'sum' }, { type => 'year' } ] } ); + while ( my $sf = $sfs->next ) { + $do_not_autotruncate_fields{ $sf->name } = 1; + } + $cache->set_in_cache( $cache_key, \%do_not_autotruncate_fields ); + } + + # processing of the syspref is done outside cache since the systempreference + # can be modified and the modification should be reflected in the + # $do_not_autotruncate_fields array + my $prevent_autotruncate = C4::Context->preference('ESPreventAutoTruncate'); + for my $field ( split( /\s*[,;|]\s*/, $prevent_autotruncate ) ) { + $do_not_autotruncate_fields{$field} = 1; + } + + # search fields can be given as a explicit index name (e.g. from advanced + # search): + if ($index_field) { + return 0 if grep { $index_field eq $_ } keys %do_not_autotruncate_fields; + + # OR can be given implicitly, as prefix in the operand (e.g. in links generated + # by Koha like catalogue/search.pl?q=an:): + } elsif ( $oand =~ /\b($field_name_pattern):/ ) { # check field name prefixing operand + my $field_name = $1; + return 0 if grep { $field_name eq $_ } keys %do_not_autotruncate_fields; + } + + # It is safe to auto truncate: + return 1; +} + 1; -- 2.39.5