Browse Source

Bug 19575: Use canonical field names and resolve aliased fields

Adjust Elasticsearch mappings to more closely match Zebra equivalents,
resolving several issues with coded Zebra searches in templates and
sorting of search results in UI. Also make field names in search strings
case insensitive to accept case variations in template links and user input.

Sponsored-by: Gothenburg University Library

Signed-off-by: Nick Clemens <nick@bywatersolutions.com>
Signed-off-by: Ere Maijala <ere.maijala@helsinki.fi>
Signed-off-by: Martin Renvoize <martin.renvoize@ptfs-europe.com>

Signed-off-by: Nick Clemens <nick@bywatersolutions.com>
tags/v19.05.00
David Gustafsson 3 years ago
committed by root
parent
commit
6d53910f0c
6 changed files with 1782 additions and 659 deletions
  1. +3
    -1
      Koha/SearchEngine/Elasticsearch.pm
  2. +120
    -37
      Koha/SearchEngine/Elasticsearch/QueryBuilder.pm
  3. +7
    -7
      Koha/SearchEngine/Elasticsearch/Search.pm
  4. +1615
    -606
      admin/searchengine/elasticsearch/mappings.yaml
  5. +29
    -0
      installer/data/mysql/atomicupdate/bug_19575-use-canonical-field-names-and-resolve-aliased-fields.sql
  6. +8
    -8
      t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t

+ 3
- 1
Koha/SearchEngine/Elasticsearch.pm View File

@@ -860,7 +860,9 @@ sub _foreach_mapping {

while ( my $search_field = $search_fields->next ) {
$sub->(
$search_field->name,
# Force lower case on indexed field names for case insensitive
# field name searches
lc($search_field->name),
$search_field->type,
$search_field->get_column('facet'),
$search_field->get_column('suggestible'),


+ 120
- 37
Koha/SearchEngine/Elasticsearch/QueryBuilder.pm View File

@@ -111,13 +111,13 @@ sub build_query {
# See _convert_facets in Search.pm for how these get turned into
# things that Koha can use.
$res->{aggregations} = {
author => { terms => { field => "author__facet" } },
subject => { terms => { field => "subject__facet" } },
itype => { terms => { field => "itype__facet" } },
author => { terms => { field => "author__facet" } },
subject => { terms => { field => "subject__facet" } },
itype => { terms => { field => "itype__facet" } },
location => { terms => { field => "location__facet" } },
'su-geo' => { terms => { field => "su-geo__facet" } },
se => { terms => { field => "se__facet" } },
ccode => { terms => { field => "ccode__facet" } },
'title-series' => { terms => { field => "title-series__facet" } },
ccode => { terms => { field => "ccode__facet" } },
};

my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
@@ -411,12 +411,12 @@ appropriate search object.
=cut

our $koha_to_index_name = {
mainmainentry => 'Heading-Main',
mainentry => 'Heading',
match => 'Match',
'match-heading' => 'Match-heading',
'see-from' => 'Match-heading-see-from',
thesaurus => 'Subject-heading-thesaurus',
mainmainentry => 'heading-main',
mainentry => 'heading',
match => 'match',
'match-heading' => 'match-heading',
'see-from' => 'match-heading-see-from',
thesaurus => 'subject-heading-thesaurus',
any => '',
all => ''
};
@@ -430,6 +430,9 @@ sub build_authorities_query_compat {
# extensible hash form that is understood by L<build_authorities_query>.
my @searches;

# Convert to lower case
$marclist = [map(lc, @{$marclist})];

# Make sure everything exists
foreach my $m (@$marclist) {
Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
@@ -447,8 +450,8 @@ sub build_authorities_query_compat {

my %sort;
my $sort_field =
( $orderby =~ /^Heading/ ) ? 'Heading'
: ( $orderby =~ /^Auth/ ) ? 'Local-number'
( $orderby =~ /^heading/ ) ? 'heading'
: ( $orderby =~ /^auth/ ) ? 'local-number'
: undef;
if ($sort_field) {
my $sort_order = ( $orderby =~ /Asc$/ ) ? 'asc' : 'desc';
@@ -479,13 +482,13 @@ sub _convert_sort_fields {

# Turn the sorting into something we care about.
my %sort_field_convert = (
acqdate => 'acqdate',
acqdate => 'date-of-acquisition',
author => 'author',
call_number => 'callnum',
call_number => 'local-classification',
popularity => 'issues',
relevance => undef, # default
title => 'title',
pubdate => 'pubdate',
pubdate => 'date-of-publication',
);
my %sort_order_convert =
( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
@@ -515,23 +518,89 @@ types.
=cut

our %index_field_convert = (
'kw' => '_all',
'ti' => 'title',
'au' => 'author',
'su' => 'subject',
'nb' => 'isbn',
'se' => 'title-series',
'callnum' => 'callnum',
'itype' => 'itype',
'ln' => 'ln',
'branch' => 'homebranch',
'fic' => 'lf',
'mus' => 'rtype',
'aud' => 'ta',
'hi' => 'Host-Item-Number',
'at' => 'authtype',
'he' => 'Heading'
'kw' => '_all',
'ab' => 'abstract',
'au' => 'author',
'lcn' => 'local-classification',
'callnum' => 'local-classification',
'record-type' => 'rtype',
'mc-rtype' => 'rtype',
'mus' => 'rtype',
'lc-card' => 'lc-card-number',
'sn' => 'local-number',
'yr' => 'date-of-publication',
'pubdate' => 'date-of-publication',
'acqdate' => 'date-of-acquisition',
'date/time-last-modified' => 'date-time-last-modified',
'dtlm' => 'date/time-last-modified',
'diss' => 'dissertation-information',
'nb' => 'isbn',
'ns' => 'issn',
'music-number' => 'identifier-publisher-for-music',
'number-music-publisher' => 'identifier-publisher-for-music',
'music' => 'identifier-publisher-for-music',
'ident' => 'identifier-standard',
'cpn' => 'corporate-name',
'cfn' => 'conference-name',
'pn' => 'personal-name',
'pb' => 'publisher',
'pv' => 'provider',
'nt' => 'note',
'notes' => 'note',
'rcn' => 'record-control-number',
'su' => 'subject',
'su-to' => 'subject',
#'su-geo' => 'subject',
'su-ut' => 'subject',
'ti' => 'title',
'se' => 'title-series',
'ut' => 'title-uniform',
'an' => 'authority-number',
'koha-auth-number' => 'authority-number',
'at' => 'authtype',
'he' => 'heading',
'rank' => 'relevance',
'phr' => 'st-phrase',
'wrdl' => 'st-word-list',
'rt' => 'right-truncation',
'rtrn' => 'right-truncation',
'ltrn' => 'left-truncation',
'rltrn' => 'left-and-right',
'mc-itemtype' => 'itemtype',
'mc-ccode' => 'ccode',
'branch' => 'homebranch',
'mc-loc' => 'location',
'stocknumber' => 'number-local-acquisition',
'inv' => 'number-local-acquisition',
'bc' => 'barcode',
'mc-itype' => 'itype',
'aub' => 'author-personal-bibliography',
'auo' => 'author-in-order',
'ff8-22' => 'ta',
'aud' => 'ta',
'audience' => 'ta',
'frequency-code' => 'ff8-18',
'illustration-code' => 'ff8-18-21',
'regularity-code' => 'ff8-19',
'type-of-serial' => 'ff8-21',
'format' => 'ff8-23',
'conference-code' => 'ff8-29',
'festschrift-indicator' => 'ff8-30',
'index-indicator' => 'ff8-31',
'fiction' => 'lf',
'fic' => 'lf',
'literature-code' => 'lf',
'biography' => 'bio',
'ff8-34' => 'bio',
'biography-code' => 'bio',
'l-format' => 'ff7-01-02',
'lex' => 'lexile-number',
'hi' => 'host-item-number',
'itu' => 'index-term-uncontrolled',
'itg' => 'index-term-genre',
);
my $field_name_pattern = '[\w\-]+';
my $multi_field_pattern = "(?:\\.$field_name_pattern)*";

sub _convert_index_fields {
my ( $self, @indexes ) = @_;
@@ -544,14 +613,15 @@ sub _convert_index_fields {
# when joining things, to indicate we make it an 'OR' join.
# (Sorry, this got a bit ugly after special cases were found.)
grep { $_->{field} } map {
my ( $f, $t ) = split /,/;
# Lower case all field names
my ( $f, $t ) = map(lc, split /,/);
my $mc = '';
if ($f =~ /^mc-/) {
$mc = 'mc-';
$f =~ s/^mc-//;
}
my $r = {
field => $index_field_convert{$f},
field => exists $index_field_convert{$f} ? $index_field_convert{$f} : $f,
type => $index_type_convert{ $t // '__default' }
};
$r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
@@ -606,9 +676,21 @@ will have to wait for a real query parser.

sub _convert_index_strings_freeform {
my ( $self, $search ) = @_;
while ( my ( $zeb, $es ) = each %index_field_convert ) {
$search =~ s/\b$zeb(?:,[\w\-]*)?:/$es:/g;
}
# @TODO: Currently this will also alter fields contained within quotes:
# `searching for "stuff cn:123"` for example will become
# `searching for "stuff local-number:123"`
#
# Fixing this is tricky, one possibility:
# https://stackoverflow.com/questions/19193876/perl-regex-to-match-a-string-that-is-not-enclosed-in-quotes
# Still not perfect, and will not handle escaped quotes within quotes and assumes balanced quotes.
#
# Another, not so elegant, solution could be to replace all quoted content with placeholders, and put
# them back when processing is done.

# Lower case field names
$search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
# Resolve possible field aliases
$search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
return $search;
}

@@ -805,6 +887,7 @@ operands and double quoted strings.

=cut

my $tokenize_split_re = qr/((?:${field_name_pattern}${multi_field_pattern}:)?"[^"]+"|\s+)/;
sub _truncate_terms {
my ( $self, $query ) = @_;



+ 7
- 7
Koha/SearchEngine/Elasticsearch/Search.pm View File

@@ -207,7 +207,7 @@ sub search_auth_compat {
# rather than hard-coded conversions.
#my $record = $_[0];
# Handle legacy nested arrays indexed with splitting enabled.
my $authid = $record->{ 'Local-number' }[0];
my $authid = $record->{ 'local-number' }[0];
$authid = @$authid[0] if (ref $authid eq 'ARRAY');

$result{authid} = $authid;
@@ -266,7 +266,7 @@ sub count_auth_use {
query => {
bool => {
# query => { match_all => {} },
filter => { term => { an => $authid } }
filter => { term => { 'authority-number' => $authid } }
}
}
};
@@ -438,13 +438,13 @@ sub _convert_facets {
# things that zebra uses.
# TODO let the library define the order using the interface.
my %type_to_label = (
author => { order => 1, label => 'Authors', },
itype => { order => 2, label => 'ItemTypes', },
author => { order => 1, label => 'Authors', },
itype => { order => 2, label => 'ItemTypes', },
location => { order => 3, label => 'Location', },
'su-geo' => { order => 4, label => 'Places', },
se => { order => 5, label => 'Series', },
subject => { order => 6, label => 'Topics', },
ccode => { order => 7, label => 'CollectionCodes',},
'title-series' => { order => 5, label => 'Series', },
subject => { order => 6, label => 'Topics', },
ccode => { order => 7, label => 'CollectionCodes',},
holdingbranch => { order => 8, label => 'HoldingLibrary' },
homebranch => { order => 9, label => 'HomeLibrary' }
);


+ 1615
- 606
admin/searchengine/elasticsearch/mappings.yaml
File diff suppressed because it is too large
View File


+ 29
- 0
installer/data/mysql/atomicupdate/bug_19575-use-canonical-field-names-and-resolve-aliased-fields.sql View File

@@ -0,0 +1,29 @@
-- Bug 19575: use canonical search field names and resolve aliased fields.
--
-- Step 1: lower-case every stored field name and label.
UPDATE `search_field` SET `name` = LOWER(name), `label` = LOWER(label);

-- Step 2: rename aliased fields to their canonical names. Statements whose
-- WHERE value equals the new name leave the name untouched and only reset
-- the label so that it matches the name.
UPDATE `search_field` SET `name` = 'date-of-publication', `label` = 'date-of-publication' WHERE `name` = 'pubdate';
UPDATE `search_field` SET `name` = 'title-series', `label` = 'title-series' WHERE `name` = 'se';
UPDATE `search_field` SET `name` = 'identifier-standard', `label` = 'identifier-standard' WHERE `name` = 'identifier-standard';
UPDATE `search_field` SET `name` = 'author', `label` = 'author' WHERE `name` = 'author';
UPDATE `search_field` SET `name` = 'control-number', `label` = 'control-number' WHERE `name` = 'control-number';
UPDATE `search_field` SET `name` = 'place-of-publication', `label` = 'place-of-publication' WHERE `name` = 'place';
UPDATE `search_field` SET `name` = 'date-of-acquisition', `label` = 'date-of-acquisition' WHERE `name` = 'acqdate';
UPDATE `search_field` SET `name` = 'isbn', `label` = 'isbn' WHERE `name` = 'isbn';
UPDATE `search_field` SET `name` = 'authority-number', `label` = 'authority-number' WHERE `name` = 'an';
UPDATE `search_field` SET `name` = 'subject', `label` = 'subject' WHERE `name` = 'subject';
UPDATE `search_field` SET `name` = 'publisher', `label` = 'publisher' WHERE `name` = 'publisher';
UPDATE `search_field` SET `name` = 'record-source', `label` = 'record-source' WHERE `name` = 'record-source';
UPDATE `search_field` SET `name` = 'title', `label` = 'title' WHERE `name` = 'title';
UPDATE `search_field` SET `name` = 'local-classification', `label` = 'local-classification' WHERE `name` = 'local-classification';
UPDATE `search_field` SET `name` = 'bib-level', `label` = 'bib-level' WHERE `name` = 'bib-level';
UPDATE `search_field` SET `name` = 'microform-generation', `label` = 'microform-generation' WHERE `name` = 'microform-generation';
UPDATE `search_field` SET `name` = 'material-type', `label` = 'material-type' WHERE `name` = 'material-type';
UPDATE `search_field` SET `name` = 'bgf-number', `label` = 'bgf-number' WHERE `name` = 'bgf-number';
UPDATE `search_field` SET `name` = 'number-db', `label` = 'number-db' WHERE `name` = 'number-db';
UPDATE `search_field` SET `name` = 'number-natl-biblio', `label` = 'number-natl-biblio' WHERE `name` = 'number-natl-biblio';
UPDATE `search_field` SET `name` = 'number-legal-deposit', `label` = 'number-legal-deposit' WHERE `name` = 'number-legal-deposit';
UPDATE `search_field` SET `name` = 'issn', `label` = 'issn' WHERE `name` = 'issn';
UPDATE `search_field` SET `name` = 'local-number', `label` = 'local-number' WHERE `name` = 'local-number';
-- The label previously carried the misspelling supress, it now matches the name.
UPDATE `search_field` SET `name` = 'suppress', `label` = 'suppress' WHERE `name` = 'suppress';
UPDATE `search_field` SET `name` = 'bnb-card-number', `label` = 'bnb-card-number' WHERE `name` = 'bnb-card-number';
-- NOTE(review): this renames the hyphenated form to the slash form, while the
-- alias table in QueryBuilder.pm maps the slash form to the hyphenated one.
-- TODO confirm which spelling is meant to be canonical.
UPDATE `search_field` SET `name` = 'date/time-last-modified', `label` = 'date/time-last-modified' WHERE `name` = 'date-time-last-modified';

-- Step 3: drop the lc-cardnumber field, then remove every MARC map row that
-- is not referenced by any search_marc_to_field row.
DELETE FROM `search_field` WHERE `name` = 'lc-cardnumber';
DELETE FROM `search_marc_map` WHERE `id` NOT IN(SELECT `search_marc_map_id` FROM `search_marc_to_field`);

+ 8
- 8
t/db_dependent/Koha/SearchEngine/Elasticsearch/QueryBuilder.t View File

@@ -288,29 +288,29 @@ subtest 'build_query tests' => sub {
( undef, $query ) = $qb->build_query_compat( undef, ['Local-number:"123456"'] );
is(
$query->{query}{query_string}{query},
'(Local-number:"123456")',
"query of specific field including hyphen and quoted is not truncated"
'(local-number:"123456")',
"query of specific field including hyphen and quoted is not truncated, field name is converted to lower case"
);

( undef, $query ) = $qb->build_query_compat( undef, ['Local-number:123456'] );
is(
$query->{query}{query_string}{query},
'(Local-number:123456*)',
"query of specific field including hyphen and not quoted is truncated"
'(local-number:123456*)',
"query of specific field including hyphen and not quoted is truncated, field name is converted to lower case"
);

( undef, $query ) = $qb->build_query_compat( undef, ['Local-number.raw:123456'] );
is(
$query->{query}{query_string}{query},
'(Local-number.raw:123456*)',
"query of specific field including period and not quoted is truncated"
'(local-number.raw:123456*)',
"query of specific field including period and not quoted is truncated, field name is converted to lower case"
);

( undef, $query ) = $qb->build_query_compat( undef, ['Local-number.raw:"123456"'] );
is(
$query->{query}{query_string}{query},
'(Local-number.raw:"123456")',
"query of specific field including period and quoted is not truncated"
'(local-number.raw:"123456")',
"query of specific field including period and quoted is not truncated, field name is converted to lower case"
);

( undef, $query ) = $qb->build_query_compat( undef, ['J.R.R'] );


Loading…
Cancel
Save