Browse Source

Bug 22258: Elasticsearch: Add array as an alternative MARC format

Adds preference ElasticsearchMARCFormat that controls whether MARC records are stored as ISO2709/MARCXML or array. Array is searchable by field and also indexes all subfields in the _all field for searching.

Test plan:
1. Test that searching and indexing works with the patch without any changes.
2. Switch to array format and index some records.
3. Check e.g. the 008 field of a record and verify that the record can be found with the contents enclosed in quotes.
4. Check that it's possible to search for a specific field/subfield. Search query: marc_data_array.fields.655.subfields.a:Diaries
5. Check that tests still pass, especially t/Koha/SearchEngine/Elasticsearch.t

Signed-off-by: Michal Denar <black23@gmail.com>
Signed-off-by: Nick Clemens <nick@bywatersolutions.com>
Signed-off-by: Martin Renvoize <martin.renvoize@ptfs-europe.com>
remotes/origin/19.11.x
Ere Maijala 3 years ago
committed by Martin Renvoize
parent
commit
4f42ea51d7
Signed by: martin.renvoize GPG Key ID: 422B469130441A0F
  1. 132
      Koha/SearchEngine/Elasticsearch.pm
  2. 3
      Koha/SearchEngine/Elasticsearch/Search.pm
  3. 3
      admin/searchengine/elasticsearch/field_config.yaml
  4. 11
      installer/data/mysql/atomicupdate/bug_22258.perl
  5. 1
      installer/data/mysql/sysprefs.sql
  6. 9
      koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/admin.pref
  7. 82
      t/Koha/SearchEngine/Elasticsearch.t

132
Koha/SearchEngine/Elasticsearch.pm

@ -408,6 +408,7 @@ sub marc_records_to_documents {
my $control_fields_rules = $rules->{control_fields};
my $data_fields_rules = $rules->{data_fields};
my $marcflavour = lc C4::Context->preference('marcflavour');
my $use_array = C4::Context->preference('ElasticsearchMARCFormat') eq 'ARRAY';
my @record_documents;
@ -527,26 +528,31 @@ sub marc_records_to_documents {
# TODO: Perhaps should check if $records_document non empty, but really should never be the case
$record->encoding('UTF-8');
my @warnings;
{
# Temporarily intercept all warn signals (MARC::Record carps when record length > 99999)
local $SIG{__WARN__} = sub {
push @warnings, $_[0];
};
$record_document->{'marc_data'} = encode_base64(encode('UTF-8', $record->as_usmarc()));
}
if (@warnings) {
# Suppress warnings if record length exceeded
unless (substr($record->leader(), 0, 5) eq '99999') {
foreach my $warning (@warnings) {
carp $warning;
if ($use_array) {
$record_document->{'marc_data_array'} = $self->_marc_to_array($record);
$record_document->{'marc_format'} = 'ARRAY';
} else {
my @warnings;
{
# Temporarily intercept all warn signals (MARC::Record carps when record length > 99999)
local $SIG{__WARN__} = sub {
push @warnings, $_[0];
};
$record_document->{'marc_data'} = encode_base64(encode('UTF-8', $record->as_usmarc()));
}
if (@warnings) {
# Suppress warnings if record length exceeded
unless (substr($record->leader(), 0, 5) eq '99999') {
foreach my $warning (@warnings) {
carp $warning;
}
}
$record_document->{'marc_data'} = $record->as_xml_record($marcflavour);
$record_document->{'marc_format'} = 'MARCXML';
}
else {
$record_document->{'marc_format'} = 'base64ISO2709';
}
$record_document->{'marc_data'} = $record->as_xml_record($marcflavour);
$record_document->{'marc_format'} = 'MARCXML';
}
else {
$record_document->{'marc_format'} = 'base64ISO2709';
}
my $id = $record->subfield('999', 'c');
push @record_documents, [$id, $record_document];
@ -554,6 +560,96 @@ sub marc_records_to_documents {
return \@record_documents;
}
=head2 _marc_to_array($record)
my @fields = _marc_to_array($record)
Convert a MARC::Record to an array modeled after MARC-in-JSON
(see https://github.com/marc4j/marc4j/wiki/MARC-in-JSON-Description)
=over 4
=item C<$record>
A MARC::Record object
=back
=cut
sub _marc_to_array {
my ($self, $record) = @_;
my $data = {
leader => $record->leader(),
fields => []
};
for my $field ($record->fields()) {
my $tag = $field->tag();
if ($field->is_control_field()) {
push @{$data->{fields}}, {$tag => $field->data()};
} else {
my $subfields = ();
foreach my $subfield ($field->subfields()) {
my ($code, $contents) = @{$subfield};
push @{$subfields}, {$code => $contents};
}
push @{$data->{fields}}, {
$tag => {
ind1 => $field->indicator(1),
ind2 => $field->indicator(2),
subfields => $subfields
}
};
}
}
return $data;
}
=head2 _array_to_marc($data)
my $record = _array_to_marc($data)
Convert an array modeled after MARC-in-JSON to a MARC::Record
=over 4
=item C<$data>
An array modeled after MARC-in-JSON
(see https://github.com/marc4j/marc4j/wiki/MARC-in-JSON-Description)
=back
=cut
sub _array_to_marc {
my ($self, $data) = @_;
my $record = MARC::Record->new();
$record->leader($data->{leader});
for my $field (@{$data->{fields}}) {
my $tag = (keys %{$field})[0];
$field = $field->{$tag};
my $marc_field;
if (ref($field) eq 'HASH') {
my @subfields;
foreach my $subfield (@{$field->{subfields}}) {
my $code = (keys %{$subfield})[0];
push @subfields, $code;
push @subfields, $subfield->{$code};
}
$marc_field = MARC::Field->new($tag, $field->{ind1}, $field->{ind2}, @subfields);
} else {
$marc_field = MARC::Field->new($tag, $field)
}
$record->append_fields($marc_field);
}
;
return $record;
}
=head2 _field_mappings($facet, $suggestible, $sort, $target_name, $target_type, $range)
my @mappings = _field_mappings($facet, $suggestible, $sort, $target_name, $target_type, $range)

3
Koha/SearchEngine/Elasticsearch/Search.pm

@ -382,6 +382,9 @@ sub decode_record_from_result {
elsif ($result->{marc_format} eq 'MARCXML') {
return MARC::Record->new_from_xml($result->{marc_data}, 'UTF-8', uc C4::Context->preference('marcflavour'));
}
elsif ($result->{marc_format} eq 'ARRAY') {
return $self->_array_to_marc($result->{marc_data_array});
}
else {
Koha::Exceptions::Elasticsearch->throw("Missing marc_format field in Elasticsearch result");
}

3
admin/searchengine/elasticsearch/field_config.yaml

@ -10,6 +10,9 @@ general:
type: text
analyzer: keyword
index: false
marc_data_array:
type: object
dynamic: true
marc_format:
store: true
type: text

11
installer/data/mysql/atomicupdate/bug_22258.perl

@ -0,0 +1,11 @@
$DBversion = 'XXX'; # will be replaced by the RM
if( CheckVersion( $DBversion ) ) {
$dbh->do(q{
INSERT IGNORE INTO `systempreferences` (`variable`,`value`,`explanation`,`options`,`type`) VALUES
('ElasticsearchMARCFormat', 'ISO2709', 'ISO2709|ARRAY', 'Elasticsearch MARC format. ISO2709 format is recommended as it is faster and takes less space, whereas array is searchable.', 'Choice')
});
# Always end with this (adjust the bug info)
SetVersion( $DBversion );
print "Upgrade to $DBversion done (Bug 22258 - Add ElasticsearchMARCFormat preference)\n";
}

1
installer/data/mysql/sysprefs.sql

@ -159,6 +159,7 @@ INSERT INTO systempreferences ( `variable`, `value`, `options`, `explanation`, `
('EasyAnalyticalRecords','0','','If on, display in the catalogue screens tools to easily setup analytical record relationships','YesNo'),
('ElasticsearchIndexStatus_authorities', '0', 'Authorities index status', NULL, NULL),
('ElasticsearchIndexStatus_biblios', '0', 'Biblios index status', NULL, NULL),
('ElasticsearchMARCFormat', 'ISO2709', 'ISO2709|ARRAY', 'Elasticsearch MARC format. ISO2709 format is recommended as it is faster and takes less space, whereas array is searchable.', 'Choice'),
('EmailAddressForSuggestions','','',' If you choose EmailAddressForSuggestions you have to enter a valid email address: ','free'),
('emailLibrarianWhenHoldIsPlaced','0',NULL,'If ON, emails the librarian whenever a hold is placed','YesNo'),
('EmailPurchaseSuggestions','0','0|EmailAddressForSuggestions|BranchEmailAddress|KohaAdminEmailAddress','Choose email address that new purchase suggestions will be sent to: ','Choice'),

9
koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/admin.pref

@ -439,3 +439,12 @@ Administration:
choices:
Zebra: Zebra
Elasticsearch: Elasticsearch
-
- "Elasticsearch MARC format: "
- pref: ElasticsearchMARCFormat
default: "ISO2709"
choices:
"ISO2709": "ISO2709 (exchange format)"
"ARRAY": "Searchable array"
- <br>ISO2709 format is recommended as it is faster and takes less space, whereas array format makes the full MARC record searchable.
- <br><strong>NOTE:</strong> Making the full record searchable may have a negative effect on relevance ranking of search results.

82
t/Koha/SearchEngine/Elasticsearch.t

@ -17,7 +17,7 @@
use Modern::Perl;
use Test::More tests => 4;
use Test::More tests => 5;
use Test::Exception;
use t::lib::Mocks;
@ -120,6 +120,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
plan tests => 50;
t::lib::Mocks::mock_preference('marcflavour', 'MARC21');
t::lib::Mocks::mock_preference('ElasticsearchMARCFormat', 'ISO2709');
my @mappings = (
{
@ -503,3 +504,82 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests'
ok($exception->isa("Koha::Exceptions::Elasticsearch::MARCFieldExprParseError"), "Exception is of correct class");
ok($exception->message =~ /Unmatched closing parenthesis/, "Exception has the correct message");
};
subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents_array () tests' => sub {
plan tests => 6;
t::lib::Mocks::mock_preference('marcflavour', 'MARC21');
t::lib::Mocks::mock_preference('ElasticsearchMARCFormat', 'ARRAY');
my @mappings = (
{
name => 'control_number',
type => 'string',
facet => 0,
suggestible => 0,
sort => undef,
marc_type => 'marc21',
marc_field => '001',
}
);
my $se = Test::MockModule->new('Koha::SearchEngine::Elasticsearch');
$se->mock('_foreach_mapping', sub {
my ($self, $sub) = @_;
foreach my $map (@mappings) {
$sub->(
$map->{name},
$map->{type},
$map->{facet},
$map->{suggestible},
$map->{sort},
$map->{marc_type},
$map->{marc_field}
);
}
});
my $see = Koha::SearchEngine::Elasticsearch::Search->new({ index => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX });
my $marc_record_1 = MARC::Record->new();
$marc_record_1->leader(' cam 22 a 4500');
$marc_record_1->append_fields(
MARC::Field->new('001', '123'),
MARC::Field->new('020', '', '', a => '1-56619-909-3'),
MARC::Field->new('100', '', '', a => 'Author 1'),
MARC::Field->new('110', '', '', a => 'Corp Author'),
MARC::Field->new('210', '', '', a => 'Title 1'),
MARC::Field->new('245', '', '', a => 'Title:', b => 'first record'),
MARC::Field->new('999', '', '', c => '1234567'),
);
my $marc_record_2 = MARC::Record->new();
$marc_record_2->leader(' cam 22 a 4500');
$marc_record_2->append_fields(
MARC::Field->new('100', '', '', a => 'Author 2'),
# MARC::Field->new('210', '', '', a => 'Title 2'),
# MARC::Field->new('245', '', '', a => 'Title: second record'),
MARC::Field->new('999', '', '', c => '1234568'),
MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric'),
);
my $records = [$marc_record_1, $marc_record_2];
$see->get_elasticsearch_mappings(); #sort_fields will call this and use the actual db values unless we call it first
my $docs = $see->marc_records_to_documents($records);
# First record:
is(scalar @{$docs}, 2, 'Two records converted to documents');
is($docs->[0][0], '1234567', 'First document biblionumber should be set as first element in document touple');
is_deeply($docs->[0][1]->{control_number}, ['123'], 'First record control number should be set correctly');
is($docs->[0][1]->{marc_format}, 'ARRAY', 'First document marc_format should be set correctly');
my $decoded_marc_record = $see->decode_record_from_result($docs->[0][1]);
ok($decoded_marc_record->isa('MARC::Record'), "ARRAY record successfully decoded from result");
is($decoded_marc_record->as_usmarc(), $marc_record_1->as_usmarc(), "Decoded ARRAY record has same data as original record");
};

Loading…
Cancel
Save