From 4f42ea51d74325af6282420ef11ba539c784ffd7 Mon Sep 17 00:00:00 2001 From: Ere Maijala Date: Fri, 15 Feb 2019 13:32:03 +0200 Subject: [PATCH] Bug 22258: Elasticsearch: Add array as an alternative MARC format Adds preference ElasticsearchMARCFormat that controls whether MARC records are stored as ISO2709/MARCXML or array. Array is searchable by field and also indexes all subfields in the _all field for searching. Test plan: 1. Test that searching and indexing works with the patch without any changes. 2. Switch to array format and index some records. 3. Check e.g. the 008 field of a record and verify that the record can be found with the contents enclosed in quotes. 4. Check that it's possible to search for a specific field/subfield. Search query: marc_data_array.fields.655.subfields.a:Diaries 5. Check that tests still pass, especially t/Koha/SearchEngine/Elasticsearch.t Signed-off-by: Michal Denar Signed-off-by: Nick Clemens Signed-off-by: Martin Renvoize --- Koha/SearchEngine/Elasticsearch.pm | 132 +++++++++++++++--- Koha/SearchEngine/Elasticsearch/Search.pm | 3 + .../elasticsearch/field_config.yaml | 3 + .../data/mysql/atomicupdate/bug_22258.perl | 11 ++ installer/data/mysql/sysprefs.sql | 1 + .../en/modules/admin/preferences/admin.pref | 9 ++ t/Koha/SearchEngine/Elasticsearch.t | 82 ++++++++++- 7 files changed, 222 insertions(+), 19 deletions(-) create mode 100644 installer/data/mysql/atomicupdate/bug_22258.perl diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index 4435015d81..3104e2ab04 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -408,6 +408,7 @@ sub marc_records_to_documents { my $control_fields_rules = $rules->{control_fields}; my $data_fields_rules = $rules->{data_fields}; my $marcflavour = lc C4::Context->preference('marcflavour'); + my $use_array = C4::Context->preference('ElasticsearchMARCFormat') eq 'ARRAY'; my @record_documents; @@ -527,26 +528,31 @@ sub marc_records_to_documents { # TODO: Perhaps should check if $records_document non empty, but really should never be the case $record->encoding('UTF-8'); - my @warnings; - { - # Temporarily intercept all warn signals (MARC::Record carps when record length > 99999) - local $SIG{__WARN__} = sub { - push @warnings, $_[0]; - }; - $record_document->{'marc_data'} = encode_base64(encode('UTF-8', $record->as_usmarc())); - } - if (@warnings) { - # Suppress warnings if record length exceeded - unless (substr($record->leader(), 0, 5) eq '99999') { - foreach my $warning (@warnings) { - carp $warning; + if ($use_array) { + $record_document->{'marc_data_array'} = $self->_marc_to_array($record); + $record_document->{'marc_format'} = 'ARRAY'; + } else { + my @warnings; + { + # Temporarily intercept all warn signals (MARC::Record carps when record length > 99999) + local $SIG{__WARN__} = sub { + push @warnings, $_[0]; + }; + $record_document->{'marc_data'} = encode_base64(encode('UTF-8', $record->as_usmarc())); + } + if (@warnings) { + # Suppress warnings if record length exceeded + unless (substr($record->leader(), 0, 5) eq '99999') { + foreach my $warning (@warnings) { + carp $warning; + } } + $record_document->{'marc_data'} = $record->as_xml_record($marcflavour); + $record_document->{'marc_format'} = 'MARCXML'; + } + else { + $record_document->{'marc_format'} = 'base64ISO2709'; } - $record_document->{'marc_data'} = $record->as_xml_record($marcflavour); - $record_document->{'marc_format'} = 'MARCXML'; - } - else { - $record_document->{'marc_format'} = 'base64ISO2709'; } my $id = $record->subfield('999', 'c'); push @record_documents, [$id, $record_document]; @@ -554,6 +560,96 @@ sub marc_records_to_documents { return \@record_documents; } +=head2 _marc_to_array($record) + + my @fields = _marc_to_array($record) + +Convert a MARC::Record to an array modeled after MARC-in-JSON +(see https://github.com/marc4j/marc4j/wiki/MARC-in-JSON-Description) + +=over 4 + +=item C<$record> + +A MARC::Record object + +=back + +=cut + +sub _marc_to_array { + my ($self, $record) = @_; + + my $data = { + leader => $record->leader(), + fields => [] + }; + for my $field ($record->fields()) { + my $tag = $field->tag(); + if ($field->is_control_field()) { + push @{$data->{fields}}, {$tag => $field->data()}; + } else { + my $subfields = (); + foreach my $subfield ($field->subfields()) { + my ($code, $contents) = @{$subfield}; + push @{$subfields}, {$code => $contents}; + } + push @{$data->{fields}}, { + $tag => { + ind1 => $field->indicator(1), + ind2 => $field->indicator(2), + subfields => $subfields + } + }; + } + } + return $data; +} + +=head2 _array_to_marc($data) + + my $record = _array_to_marc($data) + +Convert an array modeled after MARC-in-JSON to a MARC::Record + +=over 4 + +=item C<$data> + +An array modeled after MARC-in-JSON +(see https://github.com/marc4j/marc4j/wiki/MARC-in-JSON-Description) + +=back + +=cut + +sub _array_to_marc { + my ($self, $data) = @_; + + my $record = MARC::Record->new(); + + $record->leader($data->{leader}); + for my $field (@{$data->{fields}}) { + my $tag = (keys %{$field})[0]; + $field = $field->{$tag}; + my $marc_field; + if (ref($field) eq 'HASH') { + my @subfields; + foreach my $subfield (@{$field->{subfields}}) { + my $code = (keys %{$subfield})[0]; + push @subfields, $code; + push @subfields, $subfield->{$code}; + } + $marc_field = MARC::Field->new($tag, $field->{ind1}, $field->{ind2}, @subfields); + } else { + $marc_field = MARC::Field->new($tag, $field) + } + $record->append_fields($marc_field); + } +; + return $record; +} + =head2 _field_mappings($facet, $suggestible, $sort, $target_name, $target_type, $range) my @mappings = _field_mappings($facet, $suggestible, $sort, $target_name, $target_type, $range) diff --git a/Koha/SearchEngine/Elasticsearch/Search.pm b/Koha/SearchEngine/Elasticsearch/Search.pm index f37252cb5d..705feb27fa 100644 --- a/Koha/SearchEngine/Elasticsearch/Search.pm +++ b/Koha/SearchEngine/Elasticsearch/Search.pm @@ -382,6 +382,9 @@ sub decode_record_from_result { elsif ($result->{marc_format} eq 'MARCXML') { return MARC::Record->new_from_xml($result->{marc_data}, 'UTF-8', uc C4::Context->preference('marcflavour')); } + elsif ($result->{marc_format} eq 'ARRAY') { + return $self->_array_to_marc($result->{marc_data_array}); + } else { Koha::Exceptions::Elasticsearch->throw("Missing marc_format field in Elasticsearch result"); } diff --git a/admin/searchengine/elasticsearch/field_config.yaml b/admin/searchengine/elasticsearch/field_config.yaml index 0baddf3a02..73af9773e0 100644 --- a/admin/searchengine/elasticsearch/field_config.yaml +++ b/admin/searchengine/elasticsearch/field_config.yaml @@ -10,6 +10,9 @@ general: type: text analyzer: keyword index: false + marc_data_array: + type: object + dynamic: true marc_format: store: true type: text diff --git a/installer/data/mysql/atomicupdate/bug_22258.perl b/installer/data/mysql/atomicupdate/bug_22258.perl new file mode 100644 index 0000000000..5c86d1973b --- /dev/null +++ b/installer/data/mysql/atomicupdate/bug_22258.perl @@ -0,0 +1,11 @@ +$DBversion = 'XXX'; # will be replaced by the RM +if( CheckVersion( $DBversion ) ) { + $dbh->do(q{ + INSERT IGNORE INTO `systempreferences` (`variable`,`value`,`explanation`,`options`,`type`) VALUES + ('ElasticsearchMARCFormat', 'ISO2709', 'ISO2709|ARRAY', 'Elasticsearch MARC format. ISO2709 format is recommended as it is faster and takes less space, whereas array is searchable.', 'Choice') + }); + + # Always end with this (adjust the bug info) + SetVersion( $DBversion ); + print "Upgrade to $DBversion done (Bug 22258 - Add ElasticsearchMARCFormat preference)\n"; +} diff --git a/installer/data/mysql/sysprefs.sql b/installer/data/mysql/sysprefs.sql index 82a8402a36..f61c042a47 100644 --- a/installer/data/mysql/sysprefs.sql +++ b/installer/data/mysql/sysprefs.sql @@ -159,6 +159,7 @@ INSERT INTO systempreferences ( `variable`, `value`, `options`, `explanation`, ` ('EasyAnalyticalRecords','0','','If on, display in the catalogue screens tools to easily setup analytical record relationships','YesNo'), ('ElasticsearchIndexStatus_authorities', '0', 'Authorities index status', NULL, NULL), ('ElasticsearchIndexStatus_biblios', '0', 'Biblios index status', NULL, NULL), +('ElasticsearchMARCFormat', 'ISO2709', 'ISO2709|ARRAY', 'Elasticsearch MARC format. ISO2709 format is recommended as it is faster and takes less space, whereas array is searchable.', 'Choice'), ('EmailAddressForSuggestions','','',' If you choose EmailAddressForSuggestions you have to enter a valid email address: ','free'), ('emailLibrarianWhenHoldIsPlaced','0',NULL,'If ON, emails the librarian whenever a hold is placed','YesNo'), ('EmailPurchaseSuggestions','0','0|EmailAddressForSuggestions|BranchEmailAddress|KohaAdminEmailAddress','Choose email address that new purchase suggestions will be sent to: ','Choice'), diff --git a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/admin.pref b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/admin.pref index 5bf920ba25..5c890706be 100644 --- a/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/admin.pref +++ b/koha-tmpl/intranet-tmpl/prog/en/modules/admin/preferences/admin.pref @@ -439,3 +439,12 @@ Administration: choices: Zebra: Zebra Elasticsearch: Elasticsearch + - + - "Elasticsearch MARC format: " + - pref: ElasticsearchMARCFormat + default: "ISO2709" + choices: + "ISO2709": "ISO2709 (exchange format)" + "ARRAY": "Searchable array" + -
ISO2709 format is recommended as it is faster and takes less space, whereas array format makes the full MARC record searchable. + -
NOTE: Making the full record searchable may have a negative effect on relevance ranking of search results. diff --git a/t/Koha/SearchEngine/Elasticsearch.t b/t/Koha/SearchEngine/Elasticsearch.t index 4954179e51..5b688ab41e 100644 --- a/t/Koha/SearchEngine/Elasticsearch.t +++ b/t/Koha/SearchEngine/Elasticsearch.t @@ -17,7 +17,7 @@ use Modern::Perl; -use Test::More tests => 4; +use Test::More tests => 5; use Test::Exception; use t::lib::Mocks; @@ -120,6 +120,7 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' plan tests => 50; t::lib::Mocks::mock_preference('marcflavour', 'MARC21'); + t::lib::Mocks::mock_preference('ElasticsearchMARCFormat', 'ISO2709'); my @mappings = ( { @@ -503,3 +504,82 @@ subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents () tests' ok($exception->isa("Koha::Exceptions::Elasticsearch::MARCFieldExprParseError"), "Exception is of correct class"); ok($exception->message =~ /Unmatched closing parenthesis/, "Exception has the correct message"); }; + +subtest 'Koha::SearchEngine::Elasticsearch::marc_records_to_documents_array () tests' => sub { + + plan tests => 6; + + t::lib::Mocks::mock_preference('marcflavour', 'MARC21'); + t::lib::Mocks::mock_preference('ElasticsearchMARCFormat', 'ARRAY'); + + my @mappings = ( + { + name => 'control_number', + type => 'string', + facet => 0, + suggestible => 0, + sort => undef, + marc_type => 'marc21', + marc_field => '001', + } + ); + + my $se = Test::MockModule->new('Koha::SearchEngine::Elasticsearch'); + $se->mock('_foreach_mapping', sub { + my ($self, $sub) = @_; + + foreach my $map (@mappings) { + $sub->( + $map->{name}, + $map->{type}, + $map->{facet}, + $map->{suggestible}, + $map->{sort}, + $map->{marc_type}, + $map->{marc_field} + ); + } + }); + + my $see = Koha::SearchEngine::Elasticsearch::Search->new({ index => $Koha::SearchEngine::Elasticsearch::BIBLIOS_INDEX }); + + my $marc_record_1 = MARC::Record->new(); + $marc_record_1->leader(' cam 22 a 4500'); + $marc_record_1->append_fields( + MARC::Field->new('001', '123'), + MARC::Field->new('020', '', '', a => '1-56619-909-3'), + MARC::Field->new('100', '', '', a => 'Author 1'), + MARC::Field->new('110', '', '', a => 'Corp Author'), + MARC::Field->new('210', '', '', a => 'Title 1'), + MARC::Field->new('245', '', '', a => 'Title:', b => 'first record'), + MARC::Field->new('999', '', '', c => '1234567'), + ); + my $marc_record_2 = MARC::Record->new(); + $marc_record_2->leader(' cam 22 a 4500'); + $marc_record_2->append_fields( + MARC::Field->new('100', '', '', a => 'Author 2'), + # MARC::Field->new('210', '', '', a => 'Title 2'), + # MARC::Field->new('245', '', '', a => 'Title: second record'), + MARC::Field->new('999', '', '', c => '1234568'), + MARC::Field->new('952', '', '', 0 => 1, g => 'string where should be numeric'), + ); + my $records = [$marc_record_1, $marc_record_2]; + + $see->get_elasticsearch_mappings(); #sort_fields will call this and use the actual db values unless we call it first + + my $docs = $see->marc_records_to_documents($records); + + # First record: + is(scalar @{$docs}, 2, 'Two records converted to documents'); + + is($docs->[0][0], '1234567', 'First document biblionumber should be set as first element in document touple'); + + is_deeply($docs->[0][1]->{control_number}, ['123'], 'First record control number should be set correctly'); + + is($docs->[0][1]->{marc_format}, 'ARRAY', 'First document marc_format should be set correctly'); + + my $decoded_marc_record = $see->decode_record_from_result($docs->[0][1]); + + ok($decoded_marc_record->isa('MARC::Record'), "ARRAY record successfully decoded from result"); + is($decoded_marc_record->as_usmarc(), $marc_record_1->as_usmarc(), "Decoded ARRAY record has same data as original record"); +}; -- 2.39.5