From e74d86a3431558b97d2fcd0922467e265cb749c3 Mon Sep 17 00:00:00 2001 From: Ere Maijala Date: Mon, 1 Feb 2021 13:25:04 +0200 Subject: [PATCH] Bug 27584: Refactor OAI-PMH paging to improve performance Includes the following optimizations: - Use next biblionumber instead of large offset in the queries. - Use unions instead of subqueries - Avoid fetching item timestamps when items are not included. Test plan: 1. Without the patch, try harvesting a Koha database with (and without for good measure) `include_items: 1` in the OAI-PMH configuration file pointed to by preference OAI-PMH:ConfFile and take note of performance. For useful metrics the database must be large enough to not fit in InnoDB buffers or OS file cache. 2. Apply the patch. 3. Run tests: prove -v t/db_dependent/OAI 4. Try again the harvesting from step 1 and compare performance with step 1. Signed-off-by: David Cook Signed-off-by: Nick Clemens Signed-off-by: Jonathan Druart --- Koha/OAI/Server/ListBase.pm | 134 ++++++++++++++++------------- Koha/OAI/Server/ResumptionToken.pm | 35 ++++---- t/db_dependent/OAI/Server.t | 47 +++++----- 3 files changed, 116 insertions(+), 100 deletions(-) diff --git a/Koha/OAI/Server/ListBase.pm b/Koha/OAI/Server/ListBase.pm index f1567dfb47..03d13f7a0a 100644 --- a/Koha/OAI/Server/ListBase.pm +++ b/Koha/OAI/Server/ListBase.pm @@ -45,13 +45,14 @@ sub GetRecords { if ( defined $token->{'set'} ) { $set = GetOAISetBySpec($token->{'set'}); } - my $offset = $token->{offset}; my $deleted = defined $token->{deleted} ? $token->{deleted} : 0; - my $deleted_count = defined $token->{deleted_count} ? $token->{deleted_count} : 0; my $max = $repository->{koha_max_count}; my $count = 0; my $format = $args{metadataPrefix} || $token->{metadata_prefix}; my $include_items = $repository->items_included( $format ); + my $from = $token->{from_arg}; + my $until = $token->{until_arg}; + my $next_id = $token->{next_id}; # Since creating a union of normal and deleted record tables would be a heavy # operation in a large database, build results in two stages: @@ -59,71 +60,80 @@ sub GetRecords { STAGELOOP: for ( ; $deleted >= 0; $deleted-- ) { my $table = $deleted ? 'deletedbiblio_metadata' : 'biblio_metadata'; + + my @part_bind_params = ($next_id); + + my $where = "biblionumber >= ?"; + if ( $from ) { + $where .= " AND timestamp >= ?"; + push @part_bind_params, $from; + } + if ( $until ) { + $where .= " AND timestamp <= ?"; + push @part_bind_params, $until; + } + if ( defined $set ) { + $where .= " AND biblionumber in (SELECT osb.biblionumber FROM oai_sets_biblios osb WHERE osb.set_id = ?)"; + push @part_bind_params, $set->{'id'}; + } + + my @bind_params = @part_bind_params; + + my $criteria = "WHERE $where ORDER BY biblionumber LIMIT " . ($max + 1); + my $sql = " SELECT biblionumber FROM $table - WHERE (timestamp >= ? AND timestamp <= ?) + $criteria "; - my @bind_params = ($token->{'from_arg'}, $token->{'until_arg'}); - if ($include_items) { - $sql .= " - OR biblionumber IN (SELECT biblionumber from deleteditems WHERE timestamp >= ? AND timestamp <= ?) - "; - push @bind_params, ($token->{'from_arg'}, $token->{'until_arg'}); + # If items are included, fetch a set of potential biblionumbers from items tables as well. + # Then merge them, sort them and take the required number of them from the resulting list. + # This may seem counter-intuitive as in worst case we fetch 3 times the biblionumbers needed, + # but avoiding joins or subqueries makes this so much faster that it does not matter. + if ( $include_items ) { + $sql = "($sql) UNION (SELECT DISTINCT(biblionumber) FROM deleteditems $criteria)"; + push @bind_params, @part_bind_params; if (!$deleted) { - $sql .= " - OR biblionumber IN (SELECT biblionumber from items WHERE timestamp >= ? AND timestamp <= ?) - "; - push @bind_params, ($token->{'from_arg'}, $token->{'until_arg'}); + $sql .= " UNION (SELECT DISTINCT(biblionumber) FROM items $criteria)"; + push @bind_params, @part_bind_params; } + $sql = "SELECT biblionumber FROM ($sql) bibnos ORDER BY biblionumber LIMIT " . ($max + 1); } - $sql .= " - ORDER BY biblionumber - "; - - # Use a subquery for sets since it allows us to use an index in - # biblioitems table and is quite a bit faster than a join. - if (defined $set) { - $sql = " - SELECT bi.* FROM ($sql) bi - WHERE bi.biblionumber in (SELECT osb.biblionumber FROM oai_sets_biblios osb WHERE osb.set_id = ?) - "; - push @bind_params, $set->{'id'}; - } - - $sql .= " - LIMIT " . ($max + 1) . " - OFFSET " . ($offset - $deleted_count); - my $sth = $dbh->prepare( $sql ) || die( 'Could not prepare statement: ' . $dbh->errstr ); + $sth->execute( @bind_params ) || die( 'Could not execute statement: ' . $sth->errstr ); - if ( $deleted ) { - $sql = " - SELECT MAX(timestamp) - FROM ( - SELECT timestamp FROM deletedbiblio_metadata WHERE biblionumber = ? - UNION - SELECT timestamp FROM deleteditems WHERE biblionumber = ? - ) bis - "; + my $ts_sql; + if ( $include_items ) { + if ( $deleted ) { + $ts_sql = " + SELECT MAX(timestamp) + FROM ( + SELECT timestamp FROM deletedbiblio_metadata WHERE biblionumber = ? + UNION + SELECT timestamp FROM deleteditems WHERE biblionumber = ? + ) bis + "; + } else { + $ts_sql = " + SELECT MAX(timestamp) + FROM ( + SELECT timestamp FROM biblio_metadata WHERE biblionumber = ? + UNION + SELECT timestamp FROM deleteditems WHERE biblionumber = ? + UNION + SELECT timestamp FROM items WHERE biblionumber = ? + ) bi + "; + } } else { - $sql = " - SELECT MAX(timestamp) - FROM ( - SELECT timestamp FROM biblio_metadata WHERE biblionumber = ? - UNION - SELECT timestamp FROM deleteditems WHERE biblionumber = ? - UNION - SELECT timestamp FROM items WHERE biblionumber = ? - ) bi - "; + $ts_sql = "SELECT timestamp FROM $table WHERE biblionumber = ?"; } - my $record_sth = $dbh->prepare( $sql ) || die( 'Could not prepare statement: ' . $dbh->errstr ); + my $ts_sth = $dbh->prepare( $ts_sql ) || die( 'Could not prepare statement: ' . $dbh->errstr ); - $sth->execute( @bind_params ) || die( 'Could not execute statement: ' . $sth->errstr ); - while ( my ($biblionumber) = $sth->fetchrow ) { + foreach my $row (@{ $sth->fetchall_arrayref() }) { + my $biblionumber = $row->[0]; $count++; if ( $count > $max ) { $self->resumptionToken( @@ -131,18 +141,21 @@ sub GetRecords { metadataPrefix => $token->{metadata_prefix}, from => $token->{from}, until => $token->{until}, - offset => $token->{offset} + $max, + cursor => $token->{cursor} + $max, set => $token->{set}, deleted => $deleted, - deleted_count => $deleted_count + next_id => $biblionumber ) ); last STAGELOOP; } - my @params = $deleted ? ( $biblionumber, $biblionumber ) : ( $biblionumber, $biblionumber, $biblionumber ); - $record_sth->execute( @params ) || die( 'Could not execute statement: ' . $sth->errstr ); + my @params = ($biblionumber); + if ( $include_items ) { + push @params, $deleted ? ( $biblionumber ) : ( $biblionumber, $biblionumber ); + } + $ts_sth->execute( @params ) || die( 'Could not execute statement: ' . $ts_sth->errstr ); - my ($timestamp) = $record_sth->fetchrow; + my ($timestamp) = $ts_sth->fetchrow; my $oai_sets = GetOAISetsBiblio($biblionumber); my @setSpecs; @@ -172,9 +185,8 @@ sub GetRecords { ) ); } } - # Store offset and deleted record count - $offset += $count; - $deleted_count = $offset if ($deleted); + # Reset $next_id for the next stage + $next_id = 0; } return $count; } diff --git a/Koha/OAI/Server/ResumptionToken.pm b/Koha/OAI/Server/ResumptionToken.pm index ee97771ca5..7f77542cb4 100644 --- a/Koha/OAI/Server/ResumptionToken.pm +++ b/Koha/OAI/Server/ResumptionToken.pm @@ -1,6 +1,6 @@ # Copyright Tamil s.a.r.l. 2008-2015 # Copyright Biblibre 2008-2015 -# Copyright The National Library of Finland, University of Helsinki 2016-2017 +# Copyright The National Library of Finland, University of Helsinki 2016-2021 # # This file is part of Koha. # @@ -31,49 +31,50 @@ use base ("HTTP::OAI::ResumptionToken"); # - metadataPrefix # - from # - until -# - offset +# - cursor # - deleted +# - deleted count (not used anymore, but preserved for back-compatibility) +# - nextId sub new { my ($class, %args) = @_; my $self = $class->SUPER::new(%args); - my ($metadata_prefix, $offset, $from, $until, $set, $deleted, $deleted_count); + my ($metadata_prefix, $cursor, $from, $until, $set, $deleted, $next_id); if ( $args{ resumptionToken } ) { - ($metadata_prefix, $offset, $from, $until, $set, $deleted, $deleted_count) + # Note: undef in place of deleted record count for back-compatibility + ($metadata_prefix, $cursor, $from, $until, $set, $deleted, undef, $next_id) = split( '/', $args{resumptionToken} ); + $next_id = 0 unless $next_id; } else { $metadata_prefix = $args{ metadataPrefix }; - $from = $args{ from } || '1970-01-01'; - $until = $args{ until }; - unless ( $until) { - my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday) = gmtime( time ); - $until = sprintf( "%.4d-%.2d-%.2d", $year+1900, $mon+1,$mday ); - } - #Add times to the arguments, when necessary, so they correctly match against the DB timestamps + $from = $args{ from } || ''; + $until = $args{ until } || ''; + # Add times to the arguments, when necessary, so they correctly match against the DB timestamps $from .= 'T00:00:00Z' if length($from) == 10; $until .= 'T23:59:59Z' if length($until) == 10; - $offset = $args{ offset } || 0; + $cursor = $args{ cursor } // 0; $set = $args{ set } || ''; $deleted = defined $args{ deleted } ? $args{ deleted } : 1; - $deleted_count = defined $args{ deleted_count } ? $args{ deleted_count } : 0; + $next_id = $args{ next_id } // 0; } $self->{ metadata_prefix } = $metadata_prefix; - $self->{ offset } = $offset; + $self->{ cursor } = $cursor; $self->{ from } = $from; $self->{ until } = $until; $self->{ set } = $set; $self->{ from_arg } = _strip_UTC_designators($from); $self->{ until_arg } = _strip_UTC_designators($until); $self->{ deleted } = $deleted; - $self->{ deleted_count } = $deleted_count; + $self->{ next_id } = $next_id; + # Note: put zero where deleted record count used to be for back-compatibility $self->resumptionToken( - join( '/', $metadata_prefix, $offset, $from, $until, $set, $deleted, $deleted_count ) ); - $self->cursor( $offset ); + join( '/', $metadata_prefix, $cursor, $from, $until, $set, $deleted, 0, $next_id ) ); + $self->cursor( $cursor ); return $self; } diff --git a/t/db_dependent/OAI/Server.t b/t/db_dependent/OAI/Server.t index 7efac23999..09a09de890 100755 --- a/t/db_dependent/OAI/Server.t +++ b/t/db_dependent/OAI/Server.t @@ -18,6 +18,7 @@ # along with Koha; if not, see . use Modern::Perl; +use Test::Deep qw( cmp_deeply re ); use Test::MockTime qw/set_fixed_time restore_time/; use Test::More tests => 31; @@ -74,12 +75,13 @@ $dbh->do('DELETE FROM oai_sets'); set_fixed_time(CORE::time()); -my $base_datetime = dt_from_string(); +my $base_datetime = dt_from_string(undef, undef, 'UTC'); my $date_added = $base_datetime->ymd . ' ' .$base_datetime->hms . 'Z'; my $date_to = substr($date_added, 0, 10) . 'T23:59:59Z'; my (@header, @marcxml, @oaidc, @marcxml_transformed); my $sth = $dbh->prepare('UPDATE biblioitems SET timestamp=? WHERE biblionumber=?'); my $sth2 = $dbh->prepare('UPDATE biblio_metadata SET timestamp=? WHERE biblionumber=?'); +my $first_bn = 0; # Add biblio records foreach my $index ( 0 .. NUMBER_OF_MARC_RECORDS - 1 ) { @@ -94,6 +96,7 @@ foreach my $index ( 0 .. NUMBER_OF_MARC_RECORDS - 1 ) { $record->append_fields( MARC::Field->new('952', '', '', 'a' => "Code" ) ); } my ($biblionumber) = AddBiblio($record, ''); + $first_bn = $biblionumber unless $first_bn; my $timestamp = $base_datetime->ymd . ' ' .$base_datetime->hms; $sth->execute($timestamp,$biblionumber); $sth2->execute($timestamp,$biblionumber); @@ -174,7 +177,7 @@ sub test_query { } delete $response->{responseDate}; - unless (is_deeply($response, \%full_expected, $test)) { + unless (cmp_deeply($response, \%full_expected, $test)) { diag "PARAM:" . YAML::XS::Dump($param) . "EXPECTED:" . YAML::XS::Dump(\%full_expected) . @@ -215,7 +218,7 @@ test_query('ListIdentifiers', {verb => 'ListIdentifiers', metadataPrefix => 'mar ListIdentifiers => { header => [ @header[0..2] ], resumptionToken => { - content => "marcxml/3/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/3////0/0/\d+$} ), cursor => 3, }, }, @@ -225,7 +228,7 @@ test_query('ListIdentifiers', {verb => 'ListIdentifiers', metadataPrefix => 'mar ListIdentifiers => { header => [ @header[0..2] ], resumptionToken => { - content => "marcxml/3/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/3////0/0/\d+$} ), cursor => 3, }, }, @@ -233,12 +236,12 @@ test_query('ListIdentifiers', {verb => 'ListIdentifiers', metadataPrefix => 'mar test_query( 'ListIdentifiers with resumptionToken 1', - { verb => 'ListIdentifiers', resumptionToken => "marcxml/3/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListIdentifiers', resumptionToken => "marcxml/3/1970-01-01T00:00:00Z/$date_to//0/0/" . ($first_bn + 3) }, { ListIdentifiers => { header => [ @header[3..5] ], resumptionToken => { - content => "marcxml/6/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/6/1970-01-01T00:00:00Z/$date_to//0/0/\d+$} ), cursor => 6, }, }, @@ -247,12 +250,12 @@ test_query( test_query( 'ListIdentifiers with resumptionToken 2', - { verb => 'ListIdentifiers', resumptionToken => "marcxml/6/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListIdentifiers', resumptionToken => "marcxml/6/1970-01-01T00:00:00Z/$date_to//0/0/" . ($first_bn + 6) }, { ListIdentifiers => { header => [ @header[6..8] ], resumptionToken => { - content => "marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0/\d+$} ), cursor => 9, }, }, @@ -261,7 +264,7 @@ test_query( test_query( 'ListIdentifiers with resumptionToken 3, response without resumption', - { verb => 'ListIdentifiers', resumptionToken => "marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListIdentifiers', resumptionToken => "marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0/" . ($first_bn + 9) }, { ListIdentifiers => { header => $header[9], @@ -280,7 +283,7 @@ test_query('ListRecords marcxml', {verb => 'ListRecords', metadataPrefix => 'mar ListRecords => { record => [ @marcxml[0..2] ], resumptionToken => { - content => "marcxml/3/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/3////0/0/\d+$} ), cursor => 3, }, }, @@ -288,11 +291,11 @@ test_query('ListRecords marcxml', {verb => 'ListRecords', metadataPrefix => 'mar test_query( 'ListRecords marcxml with resumptionToken 1', - { verb => 'ListRecords', resumptionToken => "marcxml/3/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListRecords', resumptionToken => "marcxml/3////0/0/" . ($first_bn + 3) }, { ListRecords => { record => [ @marcxml[3..5] ], resumptionToken => { - content => "marcxml/6/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/6////0/0/\d+$} ), cursor => 6, }, }, @@ -300,11 +303,11 @@ test_query( test_query( 'ListRecords marcxml with resumptionToken 2', - { verb => 'ListRecords', resumptionToken => "marcxml/6/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListRecords', resumptionToken => "marcxml/6/1970-01-01T00:00:00Z/$date_to//0/0/" . ($first_bn + 6) }, { ListRecords => { record => [ @marcxml[6..8] ], resumptionToken => { - content => "marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0/\d+$} ), cursor => 9, }, }, @@ -313,7 +316,7 @@ test_query( # Last record, so no resumption token test_query( 'ListRecords marcxml with resumptionToken 3, response without resumption', - { verb => 'ListRecords', resumptionToken => "marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListRecords', resumptionToken => "marcxml/9/1970-01-01T00:00:00Z/$date_to//0/0/" . ($first_bn + 9) }, { ListRecords => { record => $marcxml[9], }, @@ -323,7 +326,7 @@ test_query('ListRecords oai_dc', {verb => 'ListRecords', metadataPrefix => 'oai_ ListRecords => { record => [ @oaidc[0..2] ], resumptionToken => { - content => "oai_dc/3/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^oai_dc/3////0/0/\d+$} ), cursor => 3, }, }, @@ -331,11 +334,11 @@ test_query('ListRecords oai_dc', {verb => 'ListRecords', metadataPrefix => 'oai_ test_query( 'ListRecords oai_dc with resumptionToken 1', - { verb => 'ListRecords', resumptionToken => "oai_dc/3/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListRecords', resumptionToken => "oai_dc/3////0/0/" . ($first_bn + 3) }, { ListRecords => { record => [ @oaidc[3..5] ], resumptionToken => { - content => "oai_dc/6/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^oai_dc/6////0/0/\d+$} ), cursor => 6, }, }, @@ -343,11 +346,11 @@ test_query( test_query( 'ListRecords oai_dc with resumptionToken 2', - { verb => 'ListRecords', resumptionToken => "oai_dc/6/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListRecords', resumptionToken => "oai_dc/6/1970-01-01T00:00:00Z/$date_to//0/0/" . ($first_bn + 6) }, { ListRecords => { record => [ @oaidc[6..8] ], resumptionToken => { - content => "oai_dc/9/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^oai_dc/9/1970-01-01T00:00:00Z/$date_to//0/0/\d+$} ), cursor => 9, }, }, @@ -356,7 +359,7 @@ test_query( # Last record, so no resumption token test_query( 'ListRecords oai_dc with resumptionToken 3, response without resumption', - { verb => 'ListRecords', resumptionToken => "oai_dc/9/1970-01-01T00:00:00Z/$date_to//0/0" }, + { verb => 'ListRecords', resumptionToken => "oai_dc/9/1970-01-01T00:00:00Z/$date_to//0/0/" . ($first_bn + 9) }, { ListRecords => { record => $oaidc[9], }, @@ -369,7 +372,7 @@ test_query('ListRecords marcxml with xsl transformation', { ListRecords => { record => [ @marcxml_transformed[0..2] ], resumptionToken => { - content => "marcxml/3/1970-01-01T00:00:00Z/$date_to//0/0", + content => re( qr{^marcxml/3////0/0/\d+$} ), cursor => 3, } }, -- 2.39.5