From 381443700333aaa4cbd4ac1067b1a884202d6f96 Mon Sep 17 00:00:00 2001 From: Matt Blenkinsop Date: Thu, 19 Oct 2023 16:12:58 +0000 Subject: [PATCH] Bug 34788: Allow for import of CSV files This commit allows CSV files to be imported alongside TSV files. It also adds some performance improvements relating to the max_allowed_packet and the matching of titles, as well as some small bugfixes and unit test changes Test plan: 1) Enable the ERM module 2) Navigate to E-resource management > eHoldings > Local > Packages 3) Create at least one package 4) Navigate to E-resource management > eHoldings > Local > Titles 5) There should be a button for "Import from KBART file" 6) Click this button 7) Select the package that you created from the dropdown and then choose your KBART file using the "Choose file" button. I have attached some example files to the bug but feel free to use your own if you have them. 8) Click Submit 9) If your file is a valid file, a background job will be queued, if not then a warning will display showing what is incorrect in your file 10) To test the file format warning, edit your file and add a random column heading into the file e.g. test_column. When you upload it, the warning should show that an invalid column "test_column" has been detected 11) Click on the background job. (If you have uploaded a very large file, the system will chunk the file into smaller pieces and create multiple background jobs) 12) It should display a progress bar followed by a report and any error messages 13) Navigate to E-resource management > eHoldings > Local > Titles and you should see your new titles. 14) Run the unit test: prove t/db_dependent/Koha/BackgroundJob/ImportKBARTFile.t Signed-off-by: Clemens Tubach Signed-off-by: Nick Clemens Signed-off-by: Katrin Fischer --- Koha/BackgroundJob/ImportKBARTFile.pm | 238 ++++++++++++++---- Koha/REST/V1/ERM/EHoldings/Titles/Local.pm | 66 +++-- .../swagger/paths/erm_eholdings_titles.yaml | 2 + .../import_from_kbart_file.inc | 2 +- .../ERM/EHoldingsLocalTitlesKBARTImport.vue | 44 +++- .../ERM/EHoldingsLocalTitlesList.vue | 22 +- .../ERM/EHoldingsLocalTitlesShow.vue | 4 +- .../Koha/BackgroundJob/ImportKBARTFile.t | 86 +++++-- 8 files changed, 334 insertions(+), 130 deletions(-) diff --git a/Koha/BackgroundJob/ImportKBARTFile.pm b/Koha/BackgroundJob/ImportKBARTFile.pm index c243695127..8114abe0d2 100644 --- a/Koha/BackgroundJob/ImportKBARTFile.pm +++ b/Koha/BackgroundJob/ImportKBARTFile.pm @@ -16,10 +16,10 @@ package Koha::BackgroundJob::ImportKBARTFile; # along with Koha; if not, see . 
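For step 7 of the test plan, a small sample file can be generated locally if the files attached to the bug are not to hand. The sketch below is illustrative only: the titles are hypothetical and the heading list is abbreviated for readability, but every heading used must be one of the KBART phase II column names accepted by get_valid_headers.

    use Modern::Perl;

    # Hypothetical two-row KBART TSV for exercising the importer.
    my @headings = qw(
        publication_title online_identifier date_first_issue_online
        title_url title_id publication_type access_type
    );
    my @title = (
        'Example Journal', '1234-5678', '2015-01',
        'https://example.org/journal', 'ex001', 'serial', 'P'
    );

    open my $fh, '>:encoding(UTF-8)', 'sample_kbart.tsv' or die $!;
    say {$fh} join "\t", @headings;
    say {$fh} join "\t", @title;
    close $fh;

Renaming the same content to sample_kbart.csv and swapping the tabs for commas gives an equivalent CSV test file.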
use Modern::Perl; -use JSON qw( decode_json encode_json ); -use Try::Tiny qw( catch try ); +use JSON qw( decode_json encode_json ); +use Try::Tiny qw( catch try ); use MIME::Base64 qw( decode_base64 ); -use POSIX qw( floor ); +use POSIX qw( floor ); use C4::Context; @@ -65,37 +65,37 @@ sub process { my $titles_imported = 0; my $duplicate_titles = 0; my $failed_imports = 0; - my $total_lines; - my $file_name = $args->{file}->{filename}; + my $total_rows; + my $file_name = $args->{file_name}; my $report = { duplicates_found => undef, titles_imported => undef, file_name => $file_name, - total_lines => undef, + total_rows => undef, failed_imports => undef }; try { - my $file = $args->{file}; - my $package_id = $args->{package_id}; - my ( $column_headers, $lines ) = format_file($file); + my $column_headers = $args->{column_headers}; + my $rows = $args->{rows}; + my $package_id = $args->{package_id}; - if ( scalar( @{$lines} ) == 0 ) { + if ( scalar( @{$rows} ) == 0 ) { push @messages, { code => 'job_failed', type => 'error', - error_message => 'No valid lines were found in this file. Please check the file formatting.', + error_message => 'No valid rows were found in this file. Please check the file formatting.', }; $self->status('failed')->store; } - $self->size( scalar( @{$lines} ) )->store; - $total_lines = scalar( @{$lines} ); + $self->size( scalar( @{$rows} ) )->store; + $total_rows = scalar( @{$rows} ); - foreach my $line ( @{$lines} ) { - next if !$line; - my $new_title = create_title_hash_from_line_data( $line, $column_headers ); - my $title_match = Koha::ERM::EHoldings::Titles->search( { external_id => $new_title->{title_id} } )->count; + foreach my $row ( @{$rows} ) { + next if !$row; + my $new_title = create_title_hash_from_line_data( $row, $column_headers ); + my $title_match = check_for_matching_title($new_title); if ($title_match) { $duplicate_titles++; @@ -119,9 +119,12 @@ sub process { $failed_imports++; } else { my $imported_title = Koha::ERM::EHoldings::Title->new($formatted_title)->store; - my $title_id = $imported_title->title_id; - Koha::ERM::EHoldings::Resource->new( { title_id => $title_id, package_id => $package_id } ) - ->store; + create_linked_resource( + { + title => $imported_title, + package_id => $package_id + } + ); # No need to add a message for a successful import, # files could have 1000s of titles which will lead to lots of messages in background_job->data @@ -132,7 +135,7 @@ sub process { push @messages, { code => 'title_failed', type => 'error', - error_message => $_->{msg}, + error_message => $_->{msg} || "Please check your file", title => $new_title->{publication_title} } }; @@ -142,7 +145,7 @@ sub process { $report->{duplicates_found} = $duplicate_titles; $report->{titles_imported} = $titles_imported; - $report->{total_lines} = $total_lines; + $report->{total_rows} = $total_rows; $report->{failed_imports} = $failed_imports; my $data = $self->decoded_data; @@ -167,7 +170,7 @@ Enqueue the new job sub enqueue { my ( $self, $args ) = @_; - return unless exists $args->{file}; + return unless exists $args->{column_headers}; $self->SUPER::enqueue( { @@ -194,7 +197,7 @@ sub format_title { delete $title->{title_id}; # Some files appear to use coverage_notes instead of "notes" as in the KBART standard - if ( $title->{coverage_notes} ) { + if ( exists $title->{coverage_notes} ) { $title->{notes} = $title->{coverage_notes}; delete $title->{coverage_notes}; } @@ -202,23 +205,39 @@ sub format_title { return $title; } -=head3 format_file +=head3 read_file -Formats a file 
to provide report headers and lines to be processed +Reads a file to provide report headers and lines to be processed =cut -sub format_file { +sub read_file { my ($file) = @_; - my $file_content = decode_base64( $file->{file_content} ); - $file_content =~ s/\n/\r/g; - my @lines = split /\r/, $file_content; - my @column_headers = split /\t/, $lines[0]; - shift @lines; # Remove headers row - my @remove_null_lines = grep $_ ne '', @lines; + my $file_content = defined( $file->{file_content} ) ? decode_base64( $file->{file_content} ) : ""; + my $delimiter = $file->{filename} =~ /\.tsv$/ ? "\t" : ","; + my $quote_char = $file->{filename} =~ /\.tsv$/ ? "" : '"'; - return ( \@column_headers, \@remove_null_lines ); + open my $fh, "<", \$file_content or die; + my $csv = Text::CSV_XS->new( + { + sep_char => $delimiter, + quote_char => $quote_char, + binary => 1, + allow_loose_quotes => 1 + } + ); + my $headers_to_check = $csv->getline($fh); + my $column_headers = rescue_EBSCO_files($headers_to_check); + my $lines = $csv->getline_all( $fh, 0 ); + + my ( $cde, $str, $pos ) = $csv->error_diag(); + my $error = $cde ? "$cde, $str, $pos" : ""; + warn $error if $error; + + close($fh); + + return ( $column_headers, $lines, $error ); } =head3 create_title_hash_from_line_data @@ -228,16 +247,102 @@ Takes a line and creates a hash of the values mapped to the column headings =cut sub create_title_hash_from_line_data { - my ( $line, $column_headers ) = @_; + my ( $row, $column_headers ) = @_; my %new_title; - my @values = split /\t/, $line; - @new_title{ @{$column_headers} } = @values; + @new_title{ @{$column_headers} } = @$row; + + # If the file has been converted from CSV to TSV for import, then some titles containing commas will be enclosed in "" + my $first_char = substr( $new_title{publication_title}, 0, 1 ); + my $last_char = substr( $new_title{publication_title}, -1 ); + if ( $first_char eq '"' && $last_char eq '"' ) { + $new_title{publication_title} =~ s/^"|"$//g; + } return \%new_title; } +=head3 check_for_matching_title + +Checks whether this title already exists to avoid duplicates + +=cut + +sub check_for_matching_title { + my ($title) = @_; + + my $match_parameters = {}; + $match_parameters->{print_identifier} = $title->{print_identifier} if $title->{print_identifier}; + $match_parameters->{online_identifier} = $title->{online_identifier} if $title->{online_identifier}; + + # Use external_id in case title exists for a different provider, we want to add it for the new provider + $match_parameters->{external_id} = $title->{title_id} if $title->{title_id}; + + # If no match parameters are provided in the file we should add the new title + return 0 if !%$match_parameters; + + my $title_match = Koha::ERM::EHoldings::Titles->search($match_parameters)->count; + + return $title_match; +} + +=head3 create_linked_resource + +Creates a resource for a newly stored title. + +=cut + +sub create_linked_resource { + my ($args) = @_; + + my $title = $args->{title}; + my $package_id = $args->{package_id}; + + my $title_id = $title->title_id; + my ( $date_first_issue_online, $date_last_issue_online ) = get_first_and_last_issue_dates($title); + my $resource = Koha::ERM::EHoldings::Resource->new( + { + title_id => $title_id, + package_id => $package_id, + started_on => $date_first_issue_online, + ended_on => $date_last_issue_online, + } + )->store; + + return; +} + +=head3 get_first_and_last_issue_dates + +Gets and formats a date for storing on the resource. 
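The extension-based parsing in read_file can be exercised on its own; the following standalone sketch uses a hard-coded CSV string in place of the decoded upload, but mirrors the Text::CSV_XS options chosen above (tab separator and no quote character for .tsv, comma and double quote for .csv):

    use Modern::Perl;
    use Text::CSV_XS;

    my $filename     = 'sample.csv';
    my $file_content = qq{publication_title,title_id,access_type\n"Plants, Nature and You",ex001,P\n};

    my $is_tsv = $filename =~ /\.tsv$/;
    my $csv    = Text::CSV_XS->new(
        {
            sep_char           => $is_tsv ? "\t" : ",",
            quote_char         => $is_tsv ? ""   : '"',
            binary             => 1,
            allow_loose_quotes => 1,
        }
    );

    open my $fh, '<', \$file_content or die;
    my $column_headers = $csv->getline($fh);      # heading row
    my $rows           = $csv->getline_all($fh);  # remaining rows
    close $fh;

    say scalar @$rows, ' row(s); first title: ', $rows->[0][0];
    # 1 row(s); first title: Plants, Nature and You

Because the quote character is honoured for CSV input, a comma inside a quoted publication_title survives parsing as a single field.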
Dates can come from files in YYYY, YYYY-MM or YYYY-MM-DD format + +=cut + +sub get_first_and_last_issue_dates { + my ($title) = @_; + + return ( undef, undef ) if ( !$title->date_first_issue_online && !$title->date_last_issue_online ); + + my $date_first_issue_online = + $title->date_first_issue_online =~ /^\d{4}((-\d{2}-\d{2}$|-\d{2}$)|$)$/ + ? $title->date_first_issue_online + : undef; + my $date_last_issue_online = + $title->date_last_issue_online =~ /^\d{4}((-\d{2}-\d{2}$|-\d{2}$)|$)$/ ? $title->date_last_issue_online : undef; + + $date_first_issue_online = $date_first_issue_online . '-01-01' + if $date_first_issue_online && $date_first_issue_online =~ /^\d{4}$/; + $date_last_issue_online = $date_last_issue_online . '-01-01' + if $date_last_issue_online && $date_last_issue_online =~ /^\d{4}$/; + $date_first_issue_online = $date_first_issue_online . '-01' + if $date_first_issue_online && $date_first_issue_online =~ /^\d{4}-\d{2}$/; + $date_last_issue_online = $date_last_issue_online . '-01' + if $date_last_issue_online && $date_last_issue_online =~ /^\d{4}-\d{2}$/; + + return ( $date_first_issue_online, $date_last_issue_online ); +} + =head3 get_valid_headers Returns a list of permitted headers in a KBART phase II file @@ -275,20 +380,63 @@ sub get_valid_headers { ); } -=head3 calculate_chunked_file_size +=head3 calculate_chunked_params_size Calculates average line size to work out how many lines to chunk a large file into -Knocks 10% off the final result to give some margin for error +Uses only 75% of the max_allowed_packet as an upper limit + +=cut + +sub calculate_chunked_params_size { + my ( $params_size, $max_allowed_packet, $number_of_rows ) = @_; + + my $average_line_size = $params_size / $number_of_rows; + my $lines_possible = ( $max_allowed_packet * 0.75 ) / $average_line_size; + my $rounded_value = floor($lines_possible); + return $rounded_value; +} + +=head3 is_file_too_large + +Calculates the final size of the background job object that will need storing to check if we exceed the max_allowed_packet + +=cut + +sub is_file_too_large { + my ( $params_to_store, $max_allowed_packet ) = @_; + + my $json = JSON->new->utf8(0); + my $encoded_params = $json->encode($params_to_store); + my $params_size = length $encoded_params; + + # A lot more than just the params are stored in the background job table and this is difficult to calculate + # We should allow for no more than 75% of the max_allowed_packet to be made up of the job params to avoid db conflicts + return { + file_too_large => 1, + params_size => $params_size + } if $params_size > ( $max_allowed_packet * 0.75 ); + + return { + file_too_large => 0, + params_size => $params_size + }; +} + +=head3 + +EBSCO have an incorrect spelling of "preceding_publication_title_id" in all of their KBART files ("preceeding" instead of "preceding"). +This is very annoying because it means all of their KBART files fail to import using the current methodology. +There is no simple way of finding out who the vendor is before importing so all KBART files from any vendor are going to have to be checked for this spelling and corrected. =cut -sub calculate_chunked_file_size { - my ( $file_size, $max_allowed_packet, $number_of_lines ) = @_; +sub rescue_EBSCO_files { + my ($column_headers) = @_; + + my ($index) = grep { @$column_headers[$_] eq 'preceeding_publication_title_id' } ( 0 .. 
@$column_headers - 1 ); + @$column_headers[$index] = 'preceding_publication_title_id' if $index; - my $average_line_size = $file_size / $number_of_lines; - my $lines_possible = $max_allowed_packet / $average_line_size; - my $moderated_value = floor( $lines_possible * 0.9 ); - return $moderated_value; + return $column_headers; } 1; diff --git a/Koha/REST/V1/ERM/EHoldings/Titles/Local.pm b/Koha/REST/V1/ERM/EHoldings/Titles/Local.pm index 7587eaa84f..64ae0d45d1 100644 --- a/Koha/REST/V1/ERM/EHoldings/Titles/Local.pm +++ b/Koha/REST/V1/ERM/EHoldings/Titles/Local.pm @@ -27,6 +27,7 @@ use Scalar::Util qw( blessed ); use Try::Tiny qw( catch try ); use MIME::Base64 qw( decode_base64 encode_base64 ); use POSIX qw( floor ); +use Text::CSV_XS; =head1 API @@ -272,57 +273,67 @@ sub import_from_kbart_file { my $c = shift or return; my $import_data = $c->req->json; - my $file = $import_data->{file}; - my $package_id = $import_data->{package_id}; + my $file = $import_data->{file}; + my $package_id = $import_data->{package_id}; return try { my @job_ids; my @invalid_columns; my $max_allowed_packet = C4::Context->dbh->selectrow_array(q{SELECT @@max_allowed_packet}); - my $file_content = defined( $file->{file_content} ) ? decode_base64( $file->{file_content} ) : ""; - $file_content =~ s/\n/\r/g; - my @lines = split /\r/, $file_content; - my @column_headers = split /\t/, $lines[0]; - shift @lines; # Remove headers row - my @remove_null_lines = grep $_ ne '', @lines; + + # Check if file is in TSV or CSV format and send an error back if not + if ( $file->{filename} !~ /\.csv$/ && $file->{filename} !~ /\.tsv$/ ) { + return $c->render( + status => 201, + openapi => { invalid_filetype => 1 } + ); + } + + my ( $column_headers, $rows ) = Koha::BackgroundJob::ImportKBARTFile::read_file($file); # Check that the column headers in the file match the standardised KBART phase II columns - # If not, return a warning before the job is queued + # If not, return a warning my @valid_headers = Koha::BackgroundJob::ImportKBARTFile::get_valid_headers(); - foreach my $header (@column_headers) { + foreach my $header (@$column_headers) { if ( !grep { $_ eq $header } @valid_headers ) { + $header = 'Empty column - please remove' if $header eq ''; push @invalid_columns, $header; } } return $c->render( status => 201, - openapi => { invalid_columns => \@invalid_columns, valid_columns => \@valid_headers } + openapi => { invalid_columns => \@invalid_columns, valid_columns => \@valid_headers, invalid_filetype => 0 } ) if scalar(@invalid_columns) > 0; - my $file_size = length($file_content); + my $params = { + column_headers => $column_headers, + rows => $rows, + package_id => $package_id, + file_name => $file->{filename} + }; + my $outcome = Koha::BackgroundJob::ImportKBARTFile::is_file_too_large( $params, $max_allowed_packet ); # If the file is too large, we can break the file into smaller chunks and enqueue one job per chunk - if ( $file_size > $max_allowed_packet ) { - - my $max_number_of_lines = Koha::BackgroundJob::ImportKBARTFile::calculate_chunked_file_size( - $file_size, $max_allowed_packet, - scalar(@remove_null_lines) + if ( $outcome->{file_too_large} ) { + my $max_number_of_rows = Koha::BackgroundJob::ImportKBARTFile::calculate_chunked_params_size( + $outcome->{params_size}, $max_allowed_packet, + scalar(@$rows) ); - my @chunked_files; - push @chunked_files, [ splice @remove_null_lines, 0, $max_number_of_lines ] while @remove_null_lines; + my @chunked_files; + push @chunked_files, [ splice @$rows, 0, $max_number_of_rows ] while 
@$rows; foreach my $chunk (@chunked_files) { - unshift( @{$chunk}, join( "\t", @column_headers ) ); - my $chunked_file = { - filename => $file->{filename}, - file_content => encode_base64( join( "\r", @{$chunk} ) ) + my $params = { + column_headers => $column_headers, + rows => $chunk, + package_id => $package_id, + file_name => $file->{filename} }; - my $params = { file => $chunked_file, package_id => $package_id }; - my $job_id = Koha::BackgroundJob::ImportKBARTFile->new->enqueue($params); - push @job_ids, $job_id; + + my $chunked_job_id = Koha::BackgroundJob::ImportKBARTFile->new->enqueue($params); + push @job_ids, $chunked_job_id; } } else { - my $params = { file => $file, package_id => $package_id }; my $job_id = Koha::BackgroundJob::ImportKBARTFile->new->enqueue($params); push @job_ids, $job_id; } @@ -336,4 +347,5 @@ sub import_from_kbart_file { }; } + 1; diff --git a/api/v1/swagger/paths/erm_eholdings_titles.yaml b/api/v1/swagger/paths/erm_eholdings_titles.yaml index 6959d39237..f2fdef93c5 100644 --- a/api/v1/swagger/paths/erm_eholdings_titles.yaml +++ b/api/v1/swagger/paths/erm_eholdings_titles.yaml @@ -529,6 +529,8 @@ type: array valid_columns: type: array + invalid_filetype: + type: integer additionalProperties: false 400: description: Bad parameter diff --git a/koha-tmpl/intranet-tmpl/prog/en/includes/background_jobs/import_from_kbart_file.inc b/koha-tmpl/intranet-tmpl/prog/en/includes/background_jobs/import_from_kbart_file.inc index 7f535b8050..fe1bee7ace 100644 --- a/koha-tmpl/intranet-tmpl/prog/en/includes/background_jobs/import_from_kbart_file.inc +++ b/koha-tmpl/intranet-tmpl/prog/en/includes/background_jobs/import_from_kbart_file.inc @@ -12,7 +12,7 @@ Total lines processed - [% report.total_lines | html %] + [% report.total_rows | html %] Titles imported diff --git a/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesKBARTImport.vue b/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesKBARTImport.vue index afc545f132..055c4e6ae8 100644 --- a/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesKBARTImport.vue +++ b/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesKBARTImport.vue @@ -2,7 +2,22 @@
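A worked example of the chunking above, using the same figures as the unit tests further down: with a params size of 500000 bytes for 50000 rows, the average row costs 10 bytes; 75% of a 100000-byte max_allowed_packet is 75000 bytes, so each chunk may carry floor(75000 / 10) = 7500 rows. The standalone sketch below repeats that arithmetic and the splice-based splitting used before enqueueing one job per chunk (the row data is a stand-in, not real KBART content):

    use Modern::Perl;
    use POSIX qw( floor );

    my ( $params_size, $max_allowed_packet, $number_of_rows ) = ( 500_000, 100_000, 50_000 );
    my $average_row_size = $params_size / $number_of_rows;                               # 10
    my $rows_per_chunk   = floor( ( $max_allowed_packet * 0.75 ) / $average_row_size );  # 7500

    my @rows = map { ["Title $_"] } 1 .. 20_000;    # stand-in for parsed KBART rows
    my @chunks;
    push @chunks, [ splice @rows, 0, $rows_per_chunk ] while @rows;

    say scalar(@chunks) . " chunk(s) of up to $rows_per_chunk rows";    # 3 chunk(s) of up to 7500 rows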

{{ $__("Import from a KBART file") }}

- +

{{ $__("Requirements:") }}

+
    +
  • {{ $__("The file must be in TSV or CSV format") }}
  • +
  • + {{ + $__( + "The file should not contain any additional information / header rows, e.g. a file with a single title would be structured as follows:" + ) + }} +
      +
    1. Column headings row
    2. +
    3. Title data row
    4. +
    +
  • +
+

{{ $__("File") }}:

{{ $__("Select a file") }} @@ -44,7 +59,7 @@
- {{ + {{ $__("Clear form") }}
@@ -104,7 +119,7 @@ export default { let message = "" if (success.job_ids) { if (success.job_ids.length > 1) { - message += this.__( + message += this.$__( "
  • Your file was too large to process in one job, so it has been split into %s jobs to meet the maximum size limits.
  • " ).format(success.job_ids.length) } @@ -116,28 +131,31 @@ export default { setMessage(message, true) } if (success.invalid_columns) { - message += + message += this.$__( "

    Invalid columns were detected in your report. Please check the list below:

    " + ) success.invalid_columns.forEach(column => { message += this.$__( `
  • %s
  • ` ).format(column) }) - message += - '

    Below is a list of columns allowed in a KBART phase II report:

    ' - success.valid_columns.forEach(column => { - message += this.$__( - `
  • %s
  • ` - ).format(column) - }) + message += this.$__( + '

    For a list of compliant column headers, please click here

    ' + ) + setWarning(message) + } + if (success.invalid_filetype) { + message += this.$__( + "

    The file must be in .tsv or .csv format. Please convert your file and try again.

    " + ) setWarning(message) } }, error => {} ) + this.clearForm() }, - clearForm(e) { - e.preventDefault() + clearForm() { this.file = { filename: null, file_type: null, diff --git a/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesList.vue b/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesList.vue index 3c261bc897..5785856ac8 100644 --- a/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesList.vue +++ b/koha-tmpl/intranet-tmpl/prog/js/vue/components/ERM/EHoldingsLocalTitlesList.vue @@ -13,6 +13,11 @@ icon="plus" :title="$__('Import from list')" /> +
    - {{ title.title_url }} + {{ + title.title_url + }}
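The import component above branches on three response shapes from the import_from_kbart_file endpoint. A sketch of those shapes, with the field names taken from the controller and swagger changes earlier in this patch and the values purely illustrative:

    use Modern::Perl;

    # 1. File accepted: one background job id per chunk.
    my $accepted = { job_ids => [ 123, 124 ] };

    # 2. Wrong extension (neither .tsv nor .csv): rejected before any parsing.
    my $wrong_filetype = { invalid_filetype => 1 };

    # 3. Unrecognised column headings: rejected before any job is queued.
    #    'test_column' matches the example used in the test plan.
    my $wrong_columns = {
        invalid_filetype => 0,
        invalid_columns  => ['test_column'],
        valid_columns    => [ 'publication_title', 'print_identifier', '...' ],
    };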
  • diff --git a/t/db_dependent/Koha/BackgroundJob/ImportKBARTFile.t b/t/db_dependent/Koha/BackgroundJob/ImportKBARTFile.t index b8c7ec6a5b..76b5618def 100755 --- a/t/db_dependent/Koha/BackgroundJob/ImportKBARTFile.t +++ b/t/db_dependent/Koha/BackgroundJob/ImportKBARTFile.t @@ -45,16 +45,16 @@ subtest 'enqueue' => sub { $schema->storage->txn_rollback; }; -subtest 'calculate_chunked_file_size' => sub { +subtest 'calculate_chunked_params_size' => sub { plan tests => 2; my $max_number_of_lines = - Koha::BackgroundJob::ImportKBARTFile::calculate_chunked_file_size( 500000, 100000, 50000 ); - is( $max_number_of_lines, 9000, 'Number of lines calculated correctly' ); + Koha::BackgroundJob::ImportKBARTFile::calculate_chunked_params_size( 500000, 100000, 50000 ); + is( $max_number_of_lines, 7500, 'Number of lines calculated correctly' ); my $max_number_of_lines2 = - Koha::BackgroundJob::ImportKBARTFile::calculate_chunked_file_size( 400000, 100000, 60000 ); - is( $max_number_of_lines2, 13500, 'Number of lines calculated correctly' ); + Koha::BackgroundJob::ImportKBARTFile::calculate_chunked_params_size( 400000, 100000, 60000 ); + is( $max_number_of_lines2, 11250, 'Number of lines calculated correctly' ); }; subtest 'format_title' => sub { @@ -74,12 +74,12 @@ subtest 'format_title' => sub { is( $title->{coverage_notes}, undef, 'coverage_notes has been deleted' ); }; -subtest 'format_file' => sub { +subtest 'read_file' => sub { plan tests => 6; my $file = { - filename => 'Test_file.txt', + filename => 'Test_file.tsv', file_content => encode_base64( 'publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name publication_type date_monograph_published_print date_monograph_published_online monograph_volume monograph_edition first_editor parent_publication_title_id preceding_publication_title_id access_type Nature Plants 2055-0278 2015-01 1 1 https://www.nature.com/nplants 4aaa7 fulltext Hybrid (Open Choice) Nature Publishing Group UK serial P @@ -87,21 +87,28 @@ Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bb ) }; - my ( $column_headers, $lines ) = Koha::BackgroundJob::ImportKBARTFile::format_file($file); + my ( $column_headers, $lines, $error ) = Koha::BackgroundJob::ImportKBARTFile::read_file($file); is( @{$column_headers}, 25, '25 column headers found' ); is( @{$column_headers}[0], 'publication_title', 'First header correctly extracted' ); is( @{$column_headers}[10], 'first_author', 'Tenth header correctly extracted' ); - - is( @{$lines}, 2, 'Two lines need processing' ); - is( + is( @{$lines}, 2, 'Two lines need processing' ); + is_deeply( @{$lines}[0], - 'Nature Plants 2055-0278 2015-01 1 1 https://www.nature.com/nplants 4aaa7 fulltext Hybrid (Open Choice) Nature Publishing Group UK serial P', + [ + 'Nature Plants', '', '2055-0278', '2015-01', '1', '1', '', '', '', 'https://www.nature.com/nplants', '', + '4aaa7', '', 'fulltext', 'Hybrid (Open Choice)', 'Nature Publishing Group UK', 'serial', '', '', '', '', + '', '', '', 'P' + ], 'Line correctly identified' ); - is( + is_deeply( @{$lines}[1], - 'Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bbb0 fulltext Hybrid (Open Choice) Nature Publishing Group UK serial P', + [ + 'Nature Astronomy', '', '2397-3366', '2017-01', '1', '1', '', '', '', 'https://www.nature.com/natastron', + '', 
'4bbb0', '', 'fulltext', 'Hybrid (Open Choice)', 'Nature Publishing Group UK', 'serial', '', '', '', + '', '', '', '', 'P' + ], 'Line correctly identified' ); @@ -112,7 +119,7 @@ subtest 'create_title_hash_from_line_data' => sub { plan tests => 2; my $file = { - filename => 'Test_file.txt', + filename => 'Test_file.tsv', file_content => encode_base64( 'publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name publication_type date_monograph_published_print date_monograph_published_online monograph_volume monograph_edition first_editor parent_publication_title_id preceding_publication_title_id access_type Nature Plants 2055-0278 2015-01 1 1 https://www.nature.com/nplants 4aaa7 fulltext Hybrid (Open Choice) Nature Publishing Group UK serial P @@ -120,7 +127,7 @@ Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bb ) }; - my ( $column_headers, $lines ) = Koha::BackgroundJob::ImportKBARTFile::format_file($file); + my ( $column_headers, $lines ) = Koha::BackgroundJob::ImportKBARTFile::read_file($file); my $title_from_line1 = Koha::BackgroundJob::ImportKBARTFile::create_title_hash_from_line_data( @{$lines}[0], $column_headers ); @@ -191,8 +198,16 @@ subtest 'process' => sub { $schema->storage->txn_begin; + Koha::ERM::EHoldings::Packages->search->delete; + my $ehpackage = $builder->build_object( + { + class => 'Koha::ERM::EHoldings::Packages', + value => { external_id => undef } + } + ); + my $file = { - filename => 'Test_file.txt', + filename => 'Test_file.tsv', file_content => encode_base64( 'publication_title print_identifier online_identifier date_first_issue_online num_first_vol_online num_first_issue_online date_last_issue_online num_last_vol_online num_last_issue_online title_url first_author title_id embargo_info coverage_depth coverage_notes publisher_name publication_type date_monograph_published_print date_monograph_published_online monograph_volume monograph_edition first_editor parent_publication_title_id preceding_publication_title_id access_type Nature Plants 2055-0278 2015-01 1 1 https://www.nature.com/nplants 4aaa7 fulltext Hybrid (Open Choice) Nature Publishing Group UK serial P @@ -200,6 +215,14 @@ Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bb ) }; + my ( $column_headers, $rows, $error ) = Koha::BackgroundJob::ImportKBARTFile::read_file($file); + my $data = { + column_headers => $column_headers, + rows => $rows, + package_id => $ehpackage->package_id, + file_name => $file->{filename} + }; + my $job = Koha::BackgroundJob::ImportKBARTFile->new( { status => 'new', @@ -208,7 +231,6 @@ Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bb } )->store; $job = Koha::BackgroundJobs->find( $job->id ); - my $data = { file => $file }; my $json = $job->json->encode($data); $job->data($json)->store; $job->process($data); @@ -251,12 +273,18 @@ Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bb $module->mock( 'create_title_hash_from_line_data', sub { - my ( $line, $column_headers ) = @_; + my ( $row, $column_headers ) = @_; my %new_title; - my @values = split /\t/, $line; - @new_title{ @{$column_headers} } = @values; + @new_title{ @{$column_headers} } = @$row; + + # If the file has been converted from CSV to TSV for import, then some titles containing commas will be enclosed in "" 
+ my $first_char = substr( $new_title{publication_title}, 0, 1 ); + my $last_char = substr( $new_title{publication_title}, -1 ); + if ( $first_char eq '"' && $last_char eq '"' ) { + $new_title{publication_title} =~ s/^"|"$//g; + } $new_title{title_id} = '12345' if $new_title{publication_title} eq 'Nature Plants'; $new_title{publication_title} = '' if $new_title{publication_title} eq 'Nature Plants'; @@ -302,12 +330,18 @@ Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bb $module->mock( 'create_title_hash_from_line_data', sub { - my ( $line, $column_headers ) = @_; + my ( $row, $column_headers ) = @_; my %new_title; - my @values = split /\t/, $line; - @new_title{ @{$column_headers} } = @values; + @new_title{ @{$column_headers} } = @$row; + + # If the file has been converted from CSV to TSV for import, then some titles containing commas will be enclosed in "" + my $first_char = substr( $new_title{publication_title}, 0, 1 ); + my $last_char = substr( $new_title{publication_title}, -1 ); + if ( $first_char eq '"' && $last_char eq '"' ) { + $new_title{publication_title} =~ s/^"|"$//g; + } $new_title{title_id} = 'abcde' if $new_title{publication_title} eq 'Nature Plants'; $new_title{unknown_field} = '' if $new_title{publication_title} eq 'Nature Plants'; @@ -334,8 +368,8 @@ Nature Astronomy 2397-3366 2017-01 1 1 https://www.nature.com/natastron 4bb $job4->messages, [ { - 'type' => 'error', - 'title' => 'Nature Plants', + 'type' => 'error', + 'title' => 'Nature Plants', 'error_message' => 'DBIx::Class::Row::store_column(): No such column \'unknown_field\' on Koha::Schema::Result::ErmEholdingsTitle at /kohadevbox/koha/Koha/Object.pm line 79 ', -- 2.39.5
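To exercise the new read_file behaviour in isolation, outside the full prove run in the test plan, something along the lines of the sketch below can be run from a Koha shell. It assumes a development environment where Koha::BackgroundJob::ImportKBARTFile is loadable; Text::CSV_XS is loaded explicitly here since read_file relies on it:

    use Modern::Perl;
    use MIME::Base64 qw( encode_base64 );
    use Text::CSV_XS;
    use Koha::BackgroundJob::ImportKBARTFile;

    my $file = {
        filename     => 'sample.csv',
        file_content => encode_base64("publication_title,title_id,access_type\nExample Journal,ex001,P\n"),
    };

    my ( $column_headers, $rows, $error ) = Koha::BackgroundJob::ImportKBARTFile::read_file($file);
    die "Parse error: $error" if $error;

    say scalar(@$column_headers) . ' column(s), ' . scalar(@$rows) . ' row(s)';    # 3 column(s), 1 row(s)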