2 # Import an iso2709 file into Koha 3
7 # find Koha's Perl modules
8 # test carefully before changing this
10 eval { require "$FindBin::Bin/../kohalib.pl" };
14 use MARC::File::USMARC;
15 # Uncomment the line below and use MARC::File::XML again when it works better.
17 # use MARC::File::XML;
22 # According to kados, an undocumented feature of setting MARC::Charset to
23 # ignore_errors(1) is that errors are not ignored. Instead of deleting the
24 # whole subfield when a character does not translate properly from MARC8 into
25 # UTF-8, just the problem characters are deleted. This should solve at least
26 # some of the fixme problems for fMARC8ToUTF8().
28 # Problems remain if there are MARC 21 records where 000/09 is set incorrectly.
30 # MARC::Charset->ignore_errors(1);
35 use Unicode::Normalize;
36 use Time::HiRes qw(gettimeofday);
38 binmode(STDOUT, ":utf8");
42 my ( $input_marc_file, $number) = ('',0);
43 my ($version, $delete, $test_parameter, $skip_marc8_conversion, $char_encoding, $verbose, $commit, $fk_off,$format);
48 'commit:f' => \$commit,
49 'file:s' => \$input_marc_file,
53 't' => \$test_parameter,
54 's' => \$skip_marc8_conversion,
55 'c:s' => \$char_encoding,
61 # FIXME: Management of error conditions needed for record parsing problems
62 # and MARC8 character sets with mappings to Unicode not yet included in
63 # MARC::Charset. The real world rarity of these problems is not fully tested.
64 # Unmapped character sets will throw a warning currently and processing will
65 # continue with the error condition. A fairly trivial correction should
66 # address some record parsing and unmapped character set problems but I need
67 # time to implement a test and correction for undef subfields and revert to
68 # MARC8 if mappings are missing. -- thd
69 sub fMARC8ToUTF8($$) {
71 my ($verbose) = shift;
73 foreach my $field ($record->fields()) {
74 if ($field->is_control_field()) {
75 ; # do nothing -- control fields should not contain non-ASCII characters
78 my $fieldName = $field->tag();
79 my $indicator1Value = $field->indicator(1);
80 my $indicator2Value = $field->indicator(2);
81 foreach my $subfield ($field->subfields()) {
82 my $subfieldName = $subfield->[0];
83 my $subfieldValue = $subfield->[1];
84 my $utf8sf = MARC::Charset::marc8_to_utf8($subfieldValue);
85 unless (defined $utf8sf) {
86 # For now, we're being very strict about
87 # error during the MARC8 conversion, so return
88 # if there's a problem.
91 $subfieldValue = NFC($utf8sf); # Normalization Form C to assist
92 # some browswers (e.g., Firefox on OS X)
93 # that have issues with decomposed characters
96 # Alas, MARC::Field::update() does not work correctly.
97 ## push (@subfieldsArray, $subfieldName, $subfieldValue);
99 push @subfieldsArray, [$subfieldName, $subfieldValue];
102 # Alas, MARC::Field::update() does not work correctly.
104 # The first instance in the field of a of a repeated subfield
105 # overwrites the content from later instances with the content
106 # from the first instance.
107 ## $field->update(@subfieldsArray);
109 foreach my $subfieldRow(@subfieldsArray) {
110 my $subfieldName = $subfieldRow->[0];
111 $field->delete_subfields($subfieldName);
113 foreach my $subfieldRow(@subfieldsArray) {
114 $field->add_subfields(@$subfieldRow);
119 # Reading the indicator values again is not necessary.
120 # They were not converted.
121 # $indicator1Value = $field->indicator(1);
122 # $indicator2Value = $field->indicator(2);
123 # $indicator1Value =~ s/ /#/;
124 # $indicator2Value =~ s/ /#/;
125 print "\nCONVERTED TO UTF-8:\n" . $fieldName . ' ' .
128 foreach my $subfield ($field->subfields()) {
129 my $subfieldName = $subfield->[0];
130 my $subfieldValue = $subfield->[1];
131 print " \$" . $subfieldName . ' ' . $subfieldValue;
137 print "\n" if $verbose;
143 # must set Leader/09 to 'a' to indicate that
144 # record is now in UTF-8
145 my $leader = $record->leader();
146 substr($leader, 9, 1) = 'a';
147 $record->leader($leader);
149 $record->encoding('UTF-8');
154 if ($version || ($input_marc_file eq '')) {
156 small script to import an iso2709 file into Koha.
158 \th : this version/help screen
159 \tfile /path/to/file/to/dump : the file to import
160 \tv : verbose mode. 1 means "some infos", 2 means "MARC dumping"
161 \tfk : Turn off foreign key checks during import.
162 \tn : the number of records to import. If missing, all the file is imported
163 \tcommit : the number of records to wait before performing a 'commit' operation
164 \tt : test mode : parses the file, saying what he would do, but doing nothing.
165 \ts : skip automatic conversion of MARC-8 to UTF-8. This option is
166 \t provided for debugging.
167 \tc : the characteristic MARC flavour. At the moment, only MARC21 and UNIMARC
168 \tsupported. MARC21 by default.
169 \td : delete EVERYTHING related to biblio in koha-DB before import :tables :
170 \t\tbiblio, \tbiblioitems,\titems
171 IMPORTANT : don't use this script before you've entered and checked your MARC parameters tables twice (or more!).
172 Otherwise, the import won't work correctly and you will get invalid data.
175 \t\$ export KOHA_CONF=/etc/koha.conf
176 \t\$ perl misc/migration_tools/bulkmarcimport.pl -d -commit 1000 -file /home/jmf/koha.mrc -n 3000
182 my $dbh = C4::Context->dbh;
184 # save the CataloguingLog property : we don't want to log a bulkmarcimport. It will slow the import &
185 # will create problems in the action_logs table, that can't handle more than 1 entry per second per user.
186 my $CataloguingLog = C4::Context->preference('CataloguingLog');
187 $dbh->do("UPDATE systempreferences SET value=0 WHERE variable='CataloguingLog'");
190 print "deleting biblios\n";
191 $dbh->do("truncate biblio");
192 $dbh->do("truncate biblioitems");
193 $dbh->do("truncate items");
194 $dbh->do("truncate zebraqueue");
197 $dbh->do("SET FOREIGN_KEY_CHECKS = 0");
199 if ($test_parameter) {
200 print "TESTING MODE ONLY\n DOING NOTHING\n===============\n";
203 my $marcFlavour = C4::Context->preference('marcflavour') || 'MARC21';
205 print "Characteristic MARC flavour: $marcFlavour\n" if $verbose;
206 my $starttime = gettimeofday;
208 if ($format =~ /XML/i) {
209 $batch = MARC::Batch->new( 'XML', $input_marc_file );
211 $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
213 $batch->warnings_off();
214 $batch->strict_off();
220 $commitnum = $commit;
224 $dbh->{AutoCommit} = 0;
225 RECORD: while ( my $record = $batch->next() ) {
228 print "\r$i" unless $i % 100;
230 if ($record->encoding() eq 'MARC-8' and not $skip_marc8_conversion) {
231 unless (fMARC8ToUTF8($record, $verbose)) {
232 warn "ERROR: failed to perform character conversion for record $i\n";
237 unless ($test_parameter) {
238 my ( $biblionumber, $biblioitemnumber, $itemnumbers_ref, $errors_ref );
239 eval { ( $biblionumber, $biblioitemnumber ) = AddBiblio($record, '', { defer_marc_save => 1 }) };
241 warn "ERROR: Adding biblio $biblionumber failed: $@\n";
244 eval { ( $itemnumbers_ref, $errors_ref ) = AddItemBatchFromMarc( $record, $biblionumber, $biblioitemnumber, '' ); };
246 warn "ERROR: Adding items to bib $biblionumber failed: $@\n";
247 # if we failed because of an exception, assume that
248 # the MARC columns in biblioitems were not set.
249 ModBiblioMarc( $record, $biblionumber, '' );
252 if ($#{ $errors_ref } > -1) {
253 report_item_errors($biblionumber, $errors_ref);
256 $dbh->commit() if (0 == $i % $commitnum);
258 last if $i == $number;
264 $dbh->do("SET FOREIGN_KEY_CHECKS = 1");
267 # restore CataloguingLog
268 $dbh->do("UPDATE systempreferences SET value=$CataloguingLog WHERE variable='CataloguingLog'");
270 my $timeneeded = gettimeofday - $starttime;
271 print "\n$i MARC records done in $timeneeded seconds\n";
275 sub report_item_errors {
276 my $biblionumber = shift;
277 my $errors_ref = shift;
279 foreach my $error (@{ $errors_ref }) {
280 my $msg = "Item not added (bib $biblionumber, item tag #$error->{'item_sequence'}, barcode $error->{'item_barcode'}): ";
281 my $error_code = $error->{'error_code'};
282 $error_code =~ s/_/ /g;
283 $msg .= "$error_code $error->{'error_information'}";