2 # small script that import an iso2709 file into koha 2.0
8 use MARC::File::USMARC;
9 # Uncomment the line below and use MARC::File::XML again when it works better.
11 # use MARC::File::XML;
16 # According to kados, an undocumented feature of setting MARC::Charset to
17 # ignore_errors(1) is that errors are not ignored. Instead of deleting the
18 # whole subfield when a character does not translate properly from MARC8 into
19 # UTF-8, just the problem characters are deleted. This should solve at least
20 # some of the fixme problems for fMARC8ToUTF8().
22 # Problems remain if there are MARC 21 records where 000/09 is set incorrectly.
24 MARC::Charset->ignore_errors(1);
28 use Time::HiRes qw(gettimeofday);
30 binmode(STDOUT, ":utf8");
32 my ( $input_marc_file, $number) = ('',0);
33 my ($version, $delete, $test_parameter,$marcFlavour, $verbose);
36 'file:s' => \$input_marc_file,
40 't' => \$test_parameter,
41 'c:s' => \$marcFlavour,
45 # FIXME: Management of error conditions needed for record parsing problems
46 # and MARC8 character sets with mappings to Unicode not yet included in
47 # MARC::Charset. The real world rarity of these problems is not fully tested.
48 # Unmapped character sets will throw a warning currently and processing will
49 # continue with the error condition. A fairly trivial correction should
50 # address some record parsing and unmapped character set problems but I need
51 # time to implement a test and correction for undef subfields and revert to
52 # MARC8 if mappings are missing. -- thd
53 sub fMARC8ToUTF8($$) {
55 my ($verbose) = shift;
58 my $leader = $record->leader();
60 print "\n000 " . $leader;
63 foreach my $field ($record->fields()) {
64 if ($field->is_control_field()) {
67 my $fieldName = $field->tag();
68 my $fieldValue = $field->data();
69 $fieldValue =~ s/ /#/g;
70 print "\n" . $fieldName;
71 print ' ' . $fieldValue;
76 my $fieldName = $field->tag();
77 my $indicator1Value = $field->indicator(1);
78 my $indicator2Value = $field->indicator(2);
81 $indicator1Value =~ s/ /#/;
82 $indicator2Value =~ s/ /#/;
83 print "\n" . $fieldName . ' ' .
88 foreach my $subfield ($field->subfields()) {
89 my $subfieldName = $subfield->[0];
90 my $subfieldValue = $subfield->[1];
91 $subfieldValue = MARC::Charset::marc8_to_utf8($subfieldValue);
93 # Alas, MARC::Field::update() does not work correctly.
94 ## push (@subfieldsArray, $subfieldName, $subfieldValue);
96 push @subfieldsArray, [$subfieldName, $subfieldValue];
99 print " \$" . $subfieldName . ' ' . $subfieldValue;
104 # Alas, MARC::Field::update() does not work correctly.
106 # The first instance in the field of a of a repeated subfield
107 # overwrites the content from later instances with the content
108 # from the first instance.
109 ## $field->update(@subfieldsArray);
111 foreach my $subfieldRow(@subfieldsArray) {
112 my $subfieldName = $subfieldRow->[0];
113 $field->delete_subfields($subfieldName);
115 foreach my $subfieldRow(@subfieldsArray) {
116 $field->add_subfields(@$subfieldRow);
121 # Reading the indicator values again is not necessary.
122 # They were not converted.
123 # $indicator1Value = $field->indicator(1);
124 # $indicator2Value = $field->indicator(2);
125 # $indicator1Value =~ s/ /#/;
126 # $indicator2Value =~ s/ /#/;
127 print "\nCONVERTED TO UTF-8:\n" . $fieldName . ' ' .
130 foreach my $subfield ($field->subfields()) {
131 my $subfieldName = $subfield->[0];
132 my $subfieldValue = $subfield->[1];
133 print " \$" . $subfieldName . ' ' . $subfieldValue;
139 print "\n" if $verbose;
144 $record->encoding('UTF-8');
149 if ($version || ($input_marc_file eq '')) {
151 small script to import an iso2709 file into Koha.
153 \th : this version/help screen
154 \tfile /path/to/file/to/dump : the file to dump
155 \tv : verbose mode. 1 means "some infos", 2 means "MARC dumping"
156 \tn : the number of the record to import. If missing, all the file is imported
157 \tt : test mode : parses the file, saying what he would do, but doing nothing.
158 \tc : the characteristic MARC flavour. At the moment, only MARC21 and UNIMARC
159 \tsupported. MARC21 by default.
160 \td : delete EVERYTHING related to biblio in koha-DB before import :tables :
161 \t\tbiblio, \t\tbiblioitems, \t\tsubjects,\titems
162 \t\tadditionalauthors, \tbibliosubtitles, \tmarc_biblio,
163 \t\tmarc_subfield_table, \tmarc_word, \t\tmarc_blob_subfield
164 IMPORTANT : don't use this script before you've entered and checked twice (or more) your MARC parameters tables.
165 If you fail this, the import won't work correctly and you will get invalid datas.
167 SAMPLE : ./bulkmarcimport.pl -file /home/paul/koha.dev/local/npl -n 1
173 my $dbh = C4::Context->dbh;
176 print "deleting biblios\n";
177 $dbh->do("delete from biblio");
178 $dbh->do("delete from biblioitems");
179 $dbh->do("delete from items");
180 $dbh->do("delete from bibliosubject");
181 $dbh->do("delete from additionalauthors");
182 $dbh->do("delete from bibliosubtitle");
183 $dbh->do("delete from marc_biblio");
184 $dbh->do("delete from marc_subfield_table");
185 $dbh->do("delete from marc_word");
186 $dbh->do("delete from marc_blob_subfield");
188 if ($test_parameter) {
189 print "TESTING MODE ONLY\n DOING NOTHING\n===============\n";
192 $marcFlavour = 'MARC21' unless ($marcFlavour);
193 print "Characteristic MARC flavour: $marcFlavour\n" if $verbose;
194 my $starttime = gettimeofday;
195 my $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
196 $batch->warnings_off();
197 $batch->strict_off();
199 #1st of all, find item MARC tag.
200 my ($tagfield,$tagsubfield) = &MARCfind_marc_from_kohafield($dbh,"items.itemnumber",'');
201 # $dbh->do("lock tables biblio write, biblioitems write, items write, marc_biblio write, marc_subfield_table write, marc_blob_subfield write, marc_word write, marc_subfield_structure write, stopwords write");
202 while ( my $record = $batch->next() ) {
204 #FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and
205 # then back again just to fix the encoding
207 # It is even sillier when the conversion too frequently produces errors
208 # instead of fixing the encoding. Hence, the following MARC::File::XML
209 # lines are now commented out until character set conversion in XML
210 # works better. -- thd
211 ## my $uxml = $record->as_xml;
212 ## $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
214 # Check record encoding and convert encoding if necessary.
216 if ($marcFlavour eq 'MARC21') {
218 if ($record->encoding() eq 'UTF-8') {
220 print "\nRecord $i encoding is UTF-8\n";
221 $tag000_pos09 = substr ($record->leader, 9, 1);
222 $tag000_pos09 =~ s/ /#/;
223 print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
225 } elsif ($record->encoding() eq 'MARC-8') {
226 print "\nConverting record $i encoding from MARC8 to UTF-8\n";
227 # Convert MARC-8 to UTF-8
228 $record = fMARC8ToUTF8($record, $verbose);
230 print "\nRecord $i encoding has been converted to UTF-8\n";
231 $tag000_pos09 = substr ($record->leader, 9, 1);
232 $tag000_pos09 =~ s/ /#/;
233 print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
236 } elsif ($marcFlavour eq 'UNIMARC') {
237 # I have not developed a UNIMARC character encoding conversion script
238 # yet. Common encodings should be easy. Less comon and multiple
239 # encodings will need extra work. I am happy to work on this if there
240 # is some interest. -- thd
243 #now, parse the record, extract the item fields, and store them in somewhere else.
245 ## create an empty record object to populate
246 my $newRecord = MARC::Record->new();
247 $newRecord->leader($record->leader());
249 # go through each field in the existing record
250 foreach my $oldField ( $record->fields() ) {
252 # just reproduce tags < 010 in our new record
254 # Fields are not necessarily only numeric in the actual world of records
255 # nor in what I would recommend for additonal safe non-interfering local
256 # use fields. The following regular expression match is much safer than
257 # a numeric evaluation. -- thd
258 if ( $oldField->tag() =~ m/^00/ ) {
259 $newRecord->append_fields( $oldField );
263 # store our new subfield data in this list
264 my @newSubfields = ();
266 # go through each subfield code/data pair
267 foreach my $pair ( $oldField->subfields() ) {
268 $pair->[1] =~ s/\<//g;
269 $pair->[1] =~ s/\>//g;
270 push( @newSubfields, $pair->[0], char_decode($pair->[1],$marcFlavour) );
273 # add the new field to our new record
274 my $newField = MARC::Field->new(
276 $oldField->indicator(1),
277 $oldField->indicator(2),
281 $newRecord->append_fields( $newField );
287 warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
289 my @fields = $newRecord->field($tagfield);
293 foreach my $field (@fields) {
294 my $item = MARC::Record->new();
295 $item->append_fields($field);
297 $newRecord->delete_field($field);
300 print "$i : $nbitems items found\n" if $verbose;
301 # now, create biblio and items with NEWnewXX call.
302 unless ($test_parameter) {
303 my ($bibid,$oldbibnum,$oldbibitemnum) = NEWnewbiblio($dbh,$newRecord,'');
304 warn "ADDED biblio NB $bibid in DB\n" if $verbose;
305 for (my $i=0;$i<=$#items;$i++) {
306 NEWnewitem($dbh,$items[$i],$bibid);
310 # $dbh->do("unlock tables");
311 my $timeneeded = gettimeofday - $starttime;
312 print "$i MARC record done in $timeneeded seconds";