For MARC 21, instead of deleting the whole subfield when a character does not

translate properly from MARC8 into UTF-8, only the problem characters are deleted.
2006-09-01 17:11:53 +00:00 · 2006-09-01 17:11:53 +00:00 · ad657e71eb
commit ad657e71eb
parent e928ba3e05
1 changed files with 72 additions and 53 deletions
--- a/misc/migration_tools/bulkmarcimport.pl
+++ b/misc/migration_tools/bulkmarcimport.pl
@ -12,25 +12,33 @@ use MARC::File::USMARC;
 use MARC::Record;
 use MARC::Batch;
 use MARC::Charset;
+
+# According to kados, an undocumented feature of setting MARC::Charset to 
+# ignore_errors(1) is that errors are not ignored.  Instead of deleting the 
+# whole subfield when a character does not translate properly from MARC8 into 
+# UTF-8, just the problem characters are deleted.  This should solve at least 
+# some of the fixme problems for fMARC8ToUTF8().
+# 
+# Problems remain if there are MARC 21 records where 000/09 is set incorrectly. 
+# -- thd.
+MARC::Charset->ignore_errors(1);
+
 use C4::Context;
 use C4::Biblio;
 use Time::HiRes qw(gettimeofday);
 use Getopt::Long;
 binmode(STDOUT, ":utf8");

-use Getopt::Long;
-
 my ( $input_marc_file, $number) = ('',0);
-my ($version, $delete, $test_parameter,$char_encoding, $verbose, $commit);
+my ($version, $delete, $test_parameter,$marcFlavour, $verbose);

 GetOptions(
-	'commit:f'	=> \$commit,
 	'file:s'    => \$input_marc_file,
-    'n:f' => \$number,
+	'n' => \$number,
 	'h' => \$version,
 	'd' => \$delete,
 	't' => \$test_parameter,
-    'c:s' => \$char_encoding,
+	'c:s' => \$marcFlavour,
 	'v:s' => \$verbose,
 );

@ -145,8 +153,7 @@ parameters :
 \th : this version/help screen
 \tfile /path/to/file/to/dump : the file to dump
 \tv : verbose mode. 1 means "some infos", 2 means "MARC dumping"
-\tn : the number of records to import. If missing, all the file is imported
-\tcommit : the number of records to wait before performing a 'commit' operation
+\tn : the number of the record to import. If missing, all the file is imported
 \tt : test mode : parses the file, saying what he would do, but doing nothing.
 \tc : the characteristic MARC flavour. At the moment, only MARC21 and UNIMARC 
 \tsupported. MARC21 by default.
@ -154,12 +161,10 @@ parameters :
 \t\tbiblio, \t\tbiblioitems, \t\tsubjects,\titems
 \t\tadditionalauthors, \tbibliosubtitles, \tmarc_biblio,
 \t\tmarc_subfield_table, \tmarc_word, \t\tmarc_blob_subfield
-IMPORTANT : don't use this script before you've entered and checked your MARC parameters tables twice (or more!).
-Otherwise, the import won't work correctly and you will get invalid data.
+IMPORTANT : don't use this script before you've entered and checked twice (or more) your  MARC parameters tables.
+If you fail this, the import won't work correctly and you will get invalid datas.

-SAMPLE : 
-\t\$ export KOHA_CONF=/etc/koha.conf
-\t\$ perl misc/migration_tools/bulkmarcimport.pl -d -commit 1000 -file /home/jmf/koha.mrc -n 3000
+SAMPLE : ./bulkmarcimport.pl -file /home/paul/koha.dev/local/npl -n 1
 EOF
 ;#'
 die;
@ -191,35 +196,50 @@ my $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
 $batch->warnings_off();
 $batch->strict_off();
 my $i=0;
-my $commitnum = 50;
-
-if ($commit) {
-
-$commitnum = $commit;
-
-}
-
 #1st of all, find item MARC tag.
 my ($tagfield,$tagsubfield) = &MARCfind_marc_from_kohafield($dbh,"items.itemnumber",'');
 # $dbh->do("lock tables biblio write, biblioitems write, items write, marc_biblio write, marc_subfield_table write, marc_blob_subfield write, marc_word write, marc_subfield_structure write, stopwords write");
 while ( my $record = $batch->next() ) {
-warn "I:".$i;
-warn "NUM:".$number;
 	$i++;
+#FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and 
+	# then back again just to fix the encoding
+	#
+	# It is even sillier when the conversion too frequently produces errors 
+	# instead of fixing the encoding.  Hence, the following MARC::File::XML 
+	# lines are now commented out until character set conversion in XML 
+	# works better. -- thd
+	## my $uxml = $record->as_xml;
+	## $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
 	
-	if ($i==$number) {
-		z3950_extended_services('commit',set_service_options('commit'));
-		print "COMMIT OPERATION SUCCESSFUL\n";
+	# Check record encoding and convert encoding if necessary.
 	
-		my $timeneeded = gettimeofday - $starttime;
-		die "$i MARC records imported in $timeneeded seconds\n";
+	if ($marcFlavour eq 'MARC21') {
+		my $tag000_pos09;
+		if ($record->encoding() eq 'UTF-8') {
+			if ($verbose) {
+				print "\nRecord $i encoding is UTF-8\n";
+				$tag000_pos09 = substr ($record->leader, 9, 1);
+				$tag000_pos09 =~ s/ /#/;
+				print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
 			}
-	# perform the commit operation ever so often
-	if ($i==$commit) {
-		z3950_extended_services('commit',set_service_options('commit'));
-		$commit+=$commitnum;
-		print "COMMIT OPERATION SUCCESSFUL\n";
+		} elsif ($record->encoding() eq 'MARC-8') {
+			print "\nConverting record $i encoding from MARC8 to UTF-8\n";
+			# Convert MARC-8 to UTF-8
+			$record = fMARC8ToUTF8($record, $verbose);
+			if ($verbose) {
+				print "\nRecord $i encoding has been converted to UTF-8\n";
+				$tag000_pos09 = substr ($record->leader, 9, 1);
+				$tag000_pos09 =~ s/ /#/;
+				print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
 			}
+		}
+	} elsif ($marcFlavour eq 'UNIMARC') {
+		# I have not developed a UNIMARC character encoding conversion script 
+		# yet.  Common encodings should be easy.  Less comon and multiple 
+		# encodings will need extra work.  I am happy to work on this if there 
+		# is some interest. -- thd
+	}
+	
 	#now, parse the record, extract the item fields, and store them in somewhere else.

 	## create an empty record object to populate
@ -245,9 +265,9 @@ warn "NUM:".$number;

 	# go through each subfield code/data pair
 	foreach my $pair ( $oldField->subfields() ) { 
-		#$pair->[1] =~ s/\<//g;
-		#$pair->[1] =~ s/\>//g;
-		push( @newSubfields, $pair->[0], $pair->[1] ); #char_decode($pair->[1],$char_encoding) );
+		$pair->[1] =~ s/\<//g;
+		$pair->[1] =~ s/\>//g;
+		push( @newSubfields, $pair->[0], char_decode($pair->[1],$marcFlavour) );
 	}

 	# add the new field to our new record
@ -262,7 +282,10 @@ warn "NUM:".$number;

 	}

+
+	if ($verbose) {
 		warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+	}
 	my @fields = $newRecord->field($tagfield);
 	my @items;
 	my $nbitems=0;
@ -277,17 +300,13 @@ warn "NUM:".$number;
 	print "$i : $nbitems items found\n" if $verbose;
 	# now, create biblio and items with NEWnewXX call.
 	unless ($test_parameter) {
-		my ($bibid,$oldbibitemnum) = NEWnewbiblio($dbh,$newRecord,'');
+		my ($bibid,$oldbibnum,$oldbibitemnum) = NEWnewbiblio($dbh,$newRecord,'');
 		warn "ADDED biblio NB $bibid in DB\n" if $verbose;
 		for (my $i=0;$i<=$#items;$i++) {
-		    warn "here is the biblioitemnumber $oldbibitemnum";
-			NEWnewitem($dbh,$items[$i],$bibid,$oldbibitemnum);
+			NEWnewitem($dbh,$items[$i],$bibid);
 		}
 	}
 }
-# final commit of the changes
-z3950_extended_services('commit',set_service_options('commit'));
-print "COMMIT OPERATION SUCCESSFUL\n";
-
+# $dbh->do("unlock tables");
 my $timeneeded = gettimeofday - $starttime;
-print "$i MARC records done in $timeneeded seconds\n";
+print "$i MARC record done in $timeneeded seconds";