1 package C4::SimpleMarc;
5 # Routines for handling import of MARC data into Koha db
7 # Koha library project www.koha.org
9 # Licensed under the GPL
12 # Copyright 2000-2002 Katipo Communications
14 # This file is part of Koha.
16 # Koha is free software; you can redistribute it and/or modify it under the
17 # terms of the GNU General Public License as published by the Free Software
18 # Foundation; either version 2 of the License, or (at your option) any later
21 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
22 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
23 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
25 # You should have received a copy of the GNU General Public License along with
26 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
27 # Suite 330, Boston, MA 02111-1307 USA
32 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
34 # set the version for version checking
39 C4::SimpleMarc - Functions for parsing MARC records and files
47 This module provides functions for parsing MARC records and files.
64 # FIXME - %tagtext and %tagmap are in both @EXPORT and @EXPORT_OK.
65 # They should be in one or the other, but not both (though preferably,
66 # things shouldn't get exported in the first place).
72 #------------------------------------------------
77 # %tagtext maps MARC tags to descriptive names.
80 '001' => 'Control number',
81 '003' => 'Control number identifier',
82 '005' => 'Date and time of latest transaction',
83 '006' => 'Fixed-length data elements -- additional material characteristics',
84 '007' => 'Physical description fixed field',
85 '008' => 'Fixed length data elements',
87 '015' => 'National library CN',
90 '024' => 'Other standard ID',
91 '035' => 'System control number',
92 '037' => 'Source of acquisition',
93 '040' => 'Cataloging source',
94 '041' => 'Language code',
95 '043' => 'Geographic area code',
96 '043' => 'Publishing country code',
97 '050' => 'Library of Congress call number',
98 '055' => 'Canadian classification number',
99 '060' => 'National Library of Medicine call number',
100 '082' => 'Dewey decimal call number',
101 '100' => 'Main entry -- Personal name',
102 '110' => 'Main entry -- Corporate name',
103 '130' => 'Main entry -- Uniform title',
104 '240' => 'Uniform title',
105 '245' => 'Title statement',
106 '246' => 'Varying form of title',
107 '250' => 'Edition statement',
108 '256' => 'Computer file characteristics',
109 '260' => 'Publication, distribution, etc.',
110 '263' => 'Projected publication date',
111 '300' => 'Physical description',
112 '306' => 'Playing time',
113 '440' => 'Series statement / Added entry -- Title',
114 '490' => 'Series statement',
115 '500' => 'General note',
116 '504' => 'Bibliography, etc. note',
117 '505' => 'Formatted contents note',
118 '508' => 'Creation/production credits note',
119 '510' => 'Citation/references note',
120 '511' => 'Participant or performer note',
121 '520' => 'Summary, etc. note',
122 '521' => 'Target audience note (ie age)',
123 '530' => 'Additional physical form available note',
124 '538' => 'System details note',
125 '586' => 'Awards note',
126 '600' => 'Subject added entry -- Personal name',
127 '610' => 'Subject added entry -- Corporate name',
128 '650' => 'Subject added entry -- Topical term',
129 '651' => 'Subject added entry -- Geographic name',
130 '656' => 'Index term -- Occupation',
131 '700' => 'Added entry -- Personal name',
132 '710' => 'Added entry -- Corporate name',
133 '730' => 'Added entry -- Uniform title',
134 '740' => 'Added entry -- Uncontrolled related/analytical title',
135 '800' => 'Series added entry -- Personal name',
136 '830' => 'Series added entry -- Uniform title',
138 '856' => 'Electronic location and access',
141 # tag, subfield, field name, repeats, striptrailingchars
142 # FIXME - What is this? Can it be explained without a semester-long
145 # XXX - Maps MARC (field, subfield) tuples to Koha database field
146 # names (presumably in 'biblioitems'). $tagmap{$field}->{$subfield} is
147 # an anonymous hash of the form
149 # name => "title", # Name of Koha field
150 # rpt => 0, # I don't know what this is, but
152 # striptrail => ',:;/-', # Lists the set of characters that
153 # # should be stripped from the end
154 # # of the MARC field.
158 '010'=>{'a'=>{name=> 'lccn', rpt=>0, striptrail=>' ' }},
159 '015'=>{'a'=>{name=> 'lccn', rpt=>0 }},
160 '020'=>{'a'=>{name=> 'isbn', rpt=>0 }},
161 '022'=>{'a'=>{name=> 'issn', rpt=>0 }},
162 '082'=>{'a'=>{name=> 'dewey', rpt=>0 }},
163 '100'=>{'a'=>{name=> 'author', rpt=>0, striptrail=>',:;/-' }},
164 '245'=>{'a'=>{name=> 'title', rpt=>0, striptrail=>',:;/' },
165 'b'=>{name=> 'subtitle', rpt=>0, striptrail=>',:;/' }},
166 '260'=>{'a'=>{name=> 'place', rpt=>0, striptrail=>',:;/-' },
167 'b'=>{name=> 'publisher', rpt=>0, striptrail=>',:;/-' },
168 'c'=>{name=> 'year' , rpt=>0, striptrail=>'.,:;/-' }},
169 '300'=>{'a'=>{name=> 'pages', rpt=>0, striptrail=>',:;/-' },
170 'c'=>{name=> 'size', rpt=>0, striptrail=>',:;/-' }},
171 '362'=>{'a'=>{name=> 'volume-number', rpt=>0 }},
172 '440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
173 'v'=>{name=> 'volume-number',rpt=>0 }},
174 '490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
175 'v'=>{name=> 'volume-number',rpt=>0 }},
176 '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/' }},
177 '5xx'=>{'a'=>{name=> 'notes', rpt=>1 }},
178 '65x'=>{'a'=>{name=> 'subject', rpt=>1, striptrail=>'.,:;/-' }},
184 =item extractmarcfields
186 $biblioitem = &extractmarcfields($marc_record);
188 C<$marc_record> is a reference-to-array representing a MARC record;
189 each element is a reference-to-hash specifying a MARC field (possibly
192 C<&extractmarcfields> translates C<$marc_record> into a Koha
193 biblioitem. C<$biblioitem> is a reference-to-hash whose keys are named
194 after fields in the biblioitems table of the Koha database.
198 # FIXME - Throughout:
199 # $foo->{bar}->[baz]->{quux}
200 # can be rewritten as
201 # $foo->{bar}[baz]{quux}
202 sub extractmarcfields {
206 $record, # pointer to list of MARC field hashes.
207 # Example: $record->[0]->{'tag'} = '100' # Author
208 # $record->[0]->{'subfields'}->{'a'} = subfieldvalue
212 my $bib; # pointer to hash of named output fields
213 # Example: $bib->{'author'} = "Twain, Mark";
220 $subfield, # Marc subfield [a-z]
221 $fieldname, # name of field "author", "title", etc.
222 $strip, # chars to remove from end of field
223 $stripregex, # reg exp pattern
225 my ($lccn, $isbn, $issn,
226 $publicationyear, @subjects, $subject,
228 $notes, $additionalauthors, $illustrator, $copyrightdate,
229 $s, $subdivision, $subjectsubfield,
232 print "<PRE>\n" if $debug;
234 if ( ref($record) eq "ARRAY" ) {
235 foreach $field (@$record) {
237 # Check each subfield in field
238 # FIXME - Would this code be more readable with
239 # while (($subfieldname, $subfield) = each %{$field->{subfields}})
241 foreach $subfield ( keys %{$field->{subfields}} ) {
242 # see if it is defined in our Marc to koha mapping table
243 # FIXME - This if-clause takes up the entire loop.
244 # This would be better rewritten as
245 # next unless defined($tagmap{...});
246 # Then the body of the loop doesn't have to be
248 if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
249 # Yes, so keep the value
250 if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
251 # if it was an array, just keep first element.
252 $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
254 $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
256 print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
257 # see if this field should have trailing chars dropped
258 if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
259 # FIXME - The next three lines can be rewritten as:
260 # $bib =~ s/[\Q$strip\E]+$//;
261 $strip=~s//\\/; # backquote each char
262 $stripregex='[ ' . $strip . ']+$'; # remove trailing spaces also
263 $bib->{$fieldname}=~s/$stripregex//;
264 # also strip leading spaces
265 $bib->{$fieldname}=~s/^ +//;
267 print "Found subfield $field->{'tag'} $subfield " .
268 "$fieldname = $bib->{$fieldname}\n" if $debug;
273 # Handle special fields and tags
274 if ($field->{'tag'} eq '001') {
275 $bib->{controlnumber}=$field->{'indicator'};
277 if ($field->{'tag'} eq '015') {
278 # FIXME - I think this can be rewritten as
279 # $field->{"subfields"}{"a"} =~ /^\s*C?(\S+)/ and
280 # $bib->{"lccn"} = $1;
281 # This might break with invalid input, though.
282 $bib->{lccn}=$field->{'subfields'}->{'a'};
283 $bib->{lccn}=~s/^\s*//;
284 $bib->{lccn}=~s/^C//;
285 ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
289 # FIXME - Fix indentation
290 if ($field->{'tag'} eq '260') {
292 $publicationyear=$field->{'subfields'}->{'c'};
293 # FIXME - "\d\d\d\d" can be rewritten as "\d{4}"
294 if ($publicationyear=~/c(\d\d\d\d)/) {
297 if ($publicationyear=~/[^c](\d\d\d\d)/) {
299 } elsif ($copyrightdate) {
300 $publicationyear=$copyrightdate;
302 $publicationyear=~/(\d\d\d\d)/;
306 if ($field->{'tag'} eq '700') {
307 my $name=$field->{'subfields'}->{'a'};
308 if ( defined($field->{'subfields'}->{'e'})
309 and $field->{'subfields'}->{'e'}=~/ill/) {
312 $additionalauthors.="$name\n";
315 if ($field->{'tag'} =~/^5/) {
316 $notes.="$field->{'subfields'}->{'a'}\n";
318 if ($field->{'tag'} =~/65\d/) {
319 my $sub; # FIXME - Never used
320 my $subject=$field->{'subfields'}->{'a'};
322 print "Subject=$subject\n" if $debug;
323 foreach $subjectsubfield ( 'x','y','z' ) {
324 # FIXME - $subdivision is only used in this
325 # loop. Make it 'my' here, rather than in the
327 # Ditto $subjectsubfield. Make it 'my' in the
328 # 'foreach' statement.
329 if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
330 if ( ref($subdivision) eq 'ARRAY' ) {
331 foreach $s (@$subdivision) {
334 } # foreach subdivision
336 $subdivision=~s/\.$//;
337 $subject.=" -- $subdivision";
339 } # if subfield exists
341 print "Subject=$subject\n" if $debug;
342 push @subjects, $subject;
347 # FIXME - Why not do this up in the "Handle special fields and
349 ($publicationyear ) && ($bib->{publicationyear}=$publicationyear );
350 ($copyrightdate ) && ($bib->{copyrightdate}=$copyrightdate );
351 ($additionalauthors ) && ($bib->{additionalauthors}=$additionalauthors );
352 ($illustrator ) && ($bib->{illustrator}=$illustrator );
353 ($notes ) && ($bib->{notes}=$notes );
354 ($#subjects ) && ($bib->{subject}=\@subjects );
355 # FIXME - This doesn't look right: for an array with
356 # one element, $#subjects == 0, which is false. For an
357 # array with 0 elements, $#subjects == -1, which is
362 $bib->{dewey}=~s/\///g; # drop any slashes
363 # FIXME - Why? Don't the
364 # slashes mean something?
365 # The Dewey code is NOT a number,
370 ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
373 if ( $bib->{isbn} ) {
374 $bib->{isbn}=~s/[^\d]*//g; # drop non-digits
375 # FIXME - "[^\d]" can be rewritten as "\D"
376 # FIXME - Does this include the check digit? If so,
380 if ( $bib->{issn} ) {
381 $bib->{issn}=~s/^\s*//;
382 ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
385 if ( $bib->{'volume-number'} ) {
386 if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
390 $bib->{volume}=$bib->{'volume-number'};
392 delete $bib->{'volume-number'};
396 # FIXME - Style: this sort of error-checking should really go
397 # closer to the actual test, e.g.:
398 # if (ref($record) ne "ARRAY")
400 # die "Not an array!"
402 # then the rest of the code which follows can assume that the
403 # input is good, and you don't have to indent as much.
404 print "Error: extractmarcfields: input ref $record is " .
405 ref($record) . " not ARRAY. Contact sysadmin.\n";
407 print "</PRE>\n" if $debug;
411 } # sub extractmarcfields
412 #---------------------------------
414 #--------------------------
416 =item parsemarcfileformat
418 @records = &parsemarcfileformat($marc_data);
420 Parses the contents of a MARC file.
422 C<$marc_data> is a string, the contents of a MARC file.
423 C<&parsemarcfileformat> parses this string into individual MARC
424 records and returns them.
426 C<@records> is an array of references-to-hash. Each element is a MARC
427 record; its keys are the MARC tags.
431 # Parse MARC data in file format with control-character separators
432 # May be multiple records.
433 # FIXME - Is the input ever likely to be more than a few Kb? If so, it
434 # might be worth changing this function to take a (read-only)
435 # reference-to-string, to avoid unnecessary copying.
436 sub parsemarcfileformat {
438 # Input is one big text string
440 # Output is list of records. Each record is list of field hashes
443 my $splitchar=chr(29); # \c]
444 my $splitchar2=chr(30); # \c^
445 my $splitchar3=chr(31); # \c_
448 foreach $record (split(/$splitchar/, $data)) {
455 my $leader=substr($record,0,24);
456 print "<pre>parse Leader:$leader</pre>\n" if $debug;
459 'indicator' => $leader ,
462 $record=substr($record,24);
463 foreach $field (split(/$splitchar2/, $record)) {
467 unless ($directory) {
468 # If we didn't already find a directory, extract one.
475 while ($item=substr($directory,0,12)) {
476 # Pull out location of first field
477 $tag=substr($directory,0,3);
478 $length=substr($directory,3,4);
479 $start=substr($directory,7,6);
481 # Bump to next directory entry
482 $directory=substr($directory,12);
483 $tag{$counter2}=$tag;
489 $tag=$tag{$tagcounter};
492 my @subfields=split(/$splitchar3/, $field);
493 $indicator=$subfields[0];
494 $field{'indicator'}=$indicator;
495 print "<pre>parse indicator:$indicator</pre>\n" if $debug;
497 unless ($#subfields==0) {
501 for ($i=1; $i<=$#subfields; $i++) {
502 my $text=$subfields[$i];
503 my $subfieldcode=substr($text,0,1);
504 my $subfield=substr($text,1);
505 # if this subfield already exists, do array
506 if ($subfields{$subfieldcode}) {
507 my $subfieldlist=$subfields{$subfieldcode};
508 if ( ref($subfieldlist) eq 'ARRAY' ) {
509 # Already an array, add on to it
510 print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
511 @subfieldlist=@$subfieldlist;
512 push (@subfieldlist, $subfield);
514 # Change simple value to array
515 print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
516 @subfieldlist=($subfields{$subfieldcode}, $subfield);
519 $subfields{$subfieldcode}=\@subfieldlist;
521 # subfield doesn't exist yet, keep simple value
522 $subfields{$subfieldcode}=$subfield;
525 $field{'subfields'}=\%subfields;
527 push (@record, \%field);
528 } # foreach field in record
529 push (@records, \@record);
532 print "</pre>" if $debug;
534 } # sub parsemarcfileformat
536 #----------------------------------------------
540 $label = &taglabel($tag);
542 Converts a MARC tag (a three-digit number, or "LDR") and returns a
545 Note that although the tag looks like a number, it is treated here as
546 a string. Be sure to use
548 $label = &taglabel("082");
552 $label = &taglabel(082); # <-- Invalid octal number!
556 # FIXME - Does this function mean that %tagtext doesn't need to be
561 return $tagtext{$tag};
567 #---------------------------------------------
569 # Revision 1.7 2002/10/13 08:30:38 arensb
570 # Deleted unused variables.
571 # Removed trailing whitespace.
573 # Revision 1.6 2002/10/10 04:44:28 arensb
574 # Added whitespace to make the POD work.
576 # Revision 1.5 2002/10/07 00:51:22 arensb
577 # Added POD and some comments.
579 # Revision 1.4 2002/10/05 09:53:11 arensb
580 # Merged with arensb-context branch: use C4::Context->dbh instead of
581 # &C4Connect, and generally prefer C4::Context over C4::Database.
583 # Revision 1.3.2.1 2002/10/04 02:57:38 arensb
584 # Removed useless "use C4::Database;" line.
586 # Revision 1.3 2002/08/14 18:12:52 tonnesen
587 # Added copyright statement to all .pl and .pm files
589 # Revision 1.2 2002/07/02 20:30:15 tonnesen
590 # Merged SimpleMarc.pm over from rel-1-2
592 # Revision 1.1.2.4 2002/06/28 14:36:47 amillar
593 # Fix broken logic on illustrator vs. add'l author
595 # Revision 1.1.2.3 2002/06/26 20:54:32 tonnesen
596 # use warnings breaks on perl 5.005...
598 # Revision 1.1.2.2 2002/06/26 15:52:55 amillar
599 # Fix display of marc tag labels and indicators
601 # Revision 1.1.2.1 2002/06/26 07:27:35 amillar
602 # Moved acqui.simple MARC handling to new module SimpleMarc.pm
610 Koha Developement team <info@koha.org>