C4/SimpleMarc.pm

   1 #!/usr/bin/perl
   2
   3 # $Id$
   4
   5 package C4::SimpleMarc;
   6
   7 # Routines for handling import of MARC data into Koha db
   8
   9 # Koha library project  www.koha.org
  10
  11 # Licensed under the GPL
  12
  13
  14 # Copyright 2000-2002 Katipo Communications
  15 #
  16 # This file is part of Koha.
  17 #
  18 # Koha is free software; you can redistribute it and/or modify it under the
  19 # terms of the GNU General Public License as published by the Free Software
  20 # Foundation; either version 2 of the License, or (at your option) any later
  21 # version.
  22 #
  23 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  24 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  25 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  26 #
  27 # You should have received a copy of the GNU General Public License along with
  28 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  29 # Suite 330, Boston, MA  02111-1307 USA
  30
  31 use strict;
  32
  33 # standard or CPAN modules used
  34 use DBI;
  35
  36 # Koha modules used
  37
  38 require Exporter;
  39
  40 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  41
  42 # set the version for version checking
  43 $VERSION = 0.01;
  44
  45 @ISA = qw(Exporter);
  46 @EXPORT = qw(
  47         &extractmarcfields
  48         &parsemarcfileformat
  49         &taglabel
  50         %tagtext
  51         %tagmap
  52 );
  53 %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
  54
  55 # your exported package globals go here,
  56 # as well as any optionally exported functions
  57
  58 @EXPORT_OK   = qw(
  59         %tagtext
  60         %tagmap
  61 );
  62
  63 # non-exported package globals go here
  64 use vars qw(@more $stuff);
  65
  66 # initalize package globals, first exported ones
  67
  68 my $Var1   = '';
  69 my %Hashit = ();
  70
  71 # then the others (which are still accessible as $Some::Module::stuff)
  72 my $stuff  = '';
  73 my @more   = ();
  74
  75 # all file-scoped lexicals must be created before
  76 # the functions below that use them.
  77
  78 # file-private lexicals go here
  79 my $priv_var    = '';
  80 my %secret_hash = ();
  81
  82 # here's a file-private function as a closure,
  83 # callable as &$priv_func;  it cannot be prototyped.
  84 my $priv_func = sub {
  85   # stuff goes here.
  86   };
  87
  88 # make all your functions, whether exported or not;
  89 #------------------------------------------------
  90
  91 #------------------
  92 # Constants
  93
  94 my %tagtext = (
  95     'LDR' => 'Leader',
  96     '001' => 'Control number',
  97     '003' => 'Control number identifier',
  98     '005' => 'Date and time of latest transaction',
  99     '006' => 'Fixed-length data elements -- additional material characteristics',
 100     '007' => 'Physical description fixed field',
 101     '008' => 'Fixed length data elements',
 102     '010' => 'LCCN',
 103     '015' => 'National library CN',
 104     '020' => 'ISBN',
 105     '022' => 'ISSN',
 106     '024' => 'Other standard ID',
 107     '035' => 'System control number',
 108     '037' => 'Source of acquisition',
 109     '040' => 'Cataloging source',
 110     '041' => 'Language code',
 111     '043' => 'Geographic area code',
 112     '043' => 'Publishing country code',
 113     '050' => 'Library of Congress call number',
 114     '055' => 'Canadian classification number',
 115     '060' => 'National Library of Medicine call number',
 116     '082' => 'Dewey decimal call number',
 117     '100' => 'Main entry -- Personal name',
 118     '110' => 'Main entry -- Corporate name',
 119     '130' => 'Main entry -- Uniform title',
 120     '240' => 'Uniform title',
 121     '245' => 'Title statement',
 122     '246' => 'Varying form of title',
 123     '250' => 'Edition statement',
 124     '256' => 'Computer file characteristics',
 125     '260' => 'Publication, distribution, etc.',
 126     '263' => 'Projected publication date',
 127     '300' => 'Physical description',
 128     '306' => 'Playing time',
 129     '440' => 'Series statement / Added entry -- Title',
 130     '490' => 'Series statement',
 131     '500' => 'General note',
 132     '504' => 'Bibliography, etc. note',
 133     '505' => 'Formatted contents note',
 134     '508' => 'Creation/production credits note',
 135     '510' => 'Citation/references note',
 136     '511' => 'Participant or performer note',
 137     '520' => 'Summary, etc. note',
 138     '521' => 'Target audience note (ie age)',
 139     '530' => 'Additional physical form available note',
 140     '538' => 'System details note',
 141     '586' => 'Awards note',
 142     '600' => 'Subject added entry -- Personal name',
 143     '610' => 'Subject added entry -- Corporate name',
 144     '650' => 'Subject added entry -- Topical term',
 145     '651' => 'Subject added entry -- Geographic name',
 146     '656' => 'Index term -- Occupation',
 147     '700' => 'Added entry -- Personal name',
 148     '710' => 'Added entry -- Corporate name',
 149     '730' => 'Added entry -- Uniform title',
 150     '740' => 'Added entry -- Uncontrolled related/analytical title',
 151     '800' => 'Series added entry -- Personal name',
 152     '830' => 'Series added entry -- Uniform title',
 153     '852' => 'Location',
 154     '856' => 'Electronic location and access',
 155 );
 156
 157 # tag, subfield, field name, repeats, striptrailingchars
 158 my %tagmap=(
 159     '010'=>{'a'=>{name=> 'lccn',        rpt=>0, striptrail=>' '         }},
 160     '015'=>{'a'=>{name=> 'lccn',        rpt=>0  }},
 161     '020'=>{'a'=>{name=> 'isbn',        rpt=>0  }},
 162     '022'=>{'a'=>{name=> 'issn',        rpt=>0  }},
 163     '082'=>{'a'=>{name=> 'dewey',       rpt=>0  }},
 164     '100'=>{'a'=>{name=> 'author',      rpt=>0, striptrail=>',:;/-'     }},
 165     '245'=>{'a'=>{name=> 'title',       rpt=>0, striptrail=>',:;/'      },
 166             'b'=>{name=> 'subtitle',    rpt=>0, striptrail=>',:;/'      }},
 167     '260'=>{'a'=>{name=> 'place',       rpt=>0, striptrail=>',:;/-'     },
 168             'b'=>{name=> 'publisher',   rpt=>0, striptrail=>',:;/-'     },
 169             'c'=>{name=> 'year' ,       rpt=>0, striptrail=>'.,:;/-'    }},
 170     '300'=>{'a'=>{name=> 'pages',       rpt=>0, striptrail=>',:;/-'     },
 171             'c'=>{name=> 'size',        rpt=>0, striptrail=>',:;/-'     }},
 172     '362'=>{'a'=>{name=> 'volume-number',       rpt=>0  }},
 173     '440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/'      },
 174             'v'=>{name=> 'volume-number',rpt=>0 }},
 175     '490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/'      },
 176             'v'=>{name=> 'volume-number',rpt=>0 }},
 177     '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/'    }},
 178     '5xx'=>{'a'=>{name=> 'notes',       rpt=>1  }},
 179     '65x'=>{'a'=>{name=> 'subject',     rpt=>1, striptrail=>'.,:;/-'    }},
 180 );
 181
 182
 183 #------------------
 184 sub extractmarcfields {
 185     use strict;
 186     # input
 187     my (
 188         $record,        # pointer to list of MARC field hashes.
 189                         # Example: $record->[0]->{'tag'} = '100' # Author
 190                         #       $record->[0]->{'subfields'}->{'a'} = subfieldvalue
 191     )=@_;
 192
 193     # return
 194     my $bib;            # pointer to hash of named output fields
 195                         # Example: $bib->{'author'} = "Twain, Mark";
 196
 197     my $debug=0;
 198
 199     my (
 200         $field,         # hash ref
 201         $value,
 202         $subfield,      # Marc subfield [a-z]
 203         $fieldname,     # name of field "author", "title", etc.
 204         $strip,         # chars to remove from end of field
 205         $stripregex,    # reg exp pattern
 206     );
 207     my ($lccn, $isbn, $issn,
 208         $publicationyear, @subjects, $subject,
 209         $controlnumber,
 210         $notes, $additionalauthors, $illustrator, $copyrightdate,
 211         $s, $subdivision, $subjectsubfield,
 212     );
 213
 214     print "<PRE>\n" if $debug;
 215
 216     if ( ref($record) eq "ARRAY" ) {
 217         foreach $field (@$record) {
 218
 219             # Check each subfield in field
 220             foreach $subfield ( keys %{$field->{subfields}} ) {
 221                 # see if it is defined in our Marc to koha mapping table
 222                 if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
 223                     # Yes, so keep the value
 224                     if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
 225                         # if it was an array, just keep first element.
 226                         $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
 227                     } else {
 228                         $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
 229                     } # if array
 230                     print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
 231                     # see if this field should have trailing chars dropped
 232                     if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
 233                         $strip=~s//\\/; # backquote each char
 234                         $stripregex='[ ' . $strip . ']+$';  # remove trailing spaces also
 235                         $bib->{$fieldname}=~s/$stripregex//;
 236                         # also strip leading spaces
 237                         $bib->{$fieldname}=~s/^ +//;
 238                     } # if strip
 239                     print "Found subfield $field->{'tag'} $subfield " .
 240                         "$fieldname = $bib->{$fieldname}\n" if $debug;
 241                 } # if tagmap exists
 242
 243             } # foreach subfield
 244
 245
 246             if ($field->{'tag'} eq '001') {
 247                 $bib->{controlnumber}=$field->{'indicator'};
 248             }
 249             if ($field->{'tag'} eq '015') {
 250                 $bib->{lccn}=$field->{'subfields'}->{'a'};
 251                 $bib->{lccn}=~s/^\s*//;
 252                 $bib->{lccn}=~s/^C//;
 253                 ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
 254             }
 255
 256
 257                 if ($field->{'tag'} eq '260') {
 258
 259                     $publicationyear=$field->{'subfields'}->{'c'};
 260                     if ($publicationyear=~/c(\d\d\d\d)/) {
 261                         $copyrightdate=$1;
 262                     }
 263                     if ($publicationyear=~/[^c](\d\d\d\d)/) {
 264                         $publicationyear=$1;
 265                     } elsif ($copyrightdate) {
 266                         $publicationyear=$copyrightdate;
 267                     } else {
 268                         $publicationyear=~/(\d\d\d\d)/;
 269                         $publicationyear=$1;
 270                     }
 271                 }
 272                 if ($field->{'tag'} eq '700') {
 273                     my $name=$field->{'subfields'}->{'a'};
 274                     if ( defined($field->{'subfields'}->{'e'})
 275                         and  $field->{'subfields'}->{'e'}=~/ill/) {
 276                         $illustrator=$name;
 277                     } else {
 278                         $additionalauthors.="$name\n";
 279                     }
 280                 }
 281                 if ($field->{'tag'} =~/^5/) {
 282                     $notes.="$field->{'subfields'}->{'a'}\n";
 283                 }
 284                 if ($field->{'tag'} =~/65\d/) {
 285                     my $sub;
 286                     my $subject=$field->{'subfields'}->{'a'};
 287                     $subject=~s/\.$//;
 288                     print "Subject=$subject\n" if $debug;
 289                     foreach $subjectsubfield ( 'x','y','z' ) {
 290                       if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
 291                         if ( ref($subdivision) eq 'ARRAY' ) {
 292                             foreach $s (@$subdivision) {
 293                                 $s=~s/\.$//;
 294                                 $subject.=" -- $s";
 295                             } # foreach subdivision
 296                         } else {
 297                             $subdivision=~s/\.$//;
 298                             $subject.=" -- $subdivision";
 299                         } # if array
 300                       } # if subfield exists
 301                     } # foreach subfield
 302                     print "Subject=$subject\n" if $debug;
 303                     push @subjects, $subject;
 304                 } # if tag 65x
 305
 306
 307         } # foreach field
 308         ($publicationyear       ) && ($bib->{publicationyear}=$publicationyear  );
 309         ($copyrightdate         ) && ($bib->{copyrightdate}=$copyrightdate  );
 310         ($additionalauthors     ) && ($bib->{additionalauthors}=$additionalauthors  );
 311         ($illustrator           ) && ($bib->{illustrator}=$illustrator  );
 312         ($notes                 ) && ($bib->{notes}=$notes  );
 313         ($#subjects             ) && ($bib->{subject}=\@subjects  );
 314
 315         # Misc cleanup
 316         if ($bib->{dewey}) {
 317             $bib->{dewey}=~s/\///g;     # drop any slashes
 318         }
 319
 320         if ($bib->{lccn}) {
 321            ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
 322         }
 323
 324         if ( $bib->{isbn} ) {
 325             $bib->{isbn}=~s/[^\d]*//g;  # drop non-digits
 326         };
 327
 328         if ( $bib->{issn} ) {
 329             $bib->{issn}=~s/^\s*//;
 330             ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
 331         };
 332
 333         if ( $bib->{'volume-number'} ) {
 334             if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
 335                 $bib->{'volume'}=$1;
 336                 $bib->{'number'}=$2;
 337             } else {
 338                 $bib->{volume}=$bib->{'volume-number'};
 339             }
 340             delete $bib->{'volume-number'};
 341         } # if volume-number
 342
 343     } else {
 344         print "Error: extractmarcfields: input ref $record is " .
 345                 ref($record) . " not ARRAY. Contact sysadmin.\n";
 346     }
 347     print "</PRE>\n" if $debug;
 348
 349     return $bib;
 350
 351 } # sub extractmarcfields
 352 #---------------------------------
 353
 354 #--------------------------
 355 # Parse MARC data in file format with control-character separators
 356 #   May be multiple records.
 357 sub parsemarcfileformat {
 358     use strict;
 359     # Input is one big text string
 360     my $data=shift;
 361     # Output is list of records.  Each record is list of field hashes
 362     my @records;
 363
 364     my $splitchar=chr(29);
 365     my $splitchar2=chr(30);
 366     my $splitchar3=chr(31);
 367     my $debug=0;
 368     my $record;
 369     foreach $record (split(/$splitchar/, $data)) {
 370         my @record;
 371         my $directory=0;
 372         my $tagcounter=0;
 373         my %tag;
 374         my $field;
 375
 376         my $leader=substr($record,0,24);
 377         print "<pre>parse Leader:$leader</pre>\n" if $debug;
 378         push (@record, {
 379                 'tag' => 'LDR',
 380                 'indicator' => $leader ,
 381         } );
 382
 383         $record=substr($record,24);
 384         foreach $field (split(/$splitchar2/, $record)) {
 385             my %field;
 386             my $tag;
 387             my $indicator;
 388             unless ($directory) {
 389                 # If we didn't already find a directory, extract one.
 390                 $directory=$field;
 391                 my $itemcounter=1;
 392                 my $counter2=0;
 393                 my $item;
 394                 my $length;
 395                 my $start;
 396                 while ($item=substr($directory,0,12)) {
 397                     # Pull out location of first field
 398                     $tag=substr($directory,0,3);
 399                     $length=substr($directory,3,4);
 400                     $start=substr($directory,7,6);
 401
 402                     # Bump to next directory entry
 403                     $directory=substr($directory,12);
 404                     $tag{$counter2}=$tag;
 405                     $counter2++;
 406                 }
 407                 $directory=1;
 408                 next;
 409             }
 410             $tag=$tag{$tagcounter};
 411             $tagcounter++;
 412             $field{'tag'}=$tag;
 413             my @subfields=split(/$splitchar3/, $field);
 414             $indicator=$subfields[0];
 415             $field{'indicator'}=$indicator;
 416             print "<pre>parse indicator:$indicator</pre>\n" if $debug;
 417             my $firstline=1;
 418             unless ($#subfields==0) {
 419                 my %subfields;
 420                 my @subfieldlist;
 421                 my $i;
 422                 for ($i=1; $i<=$#subfields; $i++) {
 423                     my $text=$subfields[$i];
 424                     my $subfieldcode=substr($text,0,1);
 425                     my $subfield=substr($text,1);
 426                     # if this subfield already exists, do array
 427                     if ($subfields{$subfieldcode}) {
 428                         my $subfieldlist=$subfields{$subfieldcode};
 429                         if ( ref($subfieldlist) eq 'ARRAY' ) {
 430                             # Already an array, add on to it
 431                             print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
 432                             @subfieldlist=@$subfieldlist;
 433                             push (@subfieldlist, $subfield);
 434                         } else {
 435                             # Change simple value to array
 436                             print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
 437                             @subfieldlist=($subfields{$subfieldcode}, $subfield);
 438                         }
 439                         # keep new array
 440                         $subfields{$subfieldcode}=\@subfieldlist;
 441                     } else {
 442                         # subfield doesn't exist yet, keep simple value
 443                         $subfields{$subfieldcode}=$subfield;
 444                     }
 445                 }
 446                 $field{'subfields'}=\%subfields;
 447             }
 448             push (@record, \%field);
 449         } # foreach field in record
 450         push (@records, \@record);
 451         # $counter++;
 452     }
 453     print "</pre>" if $debug;
 454     return @records;
 455 } # sub parsemarcfileformat
 456
 457 #----------------------------------------------
 458 sub taglabel {
 459     my ($tag)=@_;
 460
 461     return $tagtext{$tag};
 462
 463 } # sub taglabel
 464
 465 #---------------------------------------------
 466 # $Log$
 467 # Revision 1.4  2002/10/05 09:53:11  arensb
 468 # Merged with arensb-context branch: use C4::Context->dbh instead of
 469 # &C4Connect, and generally prefer C4::Context over C4::Database.
 470 #
 471 # Revision 1.3.2.1  2002/10/04 02:57:38  arensb
 472 # Removed useless "use C4::Database;" line.
 473 #
 474 # Revision 1.3  2002/08/14 18:12:52  tonnesen
 475 # Added copyright statement to all .pl and .pm files
 476 #
 477 # Revision 1.2  2002/07/02 20:30:15  tonnesen
 478 # Merged SimpleMarc.pm over from rel-1-2
 479 #
 480 # Revision 1.1.2.4  2002/06/28 14:36:47  amillar
 481 # Fix broken logic on illustrator vs. add'l author
 482 #
 483 # Revision 1.1.2.3  2002/06/26 20:54:32  tonnesen
 484 # use warnings breaks on perl 5.005...
 485 #
 486 # Revision 1.1.2.2  2002/06/26 15:52:55  amillar
 487 # Fix display of marc tag labels and indicators
 488 #
 489 # Revision 1.1.2.1  2002/06/26 07:27:35  amillar
 490 # Moved acqui.simple MARC handling to new module SimpleMarc.pm
 491 #