C4/SimpleMarc.pm

   1 #!/usr/bin/perl
   2
   3 # $Id$
   4
   5 package C4::SimpleMarc;
   6
   7 # Routines for handling import of MARC data into Koha db
   8
   9 # Koha library project  www.koha.org
  10
  11 # Licensed under the GPL
  12
  13
  14 # Copyright 2000-2002 Katipo Communications
  15 #
  16 # This file is part of Koha.
  17 #
  18 # Koha is free software; you can redistribute it and/or modify it under the
  19 # terms of the GNU General Public License as published by the Free Software
  20 # Foundation; either version 2 of the License, or (at your option) any later
  21 # version.
  22 #
  23 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  24 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  25 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  26 #
  27 # You should have received a copy of the GNU General Public License along with
  28 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  29 # Suite 330, Boston, MA  02111-1307 USA
  30
  31 use strict;
  32
  33 # standard or CPAN modules used
  34 use DBI;
  35
  36 # Koha modules used
  37 use C4::Database;
  38
  39 require Exporter;
  40
  41 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  42
  43 # set the version for version checking
  44 $VERSION = 0.01;
  45
  46 @ISA = qw(Exporter);
  47 @EXPORT = qw(
  48         &extractmarcfields
  49         &parsemarcfileformat
  50         &taglabel
  51         %tagtext
  52         %tagmap
  53 );
  54 %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
  55
  56 # your exported package globals go here,
  57 # as well as any optionally exported functions
  58
  59 @EXPORT_OK   = qw(
  60         %tagtext
  61         %tagmap
  62 );
  63
  64 # non-exported package globals go here
  65 use vars qw(@more $stuff);
  66
  67 # initalize package globals, first exported ones
  68
  69 my $Var1   = '';
  70 my %Hashit = ();
  71
  72 # then the others (which are still accessible as $Some::Module::stuff)
  73 my $stuff  = '';
  74 my @more   = ();
  75
  76 # all file-scoped lexicals must be created before
  77 # the functions below that use them.
  78
  79 # file-private lexicals go here
  80 my $priv_var    = '';
  81 my %secret_hash = ();
  82
  83 # here's a file-private function as a closure,
  84 # callable as &$priv_func;  it cannot be prototyped.
  85 my $priv_func = sub {
  86   # stuff goes here.
  87   };
  88
  89 # make all your functions, whether exported or not;
  90 #------------------------------------------------
  91
  92 #------------------
  93 # Constants
  94
  95 my %tagtext = (
  96     'LDR' => 'Leader',
  97     '001' => 'Control number',
  98     '003' => 'Control number identifier',
  99     '005' => 'Date and time of latest transaction',
 100     '006' => 'Fixed-length data elements -- additional material characteristics',
 101     '007' => 'Physical description fixed field',
 102     '008' => 'Fixed length data elements',
 103     '010' => 'LCCN',
 104     '015' => 'National library CN',
 105     '020' => 'ISBN',
 106     '022' => 'ISSN',
 107     '024' => 'Other standard ID',
 108     '035' => 'System control number',
 109     '037' => 'Source of acquisition',
 110     '040' => 'Cataloging source',
 111     '041' => 'Language code',
 112     '043' => 'Geographic area code',
 113     '043' => 'Publishing country code',
 114     '050' => 'Library of Congress call number',
 115     '055' => 'Canadian classification number',
 116     '060' => 'National Library of Medicine call number',
 117     '082' => 'Dewey decimal call number',
 118     '100' => 'Main entry -- Personal name',
 119     '110' => 'Main entry -- Corporate name',
 120     '130' => 'Main entry -- Uniform title',
 121     '240' => 'Uniform title',
 122     '245' => 'Title statement',
 123     '246' => 'Varying form of title',
 124     '250' => 'Edition statement',
 125     '256' => 'Computer file characteristics',
 126     '260' => 'Publication, distribution, etc.',
 127     '263' => 'Projected publication date',
 128     '300' => 'Physical description',
 129     '306' => 'Playing time',
 130     '440' => 'Series statement / Added entry -- Title',
 131     '490' => 'Series statement',
 132     '500' => 'General note',
 133     '504' => 'Bibliography, etc. note',
 134     '505' => 'Formatted contents note',
 135     '508' => 'Creation/production credits note',
 136     '510' => 'Citation/references note',
 137     '511' => 'Participant or performer note',
 138     '520' => 'Summary, etc. note',
 139     '521' => 'Target audience note (ie age)',
 140     '530' => 'Additional physical form available note',
 141     '538' => 'System details note',
 142     '586' => 'Awards note',
 143     '600' => 'Subject added entry -- Personal name',
 144     '610' => 'Subject added entry -- Corporate name',
 145     '650' => 'Subject added entry -- Topical term',
 146     '651' => 'Subject added entry -- Geographic name',
 147     '656' => 'Index term -- Occupation',
 148     '700' => 'Added entry -- Personal name',
 149     '710' => 'Added entry -- Corporate name',
 150     '730' => 'Added entry -- Uniform title',
 151     '740' => 'Added entry -- Uncontrolled related/analytical title',
 152     '800' => 'Series added entry -- Personal name',
 153     '830' => 'Series added entry -- Uniform title',
 154     '852' => 'Location',
 155     '856' => 'Electronic location and access',
 156 );
 157
 158 # tag, subfield, field name, repeats, striptrailingchars
 159 my %tagmap=(
 160     '010'=>{'a'=>{name=> 'lccn',        rpt=>0, striptrail=>' '         }},
 161     '015'=>{'a'=>{name=> 'lccn',        rpt=>0  }},
 162     '020'=>{'a'=>{name=> 'isbn',        rpt=>0  }},
 163     '022'=>{'a'=>{name=> 'issn',        rpt=>0  }},
 164     '082'=>{'a'=>{name=> 'dewey',       rpt=>0  }},
 165     '100'=>{'a'=>{name=> 'author',      rpt=>0, striptrail=>',:;/-'     }},
 166     '245'=>{'a'=>{name=> 'title',       rpt=>0, striptrail=>',:;/'      },
 167             'b'=>{name=> 'subtitle',    rpt=>0, striptrail=>',:;/'      }},
 168     '260'=>{'a'=>{name=> 'place',       rpt=>0, striptrail=>',:;/-'     },
 169             'b'=>{name=> 'publisher',   rpt=>0, striptrail=>',:;/-'     },
 170             'c'=>{name=> 'year' ,       rpt=>0, striptrail=>'.,:;/-'    }},
 171     '300'=>{'a'=>{name=> 'pages',       rpt=>0, striptrail=>',:;/-'     },
 172             'c'=>{name=> 'size',        rpt=>0, striptrail=>',:;/-'     }},
 173     '362'=>{'a'=>{name=> 'volume-number',       rpt=>0  }},
 174     '440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/'      },
 175             'v'=>{name=> 'volume-number',rpt=>0 }},
 176     '490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/'      },
 177             'v'=>{name=> 'volume-number',rpt=>0 }},
 178     '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/'    }},
 179     '5xx'=>{'a'=>{name=> 'notes',       rpt=>1  }},
 180     '65x'=>{'a'=>{name=> 'subject',     rpt=>1, striptrail=>'.,:;/-'    }},
 181 );
 182
 183
 184 #------------------
 185 sub extractmarcfields {
 186     use strict;
 187     # input
 188     my (
 189         $record,        # pointer to list of MARC field hashes.
 190                         # Example: $record->[0]->{'tag'} = '100' # Author
 191                         #       $record->[0]->{'subfields'}->{'a'} = subfieldvalue
 192     )=@_;
 193
 194     # return
 195     my $bib;            # pointer to hash of named output fields
 196                         # Example: $bib->{'author'} = "Twain, Mark";
 197
 198     my $debug=0;
 199
 200     my (
 201         $field,         # hash ref
 202         $value,
 203         $subfield,      # Marc subfield [a-z]
 204         $fieldname,     # name of field "author", "title", etc.
 205         $strip,         # chars to remove from end of field
 206         $stripregex,    # reg exp pattern
 207     );
 208     my ($lccn, $isbn, $issn,
 209         $publicationyear, @subjects, $subject,
 210         $controlnumber,
 211         $notes, $additionalauthors, $illustrator, $copyrightdate,
 212         $s, $subdivision, $subjectsubfield,
 213     );
 214
 215     print "<PRE>\n" if $debug;
 216
 217     if ( ref($record) eq "ARRAY" ) {
 218         foreach $field (@$record) {
 219
 220             # Check each subfield in field
 221             foreach $subfield ( keys %{$field->{subfields}} ) {
 222                 # see if it is defined in our Marc to koha mapping table
 223                 if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
 224                     # Yes, so keep the value
 225                     if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
 226                         # if it was an array, just keep first element.
 227                         $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
 228                     } else {
 229                         $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
 230                     } # if array
 231                     print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
 232                     # see if this field should have trailing chars dropped
 233                     if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
 234                         $strip=~s//\\/; # backquote each char
 235                         $stripregex='[ ' . $strip . ']+$';  # remove trailing spaces also
 236                         $bib->{$fieldname}=~s/$stripregex//;
 237                         # also strip leading spaces
 238                         $bib->{$fieldname}=~s/^ +//;
 239                     } # if strip
 240                     print "Found subfield $field->{'tag'} $subfield " .
 241                         "$fieldname = $bib->{$fieldname}\n" if $debug;
 242                 } # if tagmap exists
 243
 244             } # foreach subfield
 245
 246
 247             if ($field->{'tag'} eq '001') {
 248                 $bib->{controlnumber}=$field->{'indicator'};
 249             }
 250             if ($field->{'tag'} eq '015') {
 251                 $bib->{lccn}=$field->{'subfields'}->{'a'};
 252                 $bib->{lccn}=~s/^\s*//;
 253                 $bib->{lccn}=~s/^C//;
 254                 ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
 255             }
 256
 257
 258                 if ($field->{'tag'} eq '260') {
 259
 260                     $publicationyear=$field->{'subfields'}->{'c'};
 261                     if ($publicationyear=~/c(\d\d\d\d)/) {
 262                         $copyrightdate=$1;
 263                     }
 264                     if ($publicationyear=~/[^c](\d\d\d\d)/) {
 265                         $publicationyear=$1;
 266                     } elsif ($copyrightdate) {
 267                         $publicationyear=$copyrightdate;
 268                     } else {
 269                         $publicationyear=~/(\d\d\d\d)/;
 270                         $publicationyear=$1;
 271                     }
 272                 }
 273                 if ($field->{'tag'} eq '700') {
 274                     my $name=$field->{'subfields'}->{'a'};
 275                     if ( defined($field->{'subfields'}->{'e'})
 276                         and  $field->{'subfields'}->{'e'}=~/ill/) {
 277                         $illustrator=$name;
 278                     } else {
 279                         $additionalauthors.="$name\n";
 280                     }
 281                 }
 282                 if ($field->{'tag'} =~/^5/) {
 283                     $notes.="$field->{'subfields'}->{'a'}\n";
 284                 }
 285                 if ($field->{'tag'} =~/65\d/) {
 286                     my $sub;
 287                     my $subject=$field->{'subfields'}->{'a'};
 288                     $subject=~s/\.$//;
 289                     print "Subject=$subject\n" if $debug;
 290                     foreach $subjectsubfield ( 'x','y','z' ) {
 291                       if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
 292                         if ( ref($subdivision) eq 'ARRAY' ) {
 293                             foreach $s (@$subdivision) {
 294                                 $s=~s/\.$//;
 295                                 $subject.=" -- $s";
 296                             } # foreach subdivision
 297                         } else {
 298                             $subdivision=~s/\.$//;
 299                             $subject.=" -- $subdivision";
 300                         } # if array
 301                       } # if subfield exists
 302                     } # foreach subfield
 303                     print "Subject=$subject\n" if $debug;
 304                     push @subjects, $subject;
 305                 } # if tag 65x
 306
 307
 308         } # foreach field
 309         ($publicationyear       ) && ($bib->{publicationyear}=$publicationyear  );
 310         ($copyrightdate         ) && ($bib->{copyrightdate}=$copyrightdate  );
 311         ($additionalauthors     ) && ($bib->{additionalauthors}=$additionalauthors  );
 312         ($illustrator           ) && ($bib->{illustrator}=$illustrator  );
 313         ($notes                 ) && ($bib->{notes}=$notes  );
 314         ($#subjects             ) && ($bib->{subject}=\@subjects  );
 315
 316         # Misc cleanup
 317         if ($bib->{dewey}) {
 318             $bib->{dewey}=~s/\///g;     # drop any slashes
 319         }
 320
 321         if ($bib->{lccn}) {
 322            ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
 323         }
 324
 325         if ( $bib->{isbn} ) {
 326             $bib->{isbn}=~s/[^\d]*//g;  # drop non-digits
 327         };
 328
 329         if ( $bib->{issn} ) {
 330             $bib->{issn}=~s/^\s*//;
 331             ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
 332         };
 333
 334         if ( $bib->{'volume-number'} ) {
 335             if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
 336                 $bib->{'volume'}=$1;
 337                 $bib->{'number'}=$2;
 338             } else {
 339                 $bib->{volume}=$bib->{'volume-number'};
 340             }
 341             delete $bib->{'volume-number'};
 342         } # if volume-number
 343
 344     } else {
 345         print "Error: extractmarcfields: input ref $record is " .
 346                 ref($record) . " not ARRAY. Contact sysadmin.\n";
 347     }
 348     print "</PRE>\n" if $debug;
 349
 350     return $bib;
 351
 352 } # sub extractmarcfields
 353 #---------------------------------
 354
 355 #--------------------------
 356 # Parse MARC data in file format with control-character separators
 357 #   May be multiple records.
 358 sub parsemarcfileformat {
 359     use strict;
 360     # Input is one big text string
 361     my $data=shift;
 362     # Output is list of records.  Each record is list of field hashes
 363     my @records;
 364
 365     my $splitchar=chr(29);
 366     my $splitchar2=chr(30);
 367     my $splitchar3=chr(31);
 368     my $debug=0;
 369     my $record;
 370     foreach $record (split(/$splitchar/, $data)) {
 371         my @record;
 372         my $directory=0;
 373         my $tagcounter=0;
 374         my %tag;
 375         my $field;
 376
 377         my $leader=substr($record,0,24);
 378         print "<pre>parse Leader:$leader</pre>\n" if $debug;
 379         push (@record, {
 380                 'tag' => 'LDR',
 381                 'indicator' => $leader ,
 382         } );
 383
 384         $record=substr($record,24);
 385         foreach $field (split(/$splitchar2/, $record)) {
 386             my %field;
 387             my $tag;
 388             my $indicator;
 389             unless ($directory) {
 390                 # If we didn't already find a directory, extract one.
 391                 $directory=$field;
 392                 my $itemcounter=1;
 393                 my $counter2=0;
 394                 my $item;
 395                 my $length;
 396                 my $start;
 397                 while ($item=substr($directory,0,12)) {
 398                     # Pull out location of first field
 399                     $tag=substr($directory,0,3);
 400                     $length=substr($directory,3,4);
 401                     $start=substr($directory,7,6);
 402
 403                     # Bump to next directory entry
 404                     $directory=substr($directory,12);
 405                     $tag{$counter2}=$tag;
 406                     $counter2++;
 407                 }
 408                 $directory=1;
 409                 next;
 410             }
 411             $tag=$tag{$tagcounter};
 412             $tagcounter++;
 413             $field{'tag'}=$tag;
 414             my @subfields=split(/$splitchar3/, $field);
 415             $indicator=$subfields[0];
 416             $field{'indicator'}=$indicator;
 417             print "<pre>parse indicator:$indicator</pre>\n" if $debug;
 418             my $firstline=1;
 419             unless ($#subfields==0) {
 420                 my %subfields;
 421                 my @subfieldlist;
 422                 my $i;
 423                 for ($i=1; $i<=$#subfields; $i++) {
 424                     my $text=$subfields[$i];
 425                     my $subfieldcode=substr($text,0,1);
 426                     my $subfield=substr($text,1);
 427                     # if this subfield already exists, do array
 428                     if ($subfields{$subfieldcode}) {
 429                         my $subfieldlist=$subfields{$subfieldcode};
 430                         if ( ref($subfieldlist) eq 'ARRAY' ) {
 431                             # Already an array, add on to it
 432                             print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
 433                             @subfieldlist=@$subfieldlist;
 434                             push (@subfieldlist, $subfield);
 435                         } else {
 436                             # Change simple value to array
 437                             print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
 438                             @subfieldlist=($subfields{$subfieldcode}, $subfield);
 439                         }
 440                         # keep new array
 441                         $subfields{$subfieldcode}=\@subfieldlist;
 442                     } else {
 443                         # subfield doesn't exist yet, keep simple value
 444                         $subfields{$subfieldcode}=$subfield;
 445                     }
 446                 }
 447                 $field{'subfields'}=\%subfields;
 448             }
 449             push (@record, \%field);
 450         } # foreach field in record
 451         push (@records, \@record);
 452         # $counter++;
 453     }
 454     print "</pre>" if $debug;
 455     return @records;
 456 } # sub parsemarcfileformat
 457
 458 #----------------------------------------------
 459 sub taglabel {
 460     my ($tag)=@_;
 461
 462     return $tagtext{$tag};
 463
 464 } # sub taglabel
 465
 466 #---------------------------------------------
 467 # $Log$
 468 # Revision 1.3  2002/08/14 18:12:52  tonnesen
 469 # Added copyright statement to all .pl and .pm files
 470 #
 471 # Revision 1.2  2002/07/02 20:30:15  tonnesen
 472 # Merged SimpleMarc.pm over from rel-1-2
 473 #
 474 # Revision 1.1.2.4  2002/06/28 14:36:47  amillar
 475 # Fix broken logic on illustrator vs. add'l author
 476 #
 477 # Revision 1.1.2.3  2002/06/26 20:54:32  tonnesen
 478 # use warnings breaks on perl 5.005...
 479 #
 480 # Revision 1.1.2.2  2002/06/26 15:52:55  amillar
 481 # Fix display of marc tag labels and indicators
 482 #
 483 # Revision 1.1.2.1  2002/06/26 07:27:35  amillar
 484 # Moved acqui.simple MARC handling to new module SimpleMarc.pm
 485 #