C4/Ris.pm

   1 package C4::Ris;
   2
   3 # Original script :
   4 ## marc2ris: converts MARC21 and UNIMARC datasets to RIS format
   5 ##           See comments below for compliance with other MARC dialects
   6 ##
   7 ## usage: perl marc2ris < infile.marc > outfile.ris
   8 ##
   9 ## Dependencies: perl 5.6.0 or later
  10 ##               MARC::Record
  11 ##               MARC::Charset
  12 ##
  13 ## markus@mhoenicka.de 2002-11-16
  14
  15 ##   This program is free software; you can redistribute it and/or modify
  16 ##   it under the terms of the GNU General Public License as published by
  17 ##   the Free Software Foundation; either version 2 of the License, or
  18 ##   (at your option) any later version.
  19 ##
  20 ##   This program is distributed in the hope that it will be useful,
  21 ##   but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 ##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23 ##   GNU General Public License for more details.
  24
  25 ##   You should have received a copy of the GNU General Public License
  26 ##   along with this program; if not, write to the Free Software
  27 ##   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  28
  29 ## Some background about MARC as understood by this script
  30 ## The default input format used in this script is MARC21, which
  31 ## superseded USMARC and CANMARC. The specification can be found at:
  32 ## http://lcweb.loc.gov/marc/
  33 ## UNIMARC follows the specification at:
  34 ## http://www.ifla.org/VI/3/p1996-1/sec-uni.htm
  35 ## UKMARC support is a bit shaky because there is no specification available
  36 ## for free. The wisdom used in this script was taken from a PDF document
  37 ## comparing UKMARC to MARC21 found at:
  38 ## www.bl.uk/services/bibliographic/marcchange.pdf
  39
  40
  41 # Modified 2008 by BibLibre for Koha
  42 # Modified 2011 by Catalyst
  43 # Modified 2011 by Equinox Software, Inc.
  44 #
  45 # This file is part of Koha.
  46 #
  47 # Koha is free software; you can redistribute it and/or modify it
  48 # under the terms of the GNU General Public License as published by
  49 # the Free Software Foundation; either version 3 of the License, or
  50 # (at your option) any later version.
  51 #
  52 # Koha is distributed in the hope that it will be useful, but
  53 # WITHOUT ANY WARRANTY; without even the implied warranty of
  54 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  55 # GNU General Public License for more details.
  56 #
  57 # You should have received a copy of the GNU General Public License
  58 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  59 #
  60 #
  61
  62 #use strict;
  63 #use warnings;
  64
  65 use List::MoreUtils qw/uniq/;
  66 use vars qw($VERSION @ISA @EXPORT);
  67
  68 # set the version for version checking
  69 $VERSION = 3.07.00.049;
  70
  71 @ISA = qw(Exporter);
  72
  73 # only export API methods
  74
  75 @EXPORT = qw(
  76   &marc2ris
  77 );
  78
  79
  80 =head1 marc2bibtex - Convert from UNIMARC to RIS
  81
  82   my ($ris) = marc2ris($record);
  83
  84 Returns a RIS scalar
  85
  86 C<$record> - a MARC::Record object
  87
  88 =cut
  89
  90 sub marc2ris {
  91     my ($record) = @_;
  92     my $output;
  93
  94     my $marcflavour = C4::Context->preference("marcflavour");
  95     my $intype = lc($marcflavour);
  96     my $marcprint = 0; # Debug flag;
  97
  98     # Let's redirect stdout
  99     open my $oldout, ">&STDOUT";
 100     my $outvar;
 101     close STDOUT;
 102     open STDOUT,'>', \$outvar;
 103
 104
 105         ## First we should check the character encoding. This may be
 106         ## MARC-8 or UTF-8. The former is indicated by a blank, the latter
 107         ## by 'a' at position 09 (zero-based) of the leader
 108         my $leader = $record->leader();
 109         if ($intype eq "marc21") {
 110             if ($leader =~ /^.{9}a/) {
 111                 print "<marc>---\r\n<marc>UTF-8 data\r\n" if $marcprint;
 112                 $utf = 1;
 113             }
 114             else {
 115                 print "<marc>---\r\n<marc>MARC-8 data\r\n" if $marcprint;
 116             }
 117         }
 118         ## else: other MARC formats do not specify the character encoding
 119         ## we assume it's *not* UTF-8
 120
 121         ## start RIS dataset
 122         &print_typetag($leader);
 123
 124         ## retrieve all author fields and collect them in a list
 125         my @author_fields;
 126
 127         if ($intype eq "unimarc") {
 128             ## Fields 700, 701, and 702 can contain author names
 129             @author_fields = ($record->field('700'), $record->field('701'), $record->field('702'));
 130         }
 131         else {  ## marc21, ukmarc
 132             ## Field 100 sometimes carries main author
 133             ## Field(s) 700 carry added entries - personal names
 134             @author_fields = ($record->field('100'), $record->field('700'));
 135         }
 136
 137         ## loop over all author fields
 138         foreach my $field (@author_fields) {
 139             if (length($field)) {
 140                 my $author = &get_author($field);
 141                 print "AU  - ",&charconv($author),"\r\n";
 142             }
 143         }
 144
 145         # ToDo: should we specify anonymous as author if we didn't find
 146         # one? or use one of the corporate/meeting names below?
 147
 148         ## add corporate names or meeting names as editors ??
 149         my @editor_fields;
 150
 151         if ($intype eq "unimarc") {
 152             ## Fields 710, 711, and 712 can carry corporate names
 153             ## Field(s) 720, 721, 722, 730 have additional candidates
 154             @editor_fields = ($record->field('710'), $record->field('711'), $record->field('712'), $record->field('720'), $record->field('721'), $record->field('722'), $record->field('730'));
 155         }
 156         else { ## marc21, ukmarc
 157             ## Fields 110 and 111 carry the main entries - corporate name and
 158             ## meeting name, respectively
 159             ## Field(s) 710, 711 carry added entries - personal names
 160             @editor_fields = ($record->field('110'), $record->field('111'), $record->field('710'), $record->field('711'));
 161         }
 162
 163         ## loop over all editor fields
 164         foreach my $field (@editor_fields) {
 165             if (length($field)) {
 166                 my $editor = &get_editor($field);
 167                 print "ED  - ",&charconv($editor),"\r\n";
 168             }
 169         }
 170
 171         ## get info from the title field
 172         if ($intype eq "unimarc") {
 173             &print_title($record->field('200'));
 174         }
 175         else { ## marc21, ukmarc
 176             &print_title($record->field('245'));
 177         }
 178
 179         ## series title
 180         if ($intype eq "unimarc") {
 181             &print_stitle($record->field('225'));
 182         }
 183         else { ## marc21, ukmarc
 184             &print_stitle($record->field('490'));
 185         }
 186
 187         ## ISBN/ISSN
 188         if ($intype eq "unimarc") {
 189             &print_isbn($record->field('010'));
 190             &print_issn($record->field('011'));
 191         }
 192         elsif ($intype eq "ukmarc") {
 193             &print_isbn($record->field('021'));
 194             ## this is just an assumption
 195             &print_issn($record->field('022'));
 196         }
 197         else { ## assume marc21
 198             &print_isbn($record->field('020'));
 199             &print_issn($record->field('022'));
 200         }
 201
 202         if ($intype eq "marc21") {
 203             &print_loc_callno($record->field('050'));
 204             &print_dewey($record->field('082'));
 205         }
 206         ## else: unimarc, ukmarc do not seem to store call numbers?
 207
 208         ## publication info
 209         if ($intype eq "unimarc") {
 210             &print_pubinfo($record->field('210'));
 211         }
 212         else { ## marc21, ukmarc
 213             if ($record->field('264')) {
 214                  &print_pubinfo($record->field('264'));
 215             }
 216             else {
 217             &print_pubinfo($record->field('260'));
 218             }
 219         }
 220
 221         ## 6XX fields contain KW candidates. We add all of them to a
 222
 223     my @field_list;
 224     if ($intype eq "unimarc") {
 225         @field_list = ('600', '601', '602', '604', '605', '606','607', '608', '610', '615', '620', '660', '661', '670', '675', '676', '680', '686');
 226     } elsif ($intype eq "ukmarc") {
 227         @field_list = ('600', '610', '611', '630', '650', '651','653', '655', '660', '661', '668', '690', '691', '692', '695');
 228     } else { ## assume marc21
 229         @field_list = ('600', '610', '611', '630', '650', '651','653', '654', '655', '656', '657', '658');
 230     }
 231
 232     my @kwpool;
 233     for my $f ( @field_list ) {
 234         my @fields = $record->field($f);
 235         push @kwpool, ( get_keywords("$f",$record->field($f)) );
 236     }
 237
 238     # Remove duplicate
 239     @kwpool = uniq @kwpool;
 240
 241     for my $kw ( @kwpool ) {
 242         print "KW  - ", &charconv($kw), "\r\n";
 243     }
 244
 245         ## 5XX have various candidates for notes and abstracts. We pool
 246         ## all notes-like stuff in one list.
 247         my @notepool;
 248
 249         ## these fields have notes candidates
 250         if ($intype eq "unimarc") {
 251             foreach ('300', '301', '302', '303', '304', '305', '306', '307', '308', '310', '311', '312', '313', '314', '315', '316', '317', '318', '320', '321', '322', '323', '324', '325', '326', '327', '328', '332', '333', '336', '337', '345') {
 252                 &pool_subx(\@notepool, $_, $record->field($_));
 253             }
 254         }
 255         elsif ($intype eq "ukmarc") {
 256             foreach ('500', '501', '502', '503', '504', '505', '506', '508', '514', '515', '516', '521', '524', '525', '528', '530', '531', '532', '533', '534', '535', '537', '538', '540', '541', '542', '544', '554', '555', '556', '557', '561', '563', '580', '583', '584', '586') {
 257                 &pool_subx(\@notepool, $_, $record->field($_));
 258         }
 259         }
 260         else { ## assume marc21
 261             foreach ('500', '501', '502', '504', '505', '506', '507', '508', '510', '511', '513', '514', '515', '516', '518', '521', '522', '524', '525', '526', '530', '533', '534', '535') {
 262                 &pool_subx(\@notepool, $_, $record->field($_));
 263             }
 264         }
 265
 266         my $allnotes = join "; ", @notepool;
 267
 268         if (length($allnotes) > 0) {
 269             print "N1  - ", &charconv($allnotes), "\r\n";
 270         }
 271
 272         ## 320/520 have the abstract
 273         if ($intype eq "unimarc") {
 274             &print_abstract($record->field('320'));
 275         }
 276         elsif ($intype eq "ukmarc") {
 277             &print_abstract($record->field('512'), $record->field('513'));
 278         }
 279         else { ## assume marc21
 280             &print_abstract($record->field('520'));
 281         }
 282
 283     # 856u has the URI
 284     if ($record->field('856')) {
 285         print_uri($record->field('856'));
 286     }
 287
 288         ## end RIS dataset
 289         print "ER  - \r\n";
 290
 291     # Let's re-redirect stdout
 292     close STDOUT;
 293     open STDOUT, ">&", $oldout;
 294
 295     return $outvar;
 296
 297 }
 298
 299
 300 ##********************************************************************
 301 ## print_typetag(): prints the first line of a RIS dataset including
 302 ## the preceeding newline
 303 ## Argument: the leader of a MARC dataset
 304 ## Returns: the value at leader position 06
 305 ##********************************************************************
 306 sub print_typetag {
 307   my ($leader)= @_;
 308     ## the keys of typehash are the allowed values at position 06
 309     ## of the leader of a MARC record, the values are the RIS types
 310     ## that might appropriately represent these types.
 311     my %ustypehash = (
 312                     "a" => "BOOK",
 313                     "c" => "MUSIC",
 314                     "d" => "MUSIC",
 315                     "e" => "MAP",
 316                     "f" => "MAP",
 317                     "g" => "ADVS",
 318                     "i" => "SOUND",
 319                     "j" => "SOUND",
 320                     "k" => "ART",
 321                     "m" => "DATA",
 322                     "o" => "GEN",
 323                     "p" => "GEN",
 324                     "r" => "ART",
 325                     "t" => "GEN",
 326                 );
 327
 328     my %unitypehash = (
 329                     "a" => "BOOK",
 330                     "b" => "BOOK",
 331                     "c" => "MUSIC",
 332                     "d" => "MUSIC",
 333                     "e" => "MAP",
 334                     "f" => "MAP",
 335                     "g" => "ADVS",
 336                     "i" => "SOUND",
 337                     "j" => "SOUND",
 338                     "k" => "ART",
 339                     "l" => "ELEC",
 340                     "m" => "ADVS",
 341                     "r" => "ART",
 342                 );
 343
 344     ## The type of a MARC record is found at position 06 of the leader
 345     my $typeofrecord = substr($leader, 6, 1);
 346
 347     ## ToDo: for books, field 008 positions 24-27 might have a few more
 348     ## hints
 349
 350     my %typehash;
 351
 352     ## the ukmarc here is just a guess
 353     if ($intype eq "marc21" || $intype eq "ukmarc") {
 354         %typehash = %ustypehash;
 355     }
 356     elsif ($intype eq "unimarc") {
 357         %typehash = %unitypehash;
 358     }
 359     else {
 360         ## assume MARC21 as default
 361         %typehash = %ustypehash;
 362     }
 363
 364     if (!exists $typehash{$typeofrecord}) {
 365         print "TY  - BOOK\r\n"; ## most reasonable default
 366         warn ("no type found - assume BOOK") if $marcprint;
 367     }
 368     else {
 369         print "TY  - $typehash{$typeofrecord}\r\n";
 370     }
 371
 372     ## use $typeofrecord as the return value, just in case
 373     $typeofrecord;
 374 }
 375
 376 ##********************************************************************
 377 ## normalize_author(): normalizes an authorname
 378 ## Arguments: authorname subfield a
 379 ##            authorname subfield b
 380 ##            authorname subfield c
 381 ##            name type if known: 0=direct order
 382 ##                               1=only surname or full name in
 383 ##                                 inverted order
 384 ##                               3=family, clan, dynasty name
 385 ## Returns: the normalized authorname
 386 ##********************************************************************
 387 sub normalize_author {
 388     my($rawauthora, $rawauthorb, $rawauthorc, $nametype) = @_;
 389
 390     if ($nametype == 0) {
 391         # ToDo: convert every input to Last[,(F.|First)[ (M.|Middle)[,Suffix]]]
 392         warn("name >>$rawauthora<< in direct order - leave as is") if $marcprint;
 393         return $rawauthora;
 394     }
 395     elsif ($nametype == 1) {
 396         ## start munging subfield a (the real name part)
 397         ## remove spaces after separators
 398         $rawauthora =~ s%([,.]+) *%$1%g;
 399
 400         ## remove trailing separators after spaces
 401         $rawauthora =~ s% *[,;:/]*$%%;
 402
 403         ## remove periods after a non-abbreviated name
 404         $rawauthora =~ s%(\w{2,})\.%$1%g;
 405
 406         ## start munging subfield b (something like the suffix)
 407         ## remove trailing separators after spaces
 408         $rawauthorb =~ s% *[,;:/]*$%%;
 409
 410         ## we currently ignore subfield c until someone complains
 411         if (length($rawauthorb) > 0) {
 412             return join ",", ($rawauthora, $rawauthorb);
 413         }
 414         else {
 415             return $rawauthora;
 416         }
 417     }
 418     elsif ($nametype == 3) {
 419         return $rawauthora;
 420     }
 421 }
 422
 423 ##********************************************************************
 424 ## get_author(): gets authorname info from MARC fields 100, 700
 425 ## Argument: field (100 or 700)
 426 ## Returns: an author string in the format found in the record
 427 ##********************************************************************
 428 sub get_author {
 429     my ($authorfield) = @_;
 430     my ($indicator);
 431
 432     ## the sequence of the name parts is encoded either in indicator
 433     ## 1 (marc21) or 2 (unimarc)
 434     if ($intype eq "unimarc") {
 435         $indicator = 2;
 436     }
 437     else { ## assume marc21
 438         $indicator = 1;
 439     }
 440
 441     print "<marc>:Author(Ind$indicator): ", $authorfield->indicator("$indicator"),"\r\n" if $marcprint;
 442     print "<marc>:Author(\$a): ", $authorfield->subfield('a'),"\r\n" if $marcprint;
 443     print "<marc>:Author(\$b): ", $authorfield->subfield('b'),"\r\n" if $marcprint;
 444     print "<marc>:Author(\$c): ", $authorfield->subfield('c'),"\r\n" if $marcprint;
 445     print "<marc>:Author(\$h): ", $authorfield->subfield('h'),"\r\n" if $marcprint;
 446     if ($intype eq "ukmarc") {
 447         my $authorname = $authorfield->subfield('a') . "," . $authorfield->subfield('h');
 448         normalize_author($authorname, $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 449     }
 450     else {
 451         normalize_author($authorfield->subfield('a'), $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 452     }
 453 }
 454
 455 ##********************************************************************
 456 ## get_editor(): gets editor info from MARC fields 110, 111, 710, 711
 457 ## Argument: field (110, 111, 710, or 711)
 458 ## Returns: an author string in the format found in the record
 459 ##********************************************************************
 460 sub get_editor {
 461     my ($editorfield) = @_;
 462
 463     if (!$editorfield) {
 464         return;
 465     }
 466     else {
 467         print "<marc>Editor(\$a): ", $editorfield->subfield('a'),"\r\n" if $marcprint;
 468         print "<marc>Editor(\$b): ", $editorfield->subfield('b'),"\r\n" if $marcprint;
 469         print "<marc>editor(\$c): ", $editorfield->subfield('c'),"\r\n" if $marcprint;
 470         return $editorfield->subfield('a');
 471     }
 472 }
 473
 474 ##********************************************************************
 475 ## print_title(): gets info from MARC field 245
 476 ## Arguments: field (245)
 477 ## Returns:
 478 ##********************************************************************
 479 sub print_title {
 480     my ($titlefield) = @_;
 481     if (!$titlefield) {
 482         print "<marc>empty title field (245)\r\n" if $marcprint;
 483         warn("empty title field (245)") if $marcprint;
 484     }
 485     else {
 486         print "<marc>Title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 487         print "<marc>Title(\$b): ",$titlefield->subfield('b'),"\r\n" if $marcprint;
 488         print "<marc>Title(\$c): ",$titlefield->subfield('c'),"\r\n" if $marcprint;
 489
 490         ## The title is usually written in a very odd notation. The title
 491         ## proper ($a) often ends with a space followed by a separator like
 492         ## a slash or a colon. The subtitle ($b) doesn't start with a space
 493         ## so simple concatenation looks odd. We have to conditionally remove
 494         ## the separator and make sure there's a space between title and
 495         ## subtitle
 496
 497         my $clean_title = $titlefield->subfield('a');
 498
 499         my $clean_subtitle = $titlefield->subfield('b');
 500         $clean_title =~ s% *[/:;.]$%%;
 501         $clean_subtitle =~ s%^ *(.*) *[/:;.]$%$1%;
 502
 503         if (length($clean_title) > 0
 504             || (length($clean_subtitle) > 0 && $intype ne "unimarc")) {
 505             print "TI  - ", &charconv($clean_title);
 506
 507             ## subfield $b is relevant only for marc21/ukmarc
 508             if (length($clean_subtitle) > 0 && $intype ne "unimarc") {
 509                 print ": ",&charconv($clean_subtitle);
 510             }
 511             print "\r\n";
 512         }
 513
 514         ## The statement of responsibility is just this: horrors. There is
 515         ## no formal definition how authors, editors and the like should
 516         ## be written and designated. The field is free-form and resistant
 517         ## to all parsing efforts, so this information is lost on me
 518     }
 519 }
 520
 521 ##********************************************************************
 522 ## print_stitle(): prints info from series title field
 523 ## Arguments: field
 524 ## Returns:
 525 ##********************************************************************
 526 sub print_stitle {
 527     my ($titlefield) = @_;
 528
 529     if (!$titlefield) {
 530         print "<marc>empty series title field\r\n" if $marcprint;
 531     }
 532     else {
 533         print "<marc>Series title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 534         my $clean_title = $titlefield->subfield('a');
 535
 536         $clean_title =~ s% *[/:;.]$%%;
 537
 538         if (length($clean_title) > 0) {
 539             print "T2  - ", &charconv($clean_title),"\r\n";
 540         }
 541
 542         if ($intype eq "unimarc") {
 543             print "<marc>Series vol(\$v): ",$titlefield->subfield('v'),"\r\n" if $marcprint;
 544             if (length($titlefield->subfield('v')) > 0) {
 545                 print "VL  - ", &charconv($titlefield->subfield('v')),"\r\n";
 546             }
 547         }
 548     }
 549 }
 550
 551 ##********************************************************************
 552 ## print_isbn(): gets info from MARC field 020
 553 ## Arguments: field (020)
 554 ##********************************************************************
 555 sub print_isbn {
 556     my($isbnfield) = @_;
 557
 558     if (!$isbnfield || length ($isbnfield->subfield('a')) == 0) {
 559         print "<marc>no isbn found (020\$a)\r\n" if $marcprint;
 560         warn("no isbn found") if $marcprint;
 561     }
 562     else {
 563         if (length ($isbnfield->subfield('a')) < 10) {
 564             print "<marc>truncated isbn (020\$a)\r\n" if $marcprint;
 565             warn("truncated isbn") if $marcprint;
 566         }
 567
 568         my $isbn = substr($isbnfield->subfield('a'), 0, 10);
 569         print "SN  - ", &charconv($isbn), "\r\n";
 570     }
 571 }
 572
 573 ##********************************************************************
 574 ## print_issn(): gets info from MARC field 022
 575 ## Arguments: field (022)
 576 ##********************************************************************
 577 sub print_issn {
 578     my($issnfield) = @_;
 579
 580     if (!$issnfield || length ($issnfield->subfield('a')) == 0) {
 581         print "<marc>no issn found (022\$a)\r\n" if $marcprint;
 582         warn("no issn found") if $marcprint;
 583     }
 584     else {
 585         if (length ($issnfield->subfield('a')) < 9) {
 586             print "<marc>truncated issn (022\$a)\r\n" if $marcprint;
 587             warn("truncated issn") if $marcprint;
 588         }
 589
 590         my $issn = substr($issnfield->subfield('a'), 0, 9);
 591         print "SN  - ", &charconv($issn), "\r\n";
 592     }
 593 }
 594
 595 ###
 596 # print_uri() prints info from 856 u
 597 ###
 598 sub print_uri {
 599     my @f856s = @_;
 600
 601     foreach my $f856 (@f856s) {
 602         if (my $uri = $f856->subfield('u')) {
 603                 print "UR  - ", charconv($uri), "\r\n";
 604         }
 605     }
 606 }
 607
 608 ##********************************************************************
 609 ## print_loc_callno(): gets info from MARC field 050
 610 ## Arguments: field (050)
 611 ##********************************************************************
 612 sub print_loc_callno {
 613     my($callnofield) = @_;
 614
 615     if (!$callnofield || length ($callnofield->subfield('a')) == 0) {
 616         print "<marc>no LOC call number found (050\$a)\r\n" if $marcprint;
 617         warn("no LOC call number found") if $marcprint;
 618     }
 619     else {
 620         print "AV  - ", &charconv($callnofield->subfield('a')), " ", &charconv($callnofield->subfield('b')), "\r\n";
 621     }
 622 }
 623
 624 ##********************************************************************
 625 ## print_dewey(): gets info from MARC field 082
 626 ## Arguments: field (082)
 627 ##********************************************************************
 628 sub print_dewey {
 629     my($deweyfield) = @_;
 630
 631     if (!$deweyfield || length ($deweyfield->subfield('a')) == 0) {
 632         print "<marc>no Dewey number found (082\$a)\r\n" if $marcprint;
 633         warn("no Dewey number found") if $marcprint;
 634     }
 635     else {
 636         print "U1  - ", &charconv($deweyfield->subfield('a')), " ", &charconv($deweyfield->subfield('2')), "\r\n";
 637     }
 638 }
 639
 640 ##********************************************************************
 641 ## print_pubinfo(): gets info from MARC field 260
 642 ## Arguments: field (260)
 643 ##********************************************************************
 644 sub print_pubinfo {
 645     my($pubinfofield) = @_;
 646
 647     if (!$pubinfofield) {
 648     print "<marc>no publication information found (260/264)\r\n" if $marcprint;
 649         warn("no publication information found") if $marcprint;
 650     }
 651     else {
 652         ## the following information is available in MARC21:
 653         ## $a place -> CY
 654         ## $b publisher -> PB
 655         ## $c date -> PY
 656         ## the corresponding subfields for UNIMARC:
 657         ## $a place -> CY
 658         ## $c publisher -> PB
 659         ## $d date -> PY
 660
 661         ## all of them are repeatable. We pool all places into a
 662         ## comma-separated list in CY. We also pool all publishers
 663         ## into a comma-separated list in PB.  We break the rule with
 664         ## the date field because this wouldn't make much sense. In
 665         ## this case, we use the first occurrence for PY, the second
 666         ## for Y2, and ignore the rest
 667
 668         my @pubsubfields = $pubinfofield->subfields();
 669         my @cities;
 670         my @publishers;
 671         my $pycounter = 0;
 672
 673         my $pubsub_place;
 674         my $pubsub_publisher;
 675         my $pubsub_date;
 676
 677         if ($intype eq "unimarc") {
 678             $pubsub_place = "a";
 679             $pubsub_publisher = "c";
 680             $pubsub_date = "d";
 681         }
 682         else { ## assume marc21
 683             $pubsub_place = "a";
 684             $pubsub_publisher = "b";
 685             $pubsub_date = "c";
 686         }
 687
 688         ## loop over all subfield list entries
 689         for my $tuple (@pubsubfields) {
 690             ## each tuple consists of the subfield code and the value
 691             if (@$tuple[0] eq $pubsub_place) {
 692                 ## strip any trailing crap
 693                 $_ = @$tuple[1];
 694                 s% *[,;:/]$%%;
 695                 ## pool all occurrences in a list
 696                 push (@cities, $_);
 697             }
 698             elsif (@$tuple[0] eq $pubsub_publisher) {
 699                 ## strip any trailing crap
 700                 $_ = @$tuple[1];
 701                 s% *[,;:/]$%%;
 702                 ## pool all occurrences in a list
 703                 push (@publishers, $_);
 704             }
 705             elsif (@$tuple[0] eq $pubsub_date) {
 706                 ## the dates are free-form, so we want to extract
 707                 ## a four-digit year and leave the rest as
 708                 ## "other info"
 709                 $protoyear = @$tuple[1];
 710                 print "<marc>Year (260\$c): $protoyear\r\n" if $marcprint;
 711
 712                 ## strip any separator chars at the end
 713                 $protoyear =~ s% *[\.;:/]*$%%;
 714
 715                 ## isolate a four-digit year. We discard anything
 716                 ## preceeding the year, but keep everything after
 717                 ## the year as other info.
 718                 $protoyear =~ s%\D*([0-9\-]{4})(.*)%$1///$2%;
 719
 720                 ## check what we've got. If there is no four-digit
 721                 ## year, make it up. If digits are replaced by '-',
 722                 ## replace those with 0s
 723
 724                 if (index($protoyear, "/") == 4) {
 725                     ## have year info
 726                     ## replace all '-' in the four-digit year
 727                     ## by '0'
 728                     substr($protoyear,0,4) =~ s!-!0!g;
 729                 }
 730                 else {
 731                     ## have no year info
 732                     print "<marc>no four-digit year found, use 0000\r\n" if $marcprint;
 733                     $protoyear = "0000///$protoyear";
 734                     warn("no four-digit year found, use 0000") if $marcprint;
 735                 }
 736
 737                 if ($pycounter == 0 && length($protoyear)) {
 738                     print "PY  - $protoyear\r\n";
 739                 }
 740                 elsif ($pycounter == 1 && length($_)) {
 741                     print "Y2  - $protoyear\r\n";
 742                 }
 743                 ## else: discard
 744             }
 745             ## else: discard
 746         }
 747
 748         ## now dump the collected CY and PB lists
 749         if (@cities > 0) {
 750             print "CY  - ", &charconv(join(", ", @cities)), "\r\n";
 751         }
 752         if (@publishers > 0) {
 753             print "PB  - ", &charconv(join(", ", @publishers)), "\r\n";
 754         }
 755     }
 756 }
 757
 758 ##********************************************************************
 759 ## get_keywords(): prints info from MARC fields 6XX
 760 ## Arguments: list of fields (6XX)
 761 ##********************************************************************
 762 sub get_keywords {
 763     my($fieldname, @keywords) = @_;
 764
 765     my @kw;
 766     ## a list of all possible subfields
 767     my @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '2', '3', '4');
 768
 769     ## loop over all 6XX fields
 770     foreach my $kwfield (@keywords) {
 771         if ($kwfield != undef) {
 772             ## authornames get special treatment
 773             if ($fieldname eq "600") {
 774                 my $val = normalize_author($kwfield->subfield('a'), $kwfield->subfield('b'), $kwfield->subfield('c'), $kwfield->indicator('1'));
 775         push @kw, $val;
 776                 print "<marc>Field $kwfield subfield a:", $kwfield->subfield('a'), "\r\n<marc>Field $kwfield subfield b:", $kwfield->subfield('b'), "\r\n<marc>Field $kwfield subfield c:", $kwfield->subfield('c'), "\r\n" if $marcprint;
 777             }
 778             else {
 779                 ## retrieve all available subfields
 780                 @kwsubfields = $kwfield->subfields();
 781
 782                 ## loop over all available subfield tuples
 783         foreach my $kwtuple (@kwsubfields) {
 784                     ## loop over all subfields to check
 785             foreach my $subfield (@subfields) {
 786                         ## [0] contains subfield code
 787                         if (@$kwtuple[0] eq $subfield) {
 788                             ## [1] contains value, remove trailing separators
 789                             @$kwtuple[1] =~ s% *[,;.:/]*$%%;
 790                             if (length(@$kwtuple[1]) > 0) {
 791                 push @kw, @$kwtuple[1];
 792                                 print "<marc>Field $fieldname subfield $subfield:", @$kwtuple[1], "\r\n" if $marcprint;
 793                             }
 794                             ## we can leave the subfields loop here
 795                             last;
 796                         }
 797                     }
 798                 }
 799             }
 800         }
 801     }
 802     return @kw;
 803 }
 804
 805 ##********************************************************************
 806 ## pool_subx(): adds contents of several subfields to a list
 807 ## Arguments: reference to a list
 808 ##            field name
 809 ##            list of fields (5XX)
 810 ##********************************************************************
 811 sub pool_subx {
 812     my($aref, $fieldname, @notefields) = @_;
 813
 814     ## we use a list that contains the interesting subfields
 815     ## for each field
 816     # ToDo: this is apparently correct only for marc21
 817     my @subfields;
 818
 819     if ($fieldname eq "500") {
 820         @subfields = ('a');
 821     }
 822     elsif ($fieldname eq "501") {
 823         @subfields = ('a');
 824     }
 825     elsif ($fieldname eq "502") {
 826         @subfields = ('a');
 827             }
 828     elsif ($fieldname eq "504") {
 829         @subfields = ('a', 'b');
 830     }
 831     elsif ($fieldname eq "505") {
 832         @subfields = ('a', 'g', 'r', 't', 'u');
 833     }
 834     elsif ($fieldname eq "506") {
 835         @subfields = ('a', 'b', 'c', 'd', 'e');
 836     }
 837     elsif ($fieldname eq "507") {
 838         @subfields = ('a', 'b');
 839     }
 840     elsif ($fieldname eq "508") {
 841         @subfields = ('a');
 842     }
 843     elsif ($fieldname eq "510") {
 844         @subfields = ('a', 'b', 'c', 'x', '3');
 845     }
 846     elsif ($fieldname eq "511") {
 847         @subfields = ('a');
 848     }
 849     elsif ($fieldname eq "513") {
 850         @subfields = ('a', 'b');
 851     }
 852     elsif ($fieldname eq "514") {
 853         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'm', 'u', 'z');
 854     }
 855     elsif ($fieldname eq "515") {
 856         @subfields = ('a');
 857     }
 858     elsif ($fieldname eq "516") {
 859         @subfields = ('a');
 860     }
 861     elsif ($fieldname eq "518") {
 862         @subfields = ('a', '3');
 863     }
 864     elsif ($fieldname eq "521") {
 865         @subfields = ('a', 'b', '3');
 866     }
 867     elsif ($fieldname eq "522") {
 868         @subfields = ('a');
 869     }
 870     elsif ($fieldname eq "524") {
 871         @subfields = ('a', '2', '3');
 872     }
 873     elsif ($fieldname eq "525") {
 874         @subfields = ('a');
 875     }
 876     elsif ($fieldname eq "526") {
 877         @subfields = ('a', 'b', 'c', 'd', 'i', 'x', 'z', '5');
 878     }
 879     elsif ($fieldname eq "530") {
 880         @subfields = ('a', 'b', 'c', 'd', 'u', '3');
 881     }
 882     elsif ($fieldname eq "533") {
 883         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'm', 'n', '3');
 884     }
 885     elsif ($fieldname eq "534") {
 886         @subfields = ('a', 'b', 'c', 'e', 'f', 'k', 'l', 'm', 'n', 'p', 't', 'x', 'z');
 887     }
 888     elsif ($fieldname eq "535") {
 889         @subfields = ('a', 'b', 'c', 'd', 'g', '3');
 890     }
 891
 892     ## loop over all notefields
 893     foreach my $notefield (@notefields) {
 894         if ($notefield != undef) {
 895             ## retrieve all available subfield tuples
 896             @notesubfields = $notefield->subfields();
 897
 898             ## loop over all subfield tuples
 899         foreach my $notetuple (@notesubfields) {
 900                 ## loop over all subfields to check
 901         foreach my $subfield (@subfields) {
 902                     ## [0] contains subfield code
 903                     if (@$notetuple[0] eq $subfield) {
 904                         ## [1] contains value, remove trailing separators
 905                         print "<marc>field $fieldname subfield $subfield: ", @$notetuple[1], "\r\n" if $marcprint;
 906                         @$notetuple[1] =~ s% *[,;.:/]*$%%;
 907                         if (length(@$notetuple[1]) > 0) {
 908                             ## add to list
 909                             push @{$aref}, @$notetuple[1];
 910                         }
 911                         last;
 912                     }
 913                 }
 914             }
 915         }
 916     }
 917 }
 918
 919 ##********************************************************************
 920 ## print_abstract(): prints abstract fields
 921 ## Arguments: list of fields (520)
 922 ##********************************************************************
 923 sub print_abstract {
 924     # ToDo: take care of repeatable subfields
 925     my(@abfields) = @_;
 926
 927     ## we check the following subfields
 928     my @subfields = ('a', 'b');
 929
 930     ## we generate a list for all useful strings
 931     my @abstrings;
 932
 933     ## loop over all abfields
 934     foreach my $abfield (@abfields) {
 935         foreach my $field (@subfields) {
 936             if ( length( $abfield->subfield($field) ) > 0 ) {
 937                 my $ab = $abfield->subfield($field);
 938
 939                 print "<marc>field 520 subfield $field: $ab\r\n" if $marcprint;
 940
 941                 ## strip trailing separators
 942                 $ab =~ s% *[;,:./]*$%%;
 943
 944                 ## add string to the list
 945                 push( @abstrings, $ab );
 946             }
 947         }
 948     }
 949
 950     my $allabs = join "; ", @abstrings;
 951
 952     if (length($allabs) > 0) {
 953         print "N2  - ", &charconv($allabs), "\r\n";
 954     }
 955
 956 }
 957
 958
 959
 960 ##********************************************************************
 961 ## charconv(): converts to a different charset based on a global var
 962 ## Arguments: string
 963 ## Returns: string
 964 ##********************************************************************
 965 sub charconv {
 966     if ($utf) {
 967         ## return unaltered if already utf-8
 968         return @_;
 969     }
 970     elsif ($uniout eq "t") {
 971         ## convert to utf-8
 972         return marc8_to_utf8("@_");
 973     }
 974     else {
 975         ## return unaltered if no utf-8 requested
 976         return @_;
 977     }
 978 }
 979 1;