merging 2.2 branch with head. Sorry for not making it before, many many commits done...
[koha.git] / misc / migration_tools / build6xx.pl
1 #!/usr/bin/perl
2 # script that rebuild thesaurus from biblio table.
3
4 # delete  FROM  `marc_subfield_table`  WHERE tag =  "606" AND subfieldcode = 9;
5 use strict;
6
7 # Koha modules used
8 use MARC::File::USMARC;
9 use MARC::Record;
10 use MARC::Batch;
11 use C4::Context;
12 use C4::Biblio;
13 use C4::AuthoritiesMarc;
14 use Time::HiRes qw(gettimeofday);
15
16 use Getopt::Long;
17 my ( $input_marc_file, $number) = ('',0);
18 my ($version, $verbose, $test_parameter, $field,$delete,$category,$subfields);
19 GetOptions(
20     'h' => \$version,
21     'd' => \$delete,
22     't' => \$test_parameter,
23     's:s' => \$subfields,
24     'v' => \$verbose,
25     'c:s' => \$category,
26 );
27
28 if ($version || ($category eq '')) {
29         print <<EOF
30 small script to recreate a authority table into Koha.
31 parameters :
32 \tc : thesaurus category. Can be filled with anything, the NC is hardcoded. But mandatory to confirm that you want to rebuild 6xx
33 \d : delete every entry of the selected category before doing work.
34
35 SAMPLES :
36  ./build6xx.pl -c NC -d 
37 EOF
38 ;#
39 die;
40 }
41
42 my $dbh = C4::Context->dbh;
43 my @subf = $subfields =~ /(##\d\d\d##.)/g;
44 if ($delete) {
45         print "deleting thesaurus\n";
46         my $del1 = $dbh->prepare("delete from auth_subfield_table where authid=?");
47         my $del2 = $dbh->prepare("delete from auth_word where authid=?");
48         my $sth = $dbh->prepare("select authid from auth_header where authtypecode='NC'");
49         $sth->execute;
50         while (my ($authid) = $sth->fetchrow) {
51                 $del1->execute($authid);
52                 $del2->execute($authid);
53         }
54         $dbh->do("delete from auth_header where authtypecode='NC'");
55         $dbh->do("delete from marc_subfield_table where tag='606' and subfieldcode='9'");
56         $dbh->do("delete from marc_word where tagsubfield='6069'");
57 }
58
59 if ($test_parameter) {
60         print "TESTING MODE ONLY\n    DOING NOTHING\n===============\n";
61 }
62 $|=1; # flushes output
63 my $starttime = gettimeofday;
64 my $sth = $dbh->prepare("select bibid from marc_biblio");
65 $sth->execute;
66 my $i=1;
67 my %alreadydone;
68
69 # search biblios to "connect" to an authority with any number of $x (limited to 4 $x in this script)
70 my $sthBIBLIOS = $dbh->prepare("select distinct m1.bibid,m1.tag,m1.tagorder,m1.subfieldorder from marc_subfield_table as m1 where tag in (606) and subfieldcode='a' and subfieldvalue=?");
71 my $sthBIBLIOSx = $dbh->prepare("select distinct m1.bibid,m1.tag,m1.tagorder,m1.subfieldorder from marc_subfield_table as m1 left join marc_subfield_table as m2 on m1.bibid=m2.bibid where m1.tag in (606) and m1.subfieldcode='a' and m2.subfieldcode='x' and m1.subfieldvalue=? and m2.subfieldvalue=?");
72 my $sthBIBLIOSxx = $dbh->prepare("select distinct m1.bibid,m1.tag,m1.tagorder,m1.subfieldorder from marc_subfield_table as m1 left join marc_subfield_table as m2 on m1.bibid=m2.bibid left join marc_subfield_table as m3 on m1.bibid=m3.bibid where m1.tag in (606) and m1.subfieldcode='a' and m2.subfieldcode='x' and m3.subfieldcode='x' and m1.subfieldvalue=? and m2.subfieldvalue=? and m3.subfieldvalue=?");
73 my $sthBIBLIOSxxx = $dbh->prepare("select distinct m1.bibid,m1.tag,m1.tagorder,m1.subfieldorder from marc_subfield_table as m1 left join marc_subfield_table as m2 on m1.bibid=m2.bibid left join marc_subfield_table as m3 on m1.bibid=m4.bibid left join marc_subfield_table as m4 on m1.bibid=m4.bibid where m1.tag in (606) and m1.subfieldcode='a' and m2.subfieldcode='x' and m3.subfieldcode='x' and m4.subfieldcode='x' and m1.subfieldvalue=? and m2.subfieldvalue=? and m3.subfieldvalue=? and m4.subfieldvalue=?");
74 my $sthBIBLIOSxxxx = $dbh->prepare("select distinct m1.bibid,m1.tag,m1.tagorder,m1.subfieldorder from marc_subfield_table as m1 left join marc_subfield_table as m2 on m1.bibid=m2.bibid left join marc_subfield_table as m3 on m1.bibid=m4.bibid left join marc_subfield_table as m4 on m1.bibid=m4.bibid left join marc_subfield_table as m5 on m1.bibid=m5.bibid where m1.tag in (606) and m1.subfieldcode='a' and m2.subfieldcode='x' and m3.subfieldcode='x' and m4.subfieldcode='x' and m5.subfieldcode='x' and m1.subfieldvalue=? and m2.subfieldvalue=? and m3.subfieldvalue=? and m4.subfieldvalue=? and m5.subfieldvalue=?");
75
76 # loop through each biblio
77 while (my ($bibid) = $sth->fetchrow) {
78         my $record = MARCgetbiblio($dbh,$bibid);
79         my $timeneeded = gettimeofday - $starttime;
80         print "$i in $timeneeded s\n" unless ($i % 50);
81         foreach my $field ($record->field(995)) {
82                 $record->delete_field($field);
83         }
84         my $totdone=0;
85         my $authid;
86         # search the 606 field(s)
87         foreach my $field ($record->field("606")) {
88                 foreach my $authentry ($field->subfield("a")) {
89                         # the hashentry variable contains all $x fields and the $a in a single string. Used to differenciate
90                         # $xsomething$aelse and $asomething else
91                         my $hashentry = $authentry;
92                         foreach my $x ($field->subfield('x')) {
93                                 $hashentry.=" -- $x";
94                         }
95                         # remove é,à,$e...
96                         # all the same for mysql, but NOT for perl hashes !
97                         # without those lines, tôt is not tot and patée is not patee
98                         $hashentry =~ s/é|ê|è/e/g;
99                         $hashentry =~ s/â|à/a/g;
100                         $hashentry =~ s/î/i/g;
101                         $hashentry =~ s/ô/o/g;
102                         $hashentry =~ s/ù|û/u/g;
103                         # uppercase all, in case of typing error.
104                         $hashentry = uc($hashentry);
105                         $totdone++;
106                         if ($alreadydone{$hashentry}) {
107                                 $authid = $alreadydone{$hashentry};
108                                 print ".";
109                         } else {
110                                 print "*";
111                                 #create authority.
112                                 my $authorityRecord = MARC::Record->new();
113                                 my $newfield = MARC::Field->new(250,'','','a' => "".$authentry);
114                                 foreach my $x ($field->subfield('x')) {
115                                         $newfield->add_subfields('x' => $x);
116                                 }
117                                 foreach my $z ($field->subfield('z')) {
118                                         $newfield->add_subfields('z' => $z);
119                                 }
120                                 $authorityRecord->insert_fields_ordered($newfield);
121                                 $authid=AUTHaddauthority($dbh,$authorityRecord,'','NC');
122                                 $alreadydone{$hashentry} = $authid;
123                                 # we have the authority number, now we update all biblios that use this authority...
124                                 my @x = $field->subfield('x'); # depending on the number of $x in the subfield
125                                 if ($#x eq -1) { # no $x
126                                         $sthBIBLIOS->execute($authentry);
127                                         while (my ($bibid,$tag,$tagorder,$subfieldorder) = $sthBIBLIOS->fetchrow) {
128                                                 # check that the field does not already have a $x (if it has, it will or has been managed by another authority
129                                                 my $inbiblio = MARCgetbiblio($dbh,$bibid);
130                                                 my $isOK = 0;
131                                                 # loop in each 606 field
132                                                 foreach my $in606 ($inbiblio->field('606')) {
133                                                         my $inEntry = $in606->subfield('a');
134                                                         # and rebuild the $x -- $x -- $a string (like for $hashentry, few lines before)
135                                                         foreach my $x ($in606->subfield('x')) {
136                                                                 $inEntry.=" -- $x";
137                                                         }
138                                                         $inEntry =~ s/é|ê|è/e/g;
139                                                         $inEntry =~ s/â|à/a/g;
140                                                         $inEntry =~ s/î/i/g;
141                                                         $inEntry =~ s/ô/o/g;
142                                                         $inEntry =~ s/ù|û/u/g;
143                                                         $inEntry = uc($inEntry);
144                                                         # ok, it's confirmed that we must add the $9 subfield for this biblio, so...
145                                                         $isOK=1 if $inEntry eq $hashentry;
146                                                 }
147                                                 # ... add it !
148                                                 C4::Biblio::MARCaddsubfield($dbh,$bibid,$tag,'',$tagorder,9,$subfieldorder,$authid) if $isOK;
149                                         }
150                                 }
151                                 if ($#x eq 0) { # one $x
152                                         $sthBIBLIOSx->execute($authentry,$x[0]);
153                                         while (my ($bibid,$tag,$tagorder,$subfieldorder) = $sthBIBLIOSx->fetchrow) {
154                                                 my $inbiblio = MARCgetbiblio($dbh,$bibid);
155                                                 my $isOK = 0;
156                                                 foreach my $in606 ($inbiblio->field('606')) {
157                                                         my $inEntry = $in606->subfield('a');
158                                                         foreach my $x ($in606->subfield('x')) {
159                                                                 $inEntry.=" -- $x";
160                                                         }
161                                                         $inEntry =~ s/é|ê|è/e/g;
162                                                         $inEntry =~ s/â|à/a/g;
163                                                         $inEntry =~ s/î/i/g;
164                                                         $inEntry =~ s/ô/o/g;
165                                                         $inEntry =~ s/ù|û/u/g;
166                                                         $inEntry = uc($inEntry);
167                                                         $isOK=1 if $inEntry eq $hashentry;
168                                                 }
169                                                 C4::Biblio::MARCaddsubfield($dbh,$bibid,$tag,'',$tagorder,9,$subfieldorder,$authid) if $isOK;
170                                         }
171                                 }
172                                 if ($#x eq 1) { # two $x
173                                         $sthBIBLIOSxx->execute($authentry,$x[0],$x[1]);
174                                         while (my ($bibid,$tag,$tagorder,$subfieldorder) = $sthBIBLIOSxx->fetchrow) {
175                                                 my $inbiblio = MARCgetbiblio($dbh,$bibid);
176                                                 my $isOK = 0;
177                                                 foreach my $in606 ($inbiblio->field('606')) {
178                                                         my $inEntry = $in606->subfield('a');
179                                                         foreach my $x ($in606->subfield('x')) {
180                                                                 $inEntry.=" -- $x";
181                                                         }
182                                                         $inEntry =~ s/é|ê|è/e/g;
183                                                         $inEntry =~ s/â|à/a/g;
184                                                         $inEntry =~ s/î/i/g;
185                                                         $inEntry =~ s/ô/o/g;
186                                                         $inEntry =~ s/ù|û/u/g;
187                                                         $inEntry = uc($inEntry);
188                                                         $isOK=1 if $inEntry eq $hashentry;
189                                                 }
190                                                 C4::Biblio::MARCaddsubfield($dbh,$bibid,$tag,'',$tagorder,9,$subfieldorder,$authid) if $isOK;
191                                         }
192                                 }
193                                 if ($#x eq 2) { # 3 $x
194                                         $sthBIBLIOSxxx->execute($authentry,$x[0],$x[1],$x[2]);
195                                         while (my ($bibid,$tag,$tagorder,$subfieldorder) = $sthBIBLIOSxxx->fetchrow) {
196                                                 my $inbiblio = MARCgetbiblio($dbh,$bibid);
197                                                 my $isOK = 0;
198                                                 foreach my $in606 ($inbiblio->field('606')) {
199                                                         my $inEntry = $in606->subfield('a');
200                                                         foreach my $x ($in606->subfield('x')) {
201                                                                 $inEntry.=" -- $x";
202                                                         }
203                                                         $inEntry =~ s/é|ê|è/e/g;
204                                                         $inEntry =~ s/â|à/a/g;
205                                                         $inEntry =~ s/î/i/g;
206                                                         $inEntry =~ s/ô/o/g;
207                                                         $inEntry =~ s/ù|û/u/g;
208                                                         $inEntry = uc($inEntry);
209                                                         $isOK=1 if $inEntry eq $hashentry;
210                                                 }
211                                                 C4::Biblio::MARCaddsubfield($dbh,$bibid,$tag,'',$tagorder,9,$subfieldorder,$authid) if $isOK;
212                                         }
213                                 }
214                                 if ($#x eq 3) { # 3 $x
215                                         $sthBIBLIOSxxxx->execute($authentry,$x[0],$x[1],$x[2],$x[3]);
216                                         while (my ($bibid,$tag,$tagorder,$subfieldorder) = $sthBIBLIOSxxxx->fetchrow) {
217                                                 my $inbiblio = MARCgetbiblio($dbh,$bibid);
218                                                 my $isOK = 0;
219                                                 foreach my $in606 ($inbiblio->field('606')) {
220                                                         my $inEntry = $in606->subfield('a');
221                                                         foreach my $x ($in606->subfield('x')) {
222                                                                 $inEntry.=" -- $x";
223                                                         }
224                                                         $inEntry =~ s/é|ê|è/e/g;
225                                                         $inEntry =~ s/â|à/a/g;
226                                                         $inEntry =~ s/î/i/g;
227                                                         $inEntry =~ s/ô/o/g;
228                                                         $inEntry =~ s/ù|û/u/g;
229                                                         $inEntry = uc($inEntry);
230                                                         $isOK=1 if $inEntry eq $hashentry;
231                                                 }
232                                                 C4::Biblio::MARCaddsubfield($dbh,$bibid,$tag,'',$tagorder,9,$subfieldorder,$authid) if $isOK;
233                                         }
234                                 }
235                                 if ($#x >4) {
236                                         # too many $x, not handled, warn the developper that tries to migrate
237                                         print "warning there is ".$#x.'$x values';
238                                 }
239                         }
240                 }
241         }
242         $i++;
243 }
244 my $timeneeded = gettimeofday - $starttime;
245 print "$i entries done in $timeneeded seconds (".($i/$timeneeded)." per second)\n";