2 ## This Script creates a Koha suggest and spellcheck database
3 ## for those features as visible on LibLime's opac: opac.liblime.com
4 ## It also contains the needed specs for creating a table of
5 ## queries for statistical purposes as well as a method of
6 ## returning popular searches via the suggest and spellcheck.
7 ## The code for running suggest and spellcheck can be found
8 ## either in Koha 2.4 CVS (HEAD as of this writing) or at
9 ## LibLime's website in the downlaods
10 ## section: http://liblime.com/c/downloads.html
12 ##Author: Joshua Ferraro jmf at liblime dot com
14 ## TODO: add suggest features, merge the two of them?
15 ## There are a few configurable variables.
17 ## CONFIGURABLE VARIABLES ####################
19 # These are the tags that have meaningful data
20 # for the databases I've worked with (MARC21 only)
21 # you may need to change them depending on your data
23 #Tag documentation from http://lcweb.loc.gov/marc/bibliographic/ecbdhome.html
24 "020a", # INTERNATIONAL STANDARD BOOK NUMBER
25 #"022a", # INTERNATIONAL STANDARD SERIAL NUMBER
26 "100a", # MAIN ENTRY--PERSONAL NAME
27 "110a", # MAIN ENTRY--CORPORATE NAME
28 #"110b", # Subordinate unit
29 #"110c", # Location of meeting
30 #"111a", # MAIN ENTRY--MEETING NAME
31 #"111c", # Location of meeting
32 "130a", # MAIN ENTRY--UNIFORM TITLE
33 "240a", # UNIFORM TITLE
34 "245a", # TITLE STATEMENT
35 "245b", # Remainder of title
36 "245c", # Statement of responsibility, etc.
37 "245p", # Name of part/section of a work
38 "246a", # VARYING FORM OF TITLE
39 "246b", # Remainder of title
40 #"260b", # PUBLICATION, DISTRIBUTION, ETC. (IMPRINT)
41 "440a", # SERIES STATEMENT/ADDED ENTRY--TITLE
42 "440p", # Name of part/section of a work
43 #"500a", # GENERAL NOTE
44 "505t", # FORMATTED CONTENTS NOTE (t is Title)
45 "511a", # PARTICIPANT OR PERFORMER NOTE
46 #"520a", # SUMMARY, ETC.
47 "534a", # ORIGINAL VERSION NOTE
48 #"534k", # Key title of original
49 #"534t", # Title statement of original
50 #"586a", # AWARDS NOTE
51 "600a", # SUBJECT ADDED ENTRY--PERSONAL NAME
52 "610a", # SUBJECT ADDED ENTRY--CORPORATE NAME
53 "611a", # SUBJECT ADDED ENTRY--MEETING NAME
54 "630a", # SUBJECT ADDED ENTRY--UNIFORM TITLE
55 "650a", # SUBJECT ADDED ENTRY--TOPICAL TERM
56 "651a", # SUBJECT ADDED ENTRY--GEOGRAPHIC NAME
57 "700a", # ADDED ENTRY--PERSONAL NAME
58 "710a", # ADDED ENTRY--CORPORATE NAME
59 #"711a", # ADDED ENTRY--MEETING NAME
60 #"720a", # ADDED ENTRY--UNCONTROLLED NAME
61 "730a", # ADDED ENTRY--UNIFORM TITLE
62 "740a", # ADDED ENTRY--UNCONTROLLED RELATED/ANALYTICAL TITLE
63 #"752a", # ADDED ENTRY--HIERARCHICAL PLACE NAME
64 "800a", # SERIES ADDED ENTRY--PERSONAL NAME
65 #"810a", # SERIES ADDED ENTRY--CORPORATE NAME
66 #"811a", # SERIES ADDED ENTRY--MEETING NAME
67 "830a", # SERIES ADDED ENTRY--UNIFORM TITLE
68 #"942k" # Holdings Branch ?? Unique to NPL??
70 ## Leave this next bit alone
73 # find Koha's Perl modules
74 # test carefully before changing this
76 eval { require "$FindBin::Bin/../kohalib.pl" };
80 # SUGGEST DATABASE INFO
81 # You'll need to change this if you want to keep your 'suggest' database
82 # separate from your Koha database -- simply comment out the next line
83 # and uncomment the one after it, adding your site info (check out GRANT
84 # syntax in the mysql manual if you're unsure how enable authentication)
86 my dbh2 = C4::Context->dbh;
88 #my $dbh2=DBI->connect("DBI:mysql:<add your database name here>:localhost","<add your mysql user here>","<add your password here>");
89 ########################################################################
90 ## End of most common configurable variables: in most cases you won't need
91 ## edit any further ... of course feel free to indulge yourself ;-)
92 ########################################################################
93 my $dbh=C4::Context->dbh;
96 # Check for existance of suggest database and add if it doesn't.
97 print "Step 1 of 5: Checking to make sure suggest tables exist\n";
98 my $check_tables_query = "select distinct resultcount from ?";
99 my @tables = ("notdistinctspchk", "notdistinctsugg", "spellcheck", "suggestions");
100 my %tables = ( notdistinctspchk => "( display varchar(40) not null default,
101 suggestion varchar(40) not null default,
102 foreach my $table (@tables) {
103 my $sth_check=$dbh2->prepare($check_tables_query) || die "cant prepare query: $DBI::errstr";
104 my $rv = $sth_check->execute($table);
106 print "$table missing ... creating it now\n";
107 my $create_this = "CREATE TABLE \'$table\' \(
108 display varchar\(40\) NOT NULL default \'\',
109 suggestion varchar\(40\) NOT NULL default \'\',
110 resultcount varchar\(40\) NOT NULL default \'0\'
112 my $sth_create = $dbh->prepare($create_this) || die "can't prepare query: $DBI::errstr";
113 $sth_create->execute() || die "can't execute: $DBI::errstr";
114 print "$table created ...\n";
116 print "$table exists ... moving along\n";
119 print "All tables present ... moving along\n";
121 print "Step 2 of 5: Deleting old data\n";
122 my $clear_out = "DELETE FROM notdistinctspchk";
124 my $sth_clear_out=$dbh2->prepare($clear_out) || die "cant prepare query";
125 $sth_clear_out->execute();
126 print "Step 3 of 5: Creating non-distinct table from various Koha tables\n";
127 my $query_words = "SELECT DISTINCT word, COUNT(word) FROM marc_word";
128 my $query_marc_subfields = "SELECT DISTINCT subfieldvalue, COUNT(subfieldvalue) FROM marc_subfield_table";
129 my $query_titles = "SELECT DISTINCT title, COUNT(title) FROM biblio GROUP BY title";
130 my $query_authors = "SELECT DISTINCT author, COUNT(author) FROM biblio GROUP BY author";
132 my @queries = ("$query_words", "$query_marc_subfields", "$query_titles", "$query_authors");
134 foreach my $query (@queries) {
136 #we need to do some special stuff for marc_word and marc_subfield_table queries
137 if ($query eq $queries[0]) { #marc_word
138 my $listoftagsubfields;
140 foreach my $tag (@tags) {
141 $listoftagsubfields.="$tag, ";
143 $query.=" WHERE tagsubfield=\'$tag\'";
146 $query.=" OR tagsubfield=\'$tag\'";
149 $query.=" GROUP BY word";
150 print "Finished building marc_word list\n";
151 print "Adding marc_word entries with the following tagsubfields:"."$listoftagsubfields"."\n";
154 if ($query eq $queries[1]) { #marc_subfield_table
155 my $listofsubfieldstuff; #for testing
157 foreach my $tag (@tags) {
159 $justtag =~ s/\D\Z//;
160 my $subfieldcode = $&;
161 $listofsubfieldstuff.="$justtag, "."$subfieldcode, ";
163 $query.=" WHERE (tag=\'$justtag\' and subfieldcode=\'$subfieldcode\')";
166 $query.=" OR (tag=\'$justtag\' and subfieldcode=\'$subfieldcode\')";
169 $query.=" GROUP BY subfieldvalue";
170 print "Finished building marc_subfield_table list\n";
171 print "Adding marc_subfield_table entries with the following tags and subfields:"."$listofsubfieldstuff"."\n";
174 my $sth=$dbh->prepare($query) || die "cant prepare query";
177 my $insert = "INSERT INTO notdistinctspchk(suggestion,display,resultcount) VALUES(?,?,?)";
179 my $sth2=$dbh2->prepare($insert);
181 while (my ($phraseterm,$count)=$sth->fetchrow_array) {
183 #$display looks exactly like the DB
184 my $display = $phraseterm;
185 #except for a few things
187 $display =~ s/^\s+//; #remove leading whitespace
188 $display =~ s/\s+$//; #remove trailing whitespace
189 $display =~ s/(\.|\/)/ /g;
191 #suggestion is tweaked for optimal searching
192 my $suggestion = $phraseterm;
193 $suggestion =~ tr/A-Z/a-z/;
194 $suggestion =~ s/(\.|\?|\:|\!|\'|,|\-|\"|\(|\)|\[|\]|\{|\})/ /g;
195 $suggestion =~s/(\Aand-or |\Aand\/or |\Aanon |\Aan |\Aa |\Abut |\Aby |\Ade |\Ader |\Adr |\Adu|et |\Afor |\Afrom |\Ain |\Ainto |\Ait |\Amy |\Anot |\Aon |\Aor |\Aper |\Apt |\Aspp |\Ato |\Avs |\Awith |\Athe )/ /g;
196 $suggestion =~s/( and-or | and\/or | anon | an | a | but | by | de | der | dr | du|et | for | from | in | into | it | my | not | on | or | per | pt | spp | to | vs | with | the )/ /g;
198 $suggestion =~s/ / /g;
200 $suggestion =~ s/^\s+//; #remove leading whitespace
201 $suggestion =~ s/\s+$//; #remove trailing whitespace
203 if (length($suggestion)>2) {
204 $sth2->execute($suggestion,$display,$count) || die "can't execute write";
209 print $counter." more records added...\n";
214 # Now grab distincts from there and insert into our REAL database
216 print "Step 4 of 5: Deleting old distinct entries\n";
217 my $clear_distincts = "DELETE FROM spellcheck";
220 my $sth_clear_distincts=$dbh2->prepare($clear_distincts) || die "cant prepare query";
221 $sth_clear_distincts->execute();
223 print "Step 5 of 5: Creating distinct spellcheck table out of non-distinct table\n";
224 my $query_distincts = "SELECT DISTINCT suggestion, display, COUNT(display) FROM notdistinctspchk GROUP BY suggestion";
225 my $insert_distincts = "INSERT INTO spellcheck(suggestion,display,resultcount) VALUES(?,?,?)";
226 my $distinctcounter = 0;
228 my $sth=$dbh2->prepare($query_distincts) || die "cant prepare query";
230 my $sth2=$dbh2->prepare($insert_distincts) || die "cant prepare query";
231 while (my ($suggestion,$display,$count)=$sth->fetchrow_array) {
233 $sth2->execute($suggestion,$display,$count) || die "can't execute write";
237 print "Finished: total distinct items added to spellcheck: "."$distinctcounter\n";