From 63474fc22fb317dc3ba5e541b6b3984e92422d6f Mon Sep 17 00:00:00 2001
From: tonnesen <tonnesen>
Date: Fri, 17 May 2002 22:03:14 +0000
Subject: [PATCH] Benchmarking files for comparing performance of two different
 proposed database schemas for MARC storage.  See IRC log at

    http://www.haz.cmsd.bc.ca/cgi-bin/kohalog.pl

for information on the two proposals.
---
 marc/benchmarks/benchmarkresults   |  6 +++
 marc/benchmarks/benchmarkschema    | 22 +++++++++
 marc/benchmarks/generaterandomdata | 73 ++++++++++++++++++++++++++++++
 marc/benchmarks/getdata-paul       | 32 +++++++++++++
 marc/benchmarks/getdata-paul-regex | 33 ++++++++++++++
 marc/benchmarks/getdata-steve      | 32 +++++++++++++
 marc/benchmarks/runbenchmark       | 19 ++++++++
 7 files changed, 217 insertions(+)
 create mode 100644 marc/benchmarks/benchmarkresults
 create mode 100644 marc/benchmarks/benchmarkschema
 create mode 100644 marc/benchmarks/generaterandomdata
 create mode 100644 marc/benchmarks/getdata-paul
 create mode 100644 marc/benchmarks/getdata-paul-regex
 create mode 100644 marc/benchmarks/getdata-steve
 create mode 100644 marc/benchmarks/runbenchmark
diff --git a/marc/benchmarks/benchmarkresults b/marc/benchmarks/benchmarkresults
new file mode 100644
index 0000000000..4191d587a0
--- /dev/null
+++ b/marc/benchmarks/benchmarkresults
@@ -0,0 +1,6 @@
+Benchmark results from Steve Tonnesen
+
+getdata-steve:		15.73	13.26	14.47	12.73	11.90
+getdata-paul:  		21.56	14.11	12.58	12.49	13.18
+getdata-paul-regex:     14.32	11.53	13.51	10.15	10.99
+
diff --git a/marc/benchmarks/benchmarkschema b/marc/benchmarks/benchmarkschema
new file mode 100644
index 0000000000..7c10aeb42d
--- /dev/null
+++ b/marc/benchmarks/benchmarkschema
@@ -0,0 +1,22 @@
+CREATE TABLE marc_0XX_tag_table (
+		bibcode bigint(20) NOT NULL default '0',
+		tagnumber char(3) NOT NULL default '',
+		tagorder tinyint(4) NOT NULL default '0',
+		tagvalue varchar(255) default NULL,
+		valuebloblink bigint(20) default NULL,
+		PRIMARY KEY  (bibcode,tagnumber,tagorder)
+		) TYPE=MyISAM;
+
+CREATE TABLE marc_2XX_subfield_table (
+		subfieldid bigint(20) unsigned NOT NULL auto_increment,
+		tagid bigint(20) NOT NULL default '0',
+		tag char(3) NOT NULL default '',
+		bibid bigint(20) NOT NULL default '0',
+		subfieldorder tinyint(4) NOT NULL default '0',
+		subfieldcode char(1) NOT NULL default '',
+		subfieldvalue varchar(255) default NULL,
+		valuebloblink bigint(20) default NULL,
+		PRIMARY KEY (subfieldid),
+		KEY (bibid,tagid,tag,subfieldcode)
+		) TYPE=MyISAM;
+
diff --git a/marc/benchmarks/generaterandomdata b/marc/benchmarks/generaterandomdata
new file mode 100644
index 0000000000..2c6455b3e1
--- /dev/null
+++ b/marc/benchmarks/generaterandomdata
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+#
+# This script generates 80,000 random records in the kohabenchmark database for
+# the purposes of comparing two different marc storage schemas.  It requires
+# the presence of a word list for populating the data.  Mine is in
+# /usr/share/dict/words.  Change that if necessary.  You'll also need to change
+# your userid and password for the dbi->connect line.
+
+use DBI;
+
+my $dbh=DBI->connect("dbi:mysql:kohabenchmark", 'youruserid', 'yourpassword');
+@subfields = ( 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n');
+
+
+open (W, "/usr/share/dict/words");
+while (<W>) {
+    chomp;
+    push @words, $_;
+}
+
+my $tagcounter=0;
+my $subfieldcounter=0;
+srand($$|time);
+for ($bibid=1; $bibid<80000; $bibid++) {
+    my $numtags=int(rand(10)+5);
+    my $localtagcounter=0;
+    for ($i=1; $i<$numtags; $i++) {
+	$localtagcounter++;
+	$tagcounter++;
+	my $tag=$i*40+100;
+	my $numsubfields=int(rand(10)+1);
+	my $subfieldsused;
+	my $localsubfieldcounter=0;
+	my $tagvalue='';
+	for ($j=1; $j<=$numsubfields; $j++) {
+	    my $code='';
+	    until ($code) { 
+		my $codepicker=int(rand($#subfields));
+		if ($subfieldsused->{$subfields[$codepicker]}==0) {
+		    $subfieldsused->{$subfields[$codepicker]}=1;
+		    $code=$subfields[$codepicker];
+		}
+	    }
+	    $subfieldcounter++;
+	    $localsubfieldcounter++;
+	    my $word=$words[int(rand($#words))];
+	    $tagvalue.="\$$code $word\0";
+	    my $sth=$dbh->prepare("insert into marc_2XX_subfield_table (subfieldid, tagid, tag, bibid, subfieldorder, subfieldcode, subfieldvalue) values (?,?,?,?,?,?,?)");
+	    my $error=1;
+	    while ($error) {
+		$sth->execute($subfieldcounter, $tagcounter, $tag, $bibid, $localsubfieldcounter, $code, $word);
+		$error=$dbh->err;
+		if ($error) {
+		    sleep 1;
+		    print "ERROR: $error\n";
+		}
+		$sth->finish;
+	    }
+	}
+	$tagvalue=~s/\0$//;
+	my $error=1;
+	my $sth=$dbh->prepare("insert into marc_0XX_tag_table (bibcode, tagnumber, tagorder, tagvalue) values (?, ?, ?, ?)");
+	while ($error) {
+	    $sth->execute($bibid, $tag, $localtagcounter, $tagvalue);
+	    $error=$dbh->err;
+	    if ($error) {
+		sleep 1;
+		print "ERROR: $error\n";
+	    }
+	    $sth->finish;
+	}
+    }
+}
diff --git a/marc/benchmarks/getdata-paul b/marc/benchmarks/getdata-paul
new file mode 100644
index 0000000000..04849250a4
--- /dev/null
+++ b/marc/benchmarks/getdata-paul
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+#
+#
+# Benchmark script for Paul's marc db schema using split() to separate subfield
+# code from subfield value
+
+use DBI;
+
+
+my $dbh=DBI->connect("dbi:mysql:kohabenchmark", 'root', 'testpass');
+
+my $count=$ARGV[0];
+my $print=$ARGV[1];
+my $bibid=$ARGV[2];
+
+
+
+for ($i=0; $i<$count; $i++) {
+    ($bibid) || ($bibid=int(rand(79998))+1);
+
+    ($print) && (print "BIBID: $bibid\n");
+    my $sth=$dbh->prepare("select tagnumber,tagvalue from marc_0XX_tag_table where bibcode=$bibid order by tagorder");
+    $sth->execute;
+    while (my ($tagnumber, $tagvalue) = $sth->fetchrow) {
+	($print) && (print "  Tag: $tagnumber\n");
+	foreach (split(/\0/, $tagvalue)) {
+	    my ($code, $value) = split(/\s/, $_, 2);
+	    ($print) && (print "    $code $value\n");
+	}
+    }
+    $bibid=0;
+}
diff --git a/marc/benchmarks/getdata-paul-regex b/marc/benchmarks/getdata-paul-regex
new file mode 100644
index 0000000000..edd33a8dbf
--- /dev/null
+++ b/marc/benchmarks/getdata-paul-regex
@@ -0,0 +1,33 @@
+#!/usr/bin/perl
+#
+#
+# Benchmark script for Paul's marc db schema using regex to separate subfield
+# code from subfield value
+
+use DBI;
+
+
+my $dbh=DBI->connect("dbi:mysql:kohabenchmark", 'root', 'testpass');
+
+my $count=$ARGV[0];
+my $print=$ARGV[1];
+my $bibid=$ARGV[2];
+
+
+
+for ($i=0; $i<$count; $i++) {
+    ($bibid) || ($bibid=int(rand(79998))+1);
+
+    ($print) && (print "BIBID: $bibid\n");
+    my $sth=$dbh->prepare("select tagnumber,tagvalue from marc_0XX_tag_table where bibcode=$bibid order by tagorder");
+    $sth->execute;
+    while (my ($tagnumber, $tagvalue) = $sth->fetchrow) {
+	($print) && (print "  Tag: $tagnumber\n");
+	foreach (split(/\0/, $tagvalue)) {
+	    m#$(.) (.*)#;
+	    my ($code, $value) = ($1, $2);
+	    ($print) && (print "    $code $value\n");
+	}
+    }
+    $bibid=0;
+}
diff --git a/marc/benchmarks/getdata-steve b/marc/benchmarks/getdata-steve
new file mode 100644
index 0000000000..7bdc3acbe7
--- /dev/null
+++ b/marc/benchmarks/getdata-steve
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+#
+#
+# Benchmark script for Steve's marc db schema
+
+
+use DBI;
+
+
+my $dbh=DBI->connect("dbi:mysql:kohabenchmark", 'root', 'testpass');
+
+my $count=$ARGV[0];
+my $print=$ARGV[1];
+my $bibid=$ARGV[2];
+
+
+for ($i=0; $i<$count; $i++) {
+    ($bibid) || ($bibid=int(rand(79998))+1);
+
+    ($print) && (print "BIBID: $bibid\n");
+    my $sth=$dbh->prepare("select tagid,tag,subfieldcode,subfieldvalue from marc_2XX_subfield_table where bibid=$bibid order by tagid,subfieldorder");
+    $sth->execute;
+    my $lasttag='';
+    while (my ($tagid,$tag,$subfieldcode,$subfieldvalue) = $sth->fetchrow) {
+	if ($tag ne $lasttag) {
+	    ($print) && (print "  Tag: $tag\n");
+	    $lasttag=$tag;
+	}
+	($print) && (print "    $subfieldcode $subfieldvalue\n");
+    }
+    $bibid=0;
+}
diff --git a/marc/benchmarks/runbenchmark b/marc/benchmarks/runbenchmark
new file mode 100644
index 0000000000..566ab06e3b
--- /dev/null
+++ b/marc/benchmarks/runbenchmark
@@ -0,0 +1,19 @@
+#!/usr/bin/perl
+#
+#
+#  This script will iterate through each benchmark 5 times, looking up 500
+#  random records each time.  Results will be printed to STDOUT.
+
+my @benchmarks=('getdata-steve', 'getdata-paul', 'getdata-paul-regex');
+
+my $iterations=5;
+
+foreach (@benchmarks) {
+    print "$_:\t";
+    for ($i=1; $i<=$iterations; $i++) {
+	my $timer=`/usr/bin/time -f "%E" perl $_ 500 2>&1`;
+	chomp $timer;
+	print "$timer\t";
+    }
+    print "\n";
+}
-- 
2.39.2