Add support for sqlite database instead of a hashdb file
authorSteve McIntyre <steve@einval.com>
Fri, 24 Feb 2017 14:45:32 +0000 (14:45 +0000)
committerSteve McIntyre <steve@einval.com>
Fri, 24 Feb 2017 14:46:24 +0000 (14:46 +0000)
It's massively quicker to run queries against
For now, support both

find_file.cgi
find_file_db.pl
update-lists-db.pl

index 7f1f908..2393e8c 100755 (executable)
@@ -27,6 +27,8 @@ use threads;
 use DB_File;
 use CGI;
 use ConfigReader::Simple;
+use DBI;
+use DBD::SQLite;
 
 my %conf;
 my $cdimage_url = "http://cdimage.debian.org/cdimage/";
@@ -37,7 +39,7 @@ my %fileinfo;
 my %imageinfo;
 my $query_term;
 my $query_type;
-my @results;
+my %results;
 my @chosen_areas;
 my $l = "";
 my $max_count = 1000;
@@ -59,6 +61,7 @@ sub set_default_config () {
     $conf{'dbdir'} = "/home/steve/debian/debian-cd/search/search-db";
     $conf{'htmldir'} = "/home/steve/debian/debian-cd/html";
     $conf{'debug'} = 0;
+    $conf{'dbtype'} = "sqlite";
 }
 
 # If we can find an appropriately-name config file, read it and
@@ -335,6 +338,7 @@ if (@chosen_areas &&
 }
 
 my $count = 0;
+my $count_images = 0;
 my $re_search;
 my $using_glob = "";
 if ($query_type eq "simple") {
@@ -355,35 +359,80 @@ print_html_header("$title_base results");
 # Now start the keepalive thread to print something every few seconds
 my $thr = threads->create(\&keepalive_thread);
 
-foreach my $area (@chosen_areas) {
-    my $db_file_name = "$conf{'dbdir'}/$area.db";
-    $l .= "Looking in area $area, file $db_file_name<br>\n";
-    dbmopen(%fileinfo, "$db_file_name", 0000) ||
-        log_error(500, "Failed to open db file: $!\n");
-
-    if ($query_term =~ /[\*\?]/ || $query_type eq "simple") {
-        $using_glob = "(using globs)";
-        # Will need to search through all the keys to allow for glob
-        foreach my $file (keys %fileinfo) {
-            if ($file =~ $re_search) {
-                $count++;
-                push(@results, "$file $fileinfo{$file}");
-                if ($count >= $max_count) {
-                    last;
-                }
-            }
-        }
-    } else {
-        # We've been given an exact name - do the exact key lookup \o/
-        if (defined($fileinfo{$query_term})) {
-            push (@results, "$query_term $fileinfo{$query_term}");
-            $count++;
-        }
+if ($conf{'dbtype'} eq "hashdb") {
+    foreach my $area (@chosen_areas) {
+       my $db_file_name = "$conf{'dbdir'}/$area.db";
+       $l .= "Looking in area $area, file $db_file_name<br>\n";
+       dbmopen(%fileinfo, "$db_file_name", 0000) ||
+           log_error(500, "Failed to open db file: $!\n");
+
+       if ($query_term =~ /[\*\?]/ || $query_type eq "simple") {
+           $using_glob = "(using globs)";
+           # Will need to search through all the keys to allow for glob
+           foreach my $file (keys %fileinfo) {
+               if ($file =~ $re_search) {
+                   $count++;
+                   $count_images += scalar (split / /, $fileinfo{$file});
+                   $results{$file} = $fileinfo{$file};
+                   if ($count >= $max_count) {
+                       last;
+                   }
+               }
+           }
+       } else {
+           # We've been given an exact name - do the exact key lookup \o/
+           if (defined($fileinfo{$query_term})) {
+               $results{$query_term} = $fileinfo{$query_term};
+               $count_images += scalar (split / /, $fileinfo{$query_term});
+               $count++;
+           }
+       }
+       if ($count >= $max_count) {
+           last;
+       }
+       dbmclose %fileinfo;
     }
-    if ($count >= $max_count) {
-        last;
+} else {
+    foreach my $area (@chosen_areas) {
+       my $sth;
+       my @db_results;
+       my $db_file_name = "$conf{'dbdir'}/$area.db.sqlite";
+       my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file_name","","", {
+           sqlite_open_flags => DBD::SQLite::OPEN_READONLY,
+                              }) or log_error (500, "Failed to open DB file $db_file_name: $!\n");
+       $dbh->do("PRAGMA synchronous = OFF");
+
+       if ($query_term =~ /[\*\?]/ || $query_type eq "simple") {
+           $using_glob = "(using globs)";
+
+           # Will need to use sql LIKE and SQL wildcards
+           my $sql_term = "*" . $query_term . "*";
+           $sql_term =~ s,\*,\%,g;
+           $sql_term =~ s,\?,\_,g;
+           $sth = $dbh->prepare("SELECT * FROM entries WHERE filename LIKE ? ORDER BY filename ASC, jigdo ASC");
+           $sth->execute($sql_term);
+       } else {
+           # We've been given an exact name - do the exact lookup
+           $sth = $dbh->prepare("SELECT * FROM entries WHERE filename=? ORDER BY filename ASC");
+           $sth->execute($query_term);
+       }
+       while (@db_results = $sth->fetchrow_array) {
+           my $file = $db_results[0];
+           my $image = $db_results[1];
+           if (defined($results{$file})) {
+               $results{$file} = "$results{$file} $image";
+               $count_images++;
+           } else {
+               $results{$file} = "$image";
+               $count++;
+               $count_images++;
+           }
+           if ($count >= $max_count) {
+               last;
+           }
+       }
+       $dbh->disconnect();
     }
-    dbmclose %fileinfo;
 }
 
 # Kill the keepalive thread
@@ -393,7 +442,7 @@ my $end_time = time();
 my $time_taken = $end_time - $start_time;
 
 print
-    $q->start_html("$title_base: $count results"),
+    $q->start_html("$title_base: $count results from $count_images images"),
     $q->h1($title_base), "\n";
     print_config_if_debug();
 if ($conf{'debug'}) {
@@ -409,12 +458,12 @@ print $q->p("<a href=\"" . $q->url . "\">Search again.</a>");
 if ($count >= $max_count) {
     print $q->p("More than $max_count results for $query_type search \"$query_term\". Showing the first $count only\n");
 } else {   
-    print $q->p("$count result(s) for $query_type search \"$query_term\".\n");
+    print $q->p("$count result(s), $count_images image(s) for \"$query_term\":\n");
 }
 if ($count > 0) {
     print "<ol>\n";
-    foreach my $result (sort (@results)) {
-        my($found, @list) = split(' ', $result);
+    foreach my $found (sort (keys %results)) {
+       my @list = split(' ', $results{$found});
         print "<li> $found appears in:\n";
         print "<ul>\n";
         foreach my $image (sort(@list)) {
index 3e0cc8d..88112bd 100755 (executable)
@@ -25,14 +25,17 @@ use strict;
 use warnings;
 use threads;
 use DB_File;
+use DBI;
+use DBD::SQLite;
 
 my $dbdir = "/home/steve/debian/debian-cd/search-db";
 my @AREAS;
 my %num_files;
 my %fileinfo;
 my %imageinfo;
-my @results;
+my %results;
 my $max_count = 1000;
+my $dbtype = "sqlite";
 
 # Borrowed from Ikiwiki.pm
 sub glob2re ($) {
@@ -45,7 +48,11 @@ sub glob2re ($) {
 chdir($dbdir) || die "Failed to cd to $dbdir: $!\n";
 opendir(my $dh, ".") || die "Failed to open $dbdir: $!\n";
 while (defined($_ = readdir($dh))) {
-    m/(.*)\.db$/ and push (@AREAS, $1);
+    if ($dbtype eq "hash") {
+       m/(.*)\.db$/ and push (@AREAS, $1);
+    } else {
+       m/(.*)\.db\.sqlite$/ and push (@AREAS, $1);
+    }
 }
 closedir($dh);
 
@@ -64,60 +71,105 @@ sub keepalive_thread () {
 }
 
 my $count = 0;
+my $count_images = 0;
 my $re_search = glob2re($query_term);
 my $start_time = time();
 
 # Now start the keepalive thread to print something every few seconds
-my $thr = threads->create(\&keepalive_thread);
-
-foreach my $area (@AREAS) {
-    print "Looking in area $area\n";
-    my $db_file_name = "$dbdir/$area.db";
-    dbmopen(%fileinfo, "$db_file_name", 0000) || 
-        die "Failed to open db file: $!\n";
-
-    if ($query_term =~ /[\*\?]/) {
-        # Will need to search through all the keys to allow for glob
-        foreach my $file (keys %fileinfo) {
-            if ($file =~ $re_search) {
-                $count++;
-                push(@results, "$file $fileinfo{$file}");
-                if ($count >= $max_count) {
-                    last;
-                }
-            }
-        }
-    } else {
-        # We've been given an exact name - do the exact key lookup \o/
-        if (defined($fileinfo{$query_term})) {
-            push (@results, "$query_term $fileinfo{$query_term}");
-            $count++;
-        }
+#my $thr = threads->create(\&keepalive_thread);
+
+if ($dbtype eq "hash") {
+    foreach my $area (@AREAS) {
+       print "Looking in area $area\n";
+       my $db_file_name = "$dbdir/$area.db";
+       dbmopen(%fileinfo, "$db_file_name", 0000) || 
+           die "Failed to open db file: $!\n";
+
+       if ($query_term =~ /[\*\?]/) {
+           # Will need to search through all the keys to allow for glob
+           foreach my $file (keys %fileinfo) {
+               if ($file =~ $re_search) {
+                   $count++;
+                   $count_images += scalar (split / /, $fileinfo{$file});
+                   $results{$file} = $fileinfo{$file};
+                   if ($count >= $max_count) {
+                       last;
+                   }
+               }
+           }
+       } else {
+           # We've been given an exact name - do the exact key lookup \o/
+           if (defined($fileinfo{$query_term})) {
+               $results{$query_term} = $fileinfo{$query_term};
+               $count_images += scalar (split / /, $fileinfo{$query_term});
+               $count++;
+           }
+       }
+       if ($count >= $max_count) {
+           last;
+       }
+       dbmclose %fileinfo;
     }
-    if ($count >= $max_count) {
-        last;
+} else {
+    foreach my $area (@AREAS) {
+       print "Looking in area $area\n";
+       my $sth;
+       my @db_results;
+       my $db_file_name = "$dbdir/$area.db.sqlite";
+       my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file_name","","", {
+           sqlite_open_flags => DBD::SQLite::OPEN_READONLY,
+                              }) or die "Failed to open DB file $db_file_name: $!\n";
+       $dbh->do("PRAGMA synchronous = OFF");
+
+       if ($query_term =~ /[\*\?]/) {
+           # Will need to use sql LIKE and SQL wildcards
+           my $sql_term = $query_term;
+           $sql_term =~ s,\*,\%,g;
+           $sql_term =~ s,\?,\_,g;
+           $sth = $dbh->prepare("SELECT * FROM entries WHERE filename LIKE ? ORDER BY filename ASC, jigdo ASC");
+           $sth->execute($sql_term);
+       } else {
+           # We've been given an exact name - do the exact lookup
+           $sth = $dbh->prepare("SELECT * FROM entries WHERE filename=? ORDER BY filename ASC");
+           $sth->execute($query_term);
+       }
+       while (@db_results = $sth->fetchrow_array) {
+           my $file = $db_results[0];
+           my $image = $db_results[1];
+           if (defined($results{$file})) {
+               $results{$file} = "$results{$file} $image";
+               $count_images++;
+           } else {
+               $results{$file} = "$image";
+               $count++;
+               $count_images++;
+           }
+           #                   print "@results\n";
+           if ($count >= $max_count) {
+               last;
+           }
+       }
+       $dbh->disconnect();
     }
-    dbmclose %fileinfo;
 }
 # Kill the keepalive thread
-$thr->kill('KILL')->join();
+#$thr->kill('KILL')->join();
 
 my $end_time = time();
 my $time_taken = $end_time - $start_time;
 
+print "Using $dbtype database, took $time_taken seconds:\n";
 if ($count >= $max_count) {
     print "More than $max_count results for \"$query_term\", showing the first $count only\n";
 } else {   
-    print "$count result(s) for \"$query_term\":\n";
+    print "$count result(s), $count_images image(s) for \"$query_term\":\n";
 }
 
-foreach my $result (sort (@results)) {
-    my($found, @list) = split(' ', $result);
+foreach my $found (sort (keys %results)) {
+    my @list = split(' ', $results{$found});
     print "  $found:\n";
     foreach my $image (sort(@list)) {
        print "    $image\n";
     }
 }
 
-print "time taken: $time_taken sec\n";
-
index 7249cbf..dbe225f 100755 (executable)
@@ -30,10 +30,12 @@ use Compress::Zlib;
 use POSIX qw(ENOENT EROFS ENOSYS EEXIST EPERM EBUSY O_RDONLY O_RDWR O_APPEND O_CREAT);
 use Fcntl qw(O_RDWR O_WRONLY);
 use DB_File;
+use DBI;
 
-my $dbdir = "/home/debian-cd/search-db";
+my $dbdir = "/home/steve/debian/debian-cd/search-db";
 my $lock = "$dbdir/.update.lock";
 my @areas = qw(daily-builds release weekly-builds archive);
+#my @areas = qw(daily-builds release weekly-builds);
 #my @areas = qw(weekly-builds);
 my $update_needed = 0;
 my $num_list_files;
@@ -42,22 +44,38 @@ my $db_file_name;
 my $list_file_name;
 my $list_file;
 my $verbose = 0;
+my $dbh;
+my $sth;
+my ($start, $stop, $taken);
+my $dbtype = "sqlite";
 
 while (1) {
     my $arg = shift;
     if (!defined($arg)) {
-       last;
+               last;
     }
     if ("-v" eq $arg) {
-       $verbose++;
+               $verbose++;
     }
 }
 
+sub get_time()
+{
+    my @tm;
+    my $text;
+
+    @tm = gmtime();
+    $text = sprintf("%4d-%02d-%02d %02d:%02d:%02d UTC",
+                    (1900 + $tm[5]),(1 + $tm[4]),$tm[3],$tm[2],$tm[1],$tm[0]);
+    return $text;
+}
+
 sub print_log {
     my $level = shift;
     my $msg = shift;
     if ($level <= $verbose) {
-       print $msg;
+               my $timestamp = get_time();
+               print "$timestamp $msg";
     }
 }
 
@@ -78,7 +96,7 @@ sub file_mtime {
     my ($file) = shift;
     my $sb = lstat($file);
     if (! -e $file) {
-       print "ENOENT $file!\n";
+               print "ENOENT $file!\n";
     }
     return $sb->mtime;
 }
@@ -87,20 +105,20 @@ sub check_newer {
     my ($filename);
     $filename = $File::Find::name;
     if ($filename =~ m/\.list\.gz$/) {
-       my $mtime = file_mtime("/mnt/nfs-cdimage/$filename");
-       print_log(4, "  check_newer: found $filename\n");
-       print LISTS "$filename\n";
-       if ($mtime > $db_mtime) {
-           $update_needed = 1;
-       }
-       $num_list_files++;
-       if (!($num_list_files % 1000)) {
-           print_log(3, "  check_newer: found $num_list_files list files\n");
-       }
+               my $mtime = file_mtime("/home/steve/debian/debian-cd/jigdo-lists/$filename");
+               print_log(4, "  check_newer: found $filename\n");
+               print LISTS "$filename\n";
+               if ($mtime > $db_mtime) {
+                       $update_needed = 1;
+               }
+               $num_list_files++;
+               if (!($num_list_files % 1000)) {
+                       print_log(3, "  check_newer: found $num_list_files list files\n");
+               }
     }
 }
 
-chdir "/mnt/nfs-cdimage";
+chdir "/home/steve/debian/debian-cd/jigdo-lists";
 
 open(my $lockfile, ">>", "$lock") or die "Can't open lockfile: $!";
 print_log(1, "waiting on lock for $lock\n");
@@ -108,76 +126,143 @@ lock($lockfile);
 print_log(1, "lock acquired for $lock\n");
 
 foreach my $area (@areas) {
-    $update_needed = 0;
-    $num_list_files = 0;
-    $db_file_name = "$dbdir/$area.db";
-    $list_file_name = "$dbdir/$area.lists";
-
-    print_log(1, "Working on area $area:\n");
-    unlink "$list_file_name.new";
-    open(LISTS, ">> $list_file_name.new") or die ("Can't open lists file $list_file_name.new for writing: $!\n");
-    if (-f $db_file_name) {
-       $db_mtime = file_mtime($db_file_name);
-    } else {
-       $db_mtime = 0;
-    }
-    find (\&check_newer,  "$area");
-    close LISTS;
-    print_log(2, "  found $num_list_files list files total, update_needed $update_needed\n");
-
-    if ($update_needed) {
-       my $current_list_num = 0;
-       my $num_files = 0;
-       my $current_file = 0;
-       my $unique_files = 0;
-       my %fileinfo;
-       my %dbinfo;
-
-       # Two passes; work in memory first, then push to the DB
-       # file. Will this work better?
-       undef %fileinfo;
-       undef %dbinfo;
-
-       unlink "$db_file_name.new";
-
-       open(LISTS, "< $list_file_name.new") or die ("Can't open lists file $list_file_name.new for reading: $!\n");
-       while (my $listfile = <LISTS>) {
-           $current_list_num++;
-           chomp $listfile;
-           my $gz = gzopen($listfile, "rb") or die "Cannot open $listfile: $gzerrno\n";
-           my $file;
-           while ($gz->gzreadline($file) > 0) {
-               chomp $file;
-               $num_files++;
-               if (defined($fileinfo{$file})) {
-                   $fileinfo{$file} = "$fileinfo{$file} $listfile";
-               } else {
-                   $fileinfo{$file} = "$listfile";
-                   $unique_files++;
-               }
-           }
-           $gz->gzclose();
-           if (!($current_list_num % 100)) {
-               print_log(3, "    processing $area in memory: $current_list_num/$num_list_files list files done, $num_files files ($unique_files unique)\n");
-           }
-       }
 
-       # now push to the db
-       tie %dbinfo, 'DB_File', "$db_file_name.new";
-       foreach my $file (keys %fileinfo) {
-           $dbinfo{$file} = $fileinfo{$file};      
-           $current_file++;
-           if (!($current_file % 10000)) {
-               print_log(3, "    storing $area to db: $current_file/$unique_files files added\n");
-           }
+       if ($dbtype =~ "hash") {
+               $update_needed = 0;
+               $num_list_files = 0;
+               $db_file_name = "$dbdir/$area.db";
+               $list_file_name = "$dbdir/$area.lists";
+
+               print_log(1, "Working on area $area:\n");
+               unlink "$list_file_name.new", "$list_file_name";
+               open(LISTS, ">> $list_file_name.new") or die ("Can't open lists file $list_file_name.new for writing: $!\n");
+               if (-f $db_file_name) {
+                       $db_mtime = file_mtime($db_file_name);
+               } else {
+                       $db_mtime = 0;
+               }
+               find (\&check_newer,  "$area");
+               close LISTS;
+               print_log(2, "  found $num_list_files list files total, update_needed $update_needed\n");
+
+               if ($update_needed) {
+                       my $current_list_num = 0;
+                       my $num_files = 0;
+                       my $current_file = 0;
+                       my $unique_files = 0;
+                       my %fileinfo;
+                       my %dbinfo;
+
+                       # Two passes; work in memory first, then push to the DB
+                       # file. Will this work better?
+                       undef %fileinfo;
+                       undef %dbinfo;
+
+                       $start = time();
+                       unlink "$db_file_name.new";
+                       open(LISTS, "< $list_file_name.new") or die ("Can't open lists file $list_file_name.new for reading: $!\n");
+                       while (my $listfile = <LISTS>) {
+                               $current_list_num++;
+                               chomp $listfile;
+                               my $gz = gzopen($listfile, "rb") or die "Cannot open $listfile: $gzerrno\n";
+                               my $file;
+                               while ($gz->gzreadline($file) > 0) {
+                                       chomp $file;
+                                       $num_files++;
+                                       if (defined($fileinfo{$file})) {
+                                               $fileinfo{$file} = "$fileinfo{$file} $listfile";
+                                       } else {
+                                               $fileinfo{$file} = "$listfile";
+                                               $unique_files++;
+                                       }
+                               }
+                               $gz->gzclose();
+                               if (!($current_list_num % 100)) {
+                                       print_log(3, "    processing $area in memory: $current_list_num/$num_list_files list files done, $num_files files ($unique_files unique)\n");
+                               }
+                       }
+
+                       # now push to the hashdb
+                       tie %dbinfo, 'DB_File', "$db_file_name.new";
+                       foreach my $file (keys %fileinfo) {
+                               $dbinfo{$file} = $fileinfo{$file};          
+                               $current_file++;
+                               if (!($current_file % 10000)) {
+                                       print_log(3, "    storing $area to hashdb: $current_file/$unique_files files added\n");
+                               }
+                       }
+                       untie %dbinfo;
+                       $stop = time();
+                       $taken = $stop - $start;
+                       
+                       rename("$db_file_name.new", "$db_file_name");
+                       print_log(2, "  $db_file_name created in $taken sec: $num_list_files list files, $num_files files referenced\n");
+               }
+       } elsif ($dbtype =~ "sqlite") {
+               $update_needed = 0;
+               $num_list_files = 0;
+               $db_file_name = "$dbdir/$area.db.sqlite";
+               $list_file_name = "$dbdir/$area.lists";
+
+               print_log(1, "Working on area $area:\n");
+               unlink "$list_file_name.new", "$list_file_name";
+               open(LISTS, ">> $list_file_name.new") or die ("Can't open lists file $list_file_name.new for writing: $!\n");
+               if (-f $db_file_name) {
+                       $db_mtime = file_mtime($db_file_name);
+               } else {
+                       $db_mtime = 0;
+               }
+               find (\&check_newer,  "$area");
+               close LISTS;
+               print_log(2, "  found $num_list_files list files total, update_needed $update_needed\n");
+
+               if ($update_needed) {
+                       my $current_list_num = 0;
+                       my $num_files = 0;
+                       my $current_file = 0;
+
+                       $start = time();
+                       $dbh = DBI->connect("dbi:SQLite:dbname=$db_file_name.new","","");
+
+                       # Maximise performance - we're doing a bulk insert for a cache...
+                       $dbh->do("PRAGMA synchronous = OFF");
+                       $dbh->do("PRAGMA journal_mode = MEMORY");
+                       $dbh->do("PRAGMA locking_mode = EXCLUSIVE");
+                       $sth = $dbh->prepare("CREATE TABLE entries (filename VARCHAR(512), jigdo VARCHAR(512));");
+                       $sth->execute();
+
+                       $dbh->do("BEGIN TRANSACTION");
+                       open(LISTS, "< $list_file_name.new") or die ("Can't open lists file $list_file_name.new for reading: $!\n");
+                       $sth = $dbh->prepare("INSERT INTO entries VALUES (?, ?)");
+                       while (my $listfile = <LISTS>) {
+                               $current_list_num++;
+                               chomp $listfile;
+                               my $gz = gzopen($listfile, "rb") or die "Cannot open $listfile: $gzerrno\n";
+                               my $file;
+                               while ($gz->gzreadline($file) > 0) {
+                                       chomp $file;
+                                       $sth->execute($file, $listfile);
+                                       $num_files++;
+                               }
+                               $gz->gzclose();
+                               if (!($current_list_num % 100)) {
+                                       print_log(3, "    processing $area into sqlite: $current_list_num/$num_list_files list files done, $num_files files\n");
+                               }
+                       }
+                       if ($current_list_num % 100) {
+                               print_log(3, "    finished processing $area into sqlite: $current_list_num/$num_list_files list files done, $num_files files; creating index now\n");
+                       }
+                       $dbh->do("CREATE INDEX fn_index ON entries (filename);");
+                       $dbh->do("END TRANSACTION");
+                       $dbh->disconnect();
+                       $stop = time();
+                       $taken = $stop - $start;
+                       rename("$db_file_name.new", "$db_file_name");
+                       print_log(2, "  $db_file_name created in $taken sec: $num_list_files list files, $num_files files referenced\n");
+               }
        }
-       untie %dbinfo;
-       
-       rename("$db_file_name.new", "$db_file_name");
-       print_log(2, "  $db_file_name created: $num_list_files list files, $num_files files referenced\n");
-    }
-    rename("$list_file_name.new", "$list_file_name");
 }
+rename("$list_file_name.new", "$list_file_name");
 print_log(1, "dropping lock for $lock\n");
 unlock($lockfile);
 close($lockfile);