Tweak to cope with different DB file names
[debian-cd-search.git] / update-lists-db.pl
1 #!/usr/bin/perl -w
2 #
3 # update-lists-db.pl
4 #
5 # Parse the list.gz files that are created on cdimage.debian.org and
6 # convert the contents into a database for lookup tools to use.
7
8 # Copyright (c) 2011-2017 Steve McIntyre <93sam@debian.org>
9
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either version 2 of the License, or
13 # (at your option) any later version.
14 #
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the GNU General Public License
21 # along with this program; if not, write to the Free Software
22 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
23
24 use strict;
25 use Fcntl qw(:flock SEEK_END);
26 use File::stat;
27 use File::Find;
28 use File::Basename;
29 use Compress::Zlib;
30 use POSIX qw(ENOENT EROFS ENOSYS EEXIST EPERM EBUSY O_RDONLY O_RDWR O_APPEND O_CREAT);
31 use Fcntl qw(O_RDWR O_WRONLY);
32 use DB_File;
33 use DBI;
34 use Getopt::Long;
35
36 # Set defaults for these
37 my $dbdir = "/home/steve/debian/debian-cd/search-db";
38 my $treedir = "/home/steve/debian/debian-cd/jigdo-lists";
39 my $dbtype = "sqlite";
40 my $verbose = 0;
41
42 GetOptions ("dbdir=s"     => \$dbdir,      # numeric
43             "treedir=s"   => \$treedir,    # string
44             "dbtype=s"    => \$dbtype,     # string
45             "verbose=i"   => \$verbose)    # numeric
46     or die("Error in command line arguments\n");
47
48 if (!($dbtype eq "hashdb" or $dbtype eq "sqlite")) {
49     die "Invalid dbtype $dbtype\n";
50 }
51
52 my $lock = "$dbdir/.update.lock";
53 my @areas = qw(daily-builds release weekly-builds archive);
54 #my @areas = qw(daily-builds release weekly-builds);
55 #my @areas = qw(weekly-builds);
56 my $update_needed = 0;
57 my $num_list_files;
58 my $db_mtime = 0;
59 my $db_file_name;
60 my $list_file_name;
61 my $list_file;
62 my $dbh;
63 my $sth;
64 my ($start, $stop, $taken);
65
66 sub get_time()
67 {
68     my @tm;
69     my $text;
70
71     @tm = gmtime();
72     $text = sprintf("%4d-%02d-%02d %02d:%02d:%02d UTC",
73                     (1900 + $tm[5]),(1 + $tm[4]),$tm[3],$tm[2],$tm[1],$tm[0]);
74     return $text;
75 }
76
77 sub print_log {
78     my $level = shift;
79     my $msg = shift;
80     if ($level <= $verbose) {
81                 my $timestamp = get_time();
82                 print "$timestamp $msg";
83     }
84 }
85
86 sub lock {
87     my ($fh) = @_;
88     flock($fh, LOCK_EX) or die "Cannot lock lockfile - $!\n";
89     
90     # and, in case someone appended while we were waiting...
91     seek($fh, 0, SEEK_END) or die "Cannot seek - $!\n";
92 }
93
94 sub unlock {
95     my ($fh) = @_;
96     flock($fh, LOCK_UN) or die "Cannot unlock lockfile - $!\n";
97 }
98
99 sub file_mtime {
100     my ($file) = shift;
101     my $sb = lstat($file);
102     if (! -e $file) {
103                 print "ENOENT $file!\n";
104     }
105     return $sb->mtime;
106 }
107
108 sub check_newer {
109     my ($filename);
110     $filename = $File::Find::name;
111     if ($filename =~ m/\.list\.gz$/) {
112                 my $mtime = file_mtime("/$treedir/$filename");
113                 print_log(4, "  check_newer: found $filename\n");
114                 print LISTS "$filename\n";
115                 if ($mtime > $db_mtime) {
116                         $update_needed = 1;
117                 }
118                 $num_list_files++;
119                 if (!($num_list_files % 1000)) {
120                         print_log(3, "  check_newer: found $num_list_files list files\n");
121                 }
122     }
123 }
124
125 chdir "$treedir";
126
127 open(my $lockfile, ">>", "$lock") or die "Can't open lockfile: $!";
128 print_log(1, "waiting on lock for $lock\n");
129 lock($lockfile);
130 print_log(1, "lock acquired for $lock\n");
131
132 foreach my $area (@areas) {
133
134         if ($dbtype =~ "hash") {
135                 $update_needed = 0;
136                 $num_list_files = 0;
137                 $db_file_name = "$dbdir/$area.db";
138                 $list_file_name = "$dbdir/$area.lists";
139
140                 print_log(1, "Working on area $area:\n");
141                 unlink "$list_file_name.new", "$list_file_name";
142                 open(LISTS, ">> $list_file_name.new") or die ("Can't open lists file $list_file_name.new for writing: $!\n");
143                 if (-f $db_file_name) {
144                         $db_mtime = file_mtime($db_file_name);
145                 } else {
146                         $db_mtime = 0;
147                 }
148                 find (\&check_newer,  "$area");
149                 close LISTS;
150                 print_log(2, "  found $num_list_files list files total, update_needed $update_needed\n");
151
152                 if ($update_needed) {
153                         my $current_list_num = 0;
154                         my $num_files = 0;
155                         my $current_file = 0;
156                         my $unique_files = 0;
157                         my %fileinfo;
158                         my %dbinfo;
159
160                         # Two passes; work in memory first, then push to the DB
161                         # file. Will this work better?
162                         undef %fileinfo;
163                         undef %dbinfo;
164
165                         $start = time();
166                         unlink "$db_file_name.new";
167                         open(LISTS, "< $list_file_name.new") or die ("Can't open lists file $list_file_name.new for reading: $!\n");
168                         while (my $listfile = <LISTS>) {
169                                 $current_list_num++;
170                                 chomp $listfile;
171                                 my $gz = gzopen($listfile, "rb") or die "Cannot open $listfile: $gzerrno\n";
172                                 my $file;
173                                 while ($gz->gzreadline($file) > 0) {
174                                         chomp $file;
175                                         $num_files++;
176                                         if (defined($fileinfo{$file})) {
177                                                 $fileinfo{$file} = "$fileinfo{$file} $listfile";
178                                         } else {
179                                                 $fileinfo{$file} = "$listfile";
180                                                 $unique_files++;
181                                         }
182                                 }
183                                 $gz->gzclose();
184                                 if (!($current_list_num % 100)) {
185                                         print_log(3, "    processing $area in memory: $current_list_num/$num_list_files list files done, $num_files files ($unique_files unique)\n");
186                                 }
187                         }
188
189                         # now push to the hashdb
190                         tie %dbinfo, 'DB_File', "$db_file_name.new";
191                         foreach my $file (keys %fileinfo) {
192                                 $dbinfo{$file} = $fileinfo{$file};          
193                                 $current_file++;
194                                 if (!($current_file % 10000)) {
195                                         print_log(3, "    storing $area to hashdb: $current_file/$unique_files files added\n");
196                                 }
197                         }
198                         untie %dbinfo;
199                         $stop = time();
200                         $taken = $stop - $start;
201                         
202                         rename("$db_file_name.new", "$db_file_name");
203                         print_log(2, "  $db_file_name created in $taken sec: $num_list_files list files, $num_files files referenced\n");
204                 }
205         } elsif ($dbtype =~ "sqlite") {
206                 $update_needed = 0;
207                 $num_list_files = 0;
208                 $db_file_name = "$dbdir/$area.db.sqlite";
209                 $list_file_name = "$dbdir/$area.lists";
210
211                 print_log(1, "Working on area $area:\n");
212                 unlink "$list_file_name.new", "$list_file_name";
213                 open(LISTS, ">> $list_file_name.new") or die ("Can't open lists file $list_file_name.new for writing: $!\n");
214                 if (-f $db_file_name) {
215                         $db_mtime = file_mtime($db_file_name);
216                 } else {
217                         $db_mtime = 0;
218                 }
219                 find (\&check_newer,  "$area");
220                 close LISTS;
221                 print_log(2, "  found $num_list_files list files total, update_needed $update_needed\n");
222
223                 if ($update_needed) {
224                         my $current_list_num = 0;
225                         my $num_files = 0;
226                         my $current_file = 0;
227
228                         unlink "$db_file_name.new";
229                         $start = time();
230                         $dbh = DBI->connect("dbi:SQLite:dbname=$db_file_name.new","","");
231
232                         # Maximise performance - we're doing a bulk insert for a cache...
233                         $dbh->do("PRAGMA synchronous = OFF");
234                         $dbh->do("PRAGMA journal_mode = MEMORY");
235                         $dbh->do("PRAGMA locking_mode = EXCLUSIVE");
236                         $sth = $dbh->prepare("CREATE TABLE entries (filename VARCHAR(512), jigdo VARCHAR(512));");
237                         $sth->execute();
238
239                         $dbh->do("BEGIN TRANSACTION");
240                         open(LISTS, "< $list_file_name.new") or die ("Can't open lists file $list_file_name.new for reading: $!\n");
241                         $sth = $dbh->prepare("INSERT INTO entries VALUES (?, ?)");
242                         while (my $listfile = <LISTS>) {
243                                 $current_list_num++;
244                                 chomp $listfile;
245                                 my $gz = gzopen($listfile, "rb") or die "Cannot open $listfile: $gzerrno\n";
246                                 my $file;
247                                 while ($gz->gzreadline($file) > 0) {
248                                         chomp $file;
249                                         $sth->execute($file, $listfile);
250                                         $num_files++;
251                                 }
252                                 $gz->gzclose();
253                                 if (!($current_list_num % 100)) {
254                                         print_log(3, "    processing $area into sqlite: $current_list_num/$num_list_files list files done, $num_files files\n");
255                                 }
256                         }
257                         if ($current_list_num % 100) {
258                                 print_log(3, "    finished processing $area into sqlite: $current_list_num/$num_list_files list files done, $num_files files; creating index now\n");
259                         }
260                         $dbh->do("CREATE INDEX fn_index ON entries (filename);");
261                         $dbh->do("END TRANSACTION");
262                         $dbh->disconnect();
263                         $stop = time();
264                         $taken = $stop - $start;
265                         rename("$db_file_name.new", "$db_file_name");
266                         print_log(2, "  $db_file_name created in $taken sec: $num_list_files list files, $num_files files referenced\n");
267                 }
268         }
269 }
270 rename("$list_file_name.new", "$list_file_name");
271 print_log(1, "dropping lock for $lock\n");
272 unlock($lockfile);
273 close($lockfile);