1 /* $Id: scan.cc,v 1.11 2005/07/02 22:05:04 atterer Exp $ -*- C++ -*-
3 |_) /| Copyright (C) 2001-2002 | richard@
4 | \/¯| Richard Atterer | atterer.org
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License, version 2. See
8 the file COPYING for details.
10 Scanning of input files
23 #include <sys/types.h>
24 #include <unistd-jigdo.h>
28 #include <configfile.hh>
32 #include <serialize.hh>
33 //______________________________________________________________________
37 void JigdoCache::ProgressReporter::error(const string& message) {
38 cerr << message << endl;
40 void JigdoCache::ProgressReporter::info(const string& message) {
41 cerr << message << endl;
// Progress callback invoked while a file is being scanned; this
// default implementation deliberately does nothing.
43 void JigdoCache::ProgressReporter::scanningFile(const FilePart*, uint64) { }
// Shared no-op reporter, used when the caller does not supply one.
45 JigdoCache::ProgressReporter JigdoCache::noReport;
// Static scratch stat() buffer (filled before FilePart creation in
// addFile()). NOTE(review): shared static state - assumes
// single-threaded use; confirm before using JigdoCache from threads.
47 struct stat JigdoCache::fileInfo;
48 //______________________________________________________________________
51 /* Interpret a string of bytes (out of the file cache) like this:
53 4 blockLength (of rsync sum)
55 4 blocks (number of valid md5 blocks in this entry), curr. always >0
56 8 rsyncSum of file start (only valid if blocks > 0)
57 16 fileMD5Sum (only valid if
58 blocks == (fileSize+md5BlockLength-1)/md5BlockLength )
59 followed by n entries:
60 16 md5sum of block of size md5BlockLength
62 If stored md5BlockLength doesn't match supplied length, do nothing.
63 Otherwise, restore *this from cached data and return cached
64 blockLength (0 if not cached). The caller needs to make sure the
67 This is not a standard unserialize() member of FilePart because it
68 does not create a complete serialization - e.g. the location path
69 iter is missing. It only creates a cache entry. */
/* Restore this FilePart's checksum data from a raw cache entry (byte
   layout described in the comment above). Returns the blockLength that
   was stored in the entry, or 0 if the entry's md5BlockLength does not
   match the supplied one (entry unusable). */
71 size_t FilePart::unserializeCacheEntry(const byte* data, size_t dataSize,
72 size_t md5BlockLength){
// Entry must at least contain the fixed-size header fields
73 Assert(dataSize > PART_MD5SUM);
75 // The resize() must have been made by the caller
76 Paranoid(sums.size() == (size() + md5BlockLength - 1) / md5BlockLength);
// Header: 4-byte blockLength, then 4-byte md5BlockLength
78 size_t cachedBlockLength;
79 data = unserialize4(cachedBlockLength, data);
80 size_t cachedMd5BlockLength;
81 data = unserialize4(cachedMd5BlockLength, data);
// Entry was created with a different md5 block size - cannot reuse it
82 if (cachedMd5BlockLength != md5BlockLength) return 0;
85 data = unserialize4(blocks, data);
86 // Ignore strange-looking entries
// Sanity check: the variable-length tail must hold exactly `blocks`
// 16-byte md5sums. NOTE(review): the condition's continuation is
// elided in this listing; the debug branches below report the two
// failure cases (zero blocks / size mismatch).
87 if (blocks * serialSizeOf(md5Sum) != dataSize - PART_MD5SUM
89 if (blocks == 0) debug("ERR #blocks == 0");
90 else debug("ERR wrong entry size (%1 vs %2)",
91 blocks * 16, dataSize - PART_MD5SUM);
// Fixed-size checksums: 8-byte rolling (rsync) sum of the file start,
// then the 16-byte whole-file md5sum
94 Paranoid(serialSizeOf(rsyncSum) == 8);
95 data = unserialize(rsyncSum, data);
96 Paranoid(serialSizeOf(md5Sum) == 16);
97 // All blocks of file present?
98 if (blocks == sums.size()) {
// The whole-file md5sum is only meaningful if every block was cached
100 data = unserialize(md5Sum, data);
105 // Read md5 sums of individual chunks of file
106 vector<MD5>::iterator sum = sums.begin();
107 for (size_t i = blocks; i > 0; --i) {
108 data = unserialize(*sum, data);
// Return the blockLength recorded in the entry; it may differ from the
// caller's current blockLength, in which case the caller must re-read
// the first block (see getSumsRead()).
112 return cachedBlockLength;
115 //______________________________________________________________________
118 /** Opposite of unserializeCacheEntry; create byte stream from object */
// Functor handed to the cache file: serialSizeOf() tells the cache how
// many bytes to allocate for the entry, operator() then fills them in.
119 struct FilePart::SerializeCacheEntry {
120 SerializeCacheEntry(const FilePart& f, JigdoCache* c, size_t blockLen,
122 : file(f), cache(c), blockLength(blockLen), md5BlockLength(md5Len) { }
123 const FilePart& file;
126 size_t md5BlockLength;
// Total entry size: fixed header (PART_MD5SUM bytes) plus either all
// per-block md5sums (file fully scanned) or just the first one.
128 size_t serialSizeOf() {
129 return PART_MD5SUM + (file.mdValid() ? file.sums.size() * 16 : 16);
132 void operator()(byte* data) {
133 Paranoid(file.getFlag(TO_BE_WRITTEN));
134 // If empty(), shouldn't have been marked TO_BE_WRITTEN:
135 Assert(!file.sums.empty());
// Header field order mirrors unserializeCacheEntry():
// blockLength, md5BlockLength, block count
137 data = serialize4(blockLength, data);
138 data = serialize4(md5BlockLength, data);
139 // Nr of valid blocks - either 1 or all
140 size_t blocks = (file.mdValid() ? file.sums.size() : 1);
141 data = serialize4(blocks, data);
142 data = serialize(file.rsyncSum, data);
143 data = serialize(file.md5Sum, data);
144 // Write md5 sums of individual chunks of file
145 vector<MD5>::const_iterator sum = file.sums.begin();
146 for (size_t i = blocks; i > 0; --i) {
147 data = serialize(*sum, data);
153 //______________________________________________________________________
// Construct a cache with on-disc backing. blockLength/md5BlockLength
// start at 0 and must be set via setParams() before files are scanned.
156 JigdoCache::JigdoCache(const string& cacheFileName, size_t expiryInSeconds,
157 size_t bufLen, ProgressReporter& pr)
158 : blockLength(0), md5BlockLength(0), checkFiles(true), files(), nrOfFiles(0),
159 locationPaths(), readAmount(bufLen), buffer(), reporter(pr),
160 cacheExpiry(expiryInSeconds) {
// Only open the on-disc cache when a filename was actually supplied
163 if (!cacheFileName.empty())
164 cacheFile = new CacheFile(cacheFileName.c_str());
// Report the error instead of letting the exception escape the ctor.
// NOTE(review): catching DbError by value copies the exception; prefer
// catch (const DbError& e).
165 } catch (DbError e) {
166 string err = subst(_("Could not open cache file: %L1"), e.message);
// Cache-less constructor overload: ignores the cache filename and the
// expiry argument - presumably the variant compiled when no on-disc
// cache support is available. Never touches the disc.
// NOTE(review): unlike the overload above, this init list omits
// checkFiles and cacheExpiry - confirm those members are #ifdef'd out
// or unused in this configuration, otherwise they start uninitialized.
171 JigdoCache::JigdoCache(const string&, size_t, size_t bufLen,
172 ProgressReporter& pr)
173 : blockLength(0), md5BlockLength(0), files(), nrOfFiles(0),
174 locationPaths(), readAmount(bufLen), buffer(), reporter(pr) { }
176 //______________________________________________________________________
// Destructor: flush pending per-file cache entries to the on-disc
// cache, expire stale entries, then close the db (flushing to disc).
178 JigdoCache::~JigdoCache() {
181 // Write out any cache entries that need it
182 for (list<FilePart>::const_iterator i = files.begin(), e = files.end();
// Skip files that were removed or whose entry is already up to date
184 if (i->deleted() || !i->getFlag(FilePart::TO_BE_WRITTEN)) continue;
185 debug("Writing %1", i->leafName());
186 FilePart::SerializeCacheEntry serializer(*i, this, blockLength,
// Key for the entry is leaf name; mtime/size detect later file changes
189 cacheFile->insert(serializer, serializer.serialSizeOf(),
190 i->leafName(), i->mtime(), i->size());
// NOTE(review): DbError caught by value; prefer catch by const ref.
191 } catch (DbError e) {
192 reporter.error(e.message);
196 if (cacheExpiry > 0) {
197 // Expire old cache entries from cache
// Cut-off timestamp: "now" minus the configured expiry interval
198 time_t expired = time(0);
199 Paranoid(expired != static_cast<time_t>(-1));
200 expired -= cacheExpiry;
202 cacheFile->expire(expired);
203 } catch (DbError e) {
204 string err = subst(_("Error during cache expiry: %1. The cache "
205 "file may be corrupt, consider deleting it."),
211 // Close db object, flushing changes to disc
216 //______________________________________________________________________
218 /* Either reads data for the first MD5 block and creates sums[0] and
219 rsyncSum, or reads whole file and creates all sums[] entries and
220 rsyncSum and the whole file's MD5 sum. */
// Compute (or load from cache) the md5 sum of block `blockNr`.
// Either reads just the first md5 block (blockNr == 0), or the whole
// file, filling sums[], rsyncSum and the whole-file md5Sum. Returns a
// pointer into sums[], or reports an error via c->reporter on failure.
221 const MD5* FilePart::getSumsRead(JigdoCache* c, size_t blockNr) {
222 // Should do this check before calling:
223 Paranoid((blockNr == 0 && sums.empty()) || !mdValid());
225 // Do not forget to setParams() before calling this!
226 Assert(c->md5BlockLength != 0);
227 const size_t thisBlockLength = c->blockLength;
// One md5 sum per md5BlockLength-sized chunk (last chunk may be short)
229 sums.resize((size() + c->md5BlockLength - 1) / c->md5BlockLength);
230 //____________________
233 // Can we maybe get the info from the cache?
// Only query the cache once per FilePart (WAS_LOOKED_UP flag)
234 if (c->cacheFile != 0 && !getFlag(WAS_LOOKED_UP)) {
235 setFlag(WAS_LOOKED_UP);
239 /* Unserialize will do nothing if md5BlockLength differs. If
240 md5BlockLength matches, but returned blockLength doesn't, we
241 need to re-read the first block. */
242 if (c->cacheFile->find(data, dataSize, leafName(), size(), mtime())
244 debug("%1 found, want block#%2", leafName(), blockNr);
245 size_t cachedBlockLength = unserializeCacheEntry(data, dataSize,
247 // Was all necessary data in cache? Yes => return it now.
248 if (cachedBlockLength == thisBlockLength
249 && (blockNr == 0 || mdValid())) {
250 debug("%1 loaded, blockLen (%2) matched, %3/%4 in cache",
251 leafName(), thisBlockLength, (mdValid() ? sums.size() : 1),
253 return &sums[blockNr];
255 /* blockLengths didn't match and/or the cache only contained
256 the first md5 sum while we asked for a later one. It's as
257 if we never queried the cache, except for the case when we
258 need to re-read the first block because the blockLength
259 changed, but *all* blocks' md5sums were in the cache. */
260 debug("%1 loaded, NO match (blockLen %2 vs %3), %4/%5 in cache",
261 leafName(), cachedBlockLength, thisBlockLength,
262 (mdValid() ? sums.size() : 1), sums.size());
// Cache errors are non-fatal: report, then fall through to re-scan
264 } catch (DbError e) {
265 string err = subst(_("Error accessing cache: %1"), e.message);
266 c->reporter.error(err);
269 # endif /* HAVE_LIBDB */
270 //____________________
// Cache missed (or mismatched) - open the file and scan it
273 string name(getPath());
275 bifstream input(name.c_str(), ios::binary);
279 /* Actually, stdin /would/ be allowed /here/, but it isn't
280 possible with mktemplate. */
281 err = _("Error opening file `-' "
282 "(using standard input not allowed here)");
284 err = subst(_("Could not open `%L1' for input - excluded"), name);
287 err += strerror(errno);
292 c->reporter.error(err); // might throw
295 //____________________
297 // We're going to write this to the cache later on
298 setFlag(TO_BE_WRITTEN);
300 // Allocate or resize buffer, or do nothing if already right size
301 c->buffer.resize(c->readAmount > c->md5BlockLength ?
302 c->readAmount : c->md5BlockLength);
303 //______________________________
305 // Read data and create sums
307 uint64 off = 0; // File offset of first byte in buf
308 // Nr of bytes before we are to reset() md
309 size_t mdLeft = c->md5BlockLength;
310 /* Call reporter once off reaches this value - only report something
311 if scanning >1 md5 block */
312 uint64 nextReport = mdLeft;
// Destination slot for the next finished per-block md5 sum
315 vector<MD5>::iterator sum = sums.begin();
316 //____________________
318 // Calculate RsyncSum of head of file and MD5Sums for all blocks
320 Assert(thisBlockLength <= c->md5BlockLength);
321 byte* buf = &c->buffer[0];
// First, fill the buffer with at least thisBlockLength bytes so the
// rolling (rsync) sum of the file head can be computed in one go
323 byte* bufend = buf + (c->readAmount > thisBlockLength ?
324 c->readAmount : thisBlockLength);
325 while (input && static_cast<size_t>(bufpos - buf) < thisBlockLength) {
326 readBytes(input, bufpos, bufend - bufpos);
327 size_t nn = input.gcount();
329 debug("Read %1", nn);
331 size_t n = bufpos - buf;
332 // Create RsyncSum of 1st bytes of file, or leave at 0 if file too small
334 if (n >= thisBlockLength) rsyncSum.addBack(buf, thisBlockLength);
337 while (true) { // Will break out if error or whole file read
339 // n is number of valid bytes in buf[]
341 if (off > size()) break; // Argh - file size changed
// Periodic progress reporting, every REPORT_INTERVAL bytes
343 if (off >= nextReport) {
344 c->reporter.scanningFile(this, off);
345 nextReport += REPORT_INTERVAL;
348 // Create MD5 for chunks of size md5BlockLength
// Buffer spans an md5-block boundary: finish the current block's
// digest, then start the next block with the remaining `nn` bytes
353 md.update(buf, mdLeft);
354 byte* cur = buf + mdLeft;
355 size_t nn = n - mdLeft;
358 debug("%1: mdLeft (0), switching to next md at off %2, left %3, "
359 "writing sum#%4: %5", name, off - n + cur - buf, nn,
360 sum - sums.begin(), md.toString());
361 Paranoid(sum != sums.end());
364 size_t m = (nn < c->md5BlockLength ? nn : c->md5BlockLength);
365 md.reset().update(cur, m);
367 mdLeft = c->md5BlockLength - m;
371 md5Sum.update(buf, n); // Create MD5 for the whole file
373 if (blockNr == 0 && sum != sums.begin()) break; // Only wanted 1st block
374 if (!input) break; // End of file or error
377 readBytes(input, buf, c->readAmount);
379 debug("%1: read %2", name, n);
381 } // Endwhile (true), will break out if error or whole file read
383 Paranoid(sum != sums.end() // >=1 trailing bytes
384 || mdLeft == c->md5BlockLength); // 0 trailing bytes
385 if (off == size() && input.eof()) {
386 // Whole file was read
387 c->reporter.scanningFile(this, size()); // 100% scanned
388 if (mdLeft < c->md5BlockLength) {
389 (*sum) = md.finish(); // Digest of trailing bytes
390 debug("%1: writing trailing sum#%2: %3",
391 name, sum - sums.begin(), md.toString());
393 md5Sum.finish(); // Digest of whole file
395 return &sums[blockNr];
396 } else if (blockNr == 0 && sum != sums.begin()) {
397 // Only first md5 block of file was read
398 debug("%1: file header read, sum#0 written", name);
400 md5Sum.finish(); // else failed assert in FilePart::SerializeCacheEntry
402 md5Sum.abort(); // Saves the memory until whole file is read
406 //____________________
408 // Some error happened
409 string err = subst(_("Error while reading `%1' - file will be ignored "
410 "(%2)"), name, strerror(errno));
412 c->reporter.error(err);
415 //______________________________________________________________________
// Presumably: force a full scan by requesting the *last* md5 block
// (index (fileSize+md5BlockLength-1)/md5BlockLength - 1 below), which
// makes getSumsRead() read the whole file and finish the whole-file
// md5Sum. NOTE(review): the function body is truncated in this listing.
417 const MD5Sum* FilePart::getMD5SumRead(JigdoCache* c) {
419 (fileSize + c->md5BlockLength - 1) / c->md5BlockLength - 1)
424 //______________________________________________________________________
426 void JigdoCache::setParams(size_t blockLen, size_t md5BlockLen) {
427 if (blockLen == blockLength && md5BlockLen == md5BlockLength) return;
429 blockLength = blockLen;
430 md5BlockLength = md5BlockLen;
431 Assert(blockLength <= md5BlockLength);
432 for (list<FilePart>::iterator file = files.begin(), end = files.end();
433 file != end; ++file) {
434 file->sums.resize(0);
437 //______________________________________________________________________
439 void JigdoCache::addFile(const string& name) {
440 // Do not forget to setParams() before calling this!
441 Assert(md5BlockLength != 0);
442 // Assumes nonempty filenames
443 Paranoid(name.length() > 0);
445 // Find "//" in path and if present split name there
446 string::size_type pathLen = name.rfind(SPLITSEP);
447 string fileUri("file:");
448 string path, nameRest;
449 if (pathLen == string::npos) {
450 size_t splitAfter = 0;
452 // Split after "\" or ".\" or "C:" or "C:\" or "C:.\"
453 if (name.length() > 1 && isalpha(name[0]) && name[1] == ':')
455 if (name.length() > splitAfter && name[splitAfter] == '\\') {
456 // If an absolute path, split after leading '\'
458 } else if (name.length() > splitAfter + 1
459 && name[splitAfter] == '.' && name[splitAfter + 1] == '\\') {
460 // Otherwise, also split after ".\" at start
464 // If an absolute path, split after leading '/'
465 if (name[0] == DIRSEP) splitAfter = 1;
467 path.assign(name, 0, splitAfter);
469 nameRest.assign(name, splitAfter, string::npos);
471 // e.g. for name = "dir//file"
472 path.assign(name, 0, pathLen + 1); // path = "dir/"
473 fileUri.append(name, 0, pathLen + 1); // fileUri = "file:dir/"
475 nameRest.assign(name, pathLen + sizeof(SPLITSEP) - 1, string::npos);
477 compat_swapFileUriChars(fileUri); // Directory separator is always '/'
478 ConfigFile::quote(fileUri);
479 //____________________
481 // If necessary, create a label for the path before "//"
482 static string emptylabel;
483 LocationPath tmp(path, emptylabel, fileUri);
484 LocationPathSet::iterator i = locationPaths.find(tmp);
485 if (i == locationPaths.end())
486 i = locationPaths.insert(tmp).first; // Any new entry has a "" label
487 Paranoid(i != locationPaths.end());
489 // Append new obj at end of list
490 FilePart fp(i, nameRest, fileInfo.st_size, fileInfo.st_mtime);