1 /* $Id: scan.cc,v 1.11 2005/07/02 22:05:04 atterer Exp $ -*- C++ -*-
3 |_) /| Copyright (C) 2001-2002 | richard@
4 | \/¯| Richard Atterer | atterer.org
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License, version 2. See
8 the file COPYING for details.
10 Scanning of input files
23 #include <sys/types.h>
24 #include <unistd-jigdo.h>
28 #include <configfile.hh>
32 #include <serialize.hh>
33 //______________________________________________________________________
37 void JigdoCache::ProgressReporter::error(const string& message) {
38 cerr << message << endl;
40 void JigdoCache::ProgressReporter::info(const string& message) {
41 cerr << message << endl;
// Default no-op progress callback; subclasses override to show scan progress.
void JigdoCache::ProgressReporter::scanningFile(const FilePart*, uint64) { }
// Fallback reporter instance used when the caller supplies none.
JigdoCache::ProgressReporter JigdoCache::noReport;
// Shared scratch buffer for stat() results. NOTE(review): a single static
// buffer assumes single-threaded scanning -- confirm no concurrent use.
struct stat JigdoCache::fileInfo;
48 //______________________________________________________________________
// How much checksum data do we have per checksum entry?
// (bytes of per-block digest data stored in one cache entry slot)
#define CSUM_SIZE (16 + 32) // md5 size + sha256 size
55 /* Interpret a string of bytes (out of the file cache) like this:
57 4 blockLength (of rsync sum)
59 4 blocks (number of valid md5 blocks in this entry), curr. always >0
60 8 rsyncSum of file start (only valid if blocks > 0)
61 16 fileMD5Sum (only valid if
62 blocks == (fileSize+csumBlockLength-1)/csumBlockLength )
63 32 fileSHA256Sum (only valid if
64 blocks == (fileSize+csumBlockLength-1)/csumBlockLength )
65 followed by n entries:
66 16 md5sum of block of size csumBlockLength
67 32 sha256sum of block of size csumBlockLength
69 If stored csumBlockLength doesn't match supplied length, do nothing.
   Otherwise, restore *this from cached data and return the cached
   blockLength (0 if not cached). The caller needs to make sure the
   MD5sums/SHA256sums vectors have already been resized to the full
   number of checksum blocks (checked via Paranoid() below).
74 This is not a standard unserialize() member of FilePart because it
75 does not create a complete serialization - e.g. the location path
76 iter is missing. It only creates a cache entry. */
/* Restore this FilePart's checksum data from a serialized cache entry
   (format documented in the comment above). Returns the blockLength
   stored in the entry, or 0 if the entry's csumBlockLength differs
   from the supplied one.
   NOTE(review): this listing appears truncated -- the `blocks`
   declaration, the `if (blocks == 0)` guard, early `return 0;`
   statements, loop increments and closing braces seem to be missing.
   Verify against the repository before relying on the control flow. */
size_t FilePart::unserializeCacheEntry(const byte* data, size_t dataSize,
                                       size_t csumBlockLength){
  Assert(dataSize > PART_MD5SUMS);
  // The resize() must have been made by the caller
  Paranoid(MD5sums.size() == (size() + csumBlockLength - 1) / csumBlockLength);
  Paranoid(SHA256sums.size() == (size() + csumBlockLength - 1) / csumBlockLength);
  size_t cachedBlockLength;
  data = unserialize4(cachedBlockLength, data);
  size_t cachedCsumBlockLength;
  data = unserialize4(cachedCsumBlockLength, data);
  // Entry was written with a different checksum block size => unusable
  if (cachedCsumBlockLength != csumBlockLength) return 0;
  data = unserialize4(blocks, data);
  // Ignore strange-looking entries
    debug("ERR #blocks == 0"); // NOTE(review): enclosing `if` lost in listing
  if (dataSize - PART_MD5SUMS != (blocks * CSUM_SIZE)) {
    debug("ERR wrong entry size (%1 vs %2)",
          blocks * CSUM_SIZE, dataSize - PART_MD5SUMS);
  Paranoid(serialSizeOf(rsyncSum) == 8);
  data = unserialize(rsyncSum, data);
  Paranoid(serialSizeOf(md5Sum) == 16);
  Paranoid(serialSizeOf(sha256Sum) == 32);
  // All blocks of file present?
  // NOTE(review): per the size check above, `blocks` counts 48-byte
  // (md5+sha256) entries once -- comparing it against
  // MD5sums.size() + SHA256sums.size() (twice the block count) looks
  // wrong; expected `blocks == MD5sums.size()`. Confirm against repo.
  if (blocks == MD5sums.size() + SHA256sums.size()) {
    data = unserialize(md5Sum, data);
    data = unserialize(sha256Sum, data);
  // Read md5sums of individual chunks of file
  vector<MD5>::iterator sum = MD5sums.begin();
  for (size_t i = blocks; i > 0; --i) {
    data = unserialize(*sum, data);
  // Read sha256sums of individual chunks of file
  vector<SHA256>::iterator sum2 = SHA256sums.begin();
  for (size_t i = blocks; i > 0; --i) {
    data = unserialize(*sum2, data);
  return cachedBlockLength;
132 //______________________________________________________________________
/** Opposite of unserializeCacheEntry; create byte stream from object.
    Functor handed to the cache writer: serialSizeOf() reports the entry
    size, operator() then fills the supplied buffer.
    NOTE(review): this listing appears truncated -- the constructor's
    final parameter (presumably `size_t md5Len)`) and the member
    declarations for `cache` and `blockLength` seem to be missing, as do
    several closing braces below. Verify against the repository. */
struct FilePart::SerializeCacheEntry {
  SerializeCacheEntry(const FilePart& f, JigdoCache* c, size_t blockLen,
    : file(f), cache(c), blockLength(blockLen), csumBlockLength(md5Len) { }
  const FilePart& file;
  size_t csumBlockLength;
  // Entry size: fixed header plus either every block's checksums (file
  // fully scanned, mdValid()) or just the first block's checksums.
  size_t serialSizeOf() {
    return PART_MD5SUMS + (file.mdValid() ? file.MD5sums.size() * CSUM_SIZE : CSUM_SIZE);
  void operator()(byte* data) {
    Paranoid(file.getFlag(TO_BE_WRITTEN));
    // If empty(), shouldn't have been marked TO_BE_WRITTEN:
    Assert(!file.MD5sums.empty());
    Assert(!file.SHA256sums.empty());
    data = serialize4(blockLength, data);
    data = serialize4(csumBlockLength, data);
    // Nr of valid blocks - either 1 or all
    size_t blocks = (file.mdValid() ? file.MD5sums.size() : 1);
    data = serialize4(blocks, data);
    data = serialize(file.rsyncSum, data);
    data = serialize(file.md5Sum, data);
    data = serialize(file.sha256Sum, data);
    // Write md5sums of individual chunks of file
    vector<MD5>::const_iterator sum = file.MD5sums.begin();
    for (size_t i = blocks; i > 0; --i) {
      data = serialize(*sum, data);
    // Write sha256sums of individual chunks of file
    vector<SHA256>::const_iterator sum2 = file.SHA256sums.begin();
    for (size_t i = blocks; i > 0; --i) {
      data = serialize(*sum2, data);
/* Construct a cache backed by the on-disc file `cacheFileName` (no
   cache if the name is empty). A failure to open the cache file is
   reported via `pr` but is not fatal.
   NOTE(review): truncated listing -- the `try {` opening the guarded
   region and the lines finishing the catch clause / constructor body
   appear to be missing. Also prefer `catch (const DbError&)`. */
JigdoCache::JigdoCache(const string& cacheFileName, size_t expiryInSeconds,
                       size_t bufLen, ProgressReporter& pr)
  : blockLength(0), csumBlockLength(0), checkFiles(true), files(), nrOfFiles(0),
    locationPaths(), readAmount(bufLen), buffer(), reporter(pr),
    cacheExpiry(expiryInSeconds) {
  if (!cacheFileName.empty())
    cacheFile = new CacheFile(cacheFileName.c_str());
  } catch (DbError e) {
    string err = subst(_("Could not open cache file: %L1"), e.message);
196 JigdoCache::JigdoCache(const string&, size_t, size_t bufLen,
197 ProgressReporter& pr)
198 : blockLength(0), csumBlockLength(0), files(), nrOfFiles(0),
199 locationPaths(), readAmount(bufLen), buffer(), reporter(pr) { }
201 //______________________________________________________________________
/* Flush any dirty (TO_BE_WRITTEN) cache entries to the on-disc cache,
   expire entries older than cacheExpiry seconds, then close the cache.
   NOTE(review): truncated listing -- the `#if HAVE_LIBDB` guards,
   `try {` lines, loop header tail, and several closing braces appear
   to be missing. Catch clauses should take `const DbError&`. */
JigdoCache::~JigdoCache() {
  // Write out any cache entries that need it
  for (list<FilePart>::const_iterator i = files.begin(), e = files.end();
    if (i->deleted() || !i->getFlag(FilePart::TO_BE_WRITTEN)) continue;
    debug("Writing %1", i->leafName());
    FilePart::SerializeCacheEntry serializer(*i, this, blockLength,
      cacheFile->insert(serializer, serializer.serialSizeOf(),
                        i->leafName(), i->mtime(), i->size());
    } catch (DbError e) {
      reporter.error(e.message);
  if (cacheExpiry > 0) {
    // Expire old cache entries from cache
    time_t expired = time(0);
    Paranoid(expired != static_cast<time_t>(-1));
    expired -= cacheExpiry;
      cacheFile->expire(expired);
    } catch (DbError e) {
      string err = subst(_("Error during cache expiry: %1. The cache "
                           "file may be corrupt, consider deleting it."),
  // Close db object, flushing changes to disc
241 //______________________________________________________________________
245 1. read data for the first block and create rsyncSum, MD5sums[0]
248 2. read the whole file and create rsyncSum, plus both checksums for
249 all the blocks and both checksums for the whole file.
/* Scan the file (or load from the cache) and fill in rsyncSum plus the
   per-block MD5/SHA256 sums. blockNr == 0 => only the first checksum
   block is required; otherwise the whole file is read and the
   whole-file md5Sum/sha256Sum are computed as well. Returns true on
   success (presumably -- the return statements are not visible here).
   NOTE(review): truncated listing -- `#if HAVE_LIBDB` guards, `try {`
   lines, several `if`/`else` headers, the `md`/`sd` digest
   declarations, `++sum`/`++sum2` increments and many closing braces
   appear to be missing. Verify control flow against the repository. */
bool FilePart::getChecksumsRead(JigdoCache* c, size_t blockNr) {
  // Should do this check before calling:
  Paranoid((blockNr == 0 && MD5sums.empty() && SHA256sums.empty()) || !mdValid());
  // Do not forget to setParams() before calling this!
  Assert(c->csumBlockLength != 0);
  const size_t thisBlockLength = c->blockLength;
  // Number of checksum blocks covering the whole file (rounded up)
  int64_t num_csum_blocks = (size() + c->csumBlockLength - 1) / c->csumBlockLength;
  MD5sums.resize((size_t)num_csum_blocks);
  SHA256sums.resize((size_t)num_csum_blocks);
  //____________________
  // Can we maybe get the info from the cache?
  if (c->cacheFile != 0 && !getFlag(WAS_LOOKED_UP)) {
    setFlag(WAS_LOOKED_UP); // Query the cache at most once per file
    /* Unserialize will do nothing if csumBlockLength differs. If
       csumBlockLength matches, but returned blockLength doesn't, we
       need to re-read the first block. */
    if (c->cacheFile->find(data, dataSize, leafName(), size(), mtime())
      debug("%1 found, want block#%2", leafName(), blockNr);
      size_t cachedBlockLength = unserializeCacheEntry(data, dataSize,
      // Was all necessary data in cache? Yes => return it now.
      if (cachedBlockLength == thisBlockLength
          && (blockNr == 0 || mdValid())) {
        debug("%1 loaded, blockLen (%2) matched, %3/%4 in cache",
              leafName(), thisBlockLength, (mdValid() ? MD5sums.size() : 1),
      /* blockLengths didn't match and/or the cache only contained
         the checksum for the first block while we asked for a later
         one. It's as if we never queried the cache, except for the
         case when we need to re-read the first block because the
         blockLength changed, but *all* blocks' checksums were in the
      debug("%1 loaded, NO match (blockLen %2 vs %3), %4/%5 in cache",
            leafName(), cachedBlockLength, thisBlockLength,
            (mdValid() ? MD5sums.size() : 1), MD5sums.size());
    } catch (DbError e) {
      string err = subst(_("Error accessing cache: %1"), e.message);
      c->reporter.error(err);
# endif /* HAVE_LIBDB */
  //____________________
  // Open the file for binary reading; report (but survive) failures
  string name(getPath());
  bifstream input(name.c_str(), ios::binary);
  /* Actually, stdin /would/ be allowed /here/, but it isn't
     possible with mktemplate. */
  err = _("Error opening file `-' "
          "(using standard input not allowed here)");
  err = subst(_("Could not open `%L1' for input - excluded"), name);
  err += strerror(errno);
  c->reporter.error(err); // might throw
  //____________________
  // We're going to write this to the cache later on
  setFlag(TO_BE_WRITTEN);
  // Allocate or resize buffer, or do nothing if already right size
  c->buffer.resize(c->readAmount > c->csumBlockLength ?
                   c->readAmount : c->csumBlockLength);
  //______________________________
  // Read data and create checksums
  uint64 off = 0; // File offset of first byte in buf
  // Nr of bytes before we are to reset() md
  size_t mdLeft = c->csumBlockLength;
  /* Call reporter once off reaches this value - only report something
     if scanning >1 checksum block */
  uint64 nextReport = mdLeft;
  vector<MD5>::iterator sum = MD5sums.begin();
  vector<SHA256>::iterator sum2 = SHA256sums.begin();
  //____________________
  // Calculate RsyncSum of head of file and MD5 and SHA256 for all blocks
    Assert(thisBlockLength <= c->csumBlockLength);
    byte* buf = &c->buffer[0];
    byte* bufend = buf + (c->readAmount > thisBlockLength ?
                          c->readAmount : thisBlockLength);
    // Fill the buffer until at least the first rsync block is present
    while (input && static_cast<size_t>(bufpos - buf) < thisBlockLength) {
      readBytes(input, bufpos, bufend - bufpos);
      size_t nn = input.gcount();
      debug("Read %1", nn);
    size_t n = bufpos - buf;
    // Create RsyncSum of 1st bytes of file, or leave at 0 if file too small
    if (n >= thisBlockLength)
      rsyncSum.addBack(buf, thisBlockLength);
  while (true) { // Will break out if error or whole file read
    // n is number of valid bytes in buf[]
      break; // Argh - file size changed
    if (off >= nextReport) {
      c->reporter.scanningFile(this, off);
      nextReport += REPORT_INTERVAL;
    // Create checksums for chunks of size csumBlockLength
      md.update(buf, mdLeft);
      sd.update(buf, mdLeft);
      byte* cur = buf + mdLeft;
      size_t nn = n - mdLeft;
        debug("%1: mdLeft (0), switching to next md at off %2, left %3, "
              "writing sum#%4: %5/%6", name, off - n + cur - buf, nn,
              sum - MD5sums.begin(), md.toString(), sd.toString());
        Paranoid(sum != MD5sums.end());
        size_t m = (nn < c->csumBlockLength ? nn : c->csumBlockLength);
        md.reset().update(cur, m);
        sd.reset().update(cur, m);
        mdLeft = c->csumBlockLength - m;
    md5Sum.update(buf, n); // Create MD5 for the whole file
    sha256Sum.update(buf, n); // Create SHA256 for the whole file
    if (blockNr == 0 && sum != MD5sums.begin())
      break; // Only wanted 1st block
      break; // End of file or error
    readBytes(input, buf, c->readAmount);
    debug("%1: read %2", name, n);
  } // Endwhile (true), will break out if error or whole file read
  Paranoid(sum != MD5sums.end() // >=1 trailing bytes
           || mdLeft == c->csumBlockLength); // 0 trailing bytes
  if (off == size() && input.eof()) {
    // Whole file was read
    c->reporter.scanningFile(this, size()); // 100% scanned
    if (mdLeft < c->csumBlockLength) {
      (*sum) = md.finish(); // Digest of trailing bytes
      (*sum2) = sd.finish(); // Digest of trailing bytes
      debug("%1: writing trailing sum#%2: %3/%4",
            name, sum - MD5sums.begin(), md.toString(), sd.toString());
    md5Sum.finish(); // Digest of whole file
    sha256Sum.finish(); // Digest of whole file
  } else if (blockNr == 0 && sum != MD5sums.begin()) {
    // Only first md5 block of file was read
    debug("%1: file header read, sum#0 written", name);
    md5Sum.finish(); // else failed assert in FilePart::SerializeCacheEntry
    md5Sum.abort(); // Saves the memory until whole file is read
    sha256Sum.abort(); // Saves the memory until whole file is read
  //____________________
  // Some error happened
  string err = subst(_("Error while reading `%1' - file will be ignored "
                       "(%2)"), name, strerror(errno));
  c->reporter.error(err);
471 //______________________________________________________________________
473 const MD5Sum* FilePart::getMD5SumRead(JigdoCache* c) {
474 if (!getChecksumsRead(c, (size_t)((fileSize + c->csumBlockLength - 1) / c->csumBlockLength - 1)))
479 //______________________________________________________________________
481 const SHA256Sum* FilePart::getSHA256SumRead(JigdoCache* c) {
482 if (!getChecksumsRead(c, (size_t)(fileSize + c->csumBlockLength - 1) / c->csumBlockLength - 1))
487 //______________________________________________________________________
489 void JigdoCache::setParams(size_t blockLen, size_t csumBlockLen) {
490 if (blockLen == blockLength && csumBlockLen == csumBlockLength) return;
492 blockLength = blockLen;
493 csumBlockLength = csumBlockLen;
494 Assert(blockLength <= csumBlockLength);
495 for (list<FilePart>::iterator file = files.begin(), end = files.end();
496 file != end; ++file) {
497 file->MD5sums.resize(0);
498 file->SHA256sums.resize(0);
501 //______________________________________________________________________
503 void JigdoCache::addFile(const string& name) {
504 // Do not forget to setParams() before calling this!
505 Assert(csumBlockLength != 0);
506 // Assumes nonempty filenames
507 Paranoid(name.length() > 0);
509 // Find "//" in path and if present split name there
510 string::size_type pathLen = name.rfind(SPLITSEP);
511 string fileUri("file:");
512 string path, nameRest;
513 if (pathLen == string::npos) {
514 size_t splitAfter = 0;
516 // Split after "\" or ".\" or "C:" or "C:\" or "C:.\"
517 if (name.length() > 1 && isalpha(name[0]) && name[1] == ':')
519 if (name.length() > splitAfter && name[splitAfter] == '\\') {
520 // If an absolute path, split after leading '\'
522 } else if (name.length() > splitAfter + 1
523 && name[splitAfter] == '.' && name[splitAfter + 1] == '\\') {
524 // Otherwise, also split after ".\" at start
528 // If an absolute path, split after leading '/'
529 if (name[0] == DIRSEP) splitAfter = 1;
531 path.assign(name, 0, splitAfter);
533 nameRest.assign(name, splitAfter, string::npos);
535 // e.g. for name = "dir//file"
536 path.assign(name, 0, pathLen + 1); // path = "dir/"
537 fileUri.append(name, 0, pathLen + 1); // fileUri = "file:dir/"
539 nameRest.assign(name, pathLen + sizeof(SPLITSEP) - 1, string::npos);
541 compat_swapFileUriChars(fileUri); // Directory separator is always '/'
542 ConfigFile::quote(fileUri);
543 //____________________
545 // If necessary, create a label for the path before "//"
546 static string emptylabel;
547 LocationPath tmp(path, emptylabel, fileUri);
548 LocationPathSet::iterator i = locationPaths.find(tmp);
549 if (i == locationPaths.end())
550 i = locationPaths.insert(tmp).first; // Any new entry has a "" label
551 Paranoid(i != locationPaths.end());
553 // Append new obj at end of list
554 FilePart fp(i, nameRest, fileInfo.st_size, fileInfo.st_mtime);