1 /* $Id: mktemplate.cc,v 1.16 2005/07/06 15:29:59 atterer Exp $ -*- C++ -*-
3 |_) /| Copyright (C) 2001-2002 | richard@
4 | \/¯| Richard Atterer | atterer.org
6 This program is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License, version 2. See the file
10 Create location list (.jigdo) and image template (.template)
12 WARNING: This is very complicated code - be sure to run the "torture"
13 regression test after making changes. Read this file from bottom to top.
14 The .jigdo generating code is in mkjigdo.cc. The code for the queue of
15 partial matches is in partialmatch.*.
17 In the log messages that are output with --debug, uppercase means that it
18 is finally decided what this area of the image is: UNMATCHED or a MATCH of
41 #include <mimestream.hh>
43 #include <mktemplate.hh>
46 #include <zstream-gz.hh>
47 #include <zstream-bz.hh>
48 //______________________________________________________________________
50 void MkTemplate::ProgressReporter::error(const string& message) {
51 cerr << message << endl;
53 void MkTemplate::ProgressReporter::scanningImage(uint64) { }
54 void MkTemplate::ProgressReporter::matchFound(const FilePart*, uint64) { }
55 void MkTemplate::ProgressReporter::finished(uint64) { }
57 MkTemplate::ProgressReporter MkTemplate::noReport;
58 //______________________________________________________________________
60 MkTemplate::MkTemplate(JigdoCache* jcache, bistream* imageStream,
61 JigdoConfig* jigdoInfo, bostream* templateStream, ProgressReporter& pr,
62 int zipQuality, size_t readAmnt, bool addImage, bool addServers,
64 : fileSizeTotal(0U), fileCount(0U), block(), readAmount(readAmnt),
65 off(), unmatchedStart(), greedyMatching(true),
67 image(imageStream), templ(templateStream), zip(0),
68 zipQual(zipQuality), reporter(pr), matches(new PartialMatchQueue()),
70 jigdo(jigdoInfo), addImageSection(addImage),
71 addServersSection(addServers), useBzLib(useBzip2),
73 //______________________________________________________________________
75 /* Because make-template should be debuggable even in non-debug builds,
76 always compile in debug messages. */
78 Logger MkTemplate::debug("make-template");
79 //______________________________________________________________________
83 // Find the position of the highest set bit (e.g. for 0x20, result is 5)
84 inline int bitWidth(uint32 x) {
86 Assert(x <= 0xffffffff); // Can't cope with >32 bits
87 if (x & 0xffff0000) { r += 16; x >>= 16; }
88 if (x & 0xff00) { r += 8; x >>= 8; }
89 if (x & 0xf0) { r += 4; x >>= 4; }
90 if (x & 0xc) { r += 2; x >>= 2; }
91 if (x & 2) { r += 1; x >>=1; }
95 //________________________________________
97 /* Avoid integer divisions: Modulo addition/subtraction, with
99 // Returns (a + b) % m, if (a < m && b <= m)
100 inline size_t modAdd(size_t a, size_t b, size_t m) {
103 Paranoid(r == (a + b) % m);
106 // Returns (a - b) mod m, if (a < m && b <= m)
107 inline size_t modSub(size_t a, size_t b, size_t m) {
108 size_t r = a + m - b;
110 Paranoid(r == (m + a - b) % m);
113 // Returns (a + b) % m, if (a < m && -m <= b && b <= m)
114 inline size_t modAdd(size_t a, int b, size_t m) {
119 Paranoid(r == (a + static_cast<size_t>(b + m)) % m);
122 //____________________
124 /* Write part of a circular buffer to a Zobstream object, starting
125 with offset "start" (incl) and ending with offset "end" (excl).
126 Offsets can be equal to bufferLength. If both offsets are equal,
127 the whole buffer content is written. */
128 inline void writeBuf(const byte* const buf, size_t begin, size_t end,
129 const size_t bufferLength, Zobstream* zip) {
130 Paranoid(begin <= bufferLength && end <= bufferLength);
132 zip->write(buf + begin, (unsigned int)(end - begin));
134 zip->write(buf + begin, (unsigned int)(bufferLength - begin));
135 zip->write(buf, (unsigned int)end);
138 //____________________
140 // Write lower 48 bits of x to s in little-endian order
141 void write48(bostream& s, uint64 x) {
143 s << static_cast<byte>(x & 0xff)
144 << static_cast<byte>((x >> 8) & 0xff)
145 << static_cast<byte>((x >> 16) & 0xff)
146 << static_cast<byte>((x >> 24) & 0xff)
147 << static_cast<byte>((x >> 32) & 0xff)
148 << static_cast<byte>((x >> 40) & 0xff);
150 s.put(static_cast<byte>( x & 0xff));
151 s.put(static_cast<byte>((x >> 8) & 0xff));
152 s.put(static_cast<byte>((x >> 16) & 0xff));
153 s.put(static_cast<byte>((x >> 24) & 0xff));
154 s.put(static_cast<byte>((x >> 32) & 0xff));
155 s.put(static_cast<byte>((x >> 40) & 0xff));
160 //______________________________________________________________________
162 /** Build up a template DESC section by appending items to a
163 JigdoDescVec. Calls to descUnmatchedData() are allowed to accumulate, so
164 that >1 consecutive unmatched data areas are merged into one in the DESC
166 class MkTemplate::Desc {
168 Desc() : files(), offset(0) { }
170 // Insert in DESC section: information about whole image
171 inline void imageInfo(uint64 len, const MD5Sum& md5, size_t blockLength) {
172 files.reserve((files.size() + 16) % 16);
173 files.push_back(new JigdoDesc::ImageInfo(len, md5, blockLength));
175 // Insert in DESC section: info about some data that was not matched
176 inline void unmatchedData(uint64 len) {
177 JigdoDesc::UnmatchedData* u;
179 && (u = dynamic_cast<JigdoDesc::UnmatchedData*>(files.back())) !=0) {
180 // Add more unmatched data to already existing UnmatchedData object
181 u->resize(u->size() + len);
183 // Create new UnmatchedData object.
184 files.reserve((files.size() + 16) % 16);
185 files.push_back(new JigdoDesc::UnmatchedData(offset, len));
189 // Insert in DESC section: information about a file that matched
190 inline void matchedFile(uint64 len, const RsyncSum64& r,
192 files.reserve((files.size() + 16) % 16);
193 files.push_back(new JigdoDesc::MatchedFile(offset, len, r, md5));
196 inline bostream& put(bostream& s, MD5Sum* md) {
204 //______________________________________________________________________
206 /* The following are helpers used by run(). It is usually not adequate
207 to make such large functions inline, but there is only one call to
210 inline bool MkTemplate::scanFiles(size_t blockLength, uint32 blockMask,
211 size_t md5BlockLength) {
212 bool result = SUCCESS;
214 cache->setParams(blockLength, md5BlockLength);
215 FileVec::iterator hashPos;
217 for (JigdoCache::iterator file = cache->begin();
218 file != cache->end(); ++file) {
219 const RsyncSum64* sum = file->getRsyncSum(cache);
220 if (sum == 0) continue; // Error - skip
221 // Add file to hash list
222 hashPos = block.begin() + (sum->getHi() & blockMask);
223 hashPos->push_back(&*file);
227 //________________________________________
229 void MkTemplate::checkRsyncSumMatch2(const size_t blockLen,
230 const size_t back, const size_t md5BlockLength, uint64& nextEvent,
233 /* Don't schedule match if its startOff (== off - blockLen) is impossible.
234 This is e.g. the case when file A is 1024 bytes long (i.e. is
235 immediately matched once seen), and the first 1024 bytes of file B are
236 equal to file A, and file B is in the image.
237 [I think A gets matched, then the following line prevents that a partial
238 match for B is also recorded.]
239 Also prevents matches whose startOffset is <0, which could otherwise
240 happen because rsum covers a blockLen-sized area of 0x7f bytes at the
242 if (off < unmatchedStart + blockLen) return;
244 PartialMatch* x; // Ptr to new entry in "matches"
245 if (matches->full()) {
246 x = matches->findDropCandidate(§orLength, off - blockLen);
247 if (x == 0 || x->file() == file) {
248 /* If no more space left in queue and none of the entries in it is
249 appropriate for discarding, just discard this possible file match!
250 It's the only option if there are many, many overlapping matches,
251 otherwise the program would get extremely slow. */
252 debug(" %1: DROPPED possible %2 match at offset %3 (queue full)",
253 off, file->leafName(), off - blockLen);
256 // Overwrite existing entry in the queue
257 debug(" %1: DROPPED possible %2 match at offset %3 (queue full, match "
258 "below replaces it)",
259 off, x->file()->leafName(), x->startOffset());
260 } else { // !matches->full()
261 // Add new entry to the queue
262 x = matches->addFront();
265 /* Rolling rsum matched - schedule an MD5Sum match. NB: In extreme cases,
266 nextEvent may be equal to off */
267 x->setStartOffset(off - blockLen);
268 size_t eventLen = (file->size() < md5BlockLength ?
269 file->size() : md5BlockLength);
270 x->setNextEvent(matches, x->startOffset() + eventLen);
271 debug(" %1: Head of %2 match at offset %3, my next event %4",
272 off, file->leafName(), x->startOffset(), x->nextEvent());
273 if (x->nextEvent() < nextEvent) nextEvent = x->nextEvent();
274 x->setBlockOffset(back);
275 x->setBlockNumber(0);
279 /* Look for matches of sum (i.e. scanImage()'s rsum). If found, insert
280 appropriate entry in "matches". */
281 void MkTemplate::checkRsyncSumMatch(const RsyncSum64& sum,
282 const uint32& bitMask, const size_t blockLen, const size_t back,
283 const size_t md5BlockLength, uint64& nextEvent) {
285 typedef const vector<FilePart*> FVec;
286 FVec& hashEntry = block[sum.getHi() & bitMask];
287 if (hashEntry.empty()) return;
289 FVec::const_iterator i = hashEntry.begin(), e = hashEntry.end();
292 const RsyncSum64* fileSum = file->getRsyncSum(cache);
293 if (fileSum != 0 && *fileSum == sum)
294 // Insert new partial file match in "matches" queue
295 checkRsyncSumMatch2(blockLen, back, md5BlockLength, nextEvent, file);
300 //________________________________________
302 // Read the 'count' first bytes from file x and write them to zip
303 bool MkTemplate::rereadUnmatched(FilePart* file, uint64 count) {
304 // Lower peak memory usage: Deallocate cache's buffer
305 cache->deallocBuffer();
307 ArrayAutoPtr<byte> tmpBuf(new byte[readAmount]);
308 string inputName = file->getPath();
309 inputName += file->leafName();
310 auto_ptr<bistream> inputFile(new bifstream(inputName.c_str(),ios::binary));
311 while (inputFile->good() && count > 0) {
313 readBytes(*inputFile, tmpBuf.get(),
314 (readAmount < count ? readAmount : count));
315 size_t n = inputFile->gcount();
316 zip->write(tmpBuf.get(), (unsigned int)n); // will catch Zerror "upstream"
317 Paranoid(n <= count);
320 if (count == 0) return SUCCESS;
323 string err = subst(_("Error reading from `%1' (%2)"),
324 inputName, strerror(errno));
328 //________________________________________
330 // Print info about a part of the input image
331 void MkTemplate::printRangeInfo(uint64 start, uint64 end, const char* msg,
332 const PartialMatch* x) {
333 static const string empty;
334 debug("[%1,%2) %3 %4", start, end, msg,
335 (x != 0 ? x->file()->leafName() : empty) );
338 // oldAreaEnd != start, something's seriously wrong
339 void MkTemplate::debugRangeFailed() {
340 static bool printed = false;
341 cerr << "Assertion failed: oldAreaEnd == start" << endl;
344 "You have found a bug in jigdo-file. The generated .template file is\n"
345 "very likely broken! To help me find the bug, please rerun the\n"
346 "command as follows:\n"
347 " [previous-command] --report=noprogress --debug=make-template >log 2>&1\n"
348 "and send the _compressed_ `log' file to <jigdo" << "@atterer.org>."
353 //________________________________________
355 /* The block didn't match, so the whole file x doesn't match - re-read from
356 file any data that is no longer buffered (and not covered by another
357 match), and write it to the Zobstream. */
358 bool MkTemplate::checkMD5Match_mismatch(const size_t stillBuffered,
359 PartialMatch* x, Desc& desc) {
360 const PartialMatch* oldestMatch = matches->findLowestStartOffset();
361 uint64 rereadEnd = off - stillBuffered;
362 uint64 xStartOffset = x->startOffset();
363 if (x != oldestMatch || xStartOffset >= rereadEnd) {
364 // Everything still buffered, or there is another pending match
365 matches->eraseFront(); // return x to free pool
369 /* Reread the right amount of data from the file for x, covering the image
370 area from x->startOffset() to the first byte which is "claimed" by
371 something else - either another match or the start of the still buffered
373 FilePart* xfile = x->file();
374 Assert(matches->front() == x);
375 matches->eraseFront(); // return x to free pool
376 if (!matches->empty()) {
377 // set rereadEnd to new lowest startOffset in matches
378 const PartialMatch* newOldestMatch = matches->findLowestStartOffset();
379 if (rereadEnd > newOldestMatch->startOffset())
380 rereadEnd = newOldestMatch->startOffset();
382 debugRangeInfo(xStartOffset, rereadEnd,
383 "UNMATCHED after some blocks, re-reading from", x);
385 desc.unmatchedData(rereadEnd - xStartOffset);
386 unmatchedStart = rereadEnd;
388 uint64 bytesToWrite = rereadEnd - xStartOffset;
389 return rereadUnmatched(xfile, bytesToWrite);
391 //________________________________________
393 /* The file x was found in the image, and --match-exec is set. Set up env
396 Why does this use system() instead of the "more secure" exec()? Because
397 then things are actually easier to get right IMHO! My scenario is that
398 someone has set up an env var with the destination path for the fallback,
399 say $DEST, which might contain spaces. In that case, using exec() with
400 some kind of "%label" substitution is awkward and error-prone - let's just
401 have one single substitution scheme, that used by the shell! */
402 bool MkTemplate::matchExecCommands(PartialMatch* x) {
403 Paranoid(!matchExec.empty());
405 string matchPath, leaf;
406 const string& leafName = x->file()->leafName();
407 string::size_type lastSlash = leafName.rfind(DIRSEP);
408 if (lastSlash == string::npos) {
411 matchPath.assign(leafName, 0, lastSlash + 1);
412 leaf.assign(leafName, lastSlash + 1, string::npos);
415 md5Sum.write(x->file()->getMD5Sum(cache)->digest(), 16).flush();
416 string file = x->file()->getLocation()->getPath();
419 // Set environment vars
420 if (compat_setenv("LABEL", x->file()->getLocation()->getLabel().c_str())
421 || compat_setenv("LABELPATH", x->file()->getLocation()->getPath()
423 || compat_setenv("MATCHPATH", matchPath.c_str())
424 || compat_setenv("LEAF", leaf.c_str())
425 || compat_setenv("MD5SUM", md5Sum.result().c_str())
426 || compat_setenv("FILE", file.c_str())) {
427 reporter.error(_("Could not set up environment for --match-exec "
433 int status = system(matchExec.c_str());
434 if (status == 0) return SUCCESS;
435 reporter.error(_("Command supplied with --match-exec failed"));
438 //________________________________________
440 /* Calculate MD5 for the previous md5ChunkLength (or less if at end of match)
441 bytes. If the calculated checksum matches and it is the last MD5 block in
442 the file, record a file match. If the i-th MD5Sum does not match, write
443 the i*md5ChunkLength bytes directly to templ.
444 @param stillBuffered bytes of image data "before current position" that
445 are still in the buffer; they are at buf[data] to
446 buf[(data+stillBuffered-1)%bufferLength] */
447 bool MkTemplate::checkMD5Match(byte* const buf,
448 const size_t bufferLength, const size_t data,
449 const size_t md5BlockLength, uint64& nextEvent,
450 const size_t stillBuffered, Desc& desc) {
451 PartialMatch* x = matches->front();
452 Paranoid(x != 0 && matches->nextEvent() == off);
454 /* Calculate MD5Sum from buf[x->blockOff] to buf[data-1], deal with
455 wraparound. NB 0 <= x->blockOff < bufferLength, but 1 <= data <
459 if (x->blockOffset() < data) {
460 md.update(buf + x->blockOffset(), data - x->blockOffset());
462 md.update(buf + x->blockOffset(), bufferLength - x->blockOffset());
463 md.update(buf, data);
466 //____________________
468 const MD5* xfileSum = x->file()->getSums(cache, x->blockNumber());
470 debug("checkMD5Match?: image %1, file %2 block #%3 %4",
471 md.toString(), x->file()->leafName(), x->blockNumber(),
472 (xfileSum ? xfileSum->toString() : "[error]") );
474 if (xfileSum == 0 || md != *xfileSum) {
475 /* The block didn't match, so the whole file doesn't match - re-read from
476 file any data that is no longer buffered (and not covered by another
477 match), and write it to the Zobstream. */
478 return checkMD5Match_mismatch(stillBuffered, x, desc);
480 //____________________
482 // Another block of file matched - was it the last one?
483 if (off < x->startOffset() + x->file()->size()) {
484 // Still some more to go - update x and its position in queue
485 x->setBlockOffset(data);
486 x->setBlockNumber(x->blockNumber() + 1);
487 x->setNextEvent(matches, min(x->nextEvent() + md5BlockLength,
488 x->startOffset() + x->file()->size()));
489 nextEvent = min(nextEvent, x->nextEvent());
490 debug("checkMD5Match: match and more to go, next at off %1",
494 //____________________
496 Assert(off == x->startOffset() + x->file()->size());
498 // x = address of PartialMatch obj of file that matched
500 const PartialMatch* oldestMatch = matches->findLowestStartOffset();
502 if (!greedyMatching && x != oldestMatch) {
503 // A larger match is possible, so skip this match
504 debug("IGNORING match due to --no-greedy-matching: [%1,%2) %3",
505 x->startOffset(), off, x->file()->leafName());
506 matches->eraseFront(); // return x to free pool
510 reporter.matchFound(x->file(), x->startOffset());
511 matchedParts.push_back(x->file());
513 /* Re-read and write out data before the start of the match, i.e. of any
514 half-finished bigger match (which we abandon now that we've found a
515 smaller match inside it). */
516 if (x != oldestMatch && oldestMatch->startOffset() < off - stillBuffered) {
517 unmatchedStart = min(off - stillBuffered, x->startOffset());
518 debugRangeInfo(oldestMatch->startOffset(), unmatchedStart,
519 "UNMATCHED, re-reading partial match from", oldestMatch);
520 size_t toReread = unmatchedStart - oldestMatch->startOffset();
521 desc.unmatchedData(toReread);
522 if (rereadUnmatched(oldestMatch->file(), toReread))
526 /* Write out data that is still buffered, and is before the start of the
528 if (unmatchedStart < x->startOffset()) {
529 debugRangeInfo(unmatchedStart, x->startOffset(),
530 "UNMATCHED, buffer flush before match");
531 size_t toWrite = x->startOffset() - unmatchedStart;
532 Paranoid(off - unmatchedStart <= bufferLength);
533 size_t writeStart = modSub(data, off - unmatchedStart, bufferLength);
534 writeBuf(buf, writeStart, modAdd(writeStart, toWrite, bufferLength),
536 desc.unmatchedData(toWrite);
539 // Assert(x->file->mdValid);
540 desc.matchedFile(x->file()->size(), *(x->file()->getRsyncSum(cache)),
541 *(x->file()->getMD5Sum(cache)));
542 unmatchedStart = off;
543 debugRangeInfo(x->startOffset(), off, "MATCH:", x);
545 // With --match-exec, execute user-supplied command(s)
546 if (!matchExec.empty()
547 && matchExecCommands(x) == FAILURE)
550 /* Remove all matches with startOff < off (this includes x). This is the
551 greedy approach and is not ideal: If a small file A happens to match one
552 small fraction of a large file B and B is contained in the image, then
553 only a match of A will be found. */
554 Paranoid(x->startOffset() < off);
555 matches->eraseStartOffsetLess(off);
559 //________________________________________
561 /* This is called in case the end of the image has been reached, but
562 unmatchedStart < off, i.e. there was a partial match at the end of the
563 image. Just discards that match and writes the data to the template,
564 either by re-reading from the partially matched file, or from the buffer.
565 Compare to similar code in checkMD5Match.
567 Since we are at the end of the image, the full last bufferLength bytes of
568 the image are in the buffer. */
569 bool MkTemplate::unmatchedAtEnd(byte* const buf,
570 const size_t bufferLength, const size_t data, Desc& desc) {
571 Paranoid(unmatchedStart < off); // cf. where this is called
573 // Re-read and write out data that is no longer buffered.
574 const PartialMatch* y = matches->findStartOffset(unmatchedStart);
575 if (y != 0 && y->startOffset() < off - bufferLength) {
576 unmatchedStart = off - bufferLength;
577 debugRangeInfo(y->startOffset(), unmatchedStart,
578 "UNMATCHED at end, re-reading partial match from", y);
579 size_t toReread = unmatchedStart - y->startOffset();
580 desc.unmatchedData(toReread);
581 if (rereadUnmatched(y->file(), toReread))
584 // Write out data that is still buffered
585 if (unmatchedStart < off) {
586 debugRangeInfo(unmatchedStart, off, "UNMATCHED at end");
587 size_t toWrite = off - unmatchedStart;
588 Assert(toWrite <= bufferLength);
589 size_t writeStart = modSub(data, toWrite, bufferLength);
590 writeBuf(buf, writeStart, data, bufferLength, zip);
591 desc.unmatchedData(toWrite);
592 unmatchedStart = off;
596 //________________________________________
598 /* The "matches" queue is full. Typically, when this happens there is a big
599 zero-filled area in the image and one or more input files start with
600 zeroes. At this point, we must start dropping some prospective file
601 matches. This is done based on the heuristics that actual file matches
602 occur at a "sector boundary" in the image. See INITIAL_SECTOR_LENGTH in
603 mktemplate.hh for more.
605 This method is a variant of the main loop which is used when
606 matches.full(), and which advances the rsum window by one sector at a
608 void MkTemplate::scanImage_mainLoop_fastForward(uint64 nextEvent,
609 RsyncSum64* rsum, byte* buf, size_t* data, size_t* n, size_t* rsumBack,
610 size_t bufferLength, size_t blockLength, uint32 blockMask,
611 size_t md5BlockLength) {
615 debug("DROPPING, fast forward (queue full)");
616 Assert(off >= blockLength);
617 unsigned sectorMask = sectorLength - 1;
618 while (off < nextEvent) {
619 rsum->removeFront(buf[*rsumBack], blockLength);
620 rsum->addBack(buf[*data]);
621 ++*data; ++off; --*n;
622 *rsumBack = modAdd(*rsumBack, 1, bufferLength);
623 if (((off - blockLength) & sectorMask) == 0) {
624 checkRsyncSumMatch(*rsum, blockMask, blockLength, *rsumBack,
625 md5BlockLength, nextEvent);
626 sectorMask = sectorLength - 1;
627 Paranoid(matches->empty()
628 || matches->front()->startOffset() >= unmatchedStart);
634 debug("DROPPING, fast forward (queue full)");
635 Assert(off >= blockLength);
637 unsigned sectorMask = sectorLength - 1;
638 uint64 notSectorMask = ~implicit_cast<uint64>(sectorMask);
639 while (off < nextEvent) {
640 /* Calculate next value of off where a match would end up having an even
641 (i.e. sectorSize-aligned) start offset.
643 |----sectorLength----|----sectorLength----|
644 |========+========+===========+===+===========+====+=====>EOF
645 File offset: 0 | off |
650 uint64 nextAlignedOff = off - blockLength;
651 nextAlignedOff = (nextAlignedOff + sectorLength) & notSectorMask;
652 nextAlignedOff += blockLength;
653 Assert(nextAlignedOff > off);
655 unsigned long len = nextAlignedOff - off;
656 if (len > nextEvent - off) len = nextEvent - off;
657 // Advance rsum by len bytes in one go
659 RsyncSum64 rsum2 = *rsum; size_t rsumBack2 = *rsumBack;
660 uint64 off2 = off; size_t data2 = *data;
662 if (*rsumBack + len <= bufferLength) {
663 rsum->removeFront(buf + *rsumBack, len, blockLength);
665 rsum->removeFront(buf + *rsumBack, bufferLength - *rsumBack,
667 rsum->removeFront(buf, len + *rsumBack - bufferLength,
668 blockLength + *rsumBack - bufferLength);
670 Paranoid(*data + len <= bufferLength);
671 rsum->addBack(buf + *data, len);
672 *data += len; off += len; *n -= len;
673 *rsumBack = modAdd(*rsumBack, implicit_cast<size_t>(len), bufferLength);
674 Paranoid(off == nextEvent || off == nextAlignedOff);
676 for (unsigned i = 0; i < len; ++i) {
677 rsum2.removeFront(buf[rsumBack2], blockLength);
678 rsum2.addBack(buf[data2]);
680 rsumBack2 = modAdd(rsumBack2, 1, bufferLength);
682 Assert(rsumBack2 == *rsumBack);
684 Assert(data2 == *data);
685 Assert(rsum2 == *rsum);
688 //debug("DROPPING, fast forward (queue full) to %1", off);
690 if (off == nextAlignedOff) {
691 Paranoid(((off - blockLength) & sectorMask) == 0);
692 checkRsyncSumMatch(*rsum, blockMask, blockLength, *rsumBack,
693 md5BlockLength, nextEvent);
694 Paranoid(matches->empty()
695 || matches->front()->startOffset() >= unmatchedStart);
696 sectorMask = sectorLength - 1;
697 notSectorMask = ~implicit_cast<uint64>(sectorMask);
699 } // endwhile (off < nextEvent)
702 //________________________________________
704 /* Scan image. Central function for template generation.
706 Treat buf as a circular buffer. Read new data into at most half the
707 buffer. Calculate a rolling checksum covering blockLength bytes. When it
708 matches an entry in block, start calculating MD5Sums of blocks of length
711 Since both image and templ can be non-seekable, we run into a problem in
712 the following case: After the initial RsyncSum match, a few of the
713 md5BlockLength-sized chunks of one input file were matched, but not all,
714 so in the end, there is no match. Consequently, we would now need to
715 re-read that part of the image and pump it through zlib to templ - but we
716 can't if the image is stdin! Solution: Since we know that the MD5Sum of a
717 block matched part of an input file, we can re-read from there. */
718 inline bool MkTemplate::scanImage(byte* buf, size_t bufferLength,
719 size_t blockLength, uint32 blockMask, size_t md5BlockLength,
720 MD5Sum& templMd5Sum) {
721 bool result = SUCCESS;
723 /* Cause input files to be analysed */
724 if (scanFiles(blockLength, blockMask, md5BlockLength))
727 /* Initialise rolling sums with blockSize bytes 0x7f, and do the same with
728 part of buffer, to avoid special-case code in main loop. (Any value
729 would do - except that 0x00 or 0xff might lead to a larger number of
732 byte* bufEnd = buf + bufferLength;
733 //for (byte* z = bufEnd - blockLength; z < bufEnd; ++z) *z = 0x7f;
734 // Init entire buf, keep valgrind happy
735 for (byte* z = buf; z < bufEnd; ++z) *z = 0x7f;
736 rsum.addBackNtimes(0x7f, blockLength);
738 // Compression pipe for templ data
739 auto_ptr<Zobstream> zipDel;
741 zipDel.reset(implicit_cast<Zobstream*>(
742 new ZobstreamBz(*templ, zipQual, 256U, &templMd5Sum) ));
744 zipDel.reset(implicit_cast<Zobstream*>(
745 new ZobstreamGz(*templ, ZIPCHUNK_SIZE, zipQual, 15, 8, 256U,
748 Desc desc; // Buffer for DESC data, will be appended to templ at end
749 size_t data = 0; // Offset into buf of byte currently being processed
750 off = 0; // Current absolute offset in image, corresponds to "data"
751 uint64 nextReport = 0; // call reporter once off reaches this value
753 /* The area delimited by unmatchedStart (incl) and off (excl) has "not been
754 dealt with", either by writing it to zip, or by a match with the first
755 MD5 block of an input file. Once a partial match of a file has been
756 detected, unmatchedStart "gets stuck" at the start offset of this file
760 MD5Sum imageMd5Sum; // MD5 of whole image
761 MD5Sum md; // Re-used for each 2nd-level check of any rsum match
763 sectorLength = INITIAL_SECTOR_LENGTH;
766 size_t rsumBack = bufferLength - blockLength;
769 /* Catch Zerrors, which can occur in zip->write(), writeBuf(),
770 checkMD5Match(), zip->close() */
771 while (image->good()) {
773 debug("---------- main loop. off=%1 data=%2 unmatchedStart=%3",
774 off, data, unmatchedStart);
776 if (off >= nextReport) { // Keep user entertained
777 reporter.scanningImage(off);
778 nextReport += REPORT_INTERVAL;
781 // zip->write() out any old data that we'll destroy with read()
782 size_t thisReadAmount = (readAmount < bufferLength - data ?
783 readAmount : bufferLength - data);
784 uint64 newUnmatchedStart = 0;
785 if (off > blockLength)
786 newUnmatchedStart = off - blockLength;
787 if (!matches->empty()) {
788 newUnmatchedStart = min(newUnmatchedStart,
789 matches->lowestStartOffset()->startOffset());
791 if (unmatchedStart < newUnmatchedStart) {
792 size_t toWrite = newUnmatchedStart - unmatchedStart;
793 Paranoid(off - unmatchedStart <= bufferLength);
794 //debug("off=%1 unmatchedStart=%2 buflen=%3",
795 // off, unmatchedStart, bufferLength);
796 size_t writeStart = modSub(data, off - unmatchedStart,
798 debugRangeInfo(unmatchedStart, unmatchedStart + toWrite,
800 writeBuf(buf, writeStart,
801 modAdd(writeStart, toWrite, bufferLength),
803 unmatchedStart = newUnmatchedStart;
804 desc.unmatchedData(toWrite);
807 // Read new data from image
808 # if DEBUG // just for testing, make it sometimes read less
809 static size_t chaosOff, acc;
810 if (unmatchedStart == 0) { chaosOff = 0; acc = 0; }
811 acc += buf[chaosOff] ^ buf[off % bufferLength] + ~off;
812 if (chaosOff == 0) chaosOff = bufferLength - 1; else --chaosOff;
813 if ((acc & 0x7) == 0) {
814 thisReadAmount = acc % thisReadAmount + 1;
815 debug("thisReadAmount=%1", thisReadAmount);
818 readBytes(*image, buf + data, thisReadAmount);
819 size_t n = image->gcount();
820 imageMd5Sum.update(buf + data, n);
822 while (n > 0) { // Still unprocessed bytes left
823 uint64 nextEvent = off + n; // Special event: end of buffer
824 if (!matches->empty())
825 nextEvent = min(nextEvent, matches->front()->nextEvent());
827 if (!matches->full()) {
828 sectorLength = INITIAL_SECTOR_LENGTH;
830 /* Unrolled innermost loop - see below for single-iteration
831 version. Also see checkRsyncSumMatch above. */
832 while (off + 32 < nextEvent && rsumBack < bufferLength - 32) {
833 size_t dataOld = data;
835 const vector<FilePart*>* hashEntry;
837 rsum.removeFront(buf[rsumBack], blockLength); \
838 rsum.addBack(buf[data]); \
839 ++data; ++rsumBack; \
840 hashEntry = &block[rsum.getHi() & blockMask]; \
841 if (hashEntry->size() > 1) break; \
842 if (hashEntry->size() == 1) { \
843 FilePart* file = (*hashEntry)[0]; \
844 const RsyncSum64* fileSum = file->getRsyncSum(cache); \
845 if (fileSum != 0 && *fileSum == rsum) break; \
847 x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x
849 dataOld = data; // Special case: "Did not break out of loop"
851 if (dataOld == data) {
854 dataOld = data - dataOld; off += dataOld; n -= dataOld;
855 checkRsyncSumMatch(rsum, blockMask, blockLength, rsumBack,
856 md5BlockLength, nextEvent);
858 if (matches->full()) break;
860 } // endif (!matches->full())
862 if (!matches->full()) {
863 // Innermost loop - single-byte version, matches not full
864 while (off < nextEvent) {
865 // Roll checksum by one byte
866 rsum.removeFront(buf[rsumBack], blockLength);
867 rsum.addBack(buf[data]);
869 rsumBack = modAdd(rsumBack, 1, bufferLength);
871 /* Look for matches of rsum. If found, insert appropriate
872 entry in matches list and maybe modify nextEvent. */
873 checkRsyncSumMatch(rsum, blockMask, blockLength, rsumBack,
874 md5BlockLength, nextEvent);
876 /* We mustn't by accident schedule an event for a part of
877 the image that has already been flushed out of the
879 Paranoid(matches->empty()
880 || matches->front()->startOffset() >= unmatchedStart);
883 // Innermost loop - MATCHES IS FULL
884 scanImage_mainLoop_fastForward(nextEvent, &rsum, buf, &data, &n,
885 &rsumBack, bufferLength, blockLength, blockMask,
887 } // endif (matches->full())
888 if (matches->empty())
889 debug(" %1: Event, matches empty", off);
891 debug(" %1: Event, matchesOff=%2", off, matches->nextEvent());
893 /* Calculate MD5 for the previous md5ChunkLength (or less if
894 at end of match) bytes, if necessary. If the calculated
895 checksum matches and it is the last MD5 block in the file,
896 record a file match. If the i-th MD5Sum does not match,
897 write the i*md5ChunkLength bytes directly to templ. */
898 while (!matches->empty() && matches->nextEvent() == off) {
899 size_t stillBuffered = bufferLength - n;
900 if (stillBuffered > off) stillBuffered = off;
901 if (checkMD5Match(buf, bufferLength, data, md5BlockLength,
902 nextEvent, stillBuffered, desc))
903 return FAILURE; // no recovery possible, exit immediately
906 Assert(matches->empty() || matches->nextEvent() > off);
907 } // endwhile (n > 0), i.e. more unprocessed bytes left in buffer
909 if (data == bufferLength) data = 0;
910 Assert(data < bufferLength);
912 } // endwhile (image->good()), i.e. more data left in input image
914 // End of image data - any remaining partial match is UNMATCHED
915 if (unmatchedStart < off
916 && unmatchedAtEnd(buf, bufferLength, data, desc)) {
919 Assert(unmatchedStart == off);
924 string err = subst(_("Error during compression: %1"), ze.message);
926 try { zip->close(); } catch (Zerror zze) { }
930 imageMd5Sum.finish();
931 desc.imageInfo(off, imageMd5Sum, cache->getBlockLen());
932 desc.put(*templ, &templMd5Sum);
934 string err = _("Could not write template data");
939 reporter.finished(off);
942 //______________________________________________________________________
944 // Central function which processes the data and finds matches
945 bool MkTemplate::run(const string& imageLeafName,
946 const string& templLeafName) {
947 bool result = SUCCESS;
950 // Kick out files that are too small
951 for (JigdoCache::iterator f = cache->begin(), e = cache->end();
953 if (f->size() < cache->getBlockLen()) {
954 f->markAsDeleted(cache);
957 fileSizeTotal += f->size();
961 // Hash table performance drops when linked lists become long => "+1"
962 int blockBits = bitWidth((uint32)fileCount) + 1;
963 uint32 blockMask = (1 << blockBits) - 1;
964 block.resize(blockMask + 1);
966 size_t max_MD5Len_blockLen =
967 cache->getBlockLen() + 64; // +64 for Assert below
968 if (max_MD5Len_blockLen < cache->getMD5BlockLen())
969 max_MD5Len_blockLen = cache->getMD5BlockLen();
970 /* Pass 1 imposes no minimum buffer length, only pass 2: Must always
971 be able to read readAmount bytes into one buffer half in one go;
972 must be able to start calculating an MD5Sum at a position that is
973 blockLength bytes back in input; must be able to write out at
974 least previous md5BlockLength bytes in case there is no match. */
975 size_t bufferLength = 2 *
976 (max_MD5Len_blockLen > readAmount ? max_MD5Len_blockLen : readAmount);
977 // Avoid reading less bytes than readAmount at any time
978 bufferLength = (bufferLength + readAmount - 1) / readAmount * readAmount;
980 Paranoid(bufferLength % readAmount == 0); // for efficiency only
981 // Asserting this makes things easier in pass 2. Yes it is ">" not ">="
982 Assert(cache->getMD5BlockLen() > cache->getBlockLen());
985 debug("Nr of files: %1 (%2 bits)", fileCount, blockBits);
986 debug("Total bytes: %1", fileSizeTotal);
987 debug("blockLength: %1", cache->getBlockLen());
988 debug("md5BlockLen: %1", cache->getMD5BlockLen());
989 debug("bufLen (kB): %1", bufferLength/1024);
990 debug("zipQual: %1", zipQual);
994 ArrayAutoPtr<byte> bufDel(new byte[bufferLength]);
995 byte* buf = bufDel.get();
997 prepareJigdo(); // Add [Jigdo]
999 { // Write header to template file
1000 string s = TEMPLATE_HDR; append(s, FILEFORMAT_MAJOR); s += '.';
1001 append(s, FILEFORMAT_MINOR); s += " jigdo-file/" JIGDO_VERSION;
1002 s += "\r\nSee "; s += URL; s += " for details about jigdo.\r\n\r\n";
1003 const byte* t = reinterpret_cast<const byte*>(s.data());
1004 writeBytes(*templ, t, s.size());
1005 templMd5Sum.update(t, s.size());
1006 if (!*templ) result = FAILURE;
1009 // Read input image and output parts that do not match
1010 if (scanImage(buf, bufferLength, cache->getBlockLen(), blockMask,
1011 cache->getMD5BlockLen(), templMd5Sum)) {
1014 cache->deallocBuffer();
1015 templMd5Sum.finish();
1017 // Add [Image], (re-)add [Parts]
1018 finalizeJigdo(imageLeafName, templLeafName, templMd5Sum);
1020 debug("MkTemplate::run() finished");