1 /**
2 Thin package implements Content Defined Chunking (CDC) based on a rolling
3 Rabin Checksum.
4 
5 Choosing a Random Irreducible Polynomial
6 ========================================
7 
8 The function `Pol.getRandom()` returns a new random polynomial of degree 53
9 for use with the chunker. The degree 53 is chosen because it is the largest
10 prime below 64-8 = 56, so that the top 8 bits of an ulong can be used for
11 optimising calculations in the chunker.
12 
13 A random polynomial is chosen selecting 64 random bits, masking away bits
14 64..54 and setting bit 53 to one (otherwise the polynomial is not of the
15 desired degree) and bit 0 to one (otherwise the polynomial is trivially
16 reducible), so that 51 bits are chosen at random.
17 
18 This process is repeated until `Pol.irreducible` returns `true`, then this
19 polynomials is returned. If this doesn't happen after 1 million tries, the
20 function returns an error. The probability for selecting an irreducible
21 polynomial at random is about 7.5% ( (2^53-2)/53 / 2^51), so the probability
22 that no irreducible polynomial has been found after 100 tries is lower than
23 0.04%.
24 
25 Verifying Irreducible Polynomials
26 =================================
27 
28 During development the results have been verified using the computational
29 discrete algebra system GAP, which can be obtained from the website at
30 http://www.gap-system.org/.
31 
32 For filtering a given list of polynomials in hexadecimal coefficient notation,
33 the following script can be used:
34 
35 ```
36 	# create x over F_2 = GF(2);
37 	x := Indeterminate(GF(2), "x");
38 
39 	# test if polynomial is irreducible, i.e. the number of factors is one;
40 	IrredPoly := function (poly);
41 		return (Length(Factors(poly)) = 1);
42 	end;;
43 
44 	# create a polynomial in x from the hexadecimal representation of the;
45 	# coefficients;
46 	Hex2Poly := function (s);
47 		return ValuePol(CoefficientsQadic(IntHexString(s), 2), x);
48 	end;;
49 
50 	# list of candidates, in hex;
51 	candidates := [ "3DA3358B4DC173" ];
52 
53 	# create real polynomials;
54 	L := List(candidates, Hex2Poly);
55 
56 	# filter and display the list of irreducible polynomials contained in L;
57 	Display(Filtered(L, x -> (IrredPoly(x))));
58 ```
59 
60 All irreducible polynomials from the list are written to the output.
61 
62 Background Literature
63 =====================
64 
65 An introduction to Rabin Fingerprints/Checksums can be found in the following articles:
66 
67 Michael O. Rabin (1981): "Fingerprinting by Random Polynomials"
68 http://www.xmailserver.org/rabin.pdf
69 
70 Ross N. Williams (1993): "A Painless Guide to CRC Error Detection Algorithms"
71 http://www.zlib.net/crc_v3.txt
72 
73 Andrei Z. Broder (1993): "Some Applications of Rabin's Fingerprinting Method"
74 http://www.xmailserver.org/rabin_apps.pdf
75 
76 Shuhong Gao and Daniel Panario (1997): "Tests and Constructions of Irreducible Polynomials over Finite Fields"
77 http://www.math.clemson.edu/~sgao/papers/GP97a.pdf
78 
79 Andrew Kadatch, Bob Jenkins (2007): "Everything we know about CRC but afraid to forget"
80 http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
81 
82 License:
83 
84 Copyright 2014 Alexander Neumann. All rights reserved.
85 Use of this source code is governed by a BSD-style
86 license that can be found in the LICENSE file.
87 */
88 
89 module chunker;
90 
91 import chunker.polynomials;
92 import chunker.rabin;
93 
94 // Array support
95 import std.range.primitives : empty, front, popFront;
96 
97 private enum size_t kiB = 1024;
98 private enum size_t miB = 1024 * kiB;
99 
100 /// Aim to create chunks of 20 bits or about 1MiB on average.
101 public enum size_t chunkDefaultAverageBits = 20;
102 /// Default minimal size of a chunk.
103 public enum size_t chunkDefaultMinSize = 512 * kiB;
104 /// Default maximal size of a chunk.
105 public enum size_t chunkDefaultMaxSize = 8 * miB;
106 
107 
108 /// Splits content with Rabin Fingerprints.
109 struct Chunker(R)
110 {
111 	/// One content-dependent chunk of bytes whose end was cut when
112 	/// the Rabin Fingerprint had the value stored in `cut`.
113 	struct Chunk
114 	{
115 		/// Contents of the chunk.
116 		ubyte[] data;
117 		/// Value of the rolling hash when the chunk was cut.
118 		ulong cut;
119 	}
120 
121 	private // internal state
122 	{
123 		/// Current hash internal state.
124 		RabinHash hash;
125 
126 		/// Buffer used to receive and keep read data in.
127 		ubyte[] buf;
128 
129 		/// Buffer used to copy chunk data to.
130 		ubyte[] cbuf;
131 		/// Number of bytes within the current chunk.
132 		size_t count;
133 	}
134 
135 	private // configuration
136 	{
137 		/// Minimum and maximum chunk sizes, as configured.
138 		size_t minSize, maxSize;
139 
140 		/// Hash mask used to decide chunk boundaries.
141 		/// By default `(1 << 20) - 1`, or configured from `averageBits`.
142 		ulong splitmask;
143 
144 		/// Input data source.
145 		R source;
146 	}
147 
148 	/// Probably use the `byCDChunk` convenience function instead of this constructor directly.
149 	/// Note that `Chunker` will `popFront` the input range during construction.
150 	public this(R source, Pol pol, uint averageBits, size_t minSize, size_t maxSize, ubyte[] cbuf)
151 	{
152 		this.cbuf = cbuf ? cbuf : new ubyte[maxSize];
153 		this.source = source;
154 		this.minSize = minSize;
155 		this.maxSize = maxSize;
156 		this.splitmask = (1L << averageBits) - 1;
157 		this.hash = RabinHash(pol);
158 		if (this.source.empty)
159 			empty = true;
160 		else
161 		{
162 			this.buf = this.source.front;
163 			popFront();
164 		}
165 	}
166 
167 	Chunk front; /// Current chunk.
168 	bool empty; /// `true` when there are no more chunks (input range is empty).
169 
170 	/// Updates `front` to contain the position and data of the next
171 	/// chunk. Note that `Chunker` reuses the chunk buffer, so the
172 	/// previous contents of `front.data` will be overwritten.
173 	public void popFront()
174 	{
175 		assert(!empty);
176 
177 		hash.start();
178 		hash.slide(1);
179 		count = 0;
180 		auto minSize = this.minSize;
181 		auto maxSize = this.maxSize;
182 		// do not start a new chunk unless at least minSize bytes have been read
183 		auto pre = minSize - RabinHash.windowSize;
184 		auto buf = this.buf;
185 		scope(success) this.buf = buf;
186 
187 		void copyBytes(size_t numBytes)
188 		{
189 			auto bytes = buf[0 .. numBytes];
190 			buf = buf[numBytes .. $];
191 			auto newLen = count + bytes.length;
192 			if (cbuf.length < newLen)
193 				cbuf.length = newLen;
194 			cbuf[count .. newLen] = bytes[];
195 			count += numBytes;
196 		}
197 
198 		while (true)
199 		{
200 			if (buf.length == 0)
201 			{
202 				if (source.empty)
203 				{
204 					// Last chunk has been read.
205 					empty = true;
206 					return;
207 				}
208 
209 				source.popFront();
210 
211 				if (source.empty)
212 				{
213 					// There are no more bytes to buffer, so this was
214 					// the last chunk. Return current chunk, if any
215 					// bytes have been processed.
216 					if (count > 0)
217 						break;
218 					else
219 					{
220 						empty = true;
221 						return;
222 					}
223 				}
224 
225 				buf = source.front;
226 			}
227 
228 			// check if bytes have to be dismissed before starting a new chunk
229 			if (pre > 0)
230 			{
231 				if (pre > buf.length)
232 				{
233 					pre -= buf.length;
234 					copyBytes(buf.length);
235 					continue;
236 				}
237 
238 				copyBytes(pre);
239 				pre = 0;
240 			}
241 
242 			if (count < minSize)
243 			{
244 				auto warmUp = minSize - count;
245 				if (warmUp > buf.length)
246 					warmUp = buf.length;
247 				hash.put(buf[0 .. warmUp]);
248 				copyBytes(warmUp);
249 			}
250 
251 			auto toWrite = buf.length;
252 			if (count + toWrite > maxSize)
253 				toWrite = maxSize - count;
254 			auto written = hash.putUntil(buf[0 .. toWrite], splitmask);
255 			copyBytes(written);
256 			if (buf.length) // stopped early due to maxSize or mask match
257 				break;
258 		}
259 		front = Chunk(cbuf[0 .. count], hash.peek());
260 	}
261 }
262 
263 /// Constructs a new `Chunker` based on polynomial `pol` that
264 /// reads from `source`.
265 /// Params:
266 ///   source = An input range of chunks of bytes, such as that
267 ///     returned by `File.byChunk` or `ubyte[].chunks` or simply
268 ///     `ubyte[].only`.
269 ///   pol = An irreducible polynomial from `F_2[X]`. Use
270 ///     `Pol.getRandom()` to generate a random one.
271 ///   averageBits = Allows to control the frequency of chunk
272 ///     discovery: the lower `averageBits`, the higher amount of
273 ///     chunks will be identified.  The default value is 20 bits,
274 ///     so chunks will be of 1MiB size on average.
275 ///   minSize = Minimum size of emitted chunks. Chunk boundaries that
276 ///     occur less than `minSize` bytes from the chunk start are
277 ///     ignored.
278 ///   maxSize = Maximum size of emitted chunks. If a chunk boundary is
279 ///     not encountered after `maxSize` bytes from the chunk start, it
280 ///     is forcibly split at that point.
281 ///   cbuf = A buffer to store chunk data. When `null` (default), a
282 ///     new buffer is allocated on construction of length `maxSize`.
283 /// Returns:
284 ///   An instance of `Chunker`, an input range of `Chunker.Chunk`,
285 ///   which contains the chunk data, and the fingerprint value when it
286 ///   was cut.
287 Chunker!R byCDChunk(R)(
288 	R source,
289 	Pol pol,
290 	uint averageBits = chunkDefaultAverageBits,
291 	size_t minSize = chunkDefaultMinSize,
292 	size_t maxSize = chunkDefaultMaxSize,
293 	ubyte[] cbuf = null)
294 {
295 	return Chunker!R(source, pol, averageBits, minSize, maxSize, cbuf);
296 }
297 
298 /// ditto
299 Chunker!R byCDChunk(R)(R source, Pol pol, ubyte[] cbuf)
300 {
301 	return Chunker!R(source, pol, chunkDefaultAverageBits, chunkDefaultMinSize, chunkDefaultMaxSize, cbuf);
302 }
303 
304 // -----------------------------------------------------------------------------
305 version(chunkerUnittest) version = test;
306 version(benchmarkChunker) version = test;
307 version(test):
308 private:
309 
310 import chunker.internal.helpers;
311 
312 enum chunkerBufSize = 512 * kiB;
313 
314 template parseDigest(string s)
315 {
316 	import std.conv : hexString;
317 	enum ubyte[] parseDigest = cast(ubyte[])hexString!s;
318 }
319 
320 struct TestChunk
321 {
322 	public size_t length;
323 	public ulong cutFP;
324 	public ubyte[] digest;
325 }
326 
327 /// polynomial used for all the tests below
328 enum testPol = Pol(0x3DA3358B4DC173);
329 
330 /// created for 32MB of random data out of math/rand's Uint32() seeded by
331 /// constant 23
332 //
333 /// chunking configuration:
334 /// window size 64, avg chunksize 1<<20, min chunksize 1<<19, max chunksize 1<<23
335 /// polynom 0x3DA3358B4DC173
336 TestChunk[] chunks1 =
337 [
338 	{2163460, 0x000b98d4cdf00000, parseDigest!"4b94cb2cf293855ea43bf766731c74969b91aa6bf3c078719aabdd19860d590d"},
339 	{643703, 0x000d4e8364d00000, parseDigest!"5727a63c0964f365ab8ed2ccf604912f2ea7be29759a2b53ede4d6841e397407"},
340 	{1528956, 0x0015a25c2ef00000, parseDigest!"a73759636a1e7a2758767791c69e81b69fb49236c6929e5d1b654e06e37674ba"},
341 	{1955808, 0x00102a8242e00000, parseDigest!"c955fb059409b25f07e5ae09defbbc2aadf117c97a3724e06ad4abd2787e6824"},
342 	{2222372, 0x00045da878000000, parseDigest!"6ba5e9f7e1b310722be3627716cf469be941f7f3e39a4c3bcefea492ec31ee56"},
343 	{2538687, 0x00198a8179900000, parseDigest!"8687937412f654b5cfe4a82b08f28393a0c040f77c6f95e26742c2fc4254bfde"},
344 	{609606, 0x001d4e8d17100000, parseDigest!"5da820742ff5feb3369112938d3095785487456f65a8efc4b96dac4be7ebb259"},
345 	{1205738, 0x000a7204dd600000, parseDigest!"cc70d8fad5472beb031b1aca356bcab86c7368f40faa24fe5f8922c6c268c299"},
346 	{959742, 0x00183e71e1400000, parseDigest!"4065bdd778f95676c92b38ac265d361f81bff17d76e5d9452cf985a2ea5a4e39"},
347 	{4036109, 0x001fec043c700000, parseDigest!"b9cf166e75200eb4993fc9b6e22300a6790c75e6b0fc8f3f29b68a752d42f275"},
348 	{1525894, 0x000b1574b1500000, parseDigest!"2f238180e4ca1f7520a05f3d6059233926341090f9236ce677690c1823eccab3"},
349 	{1352720, 0x00018965f2e00000, parseDigest!"afd12f13286a3901430de816e62b85cc62468c059295ce5888b76b3af9028d84"},
350 	{811884, 0x00155628aa100000, parseDigest!"42d0cdb1ee7c48e552705d18e061abb70ae7957027db8ae8db37ec756472a70a"},
351 	{1282314, 0x001909a0a1400000, parseDigest!"819721c2457426eb4f4c7565050c44c32076a56fa9b4515a1c7796441730eb58"},
352 	{1318021, 0x001cceb980000000, parseDigest!"842eb53543db55bacac5e25cb91e43cc2e310fe5f9acc1aee86bdf5e91389374"},
353 	{948640, 0x0011f7a470a00000, parseDigest!"b8e36bf7019bb96ac3fb7867659d2167d9d3b3148c09fe0de45850b8fe577185"},
354 	{645464, 0x00030ce2d9400000, parseDigest!"5584bd27982191c3329f01ed846bfd266e96548dfa87018f745c33cfc240211d"},
355 	{533758, 0x0004435c53c00000, parseDigest!"4da778a25b72a9a0d53529eccfe2e5865a789116cb1800f470d8df685a8ab05d"},
356 	{1128303, 0x0000c48517800000, parseDigest!"08c6b0b38095b348d80300f0be4c5184d2744a17147c2cba5cc4315abf4c048f"},
357 	{800374, 0x000968473f900000, parseDigest!"820284d2c8fd243429674c996d8eb8d3450cbc32421f43113e980f516282c7bf"},
358 	{2453512, 0x001e197c92600000, parseDigest!"5fa870ed107c67704258e5e50abe67509fb73562caf77caa843b5f243425d853"},
359 	{2651975, 0x000ae6c868000000, parseDigest!"181347d2bbec32bef77ad5e9001e6af80f6abcf3576549384d334ee00c1988d8"},
360 	{237392, 0x0000000000000001, parseDigest!"fcd567f5d866357a8e299fd5b2359bb2c8157c30395229c4e9b0a353944a7978"},
361 ];
362 
363 /// test if nullbytes are correctly split, even if length is a multiple of minSize.
364 TestChunk[] chunks2 =
365 [
366 	{chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"},
367 	{chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"},
368 	{chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"},
369 	{chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"},
370 ];
371 
372 /// the same as chunks1, but avg chunksize is 1<<19
373 TestChunk[] chunks3 =
374 [
375 	{1491586, 0x00023e586ea80000, parseDigest!"4c008237df602048039287427171cef568a6cb965d1b5ca28dc80504a24bb061"},
376 	{671874, 0x000b98d4cdf00000, parseDigest!"fa8a42321b90c3d4ce9dd850562b2fd0c0fe4bdd26cf01a24f22046a224225d3"},
377 	{643703, 0x000d4e8364d00000, parseDigest!"5727a63c0964f365ab8ed2ccf604912f2ea7be29759a2b53ede4d6841e397407"},
378 	{1284146, 0x0012b527e4780000, parseDigest!"16d04cafecbeae9eaedd49da14c7ad7cdc2b1cc8569e5c16c32c9fb045aa899a"},
379 	{823366, 0x000d1d6752180000, parseDigest!"48662c118514817825ad4761e8e2e5f28f9bd8281b07e95dcafc6d02e0aa45c3"},
380 	{810134, 0x0016071b6e180000, parseDigest!"f629581aa05562f97f2c359890734c8574c5575da32f9289c5ba70bfd05f3f46"},
381 	{567118, 0x00102a8242e00000, parseDigest!"d4f0797c56c60d01bac33bfd49957a4816b6c067fc155b026de8a214cab4d70a"},
382 	{821315, 0x001b3e42c8180000, parseDigest!"8ebd0fd5db0293bd19140da936eb8b1bbd3cd6ffbec487385b956790014751ca"},
383 	{1401057, 0x00045da878000000, parseDigest!"001360af59adf4871ef138cfa2bb49007e86edaf5ac2d6f0b3d3014510991848"},
384 	{2311122, 0x0005cbd885380000, parseDigest!"8276d489b566086d9da95dc5c5fe6fc7d72646dd3308ced6b5b6ddb8595f0aa1"},
385 	{608723, 0x001cfcd86f280000, parseDigest!"518db33ba6a79d4f3720946f3785c05b9611082586d47ea58390fc2f6de9449e"},
386 	{980456, 0x0013edb7a7f80000, parseDigest!"0121b1690738395e15fecba1410cd0bf13fde02225160cad148829f77e7b6c99"},
387 	{1140278, 0x0001f9f017e80000, parseDigest!"28ca7c74804b5075d4f5eeb11f0845d99f62e8ea3a42b9a05c7bd5f2fca619dd"},
388 	{2015542, 0x00097bf5d8180000, parseDigest!"6fe8291f427d48650a5f0f944305d3a2dbc649bd401d2655fc0bdd42e890ca5a"},
389 	{904752, 0x000e1863eff80000, parseDigest!"62af1f1eb3f588d18aff28473303cc4731fc3cafcc52ce818fee3c4c2820854d"},
390 	{713072, 0x001f3bb1b9b80000, parseDigest!"4bda9dc2e3031d004d87a5cc93fe5207c4b0843186481b8f31597dc6ffa1496c"},
391 	{675937, 0x001fec043c700000, parseDigest!"5299c8c5acec1b90bb020cd75718aab5e12abb9bf66291465fd10e6a823a8b4a"},
392 	{1525894, 0x000b1574b1500000, parseDigest!"2f238180e4ca1f7520a05f3d6059233926341090f9236ce677690c1823eccab3"},
393 	{1352720, 0x00018965f2e00000, parseDigest!"afd12f13286a3901430de816e62b85cc62468c059295ce5888b76b3af9028d84"},
394 	{811884, 0x00155628aa100000, parseDigest!"42d0cdb1ee7c48e552705d18e061abb70ae7957027db8ae8db37ec756472a70a"},
395 	{1282314, 0x001909a0a1400000, parseDigest!"819721c2457426eb4f4c7565050c44c32076a56fa9b4515a1c7796441730eb58"},
396 	{1093738, 0x0017f5d048880000, parseDigest!"5dddfa7a241b68f65d267744bdb082ee865f3c2f0d8b946ea0ee47868a01bbff"},
397 	{962003, 0x000b921f7ef80000, parseDigest!"0cb5c9ebba196b441c715c8d805f6e7143a81cd5b0d2c65c6aacf59ca9124af9"},
398 	{856384, 0x00030ce2d9400000, parseDigest!"7734b206d46f3f387e8661e81edf5b1a91ea681867beb5831c18aaa86632d7fb"},
399 	{533758, 0x0004435c53c00000, parseDigest!"4da778a25b72a9a0d53529eccfe2e5865a789116cb1800f470d8df685a8ab05d"},
400 	{1128303, 0x0000c48517800000, parseDigest!"08c6b0b38095b348d80300f0be4c5184d2744a17147c2cba5cc4315abf4c048f"},
401 	{800374, 0x000968473f900000, parseDigest!"820284d2c8fd243429674c996d8eb8d3450cbc32421f43113e980f516282c7bf"},
402 	{2453512, 0x001e197c92600000, parseDigest!"5fa870ed107c67704258e5e50abe67509fb73562caf77caa843b5f243425d853"},
403 	{665901, 0x00118c842cb80000, parseDigest!"deceec26163842fdef6560311c69bf8a9871a56e16d719e2c4b7e4d668ceb61f"},
404 	{1986074, 0x000ae6c868000000, parseDigest!"64cd64bf3c3bc389eb20df8310f0427d1c36ab2eaaf09e346bfa7f0453fc1a18"},
405 	{237392, 0x0000000000000001, parseDigest!"fcd567f5d866357a8e299fd5b2359bb2c8157c30395229c4e9b0a353944a7978"},
406 ];
407 
408 /// same as chunks1, but with a much bounded min/max size
409 TestChunk[] chunks4 =
410 [
411 	{1310720, 0x0008ec095c2a5e99, parseDigest!"5997b608c2a8b7d827bdfbd69dcffa86984c3a754aa4be92292f9f2ba0af4c22"},
412 	{852740, 0x000b98d4cdf00000, parseDigest!"8df8723fbcb4639c94cd831a3d8c8a6ff3ff746fe77769ee2244d1ef69bbe31e"},
413 	{1310720, 0x0008271396402fa4, parseDigest!"7c29efffbda477bbe56274573c3ba1b7e616eb76f3b9c92168a1f214aa557ba6"},
414 	{861939, 0x0015a25c2ef00000, parseDigest!"5080a1c4d3b538dce8c04c7738a043185d8c76e3251ae69e483b4b17d94dd7b9"},
415 	{1310720, 0x000fe23afea2e3be, parseDigest!"95507aa5fb821925e92ece579b5b50df0a6b311bd517937c4ae1a39e14cb79ea"},
416 	{1310720, 0x000e7cc6e6b093f0, parseDigest!"d2efa31fd050ca04232401b84a06fbde30bcc62f2108a5c0f52cca319952ab90"},
417 	{1310720, 0x00090b61ebb67fc7, parseDigest!"3a7e29b6a323b111ff5860953de004dfb0f69e5ccadb8831176cb0a6e3563262"},
418 	{1310720, 0x0008998f21ddfe08, parseDigest!"d25815c6d82e29d8d59fdde4bd85804526cd45b027106e9526d2bfdba6e8c7ae"},
419 	{1310720, 0x0010ebc94474a0db, parseDigest!"1ef10b00b9894102517e4a2d201877c9eba662dcbd22b2259f44c2c88436cbae"},
420 	{1310720, 0x00180e6341e54abb, parseDigest!"d7a9ebce0573e5a2f6d688a495044766b2323a7a8b73ad2a9f1bdead52429d83"},
421 	{1310720, 0x0010fa125ffa7553, parseDigest!"9803d879cf7cc8221eae87c2bf1ac9e6a4328929dfe32e5cb461663186402531"},
422 	{1310720, 0x000884f3046bc00f, parseDigest!"0d35b3a8949b673821f004eb845af80ae34f082dd4c20b7715c42aff21aeabad"},
423 	{1310720, 0x00154b752483d77e, parseDigest!"fb464beea91aee4eb91b7e5da58cc51f28a9ee74a61bc5ea40e703603e860fd8"},
424 	{1310720, 0x00068be82cf57932, parseDigest!"ab4ebe4015214c4f449782ded96011d3ac1be3daa682a1df2d1905104008f1b5"},
425 	{1310720, 0x000a1c2e12d6c196, parseDigest!"9198f8799fc64d2f6da6365a42f32c7b2d31ebfb23ccd957b705cfc24c69bc5e"},
426 	{1310720, 0x000f73c53cb2ae82, parseDigest!"2d05e2f054b3ce480e7ffa4490db250ed2126fe36b8d767fc51c6e2d3baff87b"},
427 	{915082, 0x000f55a1fea00000, parseDigest!"bcc88a08794bf3f194afa0c351836846eed95acfe7de41dc32d7f63c8fa02a4c"},
428 	{1310720, 0x0016d908ad8f8bbc, parseDigest!"214f5fdf4136b08499ae944d6507ae97ae0eee5e051e336182a18d0881251d96"},
429 	{1310720, 0x001740bda9456bd2, parseDigest!"1fcdad3287449b74a61ac626599dc9b0efb7bf01b599931f141f0ec340547c06"},
430 	{1310720, 0x000c1606d9a0d99f, parseDigest!"d88982bf999af432cfcc94bace5000308cbb1cc6cd2126286d6304ebd83c4765"},
431 	{837117, 0x00030ce2d9400000, parseDigest!"bf6c025df617eac82f643026d1d0295159a63a4e8dea41ed0341bd0469cbd7d3"},
432 	{1310720, 0x0008fb9882a96b1f, parseDigest!"b5a2a7d0dfaa4d66e6c693afe77037cd42a21deac68f21fdafdfdeeea881e6a1"},
433 	{1151715, 0x000968473f900000, parseDigest!"31882d126190910a0260abe5d10cb31b180e3e2d0d2c3e45981a78e1675738a5"},
434 	{1310720, 0x0015cadcf5b2d7ac, parseDigest!"7d878ceb8e6ef8a3adc8a892a6c32d8afc93b9ea6b8a55d2271230847fa4b925"},
435 	{1142792, 0x001e197c92600000, parseDigest!"96095ef256bb29934c024dbd0e3eab9c9dee2aa3308a7f4d77f20ae712ea57f1"},
436 	{1310720, 0x0005408e6716fd12, parseDigest!"2825054e95b14ceb9f6c1947e5f5da0b34ce898850e05390adfcd976a271c107"},
437 	{1310720, 0x000fb439ba516157, parseDigest!"eb4d3e5686f4cc810faa8bb356dbdd3242cb9acfd87fa7c2229c44f3f33aa714"},
438 	{267927, 0x0000000000000001, parseDigest!"19f1aa1f5c3b49452675de394497fe5943cd00ee5a61291c6c0ca92b01ca2312"},
439 ];
440 
441 import std.format : format;
442 import std.digest.sha : sha256Of;
443 
444 Chunker!R.Chunk[] testWithData(R)(ref Chunker!R chnker, TestChunk[] testChunks, bool checkDigest, bool copyData = false)
445 {
446 	Chunker!R.Chunk[] chunks;
447 
448 	foreach (i, chunk; testChunks)
449 	{
450 		auto c = chnker.front;
451 
452 		assert(c.data.length == chunk.length,
453 			format!"Length for chunk %d does not match: expected %d, got %d"
454 			(i, chunk.length, c.data.length));
455 
456 		assert(c.cut == chunk.cutFP,
457 			format!"Cut fingerprint for chunk %d/%d does not match: expected %016x, got %016x"
458 			(i, testChunks.length, chunk.cutFP, c.cut));
459 
460 		if (checkDigest)
461 		{
462 			auto digest = sha256Of(c.data);
463 			assert(chunk.digest == digest,
464 				format!"Digest fingerprint for chunk %d/%d does not match: expected %(%02x%), got %(%02x%)"
465 				(i, testChunks.length, chunk.digest, digest));
466 		}
467 
468 		if (copyData)
469 			c.data = c.data.dup;
470 		chunks ~= c;
471 		chnker.popFront();
472 	}
473 
474 	if (!chnker.empty)
475 		assert(false, "Wrong error returned after last chunk");
476 
477 	assert(chunks.length == testChunks.length,
478 		"Amounts of test and resulting chunks do not match");
479 
480 	return chunks;
481 }
482 
483 import std.array : replicate;
484 import std.range : chunks;
485 
486 @(`Chunker`) unittest
487 {
488 	// setup data source
489 	auto data = getRandom(23, 32*1024*1024);
490 	auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol);
491 	testWithData(ch, chunks1, true);
492 
493 	// setup nullbyte data source
494 	data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize);
495 	ch = data.chunks(chunkerBufSize).byCDChunk(testPol);
496 
497 	testWithData(ch, chunks2, true);
498 }
499 
500 @(`ChunkerWithArrayInput`) unittest
501 {
502 	// Test with all data in one array/chunk
503 	auto data = getRandom(23, 32*1024*1024);
504 	auto ch = [data].byCDChunk(testPol);
505 	testWithData(ch, chunks1, true);
506 
507 	data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize);
508 	ch = [data].byCDChunk(testPol);
509 	testWithData(ch, chunks2, true);
510 }
511 
512 @(`ChunkerWithFileInput`) unittest
513 {
514 	// Test with File.byChunk
515 	import std.stdio : File;
516 	import std.file : remove;
517 	auto data = getRandom(23, 32*1024*1024);
518 	File("test.bin", "wb").rawWrite(data);
519 	scope(exit) remove("test.bin");
520 	auto ch = File("test.bin", "rb").byChunk(chunkerBufSize).byCDChunk(testPol);
521 	testWithData(ch, chunks1, true);
522 
523 	data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize);
524 	File("test.bin", "wb").rawWrite(data);
525 	ch = File("test.bin", "rb").byChunk(chunkerBufSize).byCDChunk(testPol);
526 	testWithData(ch, chunks2, true);
527 }
528 
529 @(`ChunkerWithCustomAverageBits`) unittest
530 {
531 	auto data = getRandom(23, 32*1024*1024);
532 
533 	// sligthly decrease averageBits to get more chunks
534 	auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol, 19);
535 
536 	testWithData(ch, chunks3, true);
537 }
538 
539 @(`ChunkerWithCustomMinMax`) unittest
540 {
541 	auto data = getRandom(23, 32*1024*1024);
542 	auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol,
543 		20,
544 		(1 << 20) - (1 << 18),
545 		(1 << 20) + (1 << 18));
546 
547 	testWithData(ch, chunks4, true);
548 }
549 
550 // Ensure the min/max bounds are strictly followed.
551 // (Check for off-by-one errors.)
552 @(`ChunkerMinMaxBounds`) unittest
553 {
554 	auto data = getRandom(23, 64*1024);
555 	auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol,
556 		7,
557 		RabinHash.windowSize * 2 - 2,
558 		RabinHash.windowSize * 2 + 2);
559 
560 	size_t[] sizes;
561 	foreach (chunk; ch)
562 		if (chunk.data.length != RabinHash.windowSize * 2 + 2)
563 			sizes ~= chunk.data.length;
564 
565 	assert(sizes == [
566 			126, 129, 126, 127, 128, 128, 126, 126, 127, 129,
567 			129, 128, 127, 128, 126, 129, 126, 129, 128, 127, 67]);
568 }
569 
570 import std.stdio : stderr;
571 
572 @(`ChunkerWithRandomPolynomial`) unittest
573 {
574 	// setup data source
575 	auto data = getRandom(23, 32*1024*1024);
576 
577 	// generate a new random polynomial
578 	import std.datetime.systime : Clock, SysTime;
579 	auto start = Clock.currTime();
580 	auto p = Pol.getRandom();
581 	stderr.writefln!"generating random polynomial took %s"(Clock.currTime() - start);
582 
583 	start = Clock.currTime();
584 	auto ch = data.chunks(chunkerBufSize).byCDChunk(p);
585 	stderr.writefln!"creating chunker took %s"(Clock.currTime() - start);
586 
587 	// make sure that first chunk is different
588 	auto c = ch.front;
589 
590 	assert(c.cut != chunks1[0].cutFP,
591 		"Cut point is the same");
592 
593 	assert(c.data.length != chunks1[0].length,
594 		"Length is the same");
595 
596 	assert(sha256Of(c.data) != chunks1[0].digest,
597 		"Digest is the same");
598 }
599 
600 @(`ChunkerWithoutHash`) unittest
601 {
602 	// setup data source
603 	auto data = getRandom(23, 32*1024*1024);
604 
605 	auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol);
606 	auto chunks = testWithData(ch, chunks1, false, true);
607 
608 	// test reader
609 	size_t pos = 0;
610 	foreach (i, c; chunks)
611 	{
612 		assert(c.data.length == chunks1[i].length,
613 			format!"reader returned wrong number of bytes: expected %d, got %d"
614 			(chunks1[i].length, c.data.length));
615 
616 		assert(data[pos .. pos+c.data.length] == c.data,
617 			format!"invalid data for chunk returned: expected %(%02x%), got %(%02x%)"
618 			(data[pos .. pos+c.data.length], c.data));
619 		pos += c.data.length;
620 	}
621 
622 	// setup nullbyte data source
623 	data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize);
624 	ch = data.chunks(chunkerBufSize).byCDChunk(testPol);
625 
626 	testWithData(ch, chunks2, false);
627 }
628 
629 version (benchmarkChunker)
630 {
631 	import chunker.internal.benchmark;
632 	mixin BenchmarkThisModule;
633 
634 	void _benchmarkChunker(bool checkDigest)
635 	{
636 		auto size = 32 * 1024 * 1024;
637 		auto buf = new ubyte[chunkDefaultMaxSize];
638 		auto data = [getRandom(23, size)];
639 
640 		// b.SetBytes(long(size));
641 
642 		int chunks;
643 		Benchmark.benchmark({
644 			chunks = 0;
645 
646 			auto ch = data.byCDChunk(testPol, buf);
647 
648 			auto cur = 0;
649 			while (!ch.empty)
650 			{
651 				auto chunk = ch.front;
652 
653 				assert(chunk.data.length == chunks1[cur].length,
654 					format!"wrong chunk length, want %d, got %d"
655 					(chunks1[cur].length, chunk.data.length));
656 
657 				assert(chunk.cut == chunks1[cur].cutFP,
658 					format!"wrong cut fingerprint, want 0x%x, got 0x%x"
659 					(chunks1[cur].cutFP, chunk.cut));
660 
661 				if (checkDigest)
662 				{
663 					auto h = sha256Of(chunk.data);
664 					assert(h == chunks1[cur].digest,
665 						format!"wrong digest, want %(%02x%), got %(%02x%)"
666 						(chunks1[cur].digest, h));
667 				}
668 
669 				chunks++;
670 				cur++;
671 				ch.popFront();
672 			}
673 		});
674 
675 		stderr.writefln!"%d chunks, average chunk size: %d bytes"(chunks, size/chunks);
676 	}
677 
678 	void benchmarkChunkerWithSHA256()
679 	{
680 		_benchmarkChunker(true);
681 	}
682 
683 	void benchmarkChunker()
684 	{
685 		_benchmarkChunker(false);
686 	}
687 
688 	void benchmarkNewChunker()
689 	{
690 		auto p = Pol.getRandom();
691 		ubyte[][] buf;
692 
693 		Benchmark.benchmark({
694 			buf.byCDChunk(p);
695 		});
696 	}
697 }