1 /** 2 Thin package implements Content Defined Chunking (CDC) based on a rolling 3 Rabin Checksum. 4 5 Choosing a Random Irreducible Polynomial 6 ======================================== 7 8 The function `Pol.getRandom()` returns a new random polynomial of degree 53 9 for use with the chunker. The degree 53 is chosen because it is the largest 10 prime below 64-8 = 56, so that the top 8 bits of an ulong can be used for 11 optimising calculations in the chunker. 12 13 A random polynomial is chosen selecting 64 random bits, masking away bits 14 64..54 and setting bit 53 to one (otherwise the polynomial is not of the 15 desired degree) and bit 0 to one (otherwise the polynomial is trivially 16 reducible), so that 51 bits are chosen at random. 17 18 This process is repeated until `Pol.irreducible` returns `true`, then this 19 polynomials is returned. If this doesn't happen after 1 million tries, the 20 function returns an error. The probability for selecting an irreducible 21 polynomial at random is about 7.5% ( (2^53-2)/53 / 2^51), so the probability 22 that no irreducible polynomial has been found after 100 tries is lower than 23 0.04%. 24 25 Verifying Irreducible Polynomials 26 ================================= 27 28 During development the results have been verified using the computational 29 discrete algebra system GAP, which can be obtained from the website at 30 http://www.gap-system.org/. 31 32 For filtering a given list of polynomials in hexadecimal coefficient notation, 33 the following script can be used: 34 35 ``` 36 # create x over F_2 = GF(2); 37 x := Indeterminate(GF(2), "x"); 38 39 # test if polynomial is irreducible, i.e. the number of factors is one; 40 IrredPoly := function (poly); 41 return (Length(Factors(poly)) = 1); 42 end;; 43 44 # create a polynomial in x from the hexadecimal representation of the; 45 # coefficients; 46 Hex2Poly := function (s); 47 return ValuePol(CoefficientsQadic(IntHexString(s), 2), x); 48 end;; 49 50 # list of candidates, in hex; 51 candidates := [ "3DA3358B4DC173" ]; 52 53 # create real polynomials; 54 L := List(candidates, Hex2Poly); 55 56 # filter and display the list of irreducible polynomials contained in L; 57 Display(Filtered(L, x -> (IrredPoly(x)))); 58 ``` 59 60 All irreducible polynomials from the list are written to the output. 61 62 Background Literature 63 ===================== 64 65 An introduction to Rabin Fingerprints/Checksums can be found in the following articles: 66 67 Michael O. Rabin (1981): "Fingerprinting by Random Polynomials" 68 http://www.xmailserver.org/rabin.pdf 69 70 Ross N. Williams (1993): "A Painless Guide to CRC Error Detection Algorithms" 71 http://www.zlib.net/crc_v3.txt 72 73 Andrei Z. Broder (1993): "Some Applications of Rabin's Fingerprinting Method" 74 http://www.xmailserver.org/rabin_apps.pdf 75 76 Shuhong Gao and Daniel Panario (1997): "Tests and Constructions of Irreducible Polynomials over Finite Fields" 77 http://www.math.clemson.edu/~sgao/papers/GP97a.pdf 78 79 Andrew Kadatch, Bob Jenkins (2007): "Everything we know about CRC but afraid to forget" 80 http://crcutil.googlecode.com/files/crc-doc.1.0.pdf 81 82 License: 83 84 Copyright 2014 Alexander Neumann. All rights reserved. 85 Use of this source code is governed by a BSD-style 86 license that can be found in the LICENSE file. 87 */ 88 89 module chunker; 90 91 import chunker.polynomials; 92 import chunker.rabin; 93 94 // Array support 95 import std.range.primitives : empty, front, popFront; 96 97 private enum size_t kiB = 1024; 98 private enum size_t miB = 1024 * kiB; 99 100 /// Aim to create chunks of 20 bits or about 1MiB on average. 101 public enum size_t chunkDefaultAverageBits = 20; 102 /// Default minimal size of a chunk. 103 public enum size_t chunkDefaultMinSize = 512 * kiB; 104 /// Default maximal size of a chunk. 105 public enum size_t chunkDefaultMaxSize = 8 * miB; 106 107 108 /// Splits content with Rabin Fingerprints. 109 struct Chunker(R) 110 { 111 /// One content-dependent chunk of bytes whose end was cut when 112 /// the Rabin Fingerprint had the value stored in `cut`. 113 struct Chunk 114 { 115 /// Contents of the chunk. 116 ubyte[] data; 117 /// Value of the rolling hash when the chunk was cut. 118 ulong cut; 119 } 120 121 private // internal state 122 { 123 /// Current hash internal state. 124 RabinHash hash; 125 126 /// Buffer used to receive and keep read data in. 127 ubyte[] buf; 128 129 /// Buffer used to copy chunk data to. 130 ubyte[] cbuf; 131 /// Number of bytes within the current chunk. 132 size_t count; 133 } 134 135 private // configuration 136 { 137 /// Minimum and maximum chunk sizes, as configured. 138 size_t minSize, maxSize; 139 140 /// Hash mask used to decide chunk boundaries. 141 /// By default `(1 << 20) - 1`, or configured from `averageBits`. 142 ulong splitmask; 143 144 /// Input data source. 145 R source; 146 } 147 148 /// Probably use the `byCDChunk` convenience function instead of this constructor directly. 149 /// Note that `Chunker` will `popFront` the input range during construction. 150 public this(R source, Pol pol, uint averageBits, size_t minSize, size_t maxSize, ubyte[] cbuf) 151 { 152 this.cbuf = cbuf ? cbuf : new ubyte[maxSize]; 153 this.source = source; 154 this.minSize = minSize; 155 this.maxSize = maxSize; 156 this.splitmask = (1L << averageBits) - 1; 157 this.hash = RabinHash(pol); 158 if (this.source.empty) 159 empty = true; 160 else 161 { 162 this.buf = this.source.front; 163 popFront(); 164 } 165 } 166 167 Chunk front; /// Current chunk. 168 bool empty; /// `true` when there are no more chunks (input range is empty). 169 170 /// Updates `front` to contain the position and data of the next 171 /// chunk. Note that `Chunker` reuses the chunk buffer, so the 172 /// previous contents of `front.data` will be overwritten. 173 public void popFront() 174 { 175 assert(!empty); 176 177 hash.start(); 178 hash.slide(1); 179 count = 0; 180 auto minSize = this.minSize; 181 auto maxSize = this.maxSize; 182 // do not start a new chunk unless at least minSize bytes have been read 183 auto pre = minSize - RabinHash.windowSize; 184 auto buf = this.buf; 185 scope(success) this.buf = buf; 186 187 void copyBytes(size_t numBytes) 188 { 189 auto bytes = buf[0 .. numBytes]; 190 buf = buf[numBytes .. $]; 191 auto newLen = count + bytes.length; 192 if (cbuf.length < newLen) 193 cbuf.length = newLen; 194 cbuf[count .. newLen] = bytes[]; 195 count += numBytes; 196 } 197 198 while (true) 199 { 200 if (buf.length == 0) 201 { 202 if (source.empty) 203 { 204 // Last chunk has been read. 205 empty = true; 206 return; 207 } 208 209 source.popFront(); 210 211 if (source.empty) 212 { 213 // There are no more bytes to buffer, so this was 214 // the last chunk. Return current chunk, if any 215 // bytes have been processed. 216 if (count > 0) 217 break; 218 else 219 { 220 empty = true; 221 return; 222 } 223 } 224 225 buf = source.front; 226 } 227 228 // check if bytes have to be dismissed before starting a new chunk 229 if (pre > 0) 230 { 231 if (pre > buf.length) 232 { 233 pre -= buf.length; 234 copyBytes(buf.length); 235 continue; 236 } 237 238 copyBytes(pre); 239 pre = 0; 240 } 241 242 if (count < minSize) 243 { 244 auto warmUp = minSize - count; 245 if (warmUp > buf.length) 246 warmUp = buf.length; 247 hash.put(buf[0 .. warmUp]); 248 copyBytes(warmUp); 249 } 250 251 auto toWrite = buf.length; 252 if (count + toWrite > maxSize) 253 toWrite = maxSize - count; 254 auto written = hash.putUntil(buf[0 .. toWrite], splitmask); 255 copyBytes(written); 256 if (buf.length) // stopped early due to maxSize or mask match 257 break; 258 } 259 front = Chunk(cbuf[0 .. count], hash.peek()); 260 } 261 } 262 263 /// Constructs a new `Chunker` based on polynomial `pol` that 264 /// reads from `source`. 265 /// Params: 266 /// source = An input range of chunks of bytes, such as that 267 /// returned by `File.byChunk` or `ubyte[].chunks` or simply 268 /// `ubyte[].only`. 269 /// pol = An irreducible polynomial from `F_2[X]`. Use 270 /// `Pol.getRandom()` to generate a random one. 271 /// averageBits = Allows to control the frequency of chunk 272 /// discovery: the lower `averageBits`, the higher amount of 273 /// chunks will be identified. The default value is 20 bits, 274 /// so chunks will be of 1MiB size on average. 275 /// minSize = Minimum size of emitted chunks. Chunk boundaries that 276 /// occur less than `minSize` bytes from the chunk start are 277 /// ignored. 278 /// maxSize = Maximum size of emitted chunks. If a chunk boundary is 279 /// not encountered after `maxSize` bytes from the chunk start, it 280 /// is forcibly split at that point. 281 /// cbuf = A buffer to store chunk data. When `null` (default), a 282 /// new buffer is allocated on construction of length `maxSize`. 283 /// Returns: 284 /// An instance of `Chunker`, an input range of `Chunker.Chunk`, 285 /// which contains the chunk data, and the fingerprint value when it 286 /// was cut. 287 Chunker!R byCDChunk(R)( 288 R source, 289 Pol pol, 290 uint averageBits = chunkDefaultAverageBits, 291 size_t minSize = chunkDefaultMinSize, 292 size_t maxSize = chunkDefaultMaxSize, 293 ubyte[] cbuf = null) 294 { 295 return Chunker!R(source, pol, averageBits, minSize, maxSize, cbuf); 296 } 297 298 /// ditto 299 Chunker!R byCDChunk(R)(R source, Pol pol, ubyte[] cbuf) 300 { 301 return Chunker!R(source, pol, chunkDefaultAverageBits, chunkDefaultMinSize, chunkDefaultMaxSize, cbuf); 302 } 303 304 // ----------------------------------------------------------------------------- 305 version(chunkerUnittest) version = test; 306 version(benchmarkChunker) version = test; 307 version(test): 308 private: 309 310 import chunker.internal.helpers; 311 312 enum chunkerBufSize = 512 * kiB; 313 314 template parseDigest(string s) 315 { 316 import std.conv : hexString; 317 enum ubyte[] parseDigest = cast(ubyte[])hexString!s; 318 } 319 320 struct TestChunk 321 { 322 public size_t length; 323 public ulong cutFP; 324 public ubyte[] digest; 325 } 326 327 /// polynomial used for all the tests below 328 enum testPol = Pol(0x3DA3358B4DC173); 329 330 /// created for 32MB of random data out of math/rand's Uint32() seeded by 331 /// constant 23 332 // 333 /// chunking configuration: 334 /// window size 64, avg chunksize 1<<20, min chunksize 1<<19, max chunksize 1<<23 335 /// polynom 0x3DA3358B4DC173 336 TestChunk[] chunks1 = 337 [ 338 {2163460, 0x000b98d4cdf00000, parseDigest!"4b94cb2cf293855ea43bf766731c74969b91aa6bf3c078719aabdd19860d590d"}, 339 {643703, 0x000d4e8364d00000, parseDigest!"5727a63c0964f365ab8ed2ccf604912f2ea7be29759a2b53ede4d6841e397407"}, 340 {1528956, 0x0015a25c2ef00000, parseDigest!"a73759636a1e7a2758767791c69e81b69fb49236c6929e5d1b654e06e37674ba"}, 341 {1955808, 0x00102a8242e00000, parseDigest!"c955fb059409b25f07e5ae09defbbc2aadf117c97a3724e06ad4abd2787e6824"}, 342 {2222372, 0x00045da878000000, parseDigest!"6ba5e9f7e1b310722be3627716cf469be941f7f3e39a4c3bcefea492ec31ee56"}, 343 {2538687, 0x00198a8179900000, parseDigest!"8687937412f654b5cfe4a82b08f28393a0c040f77c6f95e26742c2fc4254bfde"}, 344 {609606, 0x001d4e8d17100000, parseDigest!"5da820742ff5feb3369112938d3095785487456f65a8efc4b96dac4be7ebb259"}, 345 {1205738, 0x000a7204dd600000, parseDigest!"cc70d8fad5472beb031b1aca356bcab86c7368f40faa24fe5f8922c6c268c299"}, 346 {959742, 0x00183e71e1400000, parseDigest!"4065bdd778f95676c92b38ac265d361f81bff17d76e5d9452cf985a2ea5a4e39"}, 347 {4036109, 0x001fec043c700000, parseDigest!"b9cf166e75200eb4993fc9b6e22300a6790c75e6b0fc8f3f29b68a752d42f275"}, 348 {1525894, 0x000b1574b1500000, parseDigest!"2f238180e4ca1f7520a05f3d6059233926341090f9236ce677690c1823eccab3"}, 349 {1352720, 0x00018965f2e00000, parseDigest!"afd12f13286a3901430de816e62b85cc62468c059295ce5888b76b3af9028d84"}, 350 {811884, 0x00155628aa100000, parseDigest!"42d0cdb1ee7c48e552705d18e061abb70ae7957027db8ae8db37ec756472a70a"}, 351 {1282314, 0x001909a0a1400000, parseDigest!"819721c2457426eb4f4c7565050c44c32076a56fa9b4515a1c7796441730eb58"}, 352 {1318021, 0x001cceb980000000, parseDigest!"842eb53543db55bacac5e25cb91e43cc2e310fe5f9acc1aee86bdf5e91389374"}, 353 {948640, 0x0011f7a470a00000, parseDigest!"b8e36bf7019bb96ac3fb7867659d2167d9d3b3148c09fe0de45850b8fe577185"}, 354 {645464, 0x00030ce2d9400000, parseDigest!"5584bd27982191c3329f01ed846bfd266e96548dfa87018f745c33cfc240211d"}, 355 {533758, 0x0004435c53c00000, parseDigest!"4da778a25b72a9a0d53529eccfe2e5865a789116cb1800f470d8df685a8ab05d"}, 356 {1128303, 0x0000c48517800000, parseDigest!"08c6b0b38095b348d80300f0be4c5184d2744a17147c2cba5cc4315abf4c048f"}, 357 {800374, 0x000968473f900000, parseDigest!"820284d2c8fd243429674c996d8eb8d3450cbc32421f43113e980f516282c7bf"}, 358 {2453512, 0x001e197c92600000, parseDigest!"5fa870ed107c67704258e5e50abe67509fb73562caf77caa843b5f243425d853"}, 359 {2651975, 0x000ae6c868000000, parseDigest!"181347d2bbec32bef77ad5e9001e6af80f6abcf3576549384d334ee00c1988d8"}, 360 {237392, 0x0000000000000001, parseDigest!"fcd567f5d866357a8e299fd5b2359bb2c8157c30395229c4e9b0a353944a7978"}, 361 ]; 362 363 /// test if nullbytes are correctly split, even if length is a multiple of minSize. 364 TestChunk[] chunks2 = 365 [ 366 {chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"}, 367 {chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"}, 368 {chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"}, 369 {chunkDefaultMinSize, 0, parseDigest!"07854d2fef297a06ba81685e660c332de36d5d18d546927d30daad6d7fda1541"}, 370 ]; 371 372 /// the same as chunks1, but avg chunksize is 1<<19 373 TestChunk[] chunks3 = 374 [ 375 {1491586, 0x00023e586ea80000, parseDigest!"4c008237df602048039287427171cef568a6cb965d1b5ca28dc80504a24bb061"}, 376 {671874, 0x000b98d4cdf00000, parseDigest!"fa8a42321b90c3d4ce9dd850562b2fd0c0fe4bdd26cf01a24f22046a224225d3"}, 377 {643703, 0x000d4e8364d00000, parseDigest!"5727a63c0964f365ab8ed2ccf604912f2ea7be29759a2b53ede4d6841e397407"}, 378 {1284146, 0x0012b527e4780000, parseDigest!"16d04cafecbeae9eaedd49da14c7ad7cdc2b1cc8569e5c16c32c9fb045aa899a"}, 379 {823366, 0x000d1d6752180000, parseDigest!"48662c118514817825ad4761e8e2e5f28f9bd8281b07e95dcafc6d02e0aa45c3"}, 380 {810134, 0x0016071b6e180000, parseDigest!"f629581aa05562f97f2c359890734c8574c5575da32f9289c5ba70bfd05f3f46"}, 381 {567118, 0x00102a8242e00000, parseDigest!"d4f0797c56c60d01bac33bfd49957a4816b6c067fc155b026de8a214cab4d70a"}, 382 {821315, 0x001b3e42c8180000, parseDigest!"8ebd0fd5db0293bd19140da936eb8b1bbd3cd6ffbec487385b956790014751ca"}, 383 {1401057, 0x00045da878000000, parseDigest!"001360af59adf4871ef138cfa2bb49007e86edaf5ac2d6f0b3d3014510991848"}, 384 {2311122, 0x0005cbd885380000, parseDigest!"8276d489b566086d9da95dc5c5fe6fc7d72646dd3308ced6b5b6ddb8595f0aa1"}, 385 {608723, 0x001cfcd86f280000, parseDigest!"518db33ba6a79d4f3720946f3785c05b9611082586d47ea58390fc2f6de9449e"}, 386 {980456, 0x0013edb7a7f80000, parseDigest!"0121b1690738395e15fecba1410cd0bf13fde02225160cad148829f77e7b6c99"}, 387 {1140278, 0x0001f9f017e80000, parseDigest!"28ca7c74804b5075d4f5eeb11f0845d99f62e8ea3a42b9a05c7bd5f2fca619dd"}, 388 {2015542, 0x00097bf5d8180000, parseDigest!"6fe8291f427d48650a5f0f944305d3a2dbc649bd401d2655fc0bdd42e890ca5a"}, 389 {904752, 0x000e1863eff80000, parseDigest!"62af1f1eb3f588d18aff28473303cc4731fc3cafcc52ce818fee3c4c2820854d"}, 390 {713072, 0x001f3bb1b9b80000, parseDigest!"4bda9dc2e3031d004d87a5cc93fe5207c4b0843186481b8f31597dc6ffa1496c"}, 391 {675937, 0x001fec043c700000, parseDigest!"5299c8c5acec1b90bb020cd75718aab5e12abb9bf66291465fd10e6a823a8b4a"}, 392 {1525894, 0x000b1574b1500000, parseDigest!"2f238180e4ca1f7520a05f3d6059233926341090f9236ce677690c1823eccab3"}, 393 {1352720, 0x00018965f2e00000, parseDigest!"afd12f13286a3901430de816e62b85cc62468c059295ce5888b76b3af9028d84"}, 394 {811884, 0x00155628aa100000, parseDigest!"42d0cdb1ee7c48e552705d18e061abb70ae7957027db8ae8db37ec756472a70a"}, 395 {1282314, 0x001909a0a1400000, parseDigest!"819721c2457426eb4f4c7565050c44c32076a56fa9b4515a1c7796441730eb58"}, 396 {1093738, 0x0017f5d048880000, parseDigest!"5dddfa7a241b68f65d267744bdb082ee865f3c2f0d8b946ea0ee47868a01bbff"}, 397 {962003, 0x000b921f7ef80000, parseDigest!"0cb5c9ebba196b441c715c8d805f6e7143a81cd5b0d2c65c6aacf59ca9124af9"}, 398 {856384, 0x00030ce2d9400000, parseDigest!"7734b206d46f3f387e8661e81edf5b1a91ea681867beb5831c18aaa86632d7fb"}, 399 {533758, 0x0004435c53c00000, parseDigest!"4da778a25b72a9a0d53529eccfe2e5865a789116cb1800f470d8df685a8ab05d"}, 400 {1128303, 0x0000c48517800000, parseDigest!"08c6b0b38095b348d80300f0be4c5184d2744a17147c2cba5cc4315abf4c048f"}, 401 {800374, 0x000968473f900000, parseDigest!"820284d2c8fd243429674c996d8eb8d3450cbc32421f43113e980f516282c7bf"}, 402 {2453512, 0x001e197c92600000, parseDigest!"5fa870ed107c67704258e5e50abe67509fb73562caf77caa843b5f243425d853"}, 403 {665901, 0x00118c842cb80000, parseDigest!"deceec26163842fdef6560311c69bf8a9871a56e16d719e2c4b7e4d668ceb61f"}, 404 {1986074, 0x000ae6c868000000, parseDigest!"64cd64bf3c3bc389eb20df8310f0427d1c36ab2eaaf09e346bfa7f0453fc1a18"}, 405 {237392, 0x0000000000000001, parseDigest!"fcd567f5d866357a8e299fd5b2359bb2c8157c30395229c4e9b0a353944a7978"}, 406 ]; 407 408 /// same as chunks1, but with a much bounded min/max size 409 TestChunk[] chunks4 = 410 [ 411 {1310720, 0x0008ec095c2a5e99, parseDigest!"5997b608c2a8b7d827bdfbd69dcffa86984c3a754aa4be92292f9f2ba0af4c22"}, 412 {852740, 0x000b98d4cdf00000, parseDigest!"8df8723fbcb4639c94cd831a3d8c8a6ff3ff746fe77769ee2244d1ef69bbe31e"}, 413 {1310720, 0x0008271396402fa4, parseDigest!"7c29efffbda477bbe56274573c3ba1b7e616eb76f3b9c92168a1f214aa557ba6"}, 414 {861939, 0x0015a25c2ef00000, parseDigest!"5080a1c4d3b538dce8c04c7738a043185d8c76e3251ae69e483b4b17d94dd7b9"}, 415 {1310720, 0x000fe23afea2e3be, parseDigest!"95507aa5fb821925e92ece579b5b50df0a6b311bd517937c4ae1a39e14cb79ea"}, 416 {1310720, 0x000e7cc6e6b093f0, parseDigest!"d2efa31fd050ca04232401b84a06fbde30bcc62f2108a5c0f52cca319952ab90"}, 417 {1310720, 0x00090b61ebb67fc7, parseDigest!"3a7e29b6a323b111ff5860953de004dfb0f69e5ccadb8831176cb0a6e3563262"}, 418 {1310720, 0x0008998f21ddfe08, parseDigest!"d25815c6d82e29d8d59fdde4bd85804526cd45b027106e9526d2bfdba6e8c7ae"}, 419 {1310720, 0x0010ebc94474a0db, parseDigest!"1ef10b00b9894102517e4a2d201877c9eba662dcbd22b2259f44c2c88436cbae"}, 420 {1310720, 0x00180e6341e54abb, parseDigest!"d7a9ebce0573e5a2f6d688a495044766b2323a7a8b73ad2a9f1bdead52429d83"}, 421 {1310720, 0x0010fa125ffa7553, parseDigest!"9803d879cf7cc8221eae87c2bf1ac9e6a4328929dfe32e5cb461663186402531"}, 422 {1310720, 0x000884f3046bc00f, parseDigest!"0d35b3a8949b673821f004eb845af80ae34f082dd4c20b7715c42aff21aeabad"}, 423 {1310720, 0x00154b752483d77e, parseDigest!"fb464beea91aee4eb91b7e5da58cc51f28a9ee74a61bc5ea40e703603e860fd8"}, 424 {1310720, 0x00068be82cf57932, parseDigest!"ab4ebe4015214c4f449782ded96011d3ac1be3daa682a1df2d1905104008f1b5"}, 425 {1310720, 0x000a1c2e12d6c196, parseDigest!"9198f8799fc64d2f6da6365a42f32c7b2d31ebfb23ccd957b705cfc24c69bc5e"}, 426 {1310720, 0x000f73c53cb2ae82, parseDigest!"2d05e2f054b3ce480e7ffa4490db250ed2126fe36b8d767fc51c6e2d3baff87b"}, 427 {915082, 0x000f55a1fea00000, parseDigest!"bcc88a08794bf3f194afa0c351836846eed95acfe7de41dc32d7f63c8fa02a4c"}, 428 {1310720, 0x0016d908ad8f8bbc, parseDigest!"214f5fdf4136b08499ae944d6507ae97ae0eee5e051e336182a18d0881251d96"}, 429 {1310720, 0x001740bda9456bd2, parseDigest!"1fcdad3287449b74a61ac626599dc9b0efb7bf01b599931f141f0ec340547c06"}, 430 {1310720, 0x000c1606d9a0d99f, parseDigest!"d88982bf999af432cfcc94bace5000308cbb1cc6cd2126286d6304ebd83c4765"}, 431 {837117, 0x00030ce2d9400000, parseDigest!"bf6c025df617eac82f643026d1d0295159a63a4e8dea41ed0341bd0469cbd7d3"}, 432 {1310720, 0x0008fb9882a96b1f, parseDigest!"b5a2a7d0dfaa4d66e6c693afe77037cd42a21deac68f21fdafdfdeeea881e6a1"}, 433 {1151715, 0x000968473f900000, parseDigest!"31882d126190910a0260abe5d10cb31b180e3e2d0d2c3e45981a78e1675738a5"}, 434 {1310720, 0x0015cadcf5b2d7ac, parseDigest!"7d878ceb8e6ef8a3adc8a892a6c32d8afc93b9ea6b8a55d2271230847fa4b925"}, 435 {1142792, 0x001e197c92600000, parseDigest!"96095ef256bb29934c024dbd0e3eab9c9dee2aa3308a7f4d77f20ae712ea57f1"}, 436 {1310720, 0x0005408e6716fd12, parseDigest!"2825054e95b14ceb9f6c1947e5f5da0b34ce898850e05390adfcd976a271c107"}, 437 {1310720, 0x000fb439ba516157, parseDigest!"eb4d3e5686f4cc810faa8bb356dbdd3242cb9acfd87fa7c2229c44f3f33aa714"}, 438 {267927, 0x0000000000000001, parseDigest!"19f1aa1f5c3b49452675de394497fe5943cd00ee5a61291c6c0ca92b01ca2312"}, 439 ]; 440 441 import std.format : format; 442 import std.digest.sha : sha256Of; 443 444 Chunker!R.Chunk[] testWithData(R)(ref Chunker!R chnker, TestChunk[] testChunks, bool checkDigest, bool copyData = false) 445 { 446 Chunker!R.Chunk[] chunks; 447 448 foreach (i, chunk; testChunks) 449 { 450 auto c = chnker.front; 451 452 assert(c.data.length == chunk.length, 453 format!"Length for chunk %d does not match: expected %d, got %d" 454 (i, chunk.length, c.data.length)); 455 456 assert(c.cut == chunk.cutFP, 457 format!"Cut fingerprint for chunk %d/%d does not match: expected %016x, got %016x" 458 (i, testChunks.length, chunk.cutFP, c.cut)); 459 460 if (checkDigest) 461 { 462 auto digest = sha256Of(c.data); 463 assert(chunk.digest == digest, 464 format!"Digest fingerprint for chunk %d/%d does not match: expected %(%02x%), got %(%02x%)" 465 (i, testChunks.length, chunk.digest, digest)); 466 } 467 468 if (copyData) 469 c.data = c.data.dup; 470 chunks ~= c; 471 chnker.popFront(); 472 } 473 474 if (!chnker.empty) 475 assert(false, "Wrong error returned after last chunk"); 476 477 assert(chunks.length == testChunks.length, 478 "Amounts of test and resulting chunks do not match"); 479 480 return chunks; 481 } 482 483 import std.array : replicate; 484 import std.range : chunks; 485 486 @(`Chunker`) unittest 487 { 488 // setup data source 489 auto data = getRandom(23, 32*1024*1024); 490 auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol); 491 testWithData(ch, chunks1, true); 492 493 // setup nullbyte data source 494 data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize); 495 ch = data.chunks(chunkerBufSize).byCDChunk(testPol); 496 497 testWithData(ch, chunks2, true); 498 } 499 500 @(`ChunkerWithArrayInput`) unittest 501 { 502 // Test with all data in one array/chunk 503 auto data = getRandom(23, 32*1024*1024); 504 auto ch = [data].byCDChunk(testPol); 505 testWithData(ch, chunks1, true); 506 507 data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize); 508 ch = [data].byCDChunk(testPol); 509 testWithData(ch, chunks2, true); 510 } 511 512 @(`ChunkerWithFileInput`) unittest 513 { 514 // Test with File.byChunk 515 import std.stdio : File; 516 import std.file : remove; 517 auto data = getRandom(23, 32*1024*1024); 518 File("test.bin", "wb").rawWrite(data); 519 scope(exit) remove("test.bin"); 520 auto ch = File("test.bin", "rb").byChunk(chunkerBufSize).byCDChunk(testPol); 521 testWithData(ch, chunks1, true); 522 523 data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize); 524 File("test.bin", "wb").rawWrite(data); 525 ch = File("test.bin", "rb").byChunk(chunkerBufSize).byCDChunk(testPol); 526 testWithData(ch, chunks2, true); 527 } 528 529 @(`ChunkerWithCustomAverageBits`) unittest 530 { 531 auto data = getRandom(23, 32*1024*1024); 532 533 // sligthly decrease averageBits to get more chunks 534 auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol, 19); 535 536 testWithData(ch, chunks3, true); 537 } 538 539 @(`ChunkerWithCustomMinMax`) unittest 540 { 541 auto data = getRandom(23, 32*1024*1024); 542 auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol, 543 20, 544 (1 << 20) - (1 << 18), 545 (1 << 20) + (1 << 18)); 546 547 testWithData(ch, chunks4, true); 548 } 549 550 // Ensure the min/max bounds are strictly followed. 551 // (Check for off-by-one errors.) 552 @(`ChunkerMinMaxBounds`) unittest 553 { 554 auto data = getRandom(23, 64*1024); 555 auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol, 556 7, 557 RabinHash.windowSize * 2 - 2, 558 RabinHash.windowSize * 2 + 2); 559 560 size_t[] sizes; 561 foreach (chunk; ch) 562 if (chunk.data.length != RabinHash.windowSize * 2 + 2) 563 sizes ~= chunk.data.length; 564 565 assert(sizes == [ 566 126, 129, 126, 127, 128, 128, 126, 126, 127, 129, 567 129, 128, 127, 128, 126, 129, 126, 129, 128, 127, 67]); 568 } 569 570 import std.stdio : stderr; 571 572 @(`ChunkerWithRandomPolynomial`) unittest 573 { 574 // setup data source 575 auto data = getRandom(23, 32*1024*1024); 576 577 // generate a new random polynomial 578 import std.datetime.systime : Clock, SysTime; 579 auto start = Clock.currTime(); 580 auto p = Pol.getRandom(); 581 stderr.writefln!"generating random polynomial took %s"(Clock.currTime() - start); 582 583 start = Clock.currTime(); 584 auto ch = data.chunks(chunkerBufSize).byCDChunk(p); 585 stderr.writefln!"creating chunker took %s"(Clock.currTime() - start); 586 587 // make sure that first chunk is different 588 auto c = ch.front; 589 590 assert(c.cut != chunks1[0].cutFP, 591 "Cut point is the same"); 592 593 assert(c.data.length != chunks1[0].length, 594 "Length is the same"); 595 596 assert(sha256Of(c.data) != chunks1[0].digest, 597 "Digest is the same"); 598 } 599 600 @(`ChunkerWithoutHash`) unittest 601 { 602 // setup data source 603 auto data = getRandom(23, 32*1024*1024); 604 605 auto ch = data.chunks(chunkerBufSize).byCDChunk(testPol); 606 auto chunks = testWithData(ch, chunks1, false, true); 607 608 // test reader 609 size_t pos = 0; 610 foreach (i, c; chunks) 611 { 612 assert(c.data.length == chunks1[i].length, 613 format!"reader returned wrong number of bytes: expected %d, got %d" 614 (chunks1[i].length, c.data.length)); 615 616 assert(data[pos .. pos+c.data.length] == c.data, 617 format!"invalid data for chunk returned: expected %(%02x%), got %(%02x%)" 618 (data[pos .. pos+c.data.length], c.data)); 619 pos += c.data.length; 620 } 621 622 // setup nullbyte data source 623 data = replicate([ubyte(0)], chunks2.length*chunkDefaultMinSize); 624 ch = data.chunks(chunkerBufSize).byCDChunk(testPol); 625 626 testWithData(ch, chunks2, false); 627 } 628 629 version (benchmarkChunker) 630 { 631 import chunker.internal.benchmark; 632 mixin BenchmarkThisModule; 633 634 void _benchmarkChunker(bool checkDigest) 635 { 636 auto size = 32 * 1024 * 1024; 637 auto buf = new ubyte[chunkDefaultMaxSize]; 638 auto data = [getRandom(23, size)]; 639 640 // b.SetBytes(long(size)); 641 642 int chunks; 643 Benchmark.benchmark({ 644 chunks = 0; 645 646 auto ch = data.byCDChunk(testPol, buf); 647 648 auto cur = 0; 649 while (!ch.empty) 650 { 651 auto chunk = ch.front; 652 653 assert(chunk.data.length == chunks1[cur].length, 654 format!"wrong chunk length, want %d, got %d" 655 (chunks1[cur].length, chunk.data.length)); 656 657 assert(chunk.cut == chunks1[cur].cutFP, 658 format!"wrong cut fingerprint, want 0x%x, got 0x%x" 659 (chunks1[cur].cutFP, chunk.cut)); 660 661 if (checkDigest) 662 { 663 auto h = sha256Of(chunk.data); 664 assert(h == chunks1[cur].digest, 665 format!"wrong digest, want %(%02x%), got %(%02x%)" 666 (chunks1[cur].digest, h)); 667 } 668 669 chunks++; 670 cur++; 671 ch.popFront(); 672 } 673 }); 674 675 stderr.writefln!"%d chunks, average chunk size: %d bytes"(chunks, size/chunks); 676 } 677 678 void benchmarkChunkerWithSHA256() 679 { 680 _benchmarkChunker(true); 681 } 682 683 void benchmarkChunker() 684 { 685 _benchmarkChunker(false); 686 } 687 688 void benchmarkNewChunker() 689 { 690 auto p = Pol.getRandom(); 691 ubyte[][] buf; 692 693 Benchmark.benchmark({ 694 buf.byCDChunk(p); 695 }); 696 } 697 }