1 module chunker.rabin; 2 3 import chunker.polynomials; 4 5 /// Calculates a rolling Rabin Fingerprint. 6 struct RabinHash 7 { 8 /// Base type for the calculated digest. 9 alias Digest = ulong; 10 11 /// Size of the sliding window. 12 public enum size_t windowSize = 64; 13 14 /// Current contents of the sliding window. 15 private ubyte[windowSize] window; 16 17 /// Contains frequently-accessed scalar values. 18 /// See Writer for details. 19 private struct Scalars 20 { 21 /// Sliding window cursor. 22 private size_t wpos; 23 24 /// Current hash digest value. 25 private Digest digest; 26 } 27 /// ditto 28 private Scalars scalars; 29 30 /// Reset internal state. 31 void start() 32 { 33 window[] = 0; 34 scalars.digest = 0; 35 scalars.wpos = 0; 36 } 37 38 /// Return current digest. 39 @property Digest peek() const { return scalars.digest; } 40 41 /// Return current digest, then reset. 42 Digest finish() 43 { 44 auto result = scalars.digest; 45 start(); 46 return result; 47 } 48 49 // --------------------------------------------------------------------- 50 51 // cache precomputed tables, these are read-only anyway 52 private struct Cache 53 { 54 Tables[Pol] entries; 55 } 56 private __gshared Cache cache; 57 58 private struct Tables 59 { 60 private Pol[256] out_; 61 private Pol[256] mod; 62 } 63 64 /// Precomputed tables used for hashing. 65 private Tables tables; 66 /// Whether `tables` have already been computed. 67 private bool tablesInitialized; 68 69 /// `fillTables` calculates `out_table` and `mod_table` for 70 /// optimization. This implementation uses a cache in the global 71 /// variable `cache`. 72 private void fillTables(Pol pol) 73 { 74 tablesInitialized = true; 75 76 // test if the tables are cached for this polynomial 77 synchronized 78 { 79 if (auto t = pol in cache.entries) 80 { 81 tables = *t; 82 return; 83 } 84 85 // calculate table for sliding out bytes. The byte to slide out is used as 86 // the index for the table, the value contains the following: 87 // out_table[b] = Hash(b || 0 || ... || 0) 88 // \ windowsize-1 zero bytes / 89 // To slide out byte b_0 for window size w with known hash 90 // H := H(b_0 || ... || b_w), it is sufficient to add out_table[b_0]: 91 // H(b_0 || ... || b_w) + H(b_0 || 0 || ... || 0) 92 // = H(b_0 + b_0 || b_1 + 0 || ... || b_w + 0) 93 // = H( 0 || b_1 || ... || b_w) 94 // 95 // Afterwards a new byte can be shifted in. 96 foreach (b; 0 .. 256) 97 { 98 Pol h; 99 100 static Pol appendByte(Pol hash, ubyte b, Pol pol) 101 { 102 hash.value <<= 8; 103 hash.value |= Pol.Base(b); 104 105 return hash % pol; 106 } 107 108 h = appendByte(h, cast(ubyte)b, pol); 109 foreach (i; 0 .. RabinHash.windowSize-1) 110 h = appendByte(h, 0, pol); 111 tables.out_[b] = h; 112 } 113 114 // calculate table for reduction mod Polynomial 115 auto k = pol.deg(); 116 foreach (b; 0 .. 256) 117 { 118 // mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and B = b(x) * x^k 119 // 120 // The 8 bits above deg(Polynomial) determine what happens next and so 121 // these bits are used as a lookup to this table. The value is split in 122 // two parts: Part A contains the result of the modulus operation, part 123 // B is used to cancel out the 8 top bits so that one XOR operation is 124 // enough to reduce modulo Polynomial 125 tables.mod[b] = Pol( 126 (Pol(Pol.Base(b) << uint(k)) % pol).value | 127 (Pol.Base(b) << uint(k))); 128 } 129 130 cache.entries[pol] = tables; 131 } 132 } 133 134 /// Bits to shift the digest when updating the hash. 135 /// Calculated from the polynomial's degree. 136 private uint polShift; 137 138 /// Initializes this `RabinHash` with the given polynomial. 139 this(Pol pol) 140 { 141 polShift = uint(pol.deg() - 8); 142 fillTables(pol); 143 } 144 145 // --------------------------------------------------------------------- 146 147 /// Type for fast writing of successive bytes. 148 /// Contains only scalar values, so that optimizing compilers 149 /// may break it up and place the fields in registers. 150 struct Writer 151 { 152 /// Contained scalar values. 153 private Scalars scalars; 154 /// Pointer to the rest of the object. 155 private RabinHash* hash; 156 157 /// Return current digest. 158 public @property Digest peek() const { return scalars.digest; } 159 160 /// Slide in one byte (and slide out the corresponding byte from the sliding window). 161 public void slide(ubyte b) 162 { 163 slideImpl(hash.window, scalars.wpos, scalars.digest, hash.tables.out_, hash.tables.mod, hash.polShift, b); 164 } 165 } 166 167 /// Return a fast writer object. 168 /// Must be committed at the end using `commit`. 169 Writer getWriter() 170 { 171 assert(tablesInitialized, "tables for polynomial computation not initialized"); 172 return Writer(scalars, &this); 173 } 174 175 /// Commit a `Writer`. 176 void commit(ref Writer writer) 177 { 178 assert(writer.hash == &this); 179 this.scalars = writer.scalars; 180 } 181 182 /// Slide in one byte (and slide out the corresponding byte from the sliding window). 183 public void slide(ubyte b) 184 { 185 slideImpl(window, scalars.wpos, scalars.digest, tables.out_, tables.mod, polShift, b); 186 } 187 188 /// Slide in all `bytes`. 189 /// Note that using more bytes than `windowSize` is ineffectual, 190 /// and only the last `windowSize` bytes affect the final digest. 191 void put(R)(R bytes) 192 { 193 auto writer = getWriter(); 194 scope(success) commit(writer); 195 foreach (b; bytes) 196 writer.slide(b); 197 } 198 199 /// Keep sliding in `bytes` until `digest & mask == 0`. 200 /// Return number of bytes written, which will always be 201 /// less than `bytes.length` if `mask` matched. 202 size_t putUntil(R)(R bytes, Digest mask) 203 { 204 auto writer = getWriter(); 205 scope(success) commit(writer); 206 size_t c = 0; 207 foreach (b; bytes) 208 { 209 if ((writer.peek & mask) == 0) 210 break; 211 writer.slide(b); 212 c++; 213 } 214 return c; 215 } 216 217 /// Implementation for `slide`. 218 private static void slideImpl(ref ubyte[windowSize] window, ref size_t wpos, ref Digest digest, ref Pol[256] tabout, in ref Pol[256] tabmod, uint polShift, ubyte b) 219 { 220 auto out_ = window[wpos]; 221 window[wpos] = b; 222 digest ^= Digest(tabout[out_].value); 223 wpos++; 224 if (wpos >= windowSize) 225 wpos = 0; 226 227 digest = updateDigest(digest, polShift, tabmod, b); 228 } 229 230 /// ditto 231 private static Digest /*newDigest*/ updateDigest(Digest digest, uint polShift, in ref Pol[256] tabmod, ubyte b) 232 { 233 auto index = cast(ubyte)(digest >> polShift); 234 digest <<= 8; 235 digest |= Digest(b); 236 237 digest ^= Digest(tabmod[index].value); 238 return digest; 239 } 240 }