chunker.rabin source code

1 module chunker.rabin;
2 
3 import chunker.polynomials;
4 
5 /// Calculates a rolling Rabin Fingerprint.
6 struct RabinHash
7 {
8 	/// Base type for the calculated digest.
9 	alias Digest = ulong;
10 
11 	/// Size of the sliding window.
12 	public enum size_t windowSize = 64;
13 
14 	/// Current contents of the sliding window.
15 	private ubyte[windowSize] window;
16 
17 	/// Contains frequently-accessed scalar values.
18 	/// See Writer for details.
19 	private struct Scalars
20 	{
21 		/// Sliding window cursor.
22 		private size_t wpos;
23 
24 		/// Current hash digest value.
25 		private Digest digest;
26 	}
27 	/// ditto
28 	private Scalars scalars;
29 
30 	/// Reset internal state.
31 	void start()
32 	{
33 		window[] = 0;
34 		scalars.digest = 0;
35 		scalars.wpos = 0;
36 	}
37 
38 	/// Return current digest.
39 	@property Digest peek() const { return scalars.digest; }
40 
41 	/// Return current digest, then reset.
42 	Digest finish()
43 	{
44 		auto result = scalars.digest;
45 		start();
46 		return result;
47 	}
48 
49 	// ---------------------------------------------------------------------
50 
51 	// cache precomputed tables, these are read-only anyway
52 	private struct Cache
53 	{
54 		Tables[Pol] entries;
55 	}
56 	private __gshared Cache cache;
57 
58 	private struct Tables
59 	{
60 		private Pol[256] out_;
61 		private Pol[256] mod;
62 	}
63 
64 	/// Precomputed tables used for hashing.
65 	private Tables tables;
66 	/// Whether `tables` have already been computed.
67 	private bool tablesInitialized;
68 
69 	/// `fillTables` calculates `out_table` and `mod_table` for
70 	/// optimization. This implementation uses a cache in the global
71 	/// variable `cache`.
72 	private void fillTables(Pol pol)
73 	{
74 		tablesInitialized = true;
75 
76 		// test if the tables are cached for this polynomial
77 		synchronized
78 		{
79 			if (auto t = pol in cache.entries)
80 			{
81 				tables = *t;
82 				return;
83 			}
84 
85 			// calculate table for sliding out bytes. The byte to slide out is used as
86 			// the index for the table, the value contains the following:
87 			// out_table[b] = Hash(b || 0 ||        ...        || 0)
88 			//                          \ windowsize-1 zero bytes /
89 			// To slide out byte b_0 for window size w with known hash
90 			// H := H(b_0 || ... || b_w), it is sufficient to add out_table[b_0]:
91 			//    H(b_0 || ... || b_w) + H(b_0 || 0 || ... || 0)
92 			//  = H(b_0 + b_0 || b_1 + 0 || ... || b_w + 0)
93 			//  = H(    0     || b_1 || ...     || b_w)
94 			//
95 			// Afterwards a new byte can be shifted in.
96 			foreach (b; 0 .. 256)
97 			{
98 				Pol h;
99 
100 				static Pol appendByte(Pol hash, ubyte b, Pol pol)
101 				{
102 					hash.value <<= 8;
103 					hash.value |= Pol.Base(b);
104 
105 					return hash % pol;
106 				}
107 
108 				h = appendByte(h, cast(ubyte)b, pol);
109 				foreach (i; 0 .. RabinHash.windowSize-1)
110 					h = appendByte(h, 0, pol);
111 				tables.out_[b] = h;
112 			}
113 
114 			// calculate table for reduction mod Polynomial
115 			auto k = pol.deg();
116 			foreach (b; 0 .. 256)
117 			{
118 				// mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and  B = b(x) * x^k
119 				//
120 				// The 8 bits above deg(Polynomial) determine what happens next and so
121 				// these bits are used as a lookup to this table. The value is split in
122 				// two parts: Part A contains the result of the modulus operation, part
123 				// B is used to cancel out the 8 top bits so that one XOR operation is
124 				// enough to reduce modulo Polynomial
125 				tables.mod[b] = Pol(
126 					(Pol(Pol.Base(b) << uint(k)) % pol).value |
127 					    (Pol.Base(b) << uint(k)));
128 			}
129 
130 			cache.entries[pol] = tables;
131 		}
132 	}
133 
134 	/// Bits to shift the digest when updating the hash.
135 	/// Calculated from the polynomial's degree.
136 	private uint polShift;
137 
138 	/// Initializes this `RabinHash` with the given polynomial.
139 	this(Pol pol)
140 	{
141 		polShift = uint(pol.deg() - 8);
142 		fillTables(pol);
143 	}
144 
145 	// ---------------------------------------------------------------------
146 
147 	/// Type for fast writing of successive bytes.
148 	/// Contains only scalar values, so that optimizing compilers
149 	/// may break it up and place the fields in registers.
150 	struct Writer
151 	{
152 		/// Contained scalar values.
153 		private Scalars scalars;
154 		/// Pointer to the rest of the object.
155 		private RabinHash* hash;
156 
157 		/// Return current digest.
158 		public @property Digest peek() const { return scalars.digest; }
159 
160 		/// Slide in one byte (and slide out the corresponding byte from the sliding window).
161 		public void slide(ubyte b)
162 		{
163 			slideImpl(hash.window, scalars.wpos, scalars.digest, hash.tables.out_, hash.tables.mod, hash.polShift, b);
164 		}
165 	}
166 
167 	/// Return a fast writer object.
168 	/// Must be committed at the end using `commit`.
169 	Writer getWriter()
170 	{
171 		assert(tablesInitialized, "tables for polynomial computation not initialized");
172 		return Writer(scalars, &this);
173 	}
174 
175 	/// Commit a `Writer`.
176 	void commit(ref Writer writer)
177 	{
178 		assert(writer.hash == &this);
179 		this.scalars = writer.scalars;
180 	}
181 
182 	/// Slide in one byte (and slide out the corresponding byte from the sliding window).
183 	public void slide(ubyte b)
184 	{
185 		slideImpl(window, scalars.wpos, scalars.digest, tables.out_, tables.mod, polShift, b);
186 	}
187 
188 	/// Slide in all `bytes`.
189 	/// Note that using more bytes than `windowSize` is ineffectual,
190 	/// and only the last `windowSize` bytes affect the final digest.
191 	void put(R)(R bytes)
192 	{
193 		auto writer = getWriter();
194 		scope(success) commit(writer);
195 		foreach (b; bytes)
196 			writer.slide(b);
197 	}
198 
199 	/// Keep sliding in `bytes` until `digest & mask == 0`.
200 	/// Return number of bytes written, which will always be
201 	/// less than `bytes.length` if `mask` matched.
202 	size_t putUntil(R)(R bytes, Digest mask)
203 	{
204 		auto writer = getWriter();
205 		scope(success) commit(writer);
206 		size_t c = 0;
207 		foreach (b; bytes)
208 		{
209 			if ((writer.peek & mask) == 0)
210 				break;
211 			writer.slide(b);
212 			c++;
213 		}
214 		return c;
215 	}
216 
217 	/// Implementation for `slide`.
218 	private static void slideImpl(ref ubyte[windowSize] window, ref size_t wpos, ref Digest digest, ref Pol[256] tabout, in ref Pol[256] tabmod, uint polShift, ubyte b)
219 	{
220 		auto out_ = window[wpos];
221 		window[wpos] = b;
222 		digest ^= Digest(tabout[out_].value);
223 		wpos++;
224 		if (wpos >= windowSize)
225 			wpos = 0;
226 
227 		digest = updateDigest(digest, polShift, tabmod, b);
228 	}
229 
230 	/// ditto
231 	private static Digest /*newDigest*/ updateDigest(Digest digest, uint polShift, in ref Pol[256] tabmod, ubyte b)
232 	{
233 		auto index = cast(ubyte)(digest >> polShift);
234 		digest <<= 8;
235 		digest |= Digest(b);
236 
237 		digest ^= Digest(tabmod[index].value);
238 		return digest;
239 	}
240 }