1 module rt.compiler.util.hash;
2 
3 /// hashes bStart[0..length] bytes
4 extern(C) hash_t rt_hash_str(void *bStart,size_t length, hash_t seed=0){
5     static if (is(hash_t==uint)||is(hash_t==int)){
6         return cast(hash_t)murmurHashAligned2(bStart,length,cast(uint)seed);
7     } else static if (is(hash_t==ulong)||is(hash_t==long)){
8         return cast(hash_t)lookup8_hash(cast(ubyte*)bStart,length,cast(ulong)seed);
9     } else {
10         static assert(0,"unsupported type for hash_t: "~hash_t.stringof);
11     }
12 }
13 
14 /// hashes bStart[0..length] size_t
15 extern(C) hash_t rt_hash_block(size_t *bStart,size_t length, hash_t seed=0){
16     static if (is(hash_t==uint)||is(hash_t==int)){
17         return cast(hash_t)murmurHashAligned2(bStart,length,cast(uint)seed);
18     } else static if (is(hash_t==ulong)||is(hash_t==long)){
19         return cast(hash_t)lookup8_hash2(cast(ulong*)bStart,length,cast(ulong)seed);
20     } else {
21         static assert(0,"unsupported type for hash_t: "~hash_t.stringof);
22     }
23 }
24 
25 /// hashes UTF-8 chars using the UTF codepoints (slower than rt_hash_str that uses the bit representation)
26 extern(C) uint rt_hash_utf8(char[] str, uint seed=0){
27     const uint m = 0x5bd1e995;
28     const int r = 24;
29 
30     uint h = seed ; // ^ length
31     foreach (c;str)
32     {
33         uint k = cast(uint)c;
34         // MIX(h,k,m);
35         { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
36     }
37     h ^= h >> 13;
38     h *= m;
39     h ^= h >> 15;
40     return h;
41 }
42 
43 /// hashes UTF-16 chars using the UTF codepoints (slower than rt_hash_str that uses the bit representation)
44 extern(C) uint rt_hash_utf16(wchar[] str, uint seed=0){
45     const uint m = 0x5bd1e995;
46     const int r = 24;
47 
48     uint h = seed ; // ^ length
49     foreach (c;str)
50     {
51         uint k = cast(uint)c;
52         // MIX(h,k,m);
53         { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
54     }
55     h ^= h >> 13;
56     h *= m;
57     h ^= h >> 15;
58     return h;
59 }
60 
61 /// hashes UTF-32 chars using the UTF codepoints (should be equivalent to rt_hash_str that uses the bit representation)
62 extern(C) uint rt_hash_utf32(dchar[] str, uint seed=0){
63     const uint m = 0x5bd1e995;
64     const int r = 24;
65 
66     uint h = seed ; // ^ length
67     foreach (c;str)
68     {
69         uint k = cast(uint)c;
70         // MIX(h,k,m);
71         { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
72     }
73     h ^= h >> 13;
74     h *= m;
75     h ^= h >> 15;
76     return h;
77 }
78 
79 /// combines two hash values
80 extern(C) hash_t rt_hash_combine( hash_t val1, hash_t val2 ){
81     static if (is(hash_t==uint)||is(hash_t==int)){
82         return cast(hash_t)rt_hash_combine32(cast(uint)val1,cast(uint)val2);
83     } else static if (is(hash_t==ulong)||is(hash_t==long)){
84         return cast(hash_t)rt_hash_combine64(cast(ulong)val1,cast(ulong)val2);
85     } else {
86         static assert(0,"unsupported type for hash_t: "~hash_t.stringof);
87     }
88 }
89 
90 /// hashes bStart[0..length] in an architecture neutral way
91 extern(C) uint rt_hash_str_neutral32(void *bStart,uint length, uint seed=0){
92     return murmurHashNeutral2(bStart,length,seed);
93 }
94 
95 /// hashes bStart[0..length] in an architecture neutral way
96 extern(C) ulong rt_hash_str_neutral64(void *bStart,ulong length, ulong seed=0){
97     return cast(hash_t)lookup8_hash(cast(ubyte*)bStart,length,seed);
98 }
99 
100 /// combines two hash values (32 bit, architecture neutral)
101 extern(C) uint rt_hash_combine32( uint val, uint seed )
102 {
103     const uint m = 0x5bd1e995;
104     const int r = 24;
105 
106     uint h = seed ^ 4u;
107     ubyte * data = cast(ubyte *)&val;
108     uint k;
109 
110     k  = (cast(uint)data[0]);
111     k |= (cast(uint)data[1]) << 8;
112     k |= (cast(uint)data[2]) << 16;
113     k |= (cast(uint)data[3]) << 24;
114     k *= m; 
115     k ^= k >> r; 
116     k *= m;
117     h *= m;
118     h ^= k;
119     h ^= h >> 13;
120     h *= m;
121     h ^= h >> 15;
122 
123     return h;
124 } 
125 
126 /// combines two hash values (64 bit, architecture neutral)
127 extern(C) ulong rt_hash_combine64( ulong value, ulong level)
128 {
129   ulong a,b,c,len;
130   ubyte* k=cast(ubyte*)(&value);
131 
132   /* Set up the internal state */
133   a = b = level;                         /* the previous hash value */
134   c = 0x9e3779b97f4a7c13UL; /* the golden ratio; an arbitrary value */
135   c += 8;
136   a+=(cast(ulong)k[ 7])<<56;
137   a+=(cast(ulong)k[ 6])<<48;
138   a+=(cast(ulong)k[ 5])<<40;
139   a+=(cast(ulong)k[ 4])<<32;
140   a+=(cast(ulong)k[ 3])<<24;
141   a+=(cast(ulong)k[ 2])<<16;
142   a+=(cast(ulong)k[ 1])<<8;
143   a+=(cast(ulong)k[ 0]);
144   mixin(mix64abc);
145   
146   return c;
147 }
148 
149 private:
150 
151 //-----------------------------------------------------------------------------
152 // murmurHashAligned2, by Austin Appleby
153 
154 // Same algorithm as murmurHash2, but only does aligned reads - should be safer
155 // on certain platforms. 
156 
157 // Performance will be lower than murmurHash2
158 private uint murmurHashAligned2 ( void * key, uint len, uint seed )
159 {
160     const uint m = 0x5bd1e995;
161     const int r = 24;
162 
163     ubyte * data = cast(ubyte *)key;
164 
165     uint h = seed ^ len;
166 
167     int alignV = cast(int)data & 3;
168 
169     if(alignV && (len >= 4))
170     {
171         // Pre-load the temp registers
172 
173         uint t = 0, d = 0;
174 
175         switch(alignV)
176         {
177             case 1: t |= (cast(uint)data[2]) << 16;
178             case 2: t |= (cast(uint)data[1]) << 8;
179             case 3: t |= cast(uint)data[0];
180             default:
181         }
182 
183         t <<= (8 * alignV);
184 
185         data += 4-alignV;
186         len -= 4-alignV;
187 
188         int sl = 8 * (4-alignV);
189         int sr = 8 * alignV;
190 
191         // Mix
192 
193         while(len >= 4)
194         {
195             d = *cast(uint *)data;
196             t = (t >> sr) | (d << sl);
197 
198             uint k = t;
199 
200             // MIX(h,k,m);
201             { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
202             
203 
204             t = d;
205 
206             data += 4;
207             len -= 4;
208         }
209 
210         // Handle leftover data in temp registers
211 
212         d = 0;
213 
214         if(len >= alignV)
215         {
216             switch(alignV)
217             {
218             case 3: d |= (cast(uint)data[2]) << 16;
219             case 2: d |= (cast(uint)data[1]) << 8;
220             case 1: d |= (cast(uint)data[0]);
221             default:
222             }
223 
224             uint k = (t >> sr) | (d << sl);
225             // MIX(h,k,m);
226             { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
227 
228             data += alignV;
229             len -= alignV;
230 
231             //----------
232             // Handle tail bytes
233 
234             switch(len)
235             {
236             case 3: h ^= (cast(uint)data[2]) << 16;
237             case 2: h ^= (cast(uint)data[1]) << 8;
238             case 1: h ^= (cast(uint)data[0]);
239                     h *= m;
240             default:
241             };
242         }
243         else
244         {
245             switch(len)
246             {
247             case 3: d |= (cast(uint)data[2]) << 16;
248             case 2: d |= (cast(uint)data[1]) << 8;
249             case 1: d |= (cast(uint)data[0]);
250             case 0: h ^= (t >> sr) | (d << sl);
251                     h *= m;
252             default:
253             }
254         }
255 
256         h ^= h >> 13;
257         h *= m;
258         h ^= h >> 15;
259 
260         return h;
261     }
262     else
263     {
264         while(len >= 4)
265         {
266             uint k = *cast(uint *)data;
267 
268             // MIX(h,k,m);
269             { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; }
270 
271             data += 4;
272             len -= 4;
273         }
274 
275         //----------
276         // Handle tail bytes
277 
278         switch(len)
279         {
280         case 3: h ^= (cast(uint)data[2]) << 16;
281         case 2: h ^= (cast(uint)data[1]) << 8;
282         case 1: h ^= (cast(uint)data[0]);
283                 h *= m;
284         default:
285         };
286 
287         h ^= h >> 13;
288         h *= m;
289         h ^= h >> 15;
290 
291         return h;
292     }
293 }
294 //-----------------------------------------------------------------------------
295 // murmurHashNeutral2, by Austin Appleby
296 
297 // Same as murmurHash2, but endian- and alignment-neutral.
298 // Half the speed though, alas.
299 
300 private uint murmurHashNeutral2 ( void * key, uint len, uint seed )
301 {
302     const uint m = 0x5bd1e995;
303     const int r = 24;
304 
305     uint h = seed ^ len;
306 
307     ubyte * data = cast(ubyte *)key;
308 
309     while(len >= 4)
310     {
311         uint k;
312 
313         k  = data[0];
314         k |= (cast(uint)data[1]) << 8;
315         k |= (cast(uint)data[2]) << 16;
316         k |= (cast(uint)data[3]) << 24;
317 
318         k *= m; 
319         k ^= k >> r; 
320         k *= m;
321 
322         h *= m;
323         h ^= k;
324 
325         data += 4;
326         len -= 4;
327     }
328 
329     switch(len)
330     {
331     case 3: h ^= (cast(uint)data[2]) << 16;
332     case 2: h ^= (cast(uint)data[1]) << 8;
333     case 1: h ^= (cast(uint)data[0]);
334             h *= m;
335     default:
336     };
337 
338     h ^= h >> 13;
339     h *= m;
340     h ^= h >> 15;
341 
342     return h;
343 } 
344 
345 //#define hashsize(n) (cast(ulong)1<<(n))
346 //#define hashmask(n) (hashsize(n)-1)
347 
348 /*
349 --------------------------------------------------------------------
350 mix -- mix 3 64-bit values reversibly.
351 mix() takes 48 machine instructions, but only 24 cycles on a superscalar
352   machine (like Intel's new MMX architecture).  It requires 4 64-bit
353   registers for 4::2 parallelism.
354 All 1-bit deltas, all 2-bit deltas, all deltas composed of top bits of
355   (a,b,c), and all deltas of bottom bits were tested.  All deltas were
356   tested both on random keys and on keys that were nearly all zero.
357   These deltas all cause every bit of c to change between 1/3 and 2/3
358   of the time (well, only 113/400 to 287/400 of the time for some
359   2-bit delta).  These deltas all cause at least 80 bits to change
360   among (a,b,c) when the mix is run either forward or backward (yes it
361   is reversible).
362 This implies that a hash using mix64 has no funnels.  There may be
363   characteristics with 3-bit deltas or bigger, I didn't test for
364   those.
365 --------------------------------------------------------------------
366 */
367 const char[] mix64abc=`{
368   a -= b; a -= c; a ^= (c>>43);
369   b -= c; b -= a; b ^= (a<<9); 
370   c -= a; c -= b; c ^= (b>>8); 
371   a -= b; a -= c; a ^= (c>>38);
372   b -= c; b -= a; b ^= (a<<23);
373   c -= a; c -= b; c ^= (b>>5); 
374   a -= b; a -= c; a ^= (c>>35);
375   b -= c; b -= a; b ^= (a<<49);
376   c -= a; c -= b; c ^= (b>>11);
377   a -= b; a -= c; a ^= (c>>12);
378   b -= c; b -= a; b ^= (a<<18);
379   c -= a; c -= b; c ^= (b>>22);
380 }`;
381 
382 /*
383 --------------------------------------------------------------------
384 hash() -- hash a variable-length key into a 64-bit value
385   k     : the key (the unaligned variable-length array of bytes)
386   len   : the length of the key, counting by bytes
387   level : can be any 8-byte value
388 Returns a 64-bit value.  Every bit of the key affects every bit of
389 the return value.  No funnels.  Every 1-bit and 2-bit delta achieves
390 avalanche.  About 41+5len instructions.
391 
392 The best hash table sizes are powers of 2.  There is no need to do
393 mod a prime (mod is sooo slow!).  If you need less than 64 bits,
394 use a bitmask.  For example, if you need only 10 bits, do
395   h = (h & hashmask(10));
396 In which case, the hash table should have hashsize(10) elements.
397 
398 If you are hashing n strings (ub1 **)k, do it like this:
399   for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
400 
401 By Bob Jenkins, Jan 4 1997.  bob_jenkins@burtleburtle.net.  You may
402 use this code any way you wish, private, educational, or commercial,
403 but I would appreciate if you give me credit.
404 
405 See http://burtleburtle.net/bob/hash/evahash.html
406 Use for hash table lookup, or anything where one collision in 2^^64
407 is acceptable.  Do NOT use for cryptographic purposes.
408 --------------------------------------------------------------------
409 */
410 
411 ulong lookup8_hash( ubyte* k, ulong length, ulong level)
412 {
413   ulong a,b,c,len;
414 
415   /* Set up the internal state */
416   len = length;
417   a = b = level;                         /* the previous hash value */
418   c = 0x9e3779b97f4a7c13UL; /* the golden ratio; an arbitrary value */
419 
420   /*---------------------------------------- handle most of the key */
421   while (len >= 24)
422   {
423     a += (k[0]        +(cast(ulong)k[ 1]<< 8)+(cast(ulong)k[ 2]<<16)+(cast(ulong)k[ 3]<<24)
424      +(cast(ulong)k[4 ]<<32)+(cast(ulong)k[ 5]<<40)+(cast(ulong)k[ 6]<<48)+(cast(ulong)k[ 7]<<56));
425     b += (k[8]        +(cast(ulong)k[ 9]<< 8)+(cast(ulong)k[10]<<16)+(cast(ulong)k[11]<<24)
426      +(cast(ulong)k[12]<<32)+(cast(ulong)k[13]<<40)+(cast(ulong)k[14]<<48)+(cast(ulong)k[15]<<56));
427     c += (k[16]       +(cast(ulong)k[17]<< 8)+(cast(ulong)k[18]<<16)+(cast(ulong)k[19]<<24)
428      +(cast(ulong)k[20]<<32)+(cast(ulong)k[21]<<40)+(cast(ulong)k[22]<<48)+(cast(ulong)k[23]<<56));
429     mixin(mix64abc);
430     k += 24; len -= 24;
431   }
432 
433   /*------------------------------------- handle the last 23 bytes */
434   c += length;
435   switch(len)              /* all the case statements fall through */
436   {
437   case 23: c+=(cast(ulong)k[22]<<56);
438   case 22: c+=(cast(ulong)k[21]<<48);
439   case 21: c+=(cast(ulong)k[20]<<40);
440   case 20: c+=(cast(ulong)k[19]<<32);
441   case 19: c+=(cast(ulong)k[18]<<24);
442   case 18: c+=(cast(ulong)k[17]<<16);
443   case 17: c+=(cast(ulong)k[16]<<8);
444     /* the first byte of c is reserved for the length */
445   case 16: b+=(cast(ulong)k[15]<<56);
446   case 15: b+=(cast(ulong)k[14]<<48);
447   case 14: b+=(cast(ulong)k[13]<<40);
448   case 13: b+=(cast(ulong)k[12]<<32);
449   case 12: b+=(cast(ulong)k[11]<<24);
450   case 11: b+=(cast(ulong)k[10]<<16);
451   case 10: b+=(cast(ulong)k[ 9]<<8);
452   case  9: b+=(cast(ulong)k[ 8]);
453   case  8: a+=(cast(ulong)k[ 7]<<56);
454   case  7: a+=(cast(ulong)k[ 6]<<48);
455   case  6: a+=(cast(ulong)k[ 5]<<40);
456   case  5: a+=(cast(ulong)k[ 4]<<32);
457   case  4: a+=(cast(ulong)k[ 3]<<24);
458   case  3: a+=(cast(ulong)k[ 2]<<16);
459   case  2: a+=(cast(ulong)k[ 1]<<8);
460   case  1: a+=(cast(ulong)k[ 0]);
461     /* case 0: nothing left to add */
462   default:
463   }
464   mixin(mix64abc);
465   /*-------------------------------------------- report the result */
466   return c;
467 }
468 
469 /*
470 --------------------------------------------------------------------
471  This works on all machines, is identical to hash() on little-endian 
472  machines, and it is much faster than hash(), but it requires
473  -- that the key be an array of ub8's, and
474  -- that all your machines have the same endianness, and
475  -- that the length be the number of ub8's in the key
476 --------------------------------------------------------------------
477 */
478 ulong lookup8_hash2( ulong *k, ulong length, ulong level)
479 {
480   ulong a,b,c,len;
481 
482   /* Set up the internal state */
483   len = length;
484   a = b = level;                         /* the previous hash value */
485   c = 0x9e3779b97f4a7c13UL; /* the golden ratio; an arbitrary value */
486 
487   /*---------------------------------------- handle most of the key */
488   while (len >= 3)
489   {
490     a += k[0];
491     b += k[1];
492     c += k[2];
493     mixin(mix64abc);
494     k += 3; len -= 3;
495   }
496 
497   /*-------------------------------------- handle the last 2 ub8's */
498   c += (length<<3);
499   switch(len)              /* all the case statements fall through */
500   {
501     /* c is reserved for the length */
502   case  2: b+=k[1];
503   case  1: a+=k[0];
504     /* case 0: nothing left to add */
505   default:
506   }
507   mixin(mix64abc);
508   /*-------------------------------------------- report the result */
509   return c;
510 }
511 
512 debug(UnitTest){
513     void alignIndependence(T)(ubyte[] val,T function (void*,T len,T oldHash)hashF){
514         ubyte[] vval=new ubyte[val.length+16];
515         T oldV=0xdeadbeef;
516         T hash=hashF(val.ptr,val.length,oldV);
517         for(int i=0;i<16;++i){
518             vval[i..i+val.length]=val;
519             T hashAtt=hashF(vval.ptr+i,val.length,oldV);
520             assert(hashAtt==hash);
521         }
522     }
523     
524     unittest{
525         ubyte[] val=cast(ubyte[])"blaterare";
526         alignIndependence!(uint)(val,&murmurHashNeutral2);
527         alignIndependence!(uint)(val,&murmurHashAligned2);
528         alignIndependence!(ulong)(val,
529             function ulong(void* str,ulong len,ulong oldV){
530                 return lookup8_hash(cast(ubyte*)str,len,oldV);
531             }
532         );
533     }
534 }