Hi everyone.
I've made port of a 32-bit hashing function that works quite fast. That is, speed is 4x higher than the usual CRC32.
Port is available here.
And I'd like to receive some feedback.
Hi everyone.
I've made port of a 32-bit hashing function that works quite fast. That is, speed is 4x higher than the usual CRC32.
Port is available here.
And I'd like to receive some feedback.
@biocheshire
Did you do your benchmark against AVR standard CRC funtions or just quoting the original post?
What was the performance of this implementation?
And how did you tweak it for the AVR processor?
The original implementation and benchmark was at 1.6 GHz i386 architecture, And "Data is time in seconds taken to hash a random buffer of 256 bytes 5 million times" Hash functions.
How long time would this take on an Arduino ;-)?
I believe the incremental version is more useful for a micro-controller with only 2-4 Kbyte of memory. And there are some tweaks to be done to improve performance on an 8-bit architecture.
Here is an interesting link: ECRYPT Lightweight Hash Functions Page
Cheers!
Thanks for sharing (a challenge!)
Some performance optimizations (for Arduino 8-bitter), on the internal loop,
there will be some more gain possible in optimizing the remainders.
Please check
NOTE: the original version of rokkithash is copyrighted LGPL 2.1 by Paul Hsieh
//
// FILE: rokkitHash.ino
// AUTHOR: Rob Tillaart
// VERSION: 0.1.00
// PURPOSE: optimizing rokkitHash
// DATE: 2014-03-18
// URL:
//
// original version of rokkithash is copyrighted LGPL 2.1 by Paul Hsieh
//
volatile uint32_t x;
char data[] = "the quick brown fox jumps over the lazy dog.";
void setup()
{
Serial.begin(115200);
Serial.println("Start ");
int le = strlen(data);
uint32_t start = micros();
for (int i=0; i<1000; i++)
{
x = rokkit(data, le);
}
uint32_t stop = micros();
Serial.print("pre:\t");
Serial.println(stop - start);
Serial.println(x);
start = micros();
for (int i=0; i<1000; i++)
{
x = rokkitNew(data, le);
}
stop = micros();
Serial.print("post:\t");
Serial.println(stop - start);
Serial.println(x);
}
void loop()
{
}
uint32_t rokkitNew(const char * data, uint16_t len)
{
union
{
uint32_t h;
uint16_t b[2];
uint8_t a[4];
}
hash, tmp;
uint8_t rem;
uint16_t *p = (uint16_t*) data;
if (data == 0) return 0;
hash.h = len;
rem = len & 3;
len >>= 2;
/* Main loop */
while (len > 0) {
// hash += *((uint16_t*)data);
hash.h += *p;
p++;
// tmp = (*((uint16_t*)(data+2)) << 11) ^ hash;
// hash = (hash << 16) ^ tmp;
// ==>
// hash = (hash << 16) ^ (*((uint16_t*)(data+2)) << 11) ^ hash;
// ==>
// hash ^= (hash << 16) ^ (*((uint16_t*)(data+2)) << 11);
tmp.h = hash.h;
tmp.b[1] = tmp.b[0]; // (hash << 16) === clean lower 16 bits!
tmp.b[0] = (*p) << 11; // (*((uint16_t*)(data+2)) << 11) === 16 bit operation!!
p++;
// hash.h ^= tmp.h;
hash.a[0] ^= tmp.a[0];
hash.a[1] ^= tmp.a[1];
hash.a[2] ^= tmp.a[2];
hash.a[3] ^= tmp.a[3];
// hash += hash >> 11;
tmp.a[0] = hash.a[1]; // shift 8
tmp.a[1] = hash.a[2];
tmp.a[2] = hash.a[3];
tmp.a[3] = 0;
hash.h += tmp.h >> 3;
// len--;
--len;
}
data = (char*) p;
/* Handle end cases */
switch (rem) {
case 3:
hash.h += *((uint16_t*)data);
hash.h ^= hash.h << 16;
hash.h ^= ((signed char)data[2]) << 18;
hash.h += hash.h >> 11;
break;
case 2:
hash.h += *((uint16_t*)data);
hash.h ^= hash.h << 11;
hash.h += hash.h >> 17;
break;
case 1:
hash.h += (signed char)*data;
hash.h ^= hash.h << 10;
hash.h += hash.h >> 1;
}
/* Force "avalanching" of final 127 bits */
hash.h ^= hash.h << 3;
hash.h += hash.h >> 5;
hash.h ^= hash.h << 4;
hash.h += hash.h >> 17;
hash.h ^= hash.h << 25;
hash.h += hash.h >> 6;
return hash.h;
}
uint32_t rokkit(const char * data, int len)
{
uint32_t hash, tmp;
int rem;
if (len <= 0 || data == 0) return 0;
hash = len;
rem = len & 3;
len >>= 2;
/* Main loop */
while (len > 0) {
hash += *((uint16_t*)data);
tmp = (*((uint16_t*)(data+2)) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*2;
hash += hash >> 11;
len--;
}
/* Handle end cases */
switch (rem) {
case 3:
hash += *((uint16_t*)data);
hash ^= hash << 16;
hash ^= ((signed char)data[2]) << 18;
hash += hash >> 11;
break;
case 2:
hash += *((uint16_t*)data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1:
hash += (signed char)*data;
hash ^= hash << 10;
hash += hash >> 1;
}
/* Force "avalanching" of final 127 bits */
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 4;
hash += hash >> 17;
hash ^= hash << 25;
hash += hash >> 6;
return hash;
}
output
Start
pre: 121136
669202426
post: 76956
669202426
~35% faster
uint32_t rokkitNew2(const char * data, uint16_t len)
{
union
{
uint32_t h;
uint16_t b[2];
uint8_t a[4];
}
hash, tmp;
uint8_t rem;
uint16_t *p = (uint16_t*) data;
if (data == 0) return 0;
hash.h = len;
rem = len & 3;
len >>= 2;
/* Main loop */
while (len > 0) {
// hash += *((uint16_t*)data);
hash.h += *p;
p++;
// tmp = (*((uint16_t*)(data+2)) << 11) ^ hash;
// hash = (hash << 16) ^ tmp;
// ==>
// hash = (hash << 16) ^ (*((uint16_t*)(data+2)) << 11) ^ hash;
// ==>
// hash ^= (hash << 16) ^ (*((uint16_t*)(data+2)) << 11);
tmp.h = hash.h;
tmp.b[1] = tmp.b[0]; // shift 16
tmp.a[1] = ((*p) & 0xFF) << 3;
tmp.a[0] = 0;
// tmp.b[0] = (*p) << 11; // 16 bit operation!!
p++;
// hash.h ^= tmp.h;
// hash.a[0] ^= tmp.a[0]; // not needed as tmp.a[0] == 0
hash.a[1] ^= tmp.a[1];
hash.a[2] ^= tmp.a[2];
hash.a[3] ^= tmp.a[3];
// hash += hash >> 11;
tmp.a[0] = hash.a[1]; // shift 8
tmp.a[1] = hash.a[2];
tmp.a[2] = hash.a[3];
tmp.a[3] = 0;
hash.h += tmp.h >> 3;
// len--;
--len;
}
data = (char*) p;
/* Handle end cases */
switch (rem) {
case 3:
hash.h += *((uint16_t*)data);
hash.h ^= hash.h << 16; // todo
hash.h ^= ((signed char)data[2]) << 18;
hash.h += hash.h >> 11; // todo
break;
case 2:
hash.h += *((uint16_t*)data);
hash.h ^= hash.h << 11; // todo
hash.h += hash.h >> 17; // todo
break;
case 1:
hash.h += (signed char)*data;
hash.h ^= hash.h << 10; // todo
hash.h += hash.h >> 1;
}
/* Force "avalanching" of final 127 bits */
hash.h ^= hash.h << 3;
hash.h += hash.h >> 5;
hash.h ^= hash.h << 4;
// hash.h += hash.h >> 17; // todo
tmp.b[0] = hash.b[1] >> 1;
tmp.b[1] = 0;
hash.h += tmp.h;
// hash.h ^= hash.h << 25;
// tmp.a[3] = hash.a[0] << 1; // shift 25!!
// tmp.a[2] = 0;
// tmp.a[1] = 0;
// tmp.a[0] = 0;
hash.a[3] ^= hash.a[0] << 1; // shift 25!!;
hash.h += hash.h >> 6;
return hash.h;
}
pre: 121136
669202426
post: 58976
669202426
==>50% off
note: Arduino 1.5.4 + UNO
@kowalski
I did comparsion between rokkit and this particular crc32 implementation - Arduino CRC-32
So it showed ~4x gain.
No tweaking was done, just changed some pointer operation
@robtillaart
thanks for your contribution!
I will check the results
Sorry for resparning this old topic, but I am trying to use the rokkit hash to checksum some data sent from Arduino to PC and I have inconsistent results.
Rob's test program produces the same output (669202426 == 0x27E337FA) for both the original code and his modified code, when run on an Arduino.
If I run the program on PC (compiled with GCC 4.9.3 on Linux, with the obvious Serial.print -> printf() modifications), I get something different:
Start pre: 118
2628211179
post: 245
669202426
Since the original code seems faster on PC, I would like to use that, but it yields a different checksum. Is this expected? Can it be fixed?
Can you post the PC code?
Here you go, not really clean but hope you don't mind:
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#define micros clock
#undef get16bits
#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
|| defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
#define get16bits(d) ((uint16_t) *((const uint16_t *) (d)))
#endif
#if !defined (get16bits)
#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+(uint32_t)(((const uint8_t *)(d))[0]) )
#endif
uint32_t rokkit (const char *data, int len) {
uint32_t hash = len, tmp;
int rem;
if (len <= 0 || data == NULL) {
return 0;
}
rem = len & 3;
len >>= 2;
/* Main loop */
for (; len > 0; len--) {
hash += get16bits (data);
tmp = (get16bits (data + 2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2 * sizeof (uint16_t);
hash += hash >> 11;
}
/* Handle end cases */
switch (rem) {
case 3:
hash += get16bits (data);
hash ^= hash << 16;
hash ^= ((signed char)data[sizeof (uint16_t)]) << 18;
hash += hash >> 11;
break;
case 2:
hash += get16bits (data);
hash ^= hash << 11;
hash += hash >> 17;
break;
case 1:
hash += (signed char) * data;
hash ^= hash << 10;
hash += hash >> 1;
}
/* Force "avalanching" of final 127 bits */
hash ^= hash << 3;
hash += hash >> 5;
hash ^= hash << 4;
hash += hash >> 17;
hash ^= hash << 25;
hash += hash >> 6;
return hash;
}
uint32_t rokkitNew (const char *data, uint16_t len) {
union {
uint32_t h;
uint16_t b[2];
uint8_t a[4];
}
hash, tmp;
uint8_t rem;
uint16_t *p = (uint16_t *) data;
if (data == 0) {
return 0;
}
hash.h = len;
rem = len & 3;
len >>= 2;
/* Main loop */
while (len > 0) {
// hash += *((uint16_t*)data);
hash.h += *p;
p++;
//~ tmp.h = ((*p) << 11) ^ hash.h;
//~ hash.h = (hash.h << 16) ^ tmp.h;
// ==>
// hash = (hash << 16) ^ (*((uint16_t*)(data+2)) << 11) ^ hash;
// ==>
// hash ^= (hash << 16) ^ (*((uint16_t*)(data+2)) << 11);
tmp.h = hash.h;
tmp.b[1] = tmp.b[0]; // (hash << 16) === clean lower 16 bits!
tmp.b[0] = *p << 11; // (*((uint16_t*)(data+2)) << 11) === 16 bit operation!!
p++;
//~ hash.h ^= tmp.h;
hash.a[0] ^= tmp.a[0];
hash.a[1] ^= tmp.a[1];
hash.a[2] ^= tmp.a[2];
hash.a[3] ^= tmp.a[3];
//~ hash.h += hash.h >> 11;
tmp.a[0] = hash.a[1]; // shift 8
tmp.a[1] = hash.a[2];
tmp.a[2] = hash.a[3];
tmp.a[3] = 0;
hash.h += tmp.h >> 3;
// len--;
--len;
}
data = (char *) p;
/* Handle end cases */
switch (rem) {
case 3:
hash.h += * ((uint16_t *)data);
hash.h ^= hash.h << 16;
hash.h ^= ((signed char)data[2]) << 18;
hash.h += hash.h >> 11;
break;
case 2:
hash.h += * ((uint16_t *)data);
hash.h ^= hash.h << 11;
hash.h += hash.h >> 17;
break;
case 1:
hash.h += (signed char) * data;
hash.h ^= hash.h << 10;
hash.h += hash.h >> 1;
}
/* Force "avalanching" of final 127 bits */
hash.h ^= hash.h << 3;
hash.h += hash.h >> 5;
hash.h ^= hash.h << 4;
hash.h += hash.h >> 17;
hash.h ^= hash.h << 25;
hash.h += hash.h >> 6;
return hash.h;
}
uint32_t rokkitNew2 (const char *data, uint16_t len) {
union {
uint32_t h;
uint16_t b[2];
uint8_t a[4];
}
hash, tmp;
uint8_t rem;
uint16_t *p = (uint16_t *) data;
if (data == 0) {
return 0;
}
hash.h = len;
rem = len & 3;
len >>= 2;
/* Main loop */
while (len > 0) {
// hash += *((uint16_t*)data);
hash.h += *p;
p++;
// tmp = (*((uint16_t*)(data+2)) << 11) ^ hash;
// hash = (hash << 16) ^ tmp;
// ==>
// hash = (hash << 16) ^ (*((uint16_t*)(data+2)) << 11) ^ hash;
// ==>
// hash ^= (hash << 16) ^ (*((uint16_t*)(data+2)) << 11);
tmp.h = hash.h;
tmp.b[1] = tmp.b[0]; // shift 16
tmp.a[1] = ((*p) & 0xFF) << 3;
tmp.a[0] = 0;
// tmp.b[0] = (*p) << 11; // 16 bit operation!!
p++;
// hash.h ^= tmp.h;
// hash.a[0] ^= tmp.a[0]; // not needed as tmp.a[0] == 0
hash.a[1] ^= tmp.a[1];
hash.a[2] ^= tmp.a[2];
hash.a[3] ^= tmp.a[3];
// hash += hash >> 11;
tmp.a[0] = hash.a[1]; // shift 8
tmp.a[1] = hash.a[2];
tmp.a[2] = hash.a[3];
tmp.a[3] = 0;
hash.h += tmp.h >> 3;
// len--;
--len;
}
data = (char *) p;
/* Handle end cases */
switch (rem) {
case 3:
hash.h += * ((uint16_t *)data);
hash.h ^= hash.h << 16; // todo
hash.h ^= ((signed char)data[2]) << 18;
hash.h += hash.h >> 11; // todo
break;
case 2:
hash.h += * ((uint16_t *)data);
hash.h ^= hash.h << 11; // todo
hash.h += hash.h >> 17; // todo
break;
case 1:
hash.h += (signed char) * data;
hash.h ^= hash.h << 10; // todo
hash.h += hash.h >> 1;
}
/* Force "avalanching" of final 127 bits */
hash.h ^= hash.h << 3;
hash.h += hash.h >> 5;
hash.h ^= hash.h << 4;
// hash.h += hash.h >> 17; // todo
tmp.b[0] = hash.b[1] >> 1;
tmp.b[1] = 0;
hash.h += tmp.h;
// hash.h ^= hash.h << 25;
// tmp.a[3] = hash.a[0] << 1; // shift 25!!
// tmp.a[2] = 0;
// tmp.a[1] = 0;
// tmp.a[0] = 0;
hash.a[3] ^= hash.a[0] << 1; // shift 25!!;
hash.h += hash.h >> 6;
return hash.h;
}
// http://forum.arduino.cc/index.php?topic=226686.0
// FILE: rokkitHash.ino
// AUTHOR: Rob Tillaart
// VERSION: 0.1.00
// PURPOSE: optimizing rokkitHash
// DATE: 2014-03-18
// URL:
//
// original version of rokkithash is copyrighted LGPL 2.1 by Paul Hsieh
//
char data[] = "the quick brown fox jumps over the lazy dog.";
int main (void) {
uint32_t x;
printf ("Start ");
int le = strlen (data);
uint32_t start = micros();
for (int i = 0; i < 1000; i++) {
x = rokkit (data, le);
}
uint32_t stop = micros();
printf ("pre:\t");
printf ("%u\n", stop - start);
printf ("%0X\n", x);
start = micros();
for (int i = 0; i < 1000; i++) {
x = rokkitNew (data, le);
}
stop = micros();
printf ("post:\t");
printf ("%u\n", stop - start);
printf ("%0X\n", x);
start = micros();
for (int i = 0; i < 1000; i++) {
x = rokkitNew2 (data, le);
}
stop = micros();
printf ("post:\t");
printf ("%u\n", stop - start);
printf ("%0X\n", x);
}
BTW, I think have tracked down the issue to the following lines:
// hash ^= (hash << 16) ^ (*((uint16_t*)(data+2)) << 11);
tmp.h = hash.h;
tmp.b[1] = tmp.b[0]; // (hash << 16) === clean lower 16 bits!
tmp.b[0] = (*p) << 11; // (*((uint16_t*)(data+2)) << 11) === 16 bit operation!!
While the equivalence stands, it seems you are treating the XOR on the right side as an OR, i.e. tmp.b[1] should get XORed with the bits that overflow tmp.b[0] when shifting. So I've changed the code to:
tmp.h = *p << 11;
tmp.b[1] ^= hash.b[0];
This makes it yield the same result as the original code on PC, while it doesn't change the behaviour on Arduino (i.e.: I still get 0x27E337FA from all the different pieces of code!). The slowdown is barely noticeable: 80532 -> 81724 for rokkitNew(), 60904 -> 64368 for rokkitNew2().
So, to summarize: I am now getting the same hash from all the three pieces of code, but said hash is different between Arduino and PC.
Still I can't understand why the original code behaves differently on Arduino!
OK, got to the point! This fixes the original code:
tmp = ((uint32_t) (*((uint16_t*)(data+2))) << 11) ^ hash;
To make a long story short, the C standard states that the shift operator's operands must be promoted to (unsigned) int, which is 32 bits wide on PC and 16 on Arduino. This results in the different behaviour, since part of the result gets truncated on Arduino. Now go figure what was Paul Hsieh's original intention...
Rob's improved code can be fixed like this:
tmp.h = ((uint32_t) *p) << 11;
tmp.b[1] ^= hash.b[0];
Anyway, This slows down the original code considerably:
Start
pre: 171584
9CA751EB
post: 136116
9CA751EB
post: 116248
9CA751EB
All hashes are now the same, job done? No! More code needs fixing:
/home/sukko/Development/arduino/Rokkit/Rokkit.ino: In function 'uint32_t rokkitNew(const char*, uint16_t)':
/home/sukko/Development/arduino/Rokkit/Rokkit.ino:122:40: warning: left shift count >= width of type [enabled by default]
hash.h ^= ((signed char)data[2]) << 18;
This line of the "end cases" is never reached with Rob's test string, so I will have to find a string that triggers it and compare what happens on Arduino and PC, sigh...
Hi,
you did a lot of good work. Can you post the fixed version of the function so I can compare (and understand)
Of course. I plan to fork and improve the OP's library. I am going to be on holidays for a couple days, so it will probably come sometime next week.
So, here's the library with my modifications: GitHub - SukkoPera/Arduino-Rokkit-Hash: Arduino port for Paul Hsieh's "SuperFastHash". Great alternative for CRC/hashing applications, performs 4x faster than classic CRC32
I have also included a function to hash flash strings (might be useful from time to time) and the library is now in 1.5 format.
I have some more things to add as time permits: incremental hash calculation, for instance.
Note that the library also compiles as-is under Linux.
Speed, as measured here with Arduino 1.6.6 on a Micro, is:
RAM: 115704 9CA751EB
Flash: 121772 9CA751EB