A small test of different methods (IDE 21, Atmel 328, Windows 7):
void setup()
{
  // Open the serial port that loop() uses to report the measured time.
  const long baudRate = 115200;
  Serial.begin(baudRate);
}
void loop()
{
  // Benchmark: reverse the bits of every byte value twice (512 flips in
  // total) using 32-bit multiply/mask arithmetic, then print the elapsed
  // time in microseconds.
  const unsigned long t0 = micros();
  for (int n = 0; n < 256; ++n)
  {
    // volatile forces every read and write of v to really take place
    volatile uint8_t v = n;
    v = ((((v * 0x0802LU) & 0x22110LU) | ((v * 0x8020LU) & 0x88440LU)) * 0x10101LU) >> 16;
    v = ((((v * 0x0802LU) & 0x22110LU) | ((v * 0x8020LU) & 0x88440LU)) * 0x10101LU) >> 16;
  }
  const unsigned long elapsed = micros() - t0;
  Serial.println(elapsed);

  // Measure once only: park here forever.
  while (true) { }
}
Size: 2758 bytes
Time: 6408 usec (512 flips) => 12.5 usec per flip = 200 clock cycles / flip
void setup()
{
  // Open the serial port that loop() uses to report the measured time.
  const long baudRate = 115200;
  Serial.begin(baudRate);
}
void loop()
{
  // Benchmark: reverse the bits of every byte value twice (512 flips in
  // total) using a 64-bit multiply, a mask and a modulo by 1023, then
  // print the elapsed time in microseconds.
  const unsigned long t0 = micros();
  for (int n = 0; n < 256; ++n)
  {
    // volatile forces every read and write of v to really take place
    volatile uint8_t v = n;
    v = ((v * 0x0202020202ULL) & 0x010884422010ULL) % 1023;
    v = ((v * 0x0202020202ULL) & 0x010884422010ULL) % 1023;
  }
  const unsigned long elapsed = micros() - t0;
  Serial.println(elapsed);

  // Measure once only: park here forever.
  while (true) { }
}
Size: 6872 bytes
Time: 151804 usec (512 flips) => 296.5 usec per flip = 4744 clock cycles / flip
Although the formula is shorter, the use of the ULL datatype in combination with modulo makes this far bigger and slower.
void setup()
{
  // Open the serial port that loop() uses to report the measured time.
  const long baudRate = 115200;
  Serial.begin(baudRate);
}
// Big lookup table: x[i] is the byte i with its eight bits in reversed
// order (for example x[1] == 128). Laid out as 16 rows of 16 entries.
uint8_t x[256] = {
    0, 128,  64, 192,  32, 160,  96, 224,  16, 144,  80, 208,  48, 176, 112, 240,
    8, 136,  72, 200,  40, 168, 104, 232,  24, 152,  88, 216,  56, 184, 120, 248,
    4, 132,  68, 196,  36, 164, 100, 228,  20, 148,  84, 212,  52, 180, 116, 244,
   12, 140,  76, 204,  44, 172, 108, 236,  28, 156,  92, 220,  60, 188, 124, 252,
    2, 130,  66, 194,  34, 162,  98, 226,  18, 146,  82, 210,  50, 178, 114, 242,
   10, 138,  74, 202,  42, 170, 106, 234,  26, 154,  90, 218,  58, 186, 122, 250,
    6, 134,  70, 198,  38, 166, 102, 230,  22, 150,  86, 214,  54, 182, 118, 246,
   14, 142,  78, 206,  46, 174, 110, 238,  30, 158,  94, 222,  62, 190, 126, 254,
    1, 129,  65, 193,  33, 161,  97, 225,  17, 145,  81, 209,  49, 177, 113, 241,
    9, 137,  73, 201,  41, 169, 105, 233,  25, 153,  89, 217,  57, 185, 121, 249,
    5, 133,  69, 197,  37, 165, 101, 229,  21, 149,  85, 213,  53, 181, 117, 245,
   13, 141,  77, 205,  45, 173, 109, 237,  29, 157,  93, 221,  61, 189, 125, 253,
    3, 131,  67, 195,  35, 163,  99, 227,  19, 147,  83, 211,  51, 179, 115, 243,
   11, 139,  75, 203,  43, 171, 107, 235,  27, 155,  91, 219,  59, 187, 123, 251,
    7, 135,  71, 199,  39, 167, 103, 231,  23, 151,  87, 215,  55, 183, 119, 247,
   15, 143,  79, 207,  47, 175, 111, 239,  31, 159,  95, 223,  63, 191, 127, 255
};
void loop()
{
  // Benchmark: reverse the bits of every byte value twice (512 flips in
  // total) by indexing the 256-entry table x, then print the elapsed
  // time in microseconds.
  const unsigned long t0 = micros();
  for (int n = 0; n < 256; ++n)
  {
    // volatile forces every read and write of v to really take place
    volatile uint8_t v = n;
    v = x[v];
    v = x[v];
  }
  const unsigned long elapsed = micros() - t0;
  Serial.println(elapsed);

  // Measure once only: park here forever.
  while (true) { }
}
Size: 2804 bytes
Time: 436 usec (512 flips) => 0.85 usec per flip = 13.6 clock cycles / flip
Really fast, as expected, but not the smallest footprint so far.
void setup()
{
  // Open the serial port that loop() uses to report the measured time.
  const long baudRate = 115200;
  Serial.begin(baudRate);
}
// Small lookup table working at the 4-bit level instead of the 8-bit level:
// xx[i] is the nibble i with its four bits in reversed order.
// Note the pattern: each entry in the second row is the entry above it plus 1.
uint8_t xx[16] = {
  0,  8,  4, 12,  2, 10,  6, 14,
  1,  9,  5, 13,  3, 11,  7, 15
};
void loop()
{
  // Benchmark: reverse the bits of every byte value twice (512 flips in
  // total) by looking up each nibble in the 16-entry table xx and swapping
  // the two halves, then print the elapsed time in microseconds.
  const unsigned long t0 = micros();
  for (int n = 0; n < 256; ++n)
  {
    // volatile forces every read and write of v to really take place
    volatile uint8_t v = n;
    v = (xx[v & 0x0F] << 4) | xx[v >> 4];
    v = (xx[v & 0x0F] << 4) | xx[v >> 4];
  }
  const unsigned long elapsed = micros() - t0;
  Serial.println(elapsed);

  // Measure once only: park here forever.
  while (true) { }
}
Size: 2624 bytes
Time: 1468 usec (512 flips) => 2.86 usec per flip = 46 clock cycles / flip
Three times slower than the fastest, but still second fastest.
At 2624 bytes, the smallest footprint so far.
void setup()
{
  // Open the serial port that loop() uses to report the measured time.
  const long baudRate = 115200;
  Serial.begin(baudRate);
}
void loop()
{
  // Benchmark: reverse the bits of every byte value twice (512 flips in
  // total) with a bit-by-bit loop, then print the elapsed time in
  // microseconds.
  unsigned long start = micros();
  for (int j=0; j<256; j++)
  {
    // volatile forces every read and write of these two to really take place
    volatile uint8_t value = j;
    volatile uint8_t ret = 0;
    // first flip: bit i of value becomes bit (7 - i) of ret
    for (int i = 0; i < 8; i++)
      if ((value & static_cast<uint8_t>(1 << i)) != 0) ret += static_cast<uint8_t>(1 << (7 - i));
    value = ret;
    // Reset the accumulator; otherwise the second flip would be added on top
    // of the first result instead of producing the reversal of value.
    ret = 0;
    // second flip: reverses the result again, giving back j
    for (int i = 0; i < 8; i++)
      if ((value & static_cast<uint8_t>(1 << i)) != 0) ret += static_cast<uint8_t>(1 << (7 - i));
  }
  Serial.println(micros() - start);
  while (1);  // measure once only, then halt here
}
Size: 2676 bytes
Time: 14008 usec (512 flips) => 27.4 usec per flip = 438 clock cycles / flip
Good footprint, but relatively slow.
void setup()
{
  // Open the serial port that loop() uses to report the measured time.
  const long baudRate = 115200;
  Serial.begin(baudRate);
}
void loop()
{
  // Benchmark: reverse the bits of every byte value twice (512 flips in
  // total) with an unrolled chain of bit tests, then print the elapsed
  // time in microseconds.
  unsigned long start = micros();
  for (int j=0; j<256; j++)
  {
    // volatile forces every read and write of in to really take place
    volatile uint8_t in = j;
    uint8_t out = 0;
    // first flip: each input bit sets the mirrored output bit
    if (in & 0x01) out |= 0x80;
    if (in & 0x02) out |= 0x40;
    if (in & 0x04) out |= 0x20;
    if (in & 0x08) out |= 0x10;
    if (in & 0x10) out |= 0x08;
    if (in & 0x20) out |= 0x04;
    if (in & 0x40) out |= 0x02;
    if (in & 0x80) out |= 0x01;
    in = out;
    out = 0;
    // second flip: same chain applied to the reversed value
    if (in & 0x01) out |= 0x80;
    if (in & 0x02) out |= 0x40;
    if (in & 0x04) out |= 0x20;
    if (in & 0x08) out |= 0x10;
    if (in & 0x10) out |= 0x08;
    if (in & 0x20) out |= 0x04;
    if (in & 0x40) out |= 0x02;
    if (in & 0x80) out |= 0x01;
    // Store the second result into the volatile as well. Without this the
    // value of out is never used, so the compiler is free to drop the work
    // of the second flip and the measured time would be too optimistic.
    in = out;
  }
  Serial.println(micros() - start);
  while (1);  // measure once only, then halt here
}
Size: 2596 bytes
Time: 996 usec (512 flips) => 1.95 usec per flip = 32 clock cycles / flip
Smallest footprint, second in performance.
So the 256-byte lookup table is fastest, but the unrolled loop is very fast too and has a footprint about 200 bytes smaller.
All disclaimers apply.