Having been busy optimizing divisions lately,
I recalled that the PING sensor needs a division by 29 to convert a duration to centimetres. In fact the divisor should be 29.41176, but float divisions are slow, so an integer division is mostly used (which is also slow :).
In the code below I have written optimized integer versions of the conversion from duration to cm and to mm.
The code compares their performance.
//
// FILE: divide.ino
// AUTHOR: Rob Tillaart
// DATE: 2013-05-11
//
// PURPOSE: fast divide routines for PING sensors
//
// Timestamps read from micros() immediately before and after each timed loop in setup().
unsigned long start = 0;
unsigned long stop = 0;
// Sink for every conversion result computed in the timed loops.
// NOTE(review): declared volatile presumably so the compiler cannot discard the benchmark stores - confirm.
volatile unsigned long x;
// Runs the whole benchmark once.
//
// Six conversion variants are each executed 10000 times; the elapsed micros()
// difference divided by 10000.0 is printed with 4 decimals, i.e. the average
// time per conversion. Afterwards a table is printed for inputs 0, 100, ...,
// 99900 with the columns: input, input/29.41176, div29, PING2cm, PING2mm.
void setup()
{
  Serial.begin(115200);

  // Variant 1: float division by the exact divisor.
  Serial.print("1/29.4 \t");
  start = micros();
  for (unsigned long n = 0; n < 10000; ++n)
  {
    x = n / 29.41176;
  }
  stop = micros();
  Serial.println((stop - start) / 10000.0, 4);
  Serial.println("--------------");

  // Variant 2: float multiplication by the reciprocal.
  Serial.print("i*0.034\t");
  start = micros();
  for (unsigned long n = 0; n < 10000; ++n)
  {
    x = n * 0.034;
  }
  stop = micros();
  Serial.println((stop - start) / 10000.0, 4);
  Serial.println("--------------");

  // Variant 3: plain integer division by 29.
  Serial.print("1/29\t");
  start = micros();
  for (unsigned long n = 0; n < 10000; ++n)
  {
    x = n / 29;
  }
  stop = micros();
  Serial.println((stop - start) / 10000.0, 4);
  Serial.println("--------------");

  // Variant 4: shift-add approximation of /29.
  // This loop counts with a signed long, unlike the other timed loops.
  Serial.print("div29\t");
  start = micros();
  for (long n = 0; n < 10000; ++n)
  {
    x = div29(n);
  }
  stop = micros();
  Serial.println((stop - start) / 10000.0, 4);
  Serial.println("--------------");

  // Variant 5: shift-add approximation of *0.034 (result in cm).
  Serial.print("PING2cm\t");
  start = micros();
  for (unsigned long n = 0; n < 10000; ++n)
  {
    x = PING2cm(n);
  }
  stop = micros();
  Serial.println((stop - start) / 10000.0, 4);
  Serial.println("--------------");

  // Variant 6: shift-add approximation of *0.34 (result in mm).
  Serial.print("PING2mm\t");
  start = micros();
  for (unsigned long n = 0; n < 10000; ++n)
  {
    x = PING2mm(n);
  }
  stop = micros();
  Serial.println((stop - start) / 10000.0, 4);
  Serial.println("--------------");

  Serial.println();

  // Accuracy table: float reference next to the three integer routines.
  for (unsigned long duration = 0; duration < 100000UL; duration += 100)
  {
    Serial.print(duration);
    Serial.print('\t');
    Serial.print(duration / 29.41176);
    Serial.print('\t');
    Serial.print(div29(duration));
    Serial.print('\t');
    Serial.print(PING2cm(duration));
    Serial.print('\t');
    Serial.print(PING2mm(duration));
    Serial.println();
  }

  Serial.println();
  Serial.println("done");
}
// Intentionally empty: all benchmark work is done once in setup().
void loop() {}
/////////////////////////////////////////////////////////////////////////////
// Approximates in / 29 using only shifts and additions.
//
// The terms add up to
//   in * (2^-5 + 2^-9 + 2^-10 + 2^-12 + 2^-15 + 2^-16)  ~=  in * 0.03447
// Every shift truncates, so the result can be lower than the exact quotient
// (for example div29(29) yields 0).
uint32_t div29(uint32_t in)
{
  // 2^-5 and 2^-12; shifting this sum by 4 below also yields 2^-9 and 2^-16.
  const uint32_t base = (in >> 5) + (in >> 12);
  // Remaining terms 2^-10 and 2^-15.
  const uint32_t rest = (in >> 10) + (in >> 15);
  return base + (base >> 4) + rest;
}
// Converts a duration to centimetres: divide by 29.41176, i.e. multiply by 0.034,
// using only shifts and additions.
//
// Equivalent coefficient: 2^-5 + 2^-9 + 2^-11 + 2^-12 + 2^-14 ~= 0.0339966.
// The terms are summed at 16x scale and shifted down once at the end, so less
// is lost to truncation than when every term is shifted separately.
uint32_t PING2cm(uint32_t in)
{
  uint32_t acc = in >> 1;
  acc += in >> 5;
  acc += in >> 7;
  acc += in >> 8;
  acc += in >> 10;
  return acc >> 4;
}
// Converts a duration to millimetres: divide by 2.941176, i.e. multiply by 0.34,
// using only shifts and additions.
//
// Equivalent coefficient:
//   2^-2 + 2^-4 + 2^-6 + 2^-7 + 2^-8 + 2^-13 ~= 0.339966
uint32_t PING2mm(uint32_t in)
{
  // 2^-2 and 2^-4; shifting this sum by 4 below also yields 2^-6 and 2^-8.
  const uint32_t coarse = (in >> 2) + (in >> 4);
  // 2^-7 and 2^-13, summed at 64x scale before the final shift.
  const uint32_t fine = ((in >> 1) + (in >> 7)) >> 6;
  return coarse + (coarse >> 4) + fine;
}
output:
1/29.4 38.2200
i*0.034 17.2888
1/29 38.7452
div29 16.2968
PING2cm 14.9772
PING2mm 13.5316
conclusions:
- float multiplication is about twice as fast as division
- long int division is as slow as the float division
- optimized div29 is 6% faster than float multiplication
- PING2cm, which multiplies by 0.034 in integer math, is 15% faster than float multiplication
- PING2cm is 158% faster than the default 1/29
- PING2mm, which multiplies by 0.34 in integer math, is 27% faster than float multiplication and 186% faster than 1/29
Besides being faster, PING2cm is also more precise:
1/29 has an error of about 1.4% compared to 1/29.41176, whereas the PING2cm multiplier differs from 0.034 by only about 3E-6 (roughly 0.01% relative, before integer truncation).
In fact, as the measured duration covers the distance back and forth, the division should be by 58.823…
So the code becomes (the changes relative to the versions above are marked):
// Round-trip variant: converts a PING echo duration that covers the distance
// out and back to centimetres, i.e. approximates in / 58.82352 (in * 0.017)
// using only shifts and additions.
uint32_t PING2cm(uint32_t in)
{
  // Same shift-add sum as the one-way conversion; the final shift is 5
  // instead of 4, which folds in the extra division by two.
  uint32_t x = ((in >> 1) + (in >> 5) + (in >> 7) + (in >> 8) + (in >> 10)) >> 5;
  return x;
}
// Round-trip variant: converts a PING echo duration that covers the distance
// out and back to millimetres, i.e. approximates in / 5.882352 (in * 0.17)
// using only shifts and additions.
uint32_t PING2mm(uint32_t in)
{
  // One-way conversion first (divide by 2.941176 == * 0.34):
  // uint32_t x = (in >> 2) + (in>>4) + (in>>6) + (in>>7) + (in>>8) + (in>>13);
  uint32_t x = (in >> 2) + (in >> 4);
  x = x + (x >> 4);
  uint32_t t = ((in >> 1) + (in >> 7)) >> 6;
  // Changed relative to the one-way version: halve the result because the
  // duration covers the distance twice.
  return (x + t) >> 1;
}