joogaa:
Time in microsec
/ is the compiler division
rdiv8u is the Russian Peasant Division
div8u is the AVR200b division
8 bit division
/ time = 140688 rdiv8u time = 41108 div8u time = 88068
rdiv16u is the Russian Peasant Division
div16u is the AVR200b division
16 bit division
rdiv16u time = 324 / time = 720 div16u time = 996
I did not benchmark the 32 bit division because, with my current compiler version, the result of dividing two 32 bit numbers is incorrect: the compiler only uses the first 16 bits of each. Does this happen in the latest version? If so, then that explains the slower results.
I've written an extended version of the benchmark posted above.
The rdiv32u() function seems to work fine; I could not find any difference from the results using '/' for division of 'long' variables. The test of whether the results are the same is also included in the benchmark sketch.
This is the Arduino code I used:
// benchmark test for the Russian peasant multiplication
#include <Arduino.h>
#include "rmulfun.h"
#include "rdivfun.h"
// Always 0, but declared volatile so that every use is a real run-time read:
// adding it to each range bound keeps the compiler from treating the bounds
// (and the arithmetic performed on them) as compile-time constants.
volatile int dummy=0;
// Multiplication ranges: each factor runs over [LOW, HIGH), i.e. 500 values,
// giving 500 * 500 = 250000 products per test.
#define LOWFACTOR1 (1234L+dummy)
#define HIGHFACTOR1 (LOWFACTOR1+500)
#define LOWFACTOR2 (4567L+dummy)
#define HIGHFACTOR2 (LOWFACTOR2+500)
// Division ranges: dividend and divisor each run over [LOW, HIGH), 500 values,
// giving 250000 divisions per test.
#define LOWDIVIDEND (123456789L+dummy)
#define HIGHDIVIDEND (LOWDIVIDEND+500)
#define LOWDIVISOR (2345678L+dummy)
// Alternative: a small divisor, which makes the quotients much larger.
// Enable this line instead of the LOWDIVISOR definition above to try it.
//#define LOWDIVISOR (7L+dummy)
#define HIGHDIVISOR (LOWDIVISOR+500)
// Cross-checks rmul16u() against the built-in '*' operator for every pair
// (i, j) with i in [loFactor1, hiFactor1) and j in [loFactor2, hiFactor2),
// then prints the number of matching and mismatching products, plus a
// PASSED/ERROR verdict, to Serial.
//
// NOTE(review): rmul16u() is declared in rmulfun.h, which is not visible here.
// Presumably it takes unsigned 16-bit factors, in which case factors above
// 65535 would be narrowed at the call -- confirm before testing such ranges.
void compareMulError(uint32_t loFactor1, uint32_t hiFactor1, uint32_t loFactor2, uint32_t hiFactor2)
{
  long sameresult=0,errors=0;
  // The counters use the same unsigned 32-bit type as the bounds. With plain
  // 'int' counters on a target whose int is 16 bits (e.g. AVR-based boards),
  // a bound of 32768 or more became negative when stored in the counter, the
  // unsigned comparison then failed immediately, and the test reported
  // "PASSED OK" without having compared a single product.
  for (uint32_t i=loFactor1;i<hiFactor1;i++)
  {
    for (uint32_t j=loFactor2;j<hiFactor2;j++)
    {
      uint32_t a = rmul16u(i, j);
      // Reference product computed in unsigned 32-bit arithmetic, matching
      // the type of 'a' (no signed/unsigned comparison).
      if (a== i*j) sameresult++;
      else errors++;
    }
  }
  Serial.print("Multiplication Same Result: ");Serial.println(sameresult);
  Serial.print("Multiplication Error Count: ");Serial.println(errors);
  if (errors==0) Serial.println("Multiplication Test PASSED OK");
  else Serial.println("Multiplication Test ERROR");
  Serial.println();
}
// Cross-checks rdiv32u() against the built-in '/' operator for every pair
// (dividend, divisor) with dividend in [loDividend, hiDividend) and divisor in
// [loDivisor, hiDivisor), then prints the number of matching and mismatching
// quotients, plus a PASSED/ERROR verdict, to Serial.
void compareDivError(uint32_t loDividend, uint32_t hiDividend, uint32_t loDivisor, uint32_t hiDivisor)
{
  long matches = 0;
  long mismatches = 0;
  for (uint32_t dividend = loDividend; dividend < hiDividend; dividend++)
  {
    for (uint32_t divisor = loDivisor; divisor < hiDivisor; divisor++)
    {
      // Only the first output of rdiv32u() is verified here.
      // NOTE(review): the second output is presumably the remainder -- confirm
      // against rdivfun.h; it is not checked by this test.
      uint32_t quotient, secondOutput;
      rdiv32u(&quotient, &secondOutput, dividend, divisor);
      const uint32_t expected = dividend / divisor;
      if (quotient != expected)
      {
        mismatches++;
        continue;
      }
      matches++;
    }
  }
  Serial.print("Division Same Result: ");
  Serial.println(matches);
  Serial.print("Division Error Count: ");
  Serial.println(mismatches);
  if (mismatches != 0)
  {
    Serial.println("Division Test ERROR");
  }
  else
  {
    Serial.println("Division Test PASSED OK");
  }
  Serial.println();
}
// Benchmark body for rmul16u(): multiplies every pair of factors from
// [loFactor1, hiFactor1) x [loFactor2, hiFactor2) and returns the sum of
// bit 0 and bit 31 of each product. The returned sum makes every product
// contribute to a value the caller uses (presumably so the optimizer cannot
// drop the multiplications).
long testRussMul(uint32_t loFactor1, uint32_t hiFactor1, uint32_t loFactor2, uint32_t hiFactor2)
{
  long bitSum = 0;
  for (int first = loFactor1; first < hiFactor1; first++)
  {
    for (int second = loFactor2; second < hiFactor2; second++)
    {
      const uint32_t product = rmul16u(first, second);
      bitSum += bitRead(product, 0) + bitRead(product, 31);
    }
  }
  return bitSum;
}
// Benchmark body for the compiler's own '*': multiplies every pair of factors
// from [loFactor1, hiFactor1) x [loFactor2, hiFactor2) as 'long' and returns
// the sum of bit 0 and bit 31 of each product, mirroring testRussMul() so both
// benchmarks do the same bookkeeping around the multiplication.
long testArduMul(uint32_t loFactor1, uint32_t hiFactor1, uint32_t loFactor2, uint32_t hiFactor2)
{
  long bitSum = 0;
  for (int first = loFactor1; first < hiFactor1; first++)
  {
    for (int second = loFactor2; second < hiFactor2; second++)
    {
      // Widen the first factor so the multiplication is done in 'long'.
      const uint32_t product = static_cast<long>(first) * second;
      bitSum += bitRead(product, 0) + bitRead(product, 31);
    }
  }
  return bitSum;
}
// Benchmark body for rdiv32u(): divides every dividend in
// [loDividend, hiDividend) by every divisor in [loDivisor, hiDivisor) and
// returns the sum of bit 0 and bit 31 of each first output value, so that
// every division contributes to a value the caller uses.
long testRussDiv(uint32_t loDividend, uint32_t hiDividend, uint32_t loDivisor, uint32_t hiDivisor)
{
  long bitSum = 0;
  for (uint32_t dividend = loDividend; dividend < hiDividend; dividend++)
  {
    for (uint32_t divisor = loDivisor; divisor < hiDivisor; divisor++)
    {
      // The second output of rdiv32u() is required by the call but unused here.
      uint32_t quotient, secondOutput;
      rdiv32u(&quotient, &secondOutput, dividend, divisor);
      bitSum += bitRead(quotient, 0) + bitRead(quotient, 31);
    }
  }
  return bitSum;
}
// Benchmark body for the compiler's own '/': divides every dividend in
// [loDividend, hiDividend) by every divisor in [loDivisor, hiDivisor) and
// returns the sum of bit 0 and bit 31 of each quotient, mirroring
// testRussDiv() so both benchmarks do the same bookkeeping.
long testArduDiv(uint32_t loDividend, uint32_t hiDividend, uint32_t loDivisor, uint32_t hiDivisor)
{
  long bitSum = 0;
  for (uint32_t dividend = loDividend; dividend < hiDividend; dividend++)
  {
    for (uint32_t divisor = loDivisor; divisor < hiDivisor; divisor++)
    {
      const uint32_t quotient = dividend / divisor;
      bitSum += bitRead(quotient, 0) + bitRead(quotient, 31);
    }
  }
  return bitSum;
}
// Prints one benchmark line to Serial in the form
//   "Test: <testname> Result: <result> Time: <time>"
// with the time shown to six decimal places.
//
// testname  label of the benchmark; taken as 'const char*' because callers
//           pass string literals, which must not be bound to a non-const
//           'char*' (ill-formed since C++11, a warning with -Wwrite-strings).
//           Callers holding a mutable 'char*' still work unchanged.
// result    value returned by the benchmark function.
// time      elapsed time as computed by the caller (setup() passes seconds).
void showResult(const char* testname, long result, float time)
{
  Serial.print("Test: ");
  Serial.print(testname);
  Serial.print(" Result: ");
  Serial.print(result);
  Serial.print(" Time: ");
  Serial.println(time,6);
}
// Runs the whole test sequence once: first the multiplication correctness
// check and the two multiplication benchmarks, then the same for division.
// Each benchmark is timed with micros() and reported in seconds.
void setup()
{
  Serial.begin(9600);

  // --- Multiplication ---
  Serial.println("\nStart Multiplication Compare Test:");
  compareMulError(LOWFACTOR1, HIGHFACTOR1, LOWFACTOR2, HIGHFACTOR2);
  Serial.println("Start Multiplication Benchmark Test:");
  delay(100); // give the pending serial output time to go out before timing

  unsigned long startedAt = micros();
  long checksum = testRussMul(LOWFACTOR1, HIGHFACTOR1, LOWFACTOR2, HIGHFACTOR2);
  unsigned long elapsedUs = micros() - startedAt;
  showResult("Russian MUL", checksum, elapsedUs / 1000000.0); // microseconds -> seconds
  delay(100); // give the pending serial output time to go out before timing

  startedAt = micros();
  checksum = testArduMul(LOWFACTOR1, HIGHFACTOR1, LOWFACTOR2, HIGHFACTOR2);
  elapsedUs = micros() - startedAt;
  showResult("Arduino MUL", checksum, elapsedUs / 1000000.0);
  delay(100); // give the pending serial output time to go out

  // --- Division ---
  Serial.println("\nStart Division Compare Test:");
  compareDivError(LOWDIVIDEND, HIGHDIVIDEND, LOWDIVISOR, HIGHDIVISOR);
  Serial.println("Start Division Benchmark Test:");
  delay(100); // give the pending serial output time to go out before timing

  startedAt = micros();
  checksum = testRussDiv(LOWDIVIDEND, HIGHDIVIDEND, LOWDIVISOR, HIGHDIVISOR);
  elapsedUs = micros() - startedAt;
  showResult("Russian DIV", checksum, elapsedUs / 1000000.0);
  delay(100); // give the pending serial output time to go out before timing

  startedAt = micros();
  checksum = testArduDiv(LOWDIVIDEND, HIGHDIVIDEND, LOWDIVISOR, HIGHDIVISOR);
  elapsedUs = micros() - startedAt;
  showResult("Arduino DIV", checksum, elapsedUs / 1000000.0);
  delay(100); // give the pending serial output time to go out
}
// Intentionally empty: all of this sketch's work is done in setup().
void loop() {}
The output created from this code is:
Start Multiplication Compare Test:
Multiplication Same Result: 250000
Multiplication Error Count: 0
Multiplication Test PASSED OK
Start Multiplication Benchmark Test:
Test: Russian MUL Result: 62500 Time: 4.666188
Test: Arduino MUL Result: 62500 Time: 1.970244
Start Division Compare Test:
Division Same Result: 250000
Division Error Count: 0
Division Test PASSED OK
Start Division Benchmark Test:
Test: Russian DIV Result: 0 Time: 6.368096
Test: Arduino DIV Result: 0 Time: 10.360857
So if the result of the divisions is a relatively small number, the rdiv32u() function can be faster than the '/' division.
BUT: If the resulting number is bigger, e.g. after changing the divisor to a small number like:
#define LOWDIVISOR (7L+dummy)
then the benchmark result changes to:
Start Multiplication Compare Test:
Multiplication Same Result: 250000
Multiplication Error Count: 0
Multiplication Test PASSED OK
Start Multiplication Benchmark Test:
Test: Russian MUL Result: 62500 Time: 4.666188
Test: Arduino MUL Result: 62500 Time: 1.970244
Start Division Compare Test:
Division Same Result: 250000
Division Error Count: 0
Division Test PASSED OK
Start Division Benchmark Test:
Test: Russian DIV Result: 123512 Time: 13.541772
Test: Arduino DIV Result: 123512 Time: 10.699304
While the time for the Arduino '/' division stays nearly the same, not depending much on how big the resulting number is (10.360857 seconds vs. 10.699304 seconds), the calculation time with the rdiv32u() division will depend very much on the dividend, divisor and result calculated: While the calculation is fast when the resulting number is small (6.368096 seconds), the time will increase while the division result is increasing (13.541772 seconds).
BTW: When developing the benchmark code I had to do some extra coding to make the Arduino compiler really calculate at execution time, and to prevent it from computing the results of the '/' divisions at compile time and embedding only the final result in the program. Without that, the measured timing would claim that 250000 'long' divisions need just 4 microseconds.