UPDATE Feb 24, 2021: the post is updated with an even better function, yielding a 14x speed increase over the typical approach! Thanks @westfw for the idea
I did some quick experiments that I wanted to share and archive in case others find it useful. I wanted to find a fast way of finding the gcd between two numbers. I compared running 3 4 functions on the same numbers 10,000 times each to find the average execution time on an Uno. The three four approaches:
Standard Euclidean Algorithm using recursion
Standard Euclidean Algorithm using a while loop
Binary GCD Algorithm (from here). This is a clever approach because it uses a gcc builtin: __builtin_ctz(). That built-in counts the number of trailing zeros in a binary number. That makes shifting out the trailing zeros in the Binary GCD super efficient.
UPDATE: a binary GCD algorithm using fewer __builtin_ctz() because AVRs don't have an specific instruction to help us out with that.
All three four are for unsigned longs (because my application needs that dynamic range), but you can change them to fit your need. If you are just using ints, change the variable types and change __builtin_ctzl() to __builtin_ctz()
unsigned long gcd_recurse(unsigned long a, unsigned long b) {
if (b == 0) return a;
if (a == 0) return b;
if (b > a) {
return gcd_recurse(a, b % a);
}
return gcd_recurse(b, a % b);
}
unsigned long gcd_while(unsigned long a, unsigned long b) {
if (b > a) {
unsigned long tmp = b;
b = a;
a = tmp;
}
while (b != 0) {
unsigned long r = a % b;
a = b;
b = r;
}
return a;
}
unsigned long gcd_ctz(unsigned long u, unsigned long v) {
int shift;
if (u == 0) return v;
if (v == 0) return u;
shift = __builtin_ctzl(u | v);
u >>= __builtin_ctzl(u);
do {
v >>= __builtin_ctzl(v);
if (u > v) {
unsigned long t = v;
v = u;
u = t;
}
v = v - u;
} while (v != 0);
return u << shift;
}
unsigned long gcd_ctz_manual(unsigned long u, unsigned long v) {
int shift;
if (u == 0) return v;
if (v == 0) return u;
shift = __builtin_ctzl(u | v);
/*
// This is actually 20 usecs (40%) slower than using __builtin_ctzl to find shift
while ( !((u|v) & 0b1L) ) {
u >>= 1;
v >>= 1;
shift++;
}
*/
/*
// this is 10 usecs slower than using the builtin to find shift
for (shift = 0; !((u|v) & 0b1L); shift++) {
u >>= 1;
v >>= 1;
}
*/
while ( !(u & 0b1L)) u >>= 1; // manual u >>= __buitin_ctzl(u)
while (v != 0) {
while ( !(v & 0b1L)) v >>= 1;
if (u > v) {
unsigned long t = v;
v = u;
u = t;
}
v = v - u;
}
return u << shift;
}
Results (for 10,000 pairs of random unsigned longs, the same data sent to each method):
Recursive = 679.20 microseconds per call
While loop = 668.00 microseconds per call
The Binary GCD approach (using the GCC built-in) = 105.90 microseconds per call
Binary GCD (using a mix of manual and built-in cvt, as needed for AVRs) = 49.20 microseconds per call
Since I didn't know about the binary GCD algorithm, and the speed improvement was quite good amazing, I thought it would be good to share that here in case it's useful for others.
Interesting. Have you looked at the object code produced?
The AVR doesn't have any special instructions that would help with __builtin_ctzl(), so I'm not entirely convinced that you couldn't make even faster code by manually combining the ctz counting with the shifting...
Thanks for the idea - I haven't compared the actual instructions yet. I probably should just compile the function alone w/o the Arduino.h stuff and see what the assembly language is doing.
looked through the GCC source code and found some of the built-in stuff for i386, but I couldn't really figure out how it was doing it in AVR. So if it isn't actually doing anything special (since there are no instructions to help), you're right that we could push this even faster. I'll take a look at that, could be fun to learn what's going on under the hood here.
Practically speaking, when using "realistic" numbers for my project (finding PLL divider multiples), I'm averaging 20-30 microseconds per function call because there are lots of trailing zeros.
westfw:
Interesting. Have you looked at the object code produced?
The AVR doesn't have any special instructions that would help with __builtin_ctzl(), so I'm not entirely convinced that you couldn't make even faster code by manually combining the ctz counting with the shifting...
You are totally correct!!! You get another 2x speed increase doing it manually!
i'll update the original post to include this new way of doing it. Here are the new results:
Number of iterations: 10000
Recursive calls: 677.50 microseconds per call
While loop: 666.30 microseconds per call
CVTL Built-in: 105.30 microseconds per call
Manual shift + cvtl: 49.20 microseconds per call
so a 14x speed up compared to the recursive method.
here's the complete code I used for testing:
#define SPOTCHECK false // set to true for debugging/spot checking values
void setup() {
// put your setup code here, to run once:
Serial.begin(9600);
}
float cumA = 0, cumB = 0, cumC = 0, cumD = 0;
int totiters = 0;
void loop() {
// put your main code here, to run repeatedly:
int iters = 100;
totiters += iters;
unsigned long timer1, timer2;
unsigned long data1[iters];
unsigned long data2[iters];
for (int i = 0; i < iters; i++) {
data1[i] = random();
data2[i] = random();
}
if (SPOTCHECK) {
Serial.println("Spot checking:");
Serial.println(data1[0]);
Serial.println(data2[0]);
Serial.println(gcd_ctz_manual(data1[0], data2[0]));
Serial.println(gcd_ctz(data1[0], data2[0]));
Serial.println(gcd_recurse(data1[0], data2[0]));
Serial.println(gcd_while(data1[0], data2[0]));
}
Serial.println("--------------");
Serial.print("Number of iterations: ");
Serial.println(totiters);
timer1 = millis();
for (int i = 0; i < iters; i++) gcd_recurse(data1[i], data2[i]);
timer2 = millis();
Serial.print("Recursive calls: ");
cumA += timer2 - timer1;
Serial.print(1000.0 * cumA / totiters);
Serial.println(" microseconds per call");
timer1 = millis();
for (int i = 0; i < iters; i++) gcd_while(data1[i], data2[i]);
timer2 = millis();
cumB += timer2 - timer1;
Serial.print("While loop: ");
Serial.print(1000.0 * cumB / totiters);
Serial.println(" microseconds per call");
timer1 = millis();
for (int i = 0; i < iters; i++) gcd_ctz(data1[i], data2[i]);
timer2 = millis();
cumC += (timer2 - timer1);
Serial.print("CVTL Built-in: ");
Serial.print(1000.0 * cumC / totiters);
Serial.println(" microseconds per call");
timer1 = millis();
for (int i = 0; i < iters; i++) gcd_ctz_manual(data1[i], data2[i]);
timer2 = millis();
cumD += (timer2 - timer1);
Serial.print("Manual shift + cvtl: ");
Serial.print(1000.0 * cumD / totiters);
Serial.println(" microseconds per call");
}
unsigned long gcd_recurse(unsigned long a, unsigned long b) {
if (b == 0) return a;
if (a == 0) return b;
if (b > a) {
return gcd_recurse(a, b % a);
}
return gcd_recurse(b, a % b);
}
unsigned long gcd_while(unsigned long a, unsigned long b) {
if (b > a) {
unsigned long tmp = b;
b = a;
a = tmp;
}
while (b != 0) {
unsigned long r = a % b;
a = b;
b = r;
}
return a;
}
unsigned long gcd_ctz(unsigned long u, unsigned long v) {
int shift;
if (u == 0) return v;
if (v == 0) return u;
shift = __builtin_ctzl(u | v);
u >>= __builtin_ctzl(u);
do {
v >>= __builtin_ctzl(v);
if (u > v) {
unsigned long t = v;
v = u;
u = t;
}
v = v - u;
} while (v != 0);
return u << shift;
}
unsigned long gcd_ctz_manual(unsigned long u, unsigned long v) {
int shift;
if (u == 0) return v;
if (v == 0) return u;
shift = __builtin_ctzl(u | v);
/*
// This is actually 20 usecs (40%) slower than using __builtin_ctzl to find shift
while ( !((u|v) & 0b1L) ) {
u >>= 1;
v >>= 1;
shift++;
}
*/
/*
// this is 10 usecs slower than using the builtin to find shift
for (shift = 0; !((u|v) & 0b1L); shift++) {
u >>= 1;
v >>= 1;
}
*/
while ( !(u & 0b1L)) u >>= 1; // manual u >>= __buitin_ctzl(u)
while (v != 0) {
while ( !(v & 0b1L)) v >>= 1;
if (u > v) {
unsigned long t = v;
v = u;
u = t;
}
v = v - u;
}
return u << shift;
}