Why is the Zero slower than the Uno?

John_S · August 30, 2016, 3:59pm

Hi all,
I have a SparkFun SAMD21 dev board and have been playing around with it. I have a sketch (attached below) that calculates the sunrise and sunset times for a given date and time. It is almost exclusively floating-point math. I've run it on the Uno, the Zero-based board, and the Due; of all those, the Zero is the slowest. I've also changed the "float"s to "double"s, and the time difference isn't that significant. The results are:

Uno using float: 4148 microseconds
Uno using double: 4148 microseconds (same time, because the Uno does not do doubles)

Zero using float: 6109 microseconds
Zero using double: 6276 microseconds

Due using float: 1543 microseconds
Due using double: 1586 microseconds

So, my questions are: why is the Zero the slowest, and why doesn't float vs. double make much difference in the calculation time on the 32-bit boards?

// ---- Inputs: calendar date, time of day and observer position ----
int Year = 2016, Month = 8, Day = 13;
double UTC = 12.0;         // hours; setup() divides it by 24 for the day number
double Lat = 43.69;        // degrees; setup() converts with /(180/PI) before trig calls
double Lon = -79.45;       // degrees; setup() divides it by 15 to get hours
double UTC_offset = -4.0;  // hours; added to the rise and set times


// ---- micros() stamps taken before and after the calculation ----
unsigned long timer1, timer2;

// ---- Working values; setup() assigns the ones it uses ----
long julian;               // not referenced by setup()
double j;                  // day number computed from Year/Month/Day/UTC
double obl, N, i, w, a, e, M, L, E;
double xv, yv, v, r, lonsun;
double xs, ys, zs;         // zs is never assigned (only mentioned in a comment)
double xe, ye, ze;
double RA, Dec;
double GMST0, GMST, LST, HA;
double x, y, z;
double xhor, yhor, zhor;
double azim, alt;
double h = -0.833;         // setup() assigns the same value again before using it
double UT_Sun_in_south, LHA;
double Rise_time, Set_time;
double d0 = 1919.26;       // only referenced from commented-out code
double Apparent_Diameter;  // only referenced from commented-out code

void setup() {
  // put your setup code here, to run once:
Serial.begin(9600);
timer1 = micros();
j= 367UL*Year -(7*(Year+((Month+9)/12))/4)+((275UL*Month)/9)+ Day - 730530UL + UTC/24.0;
obl = 23.4393 - 0.0000003563 * j;
N = 0;
i = 0;
w = 282.9404 + 0.0000470935 * j;
a=1;
e= 0.016709 - 0.000000001151 * j;
M = (356.047 + 0.9856002585 * j);
M = mod(M, 360);
L = mod(w + M, 360);
E = M+(180/PI)*e*sin(M/(180/PI))*(1+e*cos(M/(180/PI)));           
xv= cos(E/(180/PI))-e;
yv=sqrt(1-e*e) *sin(E/(180/PI));
v=atan2(yv/(180/PI),xv/(180/PI))*(180/PI);
r=sqrt((xv*xv)+(yv*yv));                                         
lonsun=v+w;
xs = r * cos(lonsun/(180/PI));
ys = r * sin(lonsun/(180/PI));
//zs = ?
xe = xs;
ye = ys * cos(obl/(180/PI));
ze = ys * sin(obl/(180/PI));                                     

RA = mod(atan2(ye/(180/PI), xe/(180/PI))*180/PI, 360);
Dec= asin( ze/r)*180/PI;                                          

GMST0 = mod(L + 180, 360)/15.0; //unit: hours.
GMST = GMST0 + UTC;
LST  = GMST + Lon/15.0; //unit (hours)
if (LST > 24) LST=LST-24.0;

HA = LST*15.0 - RA; //unit:degrees

x = cos(HA/(180/PI)) * cos(Dec/(180/PI));
y = sin(HA/(180/PI)) * cos(Dec/(180/PI));
z = sin(Dec/(180/PI));

xhor = x * sin(Lat/(180/PI)) - z * cos(Lat/(180/PI));
yhor = y;
zhor = x * cos(Lat/(180/PI)) + z * sin(Lat/(180/PI));

azim  = atan2( yhor/(180/PI), xhor/(180/PI) )*(180/PI) + 180;
alt = asin( zhor)*(180/PI);                             


h = -0.833;
UT_Sun_in_south = mod((RA-GMST0*15.0 - Lon)/15.0 ,24);

LHA =acos((sin(h/(180.0/PI)) - sin(Lat/(180.0/PI))*sin(Dec/(180.0/PI)))/(cos(Lat/(180.0/PI))*cos(Dec/(180.0/PI))))*(180/PI)/15 ;
Rise_time = mod((UT_Sun_in_south-LHA+UTC_offset)/24, 24) * 24;
Set_time = mod((UT_Sun_in_south+LHA+UTC_offset)/24, 24) * 24;
//d0 = 1919.26;
//Apparent_Diameter = 0;
timer2 = micros();


Serial.print("Time to calculate: ");Serial.print(timer2-timer1);Serial.println(" micros");
/*
Serial.print("j = ");Serial.println(j,4);
Serial.print("obl= ");Serial.println(obl,4);
Serial.print("w= ");Serial.println(w,4);
Serial.print("e= ");Serial.println(e,4);
Serial.print("M= ");Serial.println(M,4);
Serial.print("L= ");Serial.println(L,4);
Serial.print("E= ");Serial.println(E,4);
Serial.print("xv= ");Serial.println(xv,4);
Serial.print("yv= ");Serial.println(yv,4);
Serial.print("v= ");Serial.println(v,4);
Serial.print("r= ");Serial.println(r,4);
Serial.print("lonsun= ");Serial.println(lonsun,4);
Serial.print("xs= ");Serial.println(xs,4);
Serial.print("ys= ");Serial.println(ys,4);
Serial.print("xe= ");Serial.println(xe,4);
Serial.print("ye= ");Serial.println(ye,4);
Serial.print("ze= ");Serial.println(ze,4);
Serial.print("RA= ");Serial.println(RA,4);
Serial.print("Dec= ");Serial.println(Dec,4);
Serial.print("GMST0= ");Serial.println(GMST0,4);
Serial.print("GMST= ");Serial.println(GMST,4);
Serial.print("LST= ");Serial.println(LST,4);
Serial.print("Ha= ");Serial.println(HA,4);
Serial.print("x= ");Serial.println(x,4);
Serial.print("y= ");Serial.println(y,4);https://www.google.ca/webhp?hl=en&sa=X&ved=0ahUKEwjwnM_jhujOAhXL7hoKHTNsBIcQPAgD
Serial.print("z= ");Serial.println(z,4);
Serial.print("xhor= ");Serial.println(xhor,4);
Serial.print("yhor= ");Serial.println(yhor,4);
Serial.print("zhor= ");Serial.println(zhor,4);
Serial.print("azim= ");Serial.println(azim,4);
Serial.print("alt= ");Serial.println(alt,4);
Serial.println();
Serial.print("UT_Sun_in_south= ");Serial.println(UT_Sun_in_south,4);
Serial.print("LHA= ");Serial.println(LHA,4);
*/
Serial.print("Rise_time= ");Serial.print(Rise_time,0);Serial.print(":");Serial.print(mod(floor(Rise_time*60),60),0);Serial.print(":");Serial.println(mod(Rise_time*3600,60),0);
Serial.print("Set_time= ");Serial.print(Set_time,0);Serial.print(":");Serial.print(mod(Set_time*60.0,60),0);Serial.print(":");Serial.print(mod(Set_time*3600,60),0);
}

// Intentionally empty: this sketch has no repeating work.
void loop() {}
/**
 * Wraps `dividend` into the half-open range [0, divisor).
 *
 * Works for negative dividends (the result is still non-negative) and runs in
 * constant time regardless of how far `dividend` is from the range.
 *
 * @param dividend value to wrap (e.g. an angle in degrees or a time in hours)
 * @param divisor  period of the range; expected to be positive
 * @return the wrapped value; an exact multiple of `divisor` yields 0.
 *         If `divisor` is zero or negative there is no valid range, so
 *         `dividend` is returned unchanged instead of looping forever.
 */
double mod (double dividend, int divisor){
  if (divisor <= 0) return dividend;

  // fmod() keeps the sign of the dividend, so its result lies in (-divisor, divisor).
  double wrapped = fmod(dividend, static_cast<double>(divisor));
  if (wrapped <= 0.0) {
    wrapped += divisor;
    // Covers both zero (including -0.0) and tiny negatives whose sum rounds
    // up to exactly `divisor`: the upper bound must stay excluded.
    if (wrapped >= divisor) wrapped = 0.0;
  }
  return wrapped;
}

Output:

Time to calculate: 6276 micros
Rise_time= 6:20:42
Set_time= 20:24:28

westfw · August 30, 2016, 11:15pm

When you compiled on the ARM platforms without using "double", did you also convert all the trig and sqrt function calls to their non-double forms (sinf(), sqrtf(), etc ?) If not, then much of the calculations you're doing are done with doubles anyway. (also, there's the "-fsingle-precision-constant" compiler option that should probably be used.)
(huh. How come there isn't a c++ library thing that overloads these to do the proper thing with whichever argument was provided? Rhetorical question )
The CM0, last I looked (gcc 4.8.x?), was the only processor of the three that has unoptimized floating point code. ARM CM3 has an ARM assembly float/double library. AVR has a highly-optimized assembly float library. ARM CM0 has the default gcc float library, written in C and not particularly optimized for any instruction set.
CM0 doesn't have a hardware divide instruction, and has a somewhat limited multiply instruction (compared to CM3), so it's not clear that it has much of a performance edge over AVR for calculating floating point, in an operation-by-operation comparison.

See also Qfplib: a family of floating-point libraries for ARM Cortex-M cores

John_S · August 31, 2016, 2:34am

Hmm, thanks for your insight. It appears that the Zero optimization is not equal to the AVR or Due level. That's unfortunate because it appears to have a lot of potential for number-crunching (32 bit, 48 MHz, etc.).

After running some more tests, the Zero calculates the problem in 3631 microseconds (using sinf, etc.), and the Due comes in at 1088 microseconds. I suppose that for the project I'm thinking about, an extra millisecond or two for these calculations isn't going to matter in the end.

One more question, how do I add the "-fsingle-precision-constant" to the arduino compiler? What exactly does that option do?

Jpreisler · August 31, 2016, 12:31pm

westfw:
See also Qfplib: a family of floating-point libraries for ARM Cortex-M cores

how would i use this in an arduino project?

John_S · August 31, 2016, 6:48pm

Jpreisler:
how would i use this in an arduino project?

I was also wondering about that too. It's a library, but not an arduino library. For my project, timing isn't critical, as it would only need to run these calculations once per day, but for some heavy number-crunching, a fast library would be super beneficial.

AloyseTech · August 31, 2016, 7:09pm

Try to create folders named qfplib and qfpio respectively in the Arduino/libraries folder. Put the qfpio source file in the qfpio folder and do the same for the qfplib. In your Arduino program, include qfplib.h and qfpio.h .

EDIT : Doesn't seem to work. I'll investigate.

AloyseTech · September 1, 2016, 4:17pm

I've been able to run the qfplib correctly. Simply use the .S extension (uppercase) instead of .s for assembly files.

Here is a benchmark :
Code :

//needed to avoid loop optimization
#pragma GCC optimize ("-O0")

#include <qfpio.h>
#include <qfplib.h>

// Number of iterations executed by every timed loop.
#define LOOP_COUNT 100000
// micros() stamps shared by the test functions: timer1 is taken just before
// a timed loop, timer2 just after it.
uint32_t timer1 = 0;
uint32_t timer2 = 0;

void setup() {
  // put your setup code here, to run once:
  
  while (!SerialUSB);
  testAddition();
  testSubtraction();
  testMultiplication();
  testDivision();

}

// Intentionally empty: every benchmark runs once from setup().
void loop() {}

void testAddition()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Addition :");
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fadd(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.");

  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a + b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.\n");
}

void testSubtraction()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Subtraction :");
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fsub(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.");

  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a - b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.\n");
}

void testMultiplication()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Multiplication :");
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fmul(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.");

  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a * b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.\n");
}

void testDivision()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Division :");
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fdiv(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.");

  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a / b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);
  SerialUSB.println(" us.\n");
}

Result:

Addition :
qfplib : 389331 us.
gcc : 347495 us.

Subtraction :
qfplib : 420774 us.
gcc : 316102 us.

Multiplication :
qfplib : 433350 us.
gcc : 468917 us.

Division :
qfplib : 757818 us.
gcc : 1076084 us.

westfw · September 1, 2016, 8:55pm

Note that qfplib's main attraction was supposed to be that it's much SMALLER than the gcc libraries (1k!!)
It also turns out that the trig functions are significantly faster, even though the primitive operations are approximately the same speed...

I did some work with getting qfplib to work transparently with gcc some time ago (mostly for SAMD10), and I was planning on writing up some instructions for this (and arduino.) But it doesn't look like I actually did anything

Short answer:

Make sure that there are aliases so that gcc can call the function names that it expects (I think this was actually merged into qfplib)
replace "-lm" with "-lqfplib" in the link step
add the gcc switch mentioned above, which I think prevents gcc from using double math when doing calculations involving floating point constants (ie "float f = 3.14159 * sinf(theta);" would normally invoke a double precision multiplication, even though both arguments are only floats.)

I'll try to take another look if I ever finish with all the "summer commitments" in my non-techie pile of things to do.

Jpreisler · September 2, 2016, 2:08am

Thanks, guys! I got it working over here.

Topic		Replies	Views
Zero Floating Point Benchmark vs. Duo Zero	20	9381	May 6, 2021
why are fp arithmetics so slow on a M0/Zero, compared to AVR or M3 Due? 3rd Party Boards	27	5112	May 6, 2021
Arduino Zero running slower than my Arduino micro! 3rd Party Boards	3	901	May 6, 2021
How run FP64lib on Arduino ZERO? Programming	20	139	April 14, 2025
Arduino Due Benchmark - Newton Approximation for Pi Due	10	23026	May 6, 2021

Why is the Zero slower than the Uno?

Related topics