Why is the Zero slower than the Uno?

I've been able to get qfplib running correctly: simply give the assembly files the .S extension (uppercase) instead of .s.

Here is a benchmark:
Code:

// Compile without optimization so the benchmark loops below are not optimized away.
#pragma GCC optimize ("-O0")

#include <qfpio.h>
#include <qfplib.h>

#define LOOP_COUNT 100000
uint32_t timer1 = 0, timer2 = 0;

void setup() {
  // put your setup code here, to run once:
  
  while (!SerialUSB);
  testAddition();
  testSubtraction();
  testMultiplication();
  testDivision();

}

void loop() {
  // Intentionally empty: the benchmarks are run a single time from setup().
}

// Benchmarks single-precision addition.
// Times LOOP_COUNT calls to qfp_fadd(a, b), then LOOP_COUNT evaluations of the
// built-in `a + b` on the same operands, and prints each elapsed time over
// SerialUSB. The file-level globals timer1/timer2 hold the micros() readings
// taken immediately before and after each loop.
// NOTE: r is only written, never read; the sketch relies on its -O0 pragma to
// keep these loops from being optimized away.
void testAddition()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Addition :");

  // --- qfplib implementation ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fadd(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.");

  // --- compiler-provided float addition (labelled "gcc" in the output) ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a + b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.\n");
}

// Benchmarks single-precision subtraction.
// Times LOOP_COUNT calls to qfp_fsub(a, b), then LOOP_COUNT evaluations of the
// built-in `a - b` on the same operands, and prints each elapsed time over
// SerialUSB. The file-level globals timer1/timer2 hold the micros() readings
// taken immediately before and after each loop.
// NOTE: r is only written, never read; the sketch relies on its -O0 pragma to
// keep these loops from being optimized away.
void testSubtraction()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Subtraction :");

  // --- qfplib implementation ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fsub(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.");

  // --- compiler-provided float subtraction (labelled "gcc" in the output) ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a - b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.\n");
}

// Benchmarks single-precision multiplication.
// Times LOOP_COUNT calls to qfp_fmul(a, b), then LOOP_COUNT evaluations of the
// built-in `a * b` on the same operands, and prints each elapsed time over
// SerialUSB. The file-level globals timer1/timer2 hold the micros() readings
// taken immediately before and after each loop.
// NOTE: r is only written, never read; the sketch relies on its -O0 pragma to
// keep these loops from being optimized away.
void testMultiplication()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Multiplication :");

  // --- qfplib implementation ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fmul(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.");

  // --- compiler-provided float multiplication (labelled "gcc" in the output) ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a * b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.\n");
}

// Benchmarks single-precision division.
// Times LOOP_COUNT calls to qfp_fdiv(a, b), then LOOP_COUNT evaluations of the
// built-in `a / b` on the same operands, and prints each elapsed time over
// SerialUSB. The file-level globals timer1/timer2 hold the micros() readings
// taken immediately before and after each loop.
// NOTE: r is only written, never read; the sketch relies on its -O0 pragma to
// keep these loops from being optimized away.
void testDivision()
{
  float a = 123.987, b = 987.123, r = 0.0;
  SerialUSB.println("Division :");

  // --- qfplib implementation ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = qfp_fdiv(a, b);
  }
  timer2 = micros();
  SerialUSB.print("qfplib : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.");

  // --- compiler-provided float division (labelled "gcc" in the output) ---
  timer1 = micros();
  for (uint32_t i = 0; i < LOOP_COUNT; i++)
  {
    r = a / b;
  }
  timer2 = micros();
  SerialUSB.print("gcc : ");
  SerialUSB.print(timer2-timer1);  // elapsed time for the whole loop, loop overhead included
  SerialUSB.println(" us.\n");
}

Result:

Addition :
qfplib : 389331 us.
gcc : 347495 us.

Subtraction :
qfplib : 420774 us.
gcc : 316102 us.

Multiplication :
qfplib : 433350 us.
gcc : 468917 us.

Division :
qfplib : 757818 us.
gcc : 1076084 us.