This is a continuation of my experiments in improving the performance of printing floats in the Print class by means of the fast divmod10() algorithm and some additional improvements. This discussion started in the divmod10() thread, but the topic is broader, which is why I started a new thread.
Some of printFloat() related posts: (timings refer to the test script)
- divmod10() : a fast replacement for /10 and %10 (unsigned) - #58 by robtillaart - Libraries - Arduino Forum - initial test (2144 uSec for reference Print.cpp, 1472 uSec for improved version)
- divmod10() : a fast replacement for /10 and %10 (unsigned) - #77 by robtillaart - Libraries - Arduino Forum - improved performance of remainder (1136 uSec)
- divmod10() : a fast replacement for /10 and %10 (unsigned) - #100 by robtillaart - Libraries - Arduino Forum - new remainder code, (1104 uSec, followed by a faster '.' => 1072uSec)
- divmod10() : a fast replacement for /10 and %10 (unsigned) - #104 by robtillaart - Libraries - Arduino Forum - used Stimmer's ASM divmod10_asm (1008 uSec)
I have been adapting the printFloat() function for some time now, not only speeding up performance but also adding support for Engineering and Scientific notation. The latter is a breaking change in the interface, which is discussed here - Proposed update for the printFloat code of print.cpp - Libraries - Arduino Forum
The current version of printFloat takes 720 uSec for the test, almost 3x as fast as the original print.cpp [IDE 1.0.4]
This is mainly due to new remainder + rounding code.
Test code (definitely not complete), with a small check to see whether 0..6 digits work OK:
// File-scope state for the timing test.
// `start` is not referenced anywhere in the visible sketch.
unsigned long start = 0;
// Receives the Timer1 count after the timed prints in setup(), then is scaled to microseconds there.
unsigned long stop = 0;
// `q` is not referenced anywhere in the visible sketch.
volatile unsigned long q;
// Runs the whole test once:
//   1. prints three values with 6 down to 0 decimals as a rounding sanity check,
//   2. times three float println() calls using the Timer1 counter while the
//      Timer0 interrupts are masked,
//   3. reports the elapsed time and the average time per output character.
void setup()
{
Serial.begin(115200);
Serial.println("testing");
// Rounding check: the same three values at 6, 5, 4, 3, 2, 1 and 0 decimals.
Serial.println(1.5, 6);
Serial.println(1.49999, 6);
Serial.println(1.9999, 6);
Serial.println(1.5, 5);
Serial.println(1.49999, 5);
Serial.println(1.9999, 5);
Serial.println(1.5, 4);
Serial.println(1.49999, 4);
Serial.println(1.9999, 4);
Serial.println(1.5, 3);
Serial.println(1.49999, 3);
Serial.println(1.9999, 3);
Serial.println(1.5, 2);
Serial.println(1.49999, 2);
Serial.println(1.9999, 2);
Serial.println(1.5, 1);
Serial.println(1.49999, 1);
Serial.println(1.9999, 1);
Serial.println(1.5, 0);
Serial.println(1.49999, 0);
Serial.println(1.9999, 0);
Serial.println();
// Remember the Timer0 interrupt mask so it can be restored after the timed section.
byte backup = TIMSK0;
// Configure Timer1 as the stopwatch; per the comment at the scaling step below
// this selects a 1:256 prescaler.
TCCR1A = 0;
TCCR1B = 4;
// Enable the Timer1 overflow interrupt (handled by ISR(TIMER1_OVF_vect)).
TIMSK1 |= _BV(TOIE1);
Serial.flush(); //wait for serial buffer to clear
TIMSK0 = 0; //disable millis;
// Start of the timed section: zero the Timer1 counter.
TCNT1 = 0;
Serial.println(10737.41824, 4);
Serial.println(1.01819584, 4);
Serial.println(107.37, 2);
// test odd #digits
// Serial.println(10737.41824, 5);
// Serial.println(1.01819584, 3);
// Serial.println(107.37, 1);
// code to check the overhead float vs long ~~ same amount of digits.
// Serial.println(10737);
// Serial.println(1);
// Serial.println(107);
// Serial.println(4182);
// Serial.println(9181); // 0181 would be octal :)
// Serial.println(37);
// End of the timed section.
stop = TCNT1; // elapsed Timer1 ticks since TCNT1 was zeroed
TIMSK0 = backup; // re-enable millis
stop *= 16; // 16 us per timer tick with a 1:256 prescaler (assumes a 16 MHz clock - TODO confirm)
Serial.print("\nTime=");
Serial.println(stop);
Serial.print("per char incl .\\r\\n : ");
// 28 = characters emitted by the three timed println() calls above, counting
// the '.', '\r' and '\n' of each line (12 + 8 + 8); it must be updated
// whenever the timed prints change.
Serial.println(stop/28.0); // depends on test run to be right
Serial.println("done");
// Disable the Timer1 overflow interrupt again.
TIMSK1 &= ~_BV(TOIE1);
}
// Nothing to do repeatedly: the whole test runs once in setup().
void loop()
{
}
// Timer1 overflow handler. setup() enables this interrupt around the timed
// section. The body contains no executable code, so an overflow is not
// recorded or reported anywhere.
ISR(TIMER1_OVF_vect) {
// reentrancy ...
// NOTE(review): the diagnostic print below is commented out, presumably
// because of the reentrancy concern noted above - confirm.
// Serial.println("I Overflowed!"); //just to make sure we can tell if this happens.
}
The output:
testing
...
10737.4179
1.0182
107.37
Time=720 <<<<<<<<<<<<<<<<<<<<<<<<< came from 2144 -> factor 3
per char incl .\r\n : 25.71
done
My current print.h and print.cpp files are attached (warning: they contain experimental code).
The last improvements are
- new strategy (again) for remainder code // original + prev version in comments (several intermediate versions rejected)
- optimized the rounding loop // yes, a lookup table for rounding is faster, ==> ~40 additional bytes
As always, improvements and remarks are welcome.
Print.h (2.88 KB)
Print.cpp (14.7 KB)