divmod10() : a fast replacement for /10 and %10 (unsigned)

robtillaart · May 20, 2013, 6:39pm

For a project I needed both / 10 and %10 several times. Although the straightforward code was fast enough I investigated if it could be even faster. Could be useful e.g. for a library or so. This resulted in the following divmod10() function that calculates both the divide by 10 and the modulo 10 "simultaneously".

Note 1: the code is written for unsigned 32 bit math. It can not be used for signed numbers. One can derive a 16 bit version by removing one line and making the types 16 bit.

Note 2: tested only on a UNO

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
  // q = in * 0.8;
  uint32_t q = (in >> 1) + (in >> 2);
  q = q + (q >> 4);
  q = q + (q >> 8);
  q = q + (q >> 16);  // not needed for 16 bit version

  // q = q / 8;  ==> q =  in *0.1;
  q = q >> 3;
  
  // determine error
  uint32_t  r = in - ((q << 3) + (q << 1));   // r = in - q*10; 
  div = q + (r > 9);
  if (r > 9) mod = r - 10;
  else mod = r;
}

The code is based upon the divu10() code from the book Hackers Delight¹. My insight was that the error formula in divu10() was in fact modulo 10 but not always. Sometimes it was 10 more. I wrote a small test-sketch that is not exhaustive but gives quite some confidence. The performance of the divmod10() is very good, the output of the test-sketch determines timing in microseconds per call.

	i/10		38.6920
	i%10		38.6120
	divmod10	14.9240

Divmod10() is more than twice the speed of /10 or %10. When both are needed the gain is approx a factor 5, or ~60 micros per calculation.

Test-sketch

//
//    FILE: divmod10.ino
//  AUTHOR: Rob Tillaart
//    DATE: 2013-05-20
//
// PUPROSE: fast divide and modulo by 10 in one function
//
// Code based upon book hackers delight
// 

unsigned long start = 0;
unsigned long stop = 0;
volatile unsigned long q;

void setup()
{

  Serial.begin(115200);
  Serial.println("testing divmod10()\n");

  Serial.print("i/10\t");
  start = micros();
  for (unsigned long i = 0; i < 1000; i++)
  {
    q = i/10;
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);

  Serial.print("i%10\t");
  start = micros();
  for (unsigned long i = 0; i < 1000; i++)
  {
    q = i%10;
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");


  Serial.print("divmod10\t");
  start = micros();
  uint32_t m = 0;
  uint32_t d = 0;
  for (unsigned long i = 0; i < 1000; i++)
  {
    divmod10(i, d, m);
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");


  Serial.println("test div 0..65535");
  for (unsigned int i = 0; i< 65535; i++)
  {
    uint32_t a = i/10;
    uint32_t d;
    uint32_t m;
    divmod10(i, d, m);
    if ( abs(a - d) > 0)
    {
      Serial.print(i);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(d);
    }
  } 

  Serial.println("test mod 0..65535");
  for (unsigned int i = 0; i < 65535; i++)
  {
    uint32_t a = i%10;
    uint32_t d;
    uint32_t m;
    divmod10(i, d, m);
    if ( abs(a - m) > 0)
    {
      Serial.print(i);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(m);
    }
  } 

  Serial.println("test div 0..maxlong (20K random samples)");
  for (unsigned int i = 0; i< 20000; i++)
  {
    uint32_t n = random(0xFFFFFFFF);
    uint32_t a = n/10;
    uint32_t d;
    uint32_t m;
    divmod10(n, d, m);
    if ( abs(a - d) > 0)
    {
      Serial.print(n);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(d);
    }
  } 

  Serial.println("test mod 0..maxlong (20K random samples)");
  for (unsigned int i = 0; i< 20000; i++)
  {
    uint32_t n = random(0xFFFFFFFF);
    uint32_t a = n%10;
    uint32_t d;
    uint32_t m;
    divmod10(n, d, m);
    if ( abs(a - m) > 0)
    {
      Serial.print(n);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(m);
    }
  } 

  Serial.println("done");
}

void loop()
{
}

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t q = (in >> 1) + (in >> 2);
  q = q + (q >> 4);
  q = q + (q >> 8);
  q = q + (q >> 16);
  q = q >> 3;
  uint32_t  r = in - ((q << 3) + (q << 1)); // r = in - q*10;
  div = q + (r > 9);
  if (r > 9) mod = r - 10;
  else mod = r;
}

As always comments, remarks are welcome.

¹ - http://www.amazon.com/Hackers-Delight-Edition-Henry-Warren/dp/0321842685/ -

Jantje · May 20, 2013, 7:22pm

you keep on surprising me

Coding_Badly · May 21, 2013, 1:36am

No timing comparison to the div / ldiv functions?

robtillaart · May 21, 2013, 9:05pm

Not thought of div and ldiv. So wrappeed them into the test sketch :

testing divmod10()

i/10 38.6920
i%10 38.6120

ldiv 44.7880

div 15.3960

divmod10 14.9200

test div 0..65535
test mod 0..65535
test div 0..maxlong (20K random samples)
test mod 0..maxlong (20K random samples)
done

ldiv is very interesting as for almost the same performance one gets / & %

div is 16 bit and signed, where the rest is 32 bit (unsigned), still interesting numbers.
Should make a 16 bit test version.

snippet to add

  Serial.print("ldiv\t");
  start = micros();
  ldiv_t ld;
  volatile long x;
  for (unsigned long i = 0; i < 1000; i++)
  {
    ld = ldiv(i,10);
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");
  x = ld.quot;


  Serial.print("div\t");
  start = micros();
  div_t id;
  for (unsigned long i = 0; i < 1000; i++)
  {
    id = div(i,10);
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");
  x = id.quot;

genom2 · May 25, 2013, 11:33am

Hi robtillaart,

I wondered why the correction term (and therefore if-then-else) is needed. (This slows down performance.) I tried to find a precise approximation and came up with this:

// for in between 0x0 .. 0x7fffffff
void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t x=in>>1; // div = in/10 <==> div = ((in/2)/5) 
  uint32_t q=x;
  q = x + (q >> 8);
  q = x + (q >> 8);
  x++;
  q = x + (q >> 8);
  q = x + (q >> 8);
  q = 3*q;
  div = (q + (q >> 4)) >> 4;
  mod = in - ((div << 3) + (div << 1));
}

On my Duemilanove it's faster then the divu10() code from the book Hackers Delight you gave. Maybe because no if-the-else is needed or shift by 8 is good for the optimizer. But I have no insight into the resulting assembler code.

Its a somewhat "weird" solution, again.

robtillaart · May 25, 2013, 1:28pm

added your version to the test sketch :

i/10 38.6920
i%10 38.6120

ldiv 44.7880

div 15.3960

divmod10 14.9200 ( )

divmod10 2 13.6040 ( in/2/5 version)

That is 10% faster. Yes! almost 3x faster than the native one
your comments states it works for 0..0x7FFFFFFF that is half of the possible numbers.
My 20 K random test show it seems to work also above it (akthough 20K samples is no proof)

can it be faster?

robtillaart · May 25, 2013, 2:22pm

inspired by 10 = 2 x 5 as you did I found this one

void divmod10_2(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t x = in>>1;
  uint32_t q = x;
  q = x + (q >> 8);
  q = x + (q >> 8);
  x++;
  q = x + (q >> 8);
  q = x + (q >> 8);
  q = 3*q;
  div = (q + (q >> 4)) >> 4;
  mod = in - (((div << 2) + div)<<1);  // <<<<<<<<<<< this line changed, in total one shift less.
}

it resulted in : 13.1600 another slice off

genom2 · June 1, 2013, 11:44am

Yes, constrain 0..0x7FFFFFFF was needed because it only works in this interval.

BTW: Your test with random() is useless for testing in 0..0xFFFFFFFF because it produces only values between 0..RANDOM_MAX (with RANDOM_MAX = 0x7FFFFFFF). ( See avr-libc: <stdlib.h>: General utilities )

"can it be faster?"
Yes, it can!

I took the challenge and found a solution which is the fastest (so far on my atmega328p) and finally works on the complete interval 0..0xFFFFFFFF.

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
	uint32_t x=(in>>1); // div = in/10 <==> div = ((in/2)/5)
	uint32_t q=x;

	q= (q>>8) + x;
	q= (q>>8) + x;
	q= (q>>8) + x;
	q= (q>>8) + x + 1;

	x= q;
	q= (q>>1) + x;
	q= (q>>3) + x;
	q= (q>>1) + x;

	div = (q >> 3);
	mod = in - (((div << 2) + div) << 1);
}

robtillaart · June 1, 2013, 6:16pm

divmod10 ==> 12.6600

impressive!

ajk · June 6, 2013, 8:09pm

Make them q and x register variables, it should go even faster.

robtillaart · June 6, 2013, 8:24pm

tried adding register to q and x, but no success,

Think the compiler either already optimizes the vars as they are not volatile, or the compiler ignores the keyword.

pYro_65 · June 7, 2013, 2:59pm

I was curious so I had a quick browse.

According to the C++ standard,

D.4 register keyword [depr.register]
1 The use of the register keyword as a storage-class-specifier (7.1.1) is deprecated.

And 7.1.1:

The register speci?er shall be applied only to names of variables declared in a block (6.3) or to function parameters (8.4). It speci?es that the named variable has automatic storage duration (3.7.3). A variable declared without a storage-class-speci?er at block scope or declared as a function parameter has automatic storage duration by default.

So to summarize, register has the same semantics as auto. Local variables are auto by default, therefore register is completely redundant in C++.

In C, it simply stops you from taking the address as this isn't allowed, whereas if done in C++, the register semantics are simply ignored.

ajk · June 8, 2013, 12:12am

Nice to know that G++ does this. Not all compilers to though.
It does not mean that the code is incorrect in any case.
I guess it depends on the compiler and optimizer.

Cool hack none-the-less.

pYro_65 · June 8, 2013, 12:37am

Nice to know that G++ does this. Not all compilers to though.

Thats not GCC documentation, its the ISO C++99 standard. The same goes for C++11.
Optimizing compilers are able to pick register candidates far better than us. Even then 'register', like 'inline' is only a hint, and can be ignored.

genom2 · June 10, 2013, 9:46pm

Sorry for this. I don't want to abuse this thread for progress reports on senseless optimization. But, please excuse me ... I'm in a kind of flow.

Another ~1.5% off:

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
	uint32_t x= (in|1) - (in>>2); // div = in/10 <~~> div = (0.75*in) >> 3
	uint32_t q= (x>>4) + x;
	x= q;
	q= (q>>8) + x;
	q= (q>>8) + x;
	q= (q>>8) + x;
	q= (q>>8) + x;

	div = (q >> 3);
	mod = in - (((div << 2) + div) << 1);
}

pYro_65 · June 11, 2013, 1:19am

genom2:
Sorry for this. I don't want to abuse this thread for progress reports on senseless optimization. But, please excuse me ... I'm in a kind of flow.

I don't think you're in danger of that, I'm sure robtillaart will enjoy testing your approach. Its not senseless either, someone one day will need to squeeze in a few extra repetitions that the standard method can't do in time.

robtillaart · June 11, 2013, 3:52pm

Well done genom2!!

On my "reference UNO" it clocks 12.4760

~~I looked at the code and I could remove 2 lines and it still passes the random test..~~UPdate : false positve see below

timing becomes a staggering 11.4680

void divmod10_4(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t x= (in|1) - (in>>2); // div = in/10 <~~> div = (0.75*in) >> 3
  uint32_t q= (x>>4) + x;  // 0.796875 *in
  x= q;
  q= (q>>8) + x;     // 0.799987793 * in
  q= (q>>8) + x;     // 0.803112745 * in
  //q= (q>>8) + x;
  //q= (q>>8) + x;

  div = (q >> 3);
  mod = in - (((div << 2) + div) << 1);
}

~~did a extended 1000K test~~
~~divmod10 11.4680~~
~~--------------~~
~~test div 0..65535~~
~~test mod 0..65535~~
~~test divmod 0..maxlong (1000K random samples)~~
~~done~~

genom2 · June 11, 2013, 7:49pm

No, you can't remove the 2 lines! As you said, a random test is no proof!

Take this number 7031721 as a counter example:
7031721/10 expected 703172 but your modification returns 703171

But an inconspicuous modification on calculating mod (in the last line) does the trick and another ~8% off:

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
	uint32_t x= (in|1) - (in>>2); // div = in/10 <~~> div = 0.75*in/8
	uint32_t q= (x>>4) + x;
	x= q;
	q= (q>>8) + x;
	q= (q>>8) + x;
	q= (q>>8) + x;
        q= (q>>8) + x;

	div = (q >> 3);
	mod = in - ((q & ~0x7) + (div << 1));
}

robtillaart · June 11, 2013, 8:29pm

Great counter example!
and a very smart way to mod --> 11.7200 impressive !

ajk · June 12, 2013, 5:19am

I don't have that exact Arduino model, but I'm curious -- if rotates happen one bit at a time, and take 2 instructions per bit, or 16 operations to rotate, perhaps a copy is faster? What do you get with this?

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{

        union {
                uint32_t q;
                uint8_t qq[4];
        };

        union {
                uint32_t r;
                uint8_t rr[4];
        };

        r = 0;
        uint32_t x = (in|1) - (in >> 2); // div = in/10 <~~> div = (0.75*in) >> 3
        q = (x >> 4) + x;
        x = q;
        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r +x

        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r + x

        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r + x

        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r + x


        div = (q >> 3);
        mod = in - (((div << 2) + div) << 1);
}

Topic		Replies	Views
divu5() and divu3(). Fast replacements for unsigned division by 3, 5, 6, 10, 15 Libraries	17	16385	May 6, 2021
Really fast divmod10 for 64-bit integers Libraries	1	2417	May 6, 2021
faster printing of floats by divmod10() and others Libraries	75	54229	May 6, 2021
Fast modulo for unsigned 32 bit. - prime sieve 2 demo Libraries	1	1348	May 6, 2021
Speeding up division Programming	12	4642	May 5, 2021

divmod10() : a fast replacement for /10 and %10 (unsigned)

i/10 38.6920 i%10 38.6120

ldiv 44.7880

div 15.3960

divmod10 14.9200

i/10 38.6920 i%10 38.6120

ldiv 44.7880

div 15.3960

divmod10 14.9200 ( )

divmod10 2 13.6040 ( in/2/5 version)

Related topics

i/10 38.6920
i%10 38.6120

i/10 38.6920
i%10 38.6120