divmod10() : a fast replacement for /10 and %10 (unsigned)

For a project I needed both / 10 and %10 several times. Although the straightforward code was fast enough I investigated if it could be even faster. Could be useful e.g. for a library or so. This resulted in the following divmod10() function that calculates both the divide by 10 and the modulo 10 “simultaneously”.

Note 1: the code is written for unsigned 32 bit math. It can not be used for signed numbers. One can derive a 16 bit version by removing one line and making the types 16 bit.

Note 2: tested only on a UNO

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
  // q = in * 0.8;
  uint32_t q = (in >> 1) + (in >> 2);
  q = q + (q >> 4);
  q = q + (q >> 8);
  q = q + (q >> 16);  // not needed for 16 bit version

  // q = q / 8;  ==> q =  in *0.1;
  q = q >> 3;
  
  // determine error
  uint32_t  r = in - ((q << 3) + (q << 1));   // r = in - q*10; 
  div = q + (r > 9);
  if (r > 9) mod = r - 10;
  else mod = r;
}

The code is based upon the divu10() code from the book Hackers Delight1. My insight was that the error formula in divu10() was in fact modulo 10 but not always. Sometimes it was 10 more. I wrote a small test-sketch that is not exhaustive but gives quite some confidence. The performance of the divmod10() is very good, the output of the test-sketch determines timing in microseconds per call.

	i/10		38.6920
	i%10		38.6120
	divmod10	14.9240

Divmod10() is more than twice the speed of /10 or %10. When both are needed the gain is approx a factor 5, or ~60 micros per calculation.

Test-sketch

//
//    FILE: divmod10.ino
//  AUTHOR: Rob Tillaart
//    DATE: 2013-05-20
//
// PUPROSE: fast divide and modulo by 10 in one function
//
// Code based upon book hackers delight
// 

unsigned long start = 0;
unsigned long stop = 0;
volatile unsigned long q;

void setup()
{

  Serial.begin(115200);
  Serial.println("testing divmod10()\n");

  Serial.print("i/10\t");
  start = micros();
  for (unsigned long i = 0; i < 1000; i++)
  {
    q = i/10;
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);

  Serial.print("i%10\t");
  start = micros();
  for (unsigned long i = 0; i < 1000; i++)
  {
    q = i%10;
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");


  Serial.print("divmod10\t");
  start = micros();
  uint32_t m = 0;
  uint32_t d = 0;
  for (unsigned long i = 0; i < 1000; i++)
  {
    divmod10(i, d, m);
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");


  Serial.println("test div 0..65535");
  for (unsigned int i = 0; i< 65535; i++)
  {
    uint32_t a = i/10;
    uint32_t d;
    uint32_t m;
    divmod10(i, d, m);
    if ( abs(a - d) > 0)
    {
      Serial.print(i);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(d);
    }
  } 

  Serial.println("test mod 0..65535");
  for (unsigned int i = 0; i < 65535; i++)
  {
    uint32_t a = i%10;
    uint32_t d;
    uint32_t m;
    divmod10(i, d, m);
    if ( abs(a - m) > 0)
    {
      Serial.print(i);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(m);
    }
  } 

  Serial.println("test div 0..maxlong (20K random samples)");
  for (unsigned int i = 0; i< 20000; i++)
  {
    uint32_t n = random(0xFFFFFFFF);
    uint32_t a = n/10;
    uint32_t d;
    uint32_t m;
    divmod10(n, d, m);
    if ( abs(a - d) > 0)
    {
      Serial.print(n);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(d);
    }
  } 

  Serial.println("test mod 0..maxlong (20K random samples)");
  for (unsigned int i = 0; i< 20000; i++)
  {
    uint32_t n = random(0xFFFFFFFF);
    uint32_t a = n%10;
    uint32_t d;
    uint32_t m;
    divmod10(n, d, m);
    if ( abs(a - m) > 0)
    {
      Serial.print(n);
      Serial.print("\t");
      Serial.print(a);
      Serial.print("\t");
      Serial.println(m);
    }
  } 

  Serial.println("done");
}

void loop()
{
}

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t q = (in >> 1) + (in >> 2);
  q = q + (q >> 4);
  q = q + (q >> 8);
  q = q + (q >> 16);
  q = q >> 3;
  uint32_t  r = in - ((q << 3) + (q << 1)); // r = in - q*10;
  div = q + (r > 9);
  if (r > 9) mod = r - 10;
  else mod = r;
}

As always comments, remarks are welcome.

1 - http://www.amazon.com/Hackers-Delight-Edition-Henry-Warren/dp/0321842685/ -

you keep on surprising me :D

No timing comparison to the div / ldiv functions?

Not thought of div and ldiv. So wrappeed them into the test sketch :

testing divmod10()

i/10 38.6920

i%10 38.6120

ldiv 44.7880

div 15.3960

divmod10 14.9200

test div 0..65535 test mod 0..65535 test div 0..maxlong (20K random samples) test mod 0..maxlong (20K random samples) done

ldiv is very interesting as for almost the same performance one gets / & %

div is 16 bit and signed, where the rest is 32 bit (unsigned), still interesting numbers. Should make a 16 bit test version.

snippet to add

  Serial.print("ldiv\t");
  start = micros();
  ldiv_t ld;
  volatile long x;
  for (unsigned long i = 0; i < 1000; i++)
  {
    ld = ldiv(i,10);
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");
  x = ld.quot;


  Serial.print("div\t");
  start = micros();
  div_t id;
  for (unsigned long i = 0; i < 1000; i++)
  {
    id = div(i,10);
  }
  stop = micros();
  Serial.println((stop - start)/1000.0, 4);
  Serial.println("--------------");
  x = id.quot;

Hi robtillaart,

I wondered why the correction term (and therefore if-then-else) is needed. (This slows down performance.) I tried to find a precise approximation and came up with this:

// for in between 0x0 .. 0x7fffffff
void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t x=in>>1; // div = in/10 <==> div = ((in/2)/5) 
  uint32_t q=x;
  q = x + (q >> 8);
  q = x + (q >> 8);
  x++;
  q = x + (q >> 8);
  q = x + (q >> 8);
  q = 3*q;
  div = (q + (q >> 4)) >> 4;
  mod = in - ((div << 3) + (div << 1));
}

On my Duemilanove it's faster then the divu10() code from the book Hackers Delight you gave. Maybe because no if-the-else is needed or shift by 8 is good for the optimizer. But I have no insight into the resulting assembler code.

Its a somewhat "weird" solution, again.

added your version to the test sketch :

i/10 38.6920

i%10 38.6120

ldiv 44.7880

div 15.3960

divmod10 14.9200 ( )

divmod10 2 13.6040 ( in/2/5 version)

That is 10% faster. Yes! almost 3x faster than the native one your comments states it works for 0..0x7FFFFFFF that is half of the possible numbers. My 20 K random test show it seems to work also above it (akthough 20K samples is no proof)

  • can it be faster?

inspired by 10 = 2 x 5 as you did I found this one

void divmod10_2(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t x = in>>1;
  uint32_t q = x;
  q = x + (q >> 8);
  q = x + (q >> 8);
  x++;
  q = x + (q >> 8);
  q = x + (q >> 8);
  q = 3*q;
  div = (q + (q >> 4)) >> 4;
  mod = in - (((div << 2) + div)<<1);  // <<<<<<<<<<< this line changed, in total one shift less.
}

it resulted in : 13.1600 another slice off

Yes, constrain 0..0x7FFFFFFF was needed because it only works in this interval.

BTW: Your test with random() is useless for testing in 0..0xFFFFFFFF because it produces only values between 0..RANDOM_MAX (with RANDOM_MAX = 0x7FFFFFFF). ( See http://www.nongnu.org/avr-libc/user-manual/group__avr__stdlib.html )

"can it be faster?" Yes, it can!

I took the challenge and found a solution which is the fastest (so far on my atmega328p) and finally works on the complete interval 0..0xFFFFFFFF.

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
    uint32_t x=(in>>1); // div = in/10 <==> div = ((in/2)/5)
    uint32_t q=x;

    q= (q>>8) + x;
    q= (q>>8) + x;
    q= (q>>8) + x;
    q= (q>>8) + x + 1;

    x= q;
    q= (q>>1) + x;
    q= (q>>3) + x;
    q= (q>>1) + x;

    div = (q >> 3);
    mod = in - (((div << 2) + div) << 1);
}

divmod10 ==> 12.6600

impressive!

Make them q and x register variables, it should go even faster. :grin:

tried adding register to q and x, but no success,

Think the compiler either already optimizes the vars as they are not volatile, or the compiler ignores the keyword.

I was curious so I had a quick browse.

According to the C++ standard,

D.4 register keyword [depr.register]
1 The use of the register keyword as a storage-class-specifier (7.1.1) is deprecated.

And 7.1.1:

The register speci?er shall be applied only to names of variables declared in a block (6.3) or to function parameters (8.4). It speci?es that the named variable has automatic storage duration (3.7.3). A variable declared without a storage-class-speci?er at block scope or declared as a function parameter has automatic storage duration by default.

So to summarize, register has the same semantics as auto. Local variables are auto by default, therefore register is completely redundant in C++.

In C, it simply stops you from taking the address as this isn’t allowed, whereas if done in C++, the register semantics are simply ignored.

Nice to know that G++ does this. Not all compilers to though.
It does not mean that the code is incorrect in any case.
I guess it depends on the compiler and optimizer.

Cool hack none-the-less.

Nice to know that G++ does this. Not all compilers to though.

Thats not GCC documentation, its the ISO C++99 standard. The same goes for C++11. Optimizing compilers are able to pick register candidates far better than us. Even then 'register', like 'inline' is only a hint, and can be ignored.

Sorry for this. I don't want to abuse this thread for progress reports on senseless optimization. But, please excuse me ... I'm in a kind of flow.

Another ~1.5% off:

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
    uint32_t x= (in|1) - (in>>2); // div = in/10 <~~> div = (0.75*in) >> 3
    uint32_t q= (x>>4) + x;
    x= q;
    q= (q>>8) + x;
    q= (q>>8) + x;
    q= (q>>8) + x;
    q= (q>>8) + x;

    div = (q >> 3);
    mod = in - (((div << 2) + div) << 1);
}

genom2: Sorry for this. I don't want to abuse this thread for progress reports on senseless optimization. But, please excuse me ... I'm in a kind of flow.

I don't think you're in danger of that, I'm sure robtillaart will enjoy testing your approach. Its not senseless either, someone one day will need to squeeze in a few extra repetitions that the standard method can't do in time.

Well done genom2!!

On my "reference UNO" it clocks 12.4760

I looked at the code and I could remove 2 lines and it still passes the random test..UPdate : false positve see below

timing becomes a staggering 11.4680

void divmod10_4(uint32_t in, uint32_t &div, uint32_t &mod)
{
  uint32_t x= (in|1) - (in>>2); // div = in/10 <~~> div = (0.75*in) >> 3
  uint32_t q= (x>>4) + x;  // 0.796875 *in
  x= q;
  q= (q>>8) + x;     // 0.799987793 * in
  q= (q>>8) + x;     // 0.803112745 * in
  //q= (q>>8) + x;
  //q= (q>>8) + x;

  div = (q >> 3);
  mod = in - (((div << 2) + div) << 1);
}

did a extended 1000K test divmod10 11.4680 -------------- test div 0..65535 test mod 0..65535 test divmod 0..maxlong (1000K random samples) done

No, you can't remove the 2 lines! As you said, a random test is no proof!

Take this number 7031721 as a counter example: 7031721/10 expected 703172 but your modification returns 703171

But an inconspicuous modification on calculating mod (in the last line) does the trick and another ~8% off:

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{
    uint32_t x= (in|1) - (in>>2); // div = in/10 <~~> div = 0.75*in/8
    uint32_t q= (x>>4) + x;
    x= q;
    q= (q>>8) + x;
    q= (q>>8) + x;
    q= (q>>8) + x;
        q= (q>>8) + x;

    div = (q >> 3);
    mod = in - ((q & ~0x7) + (div << 1));
}

Great counter example! and a very smart way to mod --> 11.7200 impressive !

I don’t have that exact Arduino model, but I’m curious – if rotates happen one bit at a time, and take 2 instructions per bit, or 16 operations to rotate, perhaps a copy is faster? What do you get with this? :grin:

void divmod10(uint32_t in, uint32_t &div, uint32_t &mod)
{

        union {
                uint32_t q;
                uint8_t qq[4];
        };

        union {
                uint32_t r;
                uint8_t rr[4];
        };

        r = 0;
        uint32_t x = (in|1) - (in >> 2); // div = in/10 <~~> div = (0.75*in) >> 3
        q = (x >> 4) + x;
        x = q;
        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r +x

        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r + x

        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r + x

        rr[0] = qq[1];
        rr[1] = qq[2];
        rr[2] = qq[3];
        q = r + x


        div = (q >> 3);
        mod = in - (((div << 2) + div) << 1);
}