Do some faster integer square root function exist?

I have now updated the code in two ways. I use integer instead of long integer in comparisons, which is possible in a significant part of the code. It reduced the time from about 25 us to 22 us. Then I introduced an additional look-up table to narrow the search in square table, so only one or two comparisons was needed with this table. Then the calculation time was reduced to 8 to 13 us. The statistics of error is still the same:

/*
This code is intended to test calculation speed of some aritmetric operations
and perhaps more. It may be done by using a testpin to be measured by oscilloscope

*/

const uint8_t TestOutPin = 13;         // Test pin for signal to oscilloscope
const uint8_t TestOutPin2 = 5;         // Test pin for to test speed
const uint8_t AnalogPotPin = A0;        // Analog input pin that the potentiometer is attached to

const unsigned long int PrintPeriodMs = 3000;
const int Max10BitP1 = 1023 + 1;
const int Max9Bit = 511;
const float Rad60degr = 2 * M_PI / 6;

unsigned long int printTrigMillis = 0;
int i1 = 160;
int i2 = 170;
int i3 = 175;
long int li1 = 180;
long int li2 = 190;
long int li3 = 200;
float r1=0;
float r2=0;

int printCnt = 0;

void setTestPin() {       //Uses testpin 13
  PORTB |= 0B00100000;
  };

void clearTestPin() {     //Uses testpin 13
  PORTB &= 0B11011111;
  };

/*
Table based square root function for integers only. Accuracy within 1 % when input values are above 2500.

Update using integer instead of long integer when possible caused reduction from 22-25 us to 18-22 us.

It is resonable to suggest, that the long int comparisons is the part that recuire time.
The highest input x possible is 3xMax9Bit^2 = 783363. It recuire 4 long comparisons and 3 times long division by 4.
However, it seems this initial correction is quite fast.
Now changed x to to integer when possible, but the binary search is still time consuming

With update using look-up table to narrow binary search the time was reduced from 18-22 us to 8-13 us.
*/
int broendegaard3_isqrt(unsigned long x) {                   // Calculation time is in range 8 to 13 us.

  static const unsigned int sq_table[] = {
3969,                                                       //0  the table is (index+63)^2
4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,          //1 to 10
5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,          //11-20
7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,          //21-30
8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,      //31-40
10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,//41-50
12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,//51-60
15376,15625,15876,16129,16384};                             //61-65
  static const byte lookUp_m[] = {
0,                                   //0
1,2,4,6,8,10,12,13,15,17,            //1-10
18,19,21,23,24,26,27,28,30,31,       //11-20
33,34,35,36,38,39,40,41,43,44,       //21-30
45,46,47,49,50,51,52,53,54,55,       //31-40
56,57,58,59,60,61,62,63    };        //41-48

  byte i=0;
  byte j=0;
  byte m=0;
  byte n=64;
  byte p=32;
  unsigned int sq = 1;
  int sqrr = 1;
  if (x > 16129) {            // if x above table range then divide x by 4 until it is in range.
                              // this initial correction is actually quite fast and require about 2 us.
    do {
    x >>= 2;
    i++;}
    while (x > 16129);
    sq = x;}
  else {
    sq = x;
    if (sq <= 1) {               //return when x= 0 or x = 1.
      return sq; }               
    else {
      while (sq < 3969) {        // if x below table range then increase x by factor 4 until it is in range
        sq <<= 2;
        j++; };
      };
    };

  p= (sq >> 8) - 15;    // uses loop up to get closer start of m, n and p in binary search with square values
  m= lookUp_m[p];
  if (p<=28) {
    n=m+3;}             //somewhat rough estimate. Sometimes n=m+2 could be OK
  else {
    if (p==33) {        //single case of n=m+3 in this range
      n= m+3;}
    else
      n= m+2;
    };
  p=m+1;   
  while (m+1 != n) {          //binary search in table. This search require 14 us time.
    if (sq < sq_table[p]) {   //with update by look-up table it is about 3-6 us.
      n = p;
      p = (p+m) >> 1; }
    else {
      m = p;
      p = (p+n) >> 1; };
    };

  if ((sq - sq_table[p]) > (sq_table[p+1] - sq)) {  // correct to closest value in table
    p++;};
  if (j > 0) {
    sqrr = (p+63) >> j; }         //divide by two each time x was multiplied by 4 at start of function
  else {
    sqrr = (p+63) << i; };        //multiply by two each time x was divided by 4 at start of function
    
  return sqrr;  
  };

void setup() {
  Serial.begin(9600);
  pinMode(TestOutPin, OUTPUT);
  pinMode(TestOutPin2, OUTPUT);
  digitalWrite(TestOutPin,LOW);
  pinMode(AnalogPotPin, INPUT);
  printTrigMillis = millis() + PrintPeriodMs;
}

void loop() {
  // put your main code here, to run repeatedly:
  r1 = random(58, Max9Bit);                 //phase-phase amplitude of BLDC EMF encoder
  r2 = 2*M_PI*random(Max9Bit)/Max9Bit;      //phase angle of V1 of BLDC EMF encoder
  i1 = r1*sin(r2);                          //simulated voltage seen on ADC from V1 of EMF encoder
  i2 = r1*sin(r2 + Rad60degr);              //simulated voltage seen on ADC from V2 of EMF encoder
 
  li3 = (long) i1*(i1-i2) + (long) i2*i2; // Do work with about 6 us calculation time
  setTestPin();
  i3 = broendegaard3_isqrt( li3 );
  clearTestPin();

  delay(1);
  if ((millis() > printTrigMillis) && (printCnt<1000)) {
    printTrigMillis += PrintPeriodMs;
    printCnt++;
    // print the results to the Serial Monitor:
    //Serial.print("r1 = "); Serial.print(r1);
    //Serial.print("\t r2 = "); Serial.print(r2);
    r1=sqrt(li3);
    //Serial.print("\t i1 = "); Serial.print(i1);
    //Serial.print("\t i2 = "); Serial.print(i2);
    //Serial.print("\t li3 = "); Serial.print(li3);
    //Serial.print("\t i3 = "); Serial.print(i3);
    //Serial.print("\t err% = ");
    Serial.print((i3-r1)*100/r1);
    Serial.println();
    };
}

1 Like

But now you are applying indirect strategies to cut the work load and it is nearing fruit?

You tried something out, it works, you keep going, all good!

Are you tabling in PROGMEM? It takes 3 cycles per byte to fetch PROGMEM but that byte can save loads of cycles. I tabled (sines 0 to 90) * 10000 as unsigned int (less typing Arduino type name is word) in PROGMEM. Since floats do powers of ten fast, float s = (float)tableInt / 10000.0; // should run fast. It takes 6 cycles to fetch the mantissa and the divide should be all exponent.

If you haven't used PROGMEM, a Nano or Uno have 32K flash where an 8K sketch is pretty big. Do the same with an ATmega1284P (comes in 40 pin wide DIP, same as the Intel 8088) and there's 128K flash and 16K RAM. The Mega2560 board can run external RAM as internal however the 2560 is the first 8K addresses, Mega2560 can see 64K-8K of the external. Robust Circuits made and still might, 512K plug-in cards set up as 8 64K banks. In default mode the internal RAM becomes 8K of dedicated stack space withj 8 56K heap spaces.

At that point, buy something with an AMD M4F chip, it will be like going from props to jets. My biggest problem with now Teensies is I'd need a breakout to DIP size connections. The board has so many contacts it needs a kind of socket for big-fingered old hobbyists.

Last thing. Why table square roots as signed values? It kills half the range, 65536 to 32768. "Don't be so negative! Ahhhh!".

1 Like

I'm thinking that an inlined binary search:

 int guess = 0;
 for (i=12; i >0; i--) {
   newguess = guess + (1<<i);
   if (newguess * newguess < N) {
     guess += 1<<i;
   }
  }
  // adjust/round ?

is promising, if you unroll the loop, finesse the multiplies/compares, and save a bunch of intermediate results (most requiring asm, though. I might give it a try.) (as shown, the algorithm seems to come up with off-by-1 errors "frequently"; I wonder if that matters?)

The number that you might want the rounded down or up integer square root can be razored down by how many bytes it uses.

A number <= 256 is one of the first 16 squares.
A " " 65536 " " " 256 squares.

The 1st 16 can run at 8 bit level, the next 240 at 16-bit level, and there are 16777216 integers in 24 bits and over 4 billion in 32.

To be sure, the 1st 256 integer squares cover 1 to 65536. The index + 1 is the square root, index 0 to 255.

If you scale the 32-bit values down to 16, you get the same curve with less precision? 16 bits is being always able to show 4 places as 0 to 9.
16 bit values process twice as fast as 32 bit when you run AVRs.

With an array of 256 16 bit squares the sqrt = index+1.

If the value to find the root of is > x0FFF, the root is >= x40 so maybe start there.

On a computer, binary squares are kind of easy! Double the bits so how close can you get a close approx off the high set bit and start high with half as many bits for the sqrt like above then index down and compare to the tabled value until it is =< the entered value, nearest sqrt = index+1.

I could see Occam's Razoring on 2 bit bounds here but it's late.

No, not yet. Thank you for the information. I now studied what PROGMEM is about, and I beforehand guessed that defining a constant did it, but now I learned, that it did not. I saw some information, with a guy that made all figures in table to hex presentation. Is it necessary for the compiler to have it this way?

You are right - working on it :grinning:

use a bit mask in the loop, less shifting

 int guess = 0;
 long mask = 0x8000;
 while (mask * mask > N) mask >>= 1;  //  optional search for first mask < sqrt(N)

 guess = mask;
 mask >>= 1;

 while (mask != 0) {
   newguess = guess + mask;
   if (newguess * newguess < N) {
     guess += mask;
   }
   mask >>= 1;
  }

This part of the code is about defining some pseudo random numbers to supply the square root function. The statement comes from a study on how the voltage from a three phase tacho generator (encoder) will provide two voltages. Look this thread:
https://forum.arduino.cc/t/alternative-to-digital-rotary-encoder/1225470/10

Won't matter, after it's unrolled. That was just a description of the algorithm (which DOES seem to work, BTW. Too low by 1 in essentially half of the cases (8390653 out of 16777216) (grr.))

Here's the whole test code, (with correction at the end) just for grins...
(having to do the extra square in the correction code is ... annoying.)

/*
 * fast integer square root, for input < 2^24
 * Strategy:
 *  SQRT(2^24-1) is ~4096: the result will always be less than 13 bits.
 *  We should be able to do essentially a binary search, but with the test
 *  optimized.
 */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

uint32_t fisqrt(uint32_t n);
#define MAXERROR 0

int main() {
  uint32_t result;
  uint32_t nerrors = 0;
  for (uint32_t n=3; n<16*1024*(uint32_t)1024; n+=1) {
    result = fisqrt(n);
    if (result != (uint32_t)(sqrt(n)+0.5)) {
      nerrors ++;
      int error = (uint32_t)(sqrt(n)+0.5) - result;
      if (error > MAXERROR || error < -MAXERROR) {
        printf("for n=%d, result = %d (error %d) (guess**2 = %d)\n",
             n, result, error, result*result);
      }
    }
  }
  printf("\nTotal errors %d\n", nerrors);
}

uint32_t fisqrt(uint32_t n) {
  uint16_t guess = 0;
  uint32_t newguess;
  uint32_t testsqr;
  for (int i=12; i >= 0; i--) {
    newguess = guess + (1<<i);
    testsqr = newguess * newguess;
    if (testsqr <= n) {
      guess += 1<<i;
    }
  }
  /*
   * guess will be low, about half the time,
   *  for numbers > halfway under the perfect squares 
   * correct for that.
   *  (guess+1)^2 - n < n - guess^2
   *    with a bit of algebra should be
   *  guess < n - guess^2
   */

  if (guess < n - guess*guess) {
    guess++;
  }
  return guess;
}

Thanks. I tried this function code as you see below.
The error on result was very low, but the calculation time was about 68 us on my Arduino Nano V3, and therefore it cannot be used as that in my application. I look for calculation times below 20 us.

/*
This code is intended to test calculation speed of some aritmetric operations
and perhaps more. It may be done by using a testpin to be measured by oscilloscope

*/


const uint8_t TestOutPin = 13;         // Test pin for signal to oscilloscope
const uint8_t TestOutPin2 = 5;         // Test pin for to test speed
const uint8_t AnalogPotPin = A0;        // Analog input pin that the potentiometer is attached to

const unsigned long int PrintPeriodMs = 35;
const int Max10BitP1 = 1023 + 1;
const int Max9Bit = 511;
const float Rad60degr = 2 * M_PI / 6;

unsigned long int printTrigMillis = 0;
int i1 = 160;
int i2 = 170;
int i3 = 175;
long int li1 = 180;
long int li2 = 190;
unsigned long int li3 = 200;
float r1=0;
float r2=0;

int printCnt = 0;

void setTestPin() {       //Uses testpin 13
  PORTB |= 0B00100000;
  };

void clearTestPin() {     //Uses testpin 13
  PORTB &= 0B11011111;
  };

#include <stdint.h>
#include <stdio.h>
#include <math.h>

/*
This function is tested with range 16-255 and 4194304 to 16777216.
The calculation time was about 68 us.
The error on result was very low.
*/
uint32_t fisqrt(uint32_t n) {
  uint16_t guess = 0;
  uint32_t newguess;
  uint32_t testsqr;
  for (int i=12; i >= 0; i--) {
    newguess = guess + (1<<i);
    testsqr = newguess * newguess;
    if (testsqr <= n) {
      guess += 1<<i;
      }
    }
  if (guess < n - guess*guess) {
    guess++;
    }
  return guess;
  };


void setup() {
  Serial.begin(9600);
  pinMode(TestOutPin, OUTPUT);
  pinMode(TestOutPin2, OUTPUT);
  digitalWrite(TestOutPin,LOW);
  pinMode(AnalogPotPin, INPUT);
  printTrigMillis = millis() + PrintPeriodMs;
}

void loop() {

  li3 = random(16, 255);
//  li3 = random(4194304, 16777216);

  setTestPin();
  i3 = fisqrt( li3 );
  clearTestPin();        

  delay(1);
  if ((millis() > printTrigMillis) && (printCnt<1000)) {
    printTrigMillis += PrintPeriodMs;
    printCnt++;
    // print the results to the Serial Monitor:
    //Serial.print("r1 = "); Serial.print(r1);
    //Serial.print("\t r2 = "); Serial.print(r2);
    r1=sqrt(li3);
    //Serial.print("\t i1 = "); Serial.print(i1);
    //Serial.print("\t i2 = "); Serial.print(i2);
    //Serial.print("\t li3 = "); Serial.print(li3);
    //Serial.print("\t i3 = "); Serial.print(i3);
    //Serial.print("\t err% = ");
    Serial.print((i3-r1)*100/r1);
    Serial.println();
    };
}

I think I now tried in the direction you here pointed out. But I use a table in 7-8 bit range. Low numbers got their own table. This is the result with 1000 random numbers in some ranges:

I traded accuracy for speed, and I think the result is within what I need. But it is likely, that better methods exist. The calculation time for numbers I need will be below 10 us.

/*
This code is intended to test calculation speed of some aritmetric operations
and perhaps more. It may be done by using a testpin to be measured by oscilloscope

*/

const uint8_t TestOutPin = 13;         // Test pin for signal to oscilloscope
const uint8_t TestOutPin2 = 5;         // Test pin for to test speed
const uint8_t AnalogPotPin = A0;        // Analog input pin that the potentiometer is attached to

const unsigned long int PrintPeriodMs = 35;
const int Max10BitP1 = 1023 + 1;
const int Max9Bit = 511;
const float Rad60degr = 2 * M_PI / 6;

unsigned long int printTrigMillis = 0;
int i1 = 160;
int i2 = 170;
unsigned int i3 = 175;
long int li1 = 180;
long int li2 = 190;
unsigned long int li3 = 200;
float r1=0;
float r2=0;

int printCnt = 0;

void setTestPin() {       //Uses testpin 13
  PORTB |= 0B00100000;
  };

void clearTestPin() {     //Uses testpin 13
  PORTB &= 0B11011111;
  };

/*
Table based square root function for integers only.

The highest input x possible is 3xMax9Bit^2 = 783363. With 10 bit sine waves from 
3-phase encoder, it becomes 0.75 x Max9Bit^2 = 156641.

Update 5-3-24: skipping binary search and uses only look up tables. For low value the calculation time became high,
and therefore a special look up for that was created. CPU Time depend of value range:
0-255:      2 us
256 - 991:  7 us
992 - 3967: 6 us
3968 - 15871: 4 us
15872 - 65279: 3 us        Accuracy: 
65280 - 261116: 4 us
261117 - 1044464: 6 us
1044465 - 4177856: 8 us    // do not think a use for over max 11 bit square x 0.75 is realistic for use - it is 3142656. 
4177857 - 16711424: 10 us
16711425 - 66845696: 12 us 
*/

int broendegaard4_isqrt(unsigned long x) {              

  static const byte res_table[] PROGMEM = {                          // lowest square is 15872. Highest is 65279
                                                             // table is based on best value mid range providing this index
126,                                            //0
127,128,129,130,131,132,133,134,135,136,
137,138,139,140,141,142,143,144,144,145,
146,147,148,149,150,151,151,152,153,154,
155,156,156,157,158,159,160,160,161,162,
163,164,164,165,166,167,167,168,169,170,
170,171,172,173,173,174,175,176,176,177,
178,179,179,180,181,181,182,183,183,184,
185,186,186,187,188,188,189,190,190,191,
192,192,193,194,194,195,196,196,197,198,
198,199,200,200,201,201,202,203,203,204,
205,205,206,206,207,208,208,209,210,210,
211,211,212,213,213,214,214,215,216,216,
217,217,218,219,219,220,220,221,221,222,
223,223,224,224,225,225,226,227,227,228,
228,229,229,230,230,231,232,232,233,233,
234,234,235,235,236,237,237,238,238,239,
239,240,240,241,241,242,242,243,243,244,
244,245,246,246,247,247,248,248,249,249,
250,250,251,251,252,252,253,253,254,254,
255,255                                   //191-192
 };   

  static const byte low_value_result[] PROGMEM = {
    0,
    1,1,2,2,2,2,3,3,3,3,
    3,3,4,4,4,4,4,4,4,4,
    5,5,5,5,5,5,5,5,5,5,
    6,6,6,6,6,6,6,6,6,6,
    6,6,7,7,7,7,7,7,7,7,
    7,7,7,7,7,7,8,8,8,8,
    8,8,8,8,8,8,8,8,8,8,
    8,8,9,9,9,9,9,9,9,9,
    9,9,9,9,9,9,9,9,9,9,
    10,10,10,10,10,10,10,10,10,10,
    10,10,10,10,10,10,10,10,10,10,
    11,11,11,11,11,11,11,11,11,11,
    11,11,11,11,11,11,11,11,11,11,
    11,11,12,12,12,12,12,12,12,12,
    12,12,12,12,12,12,12,12,12,12,
    12,12,12,12,12,12,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,
    13,13,13,13,13,13,13,13,13,13,
    13,13,14,14,14,14,14,14,14,14,
    14,14,14,14,14,14,14,14,14,14,
    14,14,14,14,14,14,14,14,14,14,
    15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,
    15,15,15,15,15,15,15,15,15,15,
    16,16,16,16,16,16,16,16,16,16,
    16,16,16,16,16 };

  byte i=0;
  byte j=0;
  byte m=0;
  byte p=0;
  unsigned int sq = 0;
  int sqrr = 0;

  if (x > 65279) {            // if x above table range then divide x by 4 until it is in range.
                              // this initial correction is actually quite fast and require about 2 us.
    do {
    x >>= 2;
    i++;}
    while (x > 65279);
    sq = x;}
  else {
    sq = x;
    if (sq <= 255) {
      sqrr = pgm_read_byte_near(low_value_result + sq);
      return sqrr;
      }
    else {
      while (sq < 15872) {        // if x below table range then increase x by factor 4 until it is in range
        sq <<= 2;
        j++; };
      };
    };

  p= (sq >> 8) - 62;    // index to table
  m = pgm_read_byte_near(res_table + p);

  if (j > 0) {          //divide by two each time x was multiplied by 4 at start of function
    if (j == 1 ) {      
      if ((m & 0B00000001) == 1) {
        sqrr = (m+1) >> j; }     //round up if next bit is 1
      else {
        sqrr = m >> j; }
      }
    else {
      m >>= (j-1);
      if ((m & 0B00000001) == 1) {
        sqrr = (m+1) >> 1; }     //round up if next bit is 1
      else {
        sqrr = m >> 1; }  
      };
    }         
  else {
    sqrr = m << i; };        //multiply by two each time x was divided by 4 at start of function
    
  return sqrr;  
  };

void setup() {
  Serial.begin(9600);
  pinMode(TestOutPin, OUTPUT);
  pinMode(TestOutPin2, OUTPUT);
  digitalWrite(TestOutPin,LOW);
  pinMode(AnalogPotPin, INPUT);
  printTrigMillis = millis() + PrintPeriodMs;
}

void loop() {
  // put your main code here, to run repeatedly:
  //r1 = random(10, 20);                 //phase-phase amplitude of BLDC EMF encoder
  //r2 = 2*M_PI*random(Max9Bit)/Max9Bit;      //phase angle of V1 of BLDC EMF encoder
  //i1 = r1*sin(r2);                          //simulated voltage seen on ADC from V1 of EMF encoder
  //i2 = r1*sin(r2 + Rad60degr);              //simulated voltage seen on ADC from V2 of EMF encoder
 
  //li3 = (long) i1*(i1-i2) + (long) i2*i2; // Do work with about 6 us calculation time

  //li3 = random(4194304, 16777216);
  //li3 = random(15872, 65279);
  //li3 = random(15872, 16777216);
  //li3 = random(3968, 15872);
  //li3 = random(992, 3968);
  //li3 = random(256, 992);
  li3 = random(16,255);

  setTestPin();
  i3 = broendegaard4_isqrt( li3 );
  clearTestPin();        

  delay(1);
  if ((millis() > printTrigMillis) && (printCnt<1000)) {
    printTrigMillis += PrintPeriodMs;
    printCnt++;
    r1=sqrt(li3);
    Serial.print((i3-r1)*100/r1);
    Serial.println();
    };
}

For his purpose, hex may have given flexibility and control.
What is stored in AVR 8-bit registers cpu variables is binary values used to make 8, 16, 32 and 64 bit integers (signed or not), 32 bit floats and text in two ways though one is more suited to huge code and RAM programming and leads to problems in small environments like AVR's.

Look at the write functions. You can write bigger than bytes with the passed arg a variable name. The variable is already binary, how it behaves is signed or not, I prefer unsigned as it's less complicated.

The newer bootloaders have functions to write flash after bootloading. It should be possible to load a file image to flash where it can be used. It would be kind of neat if the compiled hex file had that in the first place....

1 Like

The Raspberry Pi PICO basic that can be programmed with Arduino IDE sold for $5 less than a year ago. It runs at 133MHz, has lots of RAM and is dual-core.

16MHz AVR has 160 8-bit cycles in 10 us. PICO gets 1330 32-bit cycles.

When you included to lose precision... the sqrt of a close value stands up.

If you have the log of a value, the log of the sqrt would be half as much. A look-up table can speed that up lots!

1 Like

This topic was automatically closed 180 days after the last reply. New replies are no longer allowed.