Speeding Up matrix computation

Hi everyone,

I am trying to learn to produce more efficient and faster computations in Arduino.

I am performing a maths equation with two 13x13 Matrix that takes around 400 us to compute. I had the idea to maybe use the cache for faster access to the matrix's that i saved as global int Array's but wasn't really sure how to go about it.

Can you recommend places to go or further ideas to achieve faster operation.

Z

void setup() {
  // put your setup code here, to run once:
Serial.begin(57600);
}

  int A[13][13] = {
  {106,62,69,74,8,91,97,6,114,86,45,48,34},
  {72,49,100,34,69,7,33,21,23,10,71,33,26},
  {85,120,57,101,65,59,63,104,94,75,54,107,43},
  {19,25,89,100,3,27,10,91,32,36,86,26,94},
  {100,31,87,111,39,89,66,6,123,46,84,30,73},
  {18,38,108,60,37,17,76,98,68,71,40,116,79},
  {38,9,112,81,14,106,61,67,4,29,94,57,106},
  {40,17,29,122,98,52,7,15,58,93,124,4,16},
  {101,23,13,94,124,107,84,21,46,80,66,98,48},
  {37,80,44,27,124,49,118,83,43,3,123,15,126},
  {87,79,41,126,60,97,35,13,113,31,93,32,118},
  {23,118,69,55,75,95,118,59,50,34,99,11,101},
  {72,115,5,26,95,28,96,10,113,112,39,119,22}
};

  int B[13][13] = {
    {84,67,47,117,51,34,100,80,91,1,87,107,69},
    {116,90,91,57,85,39,45,83,56,108,6,66,72},
    {101,5,118,66,89,100,20,123,67,67,16,97,16},
    {0,68,73,108,93,96,78,93,46,21,69,4,53},
    {126,26,63,35,52,55,126,124,93,41,7,57,33},
    {54,9,8,36,48,48,109,103,101,39,1,52,10},
    {102,99,94,107,80,127,73,126,122,32,19,113,13},
    {6,82,30,99,52,119,59,89,92,83,22,84,127},
    {110,113,105,96,82,116,123,67,110,73,20,35,55},
    {30,80,110,14,33,90,25,27,122,119,11,8,28},
    {47,67,61,71,92,5,17,29,83,51,73,97,37},
    {38,67,25,46,26,93,9,18,87,46,65,1,110},
    {0,95,18,99,31,68,30,43,53,71,31,109,49}
  };

int C[13][13];

void loop() {
  
uint32_t START = micros();
    for (int i = 0; i < 13; i++)//rows
    {
           for (int j = 0; j < 13; j++)//coloumns 
           {
               C[i][j] = (((A[i][j])*(B[i][j]))+(2*(A[i][j])));  //this is the computation that I would like to 
                                                                                //make more efficient 
           } 
    }
  uint32_t END = micros();
  uint32_t result = END - START;
  Serial.print("Time taken for computation of (A*B)+(2*A) = ");
  Serial.print(result); Serial.print(" us\nResult:\n");
  
    for (int i = 0; i < 13; i++)//rows
    {
           for (int j = 0; j < 13; j++)//coloumns 
           {
               Serial.print(C[i][j]); Serial.print(" ");
               //Serial.print(i); Serial.print(j); Serial.print("  ");
           }
      Serial.println(); 
    }
  
  delay(3000);  
  exit(0);
  
}

(deleted)

Please read this and place your code inside tags.

use the cache for faster access

What cache?

This

C[i][j] = (((A[i][j])-(B[i][j]))+(2*(A[i][j])));  //this is the computation that I would like to
                                                                                //make more efficient

is equivalent to

C[i][j] = 3*A[i][j] - B[i][j];

but the compiler may have already figured that out for you. It is quite "smart".

The equation disagrees with the printed comment:

  Serial.print("Time taken for computation of (A*B)+(2*A) = ")

Why are you using 2D arrays for a point by point, 1D computation?

whoops, it should of been this to match the text equation.

C[i][j] = (((A[i][j])*(B[i][j]))+(2*(A[i][j])));  //this is the computation that I would like to
                                                                                //make more efficient

jremington:

jremington:
Why are you using 2D arrays for a point by point, 1D computation?

I'm not sure i understand. But my idea was to cycle through the matrix for each point so that the computation would be faster.

I wonder if pointer math and a single loop would be faster than array math and a double loop:

void loop()
{
  uint32_t START = micros();
  int *c = &C[0][0];
  int *a = &A[0][0];
  int *b = &B[0][0];


  for (int i = 0; i < 13 * 13; i++) // All array elements
  {
    //this is the computation that I would like to
    //make more efficient
    *c = (*a * *b) + (2 * *a);
    c++;
    a++;
    b++;
  }


  uint32_t END = micros();

I have been questioning if its the maths (or at least make the maths as efficient as possible), or how easy it is to try and change how quickly the memory is accessed and stored. Any ideas?

johnwasser:
I wonder if pointer math and a single loop would be faster than array math and a double loop:

So I gave it a try and compared the results, apparently using your method is faster measuring at (312 us, from the previous 464 us). Which is a pretty impressive boost.

First matrix computation.PNG

Second matrix computation.PNG

Next thing I would try is to declare A and B as const uint8_t. All of the values are below 255 and this might save on fetch time. Be sure to change the 'a' and 'b' pointers to match or the answers will be WAY off.

void setup()
{
  // put your setup code here, to run once:
  Serial.begin(57600);
}


const uint8_t A[13][13] =
{
  {106, 62, 69, 74, 8, 91, 97, 6, 114, 86, 45, 48, 34},
  {72, 49, 100, 34, 69, 7, 33, 21, 23, 10, 71, 33, 26},
  {85, 120, 57, 101, 65, 59, 63, 104, 94, 75, 54, 107, 43},
  {19, 25, 89, 100, 3, 27, 10, 91, 32, 36, 86, 26, 94},
  {100, 31, 87, 111, 39, 89, 66, 6, 123, 46, 84, 30, 73},
  {18, 38, 108, 60, 37, 17, 76, 98, 68, 71, 40, 116, 79},
  {38, 9, 112, 81, 14, 106, 61, 67, 4, 29, 94, 57, 106},
  {40, 17, 29, 122, 98, 52, 7, 15, 58, 93, 124, 4, 16},
  {101, 23, 13, 94, 124, 107, 84, 21, 46, 80, 66, 98, 48},
  {37, 80, 44, 27, 124, 49, 118, 83, 43, 3, 123, 15, 126},
  {87, 79, 41, 126, 60, 97, 35, 13, 113, 31, 93, 32, 118},
  {23, 118, 69, 55, 75, 95, 118, 59, 50, 34, 99, 11, 101},
  {72, 115, 5, 26, 95, 28, 96, 10, 113, 112, 39, 119, 22}
};




const uint8_t  B[13][13] =
{
  {84, 67, 47, 117, 51, 34, 100, 80, 91, 1, 87, 107, 69},
  {116, 90, 91, 57, 85, 39, 45, 83, 56, 108, 6, 66, 72},
  {101, 5, 118, 66, 89, 100, 20, 123, 67, 67, 16, 97, 16},
  {0, 68, 73, 108, 93, 96, 78, 93, 46, 21, 69, 4, 53},
  {126, 26, 63, 35, 52, 55, 126, 124, 93, 41, 7, 57, 33},
  {54, 9, 8, 36, 48, 48, 109, 103, 101, 39, 1, 52, 10},
  {102, 99, 94, 107, 80, 127, 73, 126, 122, 32, 19, 113, 13},
  {6, 82, 30, 99, 52, 119, 59, 89, 92, 83, 22, 84, 127},
  {110, 113, 105, 96, 82, 116, 123, 67, 110, 73, 20, 35, 55},
  {30, 80, 110, 14, 33, 90, 25, 27, 122, 119, 11, 8, 28},
  {47, 67, 61, 71, 92, 5, 17, 29, 83, 51, 73, 97, 37},
  {38, 67, 25, 46, 26, 93, 9, 18, 87, 46, 65, 1, 110},
  {0, 95, 18, 99, 31, 68, 30, 43, 53, 71, 31, 109, 49}
};


int C[13][13];


void loop()
{
  uint32_t START = micros();
  int *c = &C[0][0];
  const uint8_t * a = &A[0][0];
  const uint8_t * b = &B[0][0];


  for (int i = 0; i < 13 * 13; i++) //rows
  {
    //this is the computation that I would like to
    //make more efficient
    *c = (*a * *b) + (2 * *a);
    c++;
    a++;
    b++;
  }


  uint32_t END = micros();
  uint32_t result = END - START;
  Serial.print("Time taken for computation of (A*B)+(2*A) = ");
  Serial.print(result); Serial.print(" us\nResult:\n");


  for (int i = 0; i < 13; i++)//rows
  {
    for (int j = 0; j < 13; j++)//coloumns
    {
      Serial.print(C[i][j]); Serial.print(" ");
      //Serial.print(i); Serial.print(j); Serial.print("  ");
    }
    Serial.println();
  }


  delay(3000);
  exit(0);
}

johnwasser:
Next thing I would try is to declare A and B as const uint8_t. All of the values are below 255 and this might save on fetch time. Be sure to change the 'a' and 'b' pointers to match or the answers will be WAY off.

You are a genius! It has gone down to 248 us. This is really interesting to explore, I am trying to become a better programmer and its fascinating to know that is now almost 2x more efficient.