How to get BLAS/LAPACK (or any other matrix math library) on Arduino?

I cannot seem to find a lightweight library. How is everyone else doing matrix math/control in their projects?

For basic matrix operations, I use a stripped down version of this macro library. It uses no loops, so executes more quickly at the expense of some program memory.

For small Arduinos, the most straightforward approach to solve complex linear algebra problems is to remove from large code libraries the specific routines you need to solve the problem at hand, and include them in the code.

For an example of how to use the above macro library, below is part of a code that solves the specific problem of determining the rotation/translation matrices relating four points in an image to the camera position in space (called "the PnP problem"). It runs fine on a Uno, with plenty of program memory left over.

// Based on lambda_twist code published by Persson and Nordberg:
// https://github.com/midjji/lambdatwist-p3p
// theory paper at
// http://openaccess.thecvf.com/content_ECCV_2018/html/Mikael_Persson_Lambda_Twist_An_ECCV_2018_paper.html
// S. J. Remington 8/2019
//
//p3p solves rotation and translation from first 3 points,
// uses the fourth point to decide which solution is best.
//input: 4 sets of real world coordinates x1..x4,
//       4 sets of distortion corrected camera coordinates y1..4
//output: camera Rs, Ts final solution
//returns 0 if no solution or number of the valid solution (1-4)

int p3p(float x1[3], float x2[3], float x3[3], float x4[3],
        float y1[3], float y2[3], float y3[3], float y4[3])
{
  float b12, b13, b23;
  VEC_DOT_PRODUCT(b12, y1, y2);
  VEC_DOT_PRODUCT(b13, y1, y3);
  VEC_DOT_PRODUCT(b23, y2, y3);
  b12 = -2.0 * b12;
  b13 = -2.0 * b13;
  b23 = -2.0 * b23;

  float d12[3], d13[3], d23[3];
  VEC_DIFF(d12, x1, x2);
  VEC_DIFF(d13, x1, x3);
  VEC_DIFF(d23, x2, x3);

  float d12xd13[3], a12, a13, a23;
  VEC_CROSS_PRODUCT(d12xd13, d12, d13);
  VEC_DOT_PRODUCT(a12, d12, d12); //squared length
  VEC_DOT_PRODUCT(a13, d13, d13);
  VEC_DOT_PRODUCT(a23, d23, d23);

  // a*g^3 + b*g^2 + c*g + d = 0

  float c31, c23, c12, blob, s31_squared, s23_squared, s12_squared;
  c31 = -0.5 * b13;
  c23 = -0.5 * b23;
  c12 = -0.5 * b12;
  blob = (c12 * c23 * c31 - 1.0);

  s31_squared = 1.0 - c31 * c31;
  s23_squared = 1.0 - c23 * c23;
  s12_squared = 1.0 - c12 * c12;

  float p0, p1, p2, p3;
  p3 = (a13 * (a23 * s31_squared - a13 * s23_squared));
  p2 = 2.0 * blob * a23 * a13 + a13 * (2.0 * a12 + a13) * s23_squared
       + a23 * (a23 - a12) * s31_squared;
  p1 = a23 * (a13 - a23) * s12_squared - a12 * a12 * s23_squared
       - 2.0 * a12 * (blob * a23 +  a13 * s23_squared);

  p0 = a12 * (a12 * s23_squared - a23 * s12_squared);
  //printf("p3..0 %10.5e %10.5e %10.5e %10.5e\n",p3,p2,p1,p0);

  // solve cubic
  float g;
  if (fabs(p3) >= fabs(p0)) {
    if (p3 == 0.0) return 0;
    p3 = 1.0 / p3;
    p2 = p2 * p3;
    p1 = p1 * p3;
    p0 = p0 * p3;
    g = cubick(p2, p1, p0);
  }
  else {
    if (p0 == 0.0) return 0;
    // lower numerical performance
    g = 1.0 / (cubick(p1 / p0, p2 / p0, p3 / p0));
  } //end if (fabs(p3)

  //printf(" gamma %10.5e\n",g);
  //  Serial.print("gamma ");
  //  Serial.println(g,5);

  float A00, A01, A02, A11, A12, A22;
  A00 = a23 * (1.0 - g);
  A01 = (a23 * b12) * 0.5;
  A02 = (a23 * b13 * g) * (-0.5);
  A11 =  a23 - a12 + a13 * g;
  A12 =  b23 * (a13 * g - a12) * 0.5;
  A22 =  g * (a13 - a23) - a12;

  float A[3][3] = {{A00, A01, A02}, {A01, A11, A12}, {A02, A12, A22}};
  float V[3][3], L[3];

  // get the eigenvalues
  eig0(A, V, L);
  //print_v3("E ",L);
  //print_m33("EM",V);

  float v, Ls[4][3];
  int valid = 0; //number of valid solutions found

  //the two eigenvalues should have opposite signs
  v = sqrt(fmax(0.0, -L[1] / L[0]));

  // use the t=Vl with t2,st2,t3 and solve for t3 in t2
  //+v block locals redefined and reused
  {
    float s, w0, w1, w2, a, b, c, d, tau, tau1, tau2;
    s = v;

    w2 = 1.0 / ( s * V[0][1] - V[0][0]);
    w0 = (V[1][0] - s * V[1][1]) * w2;
    w1 = (V[2][0] - s * V[2][1]) * w2;

    a = 1.0 / ((a13 - a12) * w1 * w1 - a12 * b13 * w1 - a12);
    b = (a13 * b12 * w1 - a12 * b13 * w0 - 2.0 * w0 * w1 * (a12 - a13)) * a;
    c = ((a13 - a12) * w0 * w0 + a13 * b12 * w0 + a13) * a;

    if (b * b - 4.0 * c >= 0 ) {

      root2real(b, c, &tau1, &tau2);
      if (tau1 > 0) {
        float l1, l2, l3;
        tau = tau1;
        d = a23 / (tau * (b23 + tau) + 1.0);
        l2 = sqrt(d);
        l3 = tau * l2;
        l1 = w0 * l2 + w1 * l3;
        if (l1 >= 0) {
          Ls[valid][0] = l1;
          Ls[valid][1] = l2;
          Ls[valid][2] = l3;
          valid = valid + 1;
        }
      }
      if (tau2 > 0) {
        float l1, l2, l3;
        tau = tau2;
        d = a23 / (tau * (b23 + tau) + 1.0);
        l2 = sqrt(d);
        l3 = tau * l2;
        l1 = w0 * l2 + w1 * l3;
        if (l1 >= 0) {
          Ls[valid][0] = l1;
          Ls[valid][1] = l2;
          Ls[valid][2] = l3;
          valid = valid + 1;
        }

      }
    }
  }  //end +v block

  //-v block locals redefined and reused
  {
    float s, w0, w1, w2, a, b, c, d, tau, tau1, tau2;
    s = -v;
    w2 = 1.0 / ( s * V[0][1] - V[0][0]);
    w0 = (V[1][0] - s * V[1][1]) * w2;
    w1 = (V[2][0] - s * V[2][1]) * w2;

    a = 1.0 / ((a13 - a12) * w1 * w1 - a12 * b13 * w1 - a12);
    b = (a13 * b12 * w1 - a12 * b13 * w0 - 2.0 * w0 * w1 * (a12 - a13)) * a;
    c = ((a13 - a12) * w0 * w0 + a13 * b12 * w0 + a13) * a;

    if (b * b - 4.0 * c >= 0 ) {

      root2real(b, c, &tau1, &tau2);
      if (tau1 > 0) {
        float l1, l2, l3;
        tau = tau1;
        d = a23 / (tau * (b23 + tau) + 1.0);
        l2 = sqrt(d);
        l3 = tau * l2;
        l1 = w0 * l2 + w1 * l3;
        if (l1 >= 0) {
          Ls[valid][0] = l1;
          Ls[valid][1] = l2;
          Ls[valid][2] = l3;
          valid = valid + 1;
        }
      }
      if (tau2 > 0) {
        float l1, l2, l3;
        tau = tau2;
        d = a23 / (tau * (b23 + tau) + 1.0);
        l2 = sqrt(d);
        l3 = tau * l2;
        l1 = w0 * l2 + w1 * l3;
        if (l1 >= 0) {
          Ls[valid][0] = l1;
          Ls[valid][1] = l2;
          Ls[valid][2] = l3;
          valid = valid + 1;
        }

      }
    }
  } //end -v block

  printf("valid solutions %d\n",valid);

//  Serial.print(F(" valid solutions"));
//  Serial.println(valid);

  float Xinv[3][3];
  { //block
    float X[3][3] = { //local
      {d12[0], d13[0], d12xd13[0]},
      {d12[1], d13[1], d12xd13[1]},
      {d12[2], d13[2], d12xd13[2]}
    };

    float detx = 0.0;
    INVERT_3X3(Xinv, detx, X);
  } //end block

  int i, i_min;
  float min_rms = 1.0e6; //for sorting solutions
  i_min = 1000;

  float R[3][3], T[3];  //locals

  for (i = 0; i < valid; i++) {

    float ry1[3], ry2[3], ry3[3]; //result vectors
    float yd1[3], yd2[3], yd1xd2[3]; //deltas
    float LR[3] = {Ls[i][0], Ls[i][1], Ls[i][2]};

    // Gauss-Newton refinement step (checked by perturbation: OK!)
    refine_lambda(LR, a12, a13, a23, b12, b13, b23);

    //compute the rotation
    VEC_SCALE(ry1, LR[0], y1);
    VEC_SCALE(ry2, LR[1], y2);
    VEC_SCALE(ry3, LR[2], y3);
    VEC_DIFF(yd1, ry1, ry2);
    VEC_DIFF(yd2, ry1, ry3);
    VEC_CROSS_PRODUCT(yd1xd2, yd1, yd2);

    float Y[3][3] = {
      {yd1[0], yd2[0], yd1xd2[0]},
      {yd1[1], yd2[1], yd1xd2[1]},
      {yd1[2], yd2[2], yd1xd2[2]}
    };

    MATRIX_PRODUCT_3X3(R, Y, Xinv);  //local R

    //solve translation using first point (arbitrary choice)
    // now try solving with all 3 points and average.
    // (averaging 3 points not seem to help using calculated camera coords
    //  so repeat test with real cam coords)

    { //block
      float ry[3], rsum[3] = {0.0};
      MAT_DOT_VEC_3X3(ry, R, x1);
      //    Ts=(ry1 - (Rs*x1')');
      VEC_DIFF(rsum, ry1, ry);
      // next point
      MAT_DOT_VEC_3X3(ry, R, x2);
      VEC_DIFF(ry, ry2, ry);
      VEC_SUM(rsum, rsum, ry);
      // and third
      MAT_DOT_VEC_3X3(ry, R, x3);
      VEC_DIFF(ry, ry3, ry);
      VEC_SUM(rsum, rsum, ry);
      VEC_SCALE(T, 1.0 / 3.0, rsum);  //result = local T
    }

    float rms = test_sol(R, T, x1, x2, x3, x4, y1, y2, y3, y4);

    if (rms < min_rms) {
      COPY_MATRIX_3X3(Rs, R); //save this matrix solution
      VEC_COPY(Ts, T); //and this vector

      i_min = i; //solution index
      min_rms = rms;
    printf("index %d rms = %8.5f\n",i_min, min_rms);
//      Serial.print("#");
//      Serial.print(i_min);
//      Serial.print(F(" rmsE = "));
//      Serial.println(min_rms);
    }

  }  //end for i<valid
  if (valid > 0) return i_min + 1; //solution number (index+1)
  else return 0;  //nada
} //end p3p