Hi am thinking of using the algorithm provided here as the basis for determining the line of best fit:
Here is the code adapted for the Arduino Mega:
#include <Wire.h>
#include <math.h>
//using namespace std;
struct Point
{
double x;
double y;
};
Point Collection [6][2] = {
{1,9.3},
{2,11.5},
{3,12.25},
{4,13.1},
{5,14.2},
{6,16.3}
};
void setup() {
// fire up the serial interface for the monitor
Serial.begin(115200); // maximum for Mega 2560#
// Set the headings and tab stops
Serial.println("Linear Regression and Method of Least Squares");
Serial.println("===================================================================================="); Serial.println();
}
void loop() {
leastSqrRegression(*Collection, 6);
while (1);
}
void leastSqrRegression(struct Point* xyCollection, int dataSize)
{
if (xyCollection == NULL || dataSize == 0)
{
Serial.print("Empty data set!");
return;
}
double SUMx = 0; //sum of x values
double SUMy = 0; //sum of y values
double SUMxy = 0; //sum of x * y
double SUMxx = 0; //sum of x^2
double SUMres = 0; //sum of squared residue
double res = 0; //residue squared
double slope = 0; //slope of regression line
double y_intercept = 0; //y intercept of regression line
double SUM_Yres = 0; //sum of squared of the discrepancies
double AVGy = 0; //mean of y
double AVGx = 0; //mean of x
double Yres = 0; //squared of the discrepancies
double Rsqr = 0; //coefficient of determination
//calculate various sums
for (int i = 0; i < dataSize; i++)
{
Serial.print(i); Serial.print(" : "); Serial.print((xyCollection + i)->x); Serial.print("\t"); Serial.println((xyCollection + i)->y);
//sum of x
SUMx = SUMx + (xyCollection + i)->x;
//sum of y
SUMy = SUMy + (xyCollection + i)->y;
//sum of squared x*y
SUMxy = SUMxy + (xyCollection + i)->x * (xyCollection + i)->y;
//sum of squared x
SUMxx = SUMxx + (xyCollection + i)->x * (xyCollection + i)->x;
}
//calculate the means of x and y
AVGy = SUMy / dataSize;
AVGx = SUMx / dataSize;
//slope or a1
slope = (dataSize * SUMxy - SUMx * SUMy) / (dataSize * SUMxx - SUMx*SUMx);
//y itercept or a0
y_intercept = AVGy - slope * AVGx;
Serial.println(); Serial.println("----------------------------------------------------------------------");
Serial.print("x mean(AVGx) = "); Serial.print(AVGx); Serial.print("\t");
Serial.print("y mean(AVGy) = "); Serial.println(AVGy);
Serial.println ("The linear equation that best fits the given data:");
Serial.print (" y = "); Serial.print(slope); Serial.print(" * x + "); Serial.println(y_intercept);
Serial.println ("----------------------------------------------------------------------");
Serial.println (" Original (x,y) (y_i - y_avg)^2 (y_i - a_o - a_1*x_i)^2");
Serial.println ("----------------------------------------------------------------------");
//calculate squared residues, their sum etc.
for (int i = 0; i < dataSize; i++)
{
//current (y_i - a0 - a1 * x_i)^2
Yres = pow(((xyCollection + i)->y - y_intercept - (slope * (xyCollection + i)->x)), 2);
//sum of (y_i - a0 - a1 * x_i)^2
SUM_Yres += Yres;
//current residue squared (y_i - AVGy)^2
res = pow((xyCollection + i)->y - AVGy, 2);
//sum of squared residues
SUMres += res;
Serial.print("\t");
Serial.print((xyCollection + i)->x); Serial.print("\t");
Serial.print((xyCollection + i)->y); Serial.print("\tres: ");
Serial.print(res); Serial.print("\tYres: ");
Serial.println(Yres);
}
//calculate r^2 coefficient of determination
Rsqr = (SUMres - SUM_Yres) / SUMres;
Serial.println();
Serial.println("------------------------------------------------------------");
Serial.print("Sum of (y_i - y_avg)^2 = "); Serial.println(SUMres,3);
Serial.print("Sum of (y_i - a_o - a_1*x_i)^2 = "); Serial.println(SUM_Yres,3);
Serial.print("Standard deviation(St) = "); Serial.println( sqrt(SUMres / (dataSize - 1)),3);
Serial.print("Standard error of the estimate(Sr) = "); Serial.println(sqrt(SUM_Yres / (dataSize-2)),3);
Serial.print("Coefficent of determination(r^2) = "); Serial.println((SUMres - SUM_Yres)/SUMres,3);
Serial.print("Correlation coefficient(r) = "); Serial.println(sqrt(Rsqr),3);
}
The code is almost working. Equally it is not working at all as expected.
Here is the sample output:
Linear Regression and Method of Least Squares
0 : 1.00 9.30
1 : 0.00 0.00
2 : 2.00 11.50
3 : 0.00 0.00
4 : 3.00 12.25
5 : 0.00 0.00
x mean(AVGx) = 1.00 y mean(AVGy) = 5.51
The linear equation that best fits the given data:
y = 4.50 * x + 1.01Original (x,y) (y_i - y_avg)^2 (y_i - a_o - a_1*x_i)^2
1.00 9.30 res: 14.38 Yres: 14.38
0.00 0.00 res: 30.34 Yres: 1.02
2.00 11.50 res: 35.90 Yres: 2.23
0.00 0.00 res: 30.34 Yres: 1.02
3.00 12.25 res: 45.45 Yres: 5.10
0.00 0.00 res: 30.34 Yres: 1.02
Sum of (y_i - y_avg)^2 = 186.752
Sum of (y_i - a_o - a_1*x_i)^2 = 24.752
Standard deviation(St) = 6.111
Standard error of the estimate(Sr) = 2.488
Coefficent of determination(r^2) = 0.867
Correlation coefficient(r) = 0.931
As you can see I have added a few additional print statements to highlight what is happening in the first loop. It seems as if the array is not being passed correctly. The first data pair is {1,9.3} which is passed OK; the second data pair appears as (0,0); the third data pair is in fact the second pair {2,11.5} and so on. The correct solution should be f(x) = 1.28 * x + 8.3143.
I have been racking my brains on this for hours and finally given in. It is not likely the loop is incrementing incorrectly so either the pointer to the array is not being passed correctly or the loop is not accessing the pointers correctly and therefore failing to point to the correct data values. Can anyone help me spot where I have introduced an error (and why)?
Thanks,
Ric