Hi all,
I would be interested to gather any insights into this.
I have 4 versions of the same sketch here. Each has the same amount of bit-wise processing to be performed, which includes lots of shifts and logical AND/OR etc. The application is running Conway's Game Of Life on a 128x192 grid and using the TVOut library to display the results on a Nano 3.
The difference between the 4 sketches is the number of bits/cells processed in parallel: 8 bits, 16 bits, 32 bits or 64 bits at a time.
The results are:
| Version | Sketch size | Generations per second |
|---|---|---|
| 8 bit | 8K | 10 |
| 16 bit | 9K | 15 |
| 32 bit | 10K | 15 |
| 64 bit | 18K | 3 |
The surprise for me was how poor the 64 bit version performs, and its size.
8 bit sketch:
// Conway's Game Of Life 128x96 using TVout
// P.Beard
// March 2013
#include <TVout.h>
#define matWidth 16
#define matHeight 96
TVout TV;
unsigned char * myScreen;
void setup() {
TV.begin(PAL, matWidth * 8, matHeight);
myScreen = (unsigned char *) TV.screen;
randomSeed(analogRead(0));
randomiseMatrix();
Serial.begin(38400);
}
void loop() {
unsigned long start = millis();
for (int i = 1; i <= 10; i++) {
generateMatrix();
digitalWrite(13, !digitalRead(13));
}
Serial.print( 100000UL / (millis() - start));
Serial.println( " Gen per 10s");
}
unsigned int swapBytes(unsigned int x) {
return x;
}
void randomiseMatrix() {
//Set up initial cells in matrix
for (int r = 0; r < matHeight; r++) {
for (int c = 0; c < matWidth; c++) {
myScreen[r * matWidth + c] = random(0xff);
}
}
}
void injectGlider() {
byte col = random(matWidth);
byte row = random(matHeight);
myScreen[(row+0) * matWidth + col] |= B0000111;
myScreen[(row+1) * matWidth + col] |= B0000001;
myScreen[(row+2) * matWidth + col] |= B0000010;
}
void generateMatrix() {
//Variables holding data on neighbouring cells
unsigned char NeighbourN[matWidth], NeighbourNW[matWidth], NeighbourNE[matWidth], CurrCells[matWidth], NeighbourW[matWidth];
unsigned char NeighbourE[matWidth], NeighbourS[matWidth], NeighbourSW[matWidth], NeighbourSE[matWidth], firstRow[matWidth];
unsigned char tot1, tot2, tot4, carry, NewCells;
int changes = 0; // counts the changes in the matrix
static int prevChanges = 256; // counts the changes in the matrix on prev generation
static int staleCount = 0; // counts the consecutive occurrances of the same number of changes in the matrix
//set up N, NW, NE, W & E neighbour data
//also take a copy of the first row data for use later when calculating last row
for (byte b = 0; b < matWidth; b++) {
NeighbourN[b] = swapBytes(myScreen[(matHeight-1) * matWidth + b]);
firstRow[b] = CurrCells[b] = swapBytes(myScreen[b]);
}
carry = NeighbourN[matWidth-1];
for (char b = 0; b < matWidth; b++) {
NewCells = NeighbourN[b];
NeighbourNW[b] = NewCells >> 1 | carry << 7;
carry = NewCells;
}
carry = NeighbourN[0];
for (char b = matWidth-1; b >= 0; b--) {
NewCells = NeighbourN[b];
NeighbourNE[b] = NewCells << 1 | carry >> 7;
carry = NewCells;
}
carry = CurrCells[matWidth-1];
for (char b = 0; b < matWidth; b++) {
NewCells = CurrCells[b];
NeighbourW[b] = NewCells >> 1 | carry << 7;
carry = NewCells;
}
carry = CurrCells[0];
for (char b = matWidth-1; b >= 0; b--) {
NewCells = CurrCells[b];
NeighbourE[b] = NewCells << 1 | carry >> 7;
carry = NewCells;
}
//Process each row of the matrix
for (byte row = 0; row < matHeight; row++) {
//Pick up new S, SW & SE neighbours
if (row < matHeight - 1) {
for (byte b = 0; b < matWidth; b++) {
NeighbourS[b] = swapBytes(myScreen[(row+1) * matWidth + b]);
}
}
else {
for (byte b = 0; b < matWidth; b++) {
NeighbourS[b] = firstRow[b];
}
}
carry = NeighbourS[matWidth-1];
for (char b = 0; b < matWidth; b++) {
NewCells = NeighbourS[b];
NeighbourSW[b] = NewCells >> 1 | carry << 7;
carry = NewCells;
}
carry = NeighbourS[0];
for (char b = matWidth-1; b >= 0; b--) {
NewCells = NeighbourS[b];
NeighbourSE[b] = NewCells << 1 | carry >> 7;
carry = NewCells;
}
for (char b = 0; b < matWidth; b++) {
//Count the live neighbours (in parallel) for the current row of cells
//However, if total goes over 3, we don't care (see below), so counting stops at 4
tot1 = NeighbourN[b];
tot2 = tot1 & NeighbourNW[b]; tot1 = tot1 ^ NeighbourNW[b];
carry = tot1 & NeighbourNE[b]; tot1 = tot1 ^ NeighbourNE[b]; tot4 = tot2 & carry; tot2 = tot2 ^ carry;
carry = tot1 & NeighbourW[b]; tot1 = tot1 ^ NeighbourW[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
carry = tot1 & NeighbourE[b]; tot1 = tot1 ^ NeighbourE[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
carry = tot1 & NeighbourS[b]; tot1 = tot1 ^ NeighbourS[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
carry = tot1 & NeighbourSW[b]; tot1 = tot1 ^ NeighbourSW[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
carry = tot1 & NeighbourSE[b]; tot1 = tot1 ^ NeighbourSE[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
//Calculate the updated cells:
// <2 or >3 neighbours, cell dies
// =2 neighbours, cell continues to live
// =3 neighbours, new cell born
NewCells = (CurrCells[b] | tot1) & tot2 & ~ tot4;
//Have any cells changed?
if (NewCells != CurrCells[b]) {
myScreen[row * matWidth + b] = swapBytes(NewCells);
//Count the change for "stale" test
changes++;
}
//Current cells (before update), E , W, SE, SW and S neighbours become
//new N, NW, NE, E, W neighbours and current cells for next loop
NeighbourN[b] = CurrCells[b];
NeighbourNW[b] = NeighbourW[b];
NeighbourNE[b] = NeighbourE[b];
NeighbourE[b] = NeighbourSE[b];
NeighbourW[b] = NeighbourSW[b];
CurrCells[b] = NeighbourS[b];
} //next col
} //next row
if (changes != prevChanges) staleCount = 0; else staleCount++; //Detect "stale" matrix
if (staleCount > 32) injectGlider(); //Inject a glider
prevChanges = changes;
}