Performance question: four versions of the same sketch

64-bit version:

// Conway's Game Of Life 128x96 using TVout
// P.Beard
// March 2013

#include <TVout.h>

// Matrix width in 64-bit words per row; setup() asks TVout for matWidth * 64 pixels.
#define matWidth 2
// Matrix height in rows (one row of cells per video line requested in setup()).
#define matHeight 96

// TVout driver instance; its screen buffer holds the cell matrix.
TVout TV;
// TV.screen viewed as an array of 64-bit words (matWidth words per row); assigned in setup().
unsigned long long * myScreen;

void setup() {
  TV.begin(PAL, matWidth * 64, matHeight);
  myScreen = (unsigned long long *) TV.screen;
  randomSeed(analogRead(0));
  randomiseMatrix();
  Serial.begin(38400);
}

void loop() {
  unsigned long start = millis();
  for (int i = 1; i <= 10; i++) {
    generateMatrix();
    digitalWrite(13, !digitalRead(13));
  }
  Serial.print( 100000UL / (millis() - start));
  Serial.println( " Gen per 10s");
}

// Reverses the order of the eight bytes in a 64-bit word
// (byte 0 <-> byte 7, byte 1 <-> byte 6, byte 2 <-> byte 5, byte 3 <-> byte 4).
//
// generateMatrix() applies this to every word it reads from or writes to the
// screen buffer; presumably this converts between the buffer's byte order and
// the CPU's little-endian word layout so that 1-bit shifts move cells to the
// adjacent pixel across byte boundaries.
//
// Each byte must move to its mirrored position, so the shift distances are
// 56, 40, 24 and 8 bits. Smaller distances would map two source bytes onto
// the same destination byte and leave the outermost bytes empty.
unsigned long long swapBytes(unsigned long long x) {
  return ((x & 0x00000000000000ffULL) << 56) |
         ((x & 0x000000000000ff00ULL) << 40) |
         ((x & 0x0000000000ff0000ULL) << 24) |
         ((x & 0x00000000ff000000ULL) << 8) |
         ((x & 0x000000ff00000000ULL) >> 8) |
         ((x & 0x0000ff0000000000ULL) >> 24) |
         ((x & 0x00ff000000000000ULL) >> 40) |
         ((x & 0xff00000000000000ULL) >> 56);
}

void randomiseMatrix() {

  //Set up initial cells in matrix
  for (int r = 0; r < matHeight; r++) {
    for (int c = 0; c < matWidth; c++) {
      myScreen[r * matWidth + c] = random(0xffff) << 16 | random(0xffff);
    }
  }
}

void injectGlider() {

  byte col = random(matWidth);
  byte row = random(matHeight);
  myScreen[(row+0) * matWidth + col] |= B0000111;
  myScreen[(row+1) * matWidth + col] |= B0000001;
  myScreen[(row+2) * matWidth + col] |= B0000010;

}
	
void generateMatrix() {
  
  //Variables holding data on neighbouring cells
  unsigned long long NeighbourN[matWidth], NeighbourNW[matWidth], NeighbourNE[matWidth], CurrCells[matWidth], NeighbourW[matWidth];
  unsigned long long NeighbourE[matWidth], NeighbourS[matWidth], NeighbourSW[matWidth], NeighbourSE[matWidth], firstRow[matWidth];
	
  unsigned long long tot1, tot2, tot4, carry, NewCells;

  int changes = 0; // counts the changes in the matrix
  static int prevChanges = 256; // counts the changes in the matrix on prev generation
  static int staleCount = 0; // counts the consecutive occurrances of the same number of changes in the matrix

  //set up N, NW, NE, W & E neighbour data
  //also take a copy of the first row data for use later when calculating last row
  for (byte b = 0; b < matWidth; b++) {
    NeighbourN[b] = swapBytes(myScreen[(matHeight-1) * matWidth + b]);
    firstRow[b] = CurrCells[b] = swapBytes(myScreen[b]);
  }

  carry = NeighbourN[matWidth-1];
  for (char b = 0; b < matWidth; b++) {
    NewCells = NeighbourN[b];
    NeighbourNW[b] = NewCells >> 1 | carry << 31; 
    carry = NewCells;
  }
  
  carry = NeighbourN[0];    
  for (char b = matWidth-1; b >= 0; b--) {
    NewCells = NeighbourN[b];
    NeighbourNE[b] = NewCells << 1 | carry >> 31;
    carry = NewCells;
  }
	
  carry = CurrCells[matWidth-1];
  for (char b = 0; b < matWidth; b++) {
    NewCells = CurrCells[b];
    NeighbourW[b] = NewCells >> 1 | carry << 31;
    carry = NewCells;
  }
  
  carry = CurrCells[0];    
  for (char b = matWidth-1; b >= 0; b--) {
    NewCells = CurrCells[b];
    NeighbourE[b] = NewCells << 1 | carry >> 31;
    carry = NewCells;
  }
  
  //Process each row of the matrix
  for (byte row = 0; row < matHeight; row++) {
		
    //Pick up new S, SW & SE neighbours
    if (row < matHeight - 1) {
      for (byte b = 0; b < matWidth; b++) {
        NeighbourS[b] = swapBytes(myScreen[(row+1) * matWidth + b]);
      }
    }
    else {
      for (byte b = 0; b < matWidth; b++) {
        NeighbourS[b] = firstRow[b];
      }
    }
  
    carry = NeighbourS[matWidth-1];
    for (char b = 0; b < matWidth; b++) {
      NewCells = NeighbourS[b];
      NeighbourSW[b] = NewCells >> 1 | carry << 31;
      carry = NewCells;
    }
      
    carry = NeighbourS[0];    
    for (char b = matWidth-1; b >= 0; b--) {
      NewCells = NeighbourS[b];
      NeighbourSE[b] = NewCells << 1 | carry >> 31;
      carry = NewCells;
    }
  
    for (char b = 0; b < matWidth; b++) {
      
       //Count the live neighbours (in parallel) for the current row of cells
      //However, if total goes over 3, we don't care (see below), so counting stops at 4
      tot1 = NeighbourN[b];
      tot2 = tot1 & NeighbourNW[b]; tot1 = tot1 ^ NeighbourNW[b];
      carry = tot1 & NeighbourNE[b]; tot1 = tot1 ^ NeighbourNE[b]; tot4 = tot2 & carry; tot2 = tot2 ^ carry;
      carry = tot1 & NeighbourW[b]; tot1 = tot1 ^ NeighbourW[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
      carry = tot1 & NeighbourE[b]; tot1 = tot1 ^ NeighbourE[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
      carry = tot1 & NeighbourS[b]; tot1 = tot1 ^ NeighbourS[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
      carry = tot1 & NeighbourSW[b]; tot1 = tot1 ^ NeighbourSW[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
      carry = tot1 & NeighbourSE[b]; tot1 = tot1 ^ NeighbourSE[b]; tot4 = tot2 & carry | tot4; tot2 = tot2 ^ carry;
		
      //Calculate the updated cells:
      // <2 or >3 neighbours, cell dies
      // =2 neighbours, cell continues to live
      // =3 neighbours, new cell born
      NewCells = (CurrCells[b] | tot1) & tot2 & ~ tot4;
      
      //Have any cells changed?
      if (NewCells != CurrCells[b]) {
        myScreen[row * matWidth + b] = swapBytes(NewCells);
        //Count the change for "stale" test
        changes++;
      }
      
      //Current cells (before update), E , W, SE, SW and S neighbours become
      //new N, NW, NE, E, W neighbours and current cells for next loop
      NeighbourN[b] = CurrCells[b];
      NeighbourNW[b] = NeighbourW[b];
      NeighbourNE[b] = NeighbourE[b];
      NeighbourE[b] = NeighbourSE[b];
      NeighbourW[b] = NeighbourSW[b];
      CurrCells[b] = NeighbourS[b];
    } //next col
  } //next row
    
  if (changes != prevChanges) staleCount = 0; else staleCount++; //Detect "stale" matrix
  if (staleCount > 32) injectGlider(); //Inject a glider

  prevChanges = changes;
}