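You have 4 shift registers, yes?
So send the data out using 4 SPI.transfer() calls per multiplex step: one byte for the anode register, then three bytes for the cathode registers.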
void loop (){
currentMicros = micros(); // capture the current 'time'
elapsedMicros = currentMicros - nextMicros; // how much time has passed?
if ( elapsedMicros >=onDuration){ // ready for the next row?
nextMicros = nextMicros + onDuration; // set up for next multiplex time
row=row+1;
if (row==24){row=0;} // keep track of rows 0 to 23
digitalWrite (latchPin, LOW); // use direct port manipulation to do this faster
SPI.transfer(anodePins[row]);
SPI.transfer(CathodePins[row*8]); // use 0 when rows are to be off
SPI.transfer(CathodePins[(row*8)+1]);
SPI.transfer(CathodePins[(row*8)+2]);
// data gets moved to output stage with this rising edge:
digitalWrite (latchPin, LOW); // use direct port manipulation to do this faster
} // end time check
//
// do other stuff while waiting
//
} // end loop
// Anode byte for each of the 8 display rows: exactly one bit set per entry
// (bit 0 for entry 0 up to bit 7 for entry 7). Sent as the first SPI byte of
// every multiplex step.
const uint8_t anodePins[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
and
colorCathodePins[] = { // 3 bytes of data for each row, with 0's when a color is off
data, 0, 0, // row 0
0, data, 0,
0, 0, data,
data, 0, 0, // row 1
0, data, 0,
0, 0, data,
data, 0, 0, // row 2
0, data, 0,
0, 0, data,
data, 0, 0, // row 3
0, data, 0,
0, 0, data,
data, 0, 0, // row 4
0, data, 0,
0, 0, data,
data, 0, 0, // row 5
0, data, 0,
0, 0, data,
data, 0, 0, // row 6
0, data, 0,
0, 0, data,
data, 0, 0, // row 7
0, data, 0,
0, 0, data,
};
Make sense?