Have I made this hardware SPI transfer as fast as possible?

OK, well running this exact code:

#include <SPI.h>

// NOP: emits one `nop` instruction through inline assembly.  The macro body
// already ends in a semicolon, so several NOPs can be written back to back
// without separators (and `NOP;` / `WAIT;` leave one harmless empty statement).
#define NOP __asm__ __volatile__ ("nop\n");
// WAIT: fixed delay of eleven NOPs used to pace successive writes to SPDR
// instead of polling a status flag.  The measurements that accompany this
// sketch report that all eleven are required: together with the rest of the
// send loop they give an 18-cycle period per byte, and the transfer stops
// working if one NOP is removed.
#define WAIT NOP NOP NOP NOP NOP NOP NOP NOP NOP NOP NOP // 11 NOPs 

namespace myLeds {

    // Per-module buffer sizes, in bytes: LED data and brightness scaling data.
    const int LEDBYTES = 36;
    const int SCALEBYTES = 24;

    // Number of LED modules connected.  Originally noted as "defined in
    // config.txt"; in this sketch it is assigned in setup().
    int modules;

    // LED data, stored as a packed string of 12-bit LED values: 3 bytes for
    // every two LEDs.  It is kept in this form so that nothing has to be
    // altered while shifting it out to the TLC5947s; modifying the data with
    // bit shifts would be very expensive and there is a lot of data to move.
    byte *data;

    // Scaling factor for LED brightness, so that e.g. the powercell
    // brightness can be adjusted without changing all the code that assumes
    // the maximum desired brightness is 1.0.  The value stored here is
    // scaling_factor * 255, where scaling_factor is in 0..1.
    byte *scale;

}  // end of namespace myLeds

void setup ()
  {
  myLeds::modules = 2;

  myLeds::data =  (byte *) malloc (myLeds::modules * myLeds::LEDBYTES * sizeof(byte));  
  myLeds::scale = (byte *) malloc (myLeds::modules * myLeds::SCALEBYTES * sizeof(byte));
  
  SPI.begin ();
  digitalWrite (SS, HIGH);
  SPI.setClockDivider (SPI_CLOCK_DIV2);
  Serial.begin (115200);
  
  byte *thisLED = &myLeds::data [myLeds::modules * myLeds::LEDBYTES]; 
  byte *lastLED = &myLeds::data [0]; 
  
  // populate the array
  byte val = 0;
   do 
     {       
      *--thisLED = val++;
      } while (thisLED != lastLED); 
  
  }  // end of setup
  
// Sends the whole LED buffer over hardware SPI once, times the transfer, prints
// the elapsed microseconds on the serial port, then waits one second.
//
// Bytes are written straight to SPDR, from the last byte of the buffer down to
// the first.  No status flag is polled between bytes: the fixed WAIT delay is
// what spaces the writes, so the statements inside the send loop must not be
// changed without re-checking the timing.
void loop ()
  {
      
  // thisLED starts one past the end of the LED data and is pre-decremented
  // before each use; lastLED marks the first byte, which is the last one sent.
  byte *thisLED = &myLeds::data [myLeds::modules * myLeds::LEDBYTES]; 
  byte *lastLED = &myLeds::data [0]; 
     
  // SS is held low for the duration of the transfer.
  digitalWrite (SS, LOW);
  
  // Interrupts are off for the timed section -- presumably so that an ISR
  // cannot stretch the fixed spacing between SPDR writes.
  // NOTE(review): micros() is called while interrupts are disabled; confirm
  // its result is still reliable over a section of this length.
  noInterrupts ();
  unsigned long startTime = micros();
  
   do 
     {       
     // Start the next byte, then burn the fixed delay before the next write.
     SPDR = *--thisLED; WAIT;
      } while (thisLED != lastLED); 
  
  WAIT; // Wait for last byte to finish transfer. 
  // NOTE(review): the AVR datasheet describes SPIF as cleared by reading SPSR
  // with SPIF set and then accessing SPDR; confirm that writing the bit back
  // as zero here has the intended effect.
  SPSR = SPSR & ~_BV(SPIF); // Clear transfer flag.
  unsigned long endTime = micros();
  digitalWrite (SS, HIGH);

  interrupts ();
  // Report the measured transfer time, then idle for 1000 ms.
  Serial.println (endTime - startTime);
  delay (1000);
  }  // end of loop

And checking with the logic analyzer:

Notice the 125 ns gap between bytes? (The 17th and 18th clock cycles.)

That required the 11 NOPs (added to the other stuff in the loop).

If I remove a NOP it doesn't work. So each byte needs a period of 18 clock cycles. Put another way, the time between the leading clock edges of consecutive bytes was 1.125 µs (18 cycles of 62.5 ns each).

The output in the serial monitor was "84", which sounds about right (36 * 2 * 1.125 = 81 µs, plus a bit of overhead for that last byte etc.).