Have I made this hardware SPI transfer as fast as possible?

Well, this is the asm code being generated now. It looks a bit odd to me, but it has the right number of nops:

    cli(); // Halt interrupts.
     844:	f8 94       	cli

    do {       
      SPDR = *--thisLED; WAIT;        
     846:	82 91       	ld	r24, -Z
     848:	8e bd       	out	0x2e, r24	; 46
     84a:	00 00       	nop
     84c:	00 00       	nop
     84e:	00 00       	nop
     850:	00 00       	nop
     852:	00 00       	nop
     854:	00 00       	nop
     856:	00 00       	nop
     858:	00 00       	nop
     85a:	00 00       	nop
     85c:	00 00       	nop
     85e:	00 00       	nop
    thisLED = &led::data[led::modules*36]; // The first operation is to decrement this pointer, so the first time we write thisLED it will be array index LEDMODULES*36-1
    lastLED = &led::data[0]; 
    
    cli(); // Halt interrupts.

    do {       
     860:	e4 17       	cp	r30, r20
     862:	f5 07       	cpc	r31, r21
     864:	81 f7       	brne	.-32     	; 0x846 <_Z10updateLEDsv+0x6c>
      SPDR = *--thisLED; WAIT;        
    } while (thisLED != lastLED); // thisLED is decremented one last time before we hit the end of the loop, so after byte 0 transfers, the loop exits.

    WAIT; // Wait for last byte to finish transfer. 
     866:	00 00       	nop
     868:	00 00       	nop
     86a:	00 00       	nop
     86c:	00 00       	nop
     86e:	00 00       	nop
     870:	00 00       	nop
     872:	00 00       	nop
     874:	00 00       	nop
     876:	00 00       	nop
     878:	00 00       	nop
     87a:	00 00       	nop
    SPSR = SPSR & ~_BV(SPIF); // Clear transfer flag.
     87c:	8d b5       	in	r24, 0x2d	; 45
     87e:	8f 77       	andi	r24, 0x7F	; 127
     880:	8d bd       	out	0x2d, r24	; 45
    
    // Move data from shift register into latch register:
    
      LATPORT &= ~(1 << LATPIN); // Set LATCH pin low.  It is important to do this first, just before transitioning high!  I don't know why, but the LED modules won't work right otherwise!
     882:	2a 98       	cbi	0x05, 2	; 5
      LATPORT |= (1 << LATPIN); // Set LATCH pin high.  Transitioning from low to high moves all data from the shift register into the latch registers. 
     884:	2a 9a       	sbi	0x05, 2	; 5

      ENAPORT |= (1 << ENAPIN); // Set BLANK pin high. Turn all outputs off, and reset grayscale timer.  Blanking reduces flicker, but it does not seem to matter if it is before, after, or in the middle of latching.
     886:	2b 9a       	sbi	0x05, 3	; 5
      ENAPORT &= ~(1 << ENAPIN); // Set BLANK pin low. Turn all outputs on.
     888:	2b 98       	cbi	0x05, 3	; 5
      
    sei(); // Reenable interrupts.
     88a:	78 94       	sei