A bit stumped on rewriting a nested for loop for shift register data

I am working on rewriting some code to work with my version of a 3D matrix RGB cube. This is chipKIT uC32 code.

The main refresh routines use the following array:

//  The cube matrix below stores the status of each LED in the cube.
//  Index order: [column 0-7] [panel 0-7] [layer 0-7] [color component: 0 = red, 1 = green, 2 = blue]
//  Each of the 3 color components can vary from 0 to 63 (6 bits), giving 64^3 = 262,144 possible color variations
byte cube[8][8][8][3];

Here is the refresh routine as it stands now. I currently have cubeStructure set to 0.

// refreshCube: one full BAM (bit angle modulation) refresh of the cube.
// For each of the 6 BAM passes (count = 0..5) and each of the 8 layers it
//   1. shifts out data derived from bit `count` of the color components in cube[][][layer][],
//   2. shifts out the 8 bits of anodeLevel[layer],
//   3. pulses LE to latch, then clears OE and the LAYER[layer] bit (via LATECLR),
//   4. waits (1<<count)*10 microseconds and sets OE and LAYER[layer] again.
// cubeStructure selects between two shift-out orderings (see the two branches below).
// currentTime is not used anywhere in the code shown here.
// NOTE: this listing ends at the close of the BAM loop; the function's return
// statement and closing brace are not part of the excerpt.
// NOTE(review): data bits are raised with `LATDSET |= SDIR` (a read-modify-write)
// while every other port write here uses plain `=` on the SET/CLR registers --
// confirm against the MCU's datasheet that reading LATDSET is well defined.
uint32_t refreshCube(uint32_t currentTime) {
  byte red, green, blue;
  for (byte count=0; count<6; count++){  // BAM counter; each increment doubles the time the LEDs are on, starting at 10 microsecs (see the delayMicroseconds calls below).
    digitalWrite(Latch, LOW );  //make sure outputs are latched
    LATDCLR = CLK|SDIR|SDIG|SDIB;  // start each pass with clock and all three data lines low
    
    modeNormal();
    
    if (cubeStructure==0){
      LATDSET = OE;// set OE high while shifting (the same write is commented as "disable output" further down)
      for (byte layer=0; layer<8; layer++){  // scan thru each layer
        for (byte panel=0; panel<8; panel++){  // scan thru every panel
          for (byte column=0; column<8; column++){  // scan thru every column
          
            // Each iteration clocks out three bits for one LED: red, green, blue,
            // all on the SDIR line (SDIG/SDIB are only ever cleared here).
            //-------------------------------------------------------------------
            red = cube[column][panel][layer][0]; // get its red component
            if ((red & 1<<count)>0) {  // is bit `count` of red set? then raise the data line

              LATDSET |= SDIR;
        
            }
            LATDSET = CLK;  //Clock the data in.
            _nop(); 
            LATDCLR = CLK|SDIR|SDIG|SDIB;  // clock and data lines low again
            
            
            
            //-------------------------------------------------------------------
            LATECLR = LAYER[7]; // clear the LAYER[7] bit around the green bit (looks like a logic-analyzer marker -- TODO confirm)
            green = cube[column][panel][layer][1];  // get its green component
            if ((green & 1<<count)>0) {  // is bit `count` of green set? then raise the data line

              LATDSET |= SDIR;
               
            }
            LATDSET = CLK;  //Clock the data in.
            _nop(); 
            LATDCLR = CLK|SDIR|SDIG|SDIB;
            LATESET = LAYER[7]; // set the LAYER[7] bit again (end of the green-bit marker)
            //------------------------------------------------------------------
            blue = cube[column][panel][layer][2];  // get its blue component
            if ((blue & 1<<count)>0) {  // is bit `count` of blue set? then raise the data line

              LATDSET |= SDIR;

            }

            LATDSET = CLK;  //Clock the data in.
            _nop(); 
            LATDCLR = CLK|SDIR|SDIG|SDIB; 
          } // end of column loop
        } // end of panel loop

        
        // 595 Shift Register Anode Control
        // Shifts out the current layer's anode byte
        // one bit at a time, bit 0 first.
        for (int b=0; b<8; b++){
          if (bitRead(anodeLevel[layer],b)){
          LATDSET |= SDIR;  
          }
          LATDSET = CLK;  //Clock the data in.
          _nop(); 
          LATDCLR = CLK|SDIR|SDIG|SDIB;
        } // END ANODE FOR LOOP

        // Pulse Latch Pin High
        // To Latch the data
        LATDSET = LE;  
        _nop(); 
        LATDCLR = LE;
        _nop();        
        
        LATDCLR = OE; // ENABLE OUTPUT 
        LATECLR = LAYER[layer]; // turn on layer
        
        //here count sets time the layer is on
        //starting with 10 microsecs and 
        //ending at 32*10 or 320 microsecs.
        
        delayMicroseconds((1<<count)*10);  
        
        LATDSET = OE; // DISABLE OUTPUT
        LATESET = LAYER[layer]; //turn off layer
        
      } // end layer loop
 
    }
    else {
      
      
      // Disable Outputs.
      LATDSET = OE;
      
      
      // Alternative shift-out ordering (taken when cubeStructure != 0).
      // Scans through cube and outputs the data to the 
      // TLC5916 Shift registers
      // Data is output as a single chain
      // Intended pattern: 24 bytes shifted as RED byte, GREEN byte, BLUE byte per panel
      //
      // NOTE(review): as written, each column loop below only raises SDIR and
      // the clock pulse sits after the loop, so a single bit is clocked per
      // panel and color (high if any of the 8 columns has bit `count` set).
      // The Serial.print calls run on every loop iteration.
      for (byte layer=0; layer<8; layer++){  // scan thru each layer
        for (byte panel=0; panel<8; panel++){  // scan thru every panel    
          for (byte column=0; column<8; column++){  // scan thru every column
          
            // debug output: column, panel, layer separated by tabs
            Serial.print(column);
            Serial.print("\t");
            Serial.print(panel);
            Serial.print("\t");
            Serial.print(layer);
            Serial.println("\t");
            
            // red-----------------------------------------------------
            red = cube[column][panel][layer][0]; // get its red component
            
            if ((red & 1<<count)>0) {  // is bit `count` of red set? then raise the data line
              LATDSET |= SDIR;
            }
           
            } // end RED column loop
          
            LATDSET = CLK;  //Clock the red data in.
            _nop(); 
            LATDCLR = CLK|SDIR|SDIG|SDIB;
//******************************************************************************************            
             for (byte column=0; column<8; column++){  // scan thru every column
              
              // debug output: column, panel, layer separated by tabs
              Serial.print(column);
              Serial.print("\t");
              Serial.print(panel);
              Serial.print("\t");
              Serial.print(layer);
              Serial.println("\t");
            
              // green---------------------------------------------------
              green = cube[column][panel][layer][1]; // get its green component
            
              if ((green & 1<<count)>0) {  // is bit `count` of green set? then raise the data line
                LATDSET |= SDIR;
              }
           
              } // end green column loop 
  
              LATDSET = CLK;  //Clock  the green data in.
              _nop(); 
              LATDCLR = CLK|SDIR|SDIG|SDIB;
              
//******************************************************************************************  
             for (byte column=0; column<8; column++){  // scan thru every column
          
              // debug output: column, panel, layer separated by tabs
              Serial.print(column);
              Serial.print("\t");
              Serial.print(panel);
              Serial.print("\t");
              Serial.print(layer);
              Serial.println("\t");
            
              // blue-----------------------------------------------------
              blue = cube[column][panel][layer][2]; // get its blue component
            
              if ((blue & 1<<count)>0) {  // is bit `count` of blue set? then raise the data line
                LATDSET |= SDIR;
              }
           
              } // end BLUE column loop 
  
              LATDSET = CLK;  //Clock the blue data in.
              _nop(); 
              LATDCLR = CLK|SDIR|SDIG|SDIB;  
//**************************************************************************************  
  
  
            
        } // DO NEXT PANEL
           

        // 595 Shift Register Anode Control
        // Shifts out the current layer's anode byte
        // one bit at a time, bit 0 first.
        for (int b=0; b<8; b++){
          if (bitRead(anodeLevel[layer],b)){
          LATDSET |= SDIR;  
          }
          LATDSET = CLK;  //Clock the data in.
          _nop(); 
          LATDCLR = CLK|SDIR|SDIG|SDIB;
        } // END ANODE FOR LOOP

        // Pulse Latch Pin High
        // To Latch the data
        LATDSET = LE;  
        _nop(); 
        LATDCLR = LE;
        _nop();        
        
        LATDCLR = OE; // ENABLE OUTPUT 
        LATECLR = LAYER[layer]; // turn on layer
        
        //here count sets time the layer is on
        //starting with 10 microsecs and 
        //ending at 32*10 or 320 microsecs.
        
        delayMicroseconds((1<<count)*10);  
        
        LATDSET = OE; // DISABLE OUTPUT
        LATESET = LAYER[layer]; //turn off layer

      } //end layer loop      
    } // end else
 
 
 
    
  } // end bam loop

My problem:

The code as it stands sends the data out as 1 bit red, 1 bit green, 1 bit blue, and repeats that until all 24 bytes are output, all on a single pin instead of three.

This is incorrect for my cube. What I need is 1 byte red, 1 byte green, 1 byte blue, then repeat.

So i need this pattern

Panel 0

Col 0-7 which is 1 byte from the RED array
Col 0-7 which is 1 byte from the Green array
Col 0-7 which is 1 byte from the Blue array

then panel 1 and so on

I attached screen captures of my logic analyzer data

Capture 1 - Full Refresh cycle
Capture 2 - Zoom to first loop of bam counter
Capture 3 - Zoom to the first byte out; the markers show the location of the GREEN bit.

Capture 1.png

I figured out I needed to remove the old column loop and add three separate loops (one per color) inside the panel loop.

// refreshCube (reworked): one full BAM (bit angle modulation) refresh of the cube.
// For each of the 6 BAM passes (count = 0..5) and each of the 8 layers it
//   1. for every panel, clocks out 8 red bits, then 8 green bits, then 8 blue bits
//      (bit `count` of each column's color value), all on the SDIR line,
//   2. shifts out the 8 bits of anodeLevel[layer],
//   3. pulses LE to latch, then clears OE and the LAYER[layer] bit (via LATECLR),
//   4. waits (1<<count)*10 microseconds and sets OE and LAYER[layer] again.
// Only the cubeStructure==0 branch does any work; the else branch is empty.
// currentTime is not used anywhere in the code shown here.
// NOTE: this listing ends at the close of the BAM loop; the function's return
// statement and closing brace are not part of the excerpt.
// NOTE(review): data bits are raised with `LATDSET |= SDIR` (a read-modify-write)
// while every other port write here uses plain `=` on the SET/CLR registers --
// confirm against the MCU's datasheet that reading LATDSET is well defined.
uint32_t refreshCube(uint32_t currentTime) {
  byte red, green, blue;
  for (byte count=0; count<6; count++){  // BAM counter; each increment doubles the time the LEDs are on, starting at 10 microsecs (see delayMicroseconds below).
    digitalWrite(Latch, LOW );  //make sure outputs are latched
    LATDCLR = CLK|SDIR|SDIG|SDIB;    // start each pass with clock and all three data lines low
    modeNormal();    
    if (cubeStructure==0){
      LATDSET = OE;// set OE high while shifting (the same write is commented as "disable output" further down)
      for (byte layer=0; layer<8; layer++){  // scan thru each layer
        for (byte panel=0; panel<8; panel++){  // scan thru every panel

              // RED byte: one bit per column (testr is the column index), one clock per bit
              for (int testr=0; testr<8; testr++){
                LATECLR = LAYER[5]; // test: clear LAYER[5] while a red bit is shifted (presumably a logic-analyzer marker)
                red = cube[testr][panel][layer][0]; // get its red component
                if ((red & 1<<count)>0) {  // is bit `count` of red set? then raise the data line
                  LATDSET |= SDIR;        
                }
                LATDSET = CLK;  //Clock the data in.
                _nop(); 
                LATDCLR = CLK|SDIR|SDIG|SDIB;
                LATESET = LAYER[5]; // test: set LAYER[5] again
              } //END RED BYTE LOOP, RUNS 8 TIMES
              
            //-------------------------------------------------------------------
                
                // GREEN byte: one bit per column (testg is the column index), one clock per bit
                for (int testg=0; testg<8; testg++){
                  LATECLR = LAYER[6]; // test: clear LAYER[6] while a green bit is shifted (presumably a logic-analyzer marker)
                  green = cube[testg][panel][layer][1];  // get its green component
                  if ((green & 1<<count)>0) {  // is bit `count` of green set? then raise the data line
                  LATDSET |= SDIR;               
                  }
                  LATDSET = CLK;  //Clock the data in.
                  _nop(); 
                  LATDCLR = CLK|SDIR|SDIG|SDIB;
                  LATESET = LAYER[6]; // test: set LAYER[6] again
              
                } //END GREEN BYTE LOOP, RUNS 8 TIMES
               
            //------------------------------------------------------------------
            
                // BLUE byte: one bit per column (testb is the column index), one clock per bit
                for (int testb=0; testb<8; testb++){
                  LATECLR = LAYER[7]; // test: clear LAYER[7] while a blue bit is shifted (presumably a logic-analyzer marker)
                  blue = cube[testb][panel][layer][2];  // get its blue component
                  if ((blue & 1<<count)>0) {  // is bit `count` of blue set? then raise the data line
                  LATDSET |= SDIR;
                  }
                  LATDSET = CLK;  //Clock the data in.
                  _nop(); 
                  LATDCLR = CLK|SDIR|SDIG|SDIB;
                  LATESET = LAYER[7]; // test: set LAYER[7] again
                } // END BLUE BYTE LOOP, RUNS 8 TIMES
            


        } // END PANEL LOOP RUNS 8 TIMES

        
        // 595 Shift Register Anode Control
        // Shifts out the current layer's anode byte
        // one bit at a time, bit 0 first.
        for (int b=0; b<8; b++){
          if (bitRead(anodeLevel[layer],b)){
          LATDSET |= SDIR;  
          }
          LATDSET = CLK;  //Clock the data in.
          _nop(); 
          LATDCLR = CLK|SDIR|SDIG|SDIB;
        } // END ANODE FOR LOOP

        // Pulse Latch Pin High
        // To Latch the data
        LATDSET = LE;  
        _nop(); 
        LATDCLR = LE;
        _nop();        
        
        LATDCLR = OE; // ENABLE OUTPUT 
        LATECLR = LAYER[layer]; // turn on layer
        
        //here count sets time the layer is on
        //starting with 10 microsecs and 
        //ending at 32*10 or 320 microsecs.        
        delayMicroseconds((1<<count)*10);          
        LATDSET = OE; // DISABLE OUTPUT
        LATESET = LAYER[layer]; //turn off layer
        
      } // end layer loop
 
    }
    else {
               // nothing is shifted out when cubeStructure != 0 in this version
        } // end else   
  } // end bam loop

My data is now being sent out as bytes in red, green, blue sequence.

I have attached some waveform screenshots.