Help with decreasing program loop time

I am building a 13x13 RGB LED multiplexed display. With 13 LEDs in one row, that gives me a total of 39 pins (plus 13 ground pins) that have to be controlled. I am using 6 shift registers and two Darlington arrays for my outputs. Everything is working great: I'm independently controlling LEDs and doing simple animations. The problem I am having is that when I have a lot of the LEDs on at a time, they start to flicker. Does anyone have any ideas on how I can decrease the loop time and, in turn, update the LEDs much faster?

Here is the code for you to take a look at.

// Arduino pins wired to the shift-register chain (see writeRegisters()).
int SER_Pin = 7;    // data bit presented before each shift-clock pulse
int SRCLK_Pin = 8;  // shift clock, pulsed once per bit
int RCLK_Pin = 9;   // held LOW while shifting, raised HIGH afterwards

// Pause, in microseconds, passed to delayMicroseconds() by Display().
int DELAY = 50;

// Indices into registers[] for each colour channel; Display() indexes these
// tables with the second image subscript.
int redpins[13] = { 1, 4, 7, 10, 16, 19, 22, 25, 28, 34, 37, 40, 43 };
int grnpins[13] = { 0, 3, 6, 9, 12, 18, 21, 24, 27, 33, 36, 39, 42 };
int blupins[13] = { 2, 5, 8, 11, 17, 20, 23, 26, 32, 35, 38, 41, 44 };

// Arduino pin numbers driven directly with digitalWrite(); Display() indexes
// this table with the first image subscript.
int colpins[13] = { 14, 15, 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6 };
// 13x13 frames in the format Display() expects. Each cell selects a colour
// table: 1 -> redpins, 2 -> grnpins, 3 -> blupins; any other value (0 here)
// leaves that LED untouched. The first subscript is used with colpins[], the
// second with the colour pin table.
//
// face: mixes all three colour codes. Not referenced by the code shown here.
int face[13][13] = {
  {0,0,0,0,0,0,0,0,0,0,0,0,0},
  {0,0,0,0,0,0,0,0,0,0,0,0,0},
  {0,0,0,0,0,0,0,0,0,0,0,0,0},
  {0,0,0,1,0,0,0,0,0,1,0,0,0},
  {0,0,0,1,0,0,0,0,0,1,0,0,0},
  {0,0,0,0,0,0,3,0,0,0,0,0,0},
  {0,0,0,0,0,0,3,0,0,0,0,0,0},
  {0,2,0,0,0,0,0,0,0,0,0,2,0},
  {0,2,0,0,0,0,0,0,0,0,0,2,0},
  {0,0,2,0,0,0,0,0,0,0,2,0,0},
  {0,0,0,2,2,2,2,2,2,2,0,0,0},
  {0,0,0,0,0,0,0,0,0,0,0,0,0},
  {0,0,0,0,0,0,0,0,0,0,0,0,0},
  };
  // C: single-colour (code 1) frame; this is the frame loop() passes to Display().
  int C[13][13] = {
  {0,0,0,0,1,1,1,1,1,0,0,0,0},
  {0,0,0,1,1,1,1,1,1,1,0,0,0},
  {0,0,1,1,0,0,0,0,0,1,1,0,0},
  {0,0,1,1,0,0,0,0,0,0,0,0,0},
  {0,0,1,1,0,0,0,0,0,0,0,0,0},
  {0,0,1,1,0,0,0,0,0,0,0,0,0},
  {0,0,1,1,0,0,0,0,0,0,0,0,0},
  {0,0,1,1,0,0,0,0,0,0,0,0,0},
  {0,0,1,1,0,0,0,0,0,0,0,0,0},
  {0,0,1,1,0,0,0,0,0,0,0,0,0},
  {0,0,1,1,0,0,0,0,0,1,1,0,0},
  {0,0,0,1,1,1,1,1,1,1,0,0,0},
  {0,0,0,0,1,1,1,1,1,0,0,0,0}};
  
  // allon: every cell set to code 1, i.e. all 169 LEDs are visited by Display().
  // Not referenced by the code shown here.
  int allon[13][13] = {
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1},
  {1,1,1,1,1,1,1,1,1,1,1,1,1}};
  
// Number of 74HC595 shift registers in the chain.
#define number_of_74hc595s 6
// Total output bits across all chips (8 per chip). The expansion is
// parenthesised so the macro stays correct inside larger expressions such as
// `n / numOfRegisterPins` or `n % numOfRegisterPins`.
#define numOfRegisterPins (number_of_74hc595s * 8)

// In-memory copy of every shift-register output bit. setRegisterPin() and
// clearRegisters() edit it; writeRegisters() clocks it out to the chips.
boolean registers[numOfRegisterPins];

// One-time initialisation: configures the three shift-register control pins
// and the directly driven pins as outputs, and zeroes the register shadow
// array.
void setup(){

  pinMode(SER_Pin, OUTPUT);
  pinMode(RCLK_Pin, OUTPUT);
  pinMode(SRCLK_Pin, OUTPUT);

  // clearRegisters() already sets every entry of registers[] to LOW, bounded
  // by the array's own size. The former extra loop that repeated this with a
  // hard-coded 48 was redundant and would have written past the array if the
  // chip count were ever lowered, so it has been removed.
  clearRegisters();

  // Pins 14..19 and 0..6 are the same pins listed in colpins[]; start them LOW.
  for(int x = 14; x < 20; x++){
    pinMode(x, OUTPUT);
    digitalWrite(x, LOW);
  }
  for(int x = 0; x < 7; x++){
    pinMode(x, OUTPUT);
    digitalWrite(x, LOW);
  }
}

// Main loop body: draws the C frame 250 times per invocation.
void loop(){
  int remaining = 250;
  while(remaining-- > 0){
    Display(C);
  }
}

// Clocks the whole registers[] array out on SER_Pin, highest index first.
// RCLK_Pin is held LOW for the duration of the transfer and raised HIGH once
// every bit has been shifted.
void writeRegisters(){
  digitalWrite(RCLK_Pin, LOW);

  int bit = numOfRegisterPins;
  while(bit-- > 0){
    // Present the data bit while the shift clock is LOW, then pulse it HIGH.
    digitalWrite(SRCLK_Pin, LOW);
    digitalWrite(SER_Pin, registers[bit]);
    digitalWrite(SRCLK_Pin, HIGH);
  }

  digitalWrite(RCLK_Pin, HIGH);
}

void setRegisterPin(int index, int value){
  registers[index] = value;
}

// Sets every entry of the registers[] shadow array to LOW. The hardware is
// not updated until writeRegisters() is called.
void clearRegisters(){
  for(int slot = 0; slot < numOfRegisterPins; slot++){
    registers[slot] = LOW;
  }
}

void Display(int image[13][13]){
  for(int x = 0; x < 13; x++){
    for(int y = 0; y < 13; y++){
      if(image[x][y] == 3){
        setRegisterPin(blupins[y], HIGH);
        digitalWrite(colpins[x], HIGH);
        writeRegisters();
        delayMicroseconds(DELAY);
        setRegisterPin(blupins[y], LOW);
        digitalWrite(colpins[x], LOW);
        writeRegisters();
        delayMicroseconds(DELAY);
      }
      if(image[x][y] == 2){
        setRegisterPin(grnpins[y], HIGH);
        digitalWrite(colpins[x], HIGH);
        writeRegisters();
        delayMicroseconds(DELAY);
        setRegisterPin(grnpins[y], LOW);
        digitalWrite(colpins[x], LOW);
        writeRegisters();
        delayMicroseconds(DELAY);
      }
      if(image[x][y] == 1){
        setRegisterPin(redpins[y], HIGH);
        digitalWrite(colpins[x], HIGH);
        writeRegisters();
        delayMicroseconds(DELAY);
        setRegisterPin(redpins[y], LOW);
        digitalWrite(colpins[x], LOW);
        writeRegisters();
        delayMicroseconds(DELAY);
      }
    }
  }
}

Does anyone have any ideas on how I can decrease the loop time and in turn update the leds much faster?

Well, duh.

        delayMicroseconds(DELAY);

How many times, per iteration of loop(), is this call executed?

The delayMicroseconds is called twice. I changed it to only have one delay, but it's not changing the flickering much.

The delayMicroseconds is called twice.

Really?

      if(image[x][y] == 3){
        setRegisterPin(blupins[y], HIGH);
        digitalWrite(colpins[x], HIGH);
        writeRegisters();
        delayMicroseconds(DELAY);
        setRegisterPin(blupins[y], LOW);
        digitalWrite(colpins[x], LOW);
        writeRegisters();
        delayMicroseconds(DELAY);
      }

It is called twice in this block, if the statement is true.

It is called twice more in each of the following blocks.

Those three blocks are located inside some for loops.
for(int x = 0; x < 13; x++){
for(int y = 0; y < 13; y++){
So, I see the possibility of delayMicroseconds() being called 169 * 6 times per call to Display(), depending on the value passed to the function. That could add up to a lot of delay.

Thanks anyway, I fixed my own problem. I took out the digitalWrite() calls and used port manipulation.

Any suggestions on how to scroll the text?

Maybe you want to have a look at my blog, especially the POV experiments: POV Reloaded | Blinkenlight. I am basically scrolling one column of LEDs, but the idea is the same: just move an array pointer and copy whatever you find from there into the display.

I'd probably use more appropriately-sized variables to store my patterns; saves RAM, and improves speed.