Issue with multiplexing and the shiftIn/shiftOut example

I'm currently using a 595 and a 4021 to decode an 8x8 button pad.
The 595 output pins are connected directly to the 4021 input pins, and each of the 4021 input pins also has a resistor to ground, like the arduino.cc examples. Each button has a 1N4148 diode. I do have a 0.1 µF cap to prevent flickering, as the tutorials show.

The problem I'm having is that, without the delayMicroseconds(20) after shifting the value into the 595, the 4021 reads what appears to be a >> 1 shifted value. It works fine with the added delay; the problem is that the delays start to add up over 8 iterations, so the button presses aren't immediate, and lowering the delay much below 20 µs (the delayMicroseconds(20) call) results in poorly shifted data.

Any thoughts on how to speed it up while maintaining data integrity?
What could be the reasons for it requiring such a long delay? Flickering on the 595 outputs? Flickering on the input signal? Diode delay? You can see my schematic a few topics down.

thanks for any help. - unsped

Below are the outputs of the following code, as an example, while holding down the appropriate button.
d[8][8] is a boolean array that stores my LED states. The code below is simplified and doesn't debounce input or check for existing states. mrow[8] is a byte array that stores the value being sent to the MAX72XX for each row.

(commas separate each iteration)
with delay and 595 value of 1:
4021 output - N,0,0,0,0,0,0,0

without delay and 595 value of 1:
4021 output - 0,N,0,0,0,0,0,0

with delay and 595 value of 128:
4021 output - 0,0,0,0,0,0,0,N

without delay and 595 value of 128:
4021 output - N,0,0,0,0,0,0,0

where N is the 4021 output value 0-255

void loop() {

  inputrow = 0;
  
  for (i=1; i<=128; i=i*2)  {

    // 595
    digitalWrite(outlatchPin, 0); 
    shiftOut(outdataPin, outclockPin, i);  // set each 595 output high through iteration 
    digitalWrite(outlatchPin, 1);  
    delayMicroseconds(20);// lag on output pin being triggered

    // 4021 Pulse the latch pin:
    digitalWrite(inlatchPin,1);
    delayMicroseconds(20);
    digitalWrite(inlatchPin,0);
    switchVar[inputrow] = shiftIn(indataPin, inclockPin);    

    //migrate switchvar data into d[] 
    d[7][inputrow] += (switchVar[inputrow] & 1);
    d[6][inputrow] += (switchVar[inputrow] & 2) >> 1;
    d[5][inputrow] += (switchVar[inputrow] & 4) >> 2;
    d[4][inputrow] += (switchVar[inputrow] & 8) >> 3;
    d[3][inputrow] += (switchVar[inputrow] & 16) >> 4;
    d[2][inputrow] += (switchVar[inputrow] & 32) >> 5;
    d[1][inputrow] += (switchVar[inputrow] & 64) >> 6;
    d[0][inputrow] += (switchVar[inputrow] & 128) >> 7;    
    //end migration

    inputrow++;
   
   }

  //construct displays from d[] array
  for (e=0; e<=7; e++)  {

    mrow[e] = (d[e][7]) + (d[e][6] << 1) + (d[e][5] << 2) + (d[e][4] << 3) + (d[e][3] << 4) + (d[e][2] << 5) + (d[e][1] << 6) + (d[e][0] << 7);
    maxSingle(e+1,mrow[e]); 

  }

}// end loop()

Standard shiftIn() function:

////// ----------------------------------------shiftIn function
///// Reads one byte serially, one bit per clock cycle.
/////
///// myDataPin  - pin the serial data is sampled from (configured as INPUT)
///// myClockPin - pin driven as the shift clock (configured as OUTPUT)
/////
///// Returns a byte in which the first bit sampled is stored in bit 7 and
///// the last in bit 0, so each bit corresponds to a pin on the shift
///// register: bit 7 = pin 7 ... bit 0 = pin 0.
byte shiftIn(int myDataPin, int myClockPin) {
  byte myDataIn = 0;

  pinMode(myClockPin, OUTPUT);
  pinMode(myDataPin, INPUT);

  // Each pass drives the clock low, samples the data pin, then drives the
  // clock high, so the first sample is taken before this function has
  // produced any rising edge and the clock is left high on return.
  // The register transmits its pins from pin 7 down to pin 0, which is why
  // the bit index counts down.
  for (int i = 7; i >= 0; i--) {
    digitalWrite(myClockPin, 0);

    // Settling time before sampling. This was delayMicroseconds(0.2), but
    // the function takes a whole number of microseconds, so 0.2 was
    // truncated to 0. Use the smallest non-zero value instead.
    // NOTE(review): some older Arduino cores reportedly turn a 0 argument
    // into a very long delay -- confirm against the core in use.
    delayMicroseconds(1);

    if (digitalRead(myDataPin)) {
      // Data line is high: set bit i. A low line needs no action because
      // myDataIn starts out as 0.
      myDataIn = myDataIn | (1 << i);
    }

    digitalWrite(myClockPin, 1);
  }

  return myDataIn;
}

Standard shiftOut() function:

// Writes one byte out serially, most significant bit first.
//
// myDataPin  - pin the data bits are written to (configured as OUTPUT)
// myClockPin - pin driven as the shift clock (configured as OUTPUT)
// myDataOut  - the byte to send
//
// Each bit is placed on the data pin while the clock is low and the clock
// is then raised; the clock is left low when the function returns.
void shiftOut(int myDataPin, int myClockPin, byte myDataOut) {
  pinMode(myClockPin, OUTPUT);
  pinMode(myDataPin, OUTPUT);

  // Start from a known state with both lines low.
  digitalWrite(myDataPin, 0);
  digitalWrite(myClockPin, 0);

  // Walk a single-bit mask from bit 7 down to bit 0. Because the highest
  // bit goes out first, a value of %00000001 ("1") ends up lighting Q0.
  for (int mask = 0x80; mask != 0; mask >>= 1) {
    digitalWrite(myClockPin, 0);

    // Data pin goes high only when the masked bit of myDataOut is set.
    digitalWrite(myDataPin, (myDataOut & mask) ? 1 : 0);

    // Register shifts bits on the upstroke of the clock pin.
    digitalWrite(myClockPin, 1);

    // Zero the data pin after the shift to prevent bleed through.
    digitalWrite(myDataPin, 0);
  }

  // Stop shifting.
  digitalWrite(myClockPin, 0);
}

Hi, I don't have your answer, but did you ever get that figured out? I have a similar setup with a 74HC165 and a 74HC164.

Same here; I had a latency issue with the 4021...