How to change variables properly in interrupt with Inline assembler?

I'm trying to replace 25L3206E flash memory with Arduino Nano. Datasheet: 25L3206E datasheet

Memory chip is removed from PCB and Arduino is attached in place. Device is KCodes KC601. Link to product: KC601 - Wired - Product - KCodes

Pin mapping:
Chip Select is on pin D3 and uses interrupt 1.
Clock is on pin D2 and uses interrupt 0.
Serial In is on pin D4.
Serial Out is on pin D5. (not used in this code yet)

This is hobby project. Kind of hardware hacking. I'm trying to run firmware directly from Arduino. Firmware is converted to hexadecimal array and saved in to PROGMEM. It contains over thousand lines of code, so I don't include it here.

I also use Nokia 5110, PCD8544 monochrome graphic display to debug purposes. Datasheet: https://www.sparkfun.com/datasheets/LCD/Monochrome/Nokia5110.pdf

Pin mapping:
Reset - external +5V
CE - D8
DC - D12
DIN- D11
CLK- D10
VCC- 3V
LIGHT- GND
GND - GND

Display works fine, so I don't include it's code either. Saves again 70 lines of code from flooding this post. Serial port cannot be used for debugging because of interrupts.

And worth to mention that Arduino Nano is powered with 3V through 5V output pin and 5V line is disconnected from USB cable. Works fine. KCodes KC601 uses 3V on internal data lines.

And finally the problem:
Seems that count variable is incrementing properly, but when I monitoring a command variable then after a while code is running it outputs several ones and it follows with several twos and then fours and 20 (in hexadecimal format). After that it starts again with ones. It seems that only one bit is coming through and it rolls over with cmd variable.

I don't get any error messages.

Program code:

#define SO 5
#define SI 4

volatile uint32_t addr      = 0;
volatile uint32_t cmd       = 0;
volatile uint16_t count     = 0;
volatile  uint8_t command   = 0;
volatile  uint8_t databyte  = 0;
volatile   int8_t datacount = 0;

void setup() {
  pinMode(2, INPUT); // CLK
  pinMode(SO, INPUT); //input - high impedance state
  pinMode(SI, INPUT);
  //https://sites.google.com/site/qeewiki/books/avr-guide/external-interrupts-on-the-atmega328
  EICRA = 0b0101; // sense any change on the INT1 and any change on the INT0
  EIMSK = 0b11;   // enable INT1 and INT0 interrupt
  sei(); // turn on interrupts
  pinMode(13, OUTPUT); // led output
}

void loop() { // debug screen control
  if (command != 0) {
    asm volatile (

      // calculate and print hex number to debug screen
      "mov r24, %0    \n"
      "swap r24       \n"
      "andi r24, 0x0f \n"
      "cpi r24, 0x0a  \n"
      "brlo lower11   \n"
      "subi r24,-0x27 \n"
      "lower11:       \n"
      "subi r24,-0x30 \n"
      "call LcdChar   \n"

      "mov r24, %0    \n"
      "andi r24, 0x0f \n"
      "cpi r24, 0x0a  \n"
      "brlo lower22   \n"
      "subi r24,-0x27 \n"
      "lower22:       \n"
      "subi r24,-0x30 \n"
      "call LcdChar   \n"

      "ldi r24, 0x22  \n" // delimiter character
      "call LcdChar   \n"
      :
      : "r" (command)
      : "r24"
    );
  }
}

ISR(INT1_vect) { // Chip Select interrupt
  asm volatile (
    "clr %[com_]               \n"
    "clr %[cmd_]               \n"
    "clr %[ad_]                \n"
    "clr %[db_]                \n"
    "clr %[dc_]                \n"
    "clr %[cn_]                \n"
    "cbi %[portd_], %[portd5_] \n" // disable input pullup, digitalWrite(SO,LOW);
    "cbi %[portd_], %[portd4_] \n" // disable input pullup, digitalWrite(SI,LOW);
    "cbi %[ddrd_], %[ddd5_]    \n" // pinMode(SO,INPUT); // going to high-z mode
    "cbi %[ddrd_], %[ddd4_]    \n" // pinMode(SI,INPUT);
    "cbi %[portb_], %[portb5_] \n" // digitalWrite(13,LOW); turn off led
    : [com_]   "+r" (command),
      [cmd_]   "+r" (cmd),
      [ad_]    "+r" (addr),
      [db_]    "+r" (databyte),
      [dc_]    "+r" (datacount),
      [cn_]    "+r" (count)
    : [portd_]  "I" (_SFR_IO_ADDR(PORTD)), [portd5_] "I" (PORTD5),
      [portd4_] "I" (PORTD4),
      [ddrd_]   "I" (_SFR_IO_ADDR(DDRD)),  [ddd5_]   "I" (DDD5),  // SO direction reg
      [ddd4_]   "I" (DDD4),  // SI direction reg
      [portb_]  "I" (_SFR_IO_ADDR(PORTB)), [portb5_] "I" (PORTB5) // pin 13, led
  );
}

ISR(INT0_vect) { // CLK change interrupt
  asm volatile (
    "sbic %[pind_], %[cs_]     \n" // if CS high then do not continue
    "rjmp endInt               \n"
    "sbis %[pind_], %[clk_]    \n" // CLK
    "rjmp sdata                \n" // if low, goto data section
    "sbi %[portb_], %[led_]    \n" // light up led

    // loads incoming byte to cmd bit by bit
    "lsl %A[cmd_]              \n" // Shifts all bits one place to the left
    "rol %B[cmd_]              \n" //
    "rol %C[cmd_]              \n" // 32-bit value
    "rol %D[cmd_]              \n" //
    
    "clr __tmp_reg__           \n" // clear byte
    "sbic %[pind_], %[si_]     \n" // Skip if bit in SI pin is cleared
    "inc __tmp_reg__           \n" // set bit one
    
    "add %A[cmd_],__tmp_reg__  \n" // add result to cmd
    "adc %B[cmd_],__zero_reg__ \n" //
    "adc %C[cmd_],__zero_reg__ \n" // 32-bit value
    "adc %D[cmd_],__zero_reg__ \n" //

    "mov r19, %A[cmd_]         \n" // copy first byte

    "clr __tmp_reg__           \n" // clear byte
    "inc __tmp_reg__           \n" // set bit one
    "add %A[cn_], __tmp_reg__  \n" // add one to counter variable
    "adc %B[cn_], __zero_reg__ \n" // 16-bit value

    "cpi %A[cn_], 8            \n" // compare counter variable to 8
    "cpc %B[cn_], __zero_reg__ \n" // upper byte must be zero
    "breq cont33               \n"
    "rjmp endInt               \n" // if not equal, goto end
    "cont33:                   \n"

    // select 8-bit command
    "cpi r19, 0x01             \n"
    "breq result               \n"
    "cpi r19, 0x02             \n"
    "breq result               \n"
    "cpi r19, 0x03             \n"
    "breq result               \n"
    "cpi r19, 0x04             \n"
    "breq result               \n"
    "cpi r19, 0x05             \n"
    "breq result               \n"
    "cpi r19, 0x06             \n"
    "breq result               \n"
    "cpi r19, 0x0b             \n"
    "breq result               \n"
    "cpi r19, 0x20             \n"
    "breq result               \n"
    "cpi r19, 0x2b             \n"
    "breq result               \n"
    "cpi r19, 0x2f             \n"
    "breq result               \n"
    "cpi r19, 0x3b             \n"
    "breq result               \n"
    "cpi r19, 0x52             \n"
    "breq result               \n"
    "cpi r19, 0x5a             \n"
    "breq result               \n"
    "cpi r19, 0x60             \n"
    "breq result               \n"
    "cpi r19, 0x90             \n"
    "breq result               \n"
    "cpi r19, 0x9f             \n"
    "breq result               \n"
    "cpi r19, 0xab             \n"
    "breq result               \n"
    "cpi r19, 0xb1             \n"
    "breq result               \n"
    "cpi r19, 0xb9             \n"
    "breq result               \n"
    "cpi r19, 0xc1             \n"
    "breq result               \n"
    "cpi r19, 0xc7             \n"
    "breq result               \n"
    "cpi r19, 0xd8             \n"
    "breq result               \n"

    "rjmp endInt               \n"
    "result:                   \n"
    "mov %[com_], r19          \n" // result to command

    "rjmp endInt               \n"
    
    "sdata:                    \n" // data section start

    "endInt:        \n"
    : [com_] "+r" (command),
      [cmd_] "+r" (cmd),
      [ad_]  "+r" (addr),
      [db_]  "+r" (databyte),
      [dc_]  "+r" (datacount),
      [cn_]  "+r" (count)
    : [pind_]  "I" (_SFR_IO_ADDR(PIND)),  [si_]  "I" (PIND4),  // input SI
                                          [cs_]  "I" (PIND3),  // cs
                                          [clk_] "I" (PIND2),  // clk
      [ddrd_]  "I" (_SFR_IO_ADDR(DDRD)),  [mso_] "I" (DDD5),   // pinMode SO
      [portd_] "I" (_SFR_IO_ADDR(PORTD)), [so_]  "I" (PORTD5), // output SO
      [portb_] "I" (_SFR_IO_ADDR(PORTB)), [led_] "I" (PORTB5)  // pin 13, led
    : "r19", "memory"
  );
}

You've written this assembly yourself or copied it?

Question from me is:
Why modify persistent values within the interrupt routine?
Just set a flag, leave the ISR, and continue in the main loop().
In the loop(), next time around, if the flag is set, do your persistent magic, then clear the flag.

If you’re getting too many interrupts, or your persistent loop stuff is too slow, you can use the same ideas, but need to implement a stack to apply the queued changes when loop time is available.

I'm with @lastchancename.

It seems to me there is far too much code in the ISR, even if it is in assembler.

And the GCC compiler is very clever - I suspect there will be very little performance advantage from using assembler, not enough to justify the extra complexity.

...R

DKWatson:
You've written this assembly yourself or copied it?

Yes, I have written it by myself. And about two weeks to think how to get it to work.

Maybe Arduino is not fast enough. I measured clock and data lines with logic analyzer and it is very fast. Measurement is attached to this post. I tried without assembler, but no success. Below is my current code. It returns same values than assembler version. I don't also fully understand why device is sending command 0x20 like in attached image. That command is sector erase. That cannot be right. Why randomly send sector erase commands to memory? Maybe I don't see something important...

#define SO 5
#define SI 4

// http://www.billporter.info/2010/08/18/ready-set-oscillate-the-fastest-way-to-change-arduino-pins/
#define CLR(x,y) (x&=(~(1<<y)))
#define SET(x,y) (x|=(1<<y))

volatile uint32_t cmd       = 0;
volatile uint16_t count     = 0;
volatile  uint8_t command   = 0;

void setup() {
  pinMode(2, INPUT); // CLK
  pinMode(SO, INPUT); //input - high impedance state
  pinMode(SI, INPUT);
  
  //https://sites.google.com/site/qeewiki/books/avr-guide/external-interrupts-on-the-atmega328
  EICRA = 0b1111; // sense rising edges on INT1 and INT0
  EIMSK = 0b11;   // enable INT1 and INT0 interrupt
 
  sei(); // turn on interrupts
}

void loop() { // debug screen control
  if(command) { // calculate and print hex number to debug screen
    LcdChar((command >> 4) + (((command >> 4) >= 0x0a) ? 0x57 : 0x30));
    LcdChar((command & 0xf) + (((command & 0xf) >= 0x0a) ? 0x57 : 0x30));
    LcdChar(0x23); // delimiter character
  }
}

ISR(INT1_vect) { // Chip Select interrupt
  command   = 0;
  cmd       = 0;
  count     = 0;
  PORTD    &= ~((1<<PORTD5)|(1<<PORTD4)); // disable input pullup (digitalWrite low)
  DDRD     &= ~((1<<DDD5)|(1<<DDD4));   // going to high-z mode (pinMode input)
}

ISR(INT0_vect) { // CLK interrupt
  cmd = (cmd << 1) + ((PIND >> PIND4) & 1); // shift cmd and save SI value
  command = (++count == 8) ? cmd & 0xff : command; // increase counter and copy command part of cmd
}

data.png

You have cmd as a 32bit value, but it looks like you copy it to command after only reading 8 bits...

ISR(INT0_vect) { // CLK interrupt
  cmd = (cmd << 1) + ((PIND >> PIND4) & 1); // shift cmd and save SI value
  command = (++count == 8) ? cmd & 0xff : command; // increase counter and copy command part of cmd
}

This is a bit weird, and your asm code does something similar.
There are a couple of ways to make the code faster:

  1. "cmd = (cmd << 1) + ((PIND >> PIND4) & 1" - you're checking one input bit. That doesn't need to input the whole port and shift it as well as shifting the command word.
  2. You reference several volatile variables multiple times under circumstances where they couldn't have changed it. C will happily reload cmd in order to & it with 0xFF, and reload command just to do "command = command;" in the ternary expression.
    (compact source representations do not necessarily mean faster code!)

I believe the following code does the same thing, is 32bytes shorter, and significantly faster...

ISR(INT0_vect) { // CLK interrupt
    uint32_t c = cmd << 1;
    if (PIND & (1<<PIND4))
	c |= 1;
    if (++count == 8) {
	command = c & 0xff; // increase counter and copy command part of cmd
    }
    cmd = c;
}

(It's probably still not "right.")

PS: the chip datasheet looks very SPI-like. Are you sure you don't want to try to figure out the ATmega SPI Slave peripheral? It will be far faster than bit-banging pins in ISRs...

westfw:
PS: the chip datasheet looks very SPI-like. Are you sure you don't want to try to figure out the ATmega SPI Slave peripheral? It will be far faster than bit-banging pins in ISRs...

Thanks for the tip! Now I can receive same data than with logic analyzer. I removed the display and using serial port for debugging. Pin mapping is now this:

Device - Arduino
CS - 10 (SS)
SI - 11 (MOSI)
SO - 12 (MISO)
CLK - 13 (CLK)

Below is my current program. Only question anymore is that why device sends mostly only 0x20 command. It never send read command so it's useless to program anything related to it. I'm stuck on this so probably I don't continue anymore unless some miracle happens.

Sample output looks like this:

,0x20,0x0,0x10,0x0
,0x0
,0x0
,0x0
,0x40
,0x0
,0x20,0x0,0x20,0x0
,0x0
,0x0
,0x0
,0x20,0x0,0x20,0x0
,0x0
,0x0
,0x0
,0x0
,0x0
,0x20,0x0,0x10,0x0
,0x0
,0x50
,0x0
,0x20,0x0,0x0,0x0

Program:

volatile uint8_t buf[20];
volatile uint8_t pos = 0;
volatile bool incoming = false;
uint8_t count = 0;

void setup() {
  Serial.begin(115200);
  //pinMode(MISO,OUTPUT); // must be high-z or device will halt

  // http://www.gammon.com.au/spi
  // https://www.arduino.cc/en/Tutorial/SPIEEPROM
  SPCR = 0b11000000; // SPI settings
}

void loop() { // debug
  if(incoming) {
    for(uint8_t i = 0; i < pos-1; i++) {
      Serial.print(",0x");
      Serial.print(buf[i],HEX);
      if(count == 0) {
        switch (buf[i]) {
          case 0x01:
            count = 1;
            break;
          case 0x02:
            count = 4;
            break;
          case 0x03:
            count = 4;
            break;
          case 0x04:
            count = 1;
            break;
          case 0x05:
            count = 1;
            break;
          case 0x06:
            count = 1;
            break;
          case 0x0b:
            count = 5;
            break;
          case 0x20:
            count = 4;
            break;
          case 0x2b:
            count = 1;
            break;
          case 0x2f:
            count = 1;
            break;
          case 0x3b:
            count = 5;
            break;
          case 0x52:
            count = 4;
            break;
          case 0x5a:
            count = 5;
            break;
          case 0x60:
            count = 1;
            break;
          case 0x90:
            count = 4;
            break;
          case 0x9f:
            count = 1;
            break;
          case 0xab:
            count = 4;
            break;
          case 0xb1:
            count = 1;
            break;
          case 0xb9:
            count = 1;
            break;
          case 0xc1:
            count = 1;
            break;
          case 0xc7:
            count = 1;
            break;
          case 0xd8:
            count = 4;
            break;
          default:
            Serial.print('\n');
        }
      }
      if(count > 0) {
        count--;
        if(count == 0) Serial.print('\n');
      }
    }
    pos = 0;
    incoming = false;
  }
}

ISR(SPI_STC_vect) { // SPI interrupt routine 
  uint8_t c = SPDR; // read byte from SPI Data Register
  if (pos < (sizeof(buf) - 1)) {
    buf[pos++] = c;
    incoming = true;
  }
}