Watchdog ISR in assembly language

asm("#define SREG 0x3f\n");
asm(" in r16,SREG\n"); // fails with "Error: constant value required"
asm(" in r16,0x3f\n"); // works

@CB
I tried with and without \n, no difference. I often wondered about the \n in these assembler examples, it's the sort of thing you do when generating HTML code that has to be read by a human but you wouldn't expect it to be required here.

@Rob
I did the same with the small exception that I don't have python installed :), I'll go and get it.

@stimmer
That approach produced 964

at com.oroinc.text.regex.Perl5Matcher._match(Perl5Matcher.java)

errors.

However I noticed your .equ syntax is different

asm(".equ SREG,0x3f\n");
asm(" in r16,SREG\n"); // this works

but

asm(".equ r_temp1,r16\n");
asm(" push r_temp\n"); // this doesn't work, Error: register name or number from 0 to 31 required
asm(" in r_temp,SREG\n"); // same error

Is this the ASM equivelant of "lvalue required"?

Anyway that's where I left it at 2AM last night. I'm back on deck now so will continue with this.


Rob

I don't know if you can properly use .equ or = to rename registers.

But it just happens to work under certain circumstances. In particular, don't make the first letter of the symbol 'r' and omit the 'r' from the register. ie:

.equ q_temp1,16

which is the same as

q_temp1 = 16

should 'work', at least for now.

I'll have to check the mechanics of having assembler included in your sketch when I'm at home with the source code.
I thought you could add a .S tab to your sketch just the way you can add .h, .c, or .cpp modules. It may have to be capitalized...

Huh. It looks like the "Compiler" part of the IDE understands .S files, but the editor refuses to add such files to the sketch, or pass them to the compiler. Hmmph.

Fair dinkum this IDE is a crock.

Sketch>Add file... does nothing if it doesn't like the file type, at least have an error message.
It seems to add a .S file but doesn't show it in a tab

I don't normally use the IDE but I want what I'm doing to be available to the general Arduino crowd and most people do I think.

Anyway it's working in a fashion. Some mods to the py script (I've never used Python before, I gather the indenting actually defines the code blocks?)

import sys, os

def asm2include(infile, outfile):
    os.remove (outfile)

    line = ""
    with open(infile, 'r') as IN:
        for line in IN:
            line = line.strip(' ')
            if (line != ""):
                line = line.replace('  ',' ')
                line = line.replace('SREG','0x3f')
                line = line.replace('command','r18')
                line = line.replace('temp1','r16')
                line = line.replace('temp2','r17')
                line = line.replace('addlo','r19')
                line = line.replace('addhi','r20')
                line = line.replace('ZH','r31')
                line = line.replace('ZL','r30')
                line = line.replace('PORTB','5')
                line = line.replace('PINB','3')
 
                OUT = open(outfile, "a")
                if (line.rfind("//") == -1):
                    OUT.write("asm(\"")
                    OUT.write(line[:-1])
                    OUT.write("\");\n")
               
                OUT.close()
            print '.',

""" --------------------------------------------------------------
MAIN APPLICATION
"""  

print "ASM2include 0.1"
asm2include(sys.argv[1], sys.argv[2])
print "Done"

It deletes the output file and makes a few substitutions, no sanity test for the file delete so a bit dangerous at present.

.equ works on an "rvalue" but not an "lvalue", hense the py script replacements.

.equ PIN_MOSI,3 // this works so don't bother with the replace
#define addhi r20 // .equ addhi,r20 doesn't work, so leave this here just for reference

Leave all comments out, not point having them in the .S version

Then in the sketch

ISR (WDT_vect, ISR_NAKED) {
#include "d:\work\atmon\atmon\atmon_isr.s"
}

So it works but is pretty fugly.

The next step is to get both a C and ASM version working and see if there's much point in the first place :slight_smile:


Rob

Some improvements

.equ command,16
clr command

works, (note no "r16", just "16")

so the py script is almost back to original

import sys, os

def asm2include(infile, outfile):
    os.remove (outfile)

    line = ""
    with open(infile, 'r') as IN:
        for line in IN:
            line = line.strip(' ')
            if (line != ""):
                line = line.replace('  ',' ')
                OUT = open(outfile, "a")
                if (line.rfind("//") == -1):
                    OUT.write("asm(\"")
                    OUT.write(line[:-1])
                    OUT.write("\");\n")
               
                OUT.close()
            print '.',

""" --------------------------------------------------------------
MAIN APPLICATION
"""  

print "ASM2include 0.1"
asm2include(sys.argv[1], sys.argv[2])
print "Done"

Rob

I know you guys have been losing sleep over whether or not this is all worth it when an ISR written in C can be written in about 2 minutes. So I did a bit of testing.

The ISP function is basically to shift 3 bytes in, do a tiny amount of processing, then shift 1 byte out, all using what is essentially bit banged SPI.

Program size (including Arduino overhead)

ASM 858bytes
C (using digitalWrite()s) 1210 bytes

So there's a large difference mostly because of the digitalWrite library being pulled it, however in any normal program would have that anyway so the above comparison is not really fair. So in setup I added a single digitalWrite to pull in the library

ASM + 1 digitalWrite 1038 bytes.

The ISR code size is

ASM 212bytes
C 388 bytes

So with a normal program we're 176 bytes better off with ASM.

That's not a deal breaker, but have a look at the timing. Using a logic analyser (pics attached, pulse train at the top is normal program flow) I measured the interruption to main program flow.

ASM 37uS
C 400uS

That's a significant improvement, the ASM version is 10x faster. Also the ASM version could be even faster but I've had to add delays to allow the device (another Arduino) to catch up).

There is a third option of course, C with direct port manipulation which would presumably be somewhere between the two, I may write this to test it as well.


Rob

I know you guys have been losing sleep over whether or not this is all worth it

Imho these kinds of insights are worth it. They might cost a lot of time to build up but will save time in the end.

The speedup is so significant that one could think of rewiting digitalWrite (and other core functions) in asm(" ").

Keep us informed of the progress.

BTW what logic analyzer do you use? there is still one my wishlist :wink:

rewiting digitalWrite (and other core functions) in asm(" ").

C is not the problem. There has been extensive discussion on speeding up digitalWrite and friends; search the forum for "digitalwritefast"...

@westfw
If C is not part of the problem, .... is it part of the solution then? or ... :slight_smile:

Know about the discussion about digitalwritefast, seen it passing by several times, thanks anyway - Many (not all) performance problems can be solved by another design or algorithm, for the rest one can only buy more hardware or lower the requirements, and even that is not allways sufficient :slight_smile:

I said before

There is a third option of course, C with direct port manipulation which would presumably be somewhere between the two, I may write this to test it as well.

Well I've done another test and I was wrong, C with direct port manipulation is BETTER than assembler. I've always known C was good but I didn't expect this I have to say.

ASM with delays removed 28uS
C with direct mainipulation 26uS

Code size including Arduino overhead

ASM 858 bytes
C 778 bytes

Code size with neither routine 658 bytes

so the size of each ISR

ASM 200
C 120

Frankly I don't undertand the massive difference in code size, maybe my numbers are out a bit but either way this is very interesting.

As an ex ASM writer I think this is probably the end of my assembly writing days :astonished:

But what do I mean by "C with direct port manipulation"? An example of a byte shift in is as follows

  for (i = 0; i < 8; i++) {
    asm ("sbi 5,5");  // clock high
    cmd |= (PINB >> ASM_PIN_MOSI) & 1; // bit into LSB of byte
    cmd <<= 1;    // move bits up
    asm ("cbi 5,5"); // clock low
  }

So you could argue that it's really half assembly anyway and as written it's not directly portable between different CPUs, nothing a few #defines or conditional code blocks can't handle.

But I think the point is that it's all written within the C/C++/Arduino environment and I think that's a big plus. It's also a 100 times more readable than the 150 lines of assembly language.

BTW what logic analyzer do you use? there is still one my wishlist

Rob, take it off your wish list and put it on your desk, I can't even imagine how people debug code that deals with IO without one. I reckon if everybody had an LA this forum would be half the size as there would never be a "I'm sending X and receiving Y" thread again :slight_smile:

Anyway to answer you question, I use the Salaea Logic, I can't recomend it highly enough. At $170 it's a no brainer to me. In the "old" days with external peripherals you needed to spend $1000s on a 32-channel 100MHz job, but these days all the hard stuff is inside the chip and we're mostly looking at serial comms of one type or another so these new USB LAs are just the ticket for 99% of work IMO.

I also find it usefull for non-IO code, for example to see how often a function is called I pulse a pin and look at it with the LA. Or to see how many times I go through a loop I do the same.


Rob

    asm ("sbi 5,5");  // clock high

The current compiler is pretty good at turning C code like "PORTB |= (1<<PB5);" into a single sbi instruction when it is possible.

You did get rid of ISR_NAKED in your C implementation, right?

You did get rid of ISR_NAKED in your C implementation, right?

He he, yep, after wondering for a while why the code kept restarting :slight_smile:


Rob

Hold the bus, C isn't quite as good as I thought, I was bitten in the bum by the optomiser.

I changed the code as westfw suggested, here's the three bytes being received.

  for (i = 0; i < 8; i++) {
    PORTB |= (1<<PB5); 
    cmd |= (PINB >> PB4) & 1;  // I commented this line
    cmd <<= 1;
    PORTB &= ~(1<<PB5); 
  }
  for (i = 0; i < 8; i++) {
    PORTB |= (1<<PB5); 
    addr_lo |= (PINB >> PB4) & 1;
    addr_lo <<= 1;
    PORTB &= ~(1<<PB5); 
  }
  for (i = 0; i < 8; i++) {
    PORTB |= (1<<PB5); 
    addr_hi |= (PINB >> PB4) & 1;
    addr_hi <<= 1;
    PORTB &= ~(1<<PB5); 
  }

This runs like a cut snake at 27uS for the ISR.

For reasons that I can't remember I commented out the PINB line for the first byte and the ISR blew out to 68uS.

WTF?

It seems that the compiler has decided that "(PINB >> PB4) & 1" will be the same for all three times and has not bothered repeating the code, presumably did that once at the start of the ISR and just did assignments thereafter. If I'd had the other Arduino hooked up I would have realised I was getting duff data, but with MISO unconnected FF is as expected and I didn't look into it.

As I don't know if you can make PINB volatile I made the variables being assigned volatile instead and that seems to work.

So the C version takes 68uS, nearly twice as slow as assembler (thank goodness, the every foundations of my world had taken a beating for a minute there) but not too shabby.


Rob

This thread is dead but I'll summarise for future generations.

Yes it's easy to include an ASM file in an ISR using the following

ISR (WDT_vect, ISR_NAKED) {
#include "my_asm_file.s"
}

The included file must have every line wrapped in asm(""); and there is a Python script to do that written by robtillaart.

To do the same job (SPI bit bang 4 bytes) an ASM version is 10x faster than using the Arduino abstraction layer and quite a lot faster than using direct port manipulation (26uS vs 45uS).

That's it in a nutshell.

Thanks guys.


Rob

It seems to be pretty common for C compilers to be "not very good" at dealing with this sort of "shift a single bit from an IO register into a variable" sort of operation; they're just not very happy dealing with any quantity smaller than the smallest register (eg 8 bits on almost every architecture still in existence.) However:

  for (i = 0; i < 8; i++) {
    PORTB |= (1<<PB5); 
    cmd |= (PINB >> PB4) & 1;
    cmd <<= 1;
    PORTB &= ~(1<<PB5); 
  }

It turns out that this would be significantly smaller and faster written as:

  for (i = 0; i < 8; i++) {
    PORTB |= (1<<PB5); 
    if (PINB & 1<<PB4) {
      cmd |= 1;
    }
    cmd <<= 1;
    PORTB &= ~(1<<PB5); 
  }

Apparently the compile isn't able to recognize the first as a bit isolation technique, and actually goes through with shifting the input value however many times, while the latter manages to become a "skip if bit in io register clear" instruction.

Very interesting westfw, I did some timing from start of first clock pulse to end of last.

First version of the code 18.4uS
Second version of the code 8.5uS

Just a slight improvement :slight_smile:

BTW, there's a bug in my example code, the "cmd <<= 1;" should be moved to the top of the loop.

 for (i = 0; i < 8; i++) {
   cmd <<= 1;
   PORTB |= (1<<PB5); 
    if (PINB & 1<<PB4) {
      cmd |= 1;
    }
 //   cmd <<= 1; // not here
    PORTB &= ~(1<<PB5); 
  }

Or there is a shift done after the last bit and the value received is 2x that sent. (I remember now had the same bug with my ASM version but I'm a slow learner).

Thanks for delving into that westfw, it looks like we've got a nice tight shiftIn() function (albeit hardcoded) for others to use as well.

PS, I've modified my summary above to reflect this.


Rob

Could you post the full code for the final version for comparison purposes?

It's still a work in progess and a bit rough

//
// Acts as an SPI master
// Receives 3 bytes, CMD, ADDR_LO, ADDR_HI
// performs an action based on the CMD
// returns the results as a fourth byte
//
// Still very rough, a work in progress.
//

//#define RUNNING_ON_328
#define RUNNING_ON_2560

//#define  VERSION_ASM
#define  VERSION_C


#ifdef RUNNING_ON_328
#define ATMON_PORT_PIN_MOSI  3
#define ATMON_PORT_PIN_MISO  4
#define ATMON_PORT_PIN_SCK   5

#define ATMON_ARD_PIN_MOSI  11
#define ATMON_ARD_PIN_MISO  12
#define ATMON_ARD_PIN_SCK   13
#define ATMON_ARD_PIN_DEBUG 8
#endif

#ifdef RUNNING_ON_2560
#define ATMON_PORT_PIN_MOSI  2
#define ATMON_PORT_PIN_MISO  3
#define ATMON_PORT_PIN_SCK   1

#define ATMON_ARD_PIN_MOSI  51
#define ATMON_ARD_PIN_MISO  50
#define ATMON_ARD_PIN_SCK   52
#define ATMON_ARD_PIN_DEBUG 53
#endif

#define    CMD_POLL  1
#define    CMD_RD    2
#define    CMD_WR    3

#define  ACK    6
#define  NAK    15

// known location to read from
byte  test_byte = 0x23;


#ifdef VERSION_ASM
ISR (WDT_vect, ISR_NAKED) {
  #include "d:\work\atmon\atmon\atmon_isr.s"
}
#endif

#ifdef VERSION_C
ISR (WDT_vect) {
  volatile byte cmd = 0;
  volatile byte addr_hi = 0;
  volatile byte addr_lo = 0;
  byte response;
  byte *address;
  int i;

  asm ("wdr");
  
  
  // get the command byte from other Arduino
 for (i = 0; i < 8; i++) {
    cmd <<= 1;
    PORTB |= (1 << ATMON_PORT_PIN_SCK);     
    if (PINB & 1 << ATMON_PORT_PIN_MISO) {
      cmd |= 1;
    }
    PORTB &= ~(1 << ATMON_PORT_PIN_SCK); 
  }
  delayMicroseconds (5); // needed to allow the other Arduino some time to load another byte
  
  // get the low address byte from other Arduino
  for (i = 0; i < 8; i++) {
    addr_lo <<= 1;
    PORTB |= (1 << ATMON_PORT_PIN_SCK); 
    if (PINB & 1 << ATMON_PORT_PIN_MISO) {
      addr_lo |= 1;
    }
    PORTB &= ~(1 << ATMON_PORT_PIN_SCK); 
  } 
  delayMicroseconds (5);
  
  // get the high address byte from other Arduino
  for (i = 0; i < 8; i++) {
    addr_hi <<= 1;
    PORTB |= (1 << ATMON_PORT_PIN_SCK); 
    if (PINB & 1 << ATMON_PORT_PIN_MISO) {
      addr_hi |= 1;
    }
    PORTB &= ~(1 << ATMON_PORT_PIN_SCK); 
  }
  
  address = (byte*)((addr_hi << 8) + addr_lo);
  
  // action the command
  switch (cmd) {
    case  CMD_POLL:
      response = ACK;
      break;
 
    case  CMD_RD:
      response = *address;
      break;
 
    case  CMD_WR:
      response = ACK; // not implemented yet
      break;  
      
    default:
      response = NAK;
      break;  
  }

   // send response byte to other Arduino  
   for (i = 0; i < 8; i++) {
     if ((response & 0x80) == 0) 
        PORTB &= ~(1 << ATMON_PORT_PIN_MOSI); 
     else
        PORTB |= (1 << ATMON_PORT_PIN_MOSI); 
  
    PORTB |= (1 << ATMON_PORT_PIN_SCK); 
    response <<= 1;
    PORTB &= ~(1 << ATMON_PORT_PIN_SCK); 
  }
}
#endif

void setup () {
 
  WDTCSR = (1<<WDIE);
  
  Serial.begin (115200);
  
  pinMode(ATMON_ARD_PIN_MOSI, OUTPUT);
  pinMode(ATMON_ARD_PIN_MISO, INPUT);
  pinMode(ATMON_ARD_PIN_SCK, OUTPUT);
  pinMode(ATMON_ARD_PIN_DEBUG,OUTPUT);
  
};

void loop () {
  // pulse to allow measuring of the timeout from main code
   PORTB |= (1); 
   PORTB &= ~(1); 
};

I cleaned it up a bit, hopefully I didn't break anything while doing so.


Rob