NewSoftSerial Library: An AFSoftSerial update

Okay. Here's the bug that the OSX version of avr-gcc 4.3 generates. Basically it fails to preserve all of the necessary registers before calling a function from an interrupt handler. The code then uses the registers which in effect corrupts them from the main program's perspective.

Here's a problem area in the object dump from OSX:

SIGNAL(SIG_PIN_CHANGE0) 
   0:      1f 92             push      r1
   2:      0f 92             push      r0
   4:      0f b6             in      r0, 0x3f      ; 63
   6:      0f 92             push      r0
   8:      11 24             eor      r1, r1
   a:      8f 93             push      r24
   c:      9f 93             push      r25
   e:      ef 93             push      r30
  10:      ff 93             push      r31
******************************************************************************/

/* static */
inline void NewSoftSerial::handle_interrupts(int lopin, int hipin)
{
  if (active_object && 
  12:      e0 91 00 00       lds      r30, 0x0000
  16:      f0 91 00 00       lds      r31, 0x0000
  1a:      30 97             sbiw      r30, 0x00      ; 0
  1c:      01 f0             breq      .+0            ; 0x1e <__vector_3+0x1e>
  1e:      82 81             ldd      r24, Z+2      ; 0x02
  20:      90 e0             ldi      r25, 0x00      ; 0
  22:      88 30             cpi      r24, 0x08      ; 8
  24:      91 05             cpc      r25, r1
  26:      04 f0             brlt      .+0            ; 0x28 <__vector_3+0x28>
  28:      0e 97             sbiw      r24, 0x0e      ; 14
  2a:      04 f4             brge      .+0            ; 0x2c <__vector_3+0x2c>
    active_object->_receivePin >= lopin && 
    active_object->_receivePin <= hipin)
  {
    active_object->recv();
  2c:      cf 01             movw      r24, r30
  2e:      0e 94 00 00       call      0      ; 0x0 <__vector_3>
}

SIGNAL(SIG_PIN_CHANGE0) 
{
  NewSoftSerial::handle_interrupts(8, 13);
}
  32:      ff 91             pop      r31
  34:      ef 91             pop      r30
  36:      9f 91             pop      r25
  38:      8f 91             pop      r24
  3a:      0f 90             pop      r0
  3c:      0f be             out      0x3f, r0      ; 63
  3e:      0f 90             pop      r0
  40:      1f 90             pop      r1
  42:      18 95             reti

Notice that only registers 0, 1, 24, 25, 30 and 31 are preserved on the stack.

Here's the Windows object dump of the same section:

SIGNAL(SIG_PIN_CHANGE0) 
   0:      1f 92             push      r1
   2:      0f 92             push      r0
   4:      0f b6             in      r0, 0x3f      ; 63
   6:      0f 92             push      r0
   8:      11 24             eor      r1, r1
   a:      2f 93             push      r18
   c:      3f 93             push      r19
   e:      4f 93             push      r20
  10:      5f 93             push      r21
  12:      6f 93             push      r22
  14:      7f 93             push      r23
  16:      8f 93             push      r24
  18:      9f 93             push      r25
  1a:      af 93             push      r26
  1c:      bf 93             push      r27
  1e:      ef 93             push      r30
  20:      ff 93             push      r31
******************************************************************************/

/* static */
inline void NewSoftSerial::handle_interrupts(int lopin, int hipin)
{
  if (active_object && 
  22:      e0 91 00 00       lds      r30, 0x0000
  26:      f0 91 00 00       lds      r31, 0x0000
  2a:      30 97             sbiw      r30, 0x00      ; 0
  2c:      01 f0             breq      .+0            ; 0x2e <__vector_3+0x2e>
  2e:      82 81             ldd      r24, Z+2      ; 0x02
  30:      90 e0             ldi      r25, 0x00      ; 0
  32:      88 30             cpi      r24, 0x08      ; 8
  34:      91 05             cpc      r25, r1
  36:      04 f0             brlt      .+0            ; 0x38 <__vector_3+0x38>
  38:      0e 97             sbiw      r24, 0x0e      ; 14
  3a:      04 f4             brge      .+0            ; 0x3c <__vector_3+0x3c>
    active_object->_receivePin >= lopin && 
    active_object->_receivePin <= hipin)
  {
    active_object->recv();
  3c:      cf 01             movw      r24, r30
  3e:      0e 94 00 00       call      0      ; 0x0 <__vector_3>
}

SIGNAL(SIG_PIN_CHANGE0) 
{
  NewSoftSerial::handle_interrupts(8, 13);
}
  42:      ff 91             pop      r31
  44:      ef 91             pop      r30
  46:      bf 91             pop      r27
  48:      af 91             pop      r26
  4a:      9f 91             pop      r25
  4c:      8f 91             pop      r24
  4e:      7f 91             pop      r23
  50:      6f 91             pop      r22
  52:      5f 91             pop      r21
  54:      4f 91             pop      r20
  56:      3f 91             pop      r19
  58:      2f 91             pop      r18
  5a:      0f 90             pop      r0
  5c:      0f be             out      0x3f, r0      ; 63
  5e:      0f 90             pop      r0
  60:      1f 90             pop      r1
  62:      18 95             reti

Notice that many more registers are preserved. The Windows version is preserving registers 0,1, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 30 and 31.

Here's an updated NewSoftSerial recv() function that includes a work-around for the register problem.

void NewSoftSerial::recv() 
{

// Work-around for avr-gcc 4.3 OSX version bug
// Preserve the registers that the compiler misses
  asm volatile(
      "push r18 \n\t"
      "push r19 \n\t"
      "push r20 \n\t"
      "push r21 \n\t"
      "push r22 \n\t"
      "push r23 \n\t"
      "push r26 \n\t"
      "push r27 \n\t"
      ::);

  char i, d = 0; 

  // If RX line is high, then we don't see any start bit
  // so interrupt is probably not for us
  if (! digitalRead(_receivePin)) {

    // Wait approximately 1/2 of a bit width to "center" the sample
    tunedDelay(_bitDelay / 2 - 8);

    // Read each of the 8 bits
    for (i=0; i<8; i++) { 
      //PORTB |= _BV(5);
      tunedDelay(_bitDelay - 6);  // 6 - digitalread takes some time
      //PORTB &= ~_BV(5);
      if (digitalRead(_receivePin))
        d |= (1 << i); 
      else // else clause added to ensure function timing is ~balanced
        d &= ~(1 << i);
    } 

    // skip the stop bit
    tunedDelay(_bitDelay);

    // if buffer full, set the overflow flag and return
    if ((_receive_buffer_tail + 1) % _NewSS_MAX_RX_BUFF != _receive_buffer_head) {
      // save new data in buffer: tail points to where byte goes
      _receive_buffer[_receive_buffer_tail] = d; // save new byte 
      _receive_buffer_tail = (_receive_buffer_tail + 1) % _NewSS_MAX_RX_BUFF;
    } else {
      _buffer_overflow = true;
    }
  }

// Work-around for avr-gcc 4.3 OSX version bug
// Restore the registers that the compiler misses
  asm volatile(
      "pop r27 \n\t"
      "pop r26 \n\t"
      "pop r23 \n\t"
      "pop r22 \n\t"
      "pop r21 \n\t"
      "pop r20 \n\t"
      "pop r19 \n\t"
      "pop r18 \n\t"
      ::);
}

The main change is that I added some inline assembly to preserve the missing registers and then restore them on exit. Additionally I rearranged some of the conditions to eliminate the "returns". This ensures we don't skip the restoring assembly and corrupt the stack.

I've tested this and it works correctly with OSX and I get no lockups. I don't know if this will have any impact on the bit timing so more testing at different baud rates is probably needed. I doubt it will because we're basically just adding in the machine instructions missing from the OSX version (but running on the Windows version).

Although it won't hurt the Windows version to run this code (other than a few more bytes on the stack and a few extra instruction cycles and maybe impacted bit timing?), there's really no reason it needs to. Maybe you can include the assembly blocks inside of conditional compiler directives while testing for the platform and only include them for the OSX version?

It would technically be more correct to put the register preserving code in each interrupt handler before it calls any function. But that would mean adding the push/pop inline assembly 3 times (in each interrupt handler). In this case all of the handlers end up calling recv() before the contents of the "missed" registers are disturbed so it works to have the code only in recv().

Note that this work-around is specific to this code. It's very possible that if the recv() function is modified other registers might be affected that aren't preserved properly (particularly if recv() calls any new functions). So probably the only way to tell is to compare the object dumps between OSX and Windows to see if any more (or different) registers are missing.

etracer--

Fantastic work! This is just the analysis we were looking for and just the result I hoped for. I will generate a new official version of NewSoftSerial shortly incorporating this and some other improvements.

I agree that there is some chance that this will affect bit timings. I have a fairly elaborate test suite built here that tests all the supported baud rates from 300-115.2K, so I should be able to make some tweaks to compensate, if needed.

Ideally, we could conditionally compile this code, because putting it in all implementations means that the Windows interrupt handler will always be slightly slower than the OSX handler.

Thanks again, and congratulations on the fine work!

Mikal

Is this, or is this going to be, compatible with the 328?

I built a BT wireless RFID reader a year ago (why spend 1600 bucks on a commercial model!) and used 2 168's over i2c as I couldn't get past the SS hang.

I'm just about to order 5 328's, can it be assumed the NSS library is compatible?

As long as the clock is 16MHz, I think it's a fair bet that it will work, although I haven't tested anything but the Diecimila.

I've got a few unassembled RBBBs laying around with 16mhz crystals so il put in a 328, when they arrive, and give it a spin.

Wow, you guys are amazing. Nice inline assembly fix. I'm going to upgrade the Mac AVR tools to the latest version from AVR Mac Pack (which includes avr-gcc 4.3.2).

mellis, where will you be posting them? An update to 0013 zip? Or just instructions on what to grab from another site to do surgery ourselves?

The inline assembly is cool but only works on a case-by-case basis, and there are other folks (including me) who have been bitten by the broken interrupt handlers on OSX.

@mellis, back when I was investigating the problem with the broken Ubuntu random() function, the guys on the avr-libc forum advised steering clear of the 4.3.0 avr-gcc compiler, which they characterized as "broken". I thought it was funny that that's the version we use with Windows Arduino. Perhaps it would be good to upgrade all the Arduino distros to 4.3.2?

Mikal

Good point. The latest version of WinAVR has gcc 4.3.2, too. I hope to upgrade both OSes for Arduino 0014.

I'm happy to say that I have incorporated etracer's mods to [u]NewSoftSerial[/u] and after running the Windows-compiled object through my baud rate test suite, I find that the slightly longer RX interrupt handlers do not significantly affect the error rate.

ERROR RATES
   | 300  | 1200 | 2400 | 4800 | 9600  | 14.4K | 19.2K  |  28.8K | 31.25K  | 38.4K  | 57.6K  | 115.2K
TX |  0%  |  0%  |  0%  |  0%  |  0%   |   0%  |   0%   |   0%   |   0%    |   0%   |    0%  |    0%
RX |  0%  |  0%  |  0%  |  0%  |  0%   |   0%  |   0%   |   0%   |   0%    | 0.1%   | 0.35%  |   N/A

@mellis: I would love to be able to make etracer's fix conditional on whether it is needed. This seems an excellent example of a use case that would benefit from the

#define ARDUINO_VERSION 14

proposal I made.

OSX users: Please test. And would someone please compile NSS 5 on OSX for Diecimila/Duemilanove and send me the binary? I would like to run it through my test bench.

Mikal

http://sundial.org/arduino/index.php/newsoftserial/

I'm getting a "Page not found" error when trying to download the latest version.

Ouch. Sorry. I thought I had tested all the links. Try again.

http://sundial.org/arduino/NewSoftSerial/NewSoftSerial5.zip

Mikal

And would someone please compile NSS 5 on OSX for Diecimila/Duemilanove and send me the binary? I would like to run it through my test bench.

Check your private messages.

Tonight I thoroughly tested etracer's OSX build of NewSoftSerial v5 and it performs admirably at all supported baud rates, perhaps even slightly better than the Windows build.

Thanks again, etracer, for your work.

Mikal

i fixed a small bug in the interrupt names that was giving 328's a 'compiles but freaks' problem
also made the examples folder a little neater
otherwise, works very nicely with 1 GPS module under v13 ide on windows and mac osx tested with 168 and 328p

http://www.ladyada.net/media/gpsshield/NewSoftSerial_18-02-09.zip

Thank you very much, ladyada.

Mikal

I posted NewSoftSerial 6 today, which contains ladyada's contributions. This new version supports the Atmega328p that she sells. Go buy a bunch of them.

Thanks,

Mikal

well, to be specific, it should work under any 328p
-however- the interrupt bug in v13 is not resolved so there may be freaky experiences.

Note also that the inline assembly work-around for the OSX 0013 interrupt bug that I provided may not work with the 328. I don't know if the compiler will generate the same register code as for a 168 (and I don't have any 328's to test with).

That being said, I suspect it will work because I can't see any way that the buggy 4.3.0 version of the compiler would know about the 328 and have specific optimizations.

-however- the interrupt bug in v13 is not resolved so there may be freaky experiences.

Oh, sorry, I thought you said in your previous post that you had tested NSS5 with etracer's workaround on the 328p... ?

otherwise, works very nicely with 1 GPS module under v13 ide on windows and mac osx tested with 168 and 328p

Is this not correct?

M