Serial.print messing with Timer Interrupts

Finally I found it out. It is a very subtle bug in MsTimer2:

ISR(TIMER2_OVF_vect) {
#if defined (__AVR_ATmega168__) || defined (__AVR_ATmega48__) || defined (__AVR_ATmega88__) || defined (__AVR_ATmega328P__) || (__AVR_ATmega1280__)
	TCNT2 = MsTimer2::tcnt2;
#elif defined (__AVR_ATmega128__)
	TCNT2 = MsTimer2::tcnt2;
#elif defined (__AVR_ATmega8__)
	TCNT2 = MsTimer2::tcnt2;
#endif
	MsTimer2::_overflow();
}

If you remove the macros and only look at mega168 then you get

ISR(TIMER2_OVF_vect) {
	TCNT2 = MsTimer2::tcnt2;
	MsTimer2::_overflow();
}

What is going on here? The library runs timer2 with a prescaler of 64. Thus the timer will tick with 250 kHz. So every 250 ticks an interrupt must be triggered to achieve the desired rate of one interrupt per millisecond. Instead of using CTC mode the library uses "normal mode". Thus the timer will process 256 ticks between to interrupts but it should use only 250. In order to compensate for this the ISR will advance TCNT2 immediately to the value of MsTimer2::tcnt2; which happens to be 6.

If TCNT2 == 0 at the start of the ISR, then of course this is equivalent to skipping 6 ticks. The result is the desired delay. However if TCNT2 > 0 at the start of the ISR, then it will skip less ticks. In the most extreme case TCNT2 == 255 setting it to 6 would result in 249 additional ticks instead of 6 skipped ticks.

But why is this an issue if I use Serial.print??? In order to find out I disassembled my compiled code. Here is what I found:

ISR(TIMER2_OVF_vect) {
 22c:	1f 92       	push	r1				cycles:	2	total:  2
 22e:	0f 92       	push	r0				cycles:	2	total:  4
 230:	0f b6       	in	r0, 0x3f	; 63		cycles:	1	total:  5
 232:	0f 92       	push	r0				cycles:	2	total:  7
 234:	11 24       	eor	r1, r1				cycles:	1	total:  8
 236:	2f 93       	push	r18				cycles:	2	total: 10
 238:	3f 93       	push	r19				cycles:	2	total: 12
 23a:	4f 93       	push	r20				cycles:	2	total: 14
 23c:	5f 93       	push	r21				cycles:	2	total: 16
 23e:	6f 93       	push	r22				cycles:	2	total: 18
 240:	7f 93       	push	r23				cycles:	2	total: 20
 242:	8f 93       	push	r24				cycles:	2	total: 22
 244:	9f 93       	push	r25				cycles:	2	total: 24
 246:	af 93       	push	r26				cycles:	2	total: 26
 248:	bf 93       	push	r27				cycles:	2	total: 28
 24a:	ef 93       	push	r30				cycles:	2	total: 30
 24c:	ff 93       	push	r31				cycles:	2	total: 32
#if defined (__AVR_ATmega168__) || defined (__AVR_ATmega48__) || defined (__AVR_ATmega88__) || defined (__AVR_ATmega328P__) || (__AVR_ATmega1280__)
	TCNT2 = MsTimer2::tcnt2;
 24e:	80 91 1b 01 	lds	r24, 0x011B			cycles:	2	total: 34
 252:	90 91 1c 01 	lds	r25, 0x011C			cycles:	2	total: 36
 256:	80 93 b2 00 	sts	0x00B2, r24			cycles:	2	total: 38
#elif defined (__AVR_ATmega128__)
	TCNT2 = MsTimer2::tcnt2;
#elif defined (__AVR_ATmega8__)
	TCNT2 = MsTimer2::tcnt2;
#endif
	MsTimer2::_overflow();
 25a:	0e 94 d7 00 	call	0x1ae	; 0x1ae <_ZN8MsTimer29_overflowEv>

So MsTimer2 takes 38 cpu cycles + ISR trigger overhead till it can set TCNT2. So everything that delays the MsTimer2 ISR for more than 24 cpu cycles will drag it down.

Looking into the serial ISR I find

000004b8 <__vector_19>:
#elif defined(USART0_UDRE_vect)
ISR(USART0_UDRE_vect)
#elif defined(USART_UDRE_vect)
ISR(USART_UDRE_vect)
#endif
{
 4b8:	1f 92       	push	r1			cycles:	2	total:  2
 4ba:	0f 92       	push	r0			cycles:	2	total:  4
 4bc:	0f b6       	in	r0, 0x3f	; 63	cycles:	1	total:  5
 4be:	0f 92       	push	r0			cycles:	2	total:  7
 4c0:	11 24       	eor	r1, r1			cycles:	1	total:  8
 4c2:	2f 93       	push	r18			cycles:	2	total: 10	
 4c4:	3f 93       	push	r19			cycles:	2	total: 12
 4c6:	5f 93       	push	r21			cycles:	2	total: 14
 4c8:	6f 93       	push	r22			cycles:	2	total: 16
 4ca:	7f 93       	push	r23			cycles:	2	total: 18
 4cc:	8f 93       	push	r24			cycles:	2	total: 20
 4ce:	9f 93       	push	r25			cycles:	2	total: 22
 4d0:	af 93       	push	r26			cycles:	2	total: 24
 4d2:	bf 93       	push	r27			cycles:	2	total: 26
 4d4:	ef 93       	push	r30			cycles:	2	total: 28
 4d6:	ff 93       	push	r31			cycles:	2	total: 30
  if (tx_buffer.head == tx_buffer.tail) {
 4d8:	20 91 a1 01 	lds	r18, 0x01A1		cycles:	2	total: 32
 4dc:	30 91 a2 01 	lds	r19, 0x01A2		cycles:	2	total: 34
 4e0:	80 91 a3 01 	lds	r24, 0x01A3		cycles:	2	total: 36
 4e4:	90 91 a4 01 	lds	r25, 0x01A4		cycles:	2	total: 38
 4e8:	28 17       	cp	r18, r24		cycles:	1	total: 39
 4ea:	39 07       	cpc	r19, r25		cycles:	1	total: 40
 4ec:	31 f4       	brne	.+12     	; 0x4fa <__vector_19+0x42> cycles:	1-2	total: 41-42
	// Buffer empty, so disable interrupts
#if defined(UCSR0B)
    cbi(UCSR0B, UDRIE0);
 4ee:	80 91 c1 00 	lds	r24, 0x00C1
 4f2:	8f 7d       	andi	r24, 0xDF	; 223
 4f4:	80 93 c1 00 	sts	0x00C1, r24
 4f8:	16 c0       	rjmp	.+44     	; 0x526 <__vector_19+0x6e>
    cbi(UCSRB, UDRIE);
#endif
  }
  else {
    // There is more data in the output buffer. Send the next byte
    unsigned char c = tx_buffer.buffer[tx_buffer.tail];
 4fa:	e0 91 a3 01 	lds	r30, 0x01A3		cycles:	2	total: 44
 4fe:	f0 91 a4 01 	lds	r31, 0x01A4		cycles:	2	total: 46
 502:	ef 59       	subi	r30, 0x9F	; 159	cycles:	1	total: 47
 504:	fe 4f       	sbci	r31, 0xFE	; 254	cycles:	1	total: 48
 506:	20 81       	ld	r18, Z			cycles:	1	total: 49
    tx_buffer.tail = (tx_buffer.tail + 1) % SERIAL_BUFFER_SIZE;
 508:	80 91 a3 01 	lds	r24, 0x01A3		cycles:	2	total: 51	
 50c:	90 91 a4 01 	lds	r25, 0x01A4		cycles:	2	total: 53
 510:	01 96       	adiw	r24, 0x01	; 1	cycles:	2	total: 55
 512:	60 e4       	ldi	r22, 0x40	; 64	cycles:	1	total: 56
 514:	70 e0       	ldi	r23, 0x00	; 0	cycles:	1	total: 57
 516:	0e 94 f0 03 	call	0x7e0	; 0x7e0 <__divmodhi4>	cycles:	2	total: 59 + time for the subroutine
 51a:	90 93 a4 01 	sts	0x01A4, r25		cycles:	2	total: 61
 51e:	80 93 a3 01 	sts	0x01A3, r24		cycles:	2	total: 63
	
  #if defined(UDR0)
    UDR0 = c;
 522:	20 93 c6 00 	sts	0x00C6, r18		cycles:	2	total: 65
    UDR = c;
  #else
    #error UDR not defined
  #endif
  }
}
 526:	ff 91       	pop	r31			cycles:	2	total: 67
 528:	ef 91       	pop	r30			cycles:	2	total: 69
 52a:	bf 91       	pop	r27			cycles:	2	total: 71
 52c:	af 91       	pop	r26			cycles:	2	total: 73
 52e:	9f 91       	pop	r25			cycles:	2	total: 75
 530:	8f 91       	pop	r24			cycles:	2	total: 77	
 532:	7f 91       	pop	r23			cycles:	2	total: 79
 534:	6f 91       	pop	r22			cycles:	2	total: 81
 536:	5f 91       	pop	r21			cycles:	2	total: 83
 538:	3f 91       	pop	r19			cycles:	2	total: 85
 53a:	2f 91       	pop	r18			cycles:	2	total: 87	
 53c:	0f 90       	pop	r0			cycles:	2	total: 89
 53e:	0f be       	out	0x3f, r0	; 63	cycles:	1	total: 90
 540:	0f 90       	pop	r0			cycles:	2	total: 92
 542:	1f 90       	pop	r1			cycles:	2	total: 94
 544:	18 95       	reti				cycles:	4	total: 98

Obviously 98 is greater than 24 and I did not even count the cycles used inside of "call 0x7e0 ; 0x7e0 <__divmodhi4>".

In order to verify my hypthesis I replace MsTimer2 with the following code:

ISR(TIMER2_COMPA_vect) {
    process_one_sample();
}

void initTimer2() {
    // Timer 2 CTC mode, prescaler 64
    TCCR2B = (1<<WGM22) | (1<<CS22);
    TCCR2A = (1<<WGM21);
    
    // 249 + 1 == 250 == 250 000 / 1000 ==  (16 000 000 / 64) / 1000
    OCR2A = 249;
    
    // enable Timer 2 interrupts
    TIMSK2 = (1<<OCIE2A);    
}

Yeah!!! Now only 30ppm deviation. This is within the expected tolerance of the crystal. Bug understood and fixed :slight_smile:

I will contact the author of the library to fix this for the whole community.

One afterthought: I do not like to have ISR driven serial output. But now matter if this is a good idea or not, the library is full with code like

int i = (unsigned int)(buffer->head + 1) % SERIAL_BUFFER_SIZE;

Since the Arduino does have no hardware division this is very costly both in terms of CPU and memory. A much better way to wrap a counter is:

int i = (unsigned int)(buffer->head);
i = i<SERIAL_BUFFER_SIZE-1? i+1: 0;