Improving SIG_OVERFLOW (timer0, millis, etc)

Latest version. More comments; matched style. cycles/milli, sort of.

/*
  Count milliseconds since initialization.  Use the timer0 overflow.
  Timer0 prescaler is set to 64, and overflows every 256 counts.

  There are two strategies implemented:
  1) In the first strategy, we keep track up the number of milliseconds
     and microseconds for each timer tick, and use those to update the
     millisecond count (carrying from microsecs to milliseconds whenever
      microsecs reaches 1000.)  For several popular clock frequencies,
     the number of "remainder" microseconds in each tick can be scaled
     to fit the max of about 1000 in a single byte, saving substantially
     on code space and time.  Obviously, the number of microseconds in
     each tick has to be an integer.  Works for 1, 2, 4, 8, 16MHz.
         millis += millis per tick. (with scale microsecond fractional part.)
  2) The other strategy works for more general clock frequencies.  For
     each clock tick, accumulated the total number of cycles and divide
     out each millisecond worth of cycles.  This works whenever the number
     of clock cycles in a millisecond is an integer (almost always!)
     Since the number of milliseconds per tick is small (1.024 for 16MHz),
     the division can be done moderately efficiently by repeated subtraction.
         millis += cycles/(cyclesPerMilli)
   The source code to implment the ISR in these two cases can be essentially
   identical, although the type of the fractional part varies, and the
   constants change.
*/

/*
  Calculate constants to see if we can make the "microsecPerTick"
  algorithm work.   We'll have:
     ((1/F_CPU) * 256 * 64)  =  seconds per tick.
     ((1/F_CPU) * 256 * 64) * 1e6 =  microseconds per tick.
  But we only have integers, so rearrange this to:
      (64e6/F_CPU) * 256 = us/tick
  or  (1024e6/F_CPU) * 16
*/
#define US_PER_T0TICK ((1024000000L/F_CPU)*16)
#define MS_PER_T0TICK (US_PER_T0TICK/1000)
/*
  Now, to fit at least one millisecond worth of microseconds in a single
  byte, we need to scale by  factor of at least 8 (1 count = 8 uS)
  (4 almost works, excepts that (1000/4) (250) is too close to wraparound)
 */
#define S_US_PER_MS (1000>>3)      /* scaled microsecs per millisec */
#define S_US_PER_T0TICK ((US_PER_T0TICK % 1000)>>3)

#ifdef TEST_NEW_TIMER  // Assign some values to variables so they can be seen.
#warning TEST_NEW_TIMER variables
unsigned long test_total_t0ticks = 0;
unsigned long test_us_per_t0tick = US_PER_T0TICK;
unsigned char test_ms_per_t0tick = MS_PER_T0TICK;
unsigned char test_s_us_per_t0tick = S_US_PER_T0TICK;
#endif

#if ((((MS_PER_T0TICK*1000)+(S_US_PER_T0TICK<<3)) /16) * F_CPU) == 1024000000
/*
  If the fast algorithm will work, the math will go backwards
  Overall, we have s_us = ((16384e6/f_cpu)%1000) / 8,
     and it must be an integer.
 */
typedef unsigned char timer0_fract_t;      /* counts microseconds */
#define FRACT_INC (S_US_PER_T0TICK)      /* Fraction increment by microsecs */
#define FRACT_MAX (S_US_PER_MS)            /*   till we reach one millisec */
#define MILLIS_INC (MS_PER_T0TICK)      /* non-fractional millis */
#warning T0OVERFLOW counts microseconds

#else
/*
  if the math doesn't reverse, we need to use the more general
  algorithm.  Redefine types and  constants appropriately.
*/
typedef unsigned long timer0_fract_t;      /* counts cycles */
#define FRACT_INC (64UL*256UL);            /* fraction increments by cycles */
#define FRACT_MAX (F_CPU/1000)            /* cycles per millisecond */
#define MILLIS_INC 0                  /* no non-fractional parts. */
#warning T0OVERFLOW counts cycles

#endif

volatile unsigned long timer0_millis = 0;
static timer0_fract_t timer0_fract = 0;

SIGNAL(SIG_OVERFLOW0)
{
      unsigned long mstmp = timer0_millis;
      timer0_fract_t fractmp = timer0_fract;

#ifdef TEST_NEW_TIMER
#warning TEST_NEW_TIMER increment
      test_total_t0ticks++;
#endif
      mstmp += MILLIS_INC;            /* increment milliseconds */
      fractmp += FRACT_INC;
      while (fractmp >= FRACT_MAX) {
            fractmp -= FRACT_MAX;
            mstmp += 1;
      }
      timer0_fract = fractmp;
      timer0_millis = mstmp;
}

unsigned long millis()
{
      unsigned long m;
      uint8_t oldSREG = SREG;
      
      // disable interrupts while we read timer0_millis or we might get an
      // inconsistent value (e.g. in the middle of the timer0_millis++)
      cli();
      m = timer0_millis;
      SREG = oldSREG;
      
      return m;
}