Reduce VirtualWire load module size by factor of ~2

I was modifying VirtualWire to work on yet another non-Arduino micro when I noticed the use of float math to calculate the bit times and timer constants. The use of floats is unnecessary and replacing the calculations with long integers resulted in identical timer constants, but a factor of nearly 2 in code size reduction. This is because of the large overhead required to multiply and divide floats. On an ATmega168, final loaded code size was reduced from 5102 to 2696 bytes, or a 48% reduction in load module size.

I would be pleased if people would check out my proposed modification for correctness on standard or other platforms. It probably can be further optimized. Also posted on the Google VirtualWire forum.

Original code [in VirtualWire.cpp, function uint8_t vw_timer_calc(uint16_t speed, uint16_t max_ticks, uint16_t *nticks) ]

 // Amount of time per CPU clock tick (in seconds)
        float clock_time = (1.0 / ((float)F_CPU / (float)prescalers[prescaler]));
        // Fraction of second needed to xmit one bit
        float bit_time = ((1.0 / (float)speed) / 8.0);
        // number of prescaled ticks needed to handle bit time @ speed
        ulticks = (long)(bit_time / clock_time);

Proposed new code:

// 1/Amount of time per CPU clock tick (in seconds)
        unsigned long inv_clock_time = F_CPU / ((unsigned long)prescalers[prescaler]);
        // 1/Fraction of second needed to xmit one bit
        unsigned long inv_bit_time = ((unsigned long)speed) * 8;
        // number of prescaled ticks needed to handle bit time @ speed
        ulticks = inv_clock_time/inv_bit_time;

I think it will be a great improvement.
If both the transmitter and receiver use the same code, any rounding difference is no problem. But to be compatible with existing devices running VirtualWire it has to be the same.

The next sketch generates a table for 4MHz to 20MHz with a speed of 100 to 20k.
The table is attached, and it shows it is always the same !

// Test sketch for jremington
// http://forum.arduino.cc/index.php?topic=228322.0
//
// With Arduino 1.0.5
// Using code from VirtualWire 1.24
// Tested on Arduino Uno
//


// The F_CPU is replaced by a variable.
// I think that has no influence on the result, the compiler doesn't do any precalculation with it.
unsigned long fcpu = 16000000UL;
char buffer[80];

uint8_t vw_timer_calc1(uint16_t speed, uint16_t max_ticks, uint16_t *nticks)
{
    // Clock divider (prescaler) values - 0/3333: error flag
    uint16_t prescalers[] = {0, 1, 8, 64, 256, 1024, 3333};
    uint8_t prescaler=0; // index into array & return bit value
    unsigned long ulticks; // calculate by ntick overflow

    // Div-by-zero protection
    if (speed == 0)
    {
        // signal fault
        *nticks = 0;
        return 0;
    }

    // test increasing prescaler (divisor), decreasing ulticks until no overflow
    for (prescaler=1; prescaler < 7; prescaler += 1)
    {
        // Amount of time per CPU clock tick (in seconds)
        float clock_time = (1.0 / ((float)fcpu / (float)prescalers[prescaler]));
        // Fraction of second needed to xmit one bit
        float bit_time = ((1.0 / (float)speed) / 8.0);
        // number of prescaled ticks needed to handle bit time @ speed
        ulticks = (long)(bit_time / clock_time);
        // Test if ulticks fits in nticks bitwidth (with 1-tick safety margin)
        if ((ulticks > 1) && (ulticks < max_ticks))
        {
            break; // found prescaler
        }
        // Won't fit, check with next prescaler value
    }

    // Check for error
    if ((prescaler == 6) || (ulticks < 2) || (ulticks > max_ticks))
    {
        // signal fault
        *nticks = 0;
        return 0;
    }

    *nticks = ulticks;
    return prescaler;
}

uint8_t vw_timer_calc2(uint16_t speed, uint16_t max_ticks, uint16_t *nticks)
{
    // Clock divider (prescaler) values - 0/3333: error flag
    uint16_t prescalers[] = {0, 1, 8, 64, 256, 1024, 3333};
    uint8_t prescaler=0; // index into array & return bit value
    unsigned long ulticks; // calculate by ntick overflow

    // Div-by-zero protection
    if (speed == 0)
    {
        // signal fault
        *nticks = 0;
        return 0;
    }

    // test increasing prescaler (divisor), decreasing ulticks until no overflow
    for (prescaler=1; prescaler < 7; prescaler += 1)
    {
        // 1/Amount of time per CPU clock tick (in seconds)
        unsigned long inv_clock_time = fcpu / ((unsigned long)prescalers[prescaler]);
        // 1/Fraction of second needed to xmit one bit
        unsigned long inv_bit_time = ((unsigned long)speed) * 8;
        // number of prescaled ticks needed to handle bit time @ speed
        ulticks = inv_clock_time/inv_bit_time;

        // Test if ulticks fits in nticks bitwidth (with 1-tick safety margin)
        if ((ulticks > 1) && (ulticks < max_ticks))
        {
            break; // found prescaler
        }
        // Won't fit, check with next prescaler value
    }

    // Check for error
    if ((prescaler == 6) || (ulticks < 2) || (ulticks > max_ticks))
    {
        // signal fault
        *nticks = 0;
        return 0;
    }

    *nticks = ulticks;
    return prescaler;
}


void setup() 
{
  uint8_t pre1, pre2;
  uint16_t ticks1, ticks2;
  int diff;

  Serial.begin(9600);


  for (fcpu=4000000UL; fcpu<=20000000UL; fcpu+=4000000UL)
  {
    Serial.println(F("------------------------------------------------"));
    Serial.print(F("F_CPU = "));
    Serial.println(fcpu);
    Serial.println(F("Test with calculation float versus integer"));
    Serial.println(F("------------------------------------------------"));
    Serial.println(F(" speed   original    |      integer     diff"));
    Serial.println(F("------------------------------------------------"));
    
    for (unsigned speed=100; speed<20000; speed+=100)
    {
      pre1 = vw_timer_calc1(speed, -1, &ticks1);
      pre2 = vw_timer_calc2(speed, -1, &ticks2);
      
      diff = (int) ((long) ticks2 - (long) ticks1);
      
      sprintf (buffer, "%6d : %3d %6u    |    %3d %6u    diff=%d", speed, pre1, ticks1, pre2, ticks2, diff);
      Serial.println (buffer);
    }
    Serial.println();
  }
}

void loop()
{
}

testtable.txt (49.9 KB)

Wow, thanks for the EXTENSIVE tests! I just tried a few values and hoped for the best. Your table does show one difference that I spotted (there may be others) but that is a bit time error of only 1 part in 500:

5000 : 1 499 | 1 500 diff=1

Edit: I should have pointed out in my original post that almost all of the reduction in code size is due to the elimination of the floating point support package. So, if float variables are used elsewhere in the code, there is no real advantage to the integer-only calculation. But for a small micro that is just transmitting a few bits of sensor data, this reduction in code size makes a huge difference!

I understand, I used only float in the sketch for testing.

But I missed a few, sorry for that.

F_CPU = 20000000
  2500 :   1    999    |      1   1000    diff=1
  5000 :   1    499    |      1    500    diff=1
 10000 :   1    249    |      1    250    diff=1

They are all for the 20MHz. The new integer calculation is a little higher and also more correct if I may say so. There could be a little tweaking perhaps, to simulate the error of the old calculation. Perhaps "inv_clock_time - 1".