@MarkT: This gives you 4.2us pulses, try it out. delayMicroseconds() waits at least 4us even if the parameter is less than 4.
But you can change the code to:
void pulse ()
{
cli () ;
PIND = 0x10 ;
__asm__("nop\n\tnop\n\tnop\n\tnop\n\tnop\n\t");
__asm__("nop\n\tnop\n\tnop\n\tnop\n\tnop\n\t");
__asm__("nop\n\tnop\n\tnop\n\tnop\n\tnop\n\t");
__asm__("nop\n\tnop\n\tnop\n\tnop\n\tnop\n\t");
PIND = 0x10 ;
sei () ;
}
This should result in a pulse of roughly 1.2us: the 20 NOPs take one clock cycle each, which is about 1.25us assuming a 16 MHz clock.