Maximum pin toggle speed

OK, I'm totally new to Arduino, and I don't have a debugger, compiler or any other tool for the moment, so I'm just playing with my Macintosh, an ATmega1280 board, and the official Arduino 15 SDK downloaded from the Arduino website.

As you may have seen in another thread, I'm working on very fast PWM for my power electronics inverters.

The poor man's method I use to measure the real CPU cycle count of a piece of code is as follows:

// Single no-operation instruction, emitted as inline assembly.
#define NOP __asm__("nop\n\t")

// Number of iterations of the timed loop; adjusted in steps of 1000 over serial.
// Alternative wider counter that was also tried:
// long N=0;
int N = 0;

// micros() timestamps taken around the timed loop, and their difference.
unsigned long time;
unsigned long time1;
unsigned long time2;

void setup()
{
  Serial.begin(9600);
}

void loop{
  if (Serial.available()) {
    val = Serial.read();
    if (val == '+') {
      N += 1000;
    } 
    if (val == '-') {
      N -= 1000;
      if (N<0) N = 0;
    }
  }
  
  time1 = micros();
  
  for (int i=0; i < N; i++){
//    __asm__("nop\n\t");
    PORTD |= B1000;
    PORTD &= B11110111;
//   if (UCSR0A & _BV(RXC0)) {
//    }
  }
  
  time2 = micros();

  Serial.print("N: ");
  time=time2-time1;
  Serial.print(time);
  Serial.print(" / ");
  Serial.println(N);
  delay(1000);
}

So with a big N value, the result converges to the average time in ns per iteration for whatever set of instructions I have inside for(i=0;i<N;i++). For example, a printed output of 444 / 1000 (444 µs for 1000 iterations) means 444 ns per iteration, or roughly 7 cycles at 16 MHz (62.5 ns per cycle).

This is how I observed that a NOP really is 1 cycle, that PORTD |= B1000 (setting bit 3 of PORTD) is 1 cycle, ... and that the overhead of the surrounding for(i=0;i<N;i++) loop management is 6 cycles for an int i and 10 cycles for a long i.

About my project, please note what I'm doing to generate high-speed PWM:

// PWM generator: with interrupts disabled, repeatedly pulses bit 4 of PORTH and
// bit 0 of PORTA, using counted NOP loops as the on/off delays. The tight loop
// is left only when the UART receive-complete flag is set, after which
// interrupts are re-enabled so pending serial input can be handled.
//
// Relies on `i`, `charge_on`, `charge_off`, `extract_on` and `extract_off`,
// which are declared outside this fragment.
void loop()
{
 cli();  // turn off interrupts so nothing disturbs the cycle-counted timing
 while (true) {
// Turns ON coil charging opto-coupler #1 (sets bit 4 of PORTH)
    PORTH |= B10000;
    for(i=0;i<charge_on;i++) NOP;

// Turns OFF coil charging opto-coupler #1 (clears bit 4 of PORTH)
    PORTH &= B11101111;
    for(i=0;i<charge_off;i++) NOP;
  
// Turns ON coil FE extracting opto-coupler #2 (sets bit 0 of PORTA)
    PORTA |= B1;
    for(i=0;i<extract_on;i++) NOP;

// Turns OFF coil FE extracting opto-coupler #2 (clears bit 0 of PORTA)
    PORTA &= B11111110;
    for(i=0;i<extract_off;i++) NOP;

    // Poll the UART receive-complete flag directly instead of using Serial,
    // since interrupts are off here.
    if (UCSR0A & _BV(RXC0)) { // check uart  (register name changes per port)
      break;  // looks like there is data.  Break out of loop to handle it
    }
  } // end of time critical loop
  sei();  // interrupts back on
  delay(10); // wait for some characters to arrive
  // NOTE(review): as written, nothing in this loop reads from Serial, so it
  // would not terminate once a byte is available. The elided parameter
  // handling presumably calls Serial.read() - confirm.
  while (Serial.available()) {
// Macintosh serial monitor parameter management to update 4 loops
  }
}

This is why I need to know the while(true) overhead: to compensate for it when computing the charge_on, charge_off, extract_on & extract_off values and get a precise duty cycle. Please note I have already compensated by including 1 cycle for each PORTx write, 2 cycles for the USB (UART) RX check from if (UCSR0A & _BV(RXC0)), and 6+1 cycles for each local for(i=0;i<...) NOP loop.