Very interesting! I tried the following, which also exercises the inline function:
// 20ns per, but can be 40 (instruction fetch??)
PORT_IOBUS->Group[1].OUTSET.reg = 1 << 10;
PORT_IOBUS->Group[1].OUTCLR.reg = 1 << 10;
PORT_IOBUS->Group[1].OUTSET.reg = 1 << 10;
PORT_IOBUS->Group[1].OUTCLR.reg = 1 << 10;
// 80 ns per
PORT->Group[1].OUTTGL.reg = 1 << 10;
PORT->Group[1].OUTTGL.reg = 1 << 10;
PORT->Group[1].OUTTGL.reg = 1 << 10;
PORT->Group[1].OUTTGL.reg = 1 << 10;
fastWrite(10, 1); // fastWrite modified to use group 1 instead of 0
fastWrite(10, 0);
fastWrite(10, 1);
fastWrite(10, 0);
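(By the way, in terms of performance it doesn't matter whether I use SET and CLR or use TGL.) The compiler groups all the address and mask calculations together at the beginning of the function and then just emits a sequence of 12 str instructions. I suspect the decision to build a constant value with instructions rather than load it from memory comes down to which is quicker or takes less space under the chosen optimization settings; in the listing below, the IOBUS addresses and the bit mask are built with movs/lsls/adds, while the PORT address is loaded with ldr from a literal word placed after the function.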
210c: 22c0 movs r2, #192 ; 0xc0
210e: 05d2 lsls r2, r2, #23
2110: 0011 movs r1, r2
2112: 2380 movs r3, #128 ; 0x80
2114: 4808 ldr r0, [pc, #32] ; (2138 <loop+0x2c>)
2116: 00db lsls r3, r3, #3
2118: 3198 adds r1, #152 ; 0x98
211a: 3294 adds r2, #148 ; 0x94
211c: 600b str r3, [r1, #0]
211e: 6013 str r3, [r2, #0]
2120: 600b str r3, [r1, #0]
2122: 6013 str r3, [r2, #0]
2124: 6003 str r3, [r0, #0]
2126: 6003 str r3, [r0, #0]
2128: 6003 str r3, [r0, #0]
212a: 6003 str r3, [r0, #0]
212c: 600b str r3, [r1, #0]
212e: 6013 str r3, [r2, #0]
2130: 600b str r3, [r1, #0]
2132: 6013 str r3, [r2, #0]
2134: 4770 bx lr
2136: 46c0 nop ; (mov r8, r8)
2138: 4100449c .word 0x4100449c