What is the fastest way to read/write GPIOs on SAMD21 boards?

Very interesting! I tried the following, which also exercises the inline function:

  // Timing experiment: three ways of driving bit 10 (1 << 10) of port group 1,
  // four back-to-back writes each.

  // 1) Through PORT_IOBUS, alternating OUTSET and OUTCLR.
  // 20ns per, but can be 40 (instruction fetch??)
 PORT_IOBUS->Group[1].OUTSET.reg = 1 << 10;
 PORT_IOBUS->Group[1].OUTCLR.reg = 1 << 10;
 PORT_IOBUS->Group[1].OUTSET.reg = 1 << 10;
 PORT_IOBUS->Group[1].OUTCLR.reg = 1 << 10;

 // 2) Through PORT, using the toggle register OUTTGL.
 // 80 ns per
 PORT->Group[1].OUTTGL.reg = 1 << 10;
 PORT->Group[1].OUTTGL.reg = 1 << 10;
 PORT->Group[1].OUTTGL.reg = 1 << 10;
 PORT->Group[1].OUTTGL.reg = 1 << 10;

  // 3) Through the inline fastWrite() helper (defined elsewhere), alternating 1 and 0.
  fastWrite(10, 1); // fastWrite modified to use group 1 instead of 0
  fastWrite(10, 0);
  fastWrite(10, 1);
  fastWrite(10, 0);

(By the way, in terms of performance it doesn't matter whether I use SET and CLR or use TGL.) The compiler groups all the address and mask calculations together at the beginning of the function and then just emits a sequence of 12 str instructions. I suspect the decision to build a constant value rather than load it depends on which is quicker or takes less space under the chosen optimization settings.

    ; Disassembly of loop (0x210c-0x2138): register setup followed by 12 stores.
    ; Values after setup, computed from the immediates below:
    ;   r3 = 128 << 3        = 0x00000400 (1 << 10), the value written by every store
    ;   r1 = (192 << 23) + 152 = 0x60000098
    ;   r2 = (192 << 23) + 148 = 0x60000094
    ;   r0 = 0x4100449c, loaded from the literal word at 0x2138
    ; NOTE(review): judging by store order versus the C source, r1/r2 are presumably
    ; PORT_IOBUS Group[1] OUTSET/OUTCLR and r0 is PORT Group[1] OUTTGL -- confirm
    ; against the device header.
    ;
    ; Setup: build the 0x60000000 base and the bit mask with movs+lsls, derive the
    ; two store addresses from the base, and fetch the third address from the
    ; literal pool.
    210c:	22c0      	movs	r2, #192	; 0xc0
    210e:	05d2      	lsls	r2, r2, #23
    2110:	0011      	movs	r1, r2
    2112:	2380      	movs	r3, #128	; 0x80
    2114:	4808      	ldr	r0, [pc, #32]	; (2138 <loop+0x2c>)
    2116:	00db      	lsls	r3, r3, #3
    2118:	3198      	adds	r1, #152	; 0x98
    211a:	3294      	adds	r2, #148	; 0x94
    ; Stores 1-4: alternate [r1] / [r2], matching the order of the four
    ; OUTSET/OUTCLR statements in the C source.
    211c:	600b      	str	r3, [r1, #0]
    211e:	6013      	str	r3, [r2, #0]
    2120:	600b      	str	r3, [r1, #0]
    2122:	6013      	str	r3, [r2, #0]
    ; Stores 5-8: four writes to [r0], matching the four OUTTGL statements.
    2124:	6003      	str	r3, [r0, #0]
    2126:	6003      	str	r3, [r0, #0]
    2128:	6003      	str	r3, [r0, #0]
    212a:	6003      	str	r3, [r0, #0]
    ; Stores 9-12: alternate [r1] / [r2] again, the same addresses as stores 1-4;
    ; presumably the four fastWrite calls after inlining.
    212c:	600b      	str	r3, [r1, #0]
    212e:	6013      	str	r3, [r2, #0]
    2130:	600b      	str	r3, [r1, #0]
    2132:	6013      	str	r3, [r2, #0]
    ; Return to caller.
    2134:	4770      	bx	lr
    ; Filler after the return; presumably padding so the literal word below is
    ; 4-byte aligned (0x2138).
    2136:	46c0      	nop			; (mov r8, r8)
    ; Literal pool entry read by the ldr at 0x2114.
    2138:	4100449c 	.word	0x4100449c