Your assumptions about clock cycles are way off. I've disassembled your code and interleaved the source lines, with the cycle count of each instruction in parentheses, so you can see where the time actually goes:
00000100 <_Z8N64_sendPKhj>:
100: 20 e2 ldi r18, 0x20 ; 32 (1)
102: 23 b9 out 0x03, r18 ; 3 (1) // toggle D13
104: fc 01 movw r30, r24 (1)
106: 27 e0 ldi r18, 0x07 ; 7 (1)
108: 30 e0 ldi r19, 0x00 ; 0 (1)
10a: 40 e0 ldi r20, 0x00 ; 0 (1)
10c: 50 e0 ldi r21, 0x00 ; 0 (1)
byte_loop:
bit_loop:
N64_LOW; //two cycles
10e: 52 9a sbi 0x0a, 2 ; 10 (2)
asm volatile("nop\nnop\nnop\nnop\nnop");
110: 00 00 nop (1)
112: 00 00 nop (1)
114: 00 00 nop (1)
116: 00 00 nop (1)
118: 00 00 nop (1)
if(*current_byte >> shift) //how many cycles? assuming 2
11a: 80 81 ld r24, Z (2)
11c: 90 e0 ldi r25, 0x00 ; 0 (1)
11e: 02 2e mov r0, r18 (1)
120: 02 c0 rjmp .+4 ; 0x126 <_Z8N64_sendPKhj+0x26> (2)
122: 95 95 asr r25 (1)
124: 87 95 ror r24 (1)
126: 0a 94 dec r0 (1)
128: e2 f7 brpl .-8 ; 0x122 <_Z8N64_sendPKhj+0x22> (1/2)
12a: 89 2b or r24, r25 (1)
12c: 91 f0 breq .+36 ; 0x152 <_Z8N64_sendPKhj+0x52> (1/2)
N64_HIGH;
12e: 52 98 cbi 0x0a, 2 ; 10 (2)
asm volatile("nop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\n");
130: 00 00 nop (1)
132: 00 00 nop (1)
134: 00 00 nop (1)
136: 00 00 nop (1)
138: 00 00 nop (1)
13a: 00 00 nop (1)
13c: 00 00 nop (1)
13e: 00 00 nop (1)
140: 00 00 nop (1)
142: 00 00 nop (1)
144: 00 00 nop (1)
146: 00 00 nop (1)
148: 00 00 nop (1)
14a: 00 00 nop (1)
14c: 00 00 nop (1)
14e: 00 00 nop (1)
150: 11 c0 rjmp .+34 ; 0x174 <_Z8N64_sendPKhj+0x74> (2)
else
asm volatile("nop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\n");
152: 00 00 nop (1)
154: 00 00 nop (1)
156: 00 00 nop (1)
158: 00 00 nop (1)
15a: 00 00 nop (1)
15c: 00 00 nop (1)
15e: 00 00 nop (1)
160: 00 00 nop (1)
162: 00 00 nop (1)
164: 00 00 nop (1)
166: 00 00 nop (1)
168: 00 00 nop (1)
16a: 00 00 nop (1)
16c: 00 00 nop (1)
16e: 00 00 nop (1)
170: 00 00 nop (1)
N64_HIGH;
172: 52 98 cbi 0x0a, 2 ; 10 (2)
// end of if --
--shift;
174: 21 50 subi r18, 0x01 ; 1 (1)
176: 30 40 sbci r19, 0x00 ; 0 (1)
if(shift >= 0)//how many cycles? assuming 1
178: 37 fd sbrc r19, 7 (1/2/3)
17a: 05 c0 rjmp .+10 ; 0x186 <_Z8N64_sendPKhj+0x86> (2)
asm volatile("nop\nnop\nnop\nnop\n");
17c: 00 00 nop (1)
17e: 00 00 nop (1)
180: 00 00 nop (1)
182: 00 00 nop (1)
goto bit_loop;
184: c4 cf rjmp .-120 ; 0x10e <_Z8N64_sendPKhj+0xe> (2)
++counter;
186: 4f 5f subi r20, 0xFF ; 255 (1)
188: 5f 4f sbci r21, 0xFF ; 255 (1)
if(counter < length) //how many cycles? assuming 1
18a: 46 17 cp r20, r22 (1)
18c: 57 07 cpc r21, r23 (1)
18e: 20 f4 brcc .+8 ; 0x198 <_Z8N64_sendPKhj+0x98> (1/2)
++current_byte;
190: 31 96 adiw r30, 0x01 ; 1 (2)
shift = 7;
192: 27 e0 ldi r18, 0x07 ; 7 (1)
194: 30 e0 ldi r19, 0x00 ; 0 (1)
goto byte_loop;
196: bb cf rjmp .-138 ; 0x10e <_Z8N64_sendPKhj+0xe> (2)
N64_LOW;
198: 52 9a sbi 0x0a, 2 ; 10 (2)
asm volatile("nop\nnop\nnop\nnop\nnop\n"
"nop\nnop\n");
19a: 00 00 nop (1)
19c: 00 00 nop (1)
19e: 00 00 nop (1)
1a0: 00 00 nop (1)
1a2: 00 00 nop (1)
1a4: 00 00 nop (1)
1a6: 00 00 nop (1)
N64_HIGH;
1a8: 52 98 cbi 0x0a, 2 ; 10 (2)
asm volatile("nop\nnop\nnop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\nnop\nnop\n"
"nop\nnop\nnop\nnop\nnop\nnop\n");
1aa: 00 00 nop (1)
1ac: 00 00 nop (1)
1ae: 00 00 nop (1)
1b0: 00 00 nop (1)
1b2: 00 00 nop (1)
1b4: 00 00 nop (1)
1b6: 00 00 nop (1)
1b8: 00 00 nop (1)
1ba: 00 00 nop (1)
1bc: 00 00 nop (1)
1be: 00 00 nop (1)
1c0: 00 00 nop (1)
1c2: 00 00 nop (1)
1c4: 00 00 nop (1)
1c6: 00 00 nop (1)
1c8: 00 00 nop (1)
1ca: 00 00 nop (1)
1cc: 00 00 nop (1)
1ce: 00 00 nop (1)
1d0: 00 00 nop (1)
1d2: 00 00 nop (1)
1d4: 00 00 nop (1)
1d6: 00 00 nop (1)
1d8: 00 00 nop (1)
// return from function
1da: 08 95 ret (4)
I added an extra line to toggle D13 (the first two instructions) because I was initially not getting a result. It runs only once, before the bit loop, so it won't affect the per-bit timing.
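You are using int where you can probably use byte. On this 8-bit processor every int operation takes two instructions instead of one (for example, --shift becomes a subi followed by an sbci, and the counter comparison becomes cp plus cpc), so that generates a lot more code.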
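Just as one example, shifting something right by a variable amount actually involves a loop, because the processor shifts only one bit per instruction. The timing therefore varies with the number of bits shifted, at about 5 cycles for every extra bit position: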
122: 95 95 asr r25 (1)
124: 87 95 ror r24 (1)
126: 0a 94 dec r0 (1)
128: e2 f7 brpl .-8 ; 0x122 <_Z8N64_sendPKhj+0x22> (1/2)
I timed about 4.3 µs per bit, and that was running at 16 MHz!