Trying to dive into ATtiny assembly

Okay, since my initial question was about correctness of assembly code, as I want to learn some AVR assembly, I'll skip my original electronic lock sketch and focus to this display part which I'm trying to get to work without library to chop down every byte I can save. Here is .ino code:

// TM 1637 7-seg display test code, no library

#define TM1637_DIO_PIN PB0
#define TM1637_CLK_PIN PB1

#define TM1637_DIO_OUTPUT() (DDRB |= _BV(TM1637_DIO_PIN))
#define TM1637_DIO_INPUT() (DDRB &= ~_BV(TM1637_DIO_PIN))
#define TM1637_DIO_READ() (((PINB & _BV(TM1637_DIO_PIN)) > 0) ? 1 : 0)
#define TM1637_CLK_HIGH() (PORTB |= _BV(TM1637_CLK_PIN))
#define TM1637_CLK_LOW() (PORTB &= ~_BV(TM1637_CLK_PIN))
#define TM1637_DIO_HIGH() (PORTB |= _BV(TM1637_DIO_PIN))
#define TM1637_DIO_LOW() (PORTB &= ~_BV(TM1637_DIO_PIN))

                     /*0*/ /*1*/ /*2*/ /*3*/ /*4*/ /*5*/ /*6*/ /*7*/ /*8*/ /*9*/
uint8_t digits[] = { 0x3f, 0x06, 0x5b, 0x4f, 0x66, 0x6d, 0x7d, 0x07, 0x7f, 0x6f };
 
extern "C" {
 // function prototypes
 void TM1637_DELAY_US();
 void TM1637_start();
 //void TM1637_stop();
}

void setup()
{
 DDRB |= (_BV(TM1637_DIO_PIN)|_BV(TM1637_CLK_PIN));
 
 // initialise display and set brightness
 // 0x88 is dim and increasing value to 0x8C increases brightness
 TM1637_start();
 TM1637_write_byte(0x8c);
 TM1637_stop();

 // clear display
 write(0xff, 0xff, 0xff, 0xff);
}

void loop()
{
 // display some numbers in order to check if code works
 write( digits[1], digits[2], digits[2], digits[0] );
}

void write(uint8_t first, uint8_t second, uint8_t third, uint8_t fourth)
{
 TM1637_start();
 TM1637_write_byte(0x40);
 TM1637_stop();

 TM1637_start();
 TM1637_write_byte(0xc0);
 TM1637_write_byte(first);
 TM1637_write_byte(second);
 TM1637_write_byte(third);
 TM1637_write_byte(fourth);
 TM1637_stop();
}

/*
void TM1637_start(void)
{
 TM1637_CLK_HIGH();//send start signal to TM1637
 TM1637_DIO_HIGH();
 TM1637_DELAY_US();
 TM1637_DIO_LOW();
 TM1637_CLK_LOW();
 TM1637_DELAY_US();
}
*/

void TM1637_stop(void)
{
 TM1637_CLK_LOW();
 TM1637_DIO_LOW();
 TM1637_DELAY_US();
 TM1637_CLK_HIGH();
 TM1637_DIO_HIGH();
 TM1637_DELAY_US();
}

uint8_t TM1637_write_byte(uint8_t value)
{
 uint8_t i, ack;

 for (i = 0; i < 8; ++i, value >>= 1) {
 TM1637_CLK_LOW();
 TM1637_DELAY_US();
 if (value & 0x01) {
 TM1637_DIO_HIGH();
 } else {
 TM1637_DIO_LOW();
 }
 TM1637_CLK_HIGH();
 TM1637_DELAY_US();
 }

 TM1637_CLK_LOW();
 TM1637_DIO_INPUT();
 TM1637_DIO_HIGH();
 TM1637_DELAY_US();
 ack = TM1637_DIO_READ();
 if (ack) {
 TM1637_DIO_OUTPUT();
 TM1637_DIO_LOW();
 }
 TM1637_DELAY_US();
 TM1637_CLK_HIGH();
 TM1637_DELAY_US();
 TM1637_CLK_LOW();
 TM1637_DELAY_US();
 TM1637_DIO_OUTPUT();
 return ack;
}

And here goes .S assembly file:

#define TM1637_DIO_PIN 0b00000001 ; PB0
#define TM1637_CLK_PIN 0b00000010 ; PB1
#define DDRB 0x17
#define PORTB 0x18
#define R16 0x10

.global TM1637_DELAY_US
.global TM1637_start
;.global TM1637_stop

TM1637_DELAY_US: ; 50 us at 1.2 mhz
 ; rcall takes 3 cycles
 ; ret takes 4 cycles
 ; we need another 53 cycles
    ldi  r18, 25 ; 1 cycle
L1: dec  r18 
    breq L1 ; 1 cycle for true and 2 for false
 ret
 
TM1637_start:
 ldi R16, TM1637_CLK_PIN | TM1637_DIO_PIN
 or PORTB, R16 ; HI -> (clk, dio)
 rcall TM1637_DELAY_US + 1 ; wait 50 us
 com R16 
 and PORTB, R16 ; LOW -> (clk, dio)
 rcall TM1637_DELAY_US + 1 ; wait 50 us
 ret

TM1637_stop:
 ldi R16, 0b11111100
 and PORTB, R16 ; LOW -> (clk, dio)
 rcall TM1637_DELAY_US + 1 ; wait 50 us
 com R16
 out PORTB, R16 ; HI -> (clk, dio)
 rcall TM1637_DELAY_US + 1 ; wait 50 us
 ret

Why is this last routine, TM1637_stop: flawed?