For sending commands, you can get away with single-byte transfers. But for data, it is much faster if you can transfer multiple bytes at a time.
Maybe something like:
// Sends `len` bytes from `data` over SPI as a single burst framed by CS_PIN.
//
// CS_PIN is driven HIGH before the first byte and LOW after the last one.
// Every source byte is expanded into three SPI transfers:
//   1. DATA_MSK,
//   2. the byte with its low four bits cleared,
//   3. the byte shifted left by four (low nibble moved to the high position).
//
// data: buffer to send; it is only read here.
// len:  number of bytes to read from `data`.
void glcd_data(uint8_t * data, uint16_t len) {
    digitalWrite(CS_PIN, HIGH);

    const uint8_t * cursor = data;
    uint16_t remaining = len;
    while (remaining > 0) {
        const uint8_t value = *cursor;

        SPI.transfer(DATA_MSK);
        SPI.transfer(value & 0xf0);  // high nibble, in place
        SPI.transfer(value << 4);    // low nibble, moved up

        ++cursor;
        --remaining;
    }

    digitalWrite(CS_PIN, LOW);
}