angefixt dürfte stimmen.
Die V3 braucht etwas weniger Speicher als das Original und gibt nun auch 0 bei "kaputten" UTF-8 strings retour. Wenn man das return 0 nicht braucht, ist es noch mal etwas kleiner.
Daher gibt's jetzt auch "kaputte" Testdaten, und der Teststring enthält nun 1-, 2-, 3- und 4-Byte-Zeichen.
// https://forum.arduino.cc/t/frage-zu-chararray-strlen-und-umlaut/897224/
// Demo entry point: dumps every byte of a UTF-8 test string (index, raw char,
// hex value) to the serial port and then prints the length reported by
// strlen() and by each of the utf8_strlen variants.
void setup()
{
  Serial.begin(115200);
  Serial.println();
  Serial.println(F("Start..."));

  // Test data - exactly one definition of tak may be active at a time.
  //const char tak[] = "zurück";
  const char tak[] = "y䮀𝄞"; // examples in wikipedia.de
  // NOTE: the array below has no terminating 0x00; append one before enabling it,
  // because everything that follows relies on strlen().
  //const char tak[] {0x79, 0xC3, 0xA4, 0xAE, 0xE2, 0x82, 0xAC, 0xF0, 0x9D, 0x84, 0x9E}; // "missing" start byte for ®

  const size_t byteCount = strlen(tak);

  // One output line per byte: "<index> <raw char> <hex value>"
  uint8_t idx = 0;
  while (idx < byteCount)
  {
    Serial.print(idx);
    Serial.print(" ");
    Serial.print(tak[idx]);
    Serial.print(" ");
    // explicit cast from char to byte, otherwise bytes >= 0x80 are printed sign-extended
    Serial.println((byte)tak[idx], HEX);
    idx++;
  }

  Serial.print(F("strlen ="));
  Serial.println(byteCount);

  Serial.print(F("utf8_strlen ="));
  Serial.println(utf8_strlen(tak));   // 2104/202

  Serial.print(F("utf8_strlenV2="));
  Serial.println(utf8_strlenV2(tak)); // 2096/202

  Serial.print(F("utf8_strlenV3="));
  Serial.println(utf8_strlenV3(tak)); // 2096/202
}
// Intentionally empty: this sketch does all of its work once in setup().
void loop()
{}
// Counts the characters in a UTF-8 encoded C string by looking at the lead
// byte of every sequence and skipping the bytes that belong to it.
// Returns 0 as soon as a byte that cannot start a sequence is found where a
// lead byte is expected. Continuation bytes themselves are skipped unchecked,
// and a sequence cut off by the end of the string still counts as a character.
int utf8_strlen(const char * str) // http://www.zedwood.com/article/cpp-utf8-strlen-function
{
  const int total = strlen(str);
  int count = 0;
  int pos = 0;

  while (pos < total)
  {
    const int lead = (unsigned char) str[pos];
    int width;

    if (lead <= 127)                width = 1; // 0bbbbbbb - plain ASCII
    else if ((lead & 0xE0) == 0xC0) width = 2; // 110bbbbb
    else if ((lead & 0xF0) == 0xE0) width = 3; // 1110bbbb
    else if ((lead & 0xF8) == 0xF0) width = 4; // 11110bbb
    else return 0;                             // invalid utf8 (5/6 byte forms are not accepted)

    pos += width;
    ++count;
  }
  return count;
}
/*
  Calculates the true character length of a UTF-8 C string.

  str     zero-terminated, UTF-8 encoded string
  return  number of sequences found (1 to 4 bytes each)

  Bytes that cannot start a sequence (stray continuation bytes, 0xF8..0xFF)
  are ignored and not counted; this variant never reports an error.
  Continuation bytes are skipped without being checked.
*/
int utf8_strlenV2(const char * str)
{
  const size_t len = strlen(str); // measured once instead of on every loop pass
  int r = 0;                      // result - an int like the return type, so it no longer wraps at 256

  for (size_t i = 0; i < len; i++)
  {
    const unsigned char c = (unsigned char) str[i];
    if (c <= 127) r++;               // 0bbbbbbb - plain ASCII
    else if ((c & 0xE0) == 0xC0)     // 110bbbbb - one continuation byte follows
    {
      i += 1;
      r++;
    }
    else if ((c & 0xF0) == 0xE0)     // 1110bbbb - two continuation bytes follow
    {
      i += 2;
      r++;
    }
    else if ((c & 0xF8) == 0xF0)     // 11110bbb - three continuation bytes follow
    {
      i += 3;
      r++;
    }
    // anything else is not a valid lead byte and is skipped silently
  }
  return r;
}
/*
  Calculates the true character length of a UTF-8 C string.

  str     zero-terminated, UTF-8 encoded string
  return  number of sequences found (1 to 4 bytes each), or 0 as soon as a
          byte that cannot start a sequence is found where a lead byte is
          expected (stray continuation byte, 0xF8..0xFF)

  Each lead byte class is tested with its own mask. A single 0b11111000 mask
  for all classes would only accept lead bytes 0xC0..0xC7 and 0xE0..0xE7 and
  reject valid text such as Cyrillic (0xD0..) or most CJK (0xE8..).
  Continuation bytes are skipped without being checked.
*/
size_t utf8_strlenV3(const char * str)
{
  const size_t len = strlen(str); // measured once instead of on every loop pass
  size_t r = 0;                   // result - same type as the return value, no wrap at 256

  for (size_t i = 0; i < len; i++)
  {
    const unsigned char c = (unsigned char) str[i];
    if (c <= 127) r++;                           // 0bbbbbbb - plain ASCII
    else if ((c & 0b11100000) == 0b11000000)     // 110bbbbb - one sequence byte
    {
      i += 1;
      r++;
    }
    else if ((c & 0b11110000) == 0b11100000)     // 1110bbbb - two sequence bytes
    {
      i += 2;
      r++;
    }
    else if ((c & 0b11111000) == 0b11110000)     // 11110bbb - three sequence bytes
    {
      i += 3;
      r++;
    }
    else                                         // invalid per RFC 3629
    {
      return 0;                                  // void result and exit - if you need the "valid" characters, skip the byte instead
    }
  }
  return r;
}
Start...
0 y 79
1 ⸮ C3
2 ⸮ A4
3 ⸮ C2
4 ⸮ AE
5 ⸮ E2
6 ⸮ 82
7 ⸮ AC
8 ⸮ F0
9 ⸮ 9D
10 ⸮ 84
11 ⸮ 9E
strlen =12
utf8_strlen =5
utf8_strlenV2=5
utf8_strlenV3=5
Lustig auch: Der explizite Cast auf byte (damit die FFFFFF weg sind) spart sogar etwas Flash.
edit...
ein Fass ohne Boden!
Natürlich findet Google kürzere Lösungen als der kleine noiasca da zusammenkleistert...
ohne Failsafe:
/*
  Calculates the true character length of a UTF-8 C string by counting every
  byte that is not a continuation byte (10bbbbbb). There is no validation:
  malformed input never yields an error, stray continuation bytes are simply
  not counted.
  https://stackoverflow.com/questions/4063146/getting-the-actual-length-of-a-utf-8-encoded-stdstring
*/
size_t utf8_strlenV4(const char * str)
{
  int count = 0;
  for (const char * p = str; *p != '\0'; ++p)
  {
    const bool isContinuation = (*p & 0xc0) == 0x80;
    if (!isContinuation) ++count;
  }
  return count;
}