fix: UTF-8 string search not working correctly

This commit is contained in:
WerWolv
2025-08-18 20:03:45 +02:00
parent 8b384ef1f9
commit 35a357f1a8

View File

@@ -214,15 +214,8 @@ namespace hex::plugin::builtin {
return { Occurrence::DecodeType::Binary, std::endian::native };
}();
i64 countedCharacters = 0;
u64 startAddress = reader.begin().getAddress();
u64 endAddress = reader.end().getAddress();
u64 progress = 0;
u64 codePointWidth = 0;
i8 remainingCharacters = 0;
for (u8 byte : reader) {
bool validChar =
const auto validAscii = [&](u8 byte) {
return
(settings.lowerCaseLetters && std::islower(byte)) ||
(settings.upperCaseLetters && std::isupper(byte)) ||
(settings.numbers && std::isdigit(byte)) ||
@@ -230,50 +223,64 @@ namespace hex::plugin::builtin {
(settings.underscores && byte == '_') ||
(settings.symbols && std::ispunct(byte) && !std::isspace(byte)) ||
(settings.lineFeeds && (byte == '\r' || byte == '\n'));
};
if (settings.type == UTF16LE) {
i64 countedCharacters = 0;
u64 startAddress = reader.begin().getAddress();
u64 endAddress = reader.end().getAddress();
u64 progress = 0;
i8 remainingCharacters = 0;
for (const u8 byte : reader) {
bool validChar = false;
if (settings.type == ASCII) {
validChar = validAscii(byte);
} else if (settings.type == UTF16LE) {
// Check if second byte of UTF-16 encoded string is 0x00
if (countedCharacters % 2 == 1)
validChar = byte == 0x00;
else
validChar = validAscii(byte);
} else if (settings.type == UTF16BE) {
// Check if first byte of UTF-16 encoded string is 0x00
if (countedCharacters % 2 == 0)
validChar = byte == 0x00;
else
validChar = validAscii(byte);
} else if (settings.type == UTF8) {
if ((byte & 0b1000'0000) == 0b0000'0000) {
// ASCII range
codePointWidth = 1;
remainingCharacters = 0;
validChar = true;
} else if ((byte & 0b1100'0000) == 0b1000'0000) {
// Continuation mark
if (remainingCharacters > 0) {
remainingCharacters -= 1;
if (remainingCharacters > 0) {
// Expect continuation byte
if ((byte & 0b1100'0000) == 0b1000'0000) {
validChar = true;
remainingCharacters -= 1;
} else {
countedCharacters -= std::max<i64>(0, codePointWidth - (remainingCharacters + 1));
codePointWidth = 0;
validChar = false; // broken sequence
remainingCharacters = 0;
validChar = false;
}
} else if ((byte & 0b1110'0000) == 0b1100'0000) {
// Two bytes
codePointWidth = 2;
remainingCharacters = codePointWidth - 1;
validChar = true;
} else if ((byte & 0b1111'0000) == 0b1110'0000) {
// Three bytes
codePointWidth = 3;
remainingCharacters = codePointWidth - 1;
validChar = true;
} else if ((byte & 0b1111'1000) == 0b1111'0000) {
// Four bytes
codePointWidth = 4;
remainingCharacters = codePointWidth - 1;
validChar = true;
} else {
validChar = false;
// New character
if (byte <= 0x7F) {
// ASCII
validChar = validAscii(byte);
remainingCharacters = 0;
} else if ((byte & 0b1110'0000) == 0b1100'0000) {
// 2-byte start (U+80..U+7FF)
validChar = (byte >= 0xC2); // exclude overlongs (0xC0, 0xC1)
remainingCharacters = 1;
} else if ((byte & 0b1111'0000) == 0b1110'0000) {
// 3-byte start (U+800..U+FFFF)
validChar = !(byte == 0xE0 || byte == 0xED);
// E0 must be followed by >= 0xA0, ED must be <= 0x9F (avoid surrogates)
remainingCharacters = 2;
} else if ((byte & 0b1111'1000) == 0b1111'0000) {
// 4-byte start (U+10000..U+10FFFF)
validChar = (byte <= 0xF4); // reject > U+10FFFF
remainingCharacters = 3;
} else {
validChar = false;
remainingCharacters = 0;
}
}
}