mirror of
https://github.com/WerWolv/ImHex.git
synced 2026-03-27 23:37:05 -05:00
fix: UTF-8 string search not working correctly
This commit is contained in:
@@ -214,15 +214,8 @@ namespace hex::plugin::builtin {
|
||||
return { Occurrence::DecodeType::Binary, std::endian::native };
|
||||
}();
|
||||
|
||||
i64 countedCharacters = 0;
|
||||
u64 startAddress = reader.begin().getAddress();
|
||||
u64 endAddress = reader.end().getAddress();
|
||||
|
||||
u64 progress = 0;
|
||||
u64 codePointWidth = 0;
|
||||
i8 remainingCharacters = 0;
|
||||
for (u8 byte : reader) {
|
||||
bool validChar =
|
||||
const auto validAscii = [&](u8 byte) {
|
||||
return
|
||||
(settings.lowerCaseLetters && std::islower(byte)) ||
|
||||
(settings.upperCaseLetters && std::isupper(byte)) ||
|
||||
(settings.numbers && std::isdigit(byte)) ||
|
||||
@@ -230,50 +223,64 @@ namespace hex::plugin::builtin {
|
||||
(settings.underscores && byte == '_') ||
|
||||
(settings.symbols && std::ispunct(byte) && !std::isspace(byte)) ||
|
||||
(settings.lineFeeds && (byte == '\r' || byte == '\n'));
|
||||
};
|
||||
|
||||
if (settings.type == UTF16LE) {
|
||||
i64 countedCharacters = 0;
|
||||
u64 startAddress = reader.begin().getAddress();
|
||||
u64 endAddress = reader.end().getAddress();
|
||||
|
||||
u64 progress = 0;
|
||||
i8 remainingCharacters = 0;
|
||||
for (const u8 byte : reader) {
|
||||
bool validChar = false;
|
||||
|
||||
if (settings.type == ASCII) {
|
||||
validChar = validAscii(byte);
|
||||
} else if (settings.type == UTF16LE) {
|
||||
// Check if second byte of UTF-16 encoded string is 0x00
|
||||
if (countedCharacters % 2 == 1)
|
||||
validChar = byte == 0x00;
|
||||
else
|
||||
validChar = validAscii(byte);
|
||||
} else if (settings.type == UTF16BE) {
|
||||
// Check if first byte of UTF-16 encoded string is 0x00
|
||||
if (countedCharacters % 2 == 0)
|
||||
validChar = byte == 0x00;
|
||||
else
|
||||
validChar = validAscii(byte);
|
||||
} else if (settings.type == UTF8) {
|
||||
if ((byte & 0b1000'0000) == 0b0000'0000) {
|
||||
// ASCII range
|
||||
codePointWidth = 1;
|
||||
remainingCharacters = 0;
|
||||
validChar = true;
|
||||
} else if ((byte & 0b1100'0000) == 0b1000'0000) {
|
||||
// Continuation mark
|
||||
|
||||
if (remainingCharacters > 0) {
|
||||
remainingCharacters -= 1;
|
||||
if (remainingCharacters > 0) {
|
||||
// Expect continuation byte
|
||||
if ((byte & 0b1100'0000) == 0b1000'0000) {
|
||||
validChar = true;
|
||||
remainingCharacters -= 1;
|
||||
} else {
|
||||
countedCharacters -= std::max<i64>(0, codePointWidth - (remainingCharacters + 1));
|
||||
codePointWidth = 0;
|
||||
validChar = false; // broken sequence
|
||||
remainingCharacters = 0;
|
||||
validChar = false;
|
||||
}
|
||||
} else if ((byte & 0b1110'0000) == 0b1100'0000) {
|
||||
// Two bytes
|
||||
codePointWidth = 2;
|
||||
remainingCharacters = codePointWidth - 1;
|
||||
validChar = true;
|
||||
} else if ((byte & 0b1111'0000) == 0b1110'0000) {
|
||||
// Three bytes
|
||||
codePointWidth = 3;
|
||||
remainingCharacters = codePointWidth - 1;
|
||||
validChar = true;
|
||||
} else if ((byte & 0b1111'1000) == 0b1111'0000) {
|
||||
// Four bytes
|
||||
codePointWidth = 4;
|
||||
remainingCharacters = codePointWidth - 1;
|
||||
validChar = true;
|
||||
} else {
|
||||
validChar = false;
|
||||
// New character
|
||||
if (byte <= 0x7F) {
|
||||
// ASCII
|
||||
validChar = validAscii(byte);
|
||||
remainingCharacters = 0;
|
||||
} else if ((byte & 0b1110'0000) == 0b1100'0000) {
|
||||
// 2-byte start (U+80..U+7FF)
|
||||
validChar = (byte >= 0xC2); // exclude overlongs (0xC0, 0xC1)
|
||||
remainingCharacters = 1;
|
||||
} else if ((byte & 0b1111'0000) == 0b1110'0000) {
|
||||
// 3-byte start (U+800..U+FFFF)
|
||||
validChar = !(byte == 0xE0 || byte == 0xED);
|
||||
// E0 must be followed by >= 0xA0, ED must be <= 0x9F (avoid surrogates)
|
||||
remainingCharacters = 2;
|
||||
} else if ((byte & 0b1111'1000) == 0b1111'0000) {
|
||||
// 4-byte start (U+10000..U+10FFFF)
|
||||
validChar = (byte <= 0xF4); // reject > U+10FFFF
|
||||
remainingCharacters = 3;
|
||||
} else {
|
||||
validChar = false;
|
||||
remainingCharacters = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user