diff --git a/plugins/builtin/source/content/views/view_find.cpp b/plugins/builtin/source/content/views/view_find.cpp index 96ce4f887..59d22b71c 100644 --- a/plugins/builtin/source/content/views/view_find.cpp +++ b/plugins/builtin/source/content/views/view_find.cpp @@ -214,15 +214,8 @@ namespace hex::plugin::builtin { return { Occurrence::DecodeType::Binary, std::endian::native }; }(); - i64 countedCharacters = 0; - u64 startAddress = reader.begin().getAddress(); - u64 endAddress = reader.end().getAddress(); - - u64 progress = 0; - u64 codePointWidth = 0; - i8 remainingCharacters = 0; - for (u8 byte : reader) { - bool validChar = + const auto validAscii = [&](u8 byte) { + return (settings.lowerCaseLetters && std::islower(byte)) || (settings.upperCaseLetters && std::isupper(byte)) || (settings.numbers && std::isdigit(byte)) || @@ -230,50 +223,64 @@ namespace hex::plugin::builtin { (settings.underscores && byte == '_') || (settings.symbols && std::ispunct(byte) && !std::isspace(byte)) || (settings.lineFeeds && (byte == '\r' || byte == '\n')); + }; - if (settings.type == UTF16LE) { + i64 countedCharacters = 0; + u64 startAddress = reader.begin().getAddress(); + u64 endAddress = reader.end().getAddress(); + + u64 progress = 0; + i8 remainingCharacters = 0; + for (const u8 byte : reader) { + bool validChar = false; + + if (settings.type == ASCII) { + validChar = validAscii(byte); + } else if (settings.type == UTF16LE) { // Check if second byte of UTF-16 encoded string is 0x00 if (countedCharacters % 2 == 1) validChar = byte == 0x00; + else + validChar = validAscii(byte); } else if (settings.type == UTF16BE) { // Check if first byte of UTF-16 encoded string is 0x00 if (countedCharacters % 2 == 0) validChar = byte == 0x00; + else + validChar = validAscii(byte); } else if (settings.type == UTF8) { - if ((byte & 0b1000'0000) == 0b0000'0000) { - // ASCII range - codePointWidth = 1; - remainingCharacters = 0; - validChar = true; - } else if ((byte & 0b1100'0000) == 0b1000'0000) { - // Continuation mark - - if (remainingCharacters > 0) { - remainingCharacters -= 1; + if (remainingCharacters > 0) { + // Expect continuation byte + if ((byte & 0b1100'0000) == 0b1000'0000) { validChar = true; + remainingCharacters -= 1; } else { - countedCharacters -= std::max(0, codePointWidth - (remainingCharacters + 1)); - codePointWidth = 0; + validChar = false; // broken sequence remainingCharacters = 0; - validChar = false; } - } else if ((byte & 0b1110'0000) == 0b1100'0000) { - // Two bytes - codePointWidth = 2; - remainingCharacters = codePointWidth - 1; - validChar = true; - } else if ((byte & 0b1111'0000) == 0b1110'0000) { - // Three bytes - codePointWidth = 3; - remainingCharacters = codePointWidth - 1; - validChar = true; - } else if ((byte & 0b1111'1000) == 0b1111'0000) { - // Four bytes - codePointWidth = 4; - remainingCharacters = codePointWidth - 1; - validChar = true; } else { - validChar = false; + // New character + if (byte <= 0x7F) { + // ASCII + validChar = validAscii(byte); + remainingCharacters = 0; + } else if ((byte & 0b1110'0000) == 0b1100'0000) { + // 2-byte start (U+80..U+7FF) + validChar = (byte >= 0xC2); // exclude overlongs (0xC0, 0xC1) + remainingCharacters = 1; + } else if ((byte & 0b1111'0000) == 0b1110'0000) { + // 3-byte start (U+800..U+FFFF) + validChar = !(byte == 0xE0 || byte == 0xED); + // E0 must be followed by >= 0xA0, ED must be <= 0x9F (avoid surrogates) + remainingCharacters = 2; + } else if ((byte & 0b1111'1000) == 0b1111'0000) { + // 4-byte start (U+10000..U+10FFFF) + validChar = (byte <= 0xF4); // reject > U+10FFFF + remainingCharacters = 3; + } else { + validChar = false; + remainingCharacters = 0; + } } }