fix: UTF-8 string search not working correctly

2026-03-27 23:37:05 -05:00 · 2025-08-18 20:03:45 +02:00
parent 8b384ef1f9
commit 35a357f1a8
1 changed files with 46 additions and 39 deletions
--- a/plugins/builtin/source/content/views/view_find.cpp
+++ b/plugins/builtin/source/content/views/view_find.cpp
@@ -214,15 +214,8 @@ namespace hex::plugin::builtin {
                return { Occurrence::DecodeType::Binary, std::endian::native };
        }();

-        i64 countedCharacters = 0;
-        u64 startAddress = reader.begin().getAddress();
-        u64 endAddress = reader.end().getAddress();
-
-        u64 progress = 0;
-        u64 codePointWidth = 0;
-        i8 remainingCharacters = 0;
-        for (u8 byte : reader) {
-            bool validChar =
+        const auto validAscii = [&](u8 byte) {
+            return
                (settings.lowerCaseLetters    && std::islower(byte))  ||
                (settings.upperCaseLetters    && std::isupper(byte))  ||
                (settings.numbers             && std::isdigit(byte))  ||
@@ -230,50 +223,64 @@ namespace hex::plugin::builtin {
                (settings.underscores         && byte == '_')             ||
                (settings.symbols             && std::ispunct(byte) && !std::isspace(byte))  ||
                (settings.lineFeeds           && (byte == '\r' || byte == '\n'));
+        };

-            if (settings.type == UTF16LE) {
+        i64 countedCharacters = 0;
+        u64 startAddress = reader.begin().getAddress();
+        u64 endAddress = reader.end().getAddress();
+
+        u64 progress = 0;
+        i8 remainingCharacters = 0;
+        for (const u8 byte : reader) {
+            bool validChar = false;
+
+            if (settings.type == ASCII) {
+                validChar = validAscii(byte);
+            } else if (settings.type == UTF16LE) {
                // Check if second byte of UTF-16 encoded string is 0x00
                if (countedCharacters % 2 == 1)
                    validChar = byte == 0x00;
+                else
+                    validChar = validAscii(byte);
            } else if (settings.type == UTF16BE) {
                // Check if first byte of UTF-16 encoded string is 0x00
                if (countedCharacters % 2 == 0)
                    validChar = byte == 0x00;
+                else
+                    validChar = validAscii(byte);
            } else if (settings.type == UTF8) {
-                if ((byte & 0b1000'0000) == 0b0000'0000) {
-                    // ASCII range
-                    codePointWidth = 1;
-                    remainingCharacters = 0;
-                    validChar = true;
-                } else if ((byte & 0b1100'0000) == 0b1000'0000) {
-                    // Continuation mark
-
-                    if (remainingCharacters > 0) {
-                        remainingCharacters -= 1;
+                if (remainingCharacters > 0) {
+                    // Expect continuation byte
+                    if ((byte & 0b1100'0000) == 0b1000'0000) {
                        validChar = true;
+                        remainingCharacters -= 1;
                    } else {
-                        countedCharacters -= std::max<i64>(0, codePointWidth - (remainingCharacters + 1));
-                        codePointWidth = 0;
+                        validChar = false; // broken sequence
                        remainingCharacters = 0;
-                        validChar = false;
                    }
-                } else if ((byte & 0b1110'0000) == 0b1100'0000) {
-                    // Two bytes
-                    codePointWidth = 2;
-                    remainingCharacters = codePointWidth - 1;
-                    validChar = true;
-                } else if ((byte & 0b1111'0000) == 0b1110'0000) {
-                    // Three bytes
-                    codePointWidth = 3;
-                    remainingCharacters = codePointWidth - 1;
-                    validChar = true;
-                } else if ((byte & 0b1111'1000) == 0b1111'0000) {
-                    // Four bytes
-                    codePointWidth = 4;
-                    remainingCharacters = codePointWidth - 1;
-                    validChar = true;
                } else {
-                    validChar = false;
+                    // New character
+                    if (byte <= 0x7F) {
+                        // ASCII
+                        validChar = validAscii(byte);
+                        remainingCharacters = 0;
+                    } else if ((byte & 0b1110'0000) == 0b1100'0000) {
+                        // 2-byte start (U+80..U+7FF)
+                        validChar = (byte >= 0xC2); // exclude overlongs (0xC0, 0xC1)
+                        remainingCharacters = 1;
+                    } else if ((byte & 0b1111'0000) == 0b1110'0000) {
+                        // 3-byte start (U+800..U+FFFF)
+                        validChar = !(byte == 0xE0 || byte == 0xED);
+                        // E0 must be followed by >= 0xA0, ED must be <= 0x9F (avoid surrogates)
+                        remainingCharacters = 2;
+                    } else if ((byte & 0b1111'1000) == 0b1111'0000) {
+                        // 4-byte start (U+10000..U+10FFFF)
+                        validChar = (byte <= 0xF4); // reject > U+10FFFF
+                        remainingCharacters = 3;
+                    } else {
+                        validChar = false;
+                        remainingCharacters = 0;
+                    }
                }
            }