diff --git a/README.md b/README.md
index 218bc00..95fc594 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # ImHex Database
 
 This repository serves as a database for files to use with the [ImHex Hex Editor](https://github.com/WerWolv/ImHex). It currently contains
+
 - [Patterns](/patterns) - Binary Format definitions for the Pattern Language
 - [Pattern Libraries](/includes) - Libraries that make using the Pattern Language easier
 - [Magic Files](/magic) - Custom magic file definitions for the use with libmagic
@@ -56,6 +57,7 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
 | File System | | [`patterns/fs.hexpat`](patterns/fs.hexpat) | Drive File System |
 | FLAC | `audio/flac` | [`patterns/flac.hexpat`](patterns/flac.hexpat) | Free Lossless Audio Codec, FLAC Audio Format |
 | GB | `application/x-gameboy-rom` | [`patterns/gb.hexpat`](patterns/gb.hexpat) | Gameboy ROM |
+| GGUF | | [`patterns/gguf.hexpat`](patterns/gguf.hexpat) | GGML Inference Models |
 | GIF | `image/gif` | [`patterns/gif.hexpat`](patterns/gif.hexpat) | GIF image files |
 | GZIP | `application/gzip` | [`patterns/gzip.hexpat`](patterns/gzip.hexpat) | GZip compressed data format |
 | Halo Bitmap || [`patterns/hinf_bitmap.hexpat`](patterns/hinf_bitmap.hexpat) | Halo Infinite Bitmap tag files |
@@ -88,10 +90,10 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
 | PP | | [`patterns/selinuxpp.hexpat`](patterns/selinuxpp.pat) | SE Linux package |
 | PFS0 | | [`patterns/pfs0.hexpat`](patterns/pfs0.hexpat) | Nintendo Switch PFS0 archive (NSP files) |
 | PIF | `image/pif` | [`patterns/pif.hexpat`](patterns/pif.hexpat) | PIF Image Format |
-| PNG | `image/png` | [`patterns/png.hexpat`](patterns/png.hexpat) | PNG image files |
-| PRODINFO | | [`patterns/prodinfo.hexpat`](patterns/prodinfo.hexpat) | Nintendo Switch PRODINFO |
-| Protobuf | | [`patterns/protobuf.hexpat`](patterns/protobuf.hexpat) | Google Protobuf encoding |
-| PyInstaller | | [`patterns/pyinstaller.hexpat`](patterns/pyinstaller.hexpat) | PyInstaller binray files |
+| PNG | `image/png` | [`patterns/png.hexpat`](patterns/png.hexpat) | PNG image files |
+| PRODINFO | | [`patterns/prodinfo.hexpat`](patterns/prodinfo.hexpat) | Nintendo Switch PRODINFO |
+| Protobuf | | [`patterns/protobuf.hexpat`](patterns/protobuf.hexpat) | Google Protobuf encoding |
+| PyInstaller | | [`patterns/pyinstaller.hexpat`](patterns/pyinstaller.hexpat) | PyInstaller binary files |
 | PYC | | [`patterns/pyc.hexpat`](patterns/pyc.hexpat) | Python bytecode files |
 | QBCL | | [`patterns/qbcl.hexpat`](patterns/qbcl.hexpat) | Qubicle voxel scene project file |
 | QOI | `image/qoi` | [`patterns/qoi.hexpat`](patterns/qoi.hexpat) | QOI image files |
@@ -203,12 +205,14 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
 > import custom encoding from File -> Import... -> Custome Encoding File
 
 ### Data Processor Nodes
+
 | Name | Path | Description |
 |------|------|-------------|
 | Caesar Cipher | [`nodes/caesar.hexnode`](nodes/caesar.hexnode) | Simple adjustable per-byte Caecar Cipher (ROT) |
 | XOR Cipher | [`nodes/xor.hexnode`](nodes/xor.hexnode) | XORs a input with a repeating XOR pad |
 
 ### Themes
+
 | Name | Path | Description |
 |------|------|-------------|
 | Visual Studio Dark | [`themes/vs_dark.json`](themes/vs_dark.json) | Theme similar to Visual Studio's Dark theme |
diff --git a/patterns/gguf.hexpat b/patterns/gguf.hexpat
new file mode 100644
index 0000000..0c62e43
--- /dev/null
+++ b/patterns/gguf.hexpat
@@ -0,0 +1,222 @@
+// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+// https://github.com/openxla/iree/blob/main/runtime/src/iree/io/formats/gguf/gguf_parser.c
+
+#pragma description GGUF v3 File Format Pattern
+#pragma authors @leonjza, jessie @ imhex discord
+
+#pragma pattern_limit 300000
+
+enum ggml_type: u32 {
+    GGML_TYPE_F32 = 0,
+    GGML_TYPE_F16 = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
+    // GGML_TYPE_Q4_2 = 4, support has been removed
+    // GGML_TYPE_Q4_3 = 5, support has been removed
+    GGML_TYPE_Q5_0 = 6,
+    GGML_TYPE_Q5_1 = 7,
+    GGML_TYPE_Q8_0 = 8,
+    GGML_TYPE_Q8_1 = 9,
+    GGML_TYPE_Q2_K = 10,
+    GGML_TYPE_Q3_K = 11,
+    GGML_TYPE_Q4_K = 12,
+    GGML_TYPE_Q5_K = 13,
+    GGML_TYPE_Q6_K = 14,
+    GGML_TYPE_Q8_K = 15,
+    GGML_TYPE_IQ2_XXS = 16,
+    GGML_TYPE_IQ2_XS = 17,
+    GGML_TYPE_IQ3_XXS = 18,
+    GGML_TYPE_IQ1_S = 19,
+    GGML_TYPE_IQ4_NL = 20,
+    GGML_TYPE_IQ3_S = 21,
+    GGML_TYPE_IQ2_S = 22,
+    GGML_TYPE_IQ4_XS = 23,
+    GGML_TYPE_I8 = 24,
+    GGML_TYPE_I16 = 25,
+    GGML_TYPE_I32 = 26,
+    GGML_TYPE_I64 = 27,
+    GGML_TYPE_F64 = 28,
+    GGML_TYPE_IQ1_M = 29,
+    GGML_TYPE_COUNT,
+};
+
+enum gguf_metadata_value_type: u32 {
+    // The value is a 8-bit unsigned integer.
+    GGUF_METADATA_VALUE_TYPE_UINT8 = 0,
+    // The value is a 8-bit signed integer.
+    GGUF_METADATA_VALUE_TYPE_INT8 = 1,
+    // The value is a 16-bit unsigned little-endian integer.
+    GGUF_METADATA_VALUE_TYPE_UINT16 = 2,
+    // The value is a 16-bit signed little-endian integer.
+    GGUF_METADATA_VALUE_TYPE_INT16 = 3,
+    // The value is a 32-bit unsigned little-endian integer.
+    GGUF_METADATA_VALUE_TYPE_UINT32 = 4,
+    // The value is a 32-bit signed little-endian integer.
+    GGUF_METADATA_VALUE_TYPE_INT32 = 5,
+    // The value is a 32-bit IEEE754 floating point number.
+    GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6,
+    // The value is a boolean.
+    // 1-byte value where 0 is false and 1 is true.
+    // Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy.
+    GGUF_METADATA_VALUE_TYPE_BOOL = 7,
+    // The value is a UTF-8 non-null-terminated string, with length prepended.
+    GGUF_METADATA_VALUE_TYPE_STRING = 8,
+    // The value is an array of other values, with the length and type prepended.
+    //
+    // Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes.
+    GGUF_METADATA_VALUE_TYPE_ARRAY = 9,
+    // The value is a 64-bit unsigned little-endian integer.
+    GGUF_METADATA_VALUE_TYPE_UINT64 = 10,
+    // The value is a 64-bit signed little-endian integer.
+    GGUF_METADATA_VALUE_TYPE_INT64 = 11,
+    // The value is a 64-bit IEEE754 floating point number.
+    GGUF_METADATA_VALUE_TYPE_FLOAT64 = 12,
+};
+
+// A string in GGUF.
+struct gguf_string_t {
+    // The length of the string, in bytes.
+    u64 len;
+    // The string as a UTF-8 non-null-terminated string.
+    char string[len];
+};
+
+
+// An array value: the element type and element count, followed by `length` elements.
+// Elements carry no type tag of their own; a nested array recurses into this struct.
+struct gguf_metadata_value_t {
+    gguf_metadata_value_type type;
+    u64 length;
+
+    match(type) {
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT8): u8 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT8): s8 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT16): u16 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT16): s16 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT32): u32 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT32): s32 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT32): float value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_BOOL): bool value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_STRING): gguf_string_t value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT64): u64 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT64): s64 value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT64): double value[length];
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_ARRAY): gguf_metadata_value_t value[length];
+    }
+};
+
+// A single metadata value: a type tag followed by the payload that type selects.
+struct gguf_metadata_value {
+    gguf_metadata_value_type type;
+
+    match(type) {
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT8): u8 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT8): s8 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT16): u16 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT16): s16 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT32): u32 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT32): s32 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT32): float value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_BOOL): bool value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_STRING): gguf_string_t value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT64): u64 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT64): s64 value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT64): double value;
+        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_ARRAY): gguf_metadata_value_t value;
+    }
+};
+
+struct gguf_metadata_kv_t {
+    // The key of the metadata. It is a standard GGUF string, with the following caveats:
+    // - It must be a valid ASCII string.
+    // - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`.
+    // - It must be at most 2^16-1/65535 bytes long.
+    // Any keys that do not follow these rules are invalid.
+    gguf_string_t key;
+
+    // The type of the value.
+    // Must be one of the `gguf_metadata_value_type` values.
+    // gguf_metadata_value_type value_type;
+
+    // The value.
+    gguf_metadata_value value;
+};
+
+struct gguf_header_t {
+    // Magic number to announce that this is a GGUF file.
+    // Must be `GGUF` at the byte level: `0x47` `0x47` `0x55` `0x46`.
+    // Your executor might do little-endian byte order, so it might need to
+    // check for 0x46554747 and let the endianness cancel out.
+    // Consider being *very* explicit about the byte order here.
+    u32 magic;
+    // The version of the format implemented.
+    // Must be `3` for version described in this spec, which introduces big-endian support.
+    //
+    // This version should only be increased for structural changes to the format.
+    // Changes that do not affect the structure of the file should instead update the metadata
+    // to signify the change.
+    u32 version;
+    // The number of tensors in the file.
+    // This is explicit, instead of being included in the metadata, to ensure it is always present
+    // for loading the tensors.
+    u64 tensor_count;
+    // The number of metadata key-value pairs.
+    u64 metadata_kv_count;
+    // The metadata key-value pairs.
+    gguf_metadata_kv_t metadata_kv[metadata_kv_count];
+};
+
+struct gguf_tensor_info_t {
+    // The name of the tensor. It is a standard GGUF string, with the caveat that
+    // it must be at most 64 bytes long.
+    gguf_string_t name;
+    // The number of dimensions in the tensor.
+    // Currently at most 4, but this may change in the future.
+    u32 n_dimensions;
+    // The dimensions of the tensor.
+    u64 dimensions[n_dimensions];
+    // The type of the tensor.
+    ggml_type type;
+    // The offset of the tensor's data in this file in bytes.
+    //
+    // This offset is relative to `tensor_data`, not to the start
+    // of the file, to make it easier for writers to write the file.
+    // Readers should consider exposing this offset relative to the
+    // file to make it easier to read the data.
+    //
+    // Must be a multiple of `ALIGNMENT`. That is, `align_offset(offset) == offset`.
+    u64 offset;
+};
+
+struct gguf_file_t {
+    // The header of the file.
+    gguf_header_t header;
+
+    // Tensor infos, which can be used to locate the tensor data.
+    gguf_tensor_info_t tensor_infos[header.tensor_count];
+
+    // Padding to the nearest multiple of `ALIGNMENT`.
+    //
+    // That is, if `sizeof(header) + sizeof(tensor_infos)` is not a multiple of `ALIGNMENT`,
+    // this padding is added to make it so.
+    //
+    // This can be calculated as `align_offset(position) - position`, where `position` is
+    // the position of the end of `tensor_infos` (i.e. `sizeof(header) + sizeof(tensor_infos)`).
+    // NOTE(review): both `_padding` and `tensor_data` are unsized arrays, which the Pattern Language
+    // docs describe as growing until an all-zero entry - likely not the real extents; TODO confirm.
+    u8 _padding[];
+
+    // Tensor data.
+    //
+    // This is arbitrary binary data corresponding to the weights of the model. This data should be close
+    // or identical to the data in the original model file, but may be different due to quantization or
+    // other optimizations for inference. Any such deviations should be recorded in the metadata or as
+    // part of the architecture definition.
+    //
+    // Each tensor's data must be stored within this array, and located through its `tensor_infos` entry.
+    // The offset of each tensor's data must be a multiple of `ALIGNMENT`, and the space between tensors
+    // should be padded to `ALIGNMENT` bytes.
+    u8 tensor_data[];
+};
+
+gguf_file_t GGUF @ 0x00;
\ No newline at end of file
diff --git a/tests/patterns/test_data/gguf.hexpat.gguf b/tests/patterns/test_data/gguf.hexpat.gguf
new file mode 100644
index 0000000..5f59f79
Binary files /dev/null and b/tests/patterns/test_data/gguf.hexpat.gguf differ