mirror of
https://github.com/WerWolv/ImHex-Patterns.git
synced 2026-03-27 23:37:04 -05:00
patterns: Added gguf pattern (#235)
* (feat) add gguf parser * (chore) cleanup empty struct * (tests) add sample gguf for tests and update readme * (feat) update type enum
This commit is contained in:
12
README.md
12
README.md
@@ -1,6 +1,7 @@
|
||||
# ImHex Database
|
||||
|
||||
This repository serves as a database for files to use with the [ImHex Hex Editor](https://github.com/WerWolv/ImHex). It currently contains
|
||||
|
||||
- [Patterns](/patterns) - Binary Format definitions for the Pattern Language
|
||||
- [Pattern Libraries](/includes) - Libraries that make using the Pattern Language easier
|
||||
- [Magic Files](/magic) - Custom magic file definitions for the use with libmagic
|
||||
@@ -56,6 +57,7 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
|
||||
| File System | | [`patterns/fs.hexpat`](patterns/fs.hexpat) | Drive File System |
|
||||
| FLAC | `audio/flac` | [`patterns/flac.hexpat`](patterns/flac.hexpat) | Free Lossless Audio Codec, FLAC Audio Format |
|
||||
| GB | `application/x-gameboy-rom` | [`patterns/gb.hexpat`](patterns/gb.hexpat) | Gameboy ROM |
|
||||
| GGUF | | [`patterns/gguf.hexpat`](patterns/gguf.hexpat) | GGML Inference Models |
|
||||
| GIF | `image/gif` | [`patterns/gif.hexpat`](patterns/gif.hexpat) | GIF image files |
|
||||
| GZIP | `application/gzip` | [`patterns/gzip.hexpat`](patterns/gzip.hexpat) | GZip compressed data format |
|
||||
| Halo Bitmap || [`patterns/hinf_bitmap.hexpat`](patterns/hinf_bitmap.hexpat) | Halo Infinite Bitmap tag files |
|
||||
@@ -88,10 +90,10 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
|
||||
| PP | | [`patterns/selinuxpp.hexpat`](patterns/selinuxpp.pat) | SE Linux package |
|
||||
| PFS0 | | [`patterns/pfs0.hexpat`](patterns/pfs0.hexpat) | Nintendo Switch PFS0 archive (NSP files) |
|
||||
| PIF | `image/pif` | [`patterns/pif.hexpat`](patterns/pif.hexpat) | PIF Image Format |
|
||||
| PNG | `image/png` | [`patterns/png.hexpat`](patterns/png.hexpat) | PNG image files |
|
||||
| PRODINFO | | [`patterns/prodinfo.hexpat`](patterns/prodinfo.hexpat) | Nintendo Switch PRODINFO |
|
||||
| Protobuf | | [`patterns/protobuf.hexpat`](patterns/protobuf.hexpat) | Google Protobuf encoding |
|
||||
| PyInstaller | | [`patterns/pyinstaller.hexpat`](patterns/pyinstaller.hexpat) | PyInstaller binary files |
|
||||
| PNG | `image/png` | [`patterns/png.hexpat`](patterns/png.hexpat) | PNG image files |
|
||||
| PRODINFO | | [`patterns/prodinfo.hexpat`](patterns/prodinfo.hexpat) | Nintendo Switch PRODINFO |
|
||||
| Protobuf | | [`patterns/protobuf.hexpat`](patterns/protobuf.hexpat) | Google Protobuf encoding |
|
||||
| PyInstaller | | [`patterns/pyinstaller.hexpat`](patterns/pyinstaller.hexpat) | PyInstaller binary files |
|
||||
| PYC | | [`patterns/pyc.hexpat`](patterns/pyc.hexpat) | Python bytecode files |
|
||||
| QBCL | | [`patterns/qbcl.hexpat`](patterns/qbcl.hexpat) | Qubicle voxel scene project file |
|
||||
| QOI | `image/qoi` | [`patterns/qoi.hexpat`](patterns/qoi.hexpat) | QOI image files |
|
||||
@@ -203,12 +205,14 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
|
||||
> import custom encoding from File -> Import... -> Custom Encoding File
|
||||
|
||||
### Data Processor Nodes
|
||||
|
||||
| Name | Path | Description |
|
||||
|------|------|-------------|
|
||||
| Caesar Cipher | [`nodes/caesar.hexnode`](nodes/caesar.hexnode) | Simple adjustable per-byte Caesar Cipher (ROT) |
|
||||
| XOR Cipher | [`nodes/xor.hexnode`](nodes/xor.hexnode) | XORs an input with a repeating XOR pad |
|
||||
|
||||
### Themes
|
||||
|
||||
| Name | Path | Description |
|
||||
|------|------|-------------|
|
||||
| Visual Studio Dark | [`themes/vs_dark.json`](themes/vs_dark.json) | Theme similar to Visual Studio's Dark theme |
|
||||
|
||||
215
patterns/gguf.hexpat
Normal file
215
patterns/gguf.hexpat
Normal file
@@ -0,0 +1,215 @@
|
||||
// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
|
||||
// https://github.com/openxla/iree/blob/main/runtime/src/iree/io/formats/gguf/gguf_parser.c
|
||||
|
||||
#pragma description GGUF v3 File Format Pattern
#pragma authors @leonjza, jessie @ imhex discord

// Model files carry many metadata entries and tensor records, so allow up
// to 300000 patterns to be created while evaluating this file.
#pragma pattern_limit 300000
|
||||
|
||||
// Element / quantization type of a tensor, stored as a 32-bit value in
// every tensor info record (see gguf_tensor_info_t::type).
enum ggml_type : u32 {
    GGML_TYPE_F32     = 0,
    GGML_TYPE_F16     = 1,
    GGML_TYPE_Q4_0    = 2,
    GGML_TYPE_Q4_1    = 3,
    // 4 (GGML_TYPE_Q4_2) and 5 (GGML_TYPE_Q4_3) are left unassigned:
    // support for them has been removed.
    GGML_TYPE_Q5_0    = 6,
    GGML_TYPE_Q5_1    = 7,
    GGML_TYPE_Q8_0    = 8,
    GGML_TYPE_Q8_1    = 9,
    GGML_TYPE_Q2_K    = 10,
    GGML_TYPE_Q3_K    = 11,
    GGML_TYPE_Q4_K    = 12,
    GGML_TYPE_Q5_K    = 13,
    GGML_TYPE_Q6_K    = 14,
    GGML_TYPE_Q8_K    = 15,
    GGML_TYPE_IQ2_XXS = 16,
    GGML_TYPE_IQ2_XS  = 17,
    GGML_TYPE_IQ3_XXS = 18,
    GGML_TYPE_IQ1_S   = 19,
    GGML_TYPE_IQ4_NL  = 20,
    GGML_TYPE_IQ3_S   = 21,
    GGML_TYPE_IQ2_S   = 22,
    GGML_TYPE_IQ4_XS  = 23,
    GGML_TYPE_I8      = 24,
    GGML_TYPE_I16     = 25,
    GGML_TYPE_I32     = 26,
    GGML_TYPE_I64     = 27,
    GGML_TYPE_F64     = 28,
    GGML_TYPE_IQ1_M   = 29,
    // No explicit value: continues on from the previous entry.
    GGML_TYPE_COUNT,
};
|
||||
|
||||
// Discriminator that precedes every metadata value (and every array
// payload) and selects how the following bytes are interpreted.
enum gguf_metadata_value_type : u32 {
    GGUF_METADATA_VALUE_TYPE_UINT8   = 0,   // 8-bit unsigned integer
    GGUF_METADATA_VALUE_TYPE_INT8    = 1,   // 8-bit signed integer
    GGUF_METADATA_VALUE_TYPE_UINT16  = 2,   // 16-bit unsigned little-endian integer
    GGUF_METADATA_VALUE_TYPE_INT16   = 3,   // 16-bit signed little-endian integer
    GGUF_METADATA_VALUE_TYPE_UINT32  = 4,   // 32-bit unsigned little-endian integer
    GGUF_METADATA_VALUE_TYPE_INT32   = 5,   // 32-bit signed little-endian integer
    GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6,   // 32-bit IEEE754 floating point number

    // Boolean stored in one byte: 0 is false, 1 is true. Any other value is
    // invalid and means either a broken model or a buggy reader.
    GGUF_METADATA_VALUE_TYPE_BOOL    = 7,

    // UTF-8 string without a null terminator; its length is prepended.
    GGUF_METADATA_VALUE_TYPE_STRING  = 8,

    // Array of other values with the element type and length prepended.
    // Arrays can be nested; the length counts elements, not bytes.
    GGUF_METADATA_VALUE_TYPE_ARRAY   = 9,

    GGUF_METADATA_VALUE_TYPE_UINT64  = 10,  // 64-bit unsigned little-endian integer
    GGUF_METADATA_VALUE_TYPE_INT64   = 11,  // 64-bit signed little-endian integer
    GGUF_METADATA_VALUE_TYPE_FLOAT64 = 12,  // 64-bit IEEE754 floating point number
};
|
||||
|
||||
// A string in GGUF.
|
||||
// Length-prefixed GGUF string: a 64-bit byte count followed by that many
// bytes of UTF-8 text. There is no null terminator.
struct gguf_string_t {
    u64 len;            // string length in bytes
    char string[len];   // the UTF-8 text itself
};
|
||||
|
||||
|
||||
// Payload of an array-typed metadata value: the element type, the element
// count, and then `length` elements of that type laid out back to back.
struct gguf_metadata_value_t {
    // Type shared by every element of the array.
    gguf_metadata_value_type type;
    // Number of elements in the array (not the number of bytes).
    u64 length;

    match (type) {
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT8):   u8 value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT8):    s8 value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT16):  u16 value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT16):   s16 value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT32):  u32 value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT32):   s32 value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT32): float value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_BOOL):    bool value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_STRING):  gguf_string_t value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT64):  u64 value[length];
        // Without this case int64 arrays produce no `value` field and every
        // structure after them is read from the wrong offset.
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT64):   s64 value[length];
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT64): double value[length];
        // Nested arrays: each element is itself a (type, length, elements)
        // record, so the struct refers to itself here.
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_ARRAY):   gguf_metadata_value_t value[length];
    }
};
|
||||
|
||||
// A single metadata value: a type tag followed by the value encoded
// according to that tag. Array values delegate to gguf_metadata_value_t,
// which carries its own element type and length.
struct gguf_metadata_value {
    // Selects which of the encodings below follows.
    gguf_metadata_value_type type;

    match (type) {
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT8):   u8 value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT8):    s8 value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT16):  u16 value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT16):   s16 value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT32):  u32 value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT32):   s32 value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT32): float value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_BOOL):    bool value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_STRING):  gguf_string_t value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT64):  u64 value;
        // Without this case an int64 value is skipped entirely and all
        // following key-value pairs are read from the wrong offset.
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT64):   s64 value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT64): double value;
        (gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_ARRAY):   gguf_metadata_value_t value;
    }
};
|
||||
|
||||
// One metadata key-value pair.
struct gguf_metadata_kv_t {
    // Key: a standard GGUF string that must additionally
    //   - be valid ASCII,
    //   - be hierarchical, with `lower_snake_case` segments separated by `.`,
    //   - be at most 2^16-1 (65535) bytes long.
    // Keys that break any of these rules are invalid.
    gguf_string_t key;

    // Value. The value's type tag (one of `gguf_metadata_value_type`) is
    // not a separate field here; it is read as the first member of
    // gguf_metadata_value.
    gguf_metadata_value value;
};
|
||||
|
||||
// File header: magic, format version, the two counts, and the metadata
// key-value pairs themselves.
struct gguf_header_t {
    // Magic number identifying a GGUF file. At the byte level it must be
    // `GGUF`: `0x47` `0x47` `0x55` `0x46`. A reader that loads it as a
    // little-endian 32-bit integer would therefore compare against
    // 0x46554747; be explicit about byte order when checking it.
    u32 magic;

    // Format version. `3` is the version described by the spec this
    // pattern follows, which introduces big-endian support. The version is
    // only bumped for structural changes; non-structural changes are
    // signalled through the metadata instead.
    u32 version;

    // Number of tensors in the file. Kept in the header rather than in the
    // metadata so it is always available when loading the tensors.
    u64 tensor_count;

    // Number of metadata key-value pairs that follow.
    u64 metadata_kv_count;

    // The metadata key-value pairs.
    gguf_metadata_kv_t metadata_kv[metadata_kv_count];
};
|
||||
|
||||
// Describes one tensor: its name, shape, element type and where its data
// lives inside the tensor data section.
struct gguf_tensor_info_t {
    // Tensor name: a standard GGUF string of at most 64 bytes.
    gguf_string_t name;

    // Number of dimensions. Currently at most 4, but this may change.
    u32 n_dimensions;

    // Size of each dimension.
    u64 dimensions[n_dimensions];

    // Element type of the tensor.
    ggml_type type;

    // Byte offset of the tensor's data. It is relative to `tensor_data`,
    // not to the start of the file, which makes files easier to write;
    // readers may want to expose it relative to the file instead.
    // Must be a multiple of `ALIGNMENT`, i.e. `align_offset(offset) == offset`.
    u64 offset;
};
|
||||
|
||||
// Whole-file layout: header, tensor info table, alignment padding, and
// finally the raw tensor data.
struct gguf_file_t {
    // File header, including all metadata key-value pairs.
    gguf_header_t header;

    // One info record per tensor; used to locate the tensor data.
    gguf_tensor_info_t tensor_infos[header.tensor_count];

    // Padding up to the next multiple of `ALIGNMENT`. It is only present
    // when `sizeof(header) + sizeof(tensor_infos)` is not already a
    // multiple of `ALIGNMENT`, and its size is
    // `align_offset(position) - position`, where `position` is the end of
    // `tensor_infos`.
    // NOTE(review): this is an unsized array, which as far as I know ends
    // at the first all-zero entry rather than at the alignment boundary.
    // Confirm the resulting span matches the real padding.
    u8 _padding[];

    // Tensor data: arbitrary binary data holding the model weights. It
    // should be close or identical to the original model's data, but may
    // differ because of quantization or other inference optimizations; such
    // deviations should be recorded in the metadata or the architecture
    // definition.
    //
    // Every tensor's data lives inside this array and is found through its
    // `tensor_infos` entry. Each tensor's offset must be a multiple of
    // `ALIGNMENT`, and gaps between tensors should be padded to
    // `ALIGNMENT` bytes.
    // NOTE(review): also an unsized array; verify it actually spans the
    // rest of the file.
    u8 tensor_data[];
};
|
||||
|
||||
// Place the top-level GGUF file structure at offset 0 of the data.
gguf_file_t GGUF @ 0x00;
|
||||
BIN
tests/patterns/test_data/gguf.hexpat.gguf
Normal file
BIN
tests/patterns/test_data/gguf.hexpat.gguf
Normal file
Binary file not shown.
Reference in New Issue
Block a user