patterns: Added gguf pattern (#235)

* (feat) add gguf parser

* (chore) cleanup empty struct

* (tests) add sample gguf for tests and update readme

* (feat) update type enum
This commit is contained in:
Leon Jacobs
2024-03-28 22:55:18 +02:00
committed by GitHub
parent 0549e62a14
commit 236fadee47
3 changed files with 223 additions and 4 deletions

View File

@@ -1,6 +1,7 @@
# ImHex Database
This repository serves as a database for files to use with the [ImHex Hex Editor](https://github.com/WerWolv/ImHex). It currently contains
- [Patterns](/patterns) - Binary Format definitions for the Pattern Language
- [Pattern Libraries](/includes) - Libraries that make using the Pattern Language easier
- [Magic Files](/magic) - Custom magic file definitions for the use with libmagic
@@ -56,6 +57,7 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
| File System | | [`patterns/fs.hexpat`](patterns/fs.hexpat) | Drive File System |
| FLAC | `audio/flac` | [`patterns/flac.hexpat`](patterns/flac.hexpat) | Free Lossless Audio Codec, FLAC Audio Format |
| GB | `application/x-gameboy-rom` | [`patterns/gb.hexpat`](patterns/gb.hexpat) | Gameboy ROM |
| GGUF | | [`patterns/gguf.hexpat`](patterns/gguf.hexpat) | GGML Inference Models |
| GIF | `image/gif` | [`patterns/gif.hexpat`](patterns/gif.hexpat) | GIF image files |
| GZIP | `application/gzip` | [`patterns/gzip.hexpat`](patterns/gzip.hexpat) | GZip compressed data format |
| Halo Bitmap || [`patterns/hinf_bitmap.hexpat`](patterns/hinf_bitmap.hexpat) | Halo Infinite Bitmap tag files |
@@ -88,10 +90,10 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
| PP | | [`patterns/selinuxpp.hexpat`](patterns/selinuxpp.pat) | SE Linux package |
| PFS0 | | [`patterns/pfs0.hexpat`](patterns/pfs0.hexpat) | Nintendo Switch PFS0 archive (NSP files) |
| PIF | `image/pif` | [`patterns/pif.hexpat`](patterns/pif.hexpat) | PIF Image Format |
| PNG | `image/png` | [`patterns/png.hexpat`](patterns/png.hexpat) | PNG image files |
| PRODINFO | | [`patterns/prodinfo.hexpat`](patterns/prodinfo.hexpat) | Nintendo Switch PRODINFO |
| Protobuf | | [`patterns/protobuf.hexpat`](patterns/protobuf.hexpat) | Google Protobuf encoding |
| PyInstaller | | [`patterns/pyinstaller.hexpat`](patterns/pyinstaller.hexpat) | PyInstaller binray files |
| PNG | `image/png` | [`patterns/png.hexpat`](patterns/png.hexpat) | PNG image files |
| PRODINFO | | [`patterns/prodinfo.hexpat`](patterns/prodinfo.hexpat) | Nintendo Switch PRODINFO |
| Protobuf | | [`patterns/protobuf.hexpat`](patterns/protobuf.hexpat) | Google Protobuf encoding |
| PyInstaller | | [`patterns/pyinstaller.hexpat`](patterns/pyinstaller.hexpat) | PyInstaller binray files |
| PYC | | [`patterns/pyc.hexpat`](patterns/pyc.hexpat) | Python bytecode files |
| QBCL | | [`patterns/qbcl.hexpat`](patterns/qbcl.hexpat) | Qubicle voxel scene project file |
| QOI | `image/qoi` | [`patterns/qoi.hexpat`](patterns/qoi.hexpat) | QOI image files |
@@ -203,12 +205,14 @@ Everything will immediately show up in ImHex's Content Store and gets bundled wi
> import custom encoding from File -> Import... -> Custome Encoding File
### Data Processor Nodes
| Name | Path | Description |
|------|------|-------------|
| Caesar Cipher | [`nodes/caesar.hexnode`](nodes/caesar.hexnode) | Simple adjustable per-byte Caecar Cipher (ROT) |
| XOR Cipher | [`nodes/xor.hexnode`](nodes/xor.hexnode) | XORs a input with a repeating XOR pad |
### Themes
| Name | Path | Description |
|------|------|-------------|
| Visual Studio Dark | [`themes/vs_dark.json`](themes/vs_dark.json) | Theme similar to Visual Studio's Dark theme |

215
patterns/gguf.hexpat Normal file
View File

@@ -0,0 +1,215 @@
// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
// https://github.com/openxla/iree/blob/main/runtime/src/iree/io/formats/gguf/gguf_parser.c
#pragma description GGUF v3 File Format Patter
#pragma authors @leonjza, jessie @ imhex discord
#pragma pattern_limit 300000
enum ggml_type: u32 {
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed
// GGML_TYPE_Q4_3 = 5, support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
GGML_TYPE_Q2_K = 10,
GGML_TYPE_Q3_K = 11,
GGML_TYPE_Q4_K = 12,
GGML_TYPE_Q5_K = 13,
GGML_TYPE_Q6_K = 14,
GGML_TYPE_Q8_K = 15,
GGML_TYPE_IQ2_XXS = 16,
GGML_TYPE_IQ2_XS = 17,
GGML_TYPE_IQ3_XXS = 18,
GGML_TYPE_IQ1_S = 19,
GGML_TYPE_IQ4_NL = 20,
GGML_TYPE_IQ3_S = 21,
GGML_TYPE_IQ2_S = 22,
GGML_TYPE_IQ4_XS = 23,
GGML_TYPE_I8 = 24,
GGML_TYPE_I16 = 25,
GGML_TYPE_I32 = 26,
GGML_TYPE_I64 = 27,
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_COUNT,
};
enum gguf_metadata_value_type: u32 {
// The value is a 8-bit unsigned integer.
GGUF_METADATA_VALUE_TYPE_UINT8 = 0,
// The value is a 8-bit signed integer.
GGUF_METADATA_VALUE_TYPE_INT8 = 1,
// The value is a 16-bit unsigned little-endian integer.
GGUF_METADATA_VALUE_TYPE_UINT16 = 2,
// The value is a 16-bit signed little-endian integer.
GGUF_METADATA_VALUE_TYPE_INT16 = 3,
// The value is a 32-bit unsigned little-endian integer.
GGUF_METADATA_VALUE_TYPE_UINT32 = 4,
// The value is a 32-bit signed little-endian integer.
GGUF_METADATA_VALUE_TYPE_INT32 = 5,
// The value is a 32-bit IEEE754 floating point number.
GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6,
// The value is a boolean.
// 1-byte value where 0 is false and 1 is true.
// Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy.
GGUF_METADATA_VALUE_TYPE_BOOL = 7,
// The value is a UTF-8 non-null-terminated string, with length prepended.
GGUF_METADATA_VALUE_TYPE_STRING = 8,
// The value is an array of other values, with the length and type prepended.
///
// Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes.
GGUF_METADATA_VALUE_TYPE_ARRAY = 9,
// The value is a 64-bit unsigned little-endian integer.
GGUF_METADATA_VALUE_TYPE_UINT64 = 10,
// The value is a 64-bit signed little-endian integer.
GGUF_METADATA_VALUE_TYPE_INT64 = 11,
// The value is a 64-bit IEEE754 floating point number.
GGUF_METADATA_VALUE_TYPE_FLOAT64 = 12,
};
// A string in GGUF.
struct gguf_string_t {
// The length of the string, in bytes.
u64 len;
// The string as a UTF-8 non-null-terminated string.
char string[len];
};
struct gguf_metadata_value_t {
gguf_metadata_value_type type;
u64 length;
match(type) {
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT8): u8 value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT8): s8 value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT16): u16 value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT16): s16 value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT32): u32 value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT32): s32 value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT32): float value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_BOOL): bool value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_STRING): gguf_string_t value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT64): u64 value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT64): double value[length];
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_ARRAY): double value[length];
}
};
struct gguf_metadata_value {
gguf_metadata_value_type type;
match(type) {
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT8): u8 value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT8): s8 value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT16): u16 value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT16): s16 value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT32): u32 value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_INT32): s32 value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT32): float value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_BOOL): bool value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_STRING): gguf_string_t value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_UINT64): u64 value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_FLOAT64): double value;
(gguf_metadata_value_type::GGUF_METADATA_VALUE_TYPE_ARRAY): gguf_metadata_value_t value;
}
};
struct gguf_metadata_kv_t {
// The key of the metadata. It is a standard GGUF string, with the following caveats:
// - It must be a valid ASCII string.
// - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`.
// - It must be at most 2^16-1/65535 bytes long.
// Any keys that do not follow these rules are invalid.
gguf_string_t key;
// The type of the value.
// Must be one of the `gguf_metadata_value_type` values.
// gguf_metadata_value_type value_type;
// The value.
gguf_metadata_value value;
};
struct gguf_header_t {
// Magic number to announce that this is a GGUF file.
// Must be `GGUF` at the byte level: `0x47` `0x47` `0x55` `0x46`.
// Your executor might do little-endian byte order, so it might be
// check for 0x46554747 and letting the endianness cancel out.
// Consider being *very* explicit about the byte order here.
u32 magic;
// The version of the format implemented.
// Must be `3` for version described in this spec, which introduces big-endian support.
//
// This version should only be increased for structural changes to the format.
// Changes that do not affect the structure of the file should instead update the metadata
// to signify the change.
u32 version;
// The number of tensors in the file.
// This is explicit, instead of being included in the metadata, to ensure it is always present
// for loading the tensors.
u64 tensor_count;
// The number of metadata key-value pairs.
u64 metadata_kv_count;
// The metadata key-value pairs.
gguf_metadata_kv_t metadata_kv[metadata_kv_count];
};
struct gguf_tensor_info_t {
// The name of the tensor. It is a standard GGUF string, with the caveat that
// it must be at most 64 bytes long.
gguf_string_t name;
// The number of dimensions in the tensor.
// Currently at most 4, but this may change in the future.
u32 n_dimensions;
// The dimensions of the tensor.
u64 dimensions[n_dimensions];
// The type of the tensor.
ggml_type type;
// The offset of the tensor's data in this file in bytes.
//
// This offset is relative to `tensor_data`, not to the start
// of the file, to make it easier for writers to write the file.
// Readers should consider exposing this offset relative to the
// file to make it easier to read the data.
//
// Must be a multiple of `ALIGNMENT`. That is, `align_offset(offset) == offset`.
u64 offset;
};
struct gguf_file_t {
// The header of the file.
gguf_header_t header;
// Tensor infos, which can be used to locate the tensor data.
gguf_tensor_info_t tensor_infos[header.tensor_count];
// Padding to the nearest multiple of `ALIGNMENT`.
//
// That is, if `sizeof(header) + sizeof(tensor_infos)` is not a multiple of `ALIGNMENT`,
// this padding is added to make it so.
//
// This can be calculated as `align_offset(position) - position`, where `position` is
// the position of the end of `tensor_infos` (i.e. `sizeof(header) + sizeof(tensor_infos)`).
u8 _padding[];
// Tensor data.
//
// This is arbitrary binary data corresponding to the weights of the model. This data should be close
// or identical to the data in the original model file, but may be different due to quantization or
// other optimizations for inference. Any such deviations should be recorded in the metadata or as
// part of the architecture definition.
//
// Each tensor's data must be stored within this array, and located through its `tensor_infos` entry.
// The offset of each tensor's data must be a multiple of `ALIGNMENT`, and the space between tensors
// should be padded to `ALIGNMENT` bytes.
u8 tensor_data[];
};
gguf_file_t GGUF @ 0x00;

Binary file not shown.