mirror of
https://github.com/WerWolv/ImHex-Patterns.git
synced 2026-03-27 23:37:04 -05:00
* Add pickle pattern file * Add test file * Update README.md --------- Co-authored-by: Nik <werwolv98@gmail.com>
358 lines
15 KiB
Rust
358 lines
15 KiB
Rust
|
|
/*
|
|
References:
|
|
Pickle Source Code:
|
|
https://github.com/python/cpython/blob/main/Lib/pickle.py
|
|
Pickle Protocol Version Breakdown:
|
|
https://docs.python.org/3.13/library/pickle.html#data-stream-format
|
|
Pickle OpCode Breakdown:
|
|
https://github.com/python/cpython/blob/main/Lib/pickletools.py
|
|
*/
|
|
|
|
#pragma author ODeux
|
|
#pragma description Python Binary Object Serialization Protocol
|
|
|
|
#pragma endian little
|
|
|
|
import std.mem;
|
|
import std.string;
|
|
|
|
#pragma array_limit 524288
|
|
|
|
/*
    Abort evaluation with a "TODO" error tagged with the current cursor offset.
    message: free-form description of what is not implemented yet.
    The message is passed as a format argument (not spliced into the format
    string) so that braces inside it cannot be interpreted as placeholders.
*/
fn todo(auto message){
    std::error(std::format("@0x{:08X} TODO: {}", $, message));
};
|
|
|
|
/*
    Formatter/transformer for counted UTF-8 payloads:
    renders the value through std::format and returns the resulting string.
*/
fn utf8_fmt(auto s){
    str rendered = std::format("{}", s);
    return rendered;
};
#define UTF8_FMT format("utf8_fmt"), transform("utf8_fmt")
|
|
|
|
/*
    Formatter/transformer for newline-terminated text arguments:
    drops the last character of the line (the terminator captured by
    readline()) and returns the remainder rendered through std::format.
*/
fn utf8_rl_fmt(auto s){
    auto body_length = std::string::length(s) - 1;
    str line_body = std::string::substr(s, 0, body_length);
    return std::format("{}", line_body);
};
#define UTF8_RL_FMT format("utf8_rl_fmt"), transform("utf8_rl_fmt")
|
|
|
|
/*
    Formatter/transformer for the INT opcode argument.
    The two special lines "01\n" and "00\n" are the bool encodings
    (TRUE == b'I01\n'[1:], FALSE == b'I00\n'[1:]); any other line has its
    last character dropped and is parsed as an integer with base 0.
*/
fn int_rl_fmt(auto s){
    if(s == "01\n"){
        return true;
    } else if(s == "00\n"){
        return false;
    }
    str digits = std::string::substr(s, 0, std::string::length(s) - 1);
    return std::string::parse_int(digits, 0);
};
#define INT_RL_FMT format("int_rl_fmt"), transform("int_rl_fmt")
|
|
|
|
/*
    Formatter/transformer for the FLOAT opcode argument:
    drops the last character of the line and parses the rest as a float.
*/
fn float_rl_fmt(auto s){
    auto body_length = std::string::length(s) - 1;
    str literal = std::string::substr(s, 0, body_length);
    return std::string::parse_float(literal);
};
#define FLOAT_RL_FMT format("float_rl_fmt"), transform("float_rl_fmt")
|
|
|
|
/*
    Formatter/transformer for the LONG opcode argument.
    Mirrors pickle.py:
        val = readline()[:-1]
        if val and val[-1] == b'L'[0]: val = val[:-1]
        int(val, 0)
    i.e. drop the line terminator, drop one optional trailing 'L'
    (Python 2 long suffix), then parse with base 0.
*/
fn long_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    /* std::string::at yields a single character, so compare against a char literal */
    if(new_s != "" && std::string::at(new_s, std::string::length(new_s) - 1) == 'L')
        new_s = std::string::substr(new_s, 0, std::string::length(new_s) - 1);
    return std::string::parse_int(new_s, 0);
};
#define LONG_RL_FMT format("long_rl_fmt"), transform("long_rl_fmt")
|
|
|
|
/*
    Formatter/transformer for newline-terminated ASCII arguments:
    returns the line with its last character (the terminator) removed.
*/
fn ascii_rl_fmt(auto s){
    auto body_length = std::string::length(s) - 1;
    str line_body = std::string::substr(s, 0, body_length);
    return line_body;
};
#define ASCII_RL_FMT format("ascii_rl_fmt"), transform("ascii_rl_fmt")
|
|
|
|
/*
    Formatter/transformer for the memo index argument of GET / PUT.
    pickle.py reads these as int(readline()[:-1]), i.e. plain decimal,
    so the digits are parsed with an explicit base of 10 rather than
    base auto-detection (which would treat a leading "0" as an octal prefix).
*/
fn integer_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    return std::string::parse_int(new_s, 10);
};
#define INTEGER_RL_FMT format("integer_rl_fmt"), transform("integer_rl_fmt")
|
|
|
|
/*
    Formatter/transformer for the STRING opcode argument.
    The line (minus its terminator) must be wrapped in a matching pair of
    single or double quotes; the text between the quotes is returned.
    Raises an evaluation error when the argument is not quoted.
    Note: escape sequences inside the quotes are NOT decoded here.
*/
fn string_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    auto length = std::string::length(new_s);
    if(length >= 2 && new_s[0] == new_s[length - 1] && (new_s[0] == '\'' || new_s[0] == '"'))
        /* substr takes (position, count): skip the opening quote and keep
           length - 2 characters so the closing quote is excluded as well */
        new_s = std::string::substr(new_s, 1, length - 2);
    else std::error("the STRING opcode argument must be quoted");
    return new_s;
};
#define STRING_RL_FMT format("string_rl_fmt"), transform("string_rl_fmt")
|
|
|
|
/*
    One-byte pickle opcodes, grouped by the protocol version that introduced them.
    Printable opcodes are written as character literals, the rest as hex values.
*/
enum OpcodesEnum : u8 {
    /* ---- Protocol 0 / 1 ---- */
    MARK             = '(',  /* push special markobject on stack */
    STOP             = '.',  /* every pickle ends with STOP */
    POP              = '0',  /* discard topmost stack item */
    POP_MARK         = '1',  /* discard stack top through topmost markobject */
    DUP              = '2',  /* duplicate top stack item */
    FLOAT            = 'F',  /* push float object; decimal string argument */
    INT              = 'I',  /* push integer or bool; decimal string argument */
    BININT           = 'J',  /* push four-byte signed int */
    BININT1          = 'K',  /* push 1-byte unsigned int */
    LONG             = 'L',  /* push long; decimal string argument */
    BININT2          = 'M',  /* push 2-byte unsigned int */
    NONE             = 'N',  /* push None */
    PERSID           = 'P',  /* push persistent object; id is taken from string arg */
    BINPERSID        = 'Q',  /* push persistent object; id is taken from stack */
    REDUCE           = 'R',  /* apply callable to argtuple, both on stack */
    STRING           = 'S',  /* push string; NL-terminated string argument */
    BINSTRING        = 'T',  /* push string; counted binary string argument */
    SHORT_BINSTRING  = 'U',  /* push string; counted binary string argument < 256 bytes */
    UNICODE          = 'V',  /* push Unicode string; raw-unicode-escaped'd argument */
    BINUNICODE       = 'X',  /* push Unicode string; counted UTF-8 string argument */
    APPEND           = 'a',  /* append stack top to list below it */
    BUILD            = 'b',  /* call __setstate__ or __dict__.update() */
    GLOBAL           = 'c',  /* push self.find_class(modname, name); 2 string args */
    DICT             = 'd',  /* build a dict from stack items */
    EMPTY_DICT       = '}',  /* push empty dict */
    APPENDS          = 'e',  /* extend list on stack by topmost stack slice */
    GET              = 'g',  /* push item from memo on stack; index is string arg */
    BINGET           = 'h',  /* push item from memo on stack; index is 1-byte arg */
    INST             = 'i',  /* build & push class instance */
    LONG_BINGET      = 'j',  /* push item from memo on stack; index is 4-byte arg */
    LIST             = 'l',  /* build list from topmost stack items */
    EMPTY_LIST       = ']',  /* push empty list */
    OBJ              = 'o',  /* build & push class instance */
    PUT              = 'p',  /* store stack top in memo; index is string arg */
    BINPUT           = 'q',  /* store stack top in memo; index is 1-byte arg */
    LONG_BINPUT      = 'r',  /* store stack top in memo; index is 4-byte arg */
    SETITEM          = 's',  /* add key+value pair to dict */
    TUPLE            = 't',  /* build tuple from topmost stack items */
    EMPTY_TUPLE      = ')',  /* push empty tuple */
    SETITEMS         = 'u',  /* modify dict by adding topmost key+value pairs */
    BINFLOAT         = 'G',  /* push float; arg is 8-byte float encoding */

    /* ---- Protocol 2 ---- */
    PROTO            = 0x80, /* identify pickle protocol */
    NEWOBJ           = 0x81, /* build object by applying cls.__new__ to argtuple */
    EXT1             = 0x82, /* push object from extension registry; 1-byte index */
    EXT2             = 0x83, /* push object from extension registry; 2-byte index */
    EXT4             = 0x84, /* push object from extension registry; 4-byte index */
    TUPLE1           = 0x85, /* build 1-tuple from stack top */
    TUPLE2           = 0x86, /* build 2-tuple from two topmost stack items */
    TUPLE3           = 0x87, /* build 3-tuple from three topmost stack items */
    NEWTRUE          = 0x88, /* push True */
    NEWFALSE         = 0x89, /* push False */
    LONG1            = 0x8A, /* push long from < 256 bytes */
    LONG4            = 0x8B, /* push really big long */

    /* ---- Protocol 3 (Python 3.x) ---- */
    BINBYTES         = 'B',  /* push bytes; counted binary string argument */
    SHORT_BINBYTES   = 'C',  /* push bytes; counted binary string argument < 256 bytes */

    /* ---- Protocol 4 ---- */
    SHORT_BINUNICODE = 0x8C, /* push short string; UTF-8 length < 256 bytes */
    BINUNICODE8      = 0x8D, /* push very long string */
    BINBYTES8        = 0x8E, /* push very long bytes string */
    EMPTY_SET        = 0x8F, /* push empty set on the stack */
    ADDITEMS         = 0x90, /* modify set by adding topmost stack items */
    FROZENSET        = 0x91, /* build frozenset from topmost stack items */
    NEWOBJ_EX        = 0x92, /* like NEWOBJ but work with keyword only arguments */
    STACK_GLOBAL     = 0x93, /* same as GLOBAL but using names on the stacks */
    MEMOIZE          = 0x94, /* store top of the stack in memo */
    FRAME            = 0x95, /* indicate the beginning of a new frame */

    /* ---- Protocol 5 ---- */
    BYTEARRAY8       = 0x96, /* push bytearray */
    NEXT_BUFFER      = 0x97, /* push next out-of-band buffer */
    READONLY_BUFFER  = 0x98  /* make top of stack readonly */
};
|
|
|
|
/*
    Measure the newline-terminated line that starts at the current cursor.
    Scans forward byte by byte from $ until a '\n' is found and returns the
    number of bytes in the line, terminator included. The cursor is not moved.
*/
fn readline(){
    auto offset = 0;
    while(std::mem::read_unsigned($ + offset, 1) != '\n'){
        offset += 1;
    }
    /* + 1 so that the '\n' itself is part of the returned length */
    return offset + 1;
};
|
|
|
|
/*
    One pickle instruction: a one-byte opcode followed by its inline argument,
    if the opcode has one. Opcodes with an empty body take no inline argument.
    Text-mode arguments are newline-terminated lines sized with readline();
    binary arguments are fixed-width or length-prefixed fields.
    STOP issues `break`, which ends the enclosing Opcodes array.
    An opcode value that is not listed raises an evaluation error.
*/
struct Opcodes{
    OpcodesEnum opcode;
    match(opcode){
        (OpcodesEnum::MARK): {}
        (OpcodesEnum::STOP): break;
        (OpcodesEnum::POP): {}
        (OpcodesEnum::POP_MARK): {}
        (OpcodesEnum::DUP): {}
        (OpcodesEnum::FLOAT): {
            char Float[readline()] [[FLOAT_RL_FMT]]; /* float(readline()[:-1]) */
        }
        (OpcodesEnum::INT): {
            /* == TRUE(b'I01\n')[1:], == FALSE(b'I00\n')[1:], int(readline(), 0) */
            char Int[readline()] [[INT_RL_FMT]];
        }
        (OpcodesEnum::BININT): {
            s32 Int;
        }
        (OpcodesEnum::BININT1): {
            u8 Int; /* 1-byte unsigned int */
        }
        (OpcodesEnum::LONG): {
            /* val = readline()[:-1], val = val and val[-1] == b"L"[0] ? val[:-1]: val */
            char Long[readline()] [[LONG_RL_FMT]]; /* int(val, 0) */
        }
        (OpcodesEnum::BININT2): {
            u16 Int;
        }
        (OpcodesEnum::NONE): {}
        (OpcodesEnum::PERSID): {
            char id[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::BINPERSID): {}
        (OpcodesEnum::REDUCE): {}
        /*
        def _decode_string(self, value):
            # Used to allow strings from Python 2 to be decoded either as bytes or Unicode strings.
            # This should be used only with the STRING, BINSTRING and SHORT_BINSTRING opcodes.
            if self.encoding == "bytes":
                return value
            else:
                return value.decode(self.encoding, self.errors)
        */
        (OpcodesEnum::STRING): {
            /* data must be in quotes ("..." or '...'), dataStripped = stripQuote(readline()[:-1]) */
            /* _decode_string(codecs.escape_decode(dataStripped)[0]) */
            char data[readline()] [[STRING_RL_FMT]];
        }
        (OpcodesEnum::BINSTRING): {
            s32 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::SHORT_BINSTRING): {
            u8 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::UNICODE): {
            /*
            "raw-unicode-escape":
                Latin-1 encoding with \uXXXX and \UXXXXXXXX for other code points.
                Existing backslashes are not escaped in any way.
            */
            char data[readline()] [[UTF8_RL_FMT]]; /* str(readline()[:-1], "raw-unicode-escape") */
        }
        (OpcodesEnum::BINUNICODE): {
            u32 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::APPEND): {}
        (OpcodesEnum::BUILD): {}
        (OpcodesEnum::GLOBAL): {
            char module[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
            char name[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
        }
        (OpcodesEnum::DICT): {}
        (OpcodesEnum::EMPTY_DICT): {}
        (OpcodesEnum::APPENDS): {}
        (OpcodesEnum::GET): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINGET): {
            u8 index;
        }
        (OpcodesEnum::INST): {
            char module[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
            char name[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::LONG_BINGET): {
            u32 index;
        }
        (OpcodesEnum::LIST): {}
        (OpcodesEnum::EMPTY_LIST): {}
        (OpcodesEnum::OBJ): {}
        (OpcodesEnum::PUT): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINPUT): {
            u8 index; /* 1-byte unsigned memo index, same width as BINGET */
        }
        (OpcodesEnum::LONG_BINPUT): {
            u32 index;
        }
        (OpcodesEnum::SETITEM): {}
        (OpcodesEnum::TUPLE): {}
        (OpcodesEnum::EMPTY_TUPLE): {}
        (OpcodesEnum::SETITEMS): {}
        (OpcodesEnum::BINFLOAT): {
            be double Double; /* explicitly big-endian, unlike the file-wide little-endian default */
        }
        /* ---- Protocol 2 ---- */
        (OpcodesEnum::PROTO): {
            u8 version;
        }
        (OpcodesEnum::NEWOBJ): {}
        (OpcodesEnum::EXT1): {
            u8 code;
        }
        (OpcodesEnum::EXT2): {
            u16 code;
        }
        (OpcodesEnum::EXT4): {
            s32 code;
        }
        (OpcodesEnum::TUPLE1): {}
        (OpcodesEnum::TUPLE2): {}
        (OpcodesEnum::TUPLE3): {}
        (OpcodesEnum::NEWTRUE): {}
        (OpcodesEnum::NEWFALSE): {}
        /*
        def decode_long(data):
            r"""Decode a long from a two's complement little-endian binary string.
            >>> decode_long(b"") => 0
            >>> decode_long(b"\xff\x00") => 255
            >>> decode_long(b"\xff\x7f") => 32767
            >>> decode_long(b"\x00\xff") => -256
            >>> decode_long(b"\x00\x80") => -32768
            >>> decode_long(b"\x80") => -128
            >>> decode_long(b"\x7f") => 127
            """
            return int.from_bytes(data, byteorder="little", signed=True)
        */
        (OpcodesEnum::LONG1): {
            u8 length;
            u8 data[length]; /* decode_long(data) */
        }
        (OpcodesEnum::LONG4): {
            s32 length;
            u8 data[length]; /* decode_long(data) */
        }
        /* ---- Protocol 3 (Python 3.x) ---- */
        (OpcodesEnum::BINBYTES): {
            u32 length;
            u8 bytes[length];
        }
        (OpcodesEnum::SHORT_BINBYTES): {
            u8 length;
            u8 bytes[length];
        }
        /* ---- Protocol 4 ---- */
        (OpcodesEnum::SHORT_BINUNICODE): {
            u8 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINUNICODE8): {
            u64 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINBYTES8): {
            u64 length;
            u8 bytes[length];
        }
        (OpcodesEnum::EMPTY_SET): {}
        (OpcodesEnum::ADDITEMS): {}
        (OpcodesEnum::FROZENSET): {}
        (OpcodesEnum::NEWOBJ_EX): {}
        (OpcodesEnum::STACK_GLOBAL): {}
        (OpcodesEnum::MEMOIZE): {}
        (OpcodesEnum::FRAME): {
            u64 length;
            /* parse nested opcodes until `length` bytes after the length field are consumed */
            Opcodes opcodes[while($ < addressof(length) + sizeof(length) + length)];
        }
        /* ---- Protocol 5 ---- */
        (OpcodesEnum::BYTEARRAY8): {
            u64 length;
            u8 array[length];
        }
        (OpcodesEnum::NEXT_BUFFER): {}
        (OpcodesEnum::READONLY_BUFFER): {}
        (_): std::error(std::format("Unrecognized {}", opcode));
    }
};
|
|
|
|
/*
    Top-level pickle stream: a sequence of Opcodes entries parsed until the
    end of the data is reached (or until an entry breaks out of the array).
*/
struct Pickle {
    Opcodes opcodes[while(!std::mem::eof())];
};

/* Parse the stream starting at the very first byte of the data. */
Pickle pickle @ 0x0;
|