patterns: Add Python Pickle Pattern (#446)

* Add pickle pattern file

* Add test file

* Update README.md

---------

Co-authored-by: Nik <werwolv98@gmail.com>
This commit is contained in:
ODeux
2025-09-21 11:17:40 +02:00
committed by GitHub
parent 0e67ee102b
commit 7a9a5097a2
3 changed files with 358 additions and 0 deletions

357
patterns/pickle.hexpat Normal file
View File

@@ -0,0 +1,357 @@
/*
References:
Pickle Source Code:
https://github.com/python/cpython/blob/main/Lib/pickle.py
Pickle Protocol Version Breakdown:
https://docs.python.org/3.13/library/pickle.html#data-stream-format
Pickle OpCode Breakdown:
https://github.com/python/cpython/blob/main/Lib/pickletools.py
*/
#pragma author ODeux
#pragma description Python Binary Object Serialization Protocol
#pragma endian little
import std.mem;
import std.string;
#pragma array_limit 524288
/*
    Abort evaluation with a "TODO" error that reports the current cursor offset.
    message: description of the unimplemented feature.
    The message is passed as a format argument rather than concatenated into the
    format string, so braces inside it are never interpreted as placeholders.
*/
fn todo(auto message){
    std::error(std::format("@0x{:08X} TODO: {}", $, message));
};
/*
    Display/transform helper for counted UTF-8 payloads:
    renders the value through std::format and hands back the resulting string.
*/
fn utf8_fmt(auto s){
    str text = std::format("{}", s);
    return text;
};
#define UTF8_FMT format("utf8_fmt"), transform("utf8_fmt")
/*
    Display/transform helper for readline()-sized text arguments:
    drops the final character (the line terminator) and formats what is left.
*/
fn utf8_rl_fmt(auto s){
    auto keep = std::string::length(s) - 1;
    return std::format("{}", std::string::substr(s, 0, keep));
};
#define UTF8_RL_FMT format("utf8_rl_fmt"), transform("utf8_rl_fmt")
/*
    Display/transform helper for the INT opcode argument.
    "01\n" and "00\n" are the textual encodings of True and False
    (TRUE = b'I01\n', FALSE = b'I00\n' without the opcode byte);
    every other line is parsed as an integer with base auto-detection (base 0)
    after the trailing newline has been removed.
*/
fn int_rl_fmt(auto s){
    if(s == "01\n") return true;
    else if(s == "00\n") return false;
    auto keep = std::string::length(s) - 1;
    return std::string::parse_int(std::string::substr(s, 0, keep), 0);
};
#define INT_RL_FMT format("int_rl_fmt"), transform("int_rl_fmt")
/*
    Display/transform helper for the FLOAT opcode argument:
    strips the line terminator and parses the remaining text as a float.
*/
fn float_rl_fmt(auto s){
    auto keep = std::string::length(s) - 1;
    return std::string::parse_float(std::string::substr(s, 0, keep));
};
#define FLOAT_RL_FMT format("float_rl_fmt"), transform("float_rl_fmt")
/*
    Display/transform helper for the LONG opcode argument.
    Removes the line terminator, then an optional trailing "L" suffix,
    and parses the rest as an integer with base auto-detection (base 0).
*/
fn long_rl_fmt(auto s){
    str digits = std::string::substr(s, 0, std::string::length(s) - 1);
    auto count = std::string::length(digits);
    if(digits != "" && std::string::at(digits, count - 1) == "L")
        digits = std::string::substr(digits, 0, count - 1);
    return std::string::parse_int(digits, 0);
};
#define LONG_RL_FMT format("long_rl_fmt"), transform("long_rl_fmt")
/*
    Display/transform helper for ASCII line arguments (PERSID, INST):
    returns the line without its final character (the line terminator).
*/
fn ascii_rl_fmt(auto s){
    auto keep = std::string::length(s) - 1;
    str text = std::string::substr(s, 0, keep);
    return text;
};
#define ASCII_RL_FMT format("ascii_rl_fmt"), transform("ascii_rl_fmt")
/*
    Display/transform helper for memo indices (GET / PUT opcodes).
    pickle.py decodes these with int(readline()[:-1]), i.e. strictly base 10,
    so the text is parsed with an explicit base of 10 instead of base
    auto-detection; a leading "0" must not switch the parser to octal.
    s: the raw line including its trailing newline.
    Returns the parsed integer.
*/
fn integer_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    return std::string::parse_int(new_s, 10);
};
#define INTEGER_RL_FMT format("integer_rl_fmt"), transform("integer_rl_fmt")
/*
    Display/transform helper for the STRING opcode argument.
    s: the raw line including its trailing newline; the payload must be
       wrapped in matching single or double quotes.
    Returns the text between the quotes. Escape sequences are NOT decoded
    (pickle.py additionally runs codecs.escape_decode on the payload).
    Raises an error when the argument is not properly quoted.
*/
fn string_rl_fmt(auto s){
    str new_s = std::string::substr(s, 0, std::string::length(s) - 1);
    auto length = std::string::length(new_s);
    if(length >= 2 && new_s[0] == new_s[length - 1] && (new_s[0] == '\'' || new_s[0] == '"'))
        /* skip the opening quote and take length - 2 characters so the closing quote is dropped too */
        new_s = std::string::substr(new_s, 1, length - 2);
    else std::error("the STRING opcode argument must be quoted");
    return new_s;
};
#define STRING_RL_FMT format("string_rl_fmt"), transform("string_rl_fmt")
/*
    One-byte pickle opcodes, grouped by the protocol version that introduced them.
    Opcodes without a protocol header below (MARK .. BINFLOAT) precede the
    "Protocol 2" group; printable opcodes are written as character literals,
    the later ones as raw byte values.
*/
enum OpcodesEnum: u8{
MARK = '(', /* push special markobject on stack */
STOP = '.', /* every pickle ends with STOP */
POP = '0', /* discard topmost stack item */
POP_MARK = '1', /* discard stack top through topmost markobject */
DUP = '2', /* duplicate top stack item */
FLOAT = 'F', /* push float object; decimal string argument */
INT = 'I', /* push integer or bool; decimal string argument */
BININT = 'J', /* push four-byte signed int */
BININT1 = 'K', /* push 1-byte unsigned int */
LONG = 'L', /* push long; decimal string argument */
BININT2 = 'M', /* push 2-byte unsigned int */
NONE = 'N', /* push None */
PERSID = 'P', /* push persistent object; id is taken from string arg */
BINPERSID = 'Q', /* push persistent object; id is taken from stack */
REDUCE = 'R', /* apply callable to argtuple, both on stack */
STRING = 'S', /* push string; NL-terminated string argument */
BINSTRING = 'T', /* push string; counted binary string argument */
SHORT_BINSTRING = 'U', /* push string; counted binary string argument < 256 bytes */
UNICODE = 'V', /* push Unicode string; raw-unicode-escaped argument */
BINUNICODE = 'X', /* push Unicode string; counted UTF-8 string argument */
APPEND = 'a', /* append stack top to list below it */
BUILD = 'b', /* call __setstate__ or __dict__.update() */
GLOBAL = 'c', /* push self.find_class(modname, name); 2 string args */
DICT = 'd', /* build a dict from stack items */
EMPTY_DICT = '}', /* push empty dict */
APPENDS = 'e', /* extend list on stack by topmost stack slice */
GET = 'g', /* push item from memo on stack; index is string arg */
BINGET = 'h', /* push item from memo on stack; index is 1-byte arg */
INST = 'i', /* build & push class instance */
LONG_BINGET = 'j', /* push item from memo on stack; index is 4-byte arg */
LIST = 'l', /* build list from topmost stack items */
EMPTY_LIST = ']', /* push empty list */
OBJ = 'o', /* build & push class instance */
PUT = 'p', /* store stack top in memo; index is string arg */
BINPUT = 'q', /* store stack top in memo; index is 1-byte arg */
LONG_BINPUT = 'r', /* store stack top in memo; index is 4-byte arg */
SETITEM = 's', /* add key+value pair to dict */
TUPLE = 't', /* build tuple from topmost stack items */
EMPTY_TUPLE = ')', /* push empty tuple */
SETITEMS = 'u', /* modify dict by adding topmost key+value pairs */
BINFLOAT = 'G', /* push float; arg is 8-byte float encoding */
/* ---- Protocol 2 ---- */
PROTO = 0x80, /* identify pickle protocol */
NEWOBJ = 0x81, /* build object by applying cls.__new__ to argtuple */
EXT1 = 0x82, /* push object from extension registry; 1-byte index */
EXT2 = 0x83, /* ditto, but 2-byte index */
EXT4 = 0x84, /* ditto, but 4-byte index */
TUPLE1 = 0x85, /* build 1-tuple from stack top */
TUPLE2 = 0x86, /* build 2-tuple from two topmost stack items */
TUPLE3 = 0x87, /* build 3-tuple from three topmost stack items */
NEWTRUE = 0x88, /* push True */
NEWFALSE = 0x89, /* push False */
LONG1 = 0x8A, /* push long from < 256 bytes */
LONG4 = 0x8B, /* push really big long */
/* ---- Protocol 3 (Python 3.x) ---- */
BINBYTES = 'B', /* push bytes; counted binary string argument */
SHORT_BINBYTES = 'C', /* push bytes; counted binary string argument < 256 bytes */
/* ---- Protocol 4 ---- */
SHORT_BINUNICODE = 0x8C, /* push short string; UTF-8 length < 256 bytes */
BINUNICODE8 = 0x8D, /* push very long string */
BINBYTES8 = 0x8E, /* push very long bytes string */
EMPTY_SET = 0x8F, /* push empty set on the stack */
ADDITEMS = 0x90, /* modify set by adding topmost stack items */
FROZENSET = 0x91, /* build frozenset from topmost stack items */
NEWOBJ_EX = 0x92, /* like NEWOBJ but work with keyword only arguments */
STACK_GLOBAL = 0x93, /* same as GLOBAL but using names on the stacks */
MEMOIZE = 0x94, /* store top of the stack in memo */
FRAME = 0x95, /* indicate the beginning of a new frame */
/* ---- Protocol 5 ---- */
BYTEARRAY8 = 0x96, /* push bytearray */
NEXT_BUFFER = 0x97, /* push next out-of-band buffer */
READONLY_BUFFER = 0x98 /* make top of stack readonly */
};
/*
    Measure the line that starts at the current cursor ($).
    Scans forward one byte at a time until a '\n' is found and returns the
    number of bytes in the line, terminator included. The cursor is not moved.
*/
fn readline(){
    auto length = 1;
    /* the byte under test is always the last one of the candidate line */
    while(std::mem::read_unsigned($ + length - 1, 1) != '\n') length += 1;
    return length;
};
/*
    A single pickle instruction: the one-byte opcode followed by its inline
    argument, if it has one. Opcodes that only manipulate the unpickler stack
    carry no argument and map to an empty case.
    Text arguments are newline-terminated and sized with readline(); the *_FMT
    macros attach the matching decode helper. Binary integers follow the
    file-wide little-endian pragma unless declared otherwise.
    STOP issues `break`, ending the array this instruction is parsed in.
    An opcode byte that is not part of OpcodesEnum raises an error.
*/
struct Opcodes{
    OpcodesEnum opcode;
    match(opcode){
        (OpcodesEnum::MARK): {}
        (OpcodesEnum::STOP): break;
        (OpcodesEnum::POP): {}
        (OpcodesEnum::POP_MARK): {}
        (OpcodesEnum::DUP): {}
        (OpcodesEnum::FLOAT): {
            char Float[readline()] [[FLOAT_RL_FMT]]; /* float(readline()[:-1]) */
        }
        (OpcodesEnum::INT): {
            /* == TRUE(b'I01\n')[1:], == FALSE(b'I00\n')[1:], int(readline(), 0) */
            char Int[readline()] [[INT_RL_FMT]];
        }
        (OpcodesEnum::BININT): {
            s32 Int;
        }
        (OpcodesEnum::BININT1): {
            u8 Int; /* 1-byte unsigned int: pickle.py pushes self.read(1)[0] */
        }
        (OpcodesEnum::LONG): {
            /* val = readline()[:-1], val = val and val[-1] == b"L"[0] ? val[:-1]: val */
            char Long[readline()] [[LONG_RL_FMT]]; /* int(val, 0) */
        }
        (OpcodesEnum::BININT2): {
            u16 Int;
        }
        (OpcodesEnum::NONE): {}
        (OpcodesEnum::PERSID): {
            char id[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::BINPERSID): {}
        (OpcodesEnum::REDUCE): {}
        /*
        def _decode_string(self, value):
            # Used to allow strings from Python 2 to be decoded either as bytes or Unicode strings.
            # This should be used only with the STRING, BINSTRING and SHORT_BINSTRING opcodes.
            if self.encoding == "bytes":
                return value
            else:
                return value.decode(self.encoding, self.errors)
        */
        (OpcodesEnum::STRING): {
            /* data must be in quotes ("..." or '...'), dataStripped = stripQuote(readline()[:-1]) */
            /* _decode_string(codecs.escape_decode(dataStripped)[0]) */
            char data[readline()] [[STRING_RL_FMT]];
        }
        (OpcodesEnum::BINSTRING): {
            s32 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::SHORT_BINSTRING): {
            u8 length;
            char data[length]; /* _decode_string(data) */
        }
        (OpcodesEnum::UNICODE): {
            /*
            "raw-unicode-escape":
            Latin-1 encoding with \uXXXX and \UXXXXXXXX for other code points.
            Existing backslashes are not escaped in any way.
            */
            char data[readline()] [[UTF8_RL_FMT]]; /* str(readline()[:-1], "raw-unicode-escape") */
        }
        (OpcodesEnum::BINUNICODE): {
            u32 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::APPEND): {}
        (OpcodesEnum::BUILD): {}
        (OpcodesEnum::GLOBAL): {
            char module[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
            char name[readline()] [[UTF8_RL_FMT]]; /* readline()[:-1].decode("utf-8") */
        }
        (OpcodesEnum::DICT): {}
        (OpcodesEnum::EMPTY_DICT): {}
        (OpcodesEnum::APPENDS): {}
        (OpcodesEnum::GET): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINGET): {
            u8 index;
        }
        (OpcodesEnum::INST): {
            char module[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
            char name[readline()] [[ASCII_RL_FMT]]; /* readline()[:-1].decode("ascii") */
        }
        (OpcodesEnum::LONG_BINGET): {
            u32 index;
        }
        (OpcodesEnum::LIST): {}
        (OpcodesEnum::EMPTY_LIST): {}
        (OpcodesEnum::OBJ): {}
        (OpcodesEnum::PUT): {
            char index[readline()] [[INTEGER_RL_FMT]]; /* int(readline()[:-1]) */
        }
        (OpcodesEnum::BINPUT): {
            u8 index; /* 1-byte unsigned memo index, same width and signedness as BINGET */
        }
        (OpcodesEnum::LONG_BINPUT): {
            u32 index;
        }
        (OpcodesEnum::SETITEM): {}
        (OpcodesEnum::TUPLE): {}
        (OpcodesEnum::EMPTY_TUPLE): {}
        (OpcodesEnum::SETITEMS): {}
        (OpcodesEnum::BINFLOAT): {
            be double Double; /* explicitly big-endian, overriding the file-wide little-endian pragma */
        }
        /* ---- Protocol 2 ---- */
        (OpcodesEnum::PROTO): {
            u8 version;
        }
        (OpcodesEnum::NEWOBJ): {}
        (OpcodesEnum::EXT1): {
            u8 code;
        }
        (OpcodesEnum::EXT2): {
            u16 code;
        }
        (OpcodesEnum::EXT4): {
            s32 code;
        }
        (OpcodesEnum::TUPLE1): {}
        (OpcodesEnum::TUPLE2): {}
        (OpcodesEnum::TUPLE3): {}
        (OpcodesEnum::NEWTRUE): {}
        (OpcodesEnum::NEWFALSE): {}
        /*
        def decode_long(data):
            r"""Decode a long from a two's complement little-endian binary string.
            >>> decode_long(b"") => 0
            >>> decode_long(b"\xff\x00") => 255
            >>> decode_long(b"\xff\x7f") => 32767
            >>> decode_long(b"\x00\xff") => -256
            >>> decode_long(b"\x00\x80") => -32768
            >>> decode_long(b"\x80") => -128
            >>> decode_long(b"\x7f") => 127
            """
            return int.from_bytes(data, byteorder="little", signed=True)
        */
        (OpcodesEnum::LONG1): {
            u8 length;
            u8 data[length]; /* decode_long(data) */
        }
        (OpcodesEnum::LONG4): {
            s32 length;
            u8 data[length]; /* decode_long(data) */
        }
        /* ---- Protocol 3 (Python 3.x) ---- */
        (OpcodesEnum::BINBYTES): {
            u32 length;
            u8 bytes[length];
        }
        (OpcodesEnum::SHORT_BINBYTES): {
            u8 length;
            u8 bytes[length];
        }
        /* ---- Protocol 4 ---- */
        (OpcodesEnum::SHORT_BINUNICODE): {
            u8 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINUNICODE8): {
            u64 length;
            char data[length] [[UTF8_FMT]]; /* str(data, "utf-8", "surrogatepass") */
        }
        (OpcodesEnum::BINBYTES8): {
            u64 length;
            u8 bytes[length];
        }
        (OpcodesEnum::EMPTY_SET): {}
        (OpcodesEnum::ADDITEMS): {}
        (OpcodesEnum::FROZENSET): {}
        (OpcodesEnum::NEWOBJ_EX): {}
        (OpcodesEnum::STACK_GLOBAL): {}
        (OpcodesEnum::MEMOIZE): {}
        (OpcodesEnum::FRAME): {
            u64 length;
            /* parse nested instructions until the cursor reaches the end of the frame payload */
            Opcodes opcodes[while($ < addressof(length) + sizeof(length) + length)];
        }
        /* ---- Protocol 5 ---- */
        (OpcodesEnum::BYTEARRAY8): {
            u64 length;
            u8 array[length];
        }
        (OpcodesEnum::NEXT_BUFFER): {}
        (OpcodesEnum::READONLY_BUFFER): {}
        (_): std::error(std::format("Unrecognized {}", opcode));
    }
};
/*
    Top-level container: a flat sequence of Opcodes entries.
    The array keeps growing until std::mem::eof() reports true; the STOP case
    inside Opcodes issues `break`, which presumably also ends this array early
    (confirm against the Pattern Language semantics of `break`).
*/
struct Pickle{
Opcodes opcodes[while(!std::mem::eof())];
};
/* Place the pattern at offset 0x0. */
Pickle pickle @ 0x0;