pattern/protobuf: Allow parsing of nested messages (#378)

This primarily enables best-effort recursive parsing of submessages. Note
that the wire format does not record what a LengthDelimited field actually
contains, so its underlying data type can only be guessed. The user
can disable recursive submessage parsing via Settings.

Other minor changes:
* added #pragma MIME and #pragma endian directives
* enabled UTF-8 display for LengthDelimited types
* added signed LEB128 display for Varint types (although this doesn't
  seem to be working on my test case)
* replaced the if/else-if chain with a match statement
* fail upon receiving unknown or unsupported WireType

Co-authored-by: Nik <werwolv98@gmail.com>
This commit is contained in:
Glenn Hartmann
2025-05-10 07:53:01 -04:00
committed by GitHub
parent 7ad9cd4f41
commit 6cadad3d1f

View File

@@ -1,12 +1,25 @@
#pragma author WerWolv
#pragma author WerWolv and Glenn Hartmann
#pragma description Google Protobuf wire encoding (.pb)
#pragma MIME application/protobuf
#pragma MIME application/vnd.google.protobuf
#pragma endian little
import std.core;
import std.io;
import std.mem;
import std.string;
import std.sys;
import type.leb128;
// Attempting to recursively parse submessages is a guess-and-check process
// since it's inherently impossible to tell for sure what type a
// LengthDelimited field is. This could be imprecise and could be slow for
// large or ambiguous files, so we give the user an option to disable it.
//
// Input variable (declared `in`): when true, _LengthDelimitedData skips the
// speculative Message parse and only offers its raw-byte and UTF-8 views.
bool disable_recursive_submessage_parsing in;
// A single u32 that is sealed and rendered through the `format_zigzag32`
// formatter function (that function is not visible in this excerpt).
struct ZigZag32 {
u32 value;
} [[sealed, format("format_zigzag32")]];
@@ -32,7 +45,6 @@ enum WireType : u8 {
_32Bit = 5
};
struct Key {
type::uLEB128 keyDec;
u32 field_number = u32(keyDec) >> 3;
@@ -55,23 +67,55 @@ union _32Bit {
float flt;
};
// Forward declaration: Message refers to Field before Field is defined.
using Field;
// A sequence of Fields covering a region of `Size` bytes.
// Fields keep being parsed until the read cursor has reached
// `addressof(this) + Size`. Nothing in this struct itself prevents the last
// Field from ending beyond that address; users of Message check the consumed
// size separately.
struct Message<auto Size> {
Field fields[while(!std::mem::reached(addressof(this) + Size))];
};
// `Length` chars presented as one string value: the struct is sealed and both
// its display format and its transform use std::string's format_string helper.
struct Utf8String<auto Length> {
char data[Length];
} [[sealed, format("std::string::impl::format_string"), transform("std::string::impl::format_string")]];
// Alternative views over the same `Length` payload bytes (as union members,
// they all start at the same offset): raw bytes, UTF-8 text and, unless
// disable_recursive_submessage_parsing is set, a speculative nested Message.
union _LengthDelimitedData<auto Length> {
u8 bytes[Length];
Utf8String<Length> utf8;
if (!disable_recursive_submessage_parsing) {
// NOTE(review): the try block has no catch; presumably any error raised
// inside it (including the assert below) just discards `msg` and leaves the
// other views in place - confirm against the Pattern Language try semantics.
try {
// Attempt to parse binary data as an embedded Message. This is
// expected to fail often, as the proto format uses LengthDelimited
// for several different data types.
Message<Length> msg;
// Reject the guess unless the nested parse consumed exactly the payload.
std::assert(sizeof(msg) == Length, "Attempted parse of Message consumed wrong number of bytes.");
}
}
};
// Length-prefixed payload: a uLEB128 byte count followed by that many bytes.
struct LengthDelimited {
type::uLEB128 length;
// NOTE(review): `data` is declared twice in this span (here as a char array
// and below as _LengthDelimitedData). This line looks like the pre-change
// declaration shown by the diff view - confirm it is absent from the file.
char data[length];
// Bounds check made before the payload is declared: fails when the current
// offset plus `length` would go past the end of the data.
std::assert($ + length <= std::mem::size(), "Attempting to parse _LengthDelimitedData would exceed file length.");
_LengthDelimitedData<length> data;
};
// Two interpretations of the same varint bytes (union members overlap):
// unsigned LEB128 and signed LEB128.
// NOTE(review): protobuf encodes sint32/sint64 with ZigZag and int32/int64
// negatives as two's-complement varints, not signed LEB128, which may explain
// why the signed view looks wrong - confirm against the protobuf encoding docs.
union _LEB128 {
type::uLEB128 uLEB128;
type::sLEB128 sLEB128; // NOTE: the signed version doesn't seem to be working properly
};
// NOTE(review): this span appears to interleave the pre-change definition
// (`struct Entry` with an if/else-if chain) and the post-change definition
// (`struct Field` with a match) as shown by the diff view - confirm which
// lines belong to the current file.
//
// One wire-format field: a Key followed by a value whose type is selected by
// key.wire_type.
struct Entry {
struct Field {
Key key;
if (key.wire_type == WireType::Varint)
type::uLEB128 value;
else if (key.wire_type == WireType::_64Bit)
_64Bit value;
else if (key.wire_type == WireType::LengthDelimited)
LengthDelimited value;
else if (key.wire_type == WireType::_32Bit)
_32Bit value;
match (key.wire_type) {
(WireType::Varint): _LEB128 value;
(WireType::_64Bit): _64Bit value;
(WireType::LengthDelimited): LengthDelimited value;
(WireType::_32Bit): _32Bit value;
// Group wire types are recognised but not supported: parsing stops here.
(WireType::StartGroup | WireType::EndGroup): std::unimplemented();
// Any other wire type value aborts parsing with an error.
(_): std::error("Unknown WireType.");
}
};
// NOTE(review): the `Entry entries` placement looks like the pre-change line
// shown by the diff view - confirm it is absent from the current file.
Entry entries[while(!std::mem::eof())] @ 0x00;
// Parse the entire input, starting at offset 0, as one top-level Message.
Message<std::mem::size()> msg @ 0x00;
// Fail if parsing stopped before reaching the end of the data.
std::assert(std::mem::eof(), "Parsing did not consume whole file.");