Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2374094
Use Flexbuffer
FTRobbin Feb 23, 2026
dae76d2
Implement SizeReport
FTRobbin Mar 3, 2026
156f463
Dig deeper into the size blowup
FTRobbin Mar 3, 2026
92cc333
Serialize span into unit
FTRobbin Mar 3, 2026
c37fd3a
Add control for how much size information to output
FTRobbin Mar 3, 2026
a099964
Merge remote-tracking branch 'origin' into haobin-mining
FTRobbin Mar 4, 2026
4234f79
Extract experiment runs
FTRobbin Mar 5, 2026
9c85469
Tweak nightly frontend to display extract experiment results
FTRobbin Mar 5, 2026
54533db
Show egraph size in size report
FTRobbin Mar 5, 2026
dcf81e5
Add include ser time option, add a speedup graph
FTRobbin Mar 5, 2026
41a6fe8
Merge remote-tracking branch 'origin' into haobin-mining
FTRobbin Mar 5, 2026
63d2be2
fmt
FTRobbin Mar 5, 2026
c54b1a2
Skip tests because containers are not yet supported
FTRobbin Mar 5, 2026
c15978f
Merge remote-tracking branch 'origin' into haobin-mining
FTRobbin Mar 5, 2026
85dcdcf
Comment local dev setup
FTRobbin Mar 5, 2026
1d46162
Output a csv file with serialization size data
FTRobbin Mar 6, 2026
a575829
fmt
FTRobbin Mar 6, 2026
96ea226
Hacks
FTRobbin Mar 6, 2026
53cb8f8
fmt
FTRobbin Mar 6, 2026
78f79fb
More more evil hacks
FTRobbin Mar 7, 2026
41742d6
Remove Easteregg from the list of experiments
FTRobbin Mar 7, 2026
35fa1d9
Clean up evil hacks
FTRobbin Mar 26, 2026
a939da6
fmt
FTRobbin Mar 26, 2026
9b6eaf1
Merge remote-tracking branch 'origin' into haobin-mining
FTRobbin Mar 26, 2026
0e3ffb4
fmt
FTRobbin Mar 26, 2026
eeac398
Fix the rebuilding bug
FTRobbin Apr 1, 2026
381b1ea
Added `stabilize` to prevent stale rows in serialization
FTRobbin Apr 2, 2026
22361fc
Typo
FTRobbin Apr 2, 2026
04007f1
Addressing Noah's comments
FTRobbin Apr 2, 2026
023d3e3
Addressing Anjali's comments
FTRobbin Apr 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 91 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,11 @@ getrandom = "0.3"
once_cell = "1.21"
num-bigint = { version = "0.4", features = ["serde"] }
num-rational = {version = "0.4", features = ["serde"]}
csv = "1.3"
csv = "1.4"
typetag = "0.2"
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1.0"
flexbuffers = "25.12.19"

######################
# build dependencies
Expand Down Expand Up @@ -162,6 +163,7 @@ serde_json_diff = "0.2.0"
anyhow.workspace = true
walkdir = "2.5.0"
egglog-reports = { workspace = true }
flexbuffers.workspace = true

[build-dependencies]
chrono = { workspace = true, features = ["now"], optional = true }
Expand Down
6 changes: 6 additions & 0 deletions core-relations/src/free_join/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,12 @@ impl Database {
pub(crate) fn plan_query(&mut self, query: Query) -> Plan {
plan::plan_query(query)
}

/// Ask every table in the database to stabilize itself.
///
/// NOTE(review): per the change log this is meant to prevent stale rows
/// from leaking into serialization — confirm against `Table::stabilize`.
pub fn stabilize(&mut self) {
    self.tables
        .iter_mut()
        .for_each(|(_, entry)| entry.table.stabilize());
}
}

impl Drop for Database {
Expand Down
1 change: 1 addition & 0 deletions core-relations/src/hash_index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -915,6 +915,7 @@ static THREAD_POOL: Lazy<rayon::ThreadPool> = Lazy::new(|| {
/// to the beginning of an unused vector.
#[derive(Default, Clone, Serialize, Deserialize)]
pub(super) struct FreeList {
#[serde(skip)]
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we skip serializing the only field in this struct, is it better to skip serializing the FreeList where it's used in SubsetBuffer?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that would be better. In fact, we can skip the whole SubsetBuffer. But, it is used in many places, and the overhead is negligible.

data: HashMap<usize, Vec<BufferIndex>>,
}
impl FreeList {
Expand Down
96 changes: 78 additions & 18 deletions core-relations/src/row_buffer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::{cell::Cell, mem, ops::Deref};
use crate::numeric_id::NumericId;
use egglog_concurrency::ParallelVecWriter;
use rayon::iter::ParallelIterator;
use serde::{ser::SerializeStruct, Deserialize, Deserializer, Serialize};
use serde::{Deserialize, Deserializer, Serialize};
use smallvec::SmallVec;

use crate::{
Expand Down Expand Up @@ -35,33 +35,93 @@ impl<'de> Deserialize<'de> for RowBuffer {
where
D: Deserializer<'de>,
{
#[derive(Deserialize)]
struct Partial {
n_columns: usize,
total_rows: usize,
data: Vec<Cell<Value>>,
}
struct RowBufferVisitor;

impl<'de> serde::de::Visitor<'de> for RowBufferVisitor {
type Value = RowBuffer;

fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
formatter.write_str("Expecting a byte array")
}

let helper = Partial::deserialize(deserializer)?;
fn visit_bytes<E>(self, bytes: &[u8]) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
let mut it = bytes.iter();
let n_columns = deserialize_compressed(&mut it);
let total_rows = deserialize_compressed(&mut it);
let mut data = <Vec<Cell<Value>>>::new();
for _i in 0..n_columns * total_rows {
data.push(Cell::new(Value::new(deserialize_compressed(&mut it))));
}
Ok(RowBuffer {
n_columns: n_columns.try_into().unwrap(),
total_rows: total_rows.try_into().unwrap(),
data: Pooled::new(data),
})
}
}

Ok(RowBuffer {
n_columns: helper.n_columns,
total_rows: helper.total_rows,
data: Pooled::new(helper.data),
})
deserializer.deserialize_bytes(RowBufferVisitor)
}
}

/// Serialize a `u32` as a variable-length byte sequence.
///
/// The highest bit of each byte marks whether it is the last byte
/// (0 = more bytes follow; 1 = last byte). The lower seven bits carry
/// seven bits of the number, least-significant group first.
/// The encoding uses fewer than 4 bytes when the number is small, as
/// shown in the following function:
/// ```
/// fn get_n_compressed_bytes(x: u32) -> usize {
///     if x < (1u32 << 7) {
///         1
///     } else if x < (1u32 << 14) {
///         2
///     } else if x < (1u32 << 21) {
///         3
///     } else if x < (1u32 << 28) {
///         4
///     } else {
///         5
///     }
/// }
/// ```
/// In practice small numbers vastly outnumber large ones, so this
/// encoding saves space.
fn compressed_serialize(buf: &mut Vec<u8>, x: u32) {
    let mut rem = x;
    // Emit 7 payload bits per byte with the continuation flag (high bit)
    // clear while more groups remain.
    while rem >= (1u32 << 7) {
        // Value is masked to 7 bits, so the conversion cannot fail.
        buf.push((rem & ((1u32 << 7) - 1)).try_into().unwrap());
        rem >>= 7;
    }
    // Final byte: set the high bit to mark the terminator.
    buf.push((rem | (1u32 << 7)).try_into().unwrap());
}

/// Decode one varint-encoded `u32` from `it` — the inverse of
/// `compressed_serialize`.
///
/// Bytes with the high bit clear each contribute seven payload bits
/// (least-significant group first); a byte with the high bit set
/// terminates the number.
///
/// Panics if the iterator is exhausted before a terminator byte appears.
fn deserialize_compressed<'a, T: Iterator<Item = &'a u8>>(it: &mut T) -> u32 {
    let mut acc = 0u32;
    let mut shift = 0u32;
    loop {
        let byte = u32::from(*it.next().unwrap());
        if byte >= (1u32 << 7) {
            // Terminator byte: clear the flag bit and fold in the last group.
            return acc | ((byte ^ (1u32 << 7)) << shift);
        }
        acc |= byte << shift;
        shift += 7;
    }
}

impl Serialize for RowBuffer {
    /// Serialize the buffer as one byte string: the column count, the
    /// total row count, then every cell value, all varint-encoded via
    /// `compressed_serialize`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // Every encoded u32 occupies at least one byte, so this is a
        // cheap lower bound that avoids most reallocations.
        let mut buf = Vec::with_capacity(2 + self.data.len());
        compressed_serialize(&mut buf, self.n_columns.try_into().unwrap());
        compressed_serialize(&mut buf, self.total_rows.try_into().unwrap());
        for r in self.data.iter() {
            compressed_serialize(&mut buf, r.get().rep);
        }
        serializer.serialize_bytes(&buf)
    }
}

Expand Down
Loading
Loading