11package io .github .dfa1 .vortex .encoding ;
22
33import com .google .protobuf .InvalidProtocolBufferException ;
4+ import io .github .dfa1 .vortex .core .ArrayStats ;
45import io .github .dfa1 .vortex .core .DType ;
6+ import io .github .dfa1 .vortex .core .PType ;
57import io .github .dfa1 .vortex .core .VortexException ;
68import io .github .dfa1 .vortex .core .array .Array ;
9+ import io .github .dfa1 .vortex .core .array .LongArray ;
710import io .github .dfa1 .vortex .proto .EncodingProtos ;
811
12+ import java .lang .foreign .MemorySegment ;
13+ import java .lang .foreign .ValueLayout ;
914import java .nio .ByteBuffer ;
15+ import java .nio .ByteOrder ;
1016
1117/// Decoder for {@code vortex.pco} (pcodec numerical compression).
1218///
1925///
2026/// <p>Wire format (pcodec layer, per chunk/page):
2127/// <ul>
22- /// <li>Chunk meta: mode nibble + extra mode bits + delta nibble + extra delta bits +
23- /// per-latent: ans_size_log (4b), bin_count (15b), per-bin {weight-1, lower, offset_bits}</li>
24- /// <li>Page: initial latent state (delta state_n + 4 tANS state indices) → byte align →
25- /// per 256-batch: tANS-decoded bin indices + offset bits</li>
28+ /// <li>Chunk meta: [4b mode][extra mode bits][4b delta][extra delta bits]
29+ /// [per-latent: 4b ans_size_log, 15b n_bins, per-bin {weight-1, lower, offset_bits}]
30+ /// [0–7b alignment]</li>
31+ /// <li>Page: [4 × ans_size_log b initial states][0–7b alignment]
32+ /// [per 256-batch: ANS bits for all k, then offset bits for all k]</li>
2633/// <li>All bit packing little-endian (LSB first)</li>
2734/// </ul>
2835///
29- /// <p>Phase 1: skeleton only — parses metadata, validates header, dispatches on PType .
30- /// Phase 2 adds Classic/None decode for I64; later phases extend to all ptypes and modes .
36+ /// <p>Supported ( Phase 2): Classic mode, None delta, non-null, I64 .
37+ /// Other modes/deltas/ptypes throw with a clear "not yet implemented" message .
3138public final class PcoEncoding implements Encoding {
3239
3340 static final byte PCO_FORMAT_MAJOR = 0x04 ;
3441 static final byte PCO_FORMAT_MINOR = 0x01 ;
3542
43+ // bits needed to encode offset_bits field per latent type
44+ static final int BITS_TO_ENCODE_OFFSET_BITS_64 = 7 ; // log2(64) + 1
45+ static final int BITS_TO_ENCODE_OFFSET_BITS_32 = 6 ; // log2(32) + 1
46+ static final int BITS_TO_ENCODE_OFFSET_BITS_16 = 5 ; // log2(16) + 1
47+
3648 @ Override
3749 public EncodingId encodingId () {
3850 return EncodingId .VORTEX_PCO ;
@@ -58,12 +70,92 @@ static EncodeResult encode(DType dtype, Object data) {
5870
5971 static final class Decoder {
6072
73+ private static final ValueLayout .OfLong LE_LONG =
74+ ValueLayout .JAVA_LONG_UNALIGNED .withOrder (ByteOrder .LITTLE_ENDIAN );
75+
6176 static Array decode (DecodeContext ctx ) {
6277 EncodingProtos .PcoMetadata meta = parseMeta (ctx );
6378 validateHeader (meta );
64- throw new VortexException (EncodingId .VORTEX_PCO ,
65- "pco decode not yet implemented — Phase 2 pending (chunks="
66- + meta .getChunksCount () + ")" );
79+
80+ DType dtype = ctx .dtype ();
81+ if (!(dtype instanceof DType .Primitive dt )) {
82+ throw new VortexException (EncodingId .VORTEX_PCO ,
83+ "pco decode requires Primitive dtype, got: " + dtype );
84+ }
85+ PType ptype = dt .ptype ();
86+ if (ptype != PType .I64 ) {
87+ throw new VortexException (EncodingId .VORTEX_PCO ,
88+ "pco decode Phase 2: only I64 supported, got: " + ptype );
89+ }
90+
91+ long n = ctx .rowCount ();
92+ MemorySegment out = ctx .arena ().allocate (n * Long .BYTES );
93+
94+ int nChunks = meta .getChunksCount ();
95+ int bufIdx = 0 ;
96+ long outByteOffset = 0L ;
97+
98+ for (int c = 0 ; c < nChunks ; c ++) {
99+ EncodingProtos .PcoChunkInfo chunkInfo = meta .getChunks (c );
100+ MemorySegment chunkMetaBuf = ctx .buffer (bufIdx ++);
101+
102+ PcoChunkMeta chunkMeta = readChunkMeta (chunkMetaBuf );
103+ PcoTansDecoder tans = PcoTansDecoder .build (chunkMeta .ansSizeLog (), chunkMeta .bins ());
104+
105+ int nPages = chunkInfo .getPagesCount ();
106+ for (int p = 0 ; p < nPages ; p ++) {
107+ int pageN = chunkInfo .getPages (p ).getNValues ();
108+ MemorySegment pageBuf = ctx .buffer (bufIdx ++);
109+
110+ LeBitReader pageReader = new LeBitReader (pageBuf );
111+ int [] stateIdxs = new int [PcoTansDecoder .ANS_INTERLEAVING ];
112+ for (int i = 0 ; i < PcoTansDecoder .ANS_INTERLEAVING ; i ++) {
113+ stateIdxs [i ] = (int ) pageReader .readBits (chunkMeta .ansSizeLog ());
114+ }
115+ pageReader .alignToByte ();
116+
117+ tans .decodePage (pageReader , stateIdxs , pageN , out , outByteOffset );
118+ outByteOffset += (long ) pageN * Long .BYTES ;
119+ }
120+ }
121+
122+ // Convert U64 latents → I64: flip sign bit (from_latent_ordered for signed types)
123+ for (long i = 0 ; i < n ; i ++) {
124+ long byteOff = i * Long .BYTES ;
125+ out .set (LE_LONG , byteOff , out .get (LE_LONG , byteOff ) ^ Long .MIN_VALUE );
126+ }
127+
128+ return new LongArray (dtype , n , out , ArrayStats .empty ());
129+ }
130+
131+ private static PcoChunkMeta readChunkMeta (MemorySegment buf ) {
132+ LeBitReader r = new LeBitReader (buf );
133+
134+ int modeNibble = (int ) r .readBits (4 );
135+ if (modeNibble != 0 ) {
136+ throw new VortexException (EncodingId .VORTEX_PCO ,
137+ "pco mode " + modeNibble + " not yet implemented (only Classic=0)" );
138+ }
139+ int deltaNibble = (int ) r .readBits (4 );
140+ if (deltaNibble != 0 ) {
141+ throw new VortexException (EncodingId .VORTEX_PCO ,
142+ "pco delta " + deltaNibble + " not yet implemented (only None=0)" );
143+ }
144+
145+ // One primary latent variable for Classic + None delta.
146+ int ansSizeLog = (int ) r .readBits (4 );
147+ int nBins = (int ) r .readBits (15 );
148+
149+ PcoBin [] bins = new PcoBin [nBins ];
150+ for (int b = 0 ; b < nBins ; b ++) {
151+ int weight = (int ) r .readBits (ansSizeLog ) + 1 ;
152+ long lower = r .readBits (64 ); // dtype_size = 64 for I64/U64
153+ int offsetBits = (int ) r .readBits (BITS_TO_ENCODE_OFFSET_BITS_64 );
154+ bins [b ] = new PcoBin (weight , lower , offsetBits );
155+ }
156+ r .alignToByte (); // drain padding at end of chunk meta
157+
158+ return new PcoChunkMeta (ansSizeLog , bins );
67159 }
68160
69161 private static EncodingProtos .PcoMetadata parseMeta (DecodeContext ctx ) {
@@ -74,7 +166,8 @@ private static EncodingProtos.PcoMetadata parseMeta(DecodeContext ctx) {
74166 try {
75167 return EncodingProtos .PcoMetadata .parseFrom (raw .duplicate ());
76168 } catch (InvalidProtocolBufferException e ) {
77- throw new VortexException (EncodingId .VORTEX_PCO , "invalid PcoMetadata: " + e .getMessage ());
169+ throw new VortexException (EncodingId .VORTEX_PCO ,
170+ "invalid PcoMetadata: " + e .getMessage ());
78171 }
79172 }
80173
@@ -92,4 +185,7 @@ private static void validateHeader(EncodingProtos.PcoMetadata meta) {
92185 }
93186 }
94187 }
188+
189+ private record PcoChunkMeta (int ansSizeLog , PcoBin [] bins ) {
190+ }
95191}
0 commit comments