diff --git a/include/pyraview_header.h b/include/pyraview_header.h
index 6971733..8ef58e9 100644
--- a/include/pyraview_header.h
+++ b/include/pyraview_header.h
@@ -49,7 +49,9 @@ typedef PV_ALIGN_PREFIX(64) struct {
 
 // API Function
 // Returns 0 on success, negative values for errors
-// layout: 0=SxC (Sample-Major), 1=CxS (Channel-Major)
+// layout: Specifies the *Input* memory layout.
+//         0=SxC (Sample-Major/Interleaved), 1=CxS (Channel-Major/Planar)
+//         Note: The *Output* file format is always Interleaved (Sample-Major).
 int pyraview_process_chunk(
     const void* dataArray,      // Pointer to raw data
     int64_t numRows,            // Number of samples per channel
diff --git a/src/c/pyraview.cpp b/src/c/pyraview.cpp
index 39faaa9..fa9ce24 100644
--- a/src/c/pyraview.cpp
+++ b/src/c/pyraview.cpp
@@ -108,6 +108,11 @@ static int pv_internal_execute_##SUFFIX( \
     /* Pre-calculate rates and decimations */ \
     double rates[16]; \
     int decimations[16]; \
+    T* global_buffers[16]; /* Interleaved buffers */ \
+    int64_t global_sizes[16]; \
+    \
+    /* Allocation Logic (Main Thread) */ \
+    int64_t prev_len = R; \
     for (int i = 0; i < nLevels; i++) { \
         currentDecimation *= steps[i]; \
         currentRate /= steps[i]; \
@@ -118,16 +123,30 @@ static int pv_internal_execute_##SUFFIX( \
         snprintf(filename, sizeof(filename), "%s_L%d.bin", prefix, i+1); \
         int status = pv_validate_or_create(&files[i], filename, (int)C, dataType, rates[i], nativeRate, startTime, decimations[i], append); \
         if (status <= 0) { \
-            /* Cleanup previous opens */ \
+            /* Cleanup */ \
             for (int j = 0; j < i; j++) fclose(files[j]); \
+            for (int j = 0; j < i; j++) if(global_buffers[j]) free(global_buffers[j]); \
             return (status == 0) ? -2 : -1; \
         } \
+        \
+        int64_t out_len = prev_len / steps[i]; \
+        global_sizes[i] = out_len; \
+        global_buffers[i] = NULL; \
+        if (out_len > 0) { \
+            global_buffers[i] = (T*)malloc(out_len * C * 2 * sizeof(T)); \
+            if (!global_buffers[i]) { \
+                /* Cleanup */ \
+                for (int j = 0; j <= i; j++) fclose(files[j]); \
+                for (int j = 0; j < i; j++) if(global_buffers[j]) free(global_buffers[j]); \
+                return -1; \
+            } \
+        } \
+        prev_len = out_len; \
     } \
     \
     /* Stride logic */ \
-    int64_t stride_ch = (layout == 1) ? R : 1; /* Distance between samples of same channel */ \
-    int64_t stride_sample = (layout == 1) ? 1 : C; /* Distance between channels at same sample */ \
-    \
+    int64_t stride_ch = (layout == 1) ? R : 1; \
+    /* int64_t stride_sample = (layout == 1) ? 1 : C; Unused */ \
     int64_t input_stride = (layout == 0) ? C : 1; \
     int64_t channel_step = (layout == 0) ? 1 : R; \
     \
@@ -135,127 +154,94 @@ static int pv_internal_execute_##SUFFIX( \
     int effective_threads = (nThreads > 0) ? nThreads : std::thread::hardware_concurrency(); \
     if (effective_threads < 1) effective_threads = 1; \
     \
-    /* Synchronization primitives */ \
     std::atomic<int64_t> next_channel(0); \
-    std::atomic<int64_t> next_write_ticket(0); \
-    std::mutex write_mutex; \
-    std::condition_variable write_cv; \
     std::atomic<int> error_occurred(0); \
     \
     auto worker = [&]() { \
         while (true) { \
-            /* Atomic fetch of work */ \
             int64_t ch = next_channel.fetch_add(1); \
             if (ch >= C) break; \
             \
-            /* If global error, we skip work but must still process ticket */ \
-            int skip_work = error_occurred.load(); \
-            \
             const T* ch_data = data + (ch * channel_step); \
             T* buffers[16]; \
-            for(int i=0; i<16; i++) buffers[i] = NULL; \
-            int64_t sizes[16]; \
-            int64_t prev_len = R; \
-            int alloc_failed = 0; \
+            /* Each thread computes partial reduction for its channel */ \
+            /* And writes directly to the interleaved global buffer */ \
             \
-            if (!skip_work) { \
-                /* Buffer Allocation */ \
-                for (int i = 0; i < nLevels; i++) { \
-                    int64_t out_len = prev_len / steps[i]; \
-                    sizes[i] = out_len; \
-                    if (out_len > 0) { \
-                        buffers[i] = (T*)malloc(out_len * 2 * sizeof(T)); \
-                        if (!buffers[i]) { alloc_failed = 1; break; } \
-                    } \
-                    prev_len = out_len; \
-                } \
-                \
-                if (!alloc_failed) { \
-                    /* Compute L1 */ \
-                    if (sizes[0] > 0) { \
-                        int step = steps[0]; \
-                        T* out = buffers[0]; \
-                        int64_t count = sizes[0]; \
-                        for (int64_t i = 0; i < count; i++) { \
-                            T min_val = ch_data[i * step * input_stride]; \
-                            T max_val = min_val; \
-                            for (int j = 1; j < step; j++) { \
-                                T val = ch_data[(i * step + j) * input_stride]; \
-                                if (val < min_val) min_val = val; \
-                                if (val > max_val) max_val = val; \
-                            } \
-                            out[2*i] = min_val; \
-                            out[2*i+1] = max_val; \
-                        } \
-                    } \
-                    /* Compute L2..Ln */ \
-                    for (int lvl = 1; lvl < nLevels; lvl++) { \
-                        if (sizes[lvl] > 0) { \
-                            int step = steps[lvl]; \
-                            T* prev_buf = buffers[lvl-1]; \
-                            T* out = buffers[lvl]; \
-                            int64_t count = sizes[lvl]; \
-                            for (int64_t i = 0; i < count; i++) { \
-                                T min_val = prev_buf[i * step * 2]; \
-                                T max_val = prev_buf[i * step * 2 + 1]; \
-                                for (int j = 1; j < step; j++) { \
-                                    T p_min = prev_buf[(i * step + j) * 2]; \
-                                    T p_max = prev_buf[(i * step + j) * 2 + 1]; \
-                                    if (p_min < min_val) min_val = p_min; \
-                                    if (p_max > max_val) max_val = p_max; \
-                                } \
-                                out[2*i] = min_val; \
-                                out[2*i+1] = max_val; \
-                            } \
-                        } \
+            /* We simulate the buffer pointers to point to local data? No, we write to global */ \
+            /* But we need temporary buffers? No. */ \
+            \
+            /* We compute L1 for Channel ch */ \
+            if (global_sizes[0] > 0) { \
+                int step = steps[0]; \
+                T* out_base = global_buffers[0]; \
+                int64_t count = global_sizes[0]; \
+                for (int64_t i = 0; i < count; i++) { \
+                    T min_val = ch_data[i * step * input_stride]; \
+                    T max_val = min_val; \
+                    for (int j = 1; j < step; j++) { \
+                        T val = ch_data[(i * step + j) * input_stride]; \
+                        if (val < min_val) min_val = val; \
+                        if (val > max_val) max_val = val; \
                     } \
-                } else { \
-                    /* Allocation failed */ \
-                    error_occurred.store(1); \
+                    /* Interleaved Output Index: Sample i, Channel ch */ \
+                    /* Index = i * C + ch */ \
+                    /* Pairs = 2 * Index */ \
+                    out_base[2 * (i * C + ch)] = min_val; \
+                    out_base[2 * (i * C + ch) + 1] = max_val; \
                 } \
             } \
-            \
-            /* Ordered Write Section */ \
-            { \
-                std::unique_lock<std::mutex> lock(write_mutex); \
-                write_cv.wait(lock, [&]{ return next_write_ticket.load() == ch; }); \
-                \
-                if (!skip_work && !alloc_failed && !error_occurred.load()) { \
-                    for (int i = 0; i < nLevels; i++) { \
-                        if (sizes[i] > 0 && buffers[i]) { \
-                            if (fwrite(buffers[i], sizeof(T), sizes[i] * 2, files[i]) != (size_t)(sizes[i] * 2)) { \
-                                error_occurred.store(1); \
-                            } \
+            /* Compute L2..Ln */ \
+            for (int lvl = 1; lvl < nLevels; lvl++) { \
+                if (global_sizes[lvl] > 0) { \
+                    int step = steps[lvl]; \
+                    T* prev_buf = global_buffers[lvl-1]; \
+                    T* out_base = global_buffers[lvl]; \
+                    int64_t count = global_sizes[lvl]; \
+                    for (int64_t i = 0; i < count; i++) { \
+                        /* Input is Interleaved from Previous Level */ \
+                        /* Sample i at Prev Level corresponds to step*i .. step*i + step - 1 */ \
+                        /* We need to read Channel ch for those samples */ \
+                        \
+                        /* First sample index in prev level: i * step */ \
+                        /* Interleaved index: (i * step) * C + ch */ \
+                        \
+                        int64_t start_idx = (i * step) * C + ch; \
+                        T min_val = prev_buf[2 * start_idx]; \
+                        T max_val = prev_buf[2 * start_idx + 1]; \
+                        \
+                        for (int j = 1; j < step; j++) { \
+                            int64_t idx = ((i * step) + j) * C + ch; \
+                            T p_min = prev_buf[2 * idx]; \
+                            T p_max = prev_buf[2 * idx + 1]; \
+                            if (p_min < min_val) min_val = p_min; \
+                            if (p_max > max_val) max_val = p_max; \
                         } \
+                        out_base[2 * (i * C + ch)] = min_val; \
+                        out_base[2 * (i * C + ch) + 1] = max_val; \
                     } \
                 } \
-                \
-                next_write_ticket.store(ch + 1); \
-                write_cv.notify_all(); \
-            } \
-            \
-            /* Cleanup buffers */ \
-            for (int i = 0; i < nLevels; i++) { \
-                if(buffers[i]) free(buffers[i]); \
             } \
         } \
     }; \
     \
-    /* Spawn Threads */ \
     std::vector<std::thread> threads; \
     for (int i = 0; i < effective_threads; ++i) { \
         threads.emplace_back(worker); \
     } \
+    for (auto& t : threads) { t.join(); } \
     \
-    /* Join Threads */ \
-    for (auto& t : threads) { \
-        t.join(); \
+    /* Write to files sequentially */ \
+    for (int i = 0; i < nLevels; i++) { \
+        if (global_sizes[i] > 0 && global_buffers[i]) { \
+            if (fwrite(global_buffers[i], sizeof(T), global_sizes[i] * C * 2, files[i]) != (size_t)(global_sizes[i] * C * 2)) { \
+                ret = -1; \
+            } \
+        } \
+        if (global_buffers[i]) free(global_buffers[i]); \
+        fclose(files[i]); \
     } \
     \
-    /* Close files */ \
-    for (int i = 0; i < nLevels; i++) fclose(files[i]); \
-    \
-    return error_occurred.load() ? -1 : 0; \
+    return ret; \
 }
 
 // Instantiate workers
@@ -270,9 +256,7 @@ DEFINE_WORKER(uint64_t, u64)
 DEFINE_WORKER(float, f32)
 DEFINE_WORKER(double, f64)
 
-// Master Dispatcher (Extern C for ABI compatibility)
 extern "C" {
-
 int pyraview_process_chunk(
     const void* dataArray,
     int64_t numRows,
@@ -287,38 +271,21 @@ int pyraview_process_chunk(
     double startTime,
     int numThreads
 ) {
-    // 1. Validate inputs (basic)
     if (!dataArray || !filePrefix || !levelSteps || numLevels <= 0 || numLevels > 16) return -1;
+    for (int i=0; i<numLevels; i++) { if (levelSteps[i] <= 0) return -1; }
 
-    // Validate levelSteps
-    for (int i=0; i<numLevels; i++) {
-        if (levelSteps[i] <= 0) return -1;
-    }
-
-    // Dispatch to typed worker
     switch (dataType) {
-        case PV_INT8: // 0
-            return pv_internal_execute_i8((const int8_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_UINT8: // 1
-            return pv_internal_execute_u8((const uint8_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_INT16: // 2
-            return pv_internal_execute_i16((const int16_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_UINT16: // 3
-            return pv_internal_execute_u16((const uint16_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_INT32: // 4
-            return pv_internal_execute_i32((const int32_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_UINT32: // 5
-            return pv_internal_execute_u32((const uint32_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_INT64: // 6
-            return pv_internal_execute_i64((const int64_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_UINT64: // 7
-            return pv_internal_execute_u64((const uint64_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_FLOAT32: // 8
-            return pv_internal_execute_f32((const float*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        case PV_FLOAT64: // 9
-            return pv_internal_execute_f64((const double*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
-        default:
-            return -1; // Unknown data type
+        case PV_INT8: return pv_internal_execute_i8((const int8_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_UINT8: return pv_internal_execute_u8((const uint8_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_INT16: return pv_internal_execute_i16((const int16_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_UINT16: return pv_internal_execute_u16((const uint16_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_INT32: return pv_internal_execute_i32((const int32_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_UINT32: return pv_internal_execute_u32((const uint32_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_INT64: return pv_internal_execute_i64((const int64_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_UINT64: return pv_internal_execute_u64((const uint64_t*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_FLOAT32: return pv_internal_execute_f32((const float*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        case PV_FLOAT64: return pv_internal_execute_f64((const double*)dataArray, numRows, numCols, layout, filePrefix, append, levelSteps, numLevels, nativeRate, startTime, dataType, numThreads);
+        default: return -1;
     }
 }
 
@@ -334,5 +301,4 @@ int pyraview_get_header(const char* filename, PyraviewHeader* header) {
     if (memcmp(header->magic, "PYRA", 4) != 0) return -1;
     return 0;
 }
-
-} // extern "C"
+}
diff --git a/src/matlab/+pyraview/get_header.m b/src/matlab/+pyraview/get_header.m
index b7efc4e..f0ecd6d 100644
--- a/src/matlab/+pyraview/get_header.m
+++ b/src/matlab/+pyraview/get_header.m
@@ -1,3 +1,4 @@
+function header = get_header(filename)
 %GET_HEADER Read the Pyraview binary header from a file.
 %
 %   HEADER = pyraview.get_header(FILENAME) reads the standard 1024-byte header
@@ -23,3 +24,50 @@
 %       fprintf('Channels: %d, Rate: %.2f Hz\n', h.channelCount, h.sampleRate);
 %
 %   See also PYRAVIEW.READFILE, PYRAVIEW.DATASET
+
+    if ~isfile(filename)
+        error('Pyraview:FileNotFound', 'File not found: %s', filename);
+    end
+
+    % Open file for binary reading, little-endian
+    fid = fopen(filename, 'rb', 'ieee-le');
+    if fid == -1
+        error('Pyraview:FileOpenError', 'Could not open file: %s', filename);
+    end
+
+    % Ensure file is closed on exit
+    c = onCleanup(@() fclose(fid));
+
+    % Read 1024 byte header block
+    % Structure layout (packed/aligned to 64 bytes, but members are standard sizes):
+    % char magic[4];              // 0
+    % uint32_t version;           // 4
+    % uint32_t dataType;          // 8
+    % uint32_t channelCount;      // 12
+    % double sampleRate;          // 16
+    % double nativeRate;          // 24
+    % double startTime;           // 32
+    % uint32_t decimationFactor;  // 40
+    % uint8_t reserved[980];      // 44 .. 1023
+
+    % Read Magic
+    magicBytes = fread(fid, 4, '*char')';
+    if ~strcmp(magicBytes, 'PYRA')
+        error('Pyraview:InvalidHeader', 'Invalid magic string. Expected "PYRA", got "%s"', magicBytes);
+    end
+    header.magic = magicBytes;
+
+    header.version = fread(fid, 1, 'uint32');
+    header.dataType = fread(fid, 1, 'uint32');
+    header.channelCount = fread(fid, 1, 'uint32');
+    header.sampleRate = fread(fid, 1, 'double');
+    header.nativeRate = fread(fid, 1, 'double');
+    header.startTime = fread(fid, 1, 'double');
+    header.decimationFactor = fread(fid, 1, 'uint32');
+
+    % Verify size? Optional but good for sanity.
+    % We read 4 + 4 + 4 + 4 + 8 + 8 + 8 + 4 = 44 bytes.
+    % Remaining 980 bytes are reserved.
+
+    % No need to read reserved unless we want to validate file size.
+end
diff --git a/src/matlab/+pyraview/macOneTime.m b/src/matlab/+pyraview/macOneTime.m
new file mode 100644
index 0000000..b84e12a
--- /dev/null
+++ b/src/matlab/+pyraview/macOneTime.m
@@ -0,0 +1,65 @@
+function macOneTime()
+%MACONETIME Coach the user to allow MEX files on macOS.
+%
+%   pyraview.macOneTime()
+%
+%   This function guides the user through the process of allowing the
+%   'pyraview.mex' binary to run on macOS, which often blocks it by
+%   default as "unidentified developer" software.
+%
+%   This process only needs to be done once per installation/upgrade.
+
+    if ~ismac
+        disp('This function is intended for macOS users only.');
+        return;
+    end
+
+    fprintf('----------------------------------------------------------------\n');
+    fprintf('Pyraview macOS Security Check\n');
+    fprintf('----------------------------------------------------------------\n');
+    fprintf('macOS often blocks MEX files from running because they are not\n');
+    fprintf('signed by an identified developer. You will likely see a popup\n');
+    fprintf('saying "pyraview.mex" cannot be opened.\n\n');
+
+    fprintf('INSTRUCTIONS:\n');
+    fprintf('1. Open "System Settings" -> "Privacy & Security".\n');
+    fprintf('2. Scroll down to the "Security" section.\n');
+    fprintf('3. Look for a message saying "pyraview.mex" was blocked.\n');
+    fprintf('4. Click "Allow Anyway" or "Open Anyway".\n');
+    fprintf('   (You might need to click "Cancel" on the popup first).\n');
+    fprintf('----------------------------------------------------------------\n');
+
+    input('Press Enter to attempt running pyraview.pyraview...', 's');
+
+    try
+        % Call with no arguments to trigger "Usage" error from C code.
+        % If the library is blocked, this call will throw a system error (e.g. Invalid MEX-file)
+        % before executing the C code.
+        pyraview.pyraview();
+    catch ME
+        % If it is the "InvalidInput" error from our C code, it loaded fine!
+        if strcmp(ME.identifier, 'Pyraview:InvalidInput')
+            fprintf('Success! pyraview.mex loaded successfully.\n');
+        else
+            fprintf('\nError: %s\n', ME.message);
+            fprintf('It seems pyraview.mex was blocked or failed to load.\n');
+            fprintf('Please go to System Settings -> Privacy & Security and allow it.\n');
+            input('Press Enter after you have allowed the file...', 's');
+
+            % Retry
+            try
+                pyraview.pyraview();
+            catch ME2
+                 if strcmp(ME2.identifier, 'Pyraview:InvalidInput')
+                    fprintf('Success! pyraview.mex loaded successfully on retry.\n');
+                 else
+                    fprintf('Still failed: %s\n', ME2.message);
+                    fprintf('You may need to try again manually.\n');
+                 end
+            end
+        end
+    end
+
+    fprintf('\n----------------------------------------------------------------\n');
+    fprintf('Security check complete. You should not need to do this again.\n');
+end
diff --git a/src/matlab/+pyraview/pyraview_mex.c b/src/matlab/+pyraview/pyraview_gateway.c
similarity index 88%
rename from src/matlab/+pyraview/pyraview_mex.c
rename to src/matlab/+pyraview/pyraview_gateway.c
index 6afd82f..bb8f402 100644
--- a/src/matlab/+pyraview/pyraview_mex.c
+++ b/src/matlab/+pyraview/pyraview_gateway.c
@@ -3,11 +3,11 @@
 #include <string.h>
 
 /*
- * pyraview_mex.c
+ * pyraview_gateway.c
  * Gateway for Pyraview C Engine
  *
  * Usage:
- *   status = pyraview_mex(data, prefix, steps, nativeRate, startTime, [append], [numThreads])
+ *   status = pyraview(data, prefix, steps, nativeRate, startTime, [append], [numThreads])
  *
  * Inputs:
  *   data: (Samples x Channels) matrix. uint8, int16, single, or double.
@@ -25,7 +25,7 @@
 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
     // Check inputs
     if (nrhs < 5) {
-        mexErrMsgIdAndTxt("Pyraview:InvalidInput", "Usage: pyraview_mex(data, prefix, steps, nativeRate, startTime, [append], [numThreads])");
+        mexErrMsgIdAndTxt("Pyraview:InvalidInput", "Usage: pyraview(data, prefix, steps, nativeRate, startTime, [append], [numThreads])");
     }
 
     // 1. Data
@@ -54,7 +54,12 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
 
     // Layout: Matlab is Column-Major. If input is Samples x Channels, then it is CxS.
     // Layout code for CxS is 1.
-    int layout = 1;
+    // Note: The Pyraview engine uses '1' for Column-Major (Planar) layout.
+    // This matches MATLAB's internal storage for a Samples x Channels matrix,
+    // where all samples for Channel 1 are stored contiguously, followed by Channel 2, etc.
+    // Note: This defines the INPUT memory layout. The output file format is Sample-Major (Interleaved).
+    const int PV_LAYOUT_COLUMN_MAJOR = 1;
+    int layout = PV_LAYOUT_COLUMN_MAJOR;
 
     // 2. Prefix
     if (!mxIsChar(prhs[1])) {
diff --git a/src/matlab/+pyraview/pyraview_get_header_mex.c b/src/matlab/+pyraview/pyraview_get_header_mex.c
deleted file mode 100644
index 61edf88..0000000
--- a/src/matlab/+pyraview/pyraview_get_header_mex.c
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "mex.h"
-#include "pyraview_header.h"
-#include <string.h>
-
-/*
- * pyraview_get_header_mex.c
- * MEX wrapper for pyraview_get_header
- *
- * Usage:
- *   header = pyraview_get_header_mex(filename)
- *
- * Inputs:
- *   filename: char array (string).
- *
- * Outputs:
- *   header: struct with fields:
- *     version, dataType, channelCount, sampleRate, nativeRate, startTime, decimationFactor
- */
-
-void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
-    if (nrhs != 1) {
-        mexErrMsgIdAndTxt("Pyraview:InvalidInput", "Usage: pyraview_get_header_mex(filename)");
-    }
-
-    if (!mxIsChar(prhs[0])) {
-        mexErrMsgIdAndTxt("Pyraview:InvalidInput", "Filename must be a string.");
-    }
-    char *filename = mxArrayToString(prhs[0]);
-
-    PyraviewHeader h;
-    if (pyraview_get_header(filename, &h) != 0) {
-        mxFree(filename);
-        mexErrMsgIdAndTxt("Pyraview:ReadError", "Failed to read Pyraview header from %s", filename);
-    }
-    mxFree(filename);
-
-    const char *field_names[] = {
-        "version",
-        "dataType",
-        "channelCount",
-        "sampleRate",
-        "nativeRate",
-        "startTime",
-        "decimationFactor"
-    };
-    int n_fields = 7;
-
-    plhs[0] = mxCreateStructMatrix(1, 1, n_fields, field_names);
-
-    mxSetField(plhs[0], 0, "version", mxCreateDoubleScalar((double)h.version));
-    mxSetField(plhs[0], 0, "dataType", mxCreateDoubleScalar((double)h.dataType));
-    mxSetField(plhs[0], 0, "channelCount", mxCreateDoubleScalar((double)h.channelCount));
-    mxSetField(plhs[0], 0, "sampleRate", mxCreateDoubleScalar(h.sampleRate));
-    mxSetField(plhs[0], 0, "nativeRate", mxCreateDoubleScalar(h.nativeRate));
-    mxSetField(plhs[0], 0, "startTime", mxCreateDoubleScalar(h.startTime));
-    mxSetField(plhs[0], 0, "decimationFactor", mxCreateDoubleScalar((double)h.decimationFactor));
-}
diff --git a/src/matlab/+pyraview/readFile.m b/src/matlab/+pyraview/readFile.m
index e7de948..dc16715 100644
--- a/src/matlab/+pyraview/readFile.m
+++ b/src/matlab/+pyraview/readFile.m
@@ -18,21 +18,10 @@
 %                  D(:, :, 1) contains the minimum values for each sample.
 %                  D(:, :, 2) contains the maximum values for each sample.
 %
-%   Example:
-%       % Read samples 100 to 200 from 'data_L1.bin'
-%       d = pyraview.readFile('data_L1.bin', 100, 200);
-%
-%       % Read from the beginning to sample 500
-%       d = pyraview.readFile('data_L1.bin', -Inf, 500);
-%
-%       % Read from sample 1000 to the end
-%       d = pyraview.readFile('data_L1.bin', 1000, Inf);
-%
 %   Notes:
-%       - Indices are clamped to the file's valid sample range.
-%       - Returns an empty array if the requested range is invalid or empty.
-%       - Pyraview files use a planar layout where channels are stored contiguously.
-%       - Each "sample" in a level file consists of a Min/Max pair.
+%       - The file format is Interleaved (Sample-Major):
+%         [Sample0_Ch0, Sample0_Ch1...][Sample1_Ch0, Sample1_Ch1...]
+%       - Each "sample point" consists of a Min/Max pair.
 
     if ~isfile(filename)
         error('Pyraview:FileNotFound', 'File not found: %s', filename);
@@ -68,6 +57,7 @@
     dataArea = fileSize - headerSize;
 
     numChannels = double(h.channelCount);
+    % Frame: Min/Max pair for ALL channels
     frameSize = numChannels * 2 * itemSize;
     totalSamples = floor(dataArea / frameSize);
 
@@ -108,32 +98,54 @@
     if f == -1
         error('Pyraview:FileOpenError', 'Could not open file: %s', filename);
     end
+    c = onCleanup(@() fclose(f));
+
+    % Interleaved Format:
+    % [Header]
+    % [Sample 0 (Ch0..ChN)]
+    % [Sample 1 (Ch0..ChN)]
+
+    % Seek to Start Sample
+    seekOffset = headerSize + startSample * frameSize;
+    fseek(f, seekOffset, 'bof');
+
+    % Read block
+    % We read numSamplesToRead * frameSize bytes
+    % Raw read into vector
+    numElements = numSamplesToRead * numChannels * 2;
+    raw = fread(f, numElements, ['*' char(precision)]);
+
+    if length(raw) < numElements
+        warning('Pyraview:ShortRead', 'Short read. Expected %d elements, got %d.', numElements, length(raw));
+        % Truncate
+        numSamplesToRead = floor(length(raw) / (numChannels * 2));
+        raw = raw(1 : numSamplesToRead * numChannels * 2);
+        d = d(1:numSamplesToRead, :, :, :);
+    end
 
-    cleanupObj = onCleanup(@() fclose(f));
+    % Reshape
+    % Raw is [S0C0m S0C0M S0C1m S0C1M ... S1C0m ...]
+    % Shape into (2, Channels, Samples) first?
+    % MATLAB is Column Major.
+    % Reshape to (2*Channels, Samples) -> Columns are Samples.
+    % Data in file (Interleaved) is Sample 0 (all values), Sample 1 (all values).
+    % So file is stored Row-Major if mapped to (Samples x [2*Ch]).
+    % fread reads linearly.
+    % raw = [S0_Vals, S1_Vals ...]
+    % If we reshape to [2*Channels, NumSamples], MATLAB fills column 1 with raw(1..2*Ch).
+    % raw(1..2*Ch) IS S0_Vals.
+    % So reshape(raw, 2*numChannels, numSamplesToRead) puts Sample 0 in Column 1.
+    % Transpose to (NumSamples, 2*numChannels).
+
+    reshaped = reshape(raw, 2*numChannels, numSamplesToRead)';
+
+    % Now reshaped is (Samples x [2*Ch]).
+    % Columns: C0m C0M C1m C1M ...
+    % We want d(Sample, Ch, 1) = C_Ch_m
+    % We want d(Sample, Ch, 2) = C_Ch_M
 
     for ch = 1:numChannels
-        % Calculate offset
-        % Planar layout: [Header] [Ch1 Data] [Ch2 Data] ...
-        % Ch Data size = totalSamples * 2 * itemSize
-
-        chStartOffset = headerSize + (ch-1) * totalSamples * 2 * itemSize;
-        readOffset = chStartOffset + startSample * 2 * itemSize;
-
-        fseek(f, readOffset, 'bof');
-
-        % Read min/max pairs
-        % precision needs to be char for fread
-        raw = fread(f, numSamplesToRead * 2, ['*' char(precision)]);
-
-        if length(raw) < numSamplesToRead * 2
-            warning('Pyraview:ShortRead', 'Short read on channel %d. Expected %d, got %d.', ch, numSamplesToRead*2, length(raw));
-            % Fill what we got
-            nRead = floor(length(raw)/2);
-            d(1:nRead, ch, 1) = raw(1:2:2*nRead);
-            d(1:nRead, ch, 2) = raw(2:2:2*nRead);
-        else
-            d(:, ch, 1) = raw(1:2:end);
-            d(:, ch, 2) = raw(2:2:end);
-        end
+        d(:, ch, 1) = reshaped(:, 2*ch - 1);
+        d(:, ch, 2) = reshaped(:, 2*ch);
     end
 end
diff --git a/src/matlab/README.md b/src/matlab/README.md
index 2bd2a24..d2307fd 100644
--- a/src/matlab/README.md
+++ b/src/matlab/README.md
@@ -1,12 +1,12 @@
 # Pyraview Matlab MEX
 
 ## Compilation
-To compile the MEX file:
+To compile the MEX file (`pyraview.mex`):
 1. Open Matlab and `cd` to this directory.
 2. Run `build_pyraview`.
 
 ## Usage
-`status = pyraview_mex(data, prefix, steps, nativeRate, [append], [numThreads])`
+`status = pyraview.pyraview(data, prefix, steps, nativeRate, [append], [numThreads])`
 
 * `data`: Samples x Channels matrix (Single, Double, Int16, Uint8).
 * `prefix`: Base file name (e.g. 'data/mydata').
diff --git a/src/matlab/build_pyraview.m b/src/matlab/build_pyraview.m
index 5c85ace..71e28bd 100644
--- a/src/matlab/build_pyraview.m
+++ b/src/matlab/build_pyraview.m
@@ -6,8 +6,7 @@
 include_path = '-I../../include';
 
 % Source files inside +pyraview
-mex_src = '+pyraview/pyraview_mex.c';
-header_src = '+pyraview/pyraview_get_header_mex.c';
+mex_src = '+pyraview/pyraview_gateway.c';
 
 % Output directory: +pyraview/
 out_dir = '+pyraview';
@@ -21,10 +20,6 @@
     fprintf('Building pyraview...\n');
     mex('-v', '-outdir', out_dir, '-output', 'pyraview', include_path, src_path, mex_src);
     fprintf('Build pyraview successful.\n');
-
-    fprintf('Building get_header...\n');
-    mex('-v', '-outdir', out_dir, '-output', 'get_header', include_path, src_path, header_src);
-    fprintf('Build get_header successful.\n');
 catch e
     fprintf('Build failed: %s\n', e.message);
     rethrow(e);
diff --git a/src/python/pyraview/__init__.py b/src/python/pyraview/__init__.py
index e32a101..fe1978c 100644
--- a/src/python/pyraview/__init__.py
+++ b/src/python/pyraview/__init__.py
@@ -211,52 +211,26 @@ def get_view_data(self, t_start, t_end, pixels):
         if duration <= 0:
             return np.array([]), np.array([])
 
-        # Required sample rate to satisfy pixels
-        # We want approx 'pixels' samples in 'duration'
-        # target_rate = pixels / duration
-        # But we actually have min/max pairs. So we need pixels/2 pairs?
-        # Standard approach: We want 'pixels' data points.
-        # Since each sample is min/max (2 values), we need 'pixels/2' effective source samples?
-        # Or does 'pixels' mean screen pixels? Usually 1 sample per pixel column.
-        # Let's assume we want at least 'pixels' aggregated samples.
-
         target_rate = pixels / duration
 
         # Find best level
         selected_file = self.files[0] # Default to highest res
-        for f in self.files:
-            # If this file's rate is sufficient (>= target), pick it.
-            # We iterate from high res (low decimation) to low res.
-            # Actually we want the *lowest* res that is still sufficient.
-            # So we should iterate from low res (high decimation) to high res?
-            pass
-
-        # Better: Filter for files with rate >= target_rate, then pick the one with lowest rate (highest decimation)
+
+        # Filter for files with rate >= target_rate, then pick the one with lowest rate (highest decimation)
         candidates = [f for f in self.files if f['rate'] >= target_rate]
         if candidates:
             # Pick the one with the lowest rate (highest decimation) among candidates
-            # This gives us the coarsest level that still meets the requirement
             selected_file = min(candidates, key=lambda x: x['rate'])
         else:
             # If none meet requirement (zoomed in too far), pick highest res (index 0)
             selected_file = self.files[0]
 
-        # Calculate aperture (3x window)
-        window = duration
-        t_center = (t_start + t_end) / 2
-        aperture_start = t_center - 1.5 * window
-        aperture_end = t_center + 1.5 * window
+        # Calculate range
+        if t_start < self.start_time: t_start = self.start_time
+        if t_end < self.start_time: return np.array([]), np.array([])
 
-        # Clamp to file bounds?
-        # We don't know file duration from header easily without file size.
-        # But start_time is known.
-        if aperture_start < self.start_time:
-            aperture_start = self.start_time
-
-        # Convert time to sample indices
-        # Index = (t - start_time) * sample_rate
-        rel_start = aperture_start - self.start_time
-        rel_end = aperture_end - self.start_time
+        rel_start = t_start - self.start_time
+        rel_end = t_end - self.start_time
 
         idx_start = int(rel_start * selected_file['rate'])
         idx_end = int(rel_end * selected_file['rate'])
@@ -266,9 +240,7 @@ def get_view_data(self, t_start, t_end, pixels):
 
         num_samples_to_read = idx_end - idx_start
 
-        # Map file
-        # Header is 1024 bytes.
-        # Data size depends on type.
+        # Map type
         dtype_map_rev = {
             0: np.int8, 1: np.uint8,
             2: np.int16, 3: np.uint16,
@@ -279,101 +251,33 @@ def get_view_data(self, t_start, t_end, pixels):
         dt = dtype_map_rev.get(self.data_type, np.float64)
         item_size = np.dtype(dt).itemsize
 
-        # Layout is CxS (1) or SxC (0)?
-        # The writer usually does CxS for MATLAB compatibility, but let's check.
-        # Wait, the writer code shows logic for both. But `pyraview.c` usually writes contiguous blocks per channel?
-        # Actually `pyraview_process_chunk` writes `fwrite(buffers[i], ...)` inside a loop over channels:
-        # `for (ch = 0; ch < C; ch++) ... fwrite(...)`.
-        # This implies the file format is Channel-Major (blocks of channel data).
-        # Channel 0 [all samples], Channel 1 [all samples]...
-        # Wait, the `fwrite` is per channel, per level.
-        # If we have multiple chunks appended, the file structure becomes:
+        # Interleaved (Sample-Major) format
         # [Header]
-        # [Ch0_Chunk1][Ch1_Chunk1]...
-        # [Ch0_Chunk2][Ch1_Chunk2]...
-        # This is strictly not purely Channel-Major if appended. It's Chunk-Interleaved.
-        # BUT, the `pyraview_process_chunk` function is usually called once for the whole file in offline processing,
-        # OR if appending, it's appended in chunks.
-        # If it's chunked, random access by time is hard without an index.
-        # HOWEVER, the prompt implies "Time-to-sample-index conversion".
-        # If the file is just one big chunk (offline conversion), then it's:
-        # [Ch0 L1][Ch1 L1]...
-
-        # If we assume standard "One Big Write" (no append loops in valid use case for random access):
-        # The file is Ch0_All, Ch1_All...
-        # We need to know total samples per channel to jump to Ch1.
-        # File size = 1024 + Channels * Samples * 2 * ItemSize.
-        # Samples = (FileSize - 1024) / (Channels * 2 * ItemSize).
-
-        file_size = os.path.getsize(selected_file['path'])
-        data_area = file_size - 1024
-        frame_size = self.channels * 2 * item_size # 2 for min/max
-        total_samples = data_area // frame_size # This assumes interleaved SxC or Blocked CxS?
-
-        # Re-reading `pyraview.c`:
-        # `for (ch = 0; ch < C; ch++) { ... fwrite(...) }`
-        # It writes ALL data for Channel 0, then ALL data for Channel 1.
-        # So it is Channel-Major Planar.
-        # [Header][Ch0 MinMax...][Ch1 MinMax...]
-
-        samples_per_channel = data_area // (self.channels * 2 * item_size)
-
-        if idx_start >= samples_per_channel:
-             return np.array([]), np.array([])
-
-        if idx_end > samples_per_channel:
-            idx_end = samples_per_channel
-            num_samples_to_read = idx_end - idx_start
-
-        # We need to read 'num_samples_to_read' from EACH channel.
-        # Ch0 Offset = 1024 + idx_start * 2 * item_size
-        # Ch1 Offset = 1024 + (samples_per_channel * 2 * item_size) + (idx_start * 2 * item_size)
-
-        # Read logic
-        data_out = np.zeros((num_samples_to_read, self.channels * 2), dtype=dt)
+        # [Sample 0 (C0m C0M C1m C1M ...)]
+        # [Sample 1 (C0m C0M C1m C1M ...)]
+
+        read_start_offset = 1024 + idx_start * (self.channels * 2 * item_size)
+        bytes_to_read = num_samples_to_read * (self.channels * 2 * item_size)
 
         with open(selected_file['path'], 'rb') as f:
-            for ch in range(self.channels):
-                # Calculate offset
-                ch_start_offset = 1024 + (ch * samples_per_channel * 2 * item_size)
-                read_offset = ch_start_offset + (idx_start * 2 * item_size)
-
-                f.seek(read_offset)
-                raw = f.read(num_samples_to_read * 2 * item_size)
-                # Parse
-                ch_data = np.frombuffer(raw, dtype=dt)
-
-                # Interleave into output?
-                # Output format: Rows=Samples, Cols=Channels*2 (Min,Max,Min,Max...)
-                # data_out[:, 2*ch] = ch_data[0::2]
-                # data_out[:, 2*ch+1] = ch_data[1::2]
-                # Or just keep it separate?
-                # Let's return (Samples x Channels*2)
-
-                # Check bounds (short read?)
-                read_len = len(ch_data)
-                if read_len > 0:
-                    # Direct assign might fail if shapes mismatch due to short read
-                    # Reshape ch_data to (N, 2)?
-                    # ch_data is flat Min0, Max0, Min1, Max1...
-                    # We want to place it in data_out
-
-                    # Ensure alignment
-                    limit = min(num_samples_to_read * 2, read_len)
-                    # We have 'limit' values.
-                    # We need to distribute them.
-                    # data_out is (N, C*2).
-                    # We want data_out[:, 2*ch] and data_out[:, 2*ch+1]
-
-                    # Reshape ch_data to (-1, 2)
-                    pairs = ch_data[:limit].reshape(-1, 2)
-                    rows = pairs.shape[0]
-                    data_out[:rows, 2*ch] = pairs[:, 0]
-                    data_out[:rows, 2*ch+1] = pairs[:, 1]
+            f.seek(read_start_offset)
+            raw = f.read(bytes_to_read)
+
+        data_flat = np.frombuffer(raw, dtype=dt)
+
+        # Output is (Samples x Channels*2)
+        # Flat: [S0C0m S0C0M S0C1m ... S1C0m ...]
+        # Reshape to (Samples, Channels*2)
+        # Check size (short read)
+        num_read_samples = len(data_flat) // (self.channels * 2)
+        if num_read_samples == 0:
+            return np.array([]), np.array([])
+
+        data_flat = data_flat[:num_read_samples * self.channels * 2]
+        data_out = data_flat.reshape(num_read_samples, self.channels * 2)
 
         # Time vector
-        # t = start_time + (idx_start + i) / rate
-        t_vec = self.start_time + (idx_start + np.arange(num_samples_to_read)) / selected_file['rate']
+        t_vec = self.start_time + (idx_start + np.arange(num_read_samples)) / selected_file['rate']
 
         return t_vec, data_out
 
@@ -382,7 +286,7 @@ def read_file(filename, s0, s1):
     Reads a specific range of samples from a Pyraview level file.
 
     This function reads Min/Max pairs for each sample in the specified range.
-    Pyraview level files store data in a planar format (Channel 0, then Channel 1, etc.).
+    Pyraview level files store data in an Interleaved (Sample-Major) format.
 
     Args:
         filename (str): Path to the Pyraview level file.
@@ -436,9 +340,10 @@ def read_file(filename, s0, s1):
     if data_area < 0:
         return np.zeros((0, num_channels, 2), dtype=dt)
 
-    # Planar layout: [Header][Ch0][Ch1]...
-    # Each channel block: TotalSamples * 2 * ItemSize
-    total_samples = data_area // (num_channels * 2 * item_size)
+    # Interleaved layout: [Header][S0_AllCh][S1_AllCh]...
+    # Each sample block: NumChannels * 2 * ItemSize
+    frame_size = num_channels * 2 * item_size
+    total_samples = data_area // frame_size
 
     # Handle indices
     start_sample = 0 if (s0 == float('-inf') or s0 < 0) else int(s0)
@@ -456,25 +361,44 @@ def read_file(filename, s0, s1):
 
     num_samples_to_read = end_sample - start_sample + 1
 
-    # Allocate output
-    d = np.zeros((num_samples_to_read, num_channels, 2), dtype=dt)
+    # Seek and Read Block
+    read_start_offset = header_size + start_sample * frame_size
+    bytes_to_read = num_samples_to_read * frame_size
 
     with open(filename, 'rb') as f:
-        samples_per_channel_in_file = total_samples
+        f.seek(read_start_offset)
+        raw_bytes = f.read(bytes_to_read)
+
+    raw_data = np.frombuffer(raw_bytes, dtype=dt)
+
+    # Reshape
+    # Raw is [S0C0m S0C0M S0C1m ... S1C0m ...]
+    # Length check
+    read_items = len(raw_data)
+    actual_samples = read_items // (num_channels * 2)
+
+    if actual_samples == 0:
+        return np.zeros((0, num_channels, 2), dtype=dt)
+
+    raw_data = raw_data[:actual_samples * num_channels * 2]
+
+    # Reshape to (Samples, Channels, 2)
+    # raw_data sequence: Sample0(Ch0m,Ch0M, Ch1m,Ch1M...), Sample1...
+    # Reshape to (Samples, Channels*2)
+    reshaped_flat = raw_data.reshape(actual_samples, num_channels * 2)
 
-        for ch in range(num_channels):
-            ch_start_offset = header_size + (ch * samples_per_channel_in_file * 2 * item_size)
-            read_offset = ch_start_offset + (start_sample * 2 * item_size)
+    # Now separate Min/Max
+    # reshaped_flat[:, 0] is S_C0_m
+    # reshaped_flat[:, 1] is S_C0_M
+    # reshaped_flat[:, 2] is S_C1_m ...
 
-            f.seek(read_offset)
-            raw_bytes = f.read(num_samples_to_read * 2 * item_size)
+    d = np.zeros((actual_samples, num_channels, 2), dtype=dt)
 
-            raw_data = np.frombuffer(raw_bytes, dtype=dt)
+    # Vectorized assignment
+    # d[:, :, 0] (Mins) -> columns 0, 2, 4...
+    # d[:, :, 1] (Maxs) -> columns 1, 3, 5...
 
-            # Handle short read
-            n_read = len(raw_data) // 2
-            if n_read > 0:
-                d[:n_read, ch, 0] = raw_data[0 : 2*n_read : 2]
-                d[:n_read, ch, 1] = raw_data[1 : 2*n_read : 2]
+    d[:, :, 0] = reshaped_flat[:, 0::2]
+    d[:, :, 1] = reshaped_flat[:, 1::2]
 
     return d
diff --git a/toolboxPackaging.prj b/toolboxPackaging.prj
index ce2d8e4..767c843 100644
--- a/toolboxPackaging.prj
+++ b/toolboxPackaging.prj
@@ -84,7 +84,7 @@
     </fileset.rootdir>
     <fileset.rootfiles>
       <file>${PROJECT_ROOT}/src/matlab/README.md</file>
-      <file>${PROJECT_ROOT}/src/matlab/pyraview_mex.c</file>
+      <file>${PROJECT_ROOT}/src/matlab/+pyraview/pyraview_gateway.c</file>
       <!-- Note: MEX binaries will be here during CI build -->
     </fileset.rootfiles>
     <fileset.depfun.included />