From d1e8e8aa1be735ac4733a5815d1d4083d71099cd Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Thu, 12 Mar 2026 23:33:44 -0700 Subject: [PATCH] Formatting --- NAM/conv1d.cpp | 355 +++++++++++------------ NAM/convnet.cpp | 4 +- NAM/dsp.cpp | 115 ++++---- NAM/dsp.h | 4 +- NAM/film.h | 4 +- NAM/gating_activations.h | 4 +- NAM/lstm.cpp | 4 +- tools/render.cpp | 8 +- tools/test/test_container.cpp | 28 +- tools/test/test_noncontiguous_blocks.cpp | 16 +- 10 files changed, 273 insertions(+), 269 deletions(-) diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index f8ec91b3..b561786c 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -272,9 +272,9 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) { // Fused 4x4 kernel_size=3: read all 3 input blocks and compute in one pass const long dil = this->_dilation; - auto in0 = _input_buffer.Read(num_frames, 2 * dil); // oldest (k=0) - auto in1 = _input_buffer.Read(num_frames, dil); // middle (k=1) - auto in2 = _input_buffer.Read(num_frames, 0); // newest (k=2) + auto in0 = _input_buffer.Read(num_frames, 2 * dil); // oldest (k=0) + auto in1 = _input_buffer.Read(num_frames, dil); // middle (k=1) + auto in2 = _input_buffer.Read(num_frames, 0); // newest (k=2) const float* __restrict__ in0_ptr = in0.data(); const float* __restrict__ in1_ptr = in1.data(); @@ -282,7 +282,7 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) float* __restrict__ output_ptr = _output.data(); // Get weight pointers for all 3 taps - const size_t wsize = 16; // 4x4 + const size_t wsize = 16; // 4x4 const float* __restrict__ w0 = this->_weight[0].data(); const float* __restrict__ w1 = this->_weight[1].data(); const float* __restrict__ w2 = this->_weight[2].data(); @@ -371,10 +371,7 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) auto in4 = _input_buffer.Read(num_frames, dil); auto in5 = _input_buffer.Read(num_frames, 0); - const float* __restrict__ in_ptrs[6] = { - in0.data(), in1.data(), in2.data(), - in3.data(), in4.data(), in5.data() - }; + const float* __restrict__ in_ptrs[6] = {in0.data(), in1.data(), in2.data(), in3.data(), in4.data(), in5.data()}; float* __restrict__ output_ptr = _output.data(); // Cache all 54 weights on stack (6 taps x 3x3 matrix, column-major) @@ -408,207 +405,207 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) } else { - // General inline GEMM path uses += accumulation, so needs setZero - _output.leftCols(num_frames).setZero(); + // General inline GEMM path uses += accumulation, so needs setZero + _output.leftCols(num_frames).setZero(); - // General inline GEMM path for other configurations - for (size_t k = 0; k < kernel_size; k++) - { - const long offset = this->_dilation * (k + 1 - (long)kernel_size); - const long lookback = -offset; - auto input_block = _input_buffer.Read(num_frames, lookback); + // General inline GEMM path for other configurations + for (size_t k = 0; k < kernel_size; k++) + { + const long offset = this->_dilation * (k + 1 - (long)kernel_size); + const long lookback = -offset; + auto input_block = _input_buffer.Read(num_frames, lookback); - const float* __restrict__ input_ptr = input_block.data(); - const float* __restrict__ weight_ptr = this->_weight[k].data(); - float* __restrict__ output_ptr = _output.data(); + const float* __restrict__ input_ptr = input_block.data(); + const float* __restrict__ weight_ptr = this->_weight[k].data(); + float* __restrict__ output_ptr = _output.data(); - // Specialized fully-unrolled paths for common small channel counts - // These avoid all loop overhead for the tiny matrices in NAM models - if (out_ch == 2 && in_ch == 2) - { - // 2x2 fully unrolled - const float w00 = weight_ptr[0], w10 = weight_ptr[1]; - const float w01 = weight_ptr[2], w11 = weight_ptr[3]; - for (int f = 0; f < num_frames; f++) + // Specialized fully-unrolled paths for common small channel counts + // These avoid all loop overhead for the tiny matrices in NAM models + if (out_ch == 2 && in_ch == 2) { - const float i0 = input_ptr[f * 2]; - const float i1 = input_ptr[f * 2 + 1]; - output_ptr[f * 2] += w00 * i0 + w01 * i1; - output_ptr[f * 2 + 1] += w10 * i0 + w11 * i1; + // 2x2 fully unrolled + const float w00 = weight_ptr[0], w10 = weight_ptr[1]; + const float w01 = weight_ptr[2], w11 = weight_ptr[3]; + for (int f = 0; f < num_frames; f++) + { + const float i0 = input_ptr[f * 2]; + const float i1 = input_ptr[f * 2 + 1]; + output_ptr[f * 2] += w00 * i0 + w01 * i1; + output_ptr[f * 2 + 1] += w10 * i0 + w11 * i1; + } } - } - else if (out_ch == 2 && in_ch == 4) - { - // 2x4 fully unrolled - const float w00 = weight_ptr[0], w10 = weight_ptr[1]; - const float w01 = weight_ptr[2], w11 = weight_ptr[3]; - const float w02 = weight_ptr[4], w12 = weight_ptr[5]; - const float w03 = weight_ptr[6], w13 = weight_ptr[7]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 2 && in_ch == 4) { - const float i0 = input_ptr[f * 4]; - const float i1 = input_ptr[f * 4 + 1]; - const float i2 = input_ptr[f * 4 + 2]; - const float i3 = input_ptr[f * 4 + 3]; - output_ptr[f * 2] += w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; - output_ptr[f * 2 + 1] += w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; + // 2x4 fully unrolled + const float w00 = weight_ptr[0], w10 = weight_ptr[1]; + const float w01 = weight_ptr[2], w11 = weight_ptr[3]; + const float w02 = weight_ptr[4], w12 = weight_ptr[5]; + const float w03 = weight_ptr[6], w13 = weight_ptr[7]; + for (int f = 0; f < num_frames; f++) + { + const float i0 = input_ptr[f * 4]; + const float i1 = input_ptr[f * 4 + 1]; + const float i2 = input_ptr[f * 4 + 2]; + const float i3 = input_ptr[f * 4 + 3]; + output_ptr[f * 2] += w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; + output_ptr[f * 2 + 1] += w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; + } } - } - else if (out_ch == 4 && in_ch == 1) - { - // 4x1 fully unrolled - const float w0 = weight_ptr[0], w1 = weight_ptr[1]; - const float w2 = weight_ptr[2], w3 = weight_ptr[3]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 4 && in_ch == 1) { - const float in_val = input_ptr[f]; - output_ptr[f * 4] += w0 * in_val; - output_ptr[f * 4 + 1] += w1 * in_val; - output_ptr[f * 4 + 2] += w2 * in_val; - output_ptr[f * 4 + 3] += w3 * in_val; + // 4x1 fully unrolled + const float w0 = weight_ptr[0], w1 = weight_ptr[1]; + const float w2 = weight_ptr[2], w3 = weight_ptr[3]; + for (int f = 0; f < num_frames; f++) + { + const float in_val = input_ptr[f]; + output_ptr[f * 4] += w0 * in_val; + output_ptr[f * 4 + 1] += w1 * in_val; + output_ptr[f * 4 + 2] += w2 * in_val; + output_ptr[f * 4 + 3] += w3 * in_val; + } } - } - else if (out_ch == 4 && in_ch == 4) - { - // 4x4 fully unrolled - cache weights in registers - const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2], w30 = weight_ptr[3]; - const float w01 = weight_ptr[4], w11 = weight_ptr[5], w21 = weight_ptr[6], w31 = weight_ptr[7]; - const float w02 = weight_ptr[8], w12 = weight_ptr[9], w22 = weight_ptr[10], w32 = weight_ptr[11]; - const float w03 = weight_ptr[12], w13 = weight_ptr[13], w23 = weight_ptr[14], w33 = weight_ptr[15]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 4 && in_ch == 4) { - const int in_off = f * 4; - const int out_off = f * 4; - const float i0 = input_ptr[in_off]; - const float i1 = input_ptr[in_off + 1]; - const float i2 = input_ptr[in_off + 2]; - const float i3 = input_ptr[in_off + 3]; - output_ptr[out_off] += w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; - output_ptr[out_off + 1] += w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; - output_ptr[out_off + 2] += w20 * i0 + w21 * i1 + w22 * i2 + w23 * i3; - output_ptr[out_off + 3] += w30 * i0 + w31 * i1 + w32 * i2 + w33 * i3; + // 4x4 fully unrolled - cache weights in registers + const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2], w30 = weight_ptr[3]; + const float w01 = weight_ptr[4], w11 = weight_ptr[5], w21 = weight_ptr[6], w31 = weight_ptr[7]; + const float w02 = weight_ptr[8], w12 = weight_ptr[9], w22 = weight_ptr[10], w32 = weight_ptr[11]; + const float w03 = weight_ptr[12], w13 = weight_ptr[13], w23 = weight_ptr[14], w33 = weight_ptr[15]; + for (int f = 0; f < num_frames; f++) + { + const int in_off = f * 4; + const int out_off = f * 4; + const float i0 = input_ptr[in_off]; + const float i1 = input_ptr[in_off + 1]; + const float i2 = input_ptr[in_off + 2]; + const float i3 = input_ptr[in_off + 3]; + output_ptr[out_off] += w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; + output_ptr[out_off + 1] += w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; + output_ptr[out_off + 2] += w20 * i0 + w21 * i1 + w22 * i2 + w23 * i3; + output_ptr[out_off + 3] += w30 * i0 + w31 * i1 + w32 * i2 + w33 * i3; + } } - } - else if (out_ch == 3 && in_ch == 1) - { - // 3x1 fully unrolled - const float w0 = weight_ptr[0], w1 = weight_ptr[1], w2 = weight_ptr[2]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 3 && in_ch == 1) { - const float in_val = input_ptr[f]; - output_ptr[f * 3] += w0 * in_val; - output_ptr[f * 3 + 1] += w1 * in_val; - output_ptr[f * 3 + 2] += w2 * in_val; + // 3x1 fully unrolled + const float w0 = weight_ptr[0], w1 = weight_ptr[1], w2 = weight_ptr[2]; + for (int f = 0; f < num_frames; f++) + { + const float in_val = input_ptr[f]; + output_ptr[f * 3] += w0 * in_val; + output_ptr[f * 3 + 1] += w1 * in_val; + output_ptr[f * 3 + 2] += w2 * in_val; + } } - } - else if (out_ch == 3 && in_ch == 3) - { - // 3x3 fully unrolled - const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2]; - const float w01 = weight_ptr[3], w11 = weight_ptr[4], w21 = weight_ptr[5]; - const float w02 = weight_ptr[6], w12 = weight_ptr[7], w22 = weight_ptr[8]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 3 && in_ch == 3) { - const int off = f * 3; - const float i0 = input_ptr[off]; - const float i1 = input_ptr[off + 1]; - const float i2 = input_ptr[off + 2]; - output_ptr[off] += w00 * i0 + w01 * i1 + w02 * i2; - output_ptr[off + 1] += w10 * i0 + w11 * i1 + w12 * i2; - output_ptr[off + 2] += w20 * i0 + w21 * i1 + w22 * i2; + // 3x3 fully unrolled + const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2]; + const float w01 = weight_ptr[3], w11 = weight_ptr[4], w21 = weight_ptr[5]; + const float w02 = weight_ptr[6], w12 = weight_ptr[7], w22 = weight_ptr[8]; + for (int f = 0; f < num_frames; f++) + { + const int off = f * 3; + const float i0 = input_ptr[off]; + const float i1 = input_ptr[off + 1]; + const float i2 = input_ptr[off + 2]; + output_ptr[off] += w00 * i0 + w01 * i1 + w02 * i2; + output_ptr[off + 1] += w10 * i0 + w11 * i1 + w12 * i2; + output_ptr[off + 2] += w20 * i0 + w21 * i1 + w22 * i2; + } } - } - else if (out_ch == 4 && in_ch == 3) - { - // 4x3 fully unrolled - const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2], w30 = weight_ptr[3]; - const float w01 = weight_ptr[4], w11 = weight_ptr[5], w21 = weight_ptr[6], w31 = weight_ptr[7]; - const float w02 = weight_ptr[8], w12 = weight_ptr[9], w22 = weight_ptr[10], w32 = weight_ptr[11]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 4 && in_ch == 3) { - const float i0 = input_ptr[f * 3]; - const float i1 = input_ptr[f * 3 + 1]; - const float i2 = input_ptr[f * 3 + 2]; - output_ptr[f * 4] += w00 * i0 + w01 * i1 + w02 * i2; - output_ptr[f * 4 + 1] += w10 * i0 + w11 * i1 + w12 * i2; - output_ptr[f * 4 + 2] += w20 * i0 + w21 * i1 + w22 * i2; - output_ptr[f * 4 + 3] += w30 * i0 + w31 * i1 + w32 * i2; + // 4x3 fully unrolled + const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2], w30 = weight_ptr[3]; + const float w01 = weight_ptr[4], w11 = weight_ptr[5], w21 = weight_ptr[6], w31 = weight_ptr[7]; + const float w02 = weight_ptr[8], w12 = weight_ptr[9], w22 = weight_ptr[10], w32 = weight_ptr[11]; + for (int f = 0; f < num_frames; f++) + { + const float i0 = input_ptr[f * 3]; + const float i1 = input_ptr[f * 3 + 1]; + const float i2 = input_ptr[f * 3 + 2]; + output_ptr[f * 4] += w00 * i0 + w01 * i1 + w02 * i2; + output_ptr[f * 4 + 1] += w10 * i0 + w11 * i1 + w12 * i2; + output_ptr[f * 4 + 2] += w20 * i0 + w21 * i1 + w22 * i2; + output_ptr[f * 4 + 3] += w30 * i0 + w31 * i1 + w32 * i2; + } } - } - else if (out_ch == 3 && in_ch == 4) - { - // 3x4 fully unrolled - const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2]; - const float w01 = weight_ptr[3], w11 = weight_ptr[4], w21 = weight_ptr[5]; - const float w02 = weight_ptr[6], w12 = weight_ptr[7], w22 = weight_ptr[8]; - const float w03 = weight_ptr[9], w13 = weight_ptr[10], w23 = weight_ptr[11]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 3 && in_ch == 4) { - const float i0 = input_ptr[f * 4]; - const float i1 = input_ptr[f * 4 + 1]; - const float i2 = input_ptr[f * 4 + 2]; - const float i3 = input_ptr[f * 4 + 3]; - output_ptr[f * 3] += w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; - output_ptr[f * 3 + 1] += w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; - output_ptr[f * 3 + 2] += w20 * i0 + w21 * i1 + w22 * i2 + w23 * i3; + // 3x4 fully unrolled + const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2]; + const float w01 = weight_ptr[3], w11 = weight_ptr[4], w21 = weight_ptr[5]; + const float w02 = weight_ptr[6], w12 = weight_ptr[7], w22 = weight_ptr[8]; + const float w03 = weight_ptr[9], w13 = weight_ptr[10], w23 = weight_ptr[11]; + for (int f = 0; f < num_frames; f++) + { + const float i0 = input_ptr[f * 4]; + const float i1 = input_ptr[f * 4 + 1]; + const float i2 = input_ptr[f * 4 + 2]; + const float i3 = input_ptr[f * 4 + 3]; + output_ptr[f * 3] += w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; + output_ptr[f * 3 + 1] += w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; + output_ptr[f * 3 + 2] += w20 * i0 + w21 * i1 + w22 * i2 + w23 * i3; + } } - } - else if (out_ch == 6 && in_ch == 1) - { - // 6x1 fully unrolled - const float w0 = weight_ptr[0], w1 = weight_ptr[1], w2 = weight_ptr[2]; - const float w3 = weight_ptr[3], w4 = weight_ptr[4], w5 = weight_ptr[5]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 6 && in_ch == 1) { - const float in_val = input_ptr[f]; - const int off = f * 6; - output_ptr[off] += w0 * in_val; - output_ptr[off + 1] += w1 * in_val; - output_ptr[off + 2] += w2 * in_val; - output_ptr[off + 3] += w3 * in_val; - output_ptr[off + 4] += w4 * in_val; - output_ptr[off + 5] += w5 * in_val; + // 6x1 fully unrolled + const float w0 = weight_ptr[0], w1 = weight_ptr[1], w2 = weight_ptr[2]; + const float w3 = weight_ptr[3], w4 = weight_ptr[4], w5 = weight_ptr[5]; + for (int f = 0; f < num_frames; f++) + { + const float in_val = input_ptr[f]; + const int off = f * 6; + output_ptr[off] += w0 * in_val; + output_ptr[off + 1] += w1 * in_val; + output_ptr[off + 2] += w2 * in_val; + output_ptr[off + 3] += w3 * in_val; + output_ptr[off + 4] += w4 * in_val; + output_ptr[off + 5] += w5 * in_val; + } } - } - else if (out_ch == 6 && in_ch == 6) - { - // 6x6 - unroll weights, loop over frames - for (int f = 0; f < num_frames; f++) + else if (out_ch == 6 && in_ch == 6) { - const float* __restrict__ in_col = input_ptr + f * 6; - float* __restrict__ out_col = output_ptr + f * 6; - const float i0 = in_col[0], i1 = in_col[1], i2 = in_col[2]; - const float i3 = in_col[3], i4 = in_col[4], i5 = in_col[5]; - for (int o = 0; o < 6; o++) + // 6x6 - unroll weights, loop over frames + for (int f = 0; f < num_frames; f++) { - out_col[o] += weight_ptr[o] * i0 + weight_ptr[6 + o] * i1 + weight_ptr[12 + o] * i2 - + weight_ptr[18 + o] * i3 + weight_ptr[24 + o] * i4 + weight_ptr[30 + o] * i5; + const float* __restrict__ in_col = input_ptr + f * 6; + float* __restrict__ out_col = output_ptr + f * 6; + const float i0 = in_col[0], i1 = in_col[1], i2 = in_col[2]; + const float i3 = in_col[3], i4 = in_col[4], i5 = in_col[5]; + for (int o = 0; o < 6; o++) + { + out_col[o] += weight_ptr[o] * i0 + weight_ptr[6 + o] * i1 + weight_ptr[12 + o] * i2 + + weight_ptr[18 + o] * i3 + weight_ptr[24 + o] * i4 + weight_ptr[30 + o] * i5; + } } } - } - else if (out_ch == 8 && in_ch == 8) - { - // 8x8 - unroll weights, loop over frames - for (int f = 0; f < num_frames; f++) + else if (out_ch == 8 && in_ch == 8) { - const float* __restrict__ in_col = input_ptr + f * 8; - float* __restrict__ out_col = output_ptr + f * 8; - const float i0 = in_col[0], i1 = in_col[1], i2 = in_col[2], i3 = in_col[3]; - const float i4 = in_col[4], i5 = in_col[5], i6 = in_col[6], i7 = in_col[7]; - for (int o = 0; o < 8; o++) + // 8x8 - unroll weights, loop over frames + for (int f = 0; f < num_frames; f++) { - out_col[o] += weight_ptr[o] * i0 + weight_ptr[8 + o] * i1 + weight_ptr[16 + o] * i2 - + weight_ptr[24 + o] * i3 + weight_ptr[32 + o] * i4 + weight_ptr[40 + o] * i5 - + weight_ptr[48 + o] * i6 + weight_ptr[56 + o] * i7; + const float* __restrict__ in_col = input_ptr + f * 8; + float* __restrict__ out_col = output_ptr + f * 8; + const float i0 = in_col[0], i1 = in_col[1], i2 = in_col[2], i3 = in_col[3]; + const float i4 = in_col[4], i5 = in_col[5], i6 = in_col[6], i7 = in_col[7]; + for (int o = 0; o < 8; o++) + { + out_col[o] += weight_ptr[o] * i0 + weight_ptr[8 + o] * i1 + weight_ptr[16 + o] * i2 + + weight_ptr[24 + o] * i3 + weight_ptr[32 + o] * i4 + weight_ptr[40 + o] * i5 + + weight_ptr[48 + o] * i6 + weight_ptr[56 + o] * i7; + } } } + else + { + // Fall back to Eigen for larger matrices where it's more efficient + _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block; + } } - else - { - // Fall back to Eigen for larger matrices where it's more efficient - _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block; - } - } } // end else (general GEMM path) #else // Eigen fallback uses += accumulation, so needs setZero diff --git a/NAM/convnet.cpp b/NAM/convnet.cpp index 0a9e2758..329caf61 100644 --- a/NAM/convnet.cpp +++ b/NAM/convnet.cpp @@ -341,8 +341,8 @@ nam::convnet::ConvNetConfig nam::convnet::parse_config_json(const nlohmann::json // ConvNetConfig::create() std::unique_ptr nam::convnet::ConvNetConfig::create(std::vector weights, double sampleRate) { - return std::make_unique(in_channels, out_channels, channels, dilations, batchnorm, activation, - weights, sampleRate, groups); + return std::make_unique( + in_channels, out_channels, channels, dilations, batchnorm, activation, weights, sampleRate, groups); } // Config parser for ConfigParserRegistry diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 3aa22304..e975001b 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -498,7 +498,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons for (int f = 0; f < num_frames; f++) { const float in_val = input_ptr[f * in_stride]; - output_ptr[f * 2] = w0 * in_val; + output_ptr[f * 2] = w0 * in_val; output_ptr[f * 2 + 1] = w1 * in_val; } } @@ -508,7 +508,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons for (int f = 0; f < num_frames; f++) { const float in_val = input_ptr[f * in_stride]; - output_ptr[f * 3] = w0 * in_val; + output_ptr[f * 3] = w0 * in_val; output_ptr[f * 3 + 1] = w1 * in_val; output_ptr[f * 3 + 2] = w2 * in_val; } @@ -520,7 +520,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons for (int f = 0; f < num_frames; f++) { const float in_val = input_ptr[f * in_stride]; - output_ptr[f * 4] = w0 * in_val; + output_ptr[f * 4] = w0 * in_val; output_ptr[f * 4 + 1] = w1 * in_val; output_ptr[f * 4 + 2] = w2 * in_val; output_ptr[f * 4 + 3] = w3 * in_val; @@ -567,7 +567,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float* __restrict__ in_col = input_ptr + f * in_stride; const float i0 = in_col[0]; const float i1 = in_col[1]; - output_ptr[f * 2] = w00 * i0 + w01 * i1; + output_ptr[f * 2] = w00 * i0 + w01 * i1; output_ptr[f * 2 + 1] = w10 * i0 + w11 * i1; } } @@ -584,7 +584,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float i1 = in_col[1]; const float i2 = in_col[2]; const float i3 = in_col[3]; - output_ptr[f * 2] = w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; + output_ptr[f * 2] = w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; output_ptr[f * 2 + 1] = w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; } } @@ -595,8 +595,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons for (int f = 0; f < num_frames; f++) { const float* __restrict__ in_col = input_ptr + f * in_stride; - output_ptr[f] = w0 * in_col[0] + w1 * in_col[1] - + w2 * in_col[2] + w3 * in_col[3]; + output_ptr[f] = w0 * in_col[0] + w1 * in_col[1] + w2 * in_col[2] + w3 * in_col[3]; } } else if (out_ch == 4 && in_ch == 2) @@ -608,7 +607,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float* __restrict__ in_col = input_ptr + f * in_stride; const float i0 = in_col[0]; const float i1 = in_col[1]; - output_ptr[f * 4] = w00 * i0 + w01 * i1; + output_ptr[f * 4] = w00 * i0 + w01 * i1; output_ptr[f * 4 + 1] = w10 * i0 + w11 * i1; output_ptr[f * 4 + 2] = w20 * i0 + w21 * i1; output_ptr[f * 4 + 3] = w30 * i0 + w31 * i1; @@ -628,7 +627,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float i0 = in_col[0]; const float i1 = in_col[1]; const float i2 = in_col[2]; - output_ptr[f * 3] = w00 * i0 + w01 * i1 + w02 * i2 + b0; + output_ptr[f * 3] = w00 * i0 + w01 * i1 + w02 * i2 + b0; output_ptr[f * 3 + 1] = w10 * i0 + w11 * i1 + w12 * i2 + b1; output_ptr[f * 3 + 2] = w20 * i0 + w21 * i1 + w22 * i2 + b2; } @@ -642,7 +641,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float i0 = in_col[0]; const float i1 = in_col[1]; const float i2 = in_col[2]; - output_ptr[f * 3] = w00 * i0 + w01 * i1 + w02 * i2; + output_ptr[f * 3] = w00 * i0 + w01 * i1 + w02 * i2; output_ptr[f * 3 + 1] = w10 * i0 + w11 * i1 + w12 * i2; output_ptr[f * 3 + 2] = w20 * i0 + w21 * i1 + w22 * i2; } @@ -650,9 +649,9 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons } else if (out_ch == 4 && in_ch == 4) { - const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2], w30 = weight_ptr[3]; - const float w01 = weight_ptr[4], w11 = weight_ptr[5], w21 = weight_ptr[6], w31 = weight_ptr[7]; - const float w02 = weight_ptr[8], w12 = weight_ptr[9], w22 = weight_ptr[10], w32 = weight_ptr[11]; + const float w00 = weight_ptr[0], w10 = weight_ptr[1], w20 = weight_ptr[2], w30 = weight_ptr[3]; + const float w01 = weight_ptr[4], w11 = weight_ptr[5], w21 = weight_ptr[6], w31 = weight_ptr[7]; + const float w02 = weight_ptr[8], w12 = weight_ptr[9], w22 = weight_ptr[10], w32 = weight_ptr[11]; const float w03 = weight_ptr[12], w13 = weight_ptr[13], w23 = weight_ptr[14], w33 = weight_ptr[15]; for (int f = 0; f < num_frames; f++) { @@ -661,7 +660,7 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float i1 = in_col[1]; const float i2 = in_col[2]; const float i3 = in_col[3]; - output_ptr[f * 4] = w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; + output_ptr[f * 4] = w00 * i0 + w01 * i1 + w02 * i2 + w03 * i3; output_ptr[f * 4 + 1] = w10 * i0 + w11 * i1 + w12 * i2 + w13 * i3; output_ptr[f * 4 + 2] = w20 * i0 + w21 * i1 + w22 * i2 + w23 * i3; output_ptr[f * 4 + 3] = w30 * i0 + w31 * i1 + w32 * i2 + w33 * i3; @@ -677,8 +676,8 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons const float i3 = in_col[3], i4 = in_col[4], i5 = in_col[5]; for (int o = 0; o < 6; o++) { - out_col[o] = weight_ptr[o] * i0 + weight_ptr[6 + o] * i1 + weight_ptr[12 + o] * i2 - + weight_ptr[18 + o] * i3 + weight_ptr[24 + o] * i4 + weight_ptr[30 + o] * i5; + out_col[o] = weight_ptr[o] * i0 + weight_ptr[6 + o] * i1 + weight_ptr[12 + o] * i2 + weight_ptr[18 + o] * i3 + + weight_ptr[24 + o] * i4 + weight_ptr[30 + o] * i5; } } } @@ -693,7 +692,8 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons for (int o = 0; o < 8; o++) { out_col[o] = weight_ptr[o] * i0 + weight_ptr[8 + o] * i1 + weight_ptr[16 + o] * i2 + weight_ptr[24 + o] * i3 - + weight_ptr[32 + o] * i4 + weight_ptr[40 + o] * i5 + weight_ptr[48 + o] * i6 + weight_ptr[56 + o] * i7; + + weight_ptr[32 + o] * i4 + weight_ptr[40 + o] * i5 + weight_ptr[48 + o] * i6 + + weight_ptr[56 + o] * i7; } } } @@ -708,7 +708,8 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons for (int o = 0; o < 4; o++) { out_col[o] = weight_ptr[o] * i0 + weight_ptr[4 + o] * i1 + weight_ptr[8 + o] * i2 + weight_ptr[12 + o] * i3 - + weight_ptr[16 + o] * i4 + weight_ptr[20 + o] * i5 + weight_ptr[24 + o] * i6 + weight_ptr[28 + o] * i7; + + weight_ptr[16 + o] * i4 + weight_ptr[20 + o] * i5 + weight_ptr[24 + o] * i6 + + weight_ptr[28 + o] * i7; } } } @@ -754,56 +755,56 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons #ifdef NAM_USE_INLINE_GEMM if (!bias_fused) { - const int out_ch = (int)get_out_channels(); - float* __restrict__ output_ptr = _output.data(); - const float* __restrict__ bias_ptr = this->_bias.data(); + const int out_ch = (int)get_out_channels(); + float* __restrict__ output_ptr = _output.data(); + const float* __restrict__ bias_ptr = this->_bias.data(); - // Specialized paths for common small channel counts - if (out_ch == 2) - { - const float b0 = bias_ptr[0], b1 = bias_ptr[1]; - for (int f = 0; f < num_frames; f++) + // Specialized paths for common small channel counts + if (out_ch == 2) { - const int off = f * 2; - output_ptr[off] += b0; - output_ptr[off + 1] += b1; + const float b0 = bias_ptr[0], b1 = bias_ptr[1]; + for (int f = 0; f < num_frames; f++) + { + const int off = f * 2; + output_ptr[off] += b0; + output_ptr[off + 1] += b1; + } } - } - else if (out_ch == 3) - { - const float b0 = bias_ptr[0], b1 = bias_ptr[1], b2 = bias_ptr[2]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 3) { - const int off = f * 3; - output_ptr[off] += b0; - output_ptr[off + 1] += b1; - output_ptr[off + 2] += b2; + const float b0 = bias_ptr[0], b1 = bias_ptr[1], b2 = bias_ptr[2]; + for (int f = 0; f < num_frames; f++) + { + const int off = f * 3; + output_ptr[off] += b0; + output_ptr[off + 1] += b1; + output_ptr[off + 2] += b2; + } } - } - else if (out_ch == 4) - { - const float b0 = bias_ptr[0], b1 = bias_ptr[1]; - const float b2 = bias_ptr[2], b3 = bias_ptr[3]; - for (int f = 0; f < num_frames; f++) + else if (out_ch == 4) { - const int off = f * 4; - output_ptr[off] += b0; - output_ptr[off + 1] += b1; - output_ptr[off + 2] += b2; - output_ptr[off + 3] += b3; + const float b0 = bias_ptr[0], b1 = bias_ptr[1]; + const float b2 = bias_ptr[2], b3 = bias_ptr[3]; + for (int f = 0; f < num_frames; f++) + { + const int off = f * 4; + output_ptr[off] += b0; + output_ptr[off + 1] += b1; + output_ptr[off + 2] += b2; + output_ptr[off + 3] += b3; + } } - } - else - { - for (int f = 0; f < num_frames; f++) + else { - float* __restrict__ out_col = output_ptr + f * out_ch; - for (int o = 0; o < out_ch; o++) + for (int f = 0; f < num_frames; f++) { - out_col[o] += bias_ptr[o]; + float* __restrict__ out_col = output_ptr + f * out_ch; + for (int o = 0; o < out_ch; o++) + { + out_col[o] += bias_ptr[o]; + } } } - } } // !bias_fused #else _output.leftCols(num_frames).colwise() += this->_bias; diff --git a/NAM/dsp.h b/NAM/dsp.h index c20a5163..1fadcf70 100644 --- a/NAM/dsp.h +++ b/NAM/dsp.h @@ -27,8 +27,8 @@ /// \brief Use a sample rate of -1 if we don't know what the model expects to be run at #define NAM_UNKNOWN_EXPECTED_SAMPLE_RATE -1.0 -#if defined(_MSC_VER) && !defined(__llvm__) -#define __restrict__ __restrict +#if defined(_MSC_VER) && !defined(__llvm__) + #define __restrict__ __restrict #endif namespace nam diff --git a/NAM/film.h b/NAM/film.h index 27685269..d41911d4 100644 --- a/NAM/film.h +++ b/NAM/film.h @@ -123,7 +123,7 @@ class FiLM int i = 0; for (; i + 3 < input_dim; i += 4) { - out_col[i] = in_col[i] * scale_col[i] + shift_col[i]; + out_col[i] = in_col[i] * scale_col[i] + shift_col[i]; out_col[i + 1] = in_col[i + 1] * scale_col[i + 1] + shift_col[i + 1]; out_col[i + 2] = in_col[i + 2] * scale_col[i + 2] + shift_col[i + 2]; out_col[i + 3] = in_col[i + 3] * scale_col[i + 3] + shift_col[i + 3]; @@ -161,7 +161,7 @@ class FiLM int i = 0; for (; i + 3 < input_dim; i += 4) { - out_col[i] = in_col[i] * scale_col[i]; + out_col[i] = in_col[i] * scale_col[i]; out_col[i + 1] = in_col[i + 1] * scale_col[i + 1]; out_col[i + 2] = in_col[i + 2] * scale_col[i + 2]; out_col[i + 3] = in_col[i + 3] * scale_col[i + 3]; diff --git a/NAM/gating_activations.h b/NAM/gating_activations.h index 0d52298c..ff21c3b5 100644 --- a/NAM/gating_activations.h +++ b/NAM/gating_activations.h @@ -72,7 +72,7 @@ class GatingActivation const int input_stride = (int)input.outerStride(); const float* __restrict__ input_ptr = input.derived().data(); float* __restrict__ output_ptr = output.derived().data(); - const int output_stride = (int)output.outerStride(); // Column stride for output + const int output_stride = (int)output.outerStride(); // Column stride for output for (int f = 0; f < num_samples; f++) { @@ -178,7 +178,7 @@ class BlendingActivation const int input_stride = (int)input.outerStride(); const float* __restrict__ input_ptr = input.derived().data(); float* __restrict__ output_ptr = output.derived().data(); - const int output_stride = (int)output.outerStride(); // Column stride for output + const int output_stride = (int)output.outerStride(); // Column stride for output for (int f = 0; f < num_samples; f++) { diff --git a/NAM/lstm.cpp b/NAM/lstm.cpp index 2828d50b..9169a7ee 100644 --- a/NAM/lstm.cpp +++ b/NAM/lstm.cpp @@ -179,8 +179,8 @@ nam::lstm::LSTMConfig nam::lstm::parse_config_json(const nlohmann::json& config) // LSTMConfig::create() std::unique_ptr nam::lstm::LSTMConfig::create(std::vector weights, double sampleRate) { - return std::make_unique(in_channels, out_channels, num_layers, input_size, hidden_size, weights, - sampleRate); + return std::make_unique( + in_channels, out_channels, num_layers, input_size, hidden_size, weights, sampleRate); } // Config parser for ConfigParserRegistry diff --git a/tools/render.cpp b/tools/render.cpp index c3cabb2a..4f50fa97 100644 --- a/tools/render.cpp +++ b/tools/render.cpp @@ -38,7 +38,7 @@ bool SaveWavFloat32(const char* fileName, const float* samples, size_t numSample const uint32_t fmtSize = 16; out.write("fmt ", 4); out.write(reinterpret_cast(&fmtSize), 4); - const uint16_t audioFormat = 3; // IEEE float + const uint16_t audioFormat = 3; // IEEE float out.write(reinterpret_cast(&audioFormat), 2); const uint16_t numChannels = 1; out.write(reinterpret_cast(&numChannels), 2); @@ -59,7 +59,7 @@ bool SaveWavFloat32(const char* fileName, const float* samples, size_t numSample return out.good(); } -} // namespace +} // namespace int main(int argc, char* argv[]) { @@ -138,8 +138,8 @@ int main(int argc, char* argv[]) const double expectedRate = model->GetExpectedSampleRate(); if (expectedRate > 0 && std::abs(inputSampleRate - expectedRate) > 0.5) { - std::cerr << "Error: Input WAV sample rate (" << inputSampleRate - << " Hz) does not match model expected rate (" << expectedRate << " Hz)\n"; + std::cerr << "Error: Input WAV sample rate (" << inputSampleRate << " Hz) does not match model expected rate (" + << expectedRate << " Hz)\n"; return 1; } diff --git a/tools/test/test_container.cpp b/tools/test/test_container.cpp index 9030e28a..993c9065 100644 --- a/tools/test/test_container.cpp +++ b/tools/test/test_container.cpp @@ -38,8 +38,9 @@ nlohmann::json build_container_json(const std::string& small_path, const std::st nlohmann::json container; container["version"] = "0.7.0"; container["architecture"] = "SlimmableContainer"; - container["config"]["submodels"] = nlohmann::json::array( - {{{"max_value", 0.33}, {"model", small_model}}, {{"max_value", 0.66}, {"model", medium_model}}, {{"max_value", 1.0}, {"model", large_model}}}); + container["config"]["submodels"] = nlohmann::json::array({{{"max_value", 0.33}, {"model", small_model}}, + {{"max_value", 0.66}, {"model", medium_model}}, + {{"max_value", 1.0}, {"model", large_model}}}); container["weights"] = nlohmann::json::array(); container["sample_rate"] = 48000; return container; @@ -74,21 +75,24 @@ void process_and_verify(nam::DSP* dsp, int num_buffers, int buffer_size) void test_container_loads_from_json() { - auto j = build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); + auto j = + build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); auto dsp = nam::get_dsp(j); assert(dsp != nullptr); } void test_container_processes_audio() { - auto j = build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); + auto j = + build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); auto dsp = nam::get_dsp(j); process_and_verify(dsp.get(), 3, 64); } void test_container_slimmable_selects_submodel() { - auto j = build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); + auto j = + build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); auto dsp = nam::get_dsp(j); const double sample_rate = 48000.0; const int buffer_size = 64; @@ -129,7 +133,8 @@ void test_container_slimmable_selects_submodel() void test_container_boundary_values() { - auto j = build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); + auto j = + build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); auto dsp = nam::get_dsp(j); const double sample_rate = 48000.0; const int buffer_size = 16; @@ -230,8 +235,8 @@ void test_container_unsorted_submodels_throws() nlohmann::json j; j["version"] = "0.7.0"; j["architecture"] = "SlimmableContainer"; - j["config"]["submodels"] = nlohmann::json::array( - {{{"max_value", 0.8}, {"model", small_json}}, {{"max_value", 0.5}, {"model", medium_json}}}); + j["config"]["submodels"] = + nlohmann::json::array({{{"max_value", 0.8}, {"model", small_json}}, {{"max_value", 0.5}, {"model", medium_json}}}); j["weights"] = nlohmann::json::array(); j["sample_rate"] = 48000; @@ -263,8 +268,8 @@ void test_container_sample_rate_mismatch_throws() nlohmann::json j; j["version"] = "0.7.0"; j["architecture"] = "SlimmableContainer"; - j["config"]["submodels"] = nlohmann::json::array( - {{{"max_value", 0.5}, {"model", model_44k}}, {{"max_value", 1.0}, {"model", model_48k}}}); + j["config"]["submodels"] = + nlohmann::json::array({{{"max_value", 0.5}, {"model", model_44k}}, {{"max_value", 1.0}, {"model", model_48k}}}); j["weights"] = nlohmann::json::array(); j["sample_rate"] = 48000; @@ -296,7 +301,8 @@ void test_container_load_from_file() void test_container_default_is_max_size() { - auto j = build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); + auto j = + build_container_json("example_models/lstm.nam", "example_models/wavenet.nam", "example_models/wavenet_a2_max.nam"); auto dsp = nam::get_dsp(j); const double sample_rate = 48000.0; const int buffer_size = 64; diff --git a/tools/test/test_noncontiguous_blocks.cpp b/tools/test/test_noncontiguous_blocks.cpp index 7044fd10..e72f342e 100644 --- a/tools/test/test_noncontiguous_blocks.cpp +++ b/tools/test/test_noncontiguous_blocks.cpp @@ -136,10 +136,10 @@ void test_conv1x1_process_toprows_2x2() conv.SetMaxBufferSize(64); Eigen::MatrixXf full_matrix(total_rows, num_frames); - full_matrix << 1.0f, 2.0f, 3.0f, // row 0 (top, used) - 4.0f, 5.0f, 6.0f, // row 1 (top, used) - 99.0f, 99.0f, 99.0f, // row 2 (bottom, NOT used) - 99.0f, 99.0f, 99.0f; // row 3 (bottom, NOT used) + full_matrix << 1.0f, 2.0f, 3.0f, // row 0 (top, used) + 4.0f, 5.0f, 6.0f, // row 1 (top, used) + 99.0f, 99.0f, 99.0f, // row 2 (bottom, NOT used) + 99.0f, 99.0f, 99.0f; // row 3 (bottom, NOT used) conv.process_(full_matrix.topRows(bottleneck), num_frames); const auto& output = conv.GetOutput(); @@ -236,12 +236,12 @@ void test_film_process_toprows_with_shift() // Configure Conv1x1 with zero weights, fixed biases for scale/shift std::vector weights((2 * input_dim) * condition_dim + (2 * input_dim), 0.0f); const int bias_offset = (2 * input_dim) * condition_dim; - weights[bias_offset + 0] = 2.0f; // scale[0] + weights[bias_offset + 0] = 2.0f; // scale[0] weights[bias_offset + 1] = -1.0f; // scale[1] - weights[bias_offset + 2] = 0.5f; // scale[2] + weights[bias_offset + 2] = 0.5f; // scale[2] weights[bias_offset + 3] = 10.0f; // shift[0] weights[bias_offset + 4] = -5.0f; // shift[1] - weights[bias_offset + 5] = 3.0f; // shift[2] + weights[bias_offset + 5] = 3.0f; // shift[2] auto it = weights.begin(); film.set_weights_(it); @@ -447,7 +447,7 @@ void test_gating_output_toprows() { for (int c = 0; c < bottleneck; c++) { - const float input_val = input(c, f); // identity activation + const float input_val = input(c, f); // identity activation const float gate_val = 1.0f / (1.0f + expf(-input(c + bottleneck, f))); // sigmoid const float expected = input_val * gate_val; assert(std::abs(output_matrix(c, f) - expected) < 1e-5f);