diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..b490243 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,27 @@ +name: CUDA CMake CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + runs-on: ubuntu-latest + container: + image: leonardeyer/cuda-builder:latest + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Configure with CMake + run: cmake -B build -S . + + - name: Build + run: cmake --build build --parallel + + - uses: actions/upload-artifact@v4 + with: + name: mps + path: build/mps diff --git a/.gitignore b/.gitignore index cb385db..c4b8c6f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ *.ptx *.cubin *.fatbin +.* +*__pycache__* \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b82b19..5bbc13c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,18 +35,7 @@ CPMAddPackage("gh:gabime/spdlog@1.8.2") include_directories(include) # mps -add_executable(mps - src/mps.cu - - include/tensor.h - include/decomposition.h - include/permutation.h - include/Operators.hpp - include/mps.h - include/tensordot.h - include/cutensor_utils.h - include/reduction.h -) +add_executable(mps src/mps.cu) target_link_libraries(mps PRIVATE cuTENSOR::cuTENSOR @@ -70,8 +59,7 @@ target_compile_options(cuda_compile_options INTERFACE target_link_libraries(mps PRIVATE cuda_compile_options) -set_target_properties(mps PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) +set_target_properties(mps PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # test_mps @@ -86,61 +74,4 @@ target_link_libraries(test_mps PRIVATE target_compile_options(test_mps PRIVATE $<$:--extended-lambda>) -set_target_properties(test_mps PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) - -# Add the test executable -add_executable(test_tensordot tests/test_tensordot.cu) - -# Link against CUTENSOR and CUDA libraries. -target_link_libraries(test_tensordot - cuTENSOR::cuTENSOR - CUDA::cudart - CUDA::cublas - spdlog::spdlog -) - -set_target_properties(test_tensordot PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) - - -add_executable(test_einsum tests/test_einsum.cu) - -# Link against CUTENSOR and CUDA libraries. -target_link_libraries(test_einsum - cuTENSOR::cuTENSOR - CUDA::cudart - CUDA::cublas - spdlog::spdlog -) - -set_target_properties(test_einsum PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) - -add_executable(test_permutation tests/test_permutation.cu) - -# Link against CUTENSOR and CUDA libraries. -target_link_libraries(test_permutation - cuTENSOR::cuTENSOR - CUDA::cudart - CUDA::cublas - spdlog::spdlog -) - -set_target_properties(test_permutation PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) - - -add_executable(test_truncation tests/test_reduction.cu) - -# Link against CUTENSOR and CUDA libraries. 
-target_link_libraries(test_truncation - cuTENSOR::cuTENSOR - CUDA::cudart - CUDA::cublas - spdlog::spdlog - cuda_compile_options -) - -set_target_properties(test_truncation PROPERTIES - CUDA_SEPARABLE_COMPILATION ON) +set_target_properties(test_mps PROPERTIES CUDA_SEPARABLE_COMPILATION ON) \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5e6dd37 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +# Use the NVIDIA CUDA 12.8.0 development image with Ubuntu 24.04 +FROM nvidia/cuda:12.8.0-devel-ubuntu24.04 + +# Set non-interactive mode for apt +ENV DEBIAN_FRONTEND=noninteractive + +# Install system dependencies +RUN apt-get update && apt-get install -y \ build-essential \ cmake \ git \ nvtop \ python3 \ python3-pip \ python3-dev \ libcutensor2 \ && rm -rf /var/lib/apt/lists/* + +# Install CuPy for CUDA 12.x +RUN python3 -m pip install cupy-cuda12x --break-system-packages diff --git a/README.md b/README.md index 8c92515..e0057be 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The main computational task involves decomposing a tensor of shape $(D_1, d, d, Since exact SVD decompositions do not parallelize well on GPUs ([ref](https://arxiv.org/pdf/2212.09782)) I opted for the existing one-sided Jacobi based implementation `cusolverDngesvdj` from [cuSolver](https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-gesvdj). -The implementation focused mainly on avoiding copying data to the host when possible and preallocating all the required memory / workspaces up front to aid getting the data ready for the decomposition. This involved using scractch tensors and custom out-of-place kernels to allow directly writing to the destination tensor memory. The truncation and normalization procedure was fused into a single kernel making use of cooperative groups to synchronize across blocks. +The implementation focuses mainly on avoiding copying data to the host when possible and pre-allocating all the required memory / workspaces up front to streamline getting the data ready for the decomposition. This involves using scratch tensors and custom out-of-place kernels that write directly to the destination tensor memory.
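For reference, the two-pass truncation-and-normalization scheme that replaces the fused cooperative-groups kernel (see the `include/reduction.h` hunk further down in this diff) reduces to the following logic. This is a minimal NumPy sketch, not code from the repository; the function name is illustrative, and `tol` plays the same role as in `truncation_normalize_count`:

```python
import numpy as np

def truncate_and_normalize(s, tol):
    """Sketch of the two device passes in include/reduction.h.

    s   : singular values, sorted in descending order
    tol : cutoff applied to the squared values
    (Assumes at least one squared value survives the cutoff.)
    """
    sq = s * s
    # Pass 1 (truncation_normalize_count): count the kept values and
    # accumulate their squared norm; the scan can stop at the first
    # squared value below tol because the input is sorted.
    count = int(np.sum(sq >= tol))
    norm_sq = np.sum(sq[:count])
    # Pass 2 (truncation_normalize_divide): rescale so the kept
    # spectrum has unit L2 norm.
    return count, s / np.sqrt(norm_sq)
```

Splitting the work into two launches avoids the grid-wide synchronization that the cooperative-groups version needed between counting and dividing.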
--- ##### cuTensorNet diff --git a/include/MPSSimulator.cuh b/include/MPSSimulator.cuh deleted file mode 100644 index e02f2e3..0000000 --- a/include/MPSSimulator.cuh +++ /dev/null @@ -1,362 +0,0 @@ -#pragma once - -#include "Operators.hpp" - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include // To extend fmt functionality for custom types -#include - - -#define HANDLE_CUDA_ERROR(x) \ -{ const auto err = x; \ - if( err != cudaSuccess ) \ - { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ -}; - -#define HANDLE_CUTN_ERROR(x) \ -{ const auto err = x; \ - if( err != CUTENSORNET_STATUS_SUCCESS ) \ - { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \ -}; - - - - -namespace mps { - using Scalar = float; - static auto dataType = CUDA_C_32F; - - static std::size_t fpsize = sizeof(Scalar); - - struct Extents { - std::vector > extents; - std::vector extentsPtr; - std::vector d_mpsTensors; - - explicit Extents(int32_t numQubits, int64_t maxExtent) : extentsPtr(numQubits), - d_mpsTensors(numQubits, nullptr) { - for (int32_t i = 0; i < numQubits; i++) { - if (i == 0) { - // left boundary MPS tensor - extents.push_back({2, maxExtent}); - HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fpsize)); - } else if (i == numQubits - 1) { - // right boundary MPS tensor - extents.push_back({maxExtent, 2}); - HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fpsize)); - } else { - // middle MPS tensors - extents.push_back({maxExtent, 2, maxExtent}); - HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fpsize)); - } - extentsPtr[i] = extents[i].data(); - } - } - - ~Extents() { - for (auto &d_mpsTensor: d_mpsTensors) { - HANDLE_CUDA_ERROR(cudaFree(d_mpsTensor)); - } - } - }; - - struct Simulator { - cutensornetHandle_t cutnHandle{nullptr}; - cutensornetState_t state{nullptr}; - - void *d_scratch{nullptr}; - std::size_t scratchSize{0}; - - cutensornetWorkspaceDescriptor_t workDesc{nullptr}; - - cutensornetStateExpectation_t expectation{nullptr}; - - int32_t numQubits; - std::vector qubitDims; - Extents extents; // Stores tensor dimensions for MPS representation. - - // Constructor initializes everything. - Simulator(int deviceId, int numQubits, int64_t maxExtent) : numQubits(numQubits), qubitDims(numQubits, 2), - extents(numQubits, maxExtent) { - - // Initialize the cuTensorNet library - HANDLE_CUDA_ERROR(cudaSetDevice(deviceId)); - HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle)); - - // Initialize the quantum state and MPS representation - { - HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, - CUTENSORNET_STATE_PURITY_PURE, - numQubits, - std::vector(numQubits, 2).data(), - dataType, - &state)); - - HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, state, - CUTENSORNET_BOUNDARY_CONDITION_OPEN, - extents.extentsPtr.data(), /*strides=*/nullptr)); - } - - // Allocate scratch memory - { - // Query the free memory on Device - std::size_t freeSize{0}, totalSize{0}; - HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize)); - scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment - HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize)); - } - // Setup workspace - { - HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc)); - } - - // Set up the SVD method for truncation. 
- { - // Optional, set up the SVD method for truncation. - cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; - HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, state, - CUTENSORNET_STATE_CONFIG_MPS_SVD_ALGO, &algo, sizeof(algo))); - - double absCutoff = 1e-12; - HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, state, - CUTENSORNET_STATE_CONFIG_MPS_SVD_ABS_CUTOFF, &absCutoff, sizeof(absCutoff))); - - /*double relCutoff = 1e-12; - HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, state, - CUTENSORNET_STATE_CONFIG_MPS_SVD_REL_CUTOFF, &relCutoff, sizeof(relCutoff)));*/ - - // CUTENSORNET_STATE_CONFIG_MPS_SVD_ALGO_PARAMS - // cutensornetGesvdrParams_t algoParams = CUTENSORNET_TENSOR_SVD_ALGO_PARAMS_DEFAULT; - - cutensornetTensorSVDNormalization_t normalization = CUTENSORNET_TENSOR_SVD_NORMALIZATION_L2; - HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, state, - CUTENSORNET_STATE_CONFIG_MPS_SVD_S_NORMALIZATION, &normalization, sizeof(normalization))); - } - } - - int64_t apply_operator(operators::DeviceOperator &op, const std::vector &qubits, bool unitary = true) { - int64_t id; - HANDLE_CUTN_ERROR( - cutensornetStateApplyTensorOperator(cutnHandle, state, - qubits.size(), - qubits.data(), - op.ptr(), - nullptr, 1, 0, unitary, &id)); - return id; - } - - void mps_factorization() { - // Prepare the MPS factorization & setup workspace memory requirements - { - spdlog::debug("Preparing MPS factorization..."); - HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, state, scratchSize, workDesc, 0x0)); - - double flops {0.0}; - HANDLE_CUTN_ERROR(cutensornetStateGetInfo(cutnHandle, state, - CUTENSORNET_STATE_INFO_FLOPS, &flops, sizeof(flops))); - - - spdlog::debug("Total flop count = {} GFlop", (flops / 1e9)); - if(flops < 0.0) { - spdlog::error("ERROR: Negative Flop count!"); - std::abort(); - } - - int64_t workSize = 0; - // Attach the workspace buffer - HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, - workDesc, - CUTENSORNET_WORKSIZE_PREF_MAX, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, - &workSize)); - spdlog::debug("Required scratch GPU workspace size (bytes) = {}", workSize); - if (workSize <= scratchSize) { - HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, - workSize)); - } else { - spdlog::error("ERROR: Insufficient workspace size on Device!"); - std::abort(); - } - - int64_t requiredWorkspaceSizeCache = 0; - HANDLE_CUTN_ERROR( cutensornetWorkspaceGetMemorySize(cutnHandle, - workDesc, - CUTENSORNET_WORKSIZE_PREF_MIN, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_CACHE, - - &requiredWorkspaceSizeCache) ); - - spdlog::debug("Required cache GPU workspace size (bytes) = {}", requiredWorkspaceSizeCache); - - - void* workCache = nullptr; - HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) ); - HANDLE_CUTN_ERROR( cutensornetWorkspaceSetMemory(cutnHandle, - workDesc, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_CACHE, - workCache, - requiredWorkspaceSizeCache) ); - } - - // Set the workspace buffer - auto &extentsPtr = extents.extentsPtr; - auto &d_mpsTensors = extents.d_mpsTensors; - - HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, state, - workDesc, extentsPtr.data(), /*strides=*/nullptr, - d_mpsTensors.data(), - 0x0)); - - /* - { - // cutensornetGetOutputStateDetails needs to be called 3 times to get all the details - auto numTensorsOut = 0; - 
HANDLE_CUTN_ERROR(cutensornetGetOutputStateDetails(cutnHandle, state, &numTensorsOut, nullptr, nullptr, nullptr)); - spdlog::debug("Number of output tensors: {}", numTensorsOut); - - auto numModesOut = std::vector(numTensorsOut); - HANDLE_CUTN_ERROR(cutensornetGetOutputStateDetails(cutnHandle, state, &numTensorsOut, numModesOut.data(), nullptr, nullptr)); - spdlog::debug("Number of modes for each tensor: [{}]", fmt::join(numModesOut, ", ")); - - auto extentsOutVec = std::vector>(numTensorsOut); - auto extentsOutPtr = std::vector(numTensorsOut); - for (int i = 0; i < numTensorsOut; i++) { - auto nModes = numModesOut[i]; - if (nModes <= 0) { - spdlog::error("Invalid number of modes for tensor {}: {}", i, nModes); - throw std::runtime_error("Invalid number of modes"); - } - extentsOutVec[i].resize(nModes); - extentsOutPtr[i] = extentsOutVec[i].data(); - } - - HANDLE_CUTN_ERROR(cutensornetGetOutputStateDetails(cutnHandle, state, &numTensorsOut, nullptr, extentsOutPtr.data(), nullptr)); - - spdlog::debug("MPS factorization computed successfully!"); - for (int i = 0; i < numTensorsOut; i++) { - spdlog::debug("Tensor {} extents: [{}]", i, fmt::join(extentsOutVec[i], ", ")); - } - } - */ - - } - - - Scalar expectation_value(int32_t index, const operators::DeviceOperator &op) { - - int64_t id; - // Create an empty tensor network operator (projector 00 - cutensornetNetworkOperator_t hamiltonian; - HANDLE_CUTN_ERROR( - cutensornetCreateNetworkOperator(cutnHandle, numQubits, qubitDims.data(), dataType, &hamiltonian)); - // Append component P0 (projector 00) to the tensor network operator - { - const int32_t numModes[] = {1}; - const int32_t modes[] = {index}; - const int32_t *stateModes[] = {modes}; - const void *gateData[] = {op.ptr()}; // GPU pointers to gate data - HANDLE_CUTN_ERROR( - cutensornetNetworkOperatorAppendProduct(cutnHandle, hamiltonian, cuDoubleComplex{1.0, 0.0}, - 1, numModes, stateModes, NULL, gateData, &id)); - } - - - HANDLE_CUTN_ERROR(cutensornetCreateExpectation(cutnHandle, state, hamiltonian, &expectation)); - - int32_t numHyperSamples = 8; - HANDLE_CUTN_ERROR(cutensornetExpectationConfigure(cutnHandle, expectation, - CUTENSORNET_EXPECTATION_CONFIG_NUM_HYPER_SAMPLES, - &numHyperSamples, sizeof(numHyperSamples))); - - // Prepare the specified quantum circuit expectation value for computation - HANDLE_CUTN_ERROR( - cutensornetExpectationPrepare(cutnHandle, expectation, scratchSize, workDesc, - 0x0)); - - spdlog::debug("Prepared the specified quantum circuit expectation value"); - auto flops = 0.0; - HANDLE_CUTN_ERROR(cutensornetExpectationGetInfo(cutnHandle, expectation, - CUTENSORNET_EXPECTATION_INFO_FLOPS, &flops, sizeof(flops))); - spdlog::debug("Total flop count = {} GFlop", (flops / 1e9)); - if (flops <= 0.0) { - spdlog::error("ERROR: Invalid Flop count!"); - std::abort(); - } - - { - int64_t workSize = 0; - // Attach the workspace buffer - HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, - workDesc, - CUTENSORNET_WORKSIZE_PREF_MAX, - CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, - &workSize)); - spdlog::debug("Required scratch GPU workspace size (bytes) = {}", workSize); - if (workSize <= scratchSize) { - HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, - CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, - workSize)); - } else { - spdlog::error("ERROR: Insufficient workspace size on Device!"); - std::abort(); - } - } - - const auto [expectVal, stateNorm2] = [&]() -> std::pair { - // Compute the 
specified quantum circuit expectation value - Complex expectVal{0.0, 0.0}, stateNorm2{0.0, 0.0}; - HANDLE_CUTN_ERROR(cutensornetExpectationCompute(cutnHandle, expectation, workDesc, - static_cast(&expectVal), - static_cast(&stateNorm2), 0x0)); - - return {expectVal, stateNorm2}; - }(); - - HANDLE_CUTN_ERROR(cutensornetDestroyNetworkOperator(hamiltonian)); - HANDLE_CUTN_ERROR(cutensornetDestroyExpectation(expectation)); - - return (expectVal / stateNorm2).real(); - } - - std::vector expectation_value(operators::DeviceOperator &op, const std::vector &qubits) { - std::vector probs{}; - probs.resize(qubits.size()); - for (int i = 0; i < qubits.size(); i++) { - probs[i] = expectation_value(qubits[i], op); - } - return probs; - } - - std::vector expectation_value(operators::DeviceOperator &op) { - std::vector probs{}; - probs.resize(numQubits); - for (int i = 0; i < numQubits; i++) { - probs[i] = expectation_value(i, op); - } - - return probs; - } - - ~Simulator() { - HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle)); - HANDLE_CUTN_ERROR(cutensornetDestroyState(state)); - - HANDLE_CUDA_ERROR(cudaFree(d_scratch)); - HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc)); - } - }; -} // namespace mps::simulator diff --git a/include/Operators.hpp b/include/Operators.hpp index 461cdb9..1f8b6f5 100644 --- a/include/Operators.hpp +++ b/include/Operators.hpp @@ -10,7 +10,6 @@ #include #include -// TODO: remove operators and simply merge into tensor namespace operators { using Scalar = ScalarPrecision; using Complex = std::complex; diff --git a/include/cusolver_utils.h b/include/cusolver_utils.h index e0cd7ce..7f9c3d6 100644 --- a/include/cusolver_utils.h +++ b/include/cusolver_utils.h @@ -1,45 +1,10 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - #pragma once -#include -#include #include -#include #include -#include -#include -#include #include #include -#include // cusolver API error checking @@ -61,241 +26,3 @@ throw std::runtime_error("cublas error"); \ } \ } while (0) - -// cublas API error checking -#define CUSPARSE_CHECK(err) \ - do { \ - cusparseStatus_t err_ = (err); \ - if (err_ != CUSPARSE_STATUS_SUCCESS) { \ - printf("cusparse error %d at %s:%d\n", err_, __FILE__, __LINE__); \ - throw std::runtime_error("cusparse error"); \ - } \ - } while (0) - -// memory alignment -#define ALIGN_TO(A, B) (((A + B - 1) / B) * B) - -// device memory pitch alignment -static const size_t device_alignment = 32; - -// type traits -template struct traits; - -template <> struct traits { - // scalar type - typedef float T; - typedef T S; - - static constexpr T zero = 0.f; - static constexpr cudaDataType cuda_data_type = CUDA_R_32F; -#if CUDART_VERSION >= 11000 - static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_32F; -#endif - - inline static S abs(T val) { return fabs(val); } - - template inline static T rand(RNG &gen) { return (S)gen(); } - - inline static T add(T a, T b) { return a + b; } - - inline static T mul(T v, S f) { return v * f; } -}; - -template <> struct traits { - // scalar type - typedef double T; - typedef T S; - - static constexpr T zero = 0.; - static constexpr cudaDataType cuda_data_type = CUDA_R_64F; -#if CUDART_VERSION >= 11000 - static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_64F; -#endif - - inline static S abs(T val) { return fabs(val); } - - template inline static T rand(RNG &gen) { return (S)gen(); } - - inline static T add(T a, T b) { return a + b; } - - inline static T mul(T v, S f) { return v * f; } -}; - -template <> struct traits { - // scalar type - typedef float S; - typedef cuFloatComplex T; - - static constexpr T zero = {0.f, 0.f}; - static constexpr cudaDataType cuda_data_type = CUDA_C_32F; -#if CUDART_VERSION >= 11000 - static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_32F; -#endif - - inline static S abs(T val) { return cuCabsf(val); } - - template inline static T rand(RNG &gen) { - return make_cuFloatComplex((S)gen(), (S)gen()); - } - - inline static T add(T a, T b) { return cuCaddf(a, b); } - inline static T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); } - - inline static T mul(T v, S f) { return make_cuFloatComplex(v.x * f, v.y * f); } -}; - -template <> struct traits { - // scalar type - typedef double S; - typedef cuDoubleComplex T; - - static constexpr T zero = {0., 0.}; - static constexpr cudaDataType cuda_data_type = CUDA_C_64F; -#if CUDART_VERSION >= 11000 - static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_64F; -#endif - - inline static S abs(T val) { return cuCabs(val); } - - template inline static T rand(RNG &gen) { - return make_cuDoubleComplex((S)gen(), (S)gen()); - } - - inline static T add(T a, T b) { return cuCadd(a, b); } - inline static T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); } - - inline static T mul(T v, S f) { return make_cuDoubleComplex(v.x * f, v.y * f); } -}; - -template void print_matrix(const int &m, const int &n, const T *A, const int &lda); - -template <> void print_matrix(const int &m, const int &n, const float *A, const int &lda) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - std::printf("%0.2f ", A[j * lda + i]); - } - std::printf("\n"); - } -} - -template <> void print_matrix(const int &m, const int &n, const 
double *A, const int &lda) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - std::printf("%0.2f ", A[j * lda + i]); - } - std::printf("\n"); - } -} - -template <> void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); - } - std::printf("\n"); - } -} - -template <> -void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); - } - std::printf("\n"); - } -} - -template -void generate_random_matrix(cusolver_int_t m, cusolver_int_t n, T **A, int *lda) { - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution::S> dis(-1.0, 1.0); - auto rand_gen = std::bind(dis, gen); - - *lda = n; - - size_t matrix_mem_size = static_cast(*lda * m * sizeof(T)); - // suppress gcc 7 size warning - if (matrix_mem_size <= PTRDIFF_MAX) - *A = (T *)malloc(matrix_mem_size); - else - throw std::runtime_error("Memory allocation size is too large"); - - if (*A == NULL) - throw std::runtime_error("Unable to allocate host matrix"); - - // random matrix and accumulate row sums - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - T *A_row = (*A) + *lda * i; - A_row[j] = traits::rand(rand_gen); - } - } -} - -// Makes matrix A of size mxn and leading dimension lda diagonal dominant -template -void make_diag_dominant_matrix(cusolver_int_t m, cusolver_int_t n, T *A, int lda) { - for (int i = 0; i < std::min(m, n); ++i) { - T *A_row = A + lda * i; - auto row_sum = traits::S>::zero; - for (int j = 0; j < n; ++j) { - row_sum += traits::abs(A_row[j]); - } - A_row[i] = traits::add(A_row[i], row_sum); - } -} - -// Returns cudaDataType value as defined in library_types.h for the string containing type name -cudaDataType get_cuda_library_type(std::string type_string) { - if (type_string.compare("CUDA_R_16F") == 0) - return CUDA_R_16F; - else if (type_string.compare("CUDA_C_16F") == 0) - return CUDA_C_16F; - else if (type_string.compare("CUDA_R_32F") == 0) - return CUDA_R_32F; - else if (type_string.compare("CUDA_C_32F") == 0) - return CUDA_C_32F; - else if (type_string.compare("CUDA_R_64F") == 0) - return CUDA_R_64F; - else if (type_string.compare("CUDA_C_64F") == 0) - return CUDA_C_64F; - else if (type_string.compare("CUDA_R_8I") == 0) - return CUDA_R_8I; - else if (type_string.compare("CUDA_C_8I") == 0) - return CUDA_C_8I; - else if (type_string.compare("CUDA_R_8U") == 0) - return CUDA_R_8U; - else if (type_string.compare("CUDA_C_8U") == 0) - return CUDA_C_8U; - else if (type_string.compare("CUDA_R_32I") == 0) - return CUDA_R_32I; - else if (type_string.compare("CUDA_C_32I") == 0) - return CUDA_C_32I; - else if (type_string.compare("CUDA_R_32U") == 0) - return CUDA_R_32U; - else if (type_string.compare("CUDA_C_32U") == 0) - return CUDA_C_32U; - else - throw std::runtime_error("Unknown CUDA datatype"); -} - -// Returns cusolverIRSRefinement_t value as defined in cusolver_common.h for the string containing -// solver name -cusolverIRSRefinement_t get_cusolver_refinement_solver(std::string solver_string) { - if (solver_string.compare("CUSOLVER_IRS_REFINE_NONE") == 0) - return CUSOLVER_IRS_REFINE_NONE; - else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL") == 0) - return CUSOLVER_IRS_REFINE_CLASSICAL; - else if 
(solver_string.compare("CUSOLVER_IRS_REFINE_GMRES") == 0) - return CUSOLVER_IRS_REFINE_GMRES; - else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL_GMRES") == 0) - return CUSOLVER_IRS_REFINE_CLASSICAL_GMRES; - else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES_GMRES") == 0) - return CUSOLVER_IRS_REFINE_GMRES_GMRES; - else - printf("Unknown solver parameter: \"%s\"\n", solver_string.c_str()); - - return CUSOLVER_IRS_REFINE_NOT_SET; -} diff --git a/include/decomposition.h b/include/decomposition.h index eca4452..175ac0a 100644 --- a/include/decomposition.h +++ b/include/decomposition.h @@ -420,6 +420,7 @@ namespace decomposition { } } + // TODO: To perform fixed extent truncation, we directly set maxExtent to half of the full extent corresponding to exact SVD. void compute(tensor::DeviceTensor &A, tensor::DeviceTensor &S, tensor::DeviceTensor &U, diff --git a/include/reduction.h b/include/reduction.h index eef4c76..d57b805 100644 --- a/include/reduction.h +++ b/include/reduction.h @@ -30,13 +30,13 @@ namespace reduction { auto stride = blockDim.x * gridDim.x; while (idx < n) { - auto val = input[idx]; + auto val = input[idx] * input[idx]; if (val < tol) { // since the input is sorted, we can break here return {sum, norm}; } sum++; - norm += val * val; + norm += val; idx += stride; } return {sum, norm}; } @@ -116,14 +116,11 @@ atomicAdd(count_d, block_count_sum); atomicAdd(normsq_d, block_norm_sum); } + } - - // now we normalize the count for this we want a global sync to make sure all blocks have finished - // Create a grid-level group and synchronize all blocks. - cg::grid_group grid = cg::this_grid(); - grid.sync(); - - // We could actually just divide up to count_d - thread_divide(input, n, block_norm_sum); + template <typename T, typename Scalar> + __global__ void truncation_normalize_divide(T *input, int n, Scalar division) { + thread_divide(input, n, division); } template @@ -139,21 +136,18 @@ size_t aligned_offset = align_up>(base_offset); size_t sharedMemSize = aligned_offset + blockSize * sizeof(Scalar); - void *kernelArgs[] = { - (void *)&count_d, - (void *)&normsq_d, - (void *)&input_d, - (void *)&n, - (void *)&tol - }; - - CUDA_CHECK(cudaLaunchCooperativeKernel( - (void *)truncation_normalize_count, - gridSize, blockSize, kernelArgs, sharedMemSize, stream - )); + truncation_normalize_count<<<gridSize, blockSize, sharedMemSize, stream>>>(count_d, normsq_d, input_d, n, tol); + + // Copy results back to host int count; + Scalar norm; CUDA_CHECK(cudaMemcpyAsync(&count, count_d, sizeof(int), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(&norm, normsq_d, sizeof(Scalar), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Launch normalization kernel + const Scalar divisor = sqrt(norm); + truncation_normalize_divide<<<gridSize, blockSize, 0, stream>>>(input_d, n, divisor); return count; } diff --git a/julia/main.jl b/julia/main.jl new file mode 100644 index 0000000..c47d363 --- /dev/null +++ b/julia/main.jl @@ -0,0 +1,85 @@ +using ITensors, ITensorMPS +using Printf +using CUDA + +""" + Runs an MPS simulation for a Trotterized quantum circuit. + + Args: + T: Number of Trotter steps. + L: Number of sites in the chain. + chi: Maximum bond dimension. + theta: Angle for the Rx and CRx gates.
+""" +function run_mps_simulation(; T=50, L=30, chi=1024, theta=0.14, verbose=true) + cutoff = 1e-12 + # Initialize the sites for the 1D quantum system + sites = siteinds("S=1/2", L) + states = ["Dn" for n in 1:L] + + # Initialize MPS to the |11...1> state + psi = cu(MPS(sites, states)) + # Record mean-P1 at each step + mean_expectations = Float64[] + + # Perform the Trotterized time evolution + for t in 1:T + # Trotter sweep (4 layers of two-site gates) + # Layer 1: CRX on even sites (1-indexed: 1, 3, 5...) + for i in 1:2:(L-1) + op_crx = cu(op("CRx", sites[i], sites[i+1]; θ=theta)) + psi = apply(op_crx, psi; maxdim=chi, cutoff=cutoff) + end + + # Layer 2: CLRX on odd sites (1-indexed: 2, 4, 6...) + for i in 2:2:(L-1) + op_clrx = cu(op("CRx", sites[i+1], sites[i]; θ=theta)) + psi = apply(op_clrx, psi; maxdim=chi, cutoff=cutoff) + end + + # Layer 3: CRX on odd sites (1-indexed: 2, 4, 6...) + for i in 2:2:(L-1) + op_crx = cu(op("CRx", sites[i], sites[i+1]; θ=theta)) + psi = apply(op_crx, psi; maxdim=chi, cutoff=cutoff) + end + + # Layer 4: CLRX on even sites (1-indexed: 1, 3, 5...) + for i in 1:2:(L-1) + op_clrx = cu(op("CRx", sites[i+1], sites[i]; θ=theta)) + psi = apply(op_clrx, psi; maxdim=chi, cutoff=cutoff) + end + + # Compute mean ⟨P1⟩ on all sites + exps = expect(psi, "Proj1") + total = sum(exps) + mean = total / L + push!(mean_expectations, mean) + + # Bond-dimension stats + dims = linkdims(psi) + if verbose + @printf "t = %2d, max(chi) = %4d, mean(chi) = %8.2f\n" t maximum(dims) sum(dims) / L +# for (i, d) in enumerate(dims) +# left_dim = i == 1 ? 1 : dim(linkind(psi, i-1)) +# phys_dim = dim(siteind(psi, i)) +# right_dim = i == L ? 1 : dim(linkind(psi, i)) +# println(" Site $(i-1): chi = $right_dim: shape = ($left_dim, $phys_dim, $right_dim)") +# end + end + end + + # 7. 
Final printout + if verbose + print("[1, " * join([@sprintf("%.6f", m) for m in mean_expectations], ", ") * "]\n") + end +end + +# call the function once for warming up +run_mps_simulation(verbose=false) + +let + start_time = time() + run_mps_simulation(verbose=false) + dt = (time() - start_time) * 1000 + @printf "Execution time: %.0f ms\n" dt +end diff --git a/python/Notebook.ipynb b/python/Notebook.ipynb deleted file mode 100644 index b48ea07..0000000 --- a/python/Notebook.ipynb +++ /dev/null @@ -1,813 +0,0 @@ -{ - "cells": [ - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:09.715602Z", - "start_time": "2025-02-12T09:30:09.706367Z" - } - }, - "cell_type": "code", - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np" - ], - "id": "42ad527ae2278efa", - "outputs": [], - "execution_count": 74 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:09.747371Z", - "start_time": "2025-02-12T09:30:09.721308Z" - } - }, - "cell_type": "code", - "source": [ - "def plus():\n", - " return np.array([1, 1]) / np.sqrt(2)\n", - "\n", - "def minus():\n", - " return np.array([1, -1]) / np.sqrt(2)\n", - "\n", - "def null():\n", - " return np.array([1, 0])\n", - "\n", - "def one():\n", - " return np.array([0, 1])\n", - "\n", - "def multi_kron(*args):\n", - " if len(args) == 1:\n", - " return args[0]\n", - " return np.kron(args[0], multi_kron(*args[1:]))\n", - "\n", - "\n", - "def Rxy(theta):\n", - " cos_theta_2 = np.cos(theta / 2)\n", - " sin_theta_2 = np.sin(theta / 2)\n", - " return np.array([\n", - " [1, 0, 0, 0],\n", - " [0, cos_theta_2, -1j * sin_theta_2, 0],\n", - " [0, -1j * sin_theta_2, cos_theta_2, 0],\n", - " [0, 0, 0, 1]\n", - " ])\n", - "\n", - "def Rx(theta):\n", - " cos_theta_2 = np.cos(theta / 2)\n", - " sin_theta_2 = np.sin(theta / 2)\n", - " return np.array([\n", - " [cos_theta_2, -1j * sin_theta_2],\n", - " [-1j * sin_theta_2, cos_theta_2]\n", - " ])\n", - "\n", - "def SWAP():\n", - " return np.array([\n", - " [1, 0, 0, 0],\n", - " [0, 0, 1, 0],\n", - " [0, 1, 0, 0],\n", - " [0, 0, 0, 1]\n", - " ])\n", - "\n", - "def CNOT():\n", - " return np.array([\n", - " [1, 0, 0, 0],\n", - " [0, 1, 0, 0],\n", - " [0, 0, 0, 1],\n", - " [0, 0, 1, 0]\n", - " ])\n", - "\n", - "\n", - "def X():\n", - " return np.array([[0, 1], [1, 0]])\n", - "\n", - "def Z():\n", - " return np.array([[1, 0], [0, -1]])\n", - "\n", - "def Y():\n", - " return 1j * X() @ Z()\n", - "\n", - "def H():\n", - " return 1 / np.sqrt(2) * (X() + Z())\n", - "\n", - "\n", - "def Z_spider(alpha, n=1, m=1):\n", - "\n", - " zeros_in = multi_kron([null()] * m)\n", - " zeros_out = multi_kron([null()] * n)\n", - "\n", - " ones_in = multi_kron([one()] * m)\n", - " ones_out = multi_kron([one()] * n)\n", - "\n", - " return np.outer(zeros_in, zeros_out) + np.exp(1j*alpha) * np.outer(ones_in, ones_out)\n", - "\n", - "def X_spider(alpha, n=1, m=1):\n", - "\n", - " plus_in = multi_kron([plus()] * m)\n", - " plus_out = multi_kron([plus()] * n)\n", - "\n", - " minus_in = multi_kron([minus()] * m)\n", - " minus_out = multi_kron([minus()] * n)\n", - "\n", - " return np.outer(plus_in, plus_out) + np.exp(1j*alpha) * np.outer(minus_in, minus_out)\n", - "\n", - "\n", - "# Projection operators\n", - "def P0():\n", - " return np.outer(ket0(), ket0())\n", - "\n", - "def P1():\n", - " return np.outer(ket1(), ket1())\n", - "\n", - "def CONTROL(U):\n", - " return np.kron(P0(), np.eye(2)) + np.kron(P1(), U)\n", - "\n", - "def CONTROL_L(U):\n", - " return np.kron(np.eye(2), P0()) + np.kron(U, P1())\n", - 
"\n", - "def ket0():\n", - " \"\"\"Representing spin up\"\"\"\n", - " return np.array([1, 0])\n", - "\n", - "def ket1():\n", - " \"\"\"Representing spin down\"\"\"\n", - " return np.array([0, 1])\n", - "\n", - "def bin2dec(b):\n", - " return int(''.join(str(i) for i in b), 2)\n", - "\n", - "def dec2bin(d, L):\n", - " return [int(i) for i in bin(d)[2:].zfill(L)]\n", - "\n", - "def ket(i, dim=2):\n", - " assert 0 <= i < dim\n", - " return np.eye(dim)[:, i]\n", - "\n", - "def ket_bin(b):\n", - " dec = bin2dec(b)\n", - " dim = 2 ** len(b)\n", - " return ket(dec, dim)\n", - "\n", - "def make_state(L):\n", - " # L times spin down, 1 time spin up\n", - " return np.array([ket1()] * L + [ket0()])\n", - "\n", - "def measure_operator(op, state, qbit_index):\n", - " # measure the expectation value of the operator op on the qbit_index-th qbit of the state\n", - " # return the expectation value and the normalized state\n", - " # op is a 2x2 matrix\n", - " # state is a 2^L dimensional vector\n", - " # qbit_index is an integer\n", - " # return a tuple (expectation_value, normalized_state)\n", - "\n", - " n_qbits = int(np.log2(len(state)))\n", - " assert n_qbits > qbit_index\n", - " assert op.shape == (2, 2)\n", - "\n", - " ops = [np.eye(2)] * n_qbits\n", - " ops[qbit_index] = op\n", - "\n", - " op = None\n", - " for i in range(n_qbits):\n", - " if op is None:\n", - " op = ops[i]\n", - " else:\n", - " op = np.kron(op, ops[i])\n", - "\n", - " unnormalized_expectation_value = np.conj(state) @ op @ state\n", - " norm = np.sqrt(np.conj(state) @ state)\n", - "\n", - " return np.real(unnormalized_expectation_value), np.real(norm)\n", - "\n", - "\n", - "def batched_expectation_value(op, state, qbit_indices):\n", - " return [measure_operator(op, state, qbit_index) for qbit_index in qbit_indices]\n" - ], - "id": "70f159042b2b5568", - "outputs": [], - "execution_count": 75 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:09.772938Z", - "start_time": "2025-02-12T09:30:09.765310Z" - } - }, - "cell_type": "code", - "source": [ - "phi = 0.1327\n", - "theta = 0.14\n", - "\n", - "U = CONTROL(Rx(theta))\n", - "U_L = CONTROL_L(Rx(theta))\n", - "state = np.kron(ket1(), ket1())\n", - "\n", - "state_t = U_L @ U @ state\n", - "expectations = batched_expectation_value(P1(), state_t, [0, 1])\n", - "expectations = np.array([np.real(e / norm) for e, norm in expectations])\n", - "print(f\"Normalized expectation values: {expectations}\")" - ], - "id": "60138f2ecb0c7f00", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Normalized expectation values: [0.99513193 0.995108 ]\n" - ] - } - ], - "execution_count": 76 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:09.811442Z", - "start_time": "2025-02-12T09:30:09.792107Z" - } - }, - "cell_type": "code", - "source": [ - "def apply_local_op(op, i, M):\n", - " \"\"\"\n", - " Apply a local operator op to site i of an mps tensor M.\n", - " \"\"\"\n", - " M[i] = np.einsum(\"ij,aib->ajb\", op, M[i])\n", - "\n", - "def wavefunction(M):\n", - " L, D, d, _ = M.shape\n", - "\n", - " # Initialize the left boundary as a 2-D array with shape (1, D).\n", - " left = np.zeros((1, D))\n", - " left[0, 0] = 1.0 # v_left = [1, 0, ..., 0]\n", - "\n", - " # Iteratively contract each site.\n", - " # At each step, left has shape (d**site, D).\n", - " for site in range(L):\n", - " # M[site] has shape (D, d, D)\n", - " # We want to contract the bond index of left (axis 1) with the first index of M[site].\n", - " # The contraction 
below yields an array of shape (d**site, d, D).\n", - " left = np.tensordot(left, M[site], axes=([1], [0]))\n", - " # Merge the accumulated physical indices into one axis.\n", - " left = left.reshape(-1, D) # Now left has shape (d**(site+1), D)\n", - "\n", - " # The right boundary: a vector of shape (D,) with a 1 in the first entry.\n", - " right = np.zeros((D,))\n", - " right[0] = 1.0 # v_right = [1, 0, ..., 0]\n", - "\n", - " # Final contraction: contract the bond index of left with the right boundary.\n", - " psi = np.dot(left, right) # psi has shape (d**L,)\n", - "\n", - " return psi\n", - "\n", - "def expectation_site(O, site, M, v_left=None, v_right=None):\n", - " L, D, d, _ = M.shape\n", - " # Default boundaries: choose the minimal subspace.\n", - " if v_left is None:\n", - " v_left = np.zeros(D, dtype=complex)\n", - " v_left[0] = 1.0\n", - " if v_right is None:\n", - " v_right = np.zeros(D, dtype=complex)\n", - " v_right[0] = 1.0\n", - "\n", - " E_left = np.outer(v_left, v_left.conjugate())\n", - "\n", - " # Contract sites 0 to site-1 to build the left environment.\n", - " for j in range(site):\n", - " E_left = np.einsum(\"xy,xsz,ysw->zw\", E_left, M[j], M[j].conjugate())\n", - "\n", - " E_right = np.outer(v_right, v_right.conjugate())\n", - " for j in range(L-1, site, -1):\n", - " E_right = np.einsum(\"asz,bsw,zw->ab\", M[j], M[j].conjugate(), E_right)\n", - " F = np.einsum(\"asb,st,atd->bd\", M[site], O, M[site].conjugate())\n", - "\n", - " val = np.einsum(\"ab,ab->\", E_left, F * E_right)\n", - " return np.real(val)\n", - "\n", - "\n", - "def expectation_sites(O, sites, M, v_left=None, v_right=None):\n", - " expectations = []\n", - " for site in sites:\n", - " expectations.append(expectation_site(O, site, M, v_left, v_right))\n", - " return np.array(expectations)\n", - "\n", - "def canonical_form(M):\n", - " L = len(M) # Number of tensors (sites)\n", - "\n", - " # Sweep from left to right: to bring the mps into left-canonical form.\n", - " for i in range(1, L-1):\n", - " # The tensor has shape (D, d, D), so we reshape it to (D * d, D)\n", - " tensor = M[i]\n", - " print(f\"{tensor.shape = }\")\n", - " reshaped_tensor = tensor.reshape(-1, tensor.shape[2]) # Shape (D * d, D)\n", - " print(f\"{reshaped_tensor.shape = }\")\n", - "\n", - " # Perform QR decomposition: reshaped_tensor = U * S\n", - " # U has dimensions (D*d, D), S has dimensions (D, D)\n", - " Q, R = np.linalg.qr(reshaped_tensor)\n", - "\n", - " # apply S to the right tensor\n", - " M[i+1] = np.einsum(\"ij,jkl->ikl\", R, M[i+1])\n", - "\n", - " # Reshape Q to (D, d, new_D) and update tensor\n", - " M[i] = Q.reshape(tensor.shape[0], tensor.shape[1], -1)\n", - " print(f\"{M[i].shape = }\")\n", - "\n", - " Ss = []\n", - " # Sweep from right to left: to compute singular values\n", - " for i in range(L-1, -1, -1):\n", - " print(f\"{i = }\")\n", - " tensor = M[i]\n", - " reshaped_tensor = tensor.reshape(tensor.shape[0], -1) # Shape (D, d * D)\n", - " # svd: reshaped_tensor = U * S * V^T\n", - " U, S, V = np.linalg.svd(reshaped_tensor, full_matrices=False)\n", - "\n", - " # store singular values\n", - " Ss.append(S)\n", - "\n", - " # TODO cutoff and renormalize\n", - "\n", - " # apply U to the left tensor M_i-1 @ U\n", - " M[i-1] = np.einsum(\"ijk,kl->ijl\", M[i-1], U)\n", - "\n", - " # store V in the right tensor M_i\n", - " M[i] = V.reshape(tensor.shape[0], tensor.shape[1], -1)\n", - "\n", - " return M, Ss" - ], - "id": "174f6e4c5482a128", - "outputs": [], - "execution_count": 77 - }, - { - "metadata": { - "ExecuteTime": { 
- "end_time": "2025-02-12T09:30:09.835266Z", - "start_time": "2025-02-12T09:30:09.828843Z" - } - }, - "cell_type": "code", - "source": [ - "# Set the mps parameters\n", - "d = 2; D = 2; L = 4\n", - "\n", - "# Initialize the mps tensor\n", - "M = np.zeros((L,D,d,D))\n", - "\n", - "# initialize to |00..0>\n", - "for i in range(L):\n", - " M[i][0][0][0] = 1\n", - "\n", - "apply_local_op(X(), 3, M)\n", - "wavefunc = wavefunction(M)\n", - "\n", - "exp_site_3 = expectation_sites(P0(), range(L), M)\n", - "print(exp_site_3)\n", - "\n", - "M_i, S = canonical_form(M)\n", - "\n", - "print(S)" - ], - "id": "a59b0ff20eea8580", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1. 1. 1. 0.]\n", - "tensor.shape = (2, 2, 2)\n", - "reshaped_tensor.shape = (4, 2)\n", - "M[i].shape = (2, 2, 2)\n", - "tensor.shape = (2, 2, 2)\n", - "reshaped_tensor.shape = (4, 2)\n", - "M[i].shape = (2, 2, 2)\n", - "i = 3\n", - "i = 2\n", - "i = 1\n", - "i = 0\n", - "[array([1., 0.]), array([1.41421356, 0. ]), array([1.41421356, 0. ]), array([1., 0.])]\n" - ] - } - ], - "execution_count": 78 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:09.936898Z", - "start_time": "2025-02-12T09:30:09.908725Z" - } - }, - "cell_type": "code", - "source": [ - "def contract_MPS_MPO_expectation(psi, O):\n", - " \"\"\"\n", - " Compute the contraction for one site corresponding to\n", - " \n", - "\n", - " Parameters:\n", - " psi : 3D array, mps tensor of shape (D, d, D)\n", - " O : 2D array, local operator of shape (d, d)\n", - "\n", - " Returns:\n", - " result : float, the expectation value \n", - " \"\"\"\n", - "\n", - " ket = np.einsum(\"ij,kjl->ikl\", O, psi)\n", - " bra = psi.conj()\n", - "\n", - " # inner product \n", - " return np.vdot(bra, ket)\n", - "\n", - "class Site:\n", - " def __init__(self, d, D):\n", - " self.d = d\n", - " self.D = D\n", - "\n", - " self.tensor = np.zeros((D, d, D))\n", - " self.tensor[0, 0, 0] = 1.0 # Initialize to |0>\n", - "\n", - " self.S = None # np.zeros((D, D))\n", - " self.canonical = False\n", - "\n", - "class MPS:\n", - " sites = [] # List of Site objects\n", - "\n", - " def __init__(self, L, chi):\n", - " self.L = L\n", - " self.d = 2\n", - " self.chi = chi\n", - " self.sites = [Site(d, 1) for _ in range(L+1)]\n", - "\n", - " def apply_local_op(self, i, op):\n", - " \"\"\"\n", - " Apply a local operator op to site i of the mps.\n", - " \"\"\"\n", - " self.sites[i].tensor = np.einsum(\"ij,aib->ajb\", op, self.sites[i].tensor)\n", - "\n", - " def canonical_form(self):\n", - "\n", - " for i in range(0, self.L-1):\n", - " # The tensor has shape (D, d, D), so we reshape it to (D * d, D)\n", - " tensor = self.sites[i].tensor\n", - " reshaped_tensor = tensor.reshape(-1, tensor.shape[2])\n", - "\n", - " # Perform QR decomposition: reshaped_tensor = U * S\n", - " # U has dimensions (D*d, D), S has dimensions (D, D)\n", - " Q, R = np.linalg.qr(reshaped_tensor)\n", - "\n", - " # apply S to the right tensor\n", - " self.sites[i+1].tensor = np.einsum(\"ij,jkl->ikl\", R, self.sites[i+1].tensor)\n", - "\n", - " # Reshape Q to (D, d, new_D) and update tensor\n", - " self.sites[i].tensor = Q.reshape(tensor.shape[0], tensor.shape[1], -1)\n", - "\n", - " # right boundary singular values\n", - " # self.sites[self.L].S = np.array([1.0])\n", - "\n", - " for i in range(self.L-1, -1, -1):\n", - " tensor = self.sites[i].tensor\n", - " reshaped_tensor = tensor.reshape(tensor.shape[0], -1)\n", - " # svd: reshaped_tensor = U * S * V^T\n", - " U, S, V = 
np.linalg.svd(reshaped_tensor, full_matrices=False)\n", - "\n", - " # store singular values\n", - " self.sites[i].S = S / np.linalg.norm(S)\n", - "\n", - " # store V in the right tensor M_i\n", - " self.sites[i].tensor = V.reshape(tensor.shape[0], tensor.shape[1], -1)\n", - "\n", - " if i > 0:\n", - " # apply U to the left tensor M_i-1 @ U\n", - " self.sites[i-1].tensor = np.einsum(\"ijk,kl->ijl\", self.sites[i-1].tensor, U)\n", - "\n", - " # save phase\n", - " # assert (U[0, 0] == 1)\n", - " self.sites[0].tensor *= U[0, 0]\n", - " if U[0, 0] != 1:\n", - " print(f\"Phase: {U[0, 0]}\")\n", - "\n", - " # left boundary singular values\n", - " # self.sites[0].S = np.array([1.0])\n", - " self.sites[self.L].S = self.sites[0].S\n", - "\n", - "\n", - " def get_B(self, i, form=(True, True)):\n", - " a, b = form\n", - "\n", - " B = self.sites[i].tensor\n", - " SL = np.diag(self.sites[i].S)\n", - " SR = np.diag(self.sites[i+1].S)\n", - "\n", - " if a:\n", - " B = np.einsum(\"ij,jkl->ikl\", SL, B)\n", - "\n", - " if b:\n", - " B = np.einsum(\"ijk,kl->ijl\", B, SR)\n", - "\n", - " return B\n", - "\n", - " def theta(self, i, n=2):\n", - " \"\"\"Calculates the `n`-site wavefunction on ``sites[i:i+n]``.\n", - "\n", - " Parameters\n", - " ----------\n", - " i : int\n", - " Site index.\n", - " n : int\n", - " Number of sites. The result lives on ``sites[i:i+n]``.\n", - " \"\"\"\n", - " if n == 1:\n", - " return self.get_B(i, form=(True, True))\n", - " if n == 2:\n", - " b1 = self.get_B(i, form=(True, False))\n", - " b2 = self.get_B(i+1, form=(True, True))\n", - " return np.tensordot(b1, b2, axes=1)\n", - " else:\n", - " raise NotImplementedError(\"n > 2 not implemented\")\n", - "\n", - " def apply_2site_op(self, i, O):\n", - " \"\"\"\n", - " Apply a 2-site operator O to sites i and i+1.\n", - "\n", - " Parameters\n", - " ----------\n", - " i : int\n", - " Site index.\n", - " O : np.ndarray\n", - " (4x4) operator acting on sites i and i+1.\n", - " \"\"\"\n", - " th = self.theta(i, n=2) # shape (D1, d, d, D2)\n", - "\n", - " # reshape th into (D1, d*d, D2)\n", - " D1, d, _, D2 = th.shape\n", - " th = th.reshape(D1, d*d, D2)\n", - "\n", - " # apply the operator\n", - " th = np.einsum('ij,ajk->aik', O, th)\n", - "\n", - " # Reshape back to (D1, d, d, D2)\n", - " th = th.reshape(D1, d, d, D2)\n", - "\n", - " matrix = th.reshape(d*D1, d*D2) # Shape: (2, 2)\n", - "\n", - " # Perform SVD and truncate to bond dimension 1\n", - " U, S, Vh = np.linalg.svd(matrix, full_matrices=False)\n", - "\n", - " # determine the truncation dimension (bond dimension) based on the singular values\n", - " trunc = np.sum(S > 1e-12) # number of singular values larger than 1e-12\n", - " if trunc > self.chi:\n", - " trunc = self.chi\n", - "\n", - " U = U[:, :trunc]\n", - " S = S[:trunc]\n", - " Vh = Vh[:trunc, :]\n", - "\n", - " # Absorb singular values into one tensor\n", - " A_new = U.reshape(D1, d, trunc)\n", - " B_new = Vh.reshape(trunc, d, D2)\n", - "\n", - " self.sites[i].tensor = A_new\n", - " self.sites[i+1].tensor = B_new\n", - " self.sites[i+1].S = S / np.linalg.norm(S)\n", - "\n", - "\n", - " def expectation_value(self, O, sites):\n", - "\n", - " expectations = []\n", - " for i in sites:\n", - " theta = self.theta(i, n=1)\n", - "\n", - " expectations.append(contract_MPS_MPO_expectation(theta, O))\n", - "\n", - " return np.array(expectations)\n", - "\n", - " def chis(self):\n", - " return [max(self.sites[i].tensor.shape[0], self.sites[i].tensor.shape[2]) for i in range(L)]\n", - "\n", - "psi = MPS(L=4, chi=2)\n", - 
"psi.canonical_form()\n", - "\n", - "# psi.apply_local_op(1, op=X())\n", - "#psi.apply_local_op(2, op=H())\n", - "# psi.apply_2site_op(2, CNOT())\n", - " # npt.assert_array_almost_equal(psi.expectation_value(P1(), [0, 1, 2, 3]), [0, 1, 0.5, 0.5])\n" - ], - "id": "2904c781c2b93577", - "outputs": [], - "execution_count": 79 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:54:26.653795Z", - "start_time": "2025-02-12T09:54:26.601733Z" - } - }, - "cell_type": "code", - "source": [ - "from tenpy.networks.mps import MPS as TMPS\n", - "from tenpy.networks.site import SpinHalfSite\n", - "from tenpy.networks.site import kron\n", - "import time\n", - "\n", - "site = SpinHalfSite(conserve=None, sort_charge=False)\n", - "\n", - "theta = 0.1\n", - "\n", - "op_o = np.array([[1., 0.], [0., 0.]])\n", - "op_i = np.array([[0., 0.], [0., 1.]])\n", - "op_X = np.array([[0., 1.], [1., 0.]])\n", - "op_H = H()\n", - "op_Rx = Rx(theta)\n", - "\n", - "site.add_op('X', op_X)\n", - "site.add_op('ii', op_i)\n", - "site.add_op('oo', op_o)\n", - "site.add_op('H', op_H)\n", - "site.add_op('Rx', op_Rx)\n", - "\n", - "L = 3\n", - "\n", - "\n", - "op_CX = kron(site.get_op('oo'), site.get_op('Id'), group=False) + kron(site.get_op('ii'), site.get_op('X'), group=False)\n", - "op_CRX = kron(site.get_op('oo'), site.get_op('Id'), group=False) + kron(site.get_op('ii'), site.get_op('Rx'), group=False)\n", - "op_CLX = kron(site.get_op('Id'), site.get_op('oo'), group=False) + kron(site.get_op('Rx'), site.get_op('ii'), group=False)\n", - "\n", - "mps = TMPS.from_product_state([site] * L, ['up'] * L, 'finite')\n", - "\n", - "mps.apply_local_op(0, \"X\")\n", - "mps.apply_local_op(1, \"X\")\n", - "mps.apply_local_op(2, \"X\")\n", - "\n", - "def printBBS(site, mps):\n", - " print(f\"B_{site}:\\n{mps.get_B(site)._data}\")\n", - " print(f\"B_{site+1}:\\n{mps.get_B(site+1)._data}\")\n", - " print(f\"S_{site+1}:\\n{mps.get_SL(site+1)}\")\n", - "\n", - "mps.apply_local_op(0, op_CRX)\n", - "#printBBS(0, mps)\n", - "print(\"Expectations:\", mps.expectation_value(\"ii\", [0, 1, 2]))\n", - "mps.apply_local_op(1, op_CRX)\n", - "#printBBS(1, mps)\n", - "print(\"Expectations:\", mps.expectation_value(\"ii\", [0, 1, 2]))\n", - "mps.apply_local_op(0, op_CLX)\n", - "#printBBS(0, mps)\n", - "print(\"Expectations:\", mps.expectation_value(\"ii\", [0, 1, 2]))\n", - "mps.apply_local_op(1, op_CLX)\n", - "#printBBS(1, mps)\n", - "print(\"Expectations:\", mps.expectation_value(\"ii\", [0, 1, 2]))\n", - "\n", - "#print(mps.chi)" - ], - "id": "e5310f200a30da4", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Expectations: [1. 0.99750208 1. ]\n", - "Expectations: [1. 
0.99750208 0.99750832]\n", - "Expectations: [0.99750832 0.99750208 0.99750832]\n", - "Expectations: [0.99750832 0.99005196 0.99750832]\n" - ] - } - ], - "execution_count": 88 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:10.034189Z", - "start_time": "2025-02-12T09:30:10.029152Z" - } - }, - "cell_type": "code", - "source": [ - "psi = ket_bin([1, 1, 1])\n", - "theta = 0.1\n", - "op_Rx = Rx(theta)\n", - "op_CRX = CONTROL(op_Rx)\n", - "psi = np.kron(op_CRX, np.eye(2)) @ psi\n", - "psi = np.kron(np.eye(2), op_CRX) @ psi\n", - "\n", - "e, norm = measure_operator(P1(), psi, 2)\n", - "print(f\"{e = } {norm = }\")\n", - "print(f\"Expectation value: {e / norm}\")" - ], - "id": "14148111ce9f52c", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "e = np.float64(0.9975083222301554) norm = np.float64(1.0)\n", - "Expectation value: 0.9975083222301554\n" - ] - } - ], - "execution_count": 81 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:10.059683Z", - "start_time": "2025-02-12T09:30:10.048863Z" - } - }, - "cell_type": "code", - "source": [ - "def K1(phi):\n", - " return np.array([[0, -1j / 2 * np.sin(phi / 2)], [0, 0]])\n", - "\n", - "def K2(phi):\n", - " return np.array([[1, 0], [0, np.cos(phi / 2)]])\n", - "\n", - "phi = 0.1 #np.pi / 2\n", - "\n", - "psi0 = np.kron(ket0(), ket0())\n", - "psi1 = np.kron(ket1(), ket0())\n", - "psi2 = np.kron(ket0(), ket1())\n", - "\n", - "psi0_k = ket0()\n", - "psi1_k = ket1()\n", - "psi2_k = ket0()\n", - "\n", - "\n", - "psi0 = Rxy(phi) @ psi0\n", - "psi1 = Rxy(phi) @ psi1\n", - "psi2 = np.kron(H(), np.eye(2)) @ psi2\n", - "psi2 = Rxy(phi) @ psi2\n", - "\n", - "# measure the expectation value of the operator P0 on the second qbit of the state\n", - "\n", - "e0, n0 = measure_operator(P0(), psi0, 1)\n", - "e1, n1 = measure_operator(P0(), psi1, 1)\n", - "e2, n2 = measure_operator(P0(), psi2, 1)\n", - "\n", - "e0k, n0k = measure_operator(K2(phi), psi0_k, 0)\n", - "e1k, n1k = measure_operator(K2(phi), psi1_k, 0)\n", - "e2k, n2k = measure_operator(K2(phi), psi2_k, 0)\n", - "\n", - "print(f\"Expectation value of |0x0| on qubit: 1 for Rxy(theta)|00> = {e0 / n0}\")\n", - "print(f\"Expectation value of |0x0| on qubit: 1 for Rxy(theta)|10> = {e1 / n1}\")\n", - "print(f\"Expectation value of |0x0| on qubit: 1 for Rxy(theta)|+0> = {e2 / n2}\")\n", - "\n", - "print(f\"Expectation value of K2 on qubit: 0 for |0> = {e0k / n0k}\")\n", - "print(f\"Expectation value of K2 on qubit: 0 for |1> = {e1k / n1k}\")\n", - "print(f\"Expectation value of K2 on qubit: 0 for |+> = {e2k / n2k}\")" - ], - "id": "cd6ae0f26afe819a", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Expectation value of |0x0| on qubit: 1 for Rxy(theta)|00> = 1.0\n", - "Expectation value of |0x0| on qubit: 1 for Rxy(theta)|10> = 0.997502082639013\n", - "Expectation value of |0x0| on qubit: 1 for Rxy(theta)|+0> = 0.0012489586804935587\n", - "Expectation value of K2 on qubit: 0 for |0> = 1.0\n", - "Expectation value of K2 on qubit: 0 for |1> = 0.9987502603949663\n", - "Expectation value of K2 on qubit: 0 for |+> = 1.0\n" - ] - } - ], - "execution_count": 82 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-02-12T09:30:10.078098Z", - "start_time": "2025-02-12T09:30:10.076369Z" - } - }, - "cell_type": "code", - "source": "", - "id": "58a74999e21432a0", - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - 
"name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/python/ZX.ipynb b/python/ZX.ipynb deleted file mode 100644 index 08927f6..0000000 --- a/python/ZX.ipynb +++ /dev/null @@ -1,53 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "id": "initial_id", - "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2025-02-04T11:27:11.354176Z", - "start_time": "2025-02-04T11:27:11.197144Z" - } - }, - "source": [ - "import pyzx as zx\n", - "from pyzx.circuit import Circuit, CZ, RXX\n" - ], - "outputs": [], - "execution_count": 2 - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": [ - "c = Circuit(2)\n", - "c.add_gate(CZ(0,1))\n" - ], - "id": "e3ba7b648ad767e7" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/python/main.py b/python/main.py deleted file mode 100644 index 735314f..0000000 --- a/python/main.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import sys - -sys.path.append("/home/robinpr") -sys.path.append("/home/robinpr/tenpy-main") -from itertools import cycle -import numpy as np -import random as rng -from tenpy.networks.site import SpinHalfSite -from tenpy.networks.mps import MPS -from tenpy.networks.site import kron -from own_TeNPy import update_bond -import logging - -rng = np.random.default_rng(seed=42) -# Set the logging level for TenPy to WARNING to suppress info messages -logging.getLogger('tenpy_git').setLevel(logging.WARNING) - - -class InitialState: - def down_state(L): - # print("Create state |00..1..0>") - state = [] - for i in range(L): - if i == L // 2: - state.append('down'); - else: - state.append('down'); - return state - - -def execute(psi_t_initial, t_final, parameter, Ntraj=1): - L = parameter['L'] - p = parameter['p'] - theta = parameter['theta'] - phi = parameter['phi'] - cos_theta_2 = np.cos(theta / 2) - sin_theta_2 = np.sin(theta / 2) - # print(psi_t_initial) - cos_phi_2 = np.cos(phi / 2) - sin_phi_2 = np.sin(phi / 2) - - site = SpinHalfSite(conserve=None, sort_charge=False) - op_o = np.array([[1., 0.], [0., 0.]]) - op_i = np.array([[0., 0.], [0., 1.]]) - op_Rx = np.array([[cos_theta_2, -1j * sin_theta_2], [-1j * sin_theta_2, cos_theta_2]]) - - op_K1 = np.array([[0., -1j / 2 * sin_phi_2], [0., 0.]]) - - op_K2 = np.array([[1., 0.], [0., cos_phi_2]]) - - site.add_op('Rx', op_Rx) - site.add_op('ii', op_i) - site.add_op('oo', op_o) - site.add_op('K1', op_K1) - site.add_op('K2', op_K2) - - opR = kron(site.get_op('oo'), site.get_op('Id'), group=False) + kron(site.get_op('ii'), site.get_op('Rx'), - group=False) - opL = kron(site.get_op('Id'), site.get_op('oo'), group=False) + kron(site.get_op('Rx'), site.get_op('ii'), - group=False) - - #with open(output_file, 'w') as f: - for _ in range(Ntraj): - psi_t = MPS.from_product_state([site] * L, psi_t_initial, 'finite') - state_list = [np.sum(psi_t.expectation_value('ii')) / L] - - #pbar = 
tqdm.tqdm(range(t_final - 1))
-    for t in range(t_final - 1):
-        # Maximally optimized variant
-        # prob = psi_t.expectation_value('K2')
-        # decisions = rng.random(size=L) > prob
-        # print(f"{np.mean(prob) = }", f"{np.sum(decisions) = }")
-        #
-        # # Batch-wise application (no parallelism, but vectorized)
-        # for op, indices in [('K1', np.where(decisions)[0]),
-        #                     ('K2', np.where(~decisions)[0])]:
-        #     for i in indices:
-        #         psi_t.apply_local_op(i, op, unitary=False)
-
-
-        # 1. Define the update sequence as a cycle
-        update_sequence = [
-            (range(0, L - 1, 2), opR),  # Phase 1: even bonds with opR
-            (range(1, L - 1, 2), opL),  # Phase 2: odd bonds with opL
-            (range(1, L - 1, 2), opR),  # Phase 3: odd bonds with opR
-            (range(0, L - 1, 2), opL)   # Phase 4: even bonds with opL
-        ]
-
-        # 2. Compressed loop with parallel index generation
-        for indices, op in update_sequence:
-            for i in indices:
-                update_bond.SVD_based(psi_t, i, op, parameter['tebd_params']['trunc_params'])
-        state_list.append(np.sum(psi_t.expectation_value('ii')) / L)
-        #print(f"{t = }, max(chi) = {max(psi_t.chi)}, mean(chi) = {sum(psi_t.chi) / L:.2f}, mean(ii) = {state_list[-1]:.4f}")
-        print(f"{t = }, max(chi) = {max(psi_t.chi)}, mean(chi) = {sum(psi_t.chi) / L:.2f}, mean(P1) = {state_list[-1]:.4f}")
-    #print(np.array(state_list))
-
-    #f.write(str(state_list) + '\n')
-    print(np.array(state_list))
-    return
-
-
-if __name__ == '__main__':
-    np.set_printoptions(precision=3)
-    parameter = {
-        'state_info': 'down_state',
-        'theta': 0.14,
-        'phi': 0.1222,
-        'p': 1,
-        'L': 30,
-        'model_params': {
-            'bc_MPS': 'finite',
-            'omega': 6,
-            'gamma': 1,
-        },
-        'tebd_params': {
-            'N_steps': 2,
-            'dt': 0.05,
-            'order': 1,
-            'trunc_params': {'chi_max': 1224, 'svd_min': 1.e-12}
-        }
-    }
-    # param_theta = float(parameter['theta'])
-    # param_p = str(parameter['p'])
-    # base_output_file = f"long_output_theta_{param_theta:.3f}_phi0.185.txt"
-
-    # Check if the file already exists and append a counter if necessary
-    # output_file = base_output_file
-    # counter = 1
-    # while os.path.exists(output_file):
-    #     output_file = f"long_output_theta_{param_theta:.3f}_{counter}_phi0.185.txt"
-    #     counter += 1
-
-    import time
-
-    start = time.time()
-    execute(InitialState.down_state(parameter['L']), t_final=31, parameter=parameter, Ntraj=1)
-    duration = time.time() - start
-    print(f"Duration: {duration:.6f} seconds")
\ No newline at end of file
diff --git a/python/mps/mps/mps.py b/python/mps.py
similarity index 68%
rename from python/mps/mps/mps.py
rename to python/mps.py
index 844f034..972495e 100644
--- a/python/mps/mps/mps.py
+++ b/python/mps.py
@@ -1,5 +1,81 @@
 import numpy as np
+
+def truncate(P, mindim=None, maxdim=None, cutoff=1E-12, use_absolute_cutoff=False, use_relative_cutoff=True):
+    """
+    Truncates a 1D numpy array based on mindim, maxdim, and a cutoff.
+
+    Args:
+        P (np.ndarray): The input vector to truncate.
+        mindim (int, optional): The minimum allowed dimension.
+        maxdim (int, optional): The maximum allowed dimension.
+        cutoff (float, optional): The cutoff value for truncation. Defaults to 1E-12.
+        use_absolute_cutoff (bool, optional): If True, use an absolute cutoff. Defaults to False.
+        use_relative_cutoff (bool, optional): If True, use a relative cutoff. Defaults to True.
+
+    Returns:
+        tuple: A tuple containing the truncated array, the truncation error, and the cutoff value used.
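+
+    Example (illustrative; the weights below are made up to show maxdim plus the relative cutoff):
+        >>> P = np.array([0.5, 0.3, 0.15, 0.04, 0.009, 0.001])
+        >>> P_new, truncerr, docut = truncate(P, maxdim=4, cutoff=1e-2)
+        >>> len(P_new), round(truncerr, 3)
+        (4, 0.01)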
+ """ + P = P.copy() + + if mindim is None: + mindim = 1 + if maxdim is None: + maxdim = len(P) + + # Handle negative values by taking the absolute value + P = np.abs(P) + + origm = len(P) + docut = 0.0 + + if origm == 1: + docut = abs(P[0]) / 2 + return P, 0.0, docut + + # Zero out any negative weights from the end + for i in range(origm - 1, -1, -1): + if P[i] >= 0: + break + P[i] = 0 + + n = origm + truncerr = 0.0 + + # Truncate to maxdim + while n > maxdim: + truncerr += P[n - 1] + n -= 1 + + if use_absolute_cutoff: + # Test if individual values fall below cutoff + while P[n - 1] <= cutoff and n > mindim: + truncerr += P[n - 1] + n -= 1 + else: # use_relative_cutoff + scale = np.sum(P) + if scale == 0.0: + scale = 1.0 + + # Continue truncating until sum of discarded weight reaches cutoff + while (truncerr + P[n - 1] <= cutoff * scale) and (n > mindim): + truncerr += P[n - 1] + n -= 1 + + truncerr /= scale + + if n < 1: + n = 1 + + if n < origm: + docut = (P[n - 1] + P[n]) / 2.0 + if abs(P[n - 1] - P[n]) < 1e-3 * P[n - 1]: + docut += 1e-3 * P[n - 1] + # print(f"Truncated from {origm} to {n} with error {truncerr} (cutoff {cutoff}, docut {docut})") + P_new = P[:n] + + return P_new, truncerr, docut def contract_MPS_MPO_expectation(psi, O): """ @@ -129,7 +205,7 @@ def theta(self, i, n=2): else: raise NotImplementedError("n > 2 not implemented") - def apply_2site_op(self, i, O): + def apply_2site_op(self, i, O, cutoff=1E-12): """ Apply a 2-site operator O to sites i and i+1. @@ -158,11 +234,13 @@ def apply_2site_op(self, i, O): # Perform SVD and truncate to bond dimension 1 U, S, Vh = np.linalg.svd(Oth, full_matrices=True) - V = np.conj(Vh).T + #V = np.conj(Vh).T + # print(f"Site = {i}, original S = [{', '.join(f'{x:.8f}' for x in S)}]") + + S_new, truncerr, docut = truncate(np.pow(S, 2), maxdim=self.chi, cutoff=cutoff, use_relative_cutoff=True) - # determine the truncation dimension (bond dimension) based on the singular values - trunc = np.sum(S > 1e-12) # number of singular values larger than 1e-12 - trunc = min(trunc, self.chi) + # The new truncation dimension is simply the length of the new singular value array + trunc = len(S_new) U = U[:, :trunc] S = S[:trunc] @@ -190,4 +268,4 @@ def expectation_value(self, O, sites): return np.array(expectations) def chis(self): - return [max(self.sites[i].tensor.shape[0], self.sites[i].tensor.shape[2]) for i in range(self.L)] + return [self.sites[i].tensor.shape[2] for i in range(self.L - 1)] diff --git a/python/mps/mps/__init__.py b/python/mps/mps/__init__.py deleted file mode 100644 index a2cd0c7..0000000 --- a/python/mps/mps/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .mps import * -from .operators import * \ No newline at end of file diff --git a/python/mps/tests/util.py b/python/mps/tests/util.py deleted file mode 100644 index a8ff576..0000000 --- a/python/mps/tests/util.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy as np -import re - - -def parse_complex_array_from_string(s): - """ - Parse a string representing a complex-valued array. - - This function first attempts to evaluate the string directly. If that fails - (for example, if commas are missing), it uses a simple regex-based approach - to split rows and columns. - """ - try: - # Try to evaluate the string as a Python literal. - arr = np.array(eval(s)) - return arr - except Exception: - # Remove outer brackets. - s = s.strip() - if s.startswith('[') and s.endswith(']'): - s = s[1:-1] - # Split into rows (using newlines or the pattern "],["). 
- rows = re.split(r'\]\s*,?\s*\[', s) - # Remove any extra brackets. - rows = [row.replace('[', '').replace(']', '') for row in rows] - data = [] - for row in rows: - # Split by whitespace. - items = row.split() - # Convert each item into a complex number. - row_data = [complex(item) for item in items] - data.append(row_data) - arr = np.array(data) - return arr - - -def main(): - # First string: includes commas. - s1 = """[[-1.71e-09-0.0499j, 3.42e-08+0.998j, 0.0499-3.25e-09j, 6.22e-06-7.99e-12j], - [2.77e-18+0.00249j, -5.53e-17-0.0498j, 0.999-3.09e-08j, 0.000125+4.12e-12j], - [-0.999+5.86e-17j, -0.0499-1.17e-15j, 1.49e-08-4.69e-15j, -0.000125-5.8e-19j], - [-0.000125+1.66e-13j, -6.22e-06-3.32e-12j, -0.000125+7.58e-12j, 1+9.46e-16j]]""" - - # Second string: space-separated entries. - s2 = """[[ 1.370e-16-4.986e-02j 2.740e-15-9.975e-01j -4.985e-02-2.591e-23j 6.216e-06-2.078e-19j] - [ 0.000e+00-2.489e-03j 0.000e+00-4.979e-02j 9.988e-01-2.744e-15j -1.245e-04+1.237e-19j] - [-9.988e-01+4.135e-18j 4.992e-02+8.274e-17j 1.555e-08-2.591e-23j 1.247e-04+3.230e-27j] - [ 1.245e-04-2.195e-22j -6.224e-06-4.392e-21j 1.247e-04-2.078e-19j 1.000e+00+2.591e-23j]]""" - - # Parse the strings. - arr1 = np.array(eval(s1)) - arr2 = parse_complex_array_from_string(s2) - - # Print flattened arrays. - print("Array 1 (flattened):") - print(arr1.flatten()) - print("\nArray 2 (flattened):") - print(arr2.flatten()) - - # Convert both to double precision for comparison. - arr1_d = np.abs(arr1.astype(np.complex128)) - arr2_d = np.abs(arr2.astype(np.complex128)) - - tol = 1e-3 - if np.allclose(arr1_d.flatten(), arr2_d.flatten(), atol=tol): - print("\nArrays are equal within tolerance", tol) - else: - print("\nArrays differ!") - diff = np.abs(arr1_d.flatten() - arr2_d.flatten()) - print("Differences:") - print(diff) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/python/mps/mps/operators.py b/python/operators.py similarity index 100% rename from python/mps/mps/operators.py rename to python/operators.py diff --git a/python/own_TeNPy.py b/python/own_TeNPy.py deleted file mode 100644 index cbc5816..0000000 --- a/python/own_TeNPy.py +++ /dev/null @@ -1,801 +0,0 @@ -""" -Not my own code, I only changed parts of algorithms to use it for my purpose -original code: https://github.com/tenpy/tenpy -""" - -import numpy as np -import logging -logger = logging.getLogger(__name__) - -from tenpy.linalg import np_conserved as npc -from tenpy.algorithms.truncation import svd_theta, TruncationError, truncate -from tenpy.algorithms.tebd import TEBDEngine - -from tenpy.algorithms.algorithm import TimeEvolutionAlgorithm -import warnings -import time -class update_bond: - @staticmethod - def SVD_based(psi, i, U_bond, trunc_params): - - """Updates the B matrices on a given bond. - - Function that updates the B matrices, the bond matrix s between and the - bond dimension chi for bond i. The corresponding tensor networks look like this:: - - | --S--B1--B2-- --B1--B2-- - | | | | | - | theta: U_bond C: U_bond - | | | | | - - Parameters - ---------- - i : int - Bond index; we update the matrices at sites ``i-1, i``. - U_bond : :class:`~tenpy.linalg.np_conserved.Array` - The bond operator which we apply to the wave function. - We expect labels ``'p0', 'p1', 'p0*', 'p1*'``. - - Returns - ------- - trunc_err : :class:`~tenpy.algorithms.truncation.TruncationError` - The error of the represented state which is introduced by the truncation - during this update step. 
- """ - i0, i1 = i , i +1 - logger.debug("Update sites (%d, %d)", i0, i1) - # Construct the theta matrix - C = psi.get_theta(i0, n=2, formL=0.) # the two B without the S on the left - C = npc.tensordot(U_bond, C, axes=(['p0*', 'p1*'], ['p0', 'p1'])) # apply U - C.itranspose(['vL', 'p0', 'p1', 'vR']) - theta = C.scale_axis(psi.get_SL(i0), 'vL') - # now theta is the same as if we had done - # theta = psi.psi.get_theta(i0, n=2) - # theta = npc.tensordot(U_bond, theta, axes=(['p0*', 'p1*'], ['p0', 'p1'])) # apply U - # but also have C which is the same except the missing "S" on the left - # so we don't have to apply inverses of S (see below) - - theta = theta.combine_legs([('vL', 'p0'), ('p1', 'vR')], qconj=[+1, -1]) - # Perform the SVD and truncate the wavefunction - U, S, V, trunc_err, renormalize = svd_theta(theta, - trunc_params, - [psi.get_B(i0, None).qtotal, None], - inner_labels=['vR', 'vL']) - - # Split tensor and update matrices - B_R = V.split_legs(1).ireplace_label('p1', 'p') - - # In general, we want to do the following: - # U = U.iscale_axis(S, 'vR') - # B_L = U.split_legs(0).iscale_axis(self.psi.get_SL(i0)**-1, 'vL') - # B_L = B_L.ireplace_label('p0', 'p') - # i.e. with SL = self.psi.get_SL(i0), we have ``B_L = SL**-1 U S`` - # - # However, the inverse of SL is problematic, as it might contain very small singular - # values. Instead, we use ``C == SL**-1 theta == SL**-1 U S V``, - # such that we obtain ``B_L = SL**-1 U S = SL**-1 U S V V^dagger = C V^dagger`` - # here, C is the same as theta, but without the `S` on the very left - # (Note: this requires no inverse if the MPS is initially in 'B' canonical form) - B_L = npc.tensordot(C.combine_legs(('p1', 'vR'), pipes=theta.legs[1]), - V.conj(), - axes=['(p1.vR)', '(p1*.vR*)']) - B_L.ireplace_labels(['vL*', 'p0'], ['vR', 'p']) - B_L /= renormalize # re-normalize to = 1 - #psi.norm *= renormalize - psi.set_SR(i0, S) - psi.set_B(i0, B_L, form='B') - psi.set_B(i1, B_R, form='B') - return trunc_err - - @staticmethod - def _expansion_rate(psi,chi,cbe_expand, i): - """get expansion rate for updating bond i""" - - expand = cbe_expand#trunc_params.get('cbe_expand', None) - expand_0 = None#trunc_params.get('cbe_expand_0', None) - if expand_0 is None or expand_0 == expand: - return expand - - chi_max = chi#trunc_params.get('chi_max', None) - if chi_max is None: - raise ValueError('Need to specify trunc_params["chi_max"] in order to use cbe_expand_0.') - - chi = min(psi.get_SL(i).shape) - return max(expand_0 - chi / chi_max * (expand_0 - expand), expand) - - @staticmethod - def update_bond_QR(psi, i, U_bond,chi,cbe_expand,trunc_params): - i0, i1 = i - 1, i - expand = update_bond._expansion_rate(psi,chi,cbe_expand,i) - logger.debug(f'Update sites ({i0}, {i1}). CBE expand={expand}') - # Construct the theta matrix - C = psi.get_theta(i0, n=2, formL=0.) 
# the two B without the S on the left - C = npc.tensordot(U_bond, C, axes=(['p0*', 'p1*'], ['p0', 'p1'])) # apply U - C.itranspose(['vL', 'p0', 'p1', 'vR']) - theta = C.scale_axis(psi.get_SL(i0), 'vL') - theta = theta.combine_legs([('vL', 'p0'), ('p1', 'vR')], qconj=[+1, -1]) - - min_block_increase = 1#trunc_params.get('cbe_min_block_increase', 1) - Y0 = QRBasedTEBDEngine._qr_tebd_cbe_Y0(B_L=psi.get_B(i0, 'B'), B_R=psi.get_B(i1, 'B'), theta=theta, - expand=expand, min_block_increase=min_block_increase) - A_L, S, B_R, trunc_err, renormalize = QRBasedTEBDEngine._qr_based_decomposition( - theta=theta, Y0=Y0, use_eig_based_svd=False,#trunc_params.get('use_eig_based_svd', False), - need_A_L=False, compute_err=False,#trunc_params.get('compute_err', False), - trunc_params=trunc_params) - B_L = npc.tensordot(C.combine_legs(('p1', 'vR'), pipes=theta.legs[1]), - B_R.conj(), - axes=[['(p1.vR)'], ['(p*.vR*)']]) / renormalize - B_L.ireplace_labels(['p0', 'vL*'], ['p', 'vR']) - B_R = B_R.split_legs(1) - psi.norm *= renormalize - psi.set_B(i0, B_L, form='B') - psi.set_SL(i1, S) - psi.set_B(i1, B_R, form='B') - return trunc_err - -class QRBasedTEBDEngine(TEBDEngine): - r"""Version of TEBD that relies on QR decompositions rather than SVD. - - As introduced in :arxiv:`2212.09782`. - - .. todo :: - To use `use_eig_based_svd == True`, which makes sense on GPU only, we need to implement - the `_eig_based_svd` for "non-square" matrices. - This means that :math:`M^{\dagger} M` and :math:`M M^{\dagger}` dont have the same size, - and we need to disregard those eigenvectors of the larger one, that have eigenvalue zero, - since we dont have corresponding eigenvalues of the smaller one. - - Options - ------- - .. cfg:config :: QRBasedTEBDEngine - :include: TEBDEngine - - cbe_expand : float - Expansion rate. The QR-based decomposition is carried out at an expanded bond dimension - ``eta = (1 + cbe_expand) * chi``, where ``chi`` is the bond dimension before the time step. - Default is `0.1`. - cbe_expand_0 : float - Expansion rate at low ``chi``. - If given, the expansion rate decreases linearly from ``cbe_expand_0`` at ``chi == 1`` - to ``cbe_expand`` at ``chi == trunc_params['chi_max']``, then remains constant. - If not given, the expansion rate is ``cbe_expand`` at all ``chi``. - cbe_min_block_increase : int - Minimum bond dimension increase for each block. Default is `1`. - use_eig_based_svd : bool - Whether the SVD of the bond matrix :math:`\Xi` should be carried out numerically via - the eigensystem. This is faster on GPUs, but less accurate. - It makes no sense to do this on CPU. It is currently not supported for update_imag. - Default is `False`. - compute_err : bool - Whether the truncation error should be computed exactly. - Compared to SVD-based TEBD, computing the truncation error is significantly more expensive. - If `True` (default), the full error is computed. - Otherwise, the truncation error is set to NaN. 
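-
-    Example (illustrative option set; `psi` and `model` as usual for TEBD):
-
-        eng = QRBasedTEBDEngine(psi, model, options={'cbe_expand': 0.1, 'compute_err': True})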
- """ - - def _expansion_rate(self, i): - """get expansion rate for updating bond i""" - expand = self.options.get('cbe_expand', 0.1) - expand_0 = self.options.get('cbe_expand_0', None) - - if expand_0 is None or expand_0 == expand: - return expand - - chi_max = self.trunc_params.get('chi_max', None) - if chi_max is None: - raise ValueError('Need to specify trunc_params["chi_max"] in order to use cbe_expand_0.') - - chi = min(self.psi.get_SL(i).shape) - return max(expand_0 - chi / chi_max * (expand_0 - expand), expand) - - def update_bond(self, i, U_bond): - i0, i1 = i - 1, i - expand = self._expansion_rate(i) - logger.debug(f'Update sites ({i0}, {i1}). CBE expand={expand}') - # Construct the theta matrix - C = self.psi.get_theta(i0, n=2, formL=0.) # the two B without the S on the left - C = npc.tensordot(U_bond, C, axes=(['p0*', 'p1*'], ['p0', 'p1'])) # apply U - C.itranspose(['vL', 'p0', 'p1', 'vR']) - theta = C.scale_axis(self.psi.get_SL(i0), 'vL') - theta = theta.combine_legs([('vL', 'p0'), ('p1', 'vR')], qconj=[+1, -1]) - - min_block_increase = self.options.get('cbe_min_block_increase', 1) - Y0 = _qr_tebd_cbe_Y0(B_L=self.psi.get_B(i0, 'B'), B_R=self.psi.get_B(i1, 'B'), theta=theta, - expand=expand, min_block_increase=min_block_increase) - A_L, S, B_R, trunc_err, renormalize = _qr_based_decomposition( - theta=theta, Y0=Y0, use_eig_based_svd=self.options.get('use_eig_based_svd', False), - need_A_L=False, compute_err=self.options.get('compute_err', True), - trunc_params=self.trunc_params - ) - B_L = npc.tensordot(C.combine_legs(('p1', 'vR'), pipes=theta.legs[1]), - B_R.conj(), - axes=[['(p1.vR)'], ['(p*.vR*)']]) #/ renormalize - - #B_L = npc.tensordot(C.combine_legs(('p1', 'vR'), pipes=theta.legs[1]), - # B_R.conj(), - # axes=[['(p1.vR)'], ['(p*.vR*)']]) / renormalize - # - #deleted the /renormalize to make it work for non hermitian Hamitltonian - - # - B_L.ireplace_labels(['p0', 'vL*'], ['p', 'vR']) - B_R = B_R.split_legs(1) - - #self.psi.norm *= renormalize - # - #deleted self.psi.norm because we need to calculate the norm on our own - self.psi.set_B(i0, B_L, form='B') - self.psi.set_SL(i1, S) - self.psi.set_B(i1, B_R, form='B') - self._trunc_err_bonds[i] = self._trunc_err_bonds[i] + trunc_err - return trunc_err - - def update_bond_imag(self, i, U_bond): - i0, i1 = i - 1, i - expand = self._expansion_rate(i) - logger.debug(f'Update sites ({i0}, {i1}). 
CBE expand={expand}') - # Construct the theta matrix - theta = self.psi.get_theta(i0, n=2) - theta = npc.tensordot(U_bond, theta, axes=(['p0*', 'p1*'], ['p0', 'p1'])) - theta.itranspose(['vL', 'p0', 'p1', 'vR']) - theta = theta.combine_legs([('vL', 'p0'), ('p1', 'vR')], qconj=[+1, -1]) - - use_eig_based_svd = self.options.get('use_eig_based_svd', False) - - if use_eig_based_svd: - # see todo comment in _eig_based_svd - raise NotImplementedError('update_bond_imag does not (yet) support eig based SVD') - - min_block_increase = self.options.get('cbe_min_block_increase', 1) - Y0 = _qr_tebd_cbe_Y0(B_L=self.psi.get_B(i0, 'B'), B_R=self.psi.get_B(i1, 'B'), theta=theta, - expand=expand, min_block_increase=min_block_increase) - A_L, S, B_R, trunc_err, renormalize = _qr_based_decomposition( - theta=theta, Y0=Y0, use_eig_based_svd=use_eig_based_svd, - need_A_L=True, compute_err=self.options.get('compute_err', True), - trunc_params=self.trunc_params - ) - A_L = A_L.split_legs(0) - B_R = B_R.split_legs(1) - - self.psi.norm *= renormalize - self.psi.set_B(i0, A_L, form='A') - self.psi.set_SL(i1, S) - self.psi.set_B(i1, B_R, form='B') - self._trunc_err_bonds[i] = self._trunc_err_bonds[i] + trunc_err - - return trunc_err -def _qr_tebd_cbe_Y0(B_L: npc.Array, B_R: npc.Array, theta: npc.Array, expand: float, min_block_increase: int): - """Generate the initial guess Y0 for the left isometry in QR based TEBD - - Parameters - ---------- - B_L : Array with legs [vL, p, vR] - B_R : Array with legs [vL, p, vR] - theta : Array with legs [(vL.p0), (p1.vR)] - expand : float or None - - Returns - ------- - Y0 : Array with legs [vL, (p1.vR)] - """ - if expand is None or expand == 0: - return B_R.combine_legs(['p', 'vR']).ireplace_labels('(p.vR)', '(p1.vR)') - - assert min_block_increase >= 0 - - Y0 = theta.copy(deep=False) - Y0.legs[0] = Y0.legs[0].to_LegCharge() - Y0.ireplace_label('(vL.p0)', 'vL') - if any(B_L.qtotal != 0): - Y0.gauge_total_charge('vL', new_qtotal=B_R.qtotal) - vL_old = B_R.get_leg('vL') - if not vL_old.is_blocked(): - vL_old = vL_old.sort()[1] - vL_new = Y0.get_leg('vL') # is blocked, since created from pipe - - # vL_old is guaranteed to be a slice of vL_new by charge rule in B_L - piv = np.zeros(vL_new.ind_len, dtype=bool) # indices to keep in vL_new - increase_per_block = max(min_block_increase, int(vL_old.ind_len * expand // vL_new.block_number)) - sizes_old = vL_old.get_block_sizes() - sizes_new = vL_new.get_block_sizes() - # iterate over charge blocks in vL_new and vL_old at the same time - j_old = 0 - q_old = vL_old.charges[j_old, :] - qdata_order = np.argsort(Y0._qdata[:, 0]) - qdata_idx = 0 - for j_new, q_new in enumerate(vL_new.charges): - if all(q_new == q_old): # have charge block in both vL_new and vL_old - s_new = sizes_old[j_old] + increase_per_block - # move to next charge block in next loop iteration - j_old += 1 - if j_old < len(vL_old.charges): - q_old = vL_old.charges[j_old, :] - else: # charge block only in vL_new - s_new = increase_per_block - s_new = min(s_new, sizes_new[j_new]) # don't go beyond block - - if Y0._qdata[qdata_order[qdata_idx], 0] != j_new: - # block does not exist - # while we could set corresponding piv entries to True, it would not help, since - # the corresponding "entries" of Y0 are zero anyway - continue - - # block has axis [vL, (p1.vR)]. 
want to keep the s_new slices of the vL axis - # that have the largest norm - norms = np.linalg.norm(Y0._data[qdata_order[qdata_idx]], axis=1) - kept_slices = np.argsort(-norms)[:s_new] # negative sign so we sort large to small - start = vL_new.slices[j_new] - piv[start + kept_slices] = True - - qdata_idx += 1 - if qdata_idx >= Y0._qdata.shape[0]: - break - - Y0.iproject(piv, 'vL') - return Y0 -def _qr_based_decomposition(theta: npc.Array, Y0: npc.Array, use_eig_based_svd: bool, trunc_params, - need_A_L: bool, compute_err: bool): - """Perform the decomposition step of QR based TEBD - - Parameters - ---------- - theta : Array with legs [(vL.p0), (p1.vR)] - Y0 : Array with legs [vL, (p1.vR)] - ... - - Returns - ------- - A_L : array with legs [(vL.p), vR] or None - S : 1D numpy array - B_R : array with legs [vL, (p.vR)] - trunc_err : TruncationError - renormalize : float - """ - - if compute_err: - need_A_L = True - - # QR based updates - theta_i0 = npc.tensordot(theta, Y0.conj(), ['(p1.vR)', '(p1*.vR*)']).ireplace_label('vL*', 'vR') - A_L, _ = npc.qr(theta_i0, inner_labels=['vR', 'vL']) - # A_L: [(vL.p0), vR] - theta_i1 = npc.tensordot(A_L.conj(), theta, ['(vL*.p0*)', '(vL.p0)']).ireplace_label('vR*', 'vL') - theta_i1.itranspose(['(p1.vR)', 'vL']) - B_R, Xi = npc.qr(theta_i1, inner_labels=['vL', 'vR'], inner_qconj=-1) - B_R.itranspose(['vL', '(p1.vR)']) - Xi.itranspose(['vL', 'vR']) - - # SVD of bond matrix Xi - if use_eig_based_svd: - U, S, Vd, trunc_err, renormalize = _eig_based_svd( - Xi, inner_labels=['vR', 'vL'], need_U=need_A_L, trunc_params=trunc_params - ) - else: - U, S, Vd, _, renormalize = svd_theta(Xi, trunc_params) - B_R = npc.tensordot(Vd, B_R, ['vR', 'vL']) - if need_A_L: - A_L = npc.tensordot(A_L, U, ['vR', 'vL']) - else: - A_L = None - - if compute_err: - theta_approx = npc.tensordot(A_L.scale_axis(S, axis='vR'), B_R, ['vR', 'vL']) - eps = npc.norm(theta - theta_approx) ** 2 - trunc_err = TruncationError(eps, 1. - 2. * eps) - else: - trunc_err = TruncationError(np.nan, np.nan) - - B_R = B_R.ireplace_label('(p1.vR)', '(p.vR)') - if need_A_L: - A_L = A_L.ireplace_label('(vL.p0)', '(vL.p)') - - return A_L, S, B_R, trunc_err, renormalize -def _eig_based_svd(A, need_U: bool = True, need_Vd: bool = True, inner_labels=[None, None], - trunc_params=None): - """Computes the singular value decomposition of a matrix A via eigh - - Singular values and vectors are obtained by diagonalizing the "square" A.hc @ A and/or A @ A.hc, - i.e. with two eigh calls instead of an svd call. - - Truncation if performed if and only if trunc_params are given. - This performs better on GPU, but is not really useful on CPU. - If isometries U or Vd are not needed, their computation can be omitted for performance. - - Does not (yet) support computing both U and Vd - """ - - assert A.rank == 2 - - if need_U and need_Vd: - # TODO (JU) just doing separate eighs for U, S and for S, Vd is not sufficient - # the phases of U / Vd are arbitrary. - # Need to put in more work in that case... 
- raise NotImplementedError - - if need_U: - Vd = None - A_Ahc = npc.tensordot(A, A.conj(), [1, 1]) - L, U = npc.eigh(A_Ahc, sort='>') - S = np.sqrt(np.abs(L)) # abs to avoid `nan` due to accidentially negative values close to zero - U = U.ireplace_label('eig', inner_labels[0]) - elif need_Vd: - U = None - Ahc_A = npc.tensordot(A.conj(), A, [0, 0]) - L, V = npc.eigh(Ahc_A, sort='>') - S = np.sqrt(np.abs(L)) # abs to avoid `nan` due to accidentially negative values close to zero - Vd = V.iconj().itranspose().ireplace_label('eig*', inner_labels[1]) - else: - U = None - Vd = None - # use the smaller of the two square matrices -- they have the same eigenvalues - if A.shape[1] >= A.shape[0]: - A2 = npc.tensordot(A, A.conj(), [1, 0]) - else: - A2 = npc.tensordot(A.conj(), A, [1, 0]) - L = npc.eigvalsh(A2) - S = np.sqrt(np.abs(L)) # abs to avoid `nan` due to accidentially negative values close to zero - - if trunc_params is not None: - piv, renormalize, trunc_err = truncate(S, trunc_params) - S = S[piv] - S /= renormalize - if need_U: - U.iproject(piv, 1) - if need_Vd: - Vd.iproject(piv, 0) - else: - renormalize = np.linalg.norm(S) - S /= renormalize - trunc_err = TruncationError() - - return U, S, Vd, trunc_err, renormalize - - -class SVDBasedTEBDEngine(TimeEvolutionAlgorithm): - def __init__(self, psi, model, options, **kwargs): - TimeEvolutionAlgorithm.__init__(self, psi, model, options, **kwargs) - self.trunc_err = self.options.get('start_trunc_err', TruncationError()) - self._U = None - self._U_param = {} - self._trunc_err_bonds = [TruncationError() for i in range(psi.L + 1)] - self._update_index = None - - @property - def TEBD_params(self): - warnings.warn("renamed self.TEBD_params -> self.options", FutureWarning, stacklevel=2) - return self.options - - @property - def trunc_err_bonds(self): - """truncation error introduced on each non-trivial bond.""" - return self._trunc_err_bonds[self.psi.nontrivial_bonds] - - def run(self): - """Run TEBD real time evolution by `N_steps`*`dt`.""" - # initialize parameters - delta_t = self.options.get('dt', 0.1) - N_steps = self.options.get('N_steps', 10) - TrotterOrder = self.options.get('order', 2) - E_offset = self.options.get('E_offset', None) - - self.calc_U(TrotterOrder, delta_t, type_evo='real', E_offset=E_offset) - - Sold = np.mean(self.psi.entanglement_entropy()) - start_time = time.time() - - self.update(N_steps) - - S = self.psi.entanglement_entropy() - logger.info( - "--> time=%(t)3.3f, max(chi)=%(chi)d, max(S)=%(S).5f, " - "avg DeltaS=%(dS).4e, since last update: %(wall_time).1fs", { - 't': self.evolved_time.real, - 'chi': max(self.psi.chi), - 'S': max(S), - 'dS': np.mean(S) - Sold, - 'wall_time': time.time() - start_time, - }) - - def run_GS(self): - # initialize parameters - delta_tau_list = self.options.get( - 'delta_tau_list', - [0.1, 0.01, 0.001, 1.e-4, 1.e-5, 1.e-6, 1.e-7, 1.e-8, 1.e-9, 1.e-10, 1.e-11, 0.]) - max_error_E = self.options.get('max_error_E', 1.e-13) - N_steps = self.options.get('N_steps', 10) - TrotterOrder = self.options.get('order', 2) - - Eold = np.mean(self.model.bond_energies(self.psi)) - Sold = np.mean(self.psi.entanglement_entropy()) - start_time = time.time() - - for delta_tau in delta_tau_list: - logger.info("delta_tau=%e", delta_tau) - self.calc_U(TrotterOrder, delta_tau, type_evo='imag') - DeltaE = 2 * max_error_E - step = 0 - while (DeltaE > max_error_E): - if self.psi.finite and TrotterOrder == 2: - self.update_imag(N_steps) - else: - self.update(N_steps) - step += N_steps - E = 
np.mean(self.model.bond_energies(self.psi)) - DeltaE = abs(Eold - E) - Eold = E - S = self.psi.entanglement_entropy() - max_S = max(S) - S = np.mean(S) - DeltaS = S - Sold - Sold = S - logger.info( - "--> step=%(step)6d, beta=%(beta)3.3f, max(chi)=%(max_chi)d," - "DeltaE=%(dE).2e, E_bond=%(E).10f, Delta_S=%(dS).4e, " - "max(S)=%(max_S).10f, time simulated: %(wall_time).1fs", { - 'step': step, - 'beta': -self.evolved_time.imag, - 'max_chi': max(self.psi.chi), - 'dE': DeltaE, - 'E': E.real, - 'dS': DeltaS, - 'max_S': max_S, - 'wall_time': time.time() - start_time, - }) - # done - - @staticmethod - def suzuki_trotter_time_steps(order): - if order == 1: - return [1.] - elif order == 2: - return [0.5, 1.] - elif order == 4: - t1 = 1. / (4. - 4.**(1 / 3.)) - t3 = 1. - 4. * t1 - return [t1 / 2., t1, (t1 + t3) / 2., t3] - elif order == '4_opt': - # Eq (30a) of arXiv:1901.04974 - a1 = 0.095848502741203681182 - b1 = 0.42652466131587616168 - a2 = -0.078111158921637922695 - b2 = -0.12039526945509726545 - return [a1, b1, a2, b2, 0.5 - a1 - a2, 1. - 2 * (b1 + b2)] # a1 b1 a2 b2 a3 b3 - # else - raise ValueError("Unknown order %r for Suzuki Trotter decomposition" % order) - - @staticmethod - def suzuki_trotter_decomposition(order, N_steps): - even, odd = 0, 1 - if N_steps == 0: - return [] - if order == 1: - a = (0, odd) - b = (0, even) - return [a, b] * N_steps - elif order == 2: - a = (0, odd) # dt/2 - a2 = (1, odd) # dt - b = (1, even) # dt - # U = [a b a]*N - # = a b [a2 b]*(N-1) a - return [a, b] + [a2, b] * (N_steps - 1) + [a] - elif order == 4: - a = (0, odd) # t1/2 - a2 = (1, odd) # t1 - b = (1, even) # t1 - c = (2, odd) # (t1 + t3) / 2 == (1 - 3 * t1)/2 - d = (3, even) # t3 = 1 - 4 * t1 - # From Schollwoeck 2011 (:arxiv:`1008.3477`): - # U = U(t1) U(t2) U(t3) U(t2) U(t1) - # with U(dt) = U(dt/2, odd) U(dt, even) U(dt/2, odd) and t1 == t2 - # Using above definitions, we arrive at: - # U = [a b a2 b c d c b a2 b a] * N - # = [a b a2 b c d c b a2 b] + [a2 b a2 b c d c b a2 b a] * (N-1) + [a] - steps = [a, b, a2, b, c, d, c, b, a2, b] - steps = steps + [a2, b, a2, b, c, d, c, b, a2, b] * (N_steps - 1) - steps = steps + [a] - return steps - elif order == '4_opt': - # symmetric: a1 b1 a2 b2 a3 b3 a2 b2 a2 b1 a1 - steps = [(0, odd), (1, even), (2, odd), (3, even), (4, odd), (5, even), - (4, odd), (3, even), (2, odd), (1, even), (0, odd)] # yapf: disable - return steps * N_steps - # else - raise ValueError("Unknown order {0!r} for Suzuki Trotter decomposition".format(order)) - - def calc_U(self, order, delta_t, type_evo='real', E_offset=None): - U_param = dict(order=order, delta_t=delta_t, type_evo=type_evo, E_offset=E_offset) - if type_evo == 'real': - U_param['tau'] = delta_t - elif type_evo == 'imag': - U_param['tau'] = -1.j * delta_t - else: - raise ValueError("Invalid value for `type_evo`: " + repr(type_evo)) - if self._U_param == U_param: # same keys and values as cached - logger.debug("Skip recalculation of U with same parameters as before") - return # nothing to do: U is cached - self._U_param = U_param - logger.info("Calculate U for %s", U_param) - - L = self.psi.L - self._U = [] - for dt in self.suzuki_trotter_time_steps(order): - U_bond = [ - self._calc_U_bond(i_bond, dt * delta_t, type_evo, E_offset) for i_bond in range(L) - ] - self._U.append(U_bond) - # done - - def update(self, N_steps): - #print("update norm =",self.psi.norm) - #print("psi =",self.psi) - trunc_err = TruncationError() - order = self._U_param['order'] - for U_idx_dt, odd in self.suzuki_trotter_decomposition(order, 
N_steps):
-            trunc_err += self.update_step(U_idx_dt, odd)
-        self.evolved_time = self.evolved_time + N_steps * self._U_param['tau']
-        self.trunc_err = self.trunc_err + trunc_err  # not += : make a copy!
-        # (this is done to avoid problems of users storing self.trunc_err after each `update`)
-        return trunc_err
-
-    def update_step(self, U_idx_dt, odd):
-        Us = self._U[U_idx_dt]
-        trunc_err = TruncationError()
-        for i_bond in np.arange(int(odd) % 2, self.psi.L, 2):
-            if Us[i_bond] is None:
-                continue  # handles finite vs. infinite boundary conditions
-            self._update_index = (U_idx_dt, i_bond)
-            trunc_err += self.update_bond(i_bond, Us[i_bond])
-        self._update_index = None
-        return trunc_err
-
-    def update_bond(self, i, U_bond):
-        #print("beginning update bond =", self.psi.norm)
-        #print("position = ",i)
-        i0, i1 = i - 1, i
-        logger.debug("Update sites (%d, %d)", i0, i1)
-        # Construct the theta matrix
-        C = self.psi.get_theta(i0, n=2, formL=0.)  # the two B without the S on the left
-        C = npc.tensordot(U_bond, C, axes=(['p0*', 'p1*'], ['p0', 'p1']))  # apply U
-        C.itranspose(['vL', 'p0', 'p1', 'vR'])
-        theta = C.scale_axis(self.psi.get_SL(i0), 'vL')
-        # now theta is the same as if we had done
-        # theta = self.psi.get_theta(i0, n=2)
-        # theta = npc.tensordot(U_bond, theta, axes=(['p0*', 'p1*'], ['p0', 'p1']))  # apply U
-        # but also have C which is the same except the missing "S" on the left
-        # so we don't have to apply inverses of S (see below)
-
-        theta = theta.combine_legs([('vL', 'p0'), ('p1', 'vR')], qconj=[+1, -1])
-        # Perform the SVD and truncate the wavefunction
-        U, S, V, trunc_err, renormalize = svd_theta(theta,
-                                                    self.trunc_params,
-                                                    [self.psi.get_B(i0, None).qtotal, None],
-                                                    inner_labels=['vR', 'vL'])
-
-        # Split tensor and update matrices
-        B_R = V.split_legs(1).ireplace_label('p1', 'p')
-
-
-        B_L = npc.tensordot(C.combine_legs(('p1', 'vR'), pipes=theta.legs[1]),
-                            V.conj(),
-                            axes=['(p1.vR)', '(p1*.vR*)'])
-        B_L.ireplace_labels(['vL*', 'p0'], ['vR', 'p'])
-        #print("renormalize =", renormalize,"")
-        #print("B_L = ",B_L)
-        #print("norm = ",B_L.norm())
-        #print("singular value norm = ",LA.norm(S))
-        #print("B_R = ",B_R)
-        #re-normalize to <psi|psi> = 1
-        #print("before renormalize =",self.psi.norm)
-
-
-
-        #S = S * renormalize
-        #The vector S of singular values has already been normalized by the factor "renormalize"; does this
-        #have to be undone?
-
-
-        #print("renormalizefactor =", renormalize)
-        #B_L /= renormalize
-        #If I skip the renormalization of B_L, does the norm then change as follows?
-        #self.psi.norm *= renormalize
-        #Compare with
-        #update_bond_imag(self, i, U_bond):
-        #there the norm is multiplied by this factor
-
-        #print("norm = ",B_L.norm())
-        self.psi.set_SR(i0, S)
-        self.psi.set_B(i0, B_L, form='B')
-        self.psi.set_B(i1, B_R, form='B')
-
-        #print("norm =", self.psi.norm, type(self.psi))
-        self._trunc_err_bonds[i] = self._trunc_err_bonds[i] + trunc_err
-        #print("end renormalize =", self.psi.norm)
-        return trunc_err
-
-    def update_imag(self, N_steps):
-        trunc_err = TruncationError()
-        order = self._U_param['order']
-        # allow only second order evolution
-        if order != 2 or not self.psi.finite:
-            # Would lead to loss of canonical form. What about DMRG?
- raise NotImplementedError("Use DMRG instead...") - U_idx_dt = 0 # always with dt=0.5 - assert (self.suzuki_trotter_time_steps(order)[U_idx_dt] == 0.5) - assert (self.psi.finite) # finite or segment bc - Us = self._U[U_idx_dt] - for _ in range(N_steps): - # sweep right - for i_bond in range(self.psi.L): - if Us[i_bond] is None: - continue # handles finite vs. infinite boundary conditions - self._update_index = (U_idx_dt, i_bond) - trunc_err += self.update_bond_imag(i_bond, Us[i_bond]) - # sweep left - for i_bond in range(self.psi.L - 1, -1, -1): - if Us[i_bond] is None: - continue # handles finite vs. infinite boundary conditions - self._update_index = (U_idx_dt, i_bond) - trunc_err += self.update_bond_imag(i_bond, Us[i_bond]) - self._update_index = None - self.evolved_time = self.evolved_time + N_steps * self._U_param['tau'] - self.trunc_err = self.trunc_err + trunc_err # not += : make a copy! - # (this is done to avoid problems of users storing self.trunc_err after each `update`) - return trunc_err - - def update_bond_imag(self, i, U_bond): - i0, i1 = i - 1, i - logger.debug("Update sites (%d, %d)", i0, i1) - # Construct the theta matrix - theta = self.psi.get_theta(i0, n=2) # 'vL', 'vR', 'p0', 'p1' - theta = npc.tensordot(U_bond, theta, axes=(['p0*', 'p1*'], ['p0', 'p1'])) - theta = theta.combine_legs([('vL', 'p0'), ('vR', 'p1')], qconj=[+1, -1]) - # Perform the SVD and truncate the wavefunction - U, S, V, trunc_err, renormalize = svd_theta(theta, - self.trunc_params, - inner_labels=['vR', 'vL']) - - self.psi.norm *= renormalize - # Split legs and update matrices - B_R = V.split_legs(1).ireplace_label('p1', 'p') - A_L = U.split_legs(0).ireplace_label('p0', 'p') - self.psi.set_SR(i0, S) - self.psi.set_B(i0, A_L, form='A') - self.psi.set_B(i1, B_R, form='B') - self._trunc_err_bonds[i] = self._trunc_err_bonds[i] + trunc_err - return trunc_err - - def _calc_U_bond(self, i_bond, dt, type_evo, E_offset): - """Calculate exponential of a bond Hamitonian. - * ``U_bond = exp(-i dt (H_bond-E_offset_bond))`` for ``type_evo='real'``, or - * ``U_bond = exp(- dt H_bond)`` for ``type_evo='imag'``. - """ - h = self.model.H_bond[i_bond] - if h is None: - return None # don't calculate exp(i H t), if `H` is None - H2 = h.combine_legs([('p0', 'p1'), ('p0*', 'p1*')], qconj=[+1, -1]) - if type_evo == 'imag': - H2 = (-dt) * H2 - elif type_evo == 'real': - if E_offset is not None: - H2 = H2 - npc.diag(E_offset[i_bond], H2.legs[0]) - H2 = (-1.j * dt) * H2 - else: - raise ValueError("Expect either 'real' or 'imag'inary time, got " + repr(type_evo)) - U = npc.expm(H2) -#print("calc_U_bond =", U) - assert (tuple(U.get_leg_labels()) == ('(p0.p1)', '(p0*.p1*)')) - return U.split_legs() - - -class Engine(SVDBasedTEBDEngine): - """Deprecated old name of :class:`TEBDEngine`. - .. deprecated : v0.8.0 - Renamed the `Engine` to `TEBDEngine` to have unique algorithm class names. - """ - def __init__(self, psi, model, options): - msg = "Renamed `Engine` class to `TEBDEngine`." 
- warnings.warn(msg, category=FutureWarning, stacklevel=2) - SVDBasedTEBDEngine.__init__(self, psi, model, options) - - diff --git a/python/test-python.py b/python/test-python.py new file mode 100644 index 0000000..aae1867 --- /dev/null +++ b/python/test-python.py @@ -0,0 +1,58 @@ +import time +import numpy as np + +from mps import MPS +from operators import X, P1, Rx, CONTROL, CONTROL_L + +def run_mps_simulation(T=50, L=30, chi=1024, theta=0.14, cutoff=1e-12): + # build operators + X_op = X() + P1_op = P1() + Rx_op = Rx(theta) + CRX = CONTROL(Rx_op) + CLRX = CONTROL_L(Rx_op) + + # initialize MPS + psi = MPS(L, chi) + + # apply X on every site + for i in range(L): + psi.apply_local_op(i, X_op) + + # record mean‐P1 at each step + mean_expectations = [] + + for t in range(T): + # Trotter sweep + for i in range(0, L-1, 2): + psi.apply_2site_op(i, CRX, cutoff=cutoff) + for i in range(1, L-1, 2): + psi.apply_2site_op(i, CLRX, cutoff=cutoff) + for i in range(1, L-1, 2): + psi.apply_2site_op(i, CRX, cutoff=cutoff) + for i in range(0, L-1, 2): + psi.apply_2site_op(i, CLRX, cutoff=cutoff) + + # compute ⟨P1⟩ on all sites + exps = psi.expectation_value(P1_op, list(range(L))) # NumPy array + + total = float(np.sum(exps)) + mean = total / L + mean_expectations.append(mean) + + # bond‐dimension stats + chis = psi.chis() + print(f"t = {t:2d}, max(chi) = {max(chis):4d}, mean(chi) = {sum(chis)/L:8.2f}") + + # for (i, d) in enumerate(chis): + # print(f" Site {i}: chi = {d}: shape = {psi.sites[i].tensor.shape}") + + + # final printout in C++‐style list + print("[1, " + ", ".join(f"{m:.6f}" for m in mean_expectations) + "]") + +if __name__ == "__main__": + start = time.perf_counter() + run_mps_simulation() + dt = (time.perf_counter() - start) * 1000.0 + print(f"Execution time: {dt:.0f} ms") \ No newline at end of file diff --git a/python/mps/tests/test_MPS.py b/python/test_MPS.py similarity index 97% rename from python/mps/tests/test_MPS.py rename to python/test_MPS.py index 51dccd3..339d45a 100644 --- a/python/mps/tests/test_MPS.py +++ b/python/test_MPS.py @@ -5,8 +5,8 @@ from math import pi as M_PI -from mps.mps import MPS -from mps.mps.operators import X, H, CNOT, P1, P0, CONTROL, CONTROL_L, Rx +from mps import MPS +from operators import X, H, CNOT, P1, P0, CONTROL, CONTROL_L, Rx class MPSTest(unittest.TestCase): def test_application(self): diff --git a/src/main.cu b/src/main.cu deleted file mode 100644 index 670e5fb..0000000 --- a/src/main.cu +++ /dev/null @@ -1,134 +0,0 @@ -#include "Operators.hpp" -#include "MPSSimulator.cuh" - -#include - - -void pretty_print(const std::vector &probs) { - std::cout << "["; - for (auto i = 0; i < probs.size(); i++) { - std::cout << fmt::format("{:.8f}", probs[i]); - if (i < probs.size() - 1) { - std::cout << ", "; - } - } - std::cout << "]\n"; -} - -void operator_print(const operators::Operator &op, unsigned N_rows = 0) { - - std::cout << "Operator: [\n\t"; - for (auto i = 0; i < op.data.size(); i++) { - std::cout << op.data[i] << " "; - if (i % N_rows == N_rows - 1 && i < op.data.size() - 1) { - std::cout << "\n\t"; - } - } - std::cout << "\n]\n"; - -} - -int main() { - // enable debug logs - // spdlog::set_level(spdlog::level::debug); - - static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!"); - - static Scalar theta = 0.14; - static Scalar phi = 0.1222; - - // only used for constructing the operators on the host - const auto Rx = operators::Rx(theta); - auto Rx_d = Rx.device(); - - // Allocate memory for operators on 
the GPU - auto CRRx = operators::CONTROL(Rx).device(); - auto CLRx = operators::CONTROL_L(Rx).device(); - - auto K1 = operators::K1(phi).device(); - auto K2 = operators::K2(phi).device(); - - auto X = operators::X.device(); - auto P0 = operators::P0.device(); - auto P1 = operators::P1.device(); - - constexpr auto numQubits = 60u; - constexpr auto chi = 1224u; - mps::Simulator simulator(0, numQubits, chi); - - spdlog::info("Allocated {} bytes of scratch memory on GPU: [{}:{}]", simulator.scratchSize, - (void *) simulator.d_scratch, - (void *) (((char *) simulator.d_scratch) + simulator.scratchSize)); - - - { - // apply X gate to all qubits |1111...1> - for (auto i = 0; i < numQubits; i++) { - simulator.apply_operator(X, {i}); - } - } - constexpr auto random_values = std::array{ - 0.6394267984578837, 0.025010755222666936, 0.27502931836911926, 0.22321073814882275, 0.7364712141640124, 0.6766994874229113, 0.8921795677048454, 0.08693883262941615, 0.4219218196852704, 0.029797219438070344, 0.21863797480360336, 0.5053552881033624, 0.026535969683863625, 0.1988376506866485, 0.6498844377795232, 0.5449414806032167, 0.2204406220406967, 0.5892656838759087, 0.8094304566778266, 0.006498759678061017, 0.8058192518328079, 0.6981393949882269, 0.3402505165179919, 0.15547949981178155, 0.9572130722067812, 0.33659454511262676, 0.09274584338014791, 0.09671637683346401, 0.8474943663474598, 0.6037260313668911, 0.8071282732743802, 0.7297317866938179, 0.5362280914547007, 0.9731157639793706, 0.3785343772083535, 0.552040631273227, 0.8294046642529949, 0.6185197523642461, 0.8617069003107772, 0.577352145256762, 0.7045718362149235, 0.045824383655662215, 0.22789827565154686, 0.28938796360210717, 0.0797919769236275, 0.23279088636103018, 0.10100142940972912, 0.2779736031100921, 0.6356844442644002, 0.36483217897008424, 0.37018096711688264, 0.2095070307714877, 0.26697782204911336, 0.936654587712494, 0.6480353852465935, 0.6091310056669882, 0.171138648198097, 0.7291267979503492, 0.1634024937619284, 0.3794554417576478, 0.9895233506365952, 0.6399997598540929}; - - static auto N_time_steps = 150; - auto probs_index = 0; - for (auto t = 0; t < N_time_steps; t++) { - - { - std::cout << "t = " << t << std::endl; - - // Measure expectation value of K2 - simulator.mps_factorization(); - auto probs = simulator.expectation_value(K2); - std::cout << "Expectation values: "; - pretty_print(probs); - - for (auto i = 0; i < numQubits; i++) { - if (probs[i] > random_values[probs_index]) { - simulator.apply_operator(K1, {i}, false); - } else { - simulator.apply_operator(K2, {i}, false); - } - probs_index++; - } - } - - { - // apply a layer of CX gates to neighboring qubits - for (auto i = 0; i < numQubits - 2; i += 2) { - simulator.apply_operator(CRRx, {i, i + 1}); - } - - for (auto i = 1; i < numQubits - 2; i += 2) { - simulator.apply_operator(CLRx, {i, i + 1}); - }; - - for (auto i = 0; i < numQubits - 2; i += 2) { - simulator.apply_operator(CLRx, {i, i + 1}); - }; - - for (auto i = 1; i < numQubits - 2; i += 2) { - simulator.apply_operator(CRRx, {i, i + 1}); - }; - - } - { - simulator.mps_factorization(); - // Compute expectation values - auto expects = simulator.expectation_value(P1); - //std::cout << "Expectation values: "; - //pretty_print(expects); - auto mean = std::accumulate(expects.begin(), expects.end(), 0.0) / expects.size(); - std::cout << "Mean expectation value: " << mean << std::endl; - } - } - { - simulator.mps_factorization(); - // Compute expectation values - auto expects = simulator.expectation_value(P1); - std::cout << 
"Expectation values: "; - pretty_print(expects); - auto mean = std::accumulate(expects.begin(), expects.end(), 0.0) / expects.size(); - std::cout << "Mean expectation value: " << mean << std::endl; - } -} diff --git a/src/mps.cu b/src/mps.cu index 68e6ac1..42639a6 100644 --- a/src/mps.cu +++ b/src/mps.cu @@ -20,7 +20,7 @@ void print_expectation_values(mps::MPS& psi, std::vector& sites, tensor::De spdlog::info("Expectation values: [{}]", ss.str()); } -int run() { +void run() { constexpr int T = 30; constexpr int L = 30; constexpr int chi = 1024; @@ -38,36 +38,34 @@ int run() { auto sites = std::vector(L, 0); std::iota(sites.begin(), sites.end(), 0); - auto streams = std::vector(2); - /*for (auto& stream : streams) { - HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); - }*/ + auto stream = cudaStream_t{nullptr}; + HANDLE_CUDA_ERROR(cudaStreamCreate(&stream)); auto summed_expectations = thrust::device_vector(T); auto expectations = thrust::device_vector(L); for (auto i = 0; i < L; ++i) { - psi.apply_local_op(i, X); + psi.apply_local_op(i, X, stream); } for (auto t = 0; t < T; ++t) { for (auto i = 0; i < L-1; i += 2) { - psi.apply_2_site_op(i, CRX); + psi.apply_2_site_op(i, CRX, stream); } for (auto i = 1; i < L-1; i += 2) { - psi.apply_2_site_op(i, CLRX); + psi.apply_2_site_op(i, CLRX, stream); } for (auto i = 1; i < L-1; i += 2) { - psi.apply_2_site_op(i, CRX); + psi.apply_2_site_op(i, CRX, stream); } for (auto i = 0; i < L-1; i += 2) { - psi.apply_2_site_op(i, CLRX); + psi.apply_2_site_op(i, CLRX, stream); } - psi.expectation_values(P1, sites, expectations); - reduction::sum(thrust::raw_pointer_cast(expectations.data()), expectations.size(), thrust::raw_pointer_cast(&summed_expectations[t])); + psi.expectation_values(P1, sites, expectations, stream); + reduction::sum(thrust::raw_pointer_cast(expectations.data()), expectations.size(), thrust::raw_pointer_cast(&summed_expectations[t]), stream); std::cout << "t = " << t << ", max(chi) = " << psi.get_max_bond_dimension() << ", mean(chi) = " << psi.get_avg_bond_dimension() << std::endl;// ", mean(P1) = " << mean_expectations[t] << std::endl; @@ -85,12 +83,10 @@ int run() { } std::cout << "]" << std::endl; - for (auto& stream : streams) { - if (stream != nullptr) - HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); - } - return true; + if (stream != nullptr) + HANDLE_CUDA_ERROR(cudaStreamDestroy(stream)); + } int main() { diff --git a/tests/test_einsum.cu b/tests/test_einsum.cu deleted file mode 100644 index 63b91b1..0000000 --- a/tests/test_einsum.cu +++ /dev/null @@ -1,155 +0,0 @@ -/* - * test_einsum.cpp - * - * A simple test for the Einsum functionality using the equation "ij,jk->ik". - * - * This test creates two matrices: - * - A of shape (2, 3) - * - B of shape (3, 4) - * and computes the contraction C = einsum("ij,jk->ik", A, B) - * which should be equivalent to matrix multiplication (A * B). - * - * Expected C has shape (2, 4). - */ - -#include -#include -#include -#include "tensordot.h" // Contains the Einsum and tensordot implementations -#include "tensor.h" // Contains the DeviceTensor definition -#include "contraction.cuh"// Contains any additional contraction helper code - -// Macro for CUDA error checking (if not defined elsewhere) -#ifndef HANDLE_CUDA_ERROR -#define HANDLE_CUDA_ERROR(x) \ - { \ - cudaError_t err = x; \ - if(err != cudaSuccess){ \ - std::cerr << "CUDA Error: " << cudaGetErrorString(err) << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } -#endif - -int main() { - // Select CUDA device 0. 
- cudaSetDevice(0); - - // Create a cuTENSOR handle. - cutensorHandle_t handle; - CUTENSOR_CHECK(cutensorCreate(&handle)); - - // Define shapes for A (2x3) and B (3x4) - std::vector shapeA = {2, 3}; // 2 rows, 3 columns. - std::vector shapeB = {3, 4}; // 3 rows, 4 columns. - tensor::DeviceTensor A(shapeA); - tensor::DeviceTensor B(shapeB); - - // Prepare host data for A and B in column-major order. - // For A: - // Column 0: [1, 4] - // Column 1: [2, 5] - // Column 2: [3, 6] - std::vector hA = { - make_cuComplex(1.0f, 0.0f), // A(0,0) - make_cuComplex(4.0f, 0.0f), // A(1,0) - make_cuComplex(2.0f, 0.0f), // A(0,1) - make_cuComplex(5.0f, 0.0f), // A(1,1) - make_cuComplex(3.0f, 0.0f), // A(0,2) - make_cuComplex(6.0f, 0.0f) // A(1,2) - }; - - // For B: - // Column 0: [7, 10, 13] (B(0,0), B(1,0), B(2,0)) - // Column 1: [8, 11, 14] - // Column 2: [9, 12, 15] - // Column 3: [10, 13, 16] - std::vector hB = { - make_cuComplex(7.0f, 0.0f), // B(0,0) - make_cuComplex(10.0f, 0.0f), // B(1,0) - make_cuComplex(13.0f, 0.0f), // B(2,0) - make_cuComplex(8.0f, 0.0f), // B(0,1) - make_cuComplex(11.0f, 0.0f), // B(1,1) - make_cuComplex(14.0f, 0.0f), // B(2,1) - make_cuComplex(9.0f, 0.0f), // B(0,2) - make_cuComplex(12.0f, 0.0f), // B(1,2) - make_cuComplex(15.0f, 0.0f), // B(2,2) - make_cuComplex(10.0f, 0.0f), // B(0,3) - make_cuComplex(13.0f, 0.0f), // B(1,3) - make_cuComplex(16.0f, 0.0f) // B(2,3) - }; - - // Copy host data to device. - size_t totalA = 2 * 3; - size_t totalB = 3 * 4; - HANDLE_CUDA_ERROR(cudaMemcpy(A.ptr(), hA.data(), totalA * sizeof(cuComplex), cudaMemcpyHostToDevice)); - HANDLE_CUDA_ERROR(cudaMemcpy(B.ptr(), hB.data(), totalB * sizeof(cuComplex), cudaMemcpyHostToDevice)); - - // Prepare output tensor C; expected shape is (2,4). - std::vector shapeC = {2, 4}; - tensor::DeviceTensor C(shapeC); - - // Create a workspace for the Einsum call. - // (Size may need to be adjusted depending on the problem.) - contraction::Workspace workspace(1024 * 1024); - - // Use the Einsum helper to compute C = einsum("ij,jk->ik", A, B) - std::string subscripts = "ij,jk->ik"; - contraction::einsum(handle, subscripts, A, B, C, 0, workspace, false, false); - - // Copy result from device to host. - std::vector hC(2 * 4); - HANDLE_CUDA_ERROR(cudaMemcpy(hC.data(), C.ptr(), hC.size() * sizeof(cuComplex), cudaMemcpyDeviceToHost)); - - // Print the result. - std::cout << "Result tensor C (matrix multiplication via einsum \"ij,jk->ik\"):" << std::endl; - // Since our DeviceTensor uses column-major order, element (i,j) is at index i + j*2. 
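-    // e.g. element (1,3) of the 2x4 result sits at index 1 + 3*2 = 7, the last entry of hC.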
- for (int i = 0; i < 2; i++) { - for (int j = 0; j < 4; j++) { - int idx = i + j * 2; - cuComplex val = hC[idx]; - std::cout << "(" << val.x << "," << val.y << ") "; - } - std::cout << std::endl; - } - - // Expected result computed on host: - // A = [ [1,2,3], - // [4,5,6] ] - // B = [ [7, 8, 9, 10], - // [10,11,12,13], - // [13,14,15,16] ] - // So, C = A*B = [ [ 1*7+2*10+3*13, 1*8+2*11+3*14, 1*9+2*12+3*15, 1*10+2*13+3*16 ], - // [ 4*7+5*10+6*13, 4*8+5*11+6*14, 4*9+5*12+6*15, 4*10+5*13+6*16 ] ] - // = [ [66, 72, 78, 84], - // [156,171,186,201] ] - std::vector expected = { - make_cuComplex(66.0f, 0.0f), - make_cuComplex(156.0f, 0.0f), - make_cuComplex(72.0f, 0.0f), - make_cuComplex(171.0f, 0.0f), - make_cuComplex(78.0f, 0.0f), - make_cuComplex(186.0f, 0.0f), - make_cuComplex(84.0f, 0.0f), - make_cuComplex(201.0f, 0.0f) - }; - - float tol = 1e-3f; - bool pass = true; - for (size_t i = 0; i < expected.size(); i++) { - if (std::abs(hC[i].x - expected[i].x) > tol || std::abs(hC[i].y - expected[i].y) > tol) { - pass = false; - std::cout << "Mismatch at index " << i << ": got (" << hC[i].x << ", " << hC[i].y - << "), expected (" << expected[i].x << ", " << expected[i].y << ")" << std::endl; - } - } - if (pass) - std::cout << "Einsum matrix multiplication test passed!" << std::endl; - else - std::cout << "Einsum matrix multiplication test failed." << std::endl; - - // Cleanup the cuTENSOR handle. - CUTENSOR_CHECK(cutensorDestroy(handle)); - - return 0; -} diff --git a/tests/test_permutation.cu b/tests/test_permutation.cu deleted file mode 100644 index 60ead1f..0000000 --- a/tests/test_permutation.cu +++ /dev/null @@ -1,138 +0,0 @@ -// test.cu -#include -#include -#include -#include -#include -#include -#include -#include "permutation.h" // your permutation helper header -#include "tensor.h" // your DeviceTensor definition -#include - -// Host-side permutation function for verification. -// Given a tensor stored in column-major order (host vector h_src), -// with shape 'shape', and a permutation vector 'permutation', -// returns a new host vector representing the permuted tensor. -thrust::host_vector host_permute( - const thrust::host_vector& h_src, - const std::vector& shape, - const std::vector& permutation) -{ - int n = shape.size(); - int total = 1; - for (auto s : shape) - total *= s; - thrust::host_vector h_dst(total); - // Compute source strides (column-major) - std::vector srcStrides(n, 1); - for (int i = 1; i < n; i++) { - srcStrides[i] = srcStrides[i-1] * shape[i-1]; - } - // Compute new shape: new_shape[i] = shape[ permutation[i] ] - std::vector newShape(n); - for (int i = 0; i < n; i++) { - newShape[i] = shape[ permutation[i] ]; - } - // Compute destination strides (column-major for new shape) - std::vector dstStrides(n, 1); - for (int i = 1; i < n; i++) { - dstStrides[i] = dstStrides[i-1] * newShape[i-1]; - } - // Iterate over all multi-indices in source. - std::vector index(n, 0); - for (int count = 0; count < total; count++) { - int srcIdx = 0; - for (int j = 0; j < n; j++) { - srcIdx += index[j] * srcStrides[j]; - } - // Compute the new multi-index: new_index[k] = index[ permutation[k] ] - std::vector newIndex(n); - for (int k = 0; k < n; k++) { - newIndex[k] = index[ permutation[k] ]; - } - int dstIdx = 0; - for (int k = 0; k < n; k++) { - dstIdx += newIndex[k] * dstStrides[k]; - } - h_dst[dstIdx] = h_src[srcIdx]; - // Increment multi-index in column-major order. 
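-        // (odometer-style: bump index[0]; on overflow reset it and carry into the next mode)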
- int pos = 0; - while (pos < n) { - index[pos]++; - if (index[pos] < shape[pos]) - break; - index[pos] = 0; - pos++; - } - } - return h_dst; -} - -// Test function: creates a tensor with given shape and permutation, -// runs the device permutation, and compares against expected result. -void run_test(const std::vector& shape, const std::vector& permutation) { - int nModes = shape.size(); - int total = 1; - for (auto s : shape) - total *= s; - std::cout << "Running test for tensor shape " << tensor::vector_to_string(shape) - << " with permutation " << tensor::vector_to_string(permutation) << std::endl; - - // Create host data: fill with sequential values (as cuComplex with zero imaginary part). - thrust::host_vector h_src(total); - for (int i = 0; i < total; i++) { - h_src[i] = make_cuComplex(static_cast(i), 0.0f); - } - - // Create device tensor 'src' from h_src. - tensor::DeviceTensor src(h_src, shape); - // Create destination device tensor 'dst' with same shape (will be reshaped by permutation). - tensor::DeviceTensor dst(shape); - - // Create cuTENSOR handle. - cutensorHandle_t handle; - CUTENSOR_CHECK(cutensorCreate(&handle)); - - // Call our permutation function. - permutation::permute(handle, src, dst, permutation); - - // Copy result back to host. - thrust::host_vector h_dst(dst.device_data); - - // Compute expected result using host_permute. - thrust::host_vector expected = host_permute(h_src, shape, permutation); - - // Compare and print results. - bool pass = true; - float tol = 1e-4f; - for (int i = 0; i < total; i++) { - float diff = std::abs(h_dst[i].x - expected[i].x) + std::abs(h_dst[i].y - expected[i].y); - if (diff > tol) { - pass = false; - std::cout << "Mismatch at index " << i << ": got (" << h_dst[i].x << "," << h_dst[i].y - << "), expected (" << expected[i].x << "," << expected[i].y << ")" << std::endl; - } - } - if (pass) - std::cout << "Test PASSED for shape " << tensor::vector_to_string(shape) - << " and permutation " << tensor::vector_to_string(permutation) << std::endl; - else - std::cout << "Test FAILED for shape " << tensor::vector_to_string(shape) - << " and permutation " << tensor::vector_to_string(permutation) << std::endl; - - CUTENSOR_CHECK(cutensorDestroy(handle)); -} - -int main() { - // Test 1: 2D tensor: shape [3, 4], permutation: swap axes -> [1, 0] (expected shape: [4, 3]) - run_test({3, 4}, {1, 0}); - - // Test 2: 3D tensor: shape [2, 3, 4], permutation: {2, 0, 1} (expected shape: [4, 2, 3]) - run_test({2, 3, 4}, {2, 0, 1}); - - // Test 3: 4D tensor: shape [2, 2, 2, 2], permutation: {2, 0, 1, 3} (expected shape: [2, 2, 2, 2]) - run_test({2, 2, 2, 2}, {2, 0, 1, 3}); - - return 0; -} diff --git a/tests/test_reduction.cu b/tests/test_reduction.cu deleted file mode 100644 index c5b8bb6..0000000 --- a/tests/test_reduction.cu +++ /dev/null @@ -1,69 +0,0 @@ -#include -#include -#include -#include -#include - -#include "reduction.h" - -int main() { - const int N = 4; - const float truncation_tol = 0.5f; - - float h_S[N] = {1.0f, 2.0f, 3.0f, 4.0f}; - float expected_norm = sqrtf(30.0f); - - float *d_S; - cudaMalloc(&d_S, N * sizeof(float)); - cudaMemcpy(d_S, h_S, N * sizeof(float), cudaMemcpyHostToDevice); - - int *d_count; - float *d_normsq; - cudaMalloc(&d_count, sizeof(int)); - cudaMalloc(&d_normsq, sizeof(float)); - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int count = reduction::fusedCountNormScaleAndReturn(d_S, N, truncation_tol, d_count, d_normsq, stream); - - float h_S_normalized[N]; - cudaMemcpy(h_S_normalized, d_S, N * sizeof(float), 
-             cudaMemcpyDeviceToHost);
-
-  int h_count;
-  float h_normsq;
-  cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);
-  cudaMemcpy(&h_normsq, d_normsq, sizeof(float), cudaMemcpyDeviceToHost);
-
-  std::cout << "Count (elements processed): " << count << " (expected " << N << ")" << std::endl;
-  std::cout << "Block-reduced count from device: " << h_count << std::endl;
-  std::cout << "Accumulated norm-squared from device: " << h_normsq << std::endl;
-  std::cout << "Expected normalization factor: " << expected_norm << std::endl;
-  std::cout << "Normalized S:" << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << h_S_normalized[i] << " ";
-  }
-  std::cout << std::endl;
-
-  bool success = true;
-  for (int i = 0; i < N; i++) {
-    float expected_value = h_S[i] / expected_norm;
-    if (fabs(h_S_normalized[i] - expected_value) > 1e-5) {
-      success = false;
-      std::cerr << "Mismatch at index " << i << ": got "
-                << h_S_normalized[i] << ", expected " << expected_value << std::endl;
-    }
-  }
-  if (success)
-    std::cout << "Test PASSED!" << std::endl;
-  else
-    std::cerr << "Test FAILED!" << std::endl;
-
-  // Cleanup.
-  cudaFree(d_S);
-  cudaFree(d_count);
-  cudaFree(d_normsq);
-  cudaStreamDestroy(stream);
-
-  return 0;
-}
diff --git a/tests/test_tensordot.cu b/tests/test_tensordot.cu
deleted file mode 100644
index 54517e8..0000000
--- a/tests/test_tensordot.cu
+++ /dev/null
@@ -1,371 +0,0 @@
-#include <iostream>
-#include <iomanip>
-#include <cmath>
-#include <numeric>
-
-#include "tensordot.h"
-#include "tensor.h"
-
-int test_tensordot() {
-  // Use CUDA device 0.
-  cudaSetDevice(0);
-
-  // =============================
-  // Test 1: Matrix Multiplication
-  // =============================
-  std::cout << "Running matrix multiplication tensordot test..." << std::endl;
-  // A is a 2x3 matrix and B is a 3x2 matrix.
-  // (Remember: DeviceTensor uses column-major order.)
-  std::vector<int> shapeA = {2, 3};  // 2 rows, 3 columns.
-  std::vector<int> shapeB = {3, 2};  // 3 rows, 2 columns.
-  tensor::DeviceTensor<cuComplex> A(shapeA);
-  tensor::DeviceTensor<cuComplex> B(shapeB);
-
-  // Prepare host data in column-major order.
-  // For A (2x3):
-  //   Column 0: [1, 4]
-  //   Column 1: [2, 5]
-  //   Column 2: [3, 6]
-  std::vector<cuComplex> hA = {
-      make_cuComplex(1.0f, 0.0f),  // A(0,0)
-      make_cuComplex(4.0f, 0.0f),  // A(1,0)
-      make_cuComplex(2.0f, 0.0f),  // A(0,1)
-      make_cuComplex(5.0f, 0.0f),  // A(1,1)
-      make_cuComplex(3.0f, 0.0f),  // A(0,2)
-      make_cuComplex(6.0f, 0.0f)   // A(1,2)
-  };
-
-  // For B (3x2):
-  //   Column 0: [7, 9, 11]
-  //   Column 1: [8, 10, 12]
-  std::vector<cuComplex> hB = {
-      make_cuComplex(7.0f, 0.0f),   // B(0,0)
-      make_cuComplex(9.0f, 0.0f),   // B(1,0)
-      make_cuComplex(11.0f, 0.0f),  // B(2,0)
-      make_cuComplex(8.0f, 0.0f),   // B(0,1)
-      make_cuComplex(10.0f, 0.0f),  // B(1,1)
-      make_cuComplex(12.0f, 0.0f)   // B(2,1)
-  };
-
-  size_t totalA = 2 * 3;
-  size_t totalB = 3 * 2;
-  HANDLE_CUDA_ERROR(cudaMemcpy(A.ptr(), hA.data(), totalA * sizeof(cuComplex), cudaMemcpyHostToDevice));
-  HANDLE_CUDA_ERROR(cudaMemcpy(B.ptr(), hB.data(), totalB * sizeof(cuComplex), cudaMemcpyHostToDevice));
-
-  // Prepare output tensor C for the result (should be 2x2).
-  std::vector<int> shapeC = {2, 2};
-  tensor::DeviceTensor<cuComplex> C(shapeC);
-
-  // For matrix multiplication, contract A over axis 1 (columns) with B over axis 0 (rows).
-  std::vector<int> axesA = {1};
-  std::vector<int> axesB = {0};
-
-  cutensorHandle_t handle;
-  CUTENSOR_CHECK(cutensorCreate(&handle));
-  auto workspace = contraction::Workspace(100);
-
-  contraction::tensordot(handle, workspace, A, B, C, axesA, axesB, false, false, 0);
-
-  // Copy the result from device to host.
-  std::vector<cuComplex> hC(2 * 2);
-  HANDLE_CUDA_ERROR(cudaMemcpy(hC.data(), C.ptr(), hC.size() * sizeof(cuComplex), cudaMemcpyDeviceToHost));
-
-  // Print the result (C is stored in column-major order).
-  std::cout << "Result tensor C (matrix multiplication):" << std::endl;
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < 2; j++) {
-      // In column-major, index = i + j * (number of rows).
-      cuComplex val = hC[i + j * 2];
-      std::cout << "(" << val.x << ", " << val.y << ") ";
-    }
-    std::cout << std::endl;
-  }
-
-  // Expected result:
-  //   A = [ [1, 2, 3],
-  //         [4, 5, 6] ]
-  //   B = [ [ 7,  8],
-  //         [ 9, 10],
-  //         [11, 12] ]
-  //   A * B = [ [1*7+2*9+3*11, 1*8+2*10+3*12],
-  //             [4*7+5*9+6*11, 4*8+5*10+6*12] ]
-  //         = [ [ 58,  64],
-  //             [139, 154] ]
-  std::vector<cuComplex> expected = {
-      make_cuComplex(58.0f, 0.0f),   // C(0,0)
-      make_cuComplex(139.0f, 0.0f),  // C(1,0)
-      make_cuComplex(64.0f, 0.0f),   // C(0,1)
-      make_cuComplex(154.0f, 0.0f)   // C(1,1)
-  };
-
-  float tol = 1e-3f;
-  bool pass = true;
-  for (size_t i = 0; i < expected.size(); i++) {
-    if (std::abs(hC[i].x - expected[i].x) > tol || std::abs(hC[i].y - expected[i].y) > tol) {
-      pass = false;
-      std::cout << "Mismatch at index " << i << ": got (" << hC[i].x << ", " << hC[i].y
-                << "), expected (" << expected[i].x << ", " << expected[i].y << ")" << std::endl;
-    }
-  }
-  if (pass)
-    std::cout << "tensordot matrix multiplication test passed!" << std::endl;
-  else
-    std::cout << "tensordot matrix multiplication test failed." << std::endl;
-
-  // =============================
-  // Test 2: 3D Tensordot Contraction
-  // =============================
-  std::cout << "\nRunning 3D tensordot test..." << std::endl;
-  // Create a 3D tensor A of shape (2, 3, 4) and a 2D tensor B of shape (4, 5).
-  // We will contract A's last axis (axis 2, size 4) with B's first axis (axis 0, size 4).
-  // The free dimensions of A are (2,3) and of B are (5), so the output tensor C should have shape (2, 3, 5).
-  std::vector<int> shapeA3d = {2, 3, 4};
-  std::vector<int> shapeB2d = {4, 5};
-  tensor::DeviceTensor<cuComplex> A3d(shapeA3d);
-  tensor::DeviceTensor<cuComplex> B2d(shapeB2d);
-  size_t totalA3d = 2 * 3 * 4;  // 24 elements.
-  size_t totalB2d = 4 * 5;      // 20 elements.
-  std::vector<cuComplex> hA3d(totalA3d);
-  std::vector<cuComplex> hB2d(totalB2d);
-  // Fill A3d with sequential numbers 1 ... 24.
-  for (size_t idx = 0; idx < totalA3d; ++idx) {
-    hA3d[idx] = make_cuComplex(static_cast<float>(idx + 1), 0.0f);
-  }
-  // Fill B2d with sequential numbers 1 ... 20.
-  for (size_t idx = 0; idx < totalB2d; ++idx) {
-    hB2d[idx] = make_cuComplex(static_cast<float>(idx + 1), 0.0f);
-  }
-  HANDLE_CUDA_ERROR(cudaMemcpy(A3d.ptr(), hA3d.data(), totalA3d * sizeof(cuComplex), cudaMemcpyHostToDevice));
-  HANDLE_CUDA_ERROR(cudaMemcpy(B2d.ptr(), hB2d.data(), totalB2d * sizeof(cuComplex), cudaMemcpyHostToDevice));
-
-  // Prepare output tensor C3d with expected shape (2, 3, 5).
-  std::vector<int> shapeC3d = {2, 3, 5};
-  tensor::DeviceTensor<cuComplex> C3d(shapeC3d);
-
-  // Set contraction axes: contract axis 2 of A3d with axis 0 of B2d.
-  std::vector<int> axesA3d = {2};
-  std::vector<int> axesB2d = {0};
-
-  contraction::tensordot(handle, workspace, A3d, B2d, C3d, axesA3d, axesB2d, false, false, 0);
-
-  size_t totalC3d = 2 * 3 * 5;
-  std::vector<cuComplex> hC3d(totalC3d);
-  HANDLE_CUDA_ERROR(cudaMemcpy(hC3d.data(), C3d.ptr(), totalC3d * sizeof(cuComplex), cudaMemcpyDeviceToHost));
-
-  // Compute expected result on host.
-  // For A3d with shape (2,3,4) in column-major order, the strides are:
-  //   stride[0] = 1, stride[1] = 2, stride[2] = 2*3 = 6.
-  // So A3d(i,j,l) is stored at index: i + j*2 + l*6.
-  // For B2d with shape (4,5) in column-major order, the strides are:
-  //   stride[0] = 1, stride[1] = 4.
-  // So B2d(l,k) is stored at index: l + k*4.
-  // For C3d with shape (2,3,5), the strides are the same as A3d for the free dimensions:
-  //   index = i + j*2 + k*6.
-  std::vector<cuComplex> expected3d(totalC3d, make_cuComplex(0.0f, 0.0f));
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < 3; j++) {
-      for (int k = 0; k < 5; k++) {
-        float sum = 0.0f;
-        for (int l = 0; l < 4; l++) {
-          int idxA = i + j * 2 + l * 6;        // A3d(i,j,l)
-          int idxB = l + k * 4;                // B2d(l,k)
-          sum += hA3d[idxA].x * hB2d[idxB].x;  // real numbers only.
-        }
-        int idxC = i + j * 2 + k * 6;
-        expected3d[idxC] = make_cuComplex(sum, 0.0f);
-      }
-    }
-  }
-
-  // Print the 3D tensordot result.
-  std::cout << "\nResult tensor C3d (shape 2x3x5):" << std::endl;
-  for (int k = 0; k < 5; k++) {
-    std::cout << "Slice k = " << k << ":" << std::endl;
-    for (int j = 0; j < 3; j++) {
-      for (int i = 0; i < 2; i++) {
-        int idx = i + j * 2 + k * 6;
-        cuComplex val = hC3d[idx];
-        std::cout << "(" << val.x << ", " << val.y << ") ";
-      }
-      std::cout << std::endl;
-    }
-    std::cout << std::endl;
-  }
-
-  // Compare expected vs. GPU result.
-  pass = true;
-  for (size_t idx = 0; idx < totalC3d; idx++) {
-    if (std::abs(hC3d[idx].x - expected3d[idx].x) > tol ||
-        std::abs(hC3d[idx].y - expected3d[idx].y) > tol) {
-      pass = false;
-      std::cout << "Mismatch at index " << idx << ": got ("
-                << hC3d[idx].x << ", " << hC3d[idx].y << "), expected ("
-                << expected3d[idx].x << ", " << expected3d[idx].y << ")" << std::endl;
-    }
-  }
-  if (pass)
-    std::cout << "3D tensordot test passed!" << std::endl;
-  else
-    std::cout << "3D tensordot test failed."
-              << std::endl;
-
-  CUTENSOR_CHECK(cutensorDestroy(handle));
-
-  return 0;
-}
-
-using Complex = cuDoubleComplex;
-using Scalar = double;
-
-int test_scale() {
-  std::cout << std::fixed << std::setprecision(10);
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-  static const Scalar tol = 1e-16;
-  static const Scalar reciprocal_tol = 1e-5;
-
-  int I = 20, K = 30, L = 20;
-  int total = I * K * L;
-
-  std::vector<Complex> h_tensor(total);
-  for (int l = 0; l < L; l++) {
-    for (int k = 0; k < K; k++) {
-      for (int i = 0; i < I; i++) {
-        int idx = i + I * (k + K * l);
-        h_tensor[idx] = make_cuDoubleComplex(static_cast<Scalar>(idx + 1), static_cast<Scalar>(idx + 10));
-      }
-    }
-  }
-
-  std::vector<Scalar> h_S_left(I);
-  std::iota(h_S_left.begin(), h_S_left.end(), 1.0);
-
-  tensor::DeviceTensor<Complex> d_in({I, K, L});
-  tensor::DeviceTensor<Complex> d_out({I, K, L});
-  tensor::DeviceTensor<Scalar> d_S_left({I});
-
-  d_in.device_data = h_tensor;
-  d_S_left.device_data = h_S_left;
-
-  contraction::scale_B_by_S(d_S_left, d_in, d_out, stream, false);
-  thrust::host_vector<Complex> h_out = d_out.device_data;
-
-  std::cout << "Test 1: Left scaling (non-reciprocal):" << std::endl;
-  bool pass = true;
-  for (int l = 0; l < L; l++) {
-    for (int k = 0; k < K; k++) {
-      for (int i = 0; i < I; i++) {
-        int idx = i + I * (k + K * l);
-        Scalar factor = h_S_left[i];  // left scaling: factor depends on i.
-        Complex expected = make_cuDoubleComplex(h_tensor[idx].x * factor, h_tensor[idx].y * factor);
-        Complex actual = h_out[idx];
-        if (std::abs(expected.x - actual.x) > tol || std::abs(expected.y - actual.y) > tol) {
-          pass = false;
-          std::cout << "Mismatch at idx " << idx << ": expected ("
-                    << expected.x << "," << expected.y << ") but got ("
-                    << actual.x << "," << actual.y << ")" << std::endl;
-        }
-      }
-    }
-  }
-  std::cout << (pass ? "Test 1 PASSED" : "Test 1 FAILED") << std::endl;
-
-  // -------------------------------
-  // Test 2: Left scaling with reciprocal
-  // Expected: each element becomes inData / S[i].
-  // -------------------------------
-  contraction::scale_B_by_S(d_S_left, d_in, d_out, stream, true);
-  h_out = d_out.device_data;
-
-  std::cout << "Test 2: Left scaling (reciprocal):" << std::endl;
-  pass = true;
-  for (int l = 0; l < L; l++) {
-    for (int k = 0; k < K; k++) {
-      for (int i = 0; i < I; i++) {
-        int idx = i + I * (k + K * l);
-        Scalar factor = h_S_left[i];
-        Complex expected = make_cuDoubleComplex(h_tensor[idx].x / factor, h_tensor[idx].y / factor);
-        Complex actual = h_out[idx];
-        if (std::abs(expected.x - actual.x) > reciprocal_tol || std::abs(expected.y - actual.y) > reciprocal_tol) {
-          pass = false;
-          std::cout << "Mismatch at idx " << idx << ": expected ("
-                    << expected.x << "," << expected.y << ") but got ("
-                    << actual.x << "," << actual.y << ")" << std::endl;
-        }
-      }
-    }
-  }
-  std::cout << (pass ? "Test 2 PASSED" : "Test 2 FAILED") << std::endl;
-
-  // -------------------------------
-  // Test 3: Right scaling (non-reciprocal)
-  // S vector length must equal L.
-  // Expected: each element is multiplied by S[l] where l = (idx / I) / K.
-  // -------------------------------
-  std::vector<Scalar> h_S_right(L);
-  std::iota(h_S_right.begin(), h_S_right.end(), 10.0);
-
-  tensor::DeviceTensor<Scalar> d_S_right({L});
-  d_S_right.device_data = h_S_right;
-
-  contraction::scale_B_by_S(d_in, d_S_right, d_out, stream, false);
-  h_out = d_out.device_data;
-
-  std::cout << "Test 3: Right scaling (non-reciprocal):" << std::endl;
-  pass = true;
-  for (int l = 0; l < L; l++) {
-    for (int k = 0; k < K; k++) {
-      for (int i = 0; i < I; i++) {
-        int idx = i + I * (k + K * l);
-        Scalar factor = h_S_right[l];  // right scaling: factor depends on l.
-        Complex expected = make_cuDoubleComplex(h_tensor[idx].x * factor, h_tensor[idx].y * factor);
-        Complex actual = h_out[idx];
-        if (std::abs(expected.x - actual.x) > tol || std::abs(expected.y - actual.y) > tol) {
-          pass = false;
-          std::cout << "Mismatch at idx " << idx << ": expected ("
-                    << expected.x << "," << expected.y << ") but got ("
-                    << actual.x << "," << actual.y << ")" << std::endl;
-        }
-      }
-    }
-  }
-  std::cout << (pass ? "Test 3 PASSED" : "Test 3 FAILED") << std::endl;
-
-  // -------------------------------
-  // Test 4: Right scaling with reciprocal
-  // Expected: each element becomes inData / S[l].
-  // -------------------------------
-  contraction::scale_B_by_S(d_in, d_S_right, d_out, stream, true);
-  h_out = d_out.device_data;
-
-  std::cout << "Test 4: Right scaling (reciprocal):" << std::endl;
-  pass = true;
-  for (int l = 0; l < L; l++) {
-    for (int k = 0; k < K; k++) {
-      for (int i = 0; i < I; i++) {
-        int idx = i + I * (k + K * l);
-        Scalar factor = h_S_right[l];
-        Complex expected = make_cuDoubleComplex(h_tensor[idx].x / factor, h_tensor[idx].y / factor);
-        Complex actual = h_out[idx];
-        if (std::abs(expected.x - actual.x) > reciprocal_tol || std::abs(expected.y - actual.y) > reciprocal_tol) {
-          pass = false;
-          std::cout << "Mismatch at idx " << idx << ": expected ("
-                    << expected.x << "," << expected.y << ") but got ("
-                    << actual.x << "," << actual.y << ")" << std::endl;
-        }
-      }
-    }
-  }
-  std::cout << (pass ? "Test 4 PASSED" : "Test 4 FAILED") << std::endl;
-
-  cudaStreamDestroy(stream);
-  return 0;
-}
-
-int main() {
-  // Run both the tensordot and scaling test suites.
-  test_tensordot();
-  test_scale();
-}
\ No newline at end of file