From 767684285fa404c602aa800236799d30615db652 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Fri, 30 Aug 2024 18:02:00 +0100
Subject: [PATCH 01/10] move everything over

---
 Project.toml               |  20 ++-
 examples/convlayer.jl      |  65 ++++++++++
 examples/heatequation.jl   |   2 +-
 examples/neuralnet.jl      |  45 +++++++
 examples/newtonpendulum.jl |  41 ++++--
 src/DualArrays.jl          | 251 ++++++++++++++++++++++++++++++++-----
 test/runtests.jl           | 124 +++++++++++++-----
 7 files changed, 472 insertions(+), 76 deletions(-)
 create mode 100644 examples/convlayer.jl
 create mode 100644 examples/neuralnet.jl

diff --git a/Project.toml b/Project.toml
index 0375150..a78c37d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,10 +6,26 @@ version = "0.1.0"
 [deps]
 ArrayLayouts = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
 BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
-DifferentialEquations = "0c46a032-eb83-5123-abaf-570d42b7fbaa"
+ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
+ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
-Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+
+[compat]
+ArrayLayouts = "1"
+ForwardDiff = "0.10.36"
+Lux = "0.5.62"
+Plots = "1.40.5"
+
+[extras]
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Lux", "ForwardDiff", "Test"]
diff --git a/examples/convlayer.jl b/examples/convlayer.jl
new file mode 100644
index 0000000..2e16d64
--- /dev/null
+++ b/examples/convlayer.jl
@@ -0,0 +1,65 @@
+using LinearAlgebra, MLDatasets, Plots, DualArrays, Random, FillArrays
+
+#GOAL: Implement and differentiate a convolutional neural network layer
+function convlayer(img, ker, xstride = 1, ystride = 1)
+    n, m = size(ker)
+    t = eltype(ker)
+
+    n2, m2 = size(img)
+    n3, m3 = div(n2-n+1,xstride), div(m2-m+1,ystride)
+    fmap = zeros(promote_type(eltype(img), t), n3, m3)
+
+    #Apply kernel to section of image
+    for i= 1:xstride:n3,j = 1:ystride:m3
+        ft = img[i:i+n-1,j:j+m-1] .* ker
+        fmap[i,j] = sum(ft)
+    end
+
+    fmap
+end
+
+function softmax(x)
+    s = sum(exp.(x))
+    exp.(x) / s
+end
+
+function dense_layer(f, W, b, x)
+    f(W*x + b)
+end
+
+function cross_entropy(x, y)
+    -sum(y .* log.(x))
+end
+
+function model_loss(x, y, w)
+    ker = reshape(w[1:9], 3, 3)
+    weights = reshape(w[10:6769], 10, 676)
+    biases = w[6770:6779]
+    l1 = vec(DualMatrix(convlayer(x, ker)))
+    l2 = dense_layer(softmax, weights, biases, l1)
+    target = OneElement(1, y+1, 10)
+    loss = cross_entropy(l2, target)
+    loss.value, loss.partials
+end
+
+#TODO: Implement remaining layers upon more support for DualMatrix
+
+function train_model()
+    p = rand(6779)
+    epochs = 1000
+    lr = 0.02
+    dataset = MNIST(:train)
+
+    for i = 1:epochs
+        train, test = dataset[i]
+        d = DualVector(p, I(6779))
+
+        loss, grads = model_loss(train, test, d)
+        println(loss)
+        p = p - lr * grads
+    end
+end
+
+train_model()
+
+
diff --git a/examples/heatequation.jl b/examples/heatequation.jl
index e4a61b3..09b4542 100644
--- a/examples/heatequation.jl
+++ b/examples/heatequation.jl
@@ -9,7 +9,7 @@
 # Compare performance and results.
 ##
 
-using LinearAlgebra, BandedMatrices, DifferentialEquations, Plots, 
+using LinearAlgebra, BandedMatrices, OrdinaryDiffEqs, Plots, 
 ForwardDiff
 
 U = 500
diff --git a/examples/neuralnet.jl b/examples/neuralnet.jl
new file mode 100644
index 0000000..ff4e403
--- /dev/null
+++ b/examples/neuralnet.jl
@@ -0,0 +1,45 @@
+import Lux: relu
+using DualArrays, LinearAlgebra, Plots, FillArrays, Test
+
+#GOAL: Learn exp() using simple one-layer relu neural network.
+
+N = 20
+d = 0.1
+
+#Domain over which to learn exp
+data = collect(d:d:N*d)
+
+function model(a, b, x)
+    return relu.(a .* x + b)
+end
+
+function model_loss(w)
+     sum((model(w[1:N], w[(N+1):end], data) - exp.(data)) .^ 2)
+end
+
+function gradient_descent_sparse(n, lr = 0.01)
+    weights = ones(2 * N)
+    for i = 1:n
+        dw = DualVector(weights, Eye{Float64}(2*N))
+        grads = model_loss(dw).partials
+        weights -= lr * grads
+    end
+    model(weights[1:N], weights[(N + 1):end], data)
+end
+
+function gradient_descent_dense(n, lr = 0.01)
+    weights = ones(2 * N)
+    for i = 1:n
+        dw = DualVector(weights, Matrix(I, 2*N, 2*N))
+        grads = model_loss(dw).partials
+        weights -= lr * grads
+    end
+    model(weights[1:N], weights[(N + 1):end], data)
+end
+
+@time densesol = gradient_descent_dense(500)
+@time sparsesol = gradient_descent_sparse(500)
+@test densesol == sparsesol
+@test sparsesol ≈ exp.(data) rtol = 1e-2
+
+
diff --git a/examples/newtonpendulum.jl b/examples/newtonpendulum.jl
index 81d727b..1d753b6 100644
--- a/examples/newtonpendulum.jl
+++ b/examples/newtonpendulum.jl
@@ -6,24 +6,36 @@
 # via discretisation and Newton's method.
 ##
 
-using LinearAlgebra, ForwardDiff, Plots, DualArrays
+
+using LinearAlgebra, ForwardDiff, Plots, DualArrays, FillArrays
 
 #Boundary Conditions
 a = 0.1
 b = 0.0
 
 #Time step, Time period and number of x for discretisation.
-ts = 0.001
+ts = 0.1
+
 Tmax = 5.0
 N = Int(Tmax/ts) - 1
 
 #LHS of ode
 function f(x)
     n = length(x)
-    D = Tridiagonal([ones(Float64, n) / ts ; 0.0], [1.0; -2ones(Float64, n) / ts; 1.0], [0.0; ones(Float64, n) / ts])
+    D = Tridiagonal([ones(n) / ts ; 0.0], [1.0; -2ones(n) / ts; 1.0], [0.0; ones(n) / ts])
     (D * [a; x; b])[2:end-1] + sin.(x)
 end
 
+
+function f(u, a, b, Tmax)
+    h = Tmax/(length(u)-1)
+    [u[1] - a;
+     (u[1:end-2] - 2u[2:end-1] + u[3:end])/h^2 + sin.(u[2:end-1]);
+     u[end] - b]
+end
+
+
+
 #Newtons method using ForwardDiff.jl
 function newton_method_forwarddiff(f, x0, n)
     x = x0
@@ -38,18 +50,33 @@ function newton_method_dualvector(f, x0, n)
     x = x0
     l = length(x0)
     for i = 1:n
-        ∇f = f(DualVector(x, Matrix(I, l, l))).jacobian
+        ∇f = f(DualVector(x, Eye(l))).jacobian
+        x = x - ∇f \ f(x)
+    end
+    x
+end
+
+function newton_method_dualvector2(f, x0, n)
+    x = x0
+    l = length(x0)
+    for i = 1:n
+        ∇f = f(DualVector(x, Eye(l)), a, b, Tmax).jacobian
         x = x - ∇f \ f(x)
     end
     x
 end
 
 #Initial guess
-x0 = zeros(Float64, N)
+x0 = zeros(N)
 
 #Solve and plot both solution and LHS ('deviation' from system)
-@time sol = newton_method_forwarddiff(f, x0, 100)
-@time sol = newton_method_dualvector(f, x0, 100)
+@time sol1 = newton_method_forwarddiff(f, x0, 100);
+@time sol2 = newton_method_dualvector(f, x0, 100);
+@test sol1 ≈ sol2
+@time sol = newton_method_dualvector2(f, x0, 100);
+
+x0 = zeros(N); x0[1] = a; x0[end] = b;
+
 plot(0:ts:Tmax, [a; sol; b])
 plot!(0:ts:Tmax, [0; f(sol); 0])
 
diff --git a/src/DualArrays.jl b/src/DualArrays.jl
index e4074d5..56d705d 100644
--- a/src/DualArrays.jl
+++ b/src/DualArrays.jl
@@ -1,80 +1,236 @@
 module DualArrays
-export DualVector
+export DualVector, DualArray, DualMatrix
+export dropzeros
 
 import Base: +, ==, getindex, size, broadcast, axes, broadcasted, show, sum,
-             vcat, convert, *
-using LinearAlgebra, ArrayLayouts, BandedMatrices, FillArrays
+             vcat, convert, *, -, ^, /, ndims, hcat, vec, promote_rule, zero,
+             reshape
+using LinearAlgebra, ArrayLayouts, BandedMatrices, FillArrays, ComponentArrays
+import ChainRules: frule, ZeroTangent
 
-struct Dual{T, Partials <: AbstractVector{T}} <: Real
+# TODO: support non-banded
+sparsevcat(As...) = vcat(convert.(BandedMatrix,  As)...)
+
+# Should Partials necessarily be a vector?
+# For example, if we index a DualMatrix, should partials retain the shape
+# of the perturbations, thereby giving a matrix of partials?
+struct Dual{T, Partials <: AbstractArray{T}} <: Real
     value::T
     partials::Partials
 end
 
+zero(::Type{Dual{T}}) where {T} = Dual(zero(T), zeros(T,1))
+promote_rule(::Type{Dual{T}},::Type{S}) where {T,S} = Dual{promote_type(T,S)}
+
 ==(a::Dual, b::Dual) = a.value == b.value && a.partials == b.partials
+for op in (:+, :-)
+    @eval $op(a::Dual, k::Real) = Dual($op(a.value, k), a.partials)
+end
+for op in (:*, :/)
+    @eval $op(a::Dual, k::Real) = Dual($op(a.value, k), $op(a.partials,k))
+end
+
+-(x::Dual) = Dual(-1 * x.value, -1 * x.partials)
+
+*(x::Dual, y::Dual) = Dual(x.value*y.value, x.value*y.partials + y.value*x.partials)
+/(x::Dual, y::Dual) = x*Dual(y.value, -y.partials) / y.value^2
 
 sparse_getindex(a...) = layout_getindex(a...)
 sparse_getindex(D::Diagonal, k::Integer, ::Colon) = OneElement(D.diag[k], k, size(D,2))
 sparse_getindex(D::Diagonal, ::Colon, j::Integer) = OneElement(D.diag[j], j, size(D,1))
 
 """
-reprents a vector of duals given by
+reprents an array of duals given by
     
-    values + jacobian * [ε_1,…,ε_n].
+    values + jacobian * 𝛜.
 
-For now the entries just return the values.
+where 𝛜 is an array with dual parts corresponding to each entry of values.
 """
 
-struct DualVector{T, M <: AbstractMatrix{T}} <: AbstractVector{Dual{T}}
-    value::Vector{T}
-    jacobian::M
-    function DualVector(value::Vector{T},jacobian::M) where {T, M <: AbstractMatrix{T}}
-        if(size(jacobian)[1] != length(value))
-            x,y = length(value),size(jacobian)[1]
-            throw(ArgumentError("vector length must match number of rows in jacobian.\n
-            vector length: $x \n
-            no. of jacobian rows: $y"))
+#Helper function to check valid dimensions of DualArray
+function _checkdims(val, jac)
+    s1, s2 = size(val), size(jac)
+    if length(s1) > length(s2)
+        return false
+    end
+    for (i, x) in enumerate(s1)
+        if(x != s2[i])
+            return false
         end
-        new{T,M}(value,jacobian)
     end
+    return true
 end
 
-function DualVector(value::AbstractVector, jacobian::AbstractMatrix)
-    T = promote_type(eltype(value), eltype(jacobian))
-    DualVector(convert(Vector{T}, value), convert(AbstractMatrix{T}, jacobian))
+struct DualArray{T, N, M, J <: AbstractArray{T, M}} <: AbstractArray{Dual{T}, N}
+    value::AbstractArray{T, N}
+    jacobian::J
+    function DualArray(value::AbstractArray{T,N},jacobian::AbstractArray{T, M}) where {T, N, M}
+        if(!_checkdims(value,jacobian))
+            s1, s2 = size(value), size(jacobian)
+            throw(ArgumentError("N-dimensional array must be of equal size to first N dimensions of jacobian. \n\nGot $s1, $s2"))
+        end
+        new{T,N,2*N,typeof(jacobian)}(value,jacobian)
+    end
+end
+
+function DualArray(value::AbstractArray{T,N}, jacobian::AbstractArray{S,M}) where {S,T,N,M}
+    t = promote_type(T, S)
+    DualArray(convert(AbstractArray{t,N}, value), convert(AbstractArray{t, M}, jacobian))
 end
 
+#Aliases and Constructors
+const DualVector{T, M} = DualArray{T, 1, 2, M} where {T, M <: AbstractMatrix{T}}
+const DualMatrix{T, M} = DualArray{T, 2, 4, M} where {T, M <: AbstractArray{T, 4}}
+
+DualVector(x::AbstractVector, j::AbstractMatrix) = DualArray(x, j)
+DualVector(x::AbstractVector, j::ComponentArray) = DualArray(x, j)
+DualMatrix(x::AbstractMatrix, j::AbstractArray{T,4}) where {T} = DualArray(x, j)
 
+function DualMatrix(x::AbstractMatrix{Dual{T}}) where {T}
+    val = zeros(T, size(x)...)
+    jac = zeros(T, size(x)..., size(x[1,1].partials)...)
+    n, m = size(x)
+    for i = 1:n, j = 1:m
+        val[i,j] = x[i,j].value
+        jac[i,j,:,:] = x[i,j].partials
+    end
+    DualMatrix(val, jac)
+end
 
 function getindex(x::DualVector, y::Int)
     Dual(x.value[y], sparse_getindex(x.jacobian,y,:))
 end
-
 function getindex(x::DualVector, y::UnitRange)
     newval = x.value[y]
     newjac = sparse_getindex(x.jacobian,y,:)
+    newjac = layout_getindex(x.jacobian,y,:)
     DualVector(newval, newjac)
 end
-size(x::DualVector) = length(x.value)
-axes(x::DualVector) = axes(x.value)
+getindex(x::DualVector, ::Colon) = x
+
+function getindex(x::DualArray, y::Int...)
+    idx = ntuple(i -> i > length(y) ? Colon() : y[i], 2*ndims(x))
+    Dual(x.value[y...], x.jacobian[idx...])
+end
+
+for op in (:size, :axes, :ndims)
+    @eval $op(x::DualArray) = $op(x.value)
+end
+
 +(x::DualVector,y::DualVector) = DualVector(x.value + y.value, x.jacobian + y.jacobian)
+-(x::DualVector,y::DualVector) = DualVector(x.value - y.value, x.jacobian - y.jacobian)
++(x::DualVector,y::Vector) = DualVector(x.value + y, x.jacobian)
+-(x::DualVector,y::Vector) = DualVector(x.value - y, x.jacobian)
++(x::Vector,y::DualVector) = DualVector(y.value + x, y.jacobian)
+-(x::Vector,y::DualVector) = DualVector(x - y.value, -y.jacobian)
 *(x::AbstractMatrix, y::DualVector) = DualVector(x * y.value, x * y.jacobian)
+*(x::Number, y::DualVector) = DualVector(x * y.value, x * y.jacobian)
+*(x::DualVector, y::Dual) = DualVector(x.value * y.value, y.value * x.jacobian + x.value * y.partials')
+/(x::DualVector, y::Number) = DualVector(x.value / y, x.jacobian/ y)
+/(x::DualVector, y::Dual) = x * Dual(y.value, -y.partials) / y.value^2
 
-broadcasted(::typeof(sin),x::DualVector) = DualVector(sin.(x.value),Diagonal(cos.(x.value))*x.jacobian)
+##
+# Note about 4-tensor by matrix multiplication:
+#   We can consider matrix-vector multiplication as follows:
+#
+#       (Ax)[i] = A[i,:] . x
+#   
+#   This can be generalised to higher dimensions (B a 4-tensor, X a matrix):
+#
+#       (BX)[i,j] = B[i,j,:,:] . X
+#
+#   This is useful since if we consider 4-tensors dimension n × m × s × t
+#   equivalent to linear maps from n × m matrices to s × t matrices, this 
+#   gives us multiplication analogous to applying the linear transform on
+#   a matrix in the same way that matrix-vector multiplication applies the
+#   linear transformation to the vector. This also gives us a way to 
+#   'expand' the Dual part of a DualMatrix.
+##
+
+# Assumes that the total number of epsilons is equal, i.e the array of
+# perturbations are reshapes of each other.
+function *(x::DualMatrix, y::DualVector)
+    val = x.value * y.value
+    t = promote_type(eltype(x.value), eltype(y.value))
+    jac = zeros(t, length(val), size(y.jacobian, 2))
+    n, m = size(jac, 1), size(x.value, 2)
+    for i = 1:n
+        new_row = zeros(size(jac, 2))
+        for j = 1:m
+            new_row += x.value[i,j] * y.jacobian[j,:] + y.value[j] * vec(x.jacobian[i, j, :, :])
+        end
+        jac[i,:] = new_row
+    end
+    DualVector(val, jac)
+end
+
+function *(x::DualMatrix, y::AbstractVector)
+    val = x.value * y
+    t = promote_type(eltype(x.value), eltype(y))
+    jac = zeros(t, length(val), size(x.jacobian, 3) * size(x.jacobian, 4))
+    n, m = size(jac, 1), size(x.value, 2)
+    for i = 1:n
+        new_row = zeros(size(jac, 2))
+        for j = 1:m
+            new_row += y[j] * vec(x.jacobian[i, j, :, :])
+        end
+        jac[i,:] = new_row
+    end
+    DualVector(val, jac)
+end
+
+function broadcasted(f::Function,d::DualVector)
+    jvals = zeros(eltype(d.value), length(d.value))
+    for (i, x) = enumerate(d.value)
+        _, df = frule((ZeroTangent(), 1),f, x)
+        jvals[i] = df
+    end
+    DualVector(f.(d.value), Diagonal(jvals)*d.jacobian)
+end
 
 function broadcasted(::typeof(*),x::DualVector,y::DualVector)
     newval = x.value .* y.value
     newjac = x.value .* y.jacobian + y.value .* x.jacobian
+    DualArray(newval,newjac)
+end
+
+function broadcasted(::typeof(*),x::AbstractArray,y::DualArray)
+    newval = x .* y.value
+    newjac = x .* y.jacobian
+    DualArray(newval,newjac)
+end
+
+function broadcasted(::typeof(*),x::DualArray,y::AbstractArray)
+    newval = x.value .* y
+    newjac = y .* x.jacobian
     DualVector(newval,newjac)
 end
 
+function broadcasted(::typeof(^),x::DualVector,n::Int)
+    newval = x.value .^ n
+    newjac = n * x.value .^ (n - 1) .* x.jacobian
+    DualVector(newval,newjac)
+end
+
+broadcasted(::typeof(Base.literal_pow), ::typeof(^), x::DualVector, ::Val{n}) where n = broadcasted(^, x, n)
+
+###
+# sum (higher-dimensional case assuming that shape of perturbations is preserved)
+###
 function sum(x::DualVector)
-    n = length(x.value)
     Dual(sum(x.value), vec(sum(x.jacobian; dims=1)))
 end
 
+function sum(x::DualArray)
+    #Drop redundant leading dimensions of 1 in jacobian
+    idx = ntuple(i -> i > ndims(x) ? Colon() : 1, 2*ndims(x))
+    Dual(sum(x.value), sum(x.jacobian; dims=1:ndims(x))[idx...])
+end
+
 _jacobian(d::Dual) = permutedims(d.partials)
+_jacobian(d::DualVector) = d.jacobian
 _jacobian(d::DualVector, ::Int) = d.jacobian
-_jacobian(x::Number, N::Int) = zeros(typeof(x), 1, N)
+_jacobian(x::Number, N::Int) = Zeros{typeof(x)}(1, N)
 
 _value(d::DualVector) = d.value
 _value(x::Number) = x
@@ -88,16 +244,45 @@ function vcat(x::Union{Dual, DualVector}...)
     DualVector(value,jacobian)
 end
 
-_size(x::Real) = 1
-_size(x::DualVector) = size(x)
+function vcat(a::Dual ,x::DualVector, b::Dual)
+    val = vcat(a.value, x.value, b.value)
+    jac = sparsevcat(_jacobian(a), x.jacobian, _jacobian(b))
+    DualVector(val, jac)
+end
+
+function vcat(a::Real ,x::DualVector, b::Real)
+    cols = size(x.jacobian,2)
+    val = vcat(a, x.value, b)
+    jac = sparsevcat(_jacobian(a, cols), x.jacobian, _jacobian(b, cols))
+    DualVector(val, jac)
+end
+
+###
+#
+# enable converting between DualVector and DualMatrix.
+# the 4D jacobian can be thought of as a generalisation of the 2D jacobian:
+#
+#   n × m 2D jacobian:  J[i,j] = ∂fᵢ / ∂xⱼ
+#   n × m × s × t 4D jacobian: J[i, j, k, l] = ∂f[i, j] / δx[k, l]
+#
+###
+
+# reshape does not preserve shape of perturbations (since jacobian can have
+# any number of columns)
+function reshape(x::DualVector,dims::Vararg{Int})
+    val = reshape(x.value, dims...)
+    jac = reshape(x.jacobian, dims..., size(x.jacobian, 2), 1)
+    DualMatrix(val, jac)
+end
 
-function vcat(x::Union{Real, DualVector}...)
-    cols = max((_size(i) for i in x)...)
-    val = vcat((_value(i) for i in x)...)
-    jac = vcat((_jacobian(i, cols) for i in x)...)
+function vec(x::DualArray)
+    val = vec(x.value)
+    jac = reshape(x.jacobian, length(val), prod(size(x.jacobian)[ndims(x)+1:end]))
     DualVector(val, jac)
 end
 
+show(io::IO,::MIME"text/plain", x::DualArray) = (print(io,x.value); print(io," + "); print(io,x.jacobian);print("𝛜"))
 show(io::IO,::MIME"text/plain", x::DualVector) = (print(io,x.value); print(io," + "); print(io,x.jacobian);print("𝛜"))
+show(io::IO,::MIME"text/plain", x::Dual) = (print(io,x.value); print(io," + "); print(io,x.partials);print("𝛜"))
 end
-# module DualArrays
+# module DualArrays
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index d1cba3c..9f10864 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,42 +1,100 @@
-using DualArrays, Test, LinearAlgebra, ForwardDiff, BandedMatrices
+using DualArrays, Test, LinearAlgebra, BandedMatrices, ForwardDiff, ComponentArrays
 using DualArrays: Dual
+using Lux: relu
 
 @testset "DualArrays" begin
-    @test_throws ArgumentError DualVector([1,2],I(3))
     
-    v = DualVector([1,2, 3], [1 2 3; 4 5 6;7 8 9])
-    @test v[1] isa Dual
-    @test v[1] == Dual(1,[1,2,3])
-    @test v[2] == Dual(2,[4,5,6])
-    @test v[3] == Dual(3,[7,8,9])
-    @test_throws BoundsError v[4]
-    @test v == DualVector([1,2, 3], [1 2 3; 4 5 6;7 8 9])
-
-    w = v + v
-    @test w == DualVector([2,4,6],[2 4 6;8 10 12;14 16 18])
-    @test w.jacobian == 2v.jacobian
-
-    @test sin.(v) isa DualVector
-    @test sin.(v).value == [sin(1), sin(2), sin(3)]
-    @test sin.(v).jacobian == Diagonal(cos.(v.value)) * v.jacobian
-
-    x,y = v[1:2],v[2:3]
-    @test x == DualVector([1,2],[1 2 3;4 5 6])
-    @test y == DualVector([2,3],[4 5 6;7 8 9])
-    @test x .* y == DualVector([2,6],[6 9 12;26 31 36])
+    v = DualArray([1.,2, 3], [1 2 3; 4 5 6;7 8 9])
+    @testset "constructors" begin
+        @test v isa DualVector
+        @test eltype(v) == DualArrays.Dual{Float64}
+        w = DualArray(ones(2,2),ones(2,2,2,2))
+        @test w isa DualMatrix
+        @test_throws TypeError DualArray([1,2], [1,2])
+        @test_throws ArgumentError DualVector([1,2],I(3))
+    end
+
+    @testset "indexing" begin
+        @test v[1] isa Dual
+        @test v[1] == Dual(1,[1,2,3])
+        @test v[2] == Dual(2,[4,5,6])
+        @test v[3] == Dual(3,[7,8,9])
+        @test_throws BoundsError v[4]
+        @test v == DualVector([1,2, 3], [1 2 3; 4 5 6;7 8 9])
+
+        x,y = v[1:2],v[2:3]
+        @test x == DualVector([1,2],[1 2 3;4 5 6])
+        @test y == DualVector([2,3],[4 5 6;7 8 9])
+
+        n = 10
+        w = DualVector(1:n, I(n))
+        @test w[2:end].jacobian isa BandedMatrix
+    end
+
+    @testset "basic_operations" begin
+        w = v + v
+        @test w == DualVector([2.,4.,6.],[2. 4 6;8 10 12;14 16 18])
+        @test w.jacobian == 2v.jacobian
+
+        u = v - w
+        @test u == -v
+
+        x = v.value + v - v.value
+        @test x == v
+
+        @test v .^ 2 == DualVector([1, 4, 9], [2 4 6;16 20 24; 42 48 54])
+
+        x,y = v[1:2],v[2:3]
+        @test x .* y == DualVector([2,6],[6 9 12;26 31 36])
+        @test sum(x .* y) isa Dual
+        @test sum(x .* y) == Dual(8,[32,40,48])
+    end
+
+    @testset "broadcasting" begin
+        @test sin.(v) isa DualVector
+        @test sin.(v).value == [sin(1), sin(2), sin(3)]
+        @test sin.(v).jacobian == Diagonal(cos.(v.value)) * v.jacobian
     
-    @test sum(x .* y) isa Dual
-    @test sum(x .* y) == Dual(8,[32,40,48])
+        @test exp.(v) isa DualVector
+        @test exp.(v).value == [exp(1), exp(2), exp(3)]
+        @test exp.(v).jacobian == Diagonal(exp.(v.value)) * v.jacobian
+    
+        @test relu.(v) isa DualVector
+        @test relu.(v).value == [relu(1), relu(2), relu(3)]
+        @test relu.(v).jacobian == Diagonal( 0.5 * (sign.(v.value) .+ 1)) * v.jacobian
+    end
+
+    @testset "misc" begin
+        n = 10
+        v = DualVector(1:n, I(n))
+        @test sum(v[1:end-1] .* v[2:end]).partials == ForwardDiff.gradient(v -> sum(v[1:end-1] .* v[2:end]), 1:n)
+    end
 
-    n = 10
-    v = DualVector(1:n, I(n))
-    @test v[2:end].jacobian isa BandedMatrix
+    @testset "vcat" begin
+        x = Dual(1, [1, 2, 3])
+        y = DualVector([2, 3], [4 5 6;7 8 9])
+        @test vcat(x) == x
+        @test vcat(x, x) == DualVector([1, 1], [1 2 3;1 2 3])
+        @test vcat(x, y) == DualVector([1, 2, 3], [1 2 3;4 5 6;7 8 9])
+        @test vcat(1, y, 2) == DualVector([1,2,3,2], [0 0 0; 4 5 6; 7 8 9; 0 0 0])
+    end
 
-    @test sum(v[1:end-1] .* v[2:end]).partials == ForwardDiff.gradient(v -> sum(v[1:end-1] .* v[2:end]), 1:n)
+    @testset "reshape" begin
+        x = DualVector([1,2,3,4], I(4))
+        @test vec(reshape(x, 2, 2)) == x
+    end
 
-    x = Dual(1, [1, 2, 3])
-    y = DualVector([2, 3], [4 5 6;7 8 9])
-    @test vcat(x) == x
-    @test vcat(x, x) == DualVector([1, 1], [1 2 3;1 2 3])
-    @test vcat(x, y) == DualVector([1, 2, 3], [1 2 3;4 5 6;7 8 9])
+    @testset "DualMatrix" begin
+        x = DualVector(ones(6), I(6))
+        y = reshape(x[1:4], 2, 2)
+        z = x[5:6]
+        @test (y*z).value == [2,2]
+        @test (y*z).jacobian == [1 0 1 0 1 1; 0 1 0 1 1 1]
+        for t in (([1, 0, 0, 1, 0, 0], [1 0 -1 0 1 0;0 0 0 0 0 0]), ([1, 1, 1, 1, 1, 1], [1 0 -1 0 1 0;0 1 0 -1 0 1]), ([0, 1, 1, 0, -1, -1], zeros(2,6)))
+            x = DualVector(t[1], I(6))
+            y = reshape(x[1:4], 2, 2)
+            z = x[5:6]
+            @test relu.(y*[1,-1] + z).jacobian == t[2]
+        end
+    end
 end
\ No newline at end of file

From b8e7fc8c1f1ba4747d679f125f1ee58448108ebc Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Wed, 25 Sep 2024 20:08:28 +0100
Subject: [PATCH 02/10] initial (slow) implementation of blockmatrixtensor

---
 Project.toml             |   4 +
 examples/convlayer.jl    |  15 ++-
 src/BlockMatrixTensor.jl |  83 ++++++++++++
 src/DualArray.jl         | 281 ++++++++++++++++++++++++++++++++++++++
 src/DualArrays.jl        | 282 +--------------------------------------
 test/runtests.jl         |   4 +
 6 files changed, 387 insertions(+), 282 deletions(-)
 create mode 100644 src/BlockMatrixTensor.jl
 create mode 100644 src/DualArray.jl

diff --git a/Project.toml b/Project.toml
index a78c37d..0cf0764 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,6 +6,8 @@ version = "0.1.0"
 [deps]
 ArrayLayouts = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
 BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
+BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
+BlockBandedMatrices = "ffab5731-97b5-5995-9138-79e8c1846df0"
 ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
@@ -14,6 +16,8 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 MLDatasets = "eb30cadb-4394-5ae3-aed4-317e484a6458"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+ProfileView = "c46f51b8-102a-5cf2-8d2c-8597cb0e0da7"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
diff --git a/examples/convlayer.jl b/examples/convlayer.jl
index 2e16d64..538fd57 100644
--- a/examples/convlayer.jl
+++ b/examples/convlayer.jl
@@ -8,13 +8,11 @@ function convlayer(img, ker, xstride = 1, ystride = 1)
     n2, m2 = size(img)
     n3, m3 = div(n2-n+1,xstride), div(m2-m+1,ystride)
     fmap = zeros(promote_type(eltype(img), t), n3, m3)
-
     #Apply kernel to section of image
     for i= 1:xstride:n3,j = 1:ystride:m3
         ft = img[i:i+n-1,j:j+m-1] .* ker
         fmap[i,j] = sum(ft)
     end
-
     fmap
 end
 
@@ -23,8 +21,12 @@ function softmax(x)
     exp.(x) / s
 end
 
-function dense_layer(f, W, b, x)
-    f(W*x + b)
+function dense_layer(W, b, x, f::Function = identity)
+    ret = W*x
+    println("Multiplication complete")
+    ret += b
+    println("Addition Complete")
+    f(ret)
 end
 
 function cross_entropy(x, y)
@@ -36,9 +38,12 @@ function model_loss(x, y, w)
     weights = reshape(w[10:6769], 10, 676)
     biases = w[6770:6779]
     l1 = vec(DualMatrix(convlayer(x, ker)))
-    l2 = dense_layer(softmax, weights, biases, l1)
+    println("Conv layer complete")
+    l2 = dense_layer(weights, biases, l1, softmax)
+    println("Dense Layer complete")
     target = OneElement(1, y+1, 10)
     loss = cross_entropy(l2, target)
+    println("Loss complete")
     loss.value, loss.partials
 end
 
diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
new file mode 100644
index 0000000..4bb693b
--- /dev/null
+++ b/src/BlockMatrixTensor.jl
@@ -0,0 +1,83 @@
+#4-Tensor with data stored in a BlockArray
+#Enables sparsity for DualMatrix jacobian.
+using BlockArrays, SparseArrays
+
+struct BlockMatrixTensor{T} <: AbstractArray{T, 4}
+    data::AbstractBlockMatrix{T}
+end
+
+function BlockMatrixTensor(x::AbstractArray{T, 4}) where {T}
+    n, m, s, t = size(x)
+    data = BlockArray(zeros(T, n * s, m * t), fill(s, n), fill(t, m))
+    for i = 1:n, j = 1:m
+        data[Block(i,j)] = x[i,j,:,:]
+    end
+    BlockMatrixTensor(data)
+end
+
+size(x::BlockMatrixTensor) = (blocksize(x.data)..., size(x.data[Block(1), Block(1)])...)
+
+getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d) = sparse_getindex(x.data[Block(a), Block(b)], c, d)
+getindex(x::BlockMatrixTensor, a::UnitRange, b::UnitRange, ::Colon, ::Colon) = BlockMatrixTensor(x.data[Block.(a),Block.(b)])
+
+function show(io::IO,m::MIME"text/plain", x::BlockMatrixTensor)
+    print("BlockMatrixTensor containing: \n")
+    show(io,m,x.data)
+end
+show(io::IO,x::BlockMatrixTensor) = show(io, x.data)
+
+#Blockwise broadcast
+function broadcasted(f::Function, x::BlockMatrixTensor, y::AbstractMatrix)
+    z = undef
+    if size(y, 1) == 1
+        z = repeat(y, blocksize(x.data, 1))
+    elseif size(y, 2) == 1
+        z = repeat(y, 1, blocksize(x.data, 2))
+    else
+        z = copy(y)
+    end
+    n, m = size(z)
+    blocked = mortar([fill(z[i,j], size(x.data[Block(1,1)])) for i = 1:n, j = 1:m])
+    BlockMatrixTensor(f.(blocked, x.data))
+end
+
+function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor)
+    z = undef
+    if size(x, 1) == 1
+        z = repeat(x, blocksize(y.data, 1))
+    elseif size(x, 2) == 1
+        z = repeat(y, 1, blocksize(y.data, 2))
+    else
+        z = copy(x)
+    end
+    n, m = size(z)
+    blocked = mortar([fill(z[i,j], size(y.data[Block(1,1)])) for i = 1:n, j = 1:m])
+    BlockMatrixTensor(f.(blocked, y.data))
+end
+
+function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
+    if dims == 1:2
+        n, m = size(x, 3), size(x, 4)
+        ret = zeros(T, n, m)
+        for i = 1:size(x, 1), j = 1:size(x, 2)
+            ret += x.data[Block(i, j)]
+        end
+        sparse(ret)
+    elseif dims == 2
+        n, m, s, t = size(x)
+        ret = BlockArray(zeros(T, n*s, t), fill(s, n), fill(t, 1))
+        for i = 1:n, j = 1:m
+            ret[Block(i,1)] += x.data[Block(i,j)]
+        end
+        BlockMatrixTensor(ret)
+    end
+end
+
+function reshape(x::BlockMatrixTensor, dims::Vararg{Int64})
+    #Blockwise reshape
+    if dims[3] == size(x, 3) && dims[4] == size(x, 4)
+        mortar(reshape(blocks(x.data), dims[1], dims[2]))
+    end
+end
+
+vec(x::BlockMatrixTensor) = vcat(blocks(x.data)...)
diff --git a/src/DualArray.jl b/src/DualArray.jl
new file mode 100644
index 0000000..53f0282
--- /dev/null
+++ b/src/DualArray.jl
@@ -0,0 +1,281 @@
+# TODO: support non-banded
+sparsevcat(As...) = vcat(convert.(BandedMatrix,  As)...)
+
+# Should Partials necessarily be a vector?
+# For example, if we index a DualMatrix, should partials retain the shape
+# of the perturbations, thereby giving a matrix of partials?
+struct Dual{T, Partials <: AbstractArray{T}} <: Real
+    value::T
+    partials::Partials
+end
+
+zero(::Type{Dual{T}}) where {T} = Dual(zero(T), zeros(T,1))
+promote_rule(::Type{Dual{T}},::Type{S}) where {T,S} = Dual{promote_type(T,S)}
+
+==(a::Dual, b::Dual) = a.value == b.value && a.partials == b.partials
+for op in (:+, :-)
+    @eval $op(a::Dual, k::Real) = Dual($op(a.value, k), a.partials)
+end
+for op in (:*, :/)
+    @eval $op(a::Dual, k::Real) = Dual($op(a.value, k), $op(a.partials,k))
+end
+
+-(x::Dual) = Dual(-1 * x.value, -1 * x.partials)
+
+*(x::Dual, y::Dual) = Dual(x.value*y.value, x.value*y.partials + y.value*x.partials)
+/(x::Dual, y::Dual) = x*Dual(y.value, -y.partials) / y.value^2
+
+sparse_getindex(a...) = layout_getindex(a...)
+sparse_getindex(D::Diagonal, k::Integer, ::Colon) = OneElement(D.diag[k], k, size(D,2))
+sparse_getindex(D::Diagonal, ::Colon, j::Integer) = OneElement(D.diag[j], j, size(D,1))
+sparse_getindex(s::AbstractSparseArray, idx...) = s[idx...]
+
+sparse_getindex(b::BandedMatrix, k::Integer, ::Colon) = sparsevec(
+    rowrange(b, k), b[k, rowrange(b, k)], size(b, 2)
+    )
+sparse_getindex(b::BandedMatrix, ::Colon, k::Integer) = sparsevec(
+    colrange(b, k), b[colrange(b, k), k], size(b, 1)
+    )
+
+sparse_transpose(s::SparseVector) = sparse(ones(length(s.nzind)),s.nzind,s.nzval, 1, length(s))
+sparse_transpose(o::OneElement{T,1}) where {T} = reshape(o, 1, :)
+sparse_transpose(x::AbstractVector) = x'
+
+"""
+reprents an array of duals given by
+    
+    values + jacobian * 𝛜.
+
+where 𝛜 is an array with dual parts corresponding to each entry of values.
+"""
+
+#Helper function to check valid dimensions of DualArray
+function _checkdims(val, jac)
+    s1, s2 = size(val), size(jac)
+    if length(s1) > length(s2)
+        return false
+    end
+    for (i, x) in enumerate(s1)
+        if(x != s2[i])
+            return false
+        end
+    end
+    return true
+end
+
+struct DualArray{T, N, M, J <: AbstractArray{T, M}} <: AbstractArray{Dual{T}, N}
+    value::AbstractArray{T, N}
+    jacobian::J
+    function DualArray(value::AbstractArray{T,N},jacobian::AbstractArray{T, M}) where {T, N, M}
+        if(!_checkdims(value,jacobian))
+            s1, s2 = size(value), size(jacobian)
+            throw(ArgumentError("N-dimensional array must be of equal size to first N dimensions of jacobian. \n\nGot $s1, $s2"))
+        end
+        new{T,N,2*N,typeof(jacobian)}(value,jacobian)
+    end
+end
+
+function DualArray(value::AbstractArray{T,N}, jacobian::AbstractArray{S,M}) where {S,T,N,M}
+    t = promote_type(T, S)
+    DualArray(convert(AbstractArray{t,N}, value), convert(AbstractArray{t, M}, jacobian))
+end
+
+#Aliases and Constructors
+const DualVector{T, M} = DualArray{T, 1, 2, M} where {T, M <: AbstractMatrix{T}}
+const DualMatrix{T, M} = DualArray{T, 2, 4, M} where {T, M <: AbstractArray{T, 4}}
+
+DualVector(x::AbstractVector, j::AbstractMatrix) = DualArray(x, j)
+DualVector(x::AbstractVector, j::ComponentArray) = DualArray(x, j)
+DualMatrix(x::AbstractMatrix, j::AbstractArray{T,4}) where {T} = DualArray(x, j)
+
+function DualMatrix(x::AbstractMatrix)
+    val = [y.value for y in x]
+    n, m = size(x)
+    jac = mortar([x[i,j].partials for i = 1:n, j = 1:m])
+    DualMatrix(sparse(val), BlockMatrixTensor(jac))
+end
+
+function getindex(x::DualVector, y::Int)
+    Dual(x.value[y], sparse_getindex(x.jacobian,y,:))
+end
+function getindex(x::DualVector, y::UnitRange)
+    newval = x.value[y]
+    newjac = sparse_getindex(x.jacobian,y,:)
+    #newjac = layout_getindex(x.jacobian,y,:)
+    DualVector(newval, newjac)
+end
+getindex(x::DualVector, ::Colon) = x
+
+function getindex(x::DualArray, y::Int...)
+    idx = ntuple(i -> i > length(y) ? Colon() : y[i], 2*ndims(x))
+    Dual(x.value[y...], x.jacobian[idx...])
+end
+
+for op in (:size, :axes, :ndims)
+    @eval $op(x::DualArray) = $op(x.value)
+end
+
++(x::DualVector,y::DualVector) = DualVector(x.value + y.value, x.jacobian + y.jacobian)
+-(x::DualVector,y::DualVector) = DualVector(x.value - y.value, x.jacobian - y.jacobian)
++(x::DualVector,y::Vector) = DualVector(x.value + y, x.jacobian)
+-(x::DualVector,y::Vector) = DualVector(x.value - y, x.jacobian)
++(x::Vector,y::DualVector) = DualVector(y.value + x, y.jacobian)
+-(x::Vector,y::DualVector) = DualVector(x - y.value, -y.jacobian)
+*(x::AbstractMatrix, y::DualVector) = DualVector(x * y.value, x * y.jacobian)
+*(x::Number, y::DualVector) = DualVector(x * y.value, x * y.jacobian)
+*(x::DualVector, y::Dual) = DualVector(x.value * y.value, y.value * x.jacobian + x.value * y.partials')
+/(x::DualVector, y::Number) = DualVector(x.value / y, x.jacobian/ y)
+/(x::DualVector, y::Dual) = x * Dual(y.value, -y.partials) / y.value^2
+
+##
+# Note about 4-tensor by matrix multiplication:
+#   We can consider matrix-vector multiplication as follows:
+#
+#       (Ax)[i] = A[i,:] . x
+#   
+#   This can be generalised to higher dimensions (B a 4-tensor, X a matrix):
+#
+#       (BX)[i,j] = B[i,j,:,:] . X
+#
+#   This is useful since if we consider 4-tensors dimension n × m × s × t
+#   equivalent to linear maps from n × m matrices to s × t matrices, this 
+#   gives us multiplication analogous to applying the linear transform on
+#   a matrix in the same way that matrix-vector multiplication applies the
+#   linear transformation to the vector. This also gives us a way to 
+#   'expand' the Dual part of a DualMatrix.
+##
+
+# Assumes that the total number of epsilons is equal, i.e the array of
+# perturbations are reshapes of each other.
+function *(x::DualMatrix, y::DualVector)
+    val = x.value * y.value
+    jac = x.value * y.jacobian + vcat(blocks(sum(x.jacobian .* y.value'; dims = 2).data)...)
+    DualVector(val, jac)
+end
+
+function *(x::DualMatrix, y::AbstractVector)
+    val = x.value * y
+    t = promote_type(eltype(x.value), eltype(y))
+    jac = zeros(t, length(val), size(x.jacobian, 3) * size(x.jacobian, 4))
+    n, m = size(jac, 1), size(x.value, 2)
+    for i = 1:n
+        new_row = zeros(size(jac, 2))
+        for j = 1:m
+            new_row += y[j] * vec(x.jacobian[i, j, :, :])
+        end
+        jac[i,:] = new_row
+    end
+    DualVector(val, jac)
+end
+
+function broadcasted(f::Function,d::DualVector)
+    jvals = zeros(eltype(d.value), length(d.value))
+    for (i, x) = enumerate(d.value)
+        _, df = frule((ZeroTangent(), 1),f, x)
+        jvals[i] = df
+    end
+    DualVector(f.(d.value), Diagonal(jvals)*d.jacobian)
+end
+
+function broadcasted(::typeof(*),x::DualVector,y::DualVector)
+    newval = x.value .* y.value
+    newjac = x.value .* y.jacobian + y.value .* x.jacobian
+    DualArray(newval,newjac)
+end
+
+function broadcasted(::typeof(*),x::AbstractArray,y::DualArray)
+    newval = x .* y.value
+    newjac = x .* y.jacobian
+    DualArray(newval,newjac)
+end
+
+function broadcasted(::typeof(*),x::DualArray,y::AbstractArray)
+    newval = x.value .* y
+    newjac = y .* x.jacobian
+    DualArray(newval,newjac)
+end
+
+function broadcasted(::typeof(^),x::DualVector,n::Int)
+    newval = x.value .^ n
+    newjac = n * x.value .^ (n - 1) .* x.jacobian
+    DualVector(newval,newjac)
+end
+
+broadcasted(::typeof(Base.literal_pow), ::typeof(^), x::DualVector, ::Val{n}) where n = broadcasted(^, x, n)
+
+###
+# sum (higher-dimensional case assuming that shape of perturbations is preserved)
+###
+function sum(x::DualVector)
+    Dual(sum(x.value), vec(sum(x.jacobian; dims=1)))
+end
+
+function sum(x::DualArray)
+    Dual(sum(x.value), sum(x.jacobian; dims=1:ndims(x)))
+end
+
+_jacobian(d::Dual) = permutedims(d.partials)
+_jacobian(d::DualVector) = d.jacobian
+_jacobian(d::DualVector, ::Int) = d.jacobian
+_jacobian(x::Number, N::Int) = Zeros{typeof(x)}(1, N)
+
+_value(d::DualVector) = d.value
+_value(x::Number) = x
+
+function vcat(x::Union{Dual, DualVector}...)
+    if length(x) == 1
+        return x[1]
+    end
+    value = vcat((d.value for d in x)...)
+    jacobian = vcat((_jacobian(d) for d in x)...)
+    DualVector(value,jacobian)
+end
+
+function vcat(a::Dual ,x::DualVector, b::Dual)
+    val = vcat(a.value, x.value, b.value)
+    jac = sparsevcat(_jacobian(a), x.jacobian, _jacobian(b))
+    DualVector(val, jac)
+end
+
+function vcat(a::Real ,x::DualVector, b::Real)
+    cols = size(x.jacobian,2)
+    val = vcat(a, x.value, b)
+    jac = sparsevcat(_jacobian(a, cols), x.jacobian, _jacobian(b, cols))
+    DualVector(val, jac)
+end
+
+###
+#
+# enable converting between DualVector and DualMatrix.
+# the 4D jacobian can be thought of as a generalisation of the 2D jacobian:
+#
+#   n × m 2D jacobian:  J[i,j] = ∂fᵢ / ∂xⱼ
+#   n × m × s × t 4D jacobian: J[i, j, k, l] = ∂f[i, j] / δx[k, l]
+#
+###
+
+# reshape does not preserve shape of perturbations (since jacobian can have
+# any number of columns)
+
+#Should a PR be made into reshape() 
+_tomatrix(v::AbstractVector) = reshape(v, :, 1)
+
+function reshape(x::DualVector,dims::Vararg{Int})
+    val = reshape(x.value, dims...)
+    jac_blockmat = [sparse_transpose(sparse_getindex(x.jacobian, i, :)) for i = 1:size(x.jacobian, 1)]
+    jac = BlockMatrixTensor(mortar(reshape(jac_blockmat, dims...)))
+    DualMatrix(val, jac)
+end
+
+_blockvec(x::AbstractArray{T,4}) where {T} = reshape(x, size(x, 1) * size(x, 2), size(x, 3) * size(x, 4))
+_blockvec(x::BlockMatrixTensor) = vcat(blocks(x.data)...)
+
+function vec(x::DualArray)
+    val = vec(x.value)
+    jac = _blockvec(x.jacobian)
+    DualVector(val, jac)
+end
+
+show(io::IO,::MIME"text/plain", x::DualArray) = (print(io,x.value); print(io," + "); print(io,x.jacobian);print("𝛜"))
+show(io::IO,::MIME"text/plain", x::DualVector) = (print(io,x.value); print(io," + "); print(io,x.jacobian);print("𝛜"))
+show(io::IO,::MIME"text/plain", x::Dual) = (print(io,x.value); print(io," + "); print(io,x.partials);print("𝛜"))
\ No newline at end of file
diff --git a/src/DualArrays.jl b/src/DualArrays.jl
index 56d705d..830ba8f 100644
--- a/src/DualArrays.jl
+++ b/src/DualArrays.jl
@@ -1,288 +1,16 @@
 module DualArrays
-export DualVector, DualArray, DualMatrix
+
+export DualVector, DualArray, DualMatrix, BlockMatrixTensor
 export dropzeros
 
 import Base: +, ==, getindex, size, broadcast, axes, broadcasted, show, sum,
              vcat, convert, *, -, ^, /, ndims, hcat, vec, promote_rule, zero,
              reshape
-using LinearAlgebra, ArrayLayouts, BandedMatrices, FillArrays, ComponentArrays
+using LinearAlgebra, ArrayLayouts, BandedMatrices, FillArrays, ComponentArrays, SparseArrays
 import ChainRules: frule, ZeroTangent
 
-# TODO: support non-banded
-sparsevcat(As...) = vcat(convert.(BandedMatrix,  As)...)
-
-# Should Partials necessarily be a vector?
-# For example, if we index a DualMatrix, should partials retain the shape
-# of the perturbations, thereby giving a matrix of partials?
-struct Dual{T, Partials <: AbstractArray{T}} <: Real
-    value::T
-    partials::Partials
-end
-
-zero(::Type{Dual{T}}) where {T} = Dual(zero(T), zeros(T,1))
-promote_rule(::Type{Dual{T}},::Type{S}) where {T,S} = Dual{promote_type(T,S)}
-
-==(a::Dual, b::Dual) = a.value == b.value && a.partials == b.partials
-for op in (:+, :-)
-    @eval $op(a::Dual, k::Real) = Dual($op(a.value, k), a.partials)
-end
-for op in (:*, :/)
-    @eval $op(a::Dual, k::Real) = Dual($op(a.value, k), $op(a.partials,k))
-end
-
--(x::Dual) = Dual(-1 * x.value, -1 * x.partials)
-
-*(x::Dual, y::Dual) = Dual(x.value*y.value, x.value*y.partials + y.value*x.partials)
-/(x::Dual, y::Dual) = x*Dual(y.value, -y.partials) / y.value^2
-
-sparse_getindex(a...) = layout_getindex(a...)
-sparse_getindex(D::Diagonal, k::Integer, ::Colon) = OneElement(D.diag[k], k, size(D,2))
-sparse_getindex(D::Diagonal, ::Colon, j::Integer) = OneElement(D.diag[j], j, size(D,1))
-
-"""
-reprents an array of duals given by
-    
-    values + jacobian * 𝛜.
-
-where 𝛜 is an array with dual parts corresponding to each entry of values.
-"""
-
-#Helper function to check valid dimensions of DualArray
-function _checkdims(val, jac)
-    s1, s2 = size(val), size(jac)
-    if length(s1) > length(s2)
-        return false
-    end
-    for (i, x) in enumerate(s1)
-        if(x != s2[i])
-            return false
-        end
-    end
-    return true
-end
-
-struct DualArray{T, N, M, J <: AbstractArray{T, M}} <: AbstractArray{Dual{T}, N}
-    value::AbstractArray{T, N}
-    jacobian::J
-    function DualArray(value::AbstractArray{T,N},jacobian::AbstractArray{T, M}) where {T, N, M}
-        if(!_checkdims(value,jacobian))
-            s1, s2 = size(value), size(jacobian)
-            throw(ArgumentError("N-dimensional array must be of equal size to first N dimensions of jacobian. \n\nGot $s1, $s2"))
-        end
-        new{T,N,2*N,typeof(jacobian)}(value,jacobian)
-    end
-end
-
-function DualArray(value::AbstractArray{T,N}, jacobian::AbstractArray{S,M}) where {S,T,N,M}
-    t = promote_type(T, S)
-    DualArray(convert(AbstractArray{t,N}, value), convert(AbstractArray{t, M}, jacobian))
-end
-
-#Aliases and Constructors
-const DualVector{T, M} = DualArray{T, 1, 2, M} where {T, M <: AbstractMatrix{T}}
-const DualMatrix{T, M} = DualArray{T, 2, 4, M} where {T, M <: AbstractArray{T, 4}}
-
-DualVector(x::AbstractVector, j::AbstractMatrix) = DualArray(x, j)
-DualVector(x::AbstractVector, j::ComponentArray) = DualArray(x, j)
-DualMatrix(x::AbstractMatrix, j::AbstractArray{T,4}) where {T} = DualArray(x, j)
-
-function DualMatrix(x::AbstractMatrix{Dual{T}}) where {T}
-    val = zeros(T, size(x)...)
-    jac = zeros(T, size(x)..., size(x[1,1].partials)...)
-    n, m = size(x)
-    for i = 1:n, j = 1:m
-        val[i,j] = x[i,j].value
-        jac[i,j,:,:] = x[i,j].partials
-    end
-    DualMatrix(val, jac)
-end
-
-function getindex(x::DualVector, y::Int)
-    Dual(x.value[y], sparse_getindex(x.jacobian,y,:))
-end
-function getindex(x::DualVector, y::UnitRange)
-    newval = x.value[y]
-    newjac = sparse_getindex(x.jacobian,y,:)
-    newjac = layout_getindex(x.jacobian,y,:)
-    DualVector(newval, newjac)
-end
-getindex(x::DualVector, ::Colon) = x
-
-function getindex(x::DualArray, y::Int...)
-    idx = ntuple(i -> i > length(y) ? Colon() : y[i], 2*ndims(x))
-    Dual(x.value[y...], x.jacobian[idx...])
-end
-
-for op in (:size, :axes, :ndims)
-    @eval $op(x::DualArray) = $op(x.value)
-end
-
-+(x::DualVector,y::DualVector) = DualVector(x.value + y.value, x.jacobian + y.jacobian)
--(x::DualVector,y::DualVector) = DualVector(x.value - y.value, x.jacobian - y.jacobian)
-+(x::DualVector,y::Vector) = DualVector(x.value + y, x.jacobian)
--(x::DualVector,y::Vector) = DualVector(x.value - y, x.jacobian)
-+(x::Vector,y::DualVector) = DualVector(y.value + x, y.jacobian)
--(x::Vector,y::DualVector) = DualVector(x - y.value, -y.jacobian)
-*(x::AbstractMatrix, y::DualVector) = DualVector(x * y.value, x * y.jacobian)
-*(x::Number, y::DualVector) = DualVector(x * y.value, x * y.jacobian)
-*(x::DualVector, y::Dual) = DualVector(x.value * y.value, y.value * x.jacobian + x.value * y.partials')
-/(x::DualVector, y::Number) = DualVector(x.value / y, x.jacobian/ y)
-/(x::DualVector, y::Dual) = x * Dual(y.value, -y.partials) / y.value^2
-
-##
-# Note about 4-tensor by matrix multiplication:
-#   We can consider matrix-vector multiplication as follows:
-#
-#       (Ax)[i] = A[i,:] . x
-#   
-#   This can be generalised to higher dimensions (B a 4-tensor, X a matrix):
-#
-#       (BX)[i,j] = B[i,j,:,:] . X
-#
-#   This is useful since if we consider 4-tensors dimension n × m × s × t
-#   equivalent to linear maps from n × m matrices to s × t matrices, this 
-#   gives us multiplication analogous to applying the linear transform on
-#   a matrix in the same way that matrix-vector multiplication applies the
-#   linear transformation to the vector. This also gives us a way to 
-#   'expand' the Dual part of a DualMatrix.
-##
-
-# Assumes that the total number of epsilons is equal, i.e the array of
-# perturbations are reshapes of each other.
-function *(x::DualMatrix, y::DualVector)
-    val = x.value * y.value
-    t = promote_type(eltype(x.value), eltype(y.value))
-    jac = zeros(t, length(val), size(y.jacobian, 2))
-    n, m = size(jac, 1), size(x.value, 2)
-    for i = 1:n
-        new_row = zeros(size(jac, 2))
-        for j = 1:m
-            new_row += x.value[i,j] * y.jacobian[j,:] + y.value[j] * vec(x.jacobian[i, j, :, :])
-        end
-        jac[i,:] = new_row
-    end
-    DualVector(val, jac)
-end
-
-function *(x::DualMatrix, y::AbstractVector)
-    val = x.value * y
-    t = promote_type(eltype(x.value), eltype(y))
-    jac = zeros(t, length(val), size(x.jacobian, 3) * size(x.jacobian, 4))
-    n, m = size(jac, 1), size(x.value, 2)
-    for i = 1:n
-        new_row = zeros(size(jac, 2))
-        for j = 1:m
-            new_row += y[j] * vec(x.jacobian[i, j, :, :])
-        end
-        jac[i,:] = new_row
-    end
-    DualVector(val, jac)
-end
-
-function broadcasted(f::Function,d::DualVector)
-    jvals = zeros(eltype(d.value), length(d.value))
-    for (i, x) = enumerate(d.value)
-        _, df = frule((ZeroTangent(), 1),f, x)
-        jvals[i] = df
-    end
-    DualVector(f.(d.value), Diagonal(jvals)*d.jacobian)
-end
-
-function broadcasted(::typeof(*),x::DualVector,y::DualVector)
-    newval = x.value .* y.value
-    newjac = x.value .* y.jacobian + y.value .* x.jacobian
-    DualArray(newval,newjac)
-end
-
-function broadcasted(::typeof(*),x::AbstractArray,y::DualArray)
-    newval = x .* y.value
-    newjac = x .* y.jacobian
-    DualArray(newval,newjac)
-end
-
-function broadcasted(::typeof(*),x::DualArray,y::AbstractArray)
-    newval = x.value .* y
-    newjac = y .* x.jacobian
-    DualVector(newval,newjac)
-end
-
-function broadcasted(::typeof(^),x::DualVector,n::Int)
-    newval = x.value .^ n
-    newjac = n * x.value .^ (n - 1) .* x.jacobian
-    DualVector(newval,newjac)
-end
-
-broadcasted(::typeof(Base.literal_pow), ::typeof(^), x::DualVector, ::Val{n}) where n = broadcasted(^, x, n)
-
-###
-# sum (higher-dimensional case assuming that shape of perturbations is preserved)
-###
-function sum(x::DualVector)
-    Dual(sum(x.value), vec(sum(x.jacobian; dims=1)))
-end
-
-function sum(x::DualArray)
-    #Drop redundant leading dimensions of 1 in jacobian
-    idx = ntuple(i -> i > ndims(x) ? Colon() : 1, 2*ndims(x))
-    Dual(sum(x.value), sum(x.jacobian; dims=1:ndims(x))[idx...])
-end
-
-_jacobian(d::Dual) = permutedims(d.partials)
-_jacobian(d::DualVector) = d.jacobian
-_jacobian(d::DualVector, ::Int) = d.jacobian
-_jacobian(x::Number, N::Int) = Zeros{typeof(x)}(1, N)
-
-_value(d::DualVector) = d.value
-_value(x::Number) = x
-
-function vcat(x::Union{Dual, DualVector}...)
-    if length(x) == 1
-        return x[1]
-    end
-    value = vcat((d.value for d in x)...)
-    jacobian = vcat((_jacobian(d) for d in x)...)
-    DualVector(value,jacobian)
-end
-
-function vcat(a::Dual ,x::DualVector, b::Dual)
-    val = vcat(a.value, x.value, b.value)
-    jac = sparsevcat(_jacobian(a), x.jacobian, _jacobian(b))
-    DualVector(val, jac)
-end
-
-function vcat(a::Real ,x::DualVector, b::Real)
-    cols = size(x.jacobian,2)
-    val = vcat(a, x.value, b)
-    jac = sparsevcat(_jacobian(a, cols), x.jacobian, _jacobian(b, cols))
-    DualVector(val, jac)
-end
-
-###
-#
-# enable converting between DualVector and DualMatrix.
-# the 4D jacobian can be thought of as a generalisation of the 2D jacobian:
-#
-#   n × m 2D jacobian:  J[i,j] = ∂fᵢ / ∂xⱼ
-#   n × m × s × t 4D jacobian: J[i, j, k, l] = ∂f[i, j] / δx[k, l]
-#
-###
-
-# reshape does not preserve shape of perturbations (since jacobian can have
-# any number of columns)
-function reshape(x::DualVector,dims::Vararg{Int})
-    val = reshape(x.value, dims...)
-    jac = reshape(x.jacobian, dims..., size(x.jacobian, 2), 1)
-    DualMatrix(val, jac)
-end
-
-function vec(x::DualArray)
-    val = vec(x.value)
-    jac = reshape(x.jacobian, length(val), prod(size(x.jacobian)[ndims(x)+1:end]))
-    DualVector(val, jac)
-end
+include("BlockMatrixTensor.jl")
+include("DualArray.jl")
 
-show(io::IO,::MIME"text/plain", x::DualArray) = (print(io,x.value); print(io," + "); print(io,x.jacobian);print("𝛜"))
-show(io::IO,::MIME"text/plain", x::DualVector) = (print(io,x.value); print(io," + "); print(io,x.jacobian);print("𝛜"))
-show(io::IO,::MIME"text/plain", x::Dual) = (print(io,x.value); print(io," + "); print(io,x.partials);print("𝛜"))
 end
 # module DualArrays
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 9f10864..fea6117 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -97,4 +97,8 @@ using Lux: relu
             @test relu.(y*[1,-1] + z).jacobian == t[2]
         end
     end
+
+    @testset "BlockMatrixTensor" begin
+        
+    end
 end
\ No newline at end of file

From da6fa4992c8f88f0c5703b86df38b1c7697f55d0 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Wed, 25 Sep 2024 22:20:01 +0100
Subject: [PATCH 03/10] add better broadcasting + unit tests

---
 src/BlockMatrixTensor.jl | 71 ++++++++++++++++++++++++++++------------
 test/runtests.jl         | 24 ++++++++++++--
 2 files changed, 72 insertions(+), 23 deletions(-)

diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index 4bb693b..17fdcbf 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -26,35 +26,64 @@ function show(io::IO,m::MIME"text/plain", x::BlockMatrixTensor)
 end
 show(io::IO,x::BlockMatrixTensor) = show(io, x.data)
 
-#Blockwise broadcast
-function broadcasted(f::Function, x::BlockMatrixTensor, y::AbstractMatrix)
-    z = undef
-    if size(y, 1) == 1
-        z = repeat(y, blocksize(x.data, 1))
-    elseif size(y, 2) == 1
-        z = repeat(y, 1, blocksize(x.data, 2))
+# Blockwise broadcast
+for op in (:*, :/)
+    @eval $op(x::BlockMatrixTensor, y::Number) = BlockMatrixTensor($op(x.data, y))
+    @eval $op(x::Number, y::BlockMatrixTensor) = BlockMatrixTensor($op(x, y.data))
+end
+
+function broadcasted(f::Function, x::BlockMatrixTensor{T}, y::AbstractMatrix) where {T}
+    ret = BlockArray{T}(undef, blocksizes(x.data)...)
+    n, m = blocksize(x.data)
+    if blocksize(x.data) == size(y)
+        for i = 1:n, j = 1:m
+            ret[Block(i,j)] = f.(y[i,j], x.data[Block(i,j)])
+        end
+    elseif size(y) == (1, 1)
+        f.(y[1,1], x.data)
+    elseif size(y) == (1, m)
+        for j = 1:m
+            ret[:, Block(j)] = f.(y[1,j], x.data[:, Block(j)])
+        end
+    elseif size(y) == (n, 1)
+        for i = 1:n
+            ret[Block(i), :] = f.(y[i, 1], x.data[Block(i), :])
+        end
     else
-        z = copy(y)
+        a = size(x)
+        b = size(y)
+        throw(DimensionMismatch("Invalid dimensions for broadcasting. Got $a, $b."))
     end
-    n, m = size(z)
-    blocked = mortar([fill(z[i,j], size(x.data[Block(1,1)])) for i = 1:n, j = 1:m])
-    BlockMatrixTensor(f.(blocked, x.data))
+    BlockMatrixTensor(ret)
 end
 
-function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor)
-    z = undef
-    if size(x, 1) == 1
-        z = repeat(x, blocksize(y.data, 1))
-    elseif size(x, 2) == 1
-        z = repeat(y, 1, blocksize(y.data, 2))
+function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor{T}) where {T}
+    ret = BlockArray{T}(undef, blocksizes(y.data)...)
+    n, m = blocksize(y.data)
+    if blocksize(y.data) == size(x)
+        for i = 1:n, j = 1:m
+            ret[Block(i,j)] = f.(x[i,j], y.data[Block(i,j)])
+        end
+    elseif size(x) == (1, 1)
+        f.(x[1,1], y.data)
+    elseif size(x) == (1, m)
+        for j = 1:m
+            ret[:, Block(j)] = f.(x[1,j], y.data[:, Block(j)])
+        end
+    elseif size(x) == (n, 1)
+        for i = 1:n
+            ret[Block(i), :] = f.(x[i, 1], y.data[Block(i), :])
+        end
     else
-        z = copy(x)
+        a = size(y)
+        b = size(x)
+        throw(DimensionMismatch("Invalid dimensions for broadcasting. Got $a, $b."))
     end
-    n, m = size(z)
-    blocked = mortar([fill(z[i,j], size(y.data[Block(1,1)])) for i = 1:n, j = 1:m])
-    BlockMatrixTensor(f.(blocked, y.data))
+    BlockMatrixTensor(ret)
 end
 
+
+
 function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
     if dims == 1:2
         n, m = size(x, 3), size(x, 4)
diff --git a/test/runtests.jl b/test/runtests.jl
index fea6117..8671332 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,4 +1,5 @@
-using DualArrays, Test, LinearAlgebra, BandedMatrices, ForwardDiff, ComponentArrays
+using DualArrays, Test, LinearAlgebra, BandedMatrices, ForwardDiff,
+BlockArrays
 using DualArrays: Dual
 using Lux: relu
 
@@ -99,6 +100,25 @@ using Lux: relu
     end
 
     @testset "BlockMatrixTensor" begin
-        
+        @testset "constructors" begin
+            x = BlockMatrixTensor(BlockArray(ones(4, 4), [2, 2], [2, 2]))
+            y = BlockMatrixTensor(ones(2, 2, 2, 2))
+            @test x isa BlockMatrixTensor
+            @test y isa BlockMatrixTensor
+            @test x == y
+        end
+        @testset "broadcasting" begin
+            y = BlockMatrixTensor(ones(2, 2, 2, 2))
+            z = BlockArray(
+                [fill(1,2,2) fill(2,2,2); fill(3,2,2) fill(4,2,2)],
+                [2, 2],
+                [2, 2]
+            )
+            @test ([1 2;3 4] .* y).data == z
+            @test reshape([3], 1, 1) .* y == 3 * y
+            @test ([1 2] .* y).data == BlockArray([fill(1,4,2) fill(2,4,2)], [2, 2], [2, 2])
+            @test ([1 2]' .* y).data == BlockArray([fill(1,2,4); fill(2,2,4)], [2, 2], [2, 2])
+            @test_throws DimensionMismatch [1 2;3 4;5 6] .* y
+        end
     end
 end
\ No newline at end of file

From 92167601f8967cb5acb31ff3ecc6c9477f5787d0 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Thu, 26 Sep 2024 14:17:07 +0100
Subject: [PATCH 04/10] slightly better broadcasting

---
 src/BlockMatrixTensor.jl | 10 +++++-----
 src/DualArray.jl         |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index 17fdcbf..d92dee4 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -3,7 +3,7 @@
 using BlockArrays, SparseArrays
 
 struct BlockMatrixTensor{T} <: AbstractArray{T, 4}
-    data::AbstractBlockMatrix{T}
+    data::BlockMatrix{T}
 end
 
 function BlockMatrixTensor(x::AbstractArray{T, 4}) where {T}
@@ -33,7 +33,7 @@ for op in (:*, :/)
 end
 
 function broadcasted(f::Function, x::BlockMatrixTensor{T}, y::AbstractMatrix) where {T}
-    ret = BlockArray{T}(undef, blocksizes(x.data)...)
+    ret = copy(x.data)
     n, m = blocksize(x.data)
     if blocksize(x.data) == size(y)
         for i = 1:n, j = 1:m
@@ -58,7 +58,7 @@ function broadcasted(f::Function, x::BlockMatrixTensor{T}, y::AbstractMatrix) wh
 end
 
 function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor{T}) where {T}
-    ret = BlockArray{T}(undef, blocksizes(y.data)...)
+    ret = copy(y.data)
     n, m = blocksize(y.data)
     if blocksize(y.data) == size(x)
         for i = 1:n, j = 1:m
@@ -87,11 +87,11 @@ end
 function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
     if dims == 1:2
         n, m = size(x, 3), size(x, 4)
-        ret = zeros(T, n, m)
+        ret = similar(x.data[Block(1, 1)]) * 0
         for i = 1:size(x, 1), j = 1:size(x, 2)
             ret += x.data[Block(i, j)]
         end
-        sparse(ret)
+        ret
     elseif dims == 2
         n, m, s, t = size(x)
         ret = BlockArray(zeros(T, n*s, t), fill(s, n), fill(t, 1))
diff --git a/src/DualArray.jl b/src/DualArray.jl
index 53f0282..9396b8a 100644
--- a/src/DualArray.jl
+++ b/src/DualArray.jl
@@ -92,7 +92,7 @@ function DualMatrix(x::AbstractMatrix)
     val = [y.value for y in x]
     n, m = size(x)
     jac = mortar([x[i,j].partials for i = 1:n, j = 1:m])
-    DualMatrix(sparse(val), BlockMatrixTensor(jac))
+    DualMatrix(val, BlockMatrixTensor(jac))
 end
 
 function getindex(x::DualVector, y::Int)

From d3d8727e773f2bf5137c3c0ad7d95016f316eb95 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Thu, 26 Sep 2024 22:08:37 +0100
Subject: [PATCH 05/10] remake with views

---
 examples/convlayer.jl    |  1 +
 src/BlockMatrixTensor.jl | 28 +++++++++++++---------------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/examples/convlayer.jl b/examples/convlayer.jl
index 538fd57..2ca3636 100644
--- a/examples/convlayer.jl
+++ b/examples/convlayer.jl
@@ -37,6 +37,7 @@ function model_loss(x, y, w)
     ker = reshape(w[1:9], 3, 3)
     weights = reshape(w[10:6769], 10, 676)
     biases = w[6770:6779]
+    println("Reshape Complete")
     l1 = vec(DualMatrix(convlayer(x, ker)))
     println("Conv layer complete")
     l2 = dense_layer(weights, biases, l1, softmax)
diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index d92dee4..773ba55 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -10,14 +10,14 @@ function BlockMatrixTensor(x::AbstractArray{T, 4}) where {T}
     n, m, s, t = size(x)
     data = BlockArray(zeros(T, n * s, m * t), fill(s, n), fill(t, m))
     for i = 1:n, j = 1:m
-        data[Block(i,j)] = x[i,j,:,:]
+        view(data, Block(i,j)) .= x[i,j,:,:] 
     end
     BlockMatrixTensor(data)
 end
 
-size(x::BlockMatrixTensor) = (blocksize(x.data)..., size(x.data[Block(1), Block(1)])...)
+size(x::BlockMatrixTensor) = (blocksize(x.data)..., blocksizes(x.data, 1)...)
 
-getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d) = sparse_getindex(x.data[Block(a), Block(b)], c, d)
+getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d) = sparse_getindex(x.data[Block(a, b)], c, d)
 getindex(x::BlockMatrixTensor, a::UnitRange, b::UnitRange, ::Colon, ::Colon) = BlockMatrixTensor(x.data[Block.(a),Block.(b)])
 
 function show(io::IO,m::MIME"text/plain", x::BlockMatrixTensor)
@@ -37,17 +37,17 @@ function broadcasted(f::Function, x::BlockMatrixTensor{T}, y::AbstractMatrix) wh
     n, m = blocksize(x.data)
     if blocksize(x.data) == size(y)
         for i = 1:n, j = 1:m
-            ret[Block(i,j)] = f.(y[i,j], x.data[Block(i,j)])
+            view(ret, Block(i,j)) .= f.(y[i,j], x.data[Block(i,j)])
         end
     elseif size(y) == (1, 1)
-        f.(y[1,1], x.data)
+        f.(y[1,1], ret)
     elseif size(y) == (1, m)
         for j = 1:m
-            ret[:, Block(j)] = f.(y[1,j], x.data[:, Block(j)])
+            view(ret, :, Block(j)) .= f.(y[1,j], x.data[:, Block(j)])
         end
     elseif size(y) == (n, 1)
         for i = 1:n
-            ret[Block(i), :] = f.(y[i, 1], x.data[Block(i), :])
+            view(ret, Block(i), :) .= f.(y[i, 1], x.data[Block(i), :])
         end
     else
         a = size(x)
@@ -62,17 +62,17 @@ function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor{T}) wh
     n, m = blocksize(y.data)
     if blocksize(y.data) == size(x)
         for i = 1:n, j = 1:m
-            ret[Block(i,j)] = f.(x[i,j], y.data[Block(i,j)])
+            view(ret, Block(i,j)) .= f.(x[i,j], y.data[Block(i,j)])
         end
     elseif size(x) == (1, 1)
-        f.(x[1,1], y.data)
+        f.(x[1,1], ret)
     elseif size(x) == (1, m)
         for j = 1:m
-            ret[:, Block(j)] = f.(x[1,j], y.data[:, Block(j)])
+            view(ret, :, Block(j)) .= f.(x[1,j], y.data[:, Block(j)])
         end
     elseif size(x) == (n, 1)
         for i = 1:n
-            ret[Block(i), :] = f.(x[i, 1], y.data[Block(i), :])
+            view(ret, Block(i), :) .= f.(x[i, 1], y.data[Block(i), :])
         end
     else
         a = size(y)
@@ -82,8 +82,6 @@ function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor{T}) wh
     BlockMatrixTensor(ret)
 end
 
-
-
 function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
     if dims == 1:2
         n, m = size(x, 3), size(x, 4)
@@ -94,9 +92,9 @@ function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
         ret
     elseif dims == 2
         n, m, s, t = size(x)
-        ret = BlockArray(zeros(T, n*s, t), fill(s, n), fill(t, 1))
+        ret = x.data[Block.(1:n), Block(1)] * 0
         for i = 1:n, j = 1:m
-            ret[Block(i,1)] += x.data[Block(i,j)]
+            view(ret, Block(i,1)) .+= x.data[Block(i,j)]
         end
         BlockMatrixTensor(ret)
     end

From 70c77a26dba90b31600307c83381d2dabe8d8683 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Fri, 27 Sep 2024 20:08:34 +0100
Subject: [PATCH 06/10] temporarily fix sparsity issue with sum()

---
 src/BlockMatrixTensor.jl | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index 773ba55..963584e 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -1,4 +1,4 @@
-#4-Tensor with data stored in a BlockArray
+#4-Tensor with data stored in a BlockMatrix
 #Enables sparsity for DualMatrix jacobian.
 using BlockArrays, SparseArrays
 
@@ -10,21 +10,21 @@ function BlockMatrixTensor(x::AbstractArray{T, 4}) where {T}
     n, m, s, t = size(x)
     data = BlockArray(zeros(T, n * s, m * t), fill(s, n), fill(t, m))
     for i = 1:n, j = 1:m
-        view(data, Block(i,j)) .= x[i,j,:,:] 
+        view(data, Block(i, j)) .= x[i, j, :, :] 
     end
     BlockMatrixTensor(data)
 end
 
-size(x::BlockMatrixTensor) = (blocksize(x.data)..., blocksizes(x.data, 1)...)
+size(x::BlockMatrixTensor) = (blocksize(x.data)..., blocksizes(x.data)[1,1]...)
 
 getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d) = sparse_getindex(x.data[Block(a, b)], c, d)
-getindex(x::BlockMatrixTensor, a::UnitRange, b::UnitRange, ::Colon, ::Colon) = BlockMatrixTensor(x.data[Block.(a),Block.(b)])
+getindex(x::BlockMatrixTensor, a::UnitRange, b::UnitRange, ::Colon, ::Colon) = BlockMatrixTensor(x.data[Block.(a), Block.(b)])
 
 function show(io::IO,m::MIME"text/plain", x::BlockMatrixTensor)
     print("BlockMatrixTensor containing: \n")
-    show(io,m,x.data)
+    show(io,m, x.data)
 end
-show(io::IO,x::BlockMatrixTensor) = show(io, x.data)
+show(io::IO, x::BlockMatrixTensor) = show(io, x.data)
 
 # Blockwise broadcast
 for op in (:*, :/)
@@ -37,13 +37,13 @@ function broadcasted(f::Function, x::BlockMatrixTensor{T}, y::AbstractMatrix) wh
     n, m = blocksize(x.data)
     if blocksize(x.data) == size(y)
         for i = 1:n, j = 1:m
-            view(ret, Block(i,j)) .= f.(y[i,j], x.data[Block(i,j)])
+            view(ret, Block(i, j)) .= f.(y[i, j], x.data[Block(i, j)])
         end
     elseif size(y) == (1, 1)
-        f.(y[1,1], ret)
+        f.(y[1, 1], ret)
     elseif size(y) == (1, m)
         for j = 1:m
-            view(ret, :, Block(j)) .= f.(y[1,j], x.data[:, Block(j)])
+            view(ret, :, Block(j)) .= f.(y[1, j], x.data[:, Block(j)])
         end
     elseif size(y) == (n, 1)
         for i = 1:n
@@ -62,13 +62,13 @@ function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor{T}) wh
     n, m = blocksize(y.data)
     if blocksize(y.data) == size(x)
         for i = 1:n, j = 1:m
-            view(ret, Block(i,j)) .= f.(x[i,j], y.data[Block(i,j)])
+            view(ret, Block(i, j)) .= f.(x[i, j], y.data[Block(i, j)])
         end
     elseif size(x) == (1, 1)
-        f.(x[1,1], ret)
+        f.(x[1, 1], ret)
     elseif size(x) == (1, m)
         for j = 1:m
-            view(ret, :, Block(j)) .= f.(x[1,j], y.data[:, Block(j)])
+            view(ret, :, Block(j)) .= f.(x[1, j], y.data[:, Block(j)])
         end
     elseif size(x) == (n, 1)
         for i = 1:n
@@ -92,9 +92,10 @@ function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
         ret
     elseif dims == 2
         n, m, s, t = size(x)
-        ret = x.data[Block.(1:n), Block(1)] * 0
+        # Indexing [:, Block(j)] does not preserve density
+        ret = BlockArray(spzeros(n * s, t), fill(s, n), [t])
         for i = 1:n, j = 1:m
-            view(ret, Block(i,1)) .+= x.data[Block(i,j)]
+            view(ret, Block(i, 1)) .+= x.data[Block(i, j)]
         end
         BlockMatrixTensor(ret)
     end

From 83cadf0a8f2a66c141f6c6595f062256b14f2af8 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Sat, 28 Sep 2024 00:49:34 +0100
Subject: [PATCH 07/10] rework with BlockedMatrix

---
 examples/convlayer.jl    |  2 +-
 src/BlockMatrixTensor.jl | 54 ++++++++++++++++++++++++++++++----------
 src/DualArray.jl         | 17 ++++++-------
 3 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/examples/convlayer.jl b/examples/convlayer.jl
index 2ca3636..90ec72c 100644
--- a/examples/convlayer.jl
+++ b/examples/convlayer.jl
@@ -41,7 +41,7 @@ function model_loss(x, y, w)
     l1 = vec(DualMatrix(convlayer(x, ker)))
     println("Conv layer complete")
     l2 = dense_layer(weights, biases, l1, softmax)
-    println("Dense Layer complete")
+    println(l2.jacobian)
     target = OneElement(1, y+1, 10)
     loss = cross_entropy(l2, target)
     println("Loss complete")
diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index 963584e..0e069ae 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -3,22 +3,36 @@
 using BlockArrays, SparseArrays
 
 struct BlockMatrixTensor{T} <: AbstractArray{T, 4}
-    data::BlockMatrix{T}
+    data::BlockedMatrix{T}
 end
 
+#Construct BlockMatrixTensor from 4-tensor. Useful for testing purposes.
 function BlockMatrixTensor(x::AbstractArray{T, 4}) where {T}
     n, m, s, t = size(x)
-    data = BlockArray(zeros(T, n * s, m * t), fill(s, n), fill(t, m))
+    data = BlockedMatrix(zeros(T, n * s, m * t), fill(s, n), fill(t, m))
     for i = 1:n, j = 1:m
         view(data, Block(i, j)) .= x[i, j, :, :] 
     end
     BlockMatrixTensor(data)
 end
 
+#Construct from matrix of blocks (similar to mortar())
+function BlockMatrixTensor(x::Matrix{T}) where {T <: AbstractMatrix}
+    n, m = size(x)
+    s, t = size(x[1, 1])
+    mat = hcat((vcat(x[i,:]...) for i = 1:n)...)
+    blockmat = BlockedMatrix(mat, fill(s, n), fill(t, m))
+    BlockMatrixTensor(blockmat)
+end
+
 size(x::BlockMatrixTensor) = (blocksize(x.data)..., blocksizes(x.data)[1,1]...)
 
-getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d) = sparse_getindex(x.data[Block(a, b)], c, d)
-getindex(x::BlockMatrixTensor, a::UnitRange, b::UnitRange, ::Colon, ::Colon) = BlockMatrixTensor(x.data[Block.(a), Block.(b)])
+function getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d)
+    s, t = size(x, 3), size(x, 4)
+    mat = parent(x.data)
+    sub = mat[((a - 1)* s + 1):(a * s),((b - 1)* t + 1):(b * t)]
+    sub[c, d]
+end
 
 function show(io::IO,m::MIME"text/plain", x::BlockMatrixTensor)
     print("BlockMatrixTensor containing: \n")
@@ -82,18 +96,18 @@ function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor{T}) wh
     BlockMatrixTensor(ret)
 end
 
-function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
+function sum(x::BlockMatrixTensor; dims = Colon())
     if dims == 1:2
         n, m = size(x, 3), size(x, 4)
-        ret = similar(x.data[Block(1, 1)]) * 0
+        ret = similarzeros(x[1,1,:,:], n, m)
         for i = 1:size(x, 1), j = 1:size(x, 2)
-            ret += x.data[Block(i, j)]
+            ret += x[i,j,:,:]
         end
         ret
     elseif dims == 2
         n, m, s, t = size(x)
         # Indexing [:, Block(j)] does not preserve density
-        ret = BlockArray(spzeros(n * s, t), fill(s, n), [t])
+        ret = BlockedArray(similarzeros(x[1,1,:,:], n * s, t), fill(s, n), [t])
         for i = 1:n, j = 1:m
             view(ret, Block(i, 1)) .+= x.data[Block(i, j)]
         end
@@ -101,11 +115,25 @@ function sum(x::BlockMatrixTensor{T}; dims = Colon()) where {T}
     end
 end
 
-function reshape(x::BlockMatrixTensor, dims::Vararg{Int64})
-    #Blockwise reshape
-    if dims[3] == size(x, 3) && dims[4] == size(x, 4)
-        mortar(reshape(blocks(x.data), dims[1], dims[2]))
+#Helper function for initialising a BlockedMatrix to be populated while preserving sparsity.
+similarzeros(x::Matrix, dims...) = zeros(eltype(x), dims...)
+similarzeros(x::AbstractMatrix, dims...) = spzeros(eltype(x), dims...)
+
+function reshape(x::BlockMatrixTensor, dims::Vararg{Union{Colon, Int}, 4})
+    n, m, s, t = size(x)
+    #Block-wise reshape
+    if dims[3] isa Colon && dims[4] isa Colon
+        ret = BlockedMatrix(similarzeros(x[1, 1, :, :], s * dims[1], t * dims[2]), fill(s, dims[1]), fill(t, dims[2]))
+        for j = 1:m, i = 1:n
+            c = (j - 1) * n + (i - 1)
+            a, b = c % dims[1] + 1, div(c, dims[1]) + 1
+            ret[Block(a,b)] = x.data[Block(i,j)]
+        end
+        BlockMatrixTensor(ret)
     end
 end
 
-vec(x::BlockMatrixTensor) = vcat(blocks(x.data)...)
+#'Flatten': converts 4-Tensor to matrix. For BlockMatrixTensors calls parent().
+# Mimics the reshape achieved by this for general 4-Tensors
+flatten(x::BlockMatrixTensor) = parent(x.data)
+flatten(x::AbstractArray{T, 4}) where {T} = reshape(x, size(x, 1) * size(x, 3), size(x, 2) * size(x, 4))
diff --git a/src/DualArray.jl b/src/DualArray.jl
index 9396b8a..ccec2c3 100644
--- a/src/DualArray.jl
+++ b/src/DualArray.jl
@@ -90,9 +90,8 @@ DualMatrix(x::AbstractMatrix, j::AbstractArray{T,4}) where {T} = DualArray(x, j)
 
 function DualMatrix(x::AbstractMatrix)
     val = [y.value for y in x]
-    n, m = size(x)
-    jac = mortar([x[i,j].partials for i = 1:n, j = 1:m])
-    DualMatrix(val, BlockMatrixTensor(jac))
+    jac_blocks = [y.partials for y in x]
+    DualMatrix(val, BlockMatrixTensor(jac_blocks))
 end
 
 function getindex(x::DualVector, y::Int)
@@ -149,7 +148,7 @@ end
 # perturbations are reshapes of each other.
 function *(x::DualMatrix, y::DualVector)
     val = x.value * y.value
-    jac = x.value * y.jacobian + vcat(blocks(sum(x.jacobian .* y.value'; dims = 2).data)...)
+    jac = x.value * y.jacobian + flatten(sum(x.jacobian .* y.value'; dims = 2))
     DualVector(val, jac)
 end
 
@@ -256,14 +255,12 @@ end
 
 # reshape does not preserve shape of perturbations (since jacobian can have
 # any number of columns)
-
-#Should a PR be made into reshape() 
 _tomatrix(v::AbstractVector) = reshape(v, :, 1)
 
-function reshape(x::DualVector,dims::Vararg{Int})
+function reshape(x::DualVector,dims::Vararg{Int, 2})
     val = reshape(x.value, dims...)
-    jac_blockmat = [sparse_transpose(sparse_getindex(x.jacobian, i, :)) for i = 1:size(x.jacobian, 1)]
-    jac = BlockMatrixTensor(mortar(reshape(jac_blockmat, dims...)))
+    blocked_jac = BlockedMatrix(x.jacobian, fill(1, length(x)), [size(x.jacobian, 2)])
+    jac = reshape(BlockMatrixTensor(blocked_jac), dims..., :, :)
     DualMatrix(val, jac)
 end
 
@@ -272,7 +269,7 @@ _blockvec(x::BlockMatrixTensor) = vcat(blocks(x.data)...)
 
 function vec(x::DualArray)
     val = vec(x.value)
-    jac = _blockvec(x.jacobian)
+    jac = parent(reshape(x.jacobian, length(val), 1, :, :).data)
     DualVector(val, jac)
 end
 

From 9e707dd8809b913c7edf4d462dd909c100a65541 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Mon, 30 Sep 2024 11:05:34 +0100
Subject: [PATCH 08/10] unit tests, and back to BlockMatrix

---
 examples/convlayer.jl    |   4 +-
 src/BlockMatrixTensor.jl | 124 ++++++++++-----------------------------
 src/DualArray.jl         |   4 +-
 src/DualArrays.jl        |   2 +-
 test/runtests.jl         |  27 +++++++--
 5 files changed, 56 insertions(+), 105 deletions(-)

diff --git a/examples/convlayer.jl b/examples/convlayer.jl
index 90ec72c..390cc75 100644
--- a/examples/convlayer.jl
+++ b/examples/convlayer.jl
@@ -41,15 +41,13 @@ function model_loss(x, y, w)
     l1 = vec(DualMatrix(convlayer(x, ker)))
     println("Conv layer complete")
     l2 = dense_layer(weights, biases, l1, softmax)
-    println(l2.jacobian)
+    println("Dense Layer Complete")
     target = OneElement(1, y+1, 10)
     loss = cross_entropy(l2, target)
     println("Loss complete")
     loss.value, loss.partials
 end
 
-#TODO: Implement remaining layers upon more support for DualMatrix
-
 function train_model()
     p = rand(6779)
     epochs = 1000
diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index 0e069ae..27aad6a 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -3,35 +3,34 @@
 using BlockArrays, SparseArrays
 
 struct BlockMatrixTensor{T} <: AbstractArray{T, 4}
-    data::BlockedMatrix{T}
+    data::BlockMatrix{T}
 end
 
 #Construct BlockMatrixTensor from 4-tensor. Useful for testing purposes.
 function BlockMatrixTensor(x::AbstractArray{T, 4}) where {T}
     n, m, s, t = size(x)
-    data = BlockedMatrix(zeros(T, n * s, m * t), fill(s, n), fill(t, m))
+    data = BlockMatrix(zeros(T, n * s, m * t), fill(s, n), fill(t, m))
     for i = 1:n, j = 1:m
         view(data, Block(i, j)) .= x[i, j, :, :] 
     end
     BlockMatrixTensor(data)
 end
 
-#Construct from matrix of blocks (similar to mortar())
-function BlockMatrixTensor(x::Matrix{T}) where {T <: AbstractMatrix}
-    n, m = size(x)
-    s, t = size(x[1, 1])
-    mat = hcat((vcat(x[i,:]...) for i = 1:n)...)
-    blockmat = BlockedMatrix(mat, fill(s, n), fill(t, m))
-    BlockMatrixTensor(blockmat)
-end
+# Useful for creating a BlockMatrixTensor from blocks()
+BlockMatrixTensor(x::Matrix{T}) where {T <: AbstractMatrix} = BlockMatrixTensor(mortar(x))
 
 size(x::BlockMatrixTensor) = (blocksize(x.data)..., blocksizes(x.data)[1,1]...)
 
-function getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d)
-    s, t = size(x, 3), size(x, 4)
-    mat = parent(x.data)
-    sub = mat[((a - 1)* s + 1):(a * s),((b - 1)* t + 1):(b * t)]
-    sub[c, d]
+#Indexing entire blocks
+getindex(x::BlockMatrixTensor, a::Int, b::Int, ::Colon, ::Colon) = blocks(x.data)[a, b]
+getindex(x::BlockMatrixTensor, a::Int, b, ::Colon, ::Colon) = BlockMatrixTensor(reshape(blocks(x.data)[a, b], 1, :))
+getindex(x::BlockMatrixTensor, a, b::Int, ::Colon, ::Colon) = BlockMatrixTensor(reshape(blocks(x.data)[a, b], :, 1))
+getindex(x::BlockMatrixTensor, a, b, ::Colon, ::Colon) = BlockMatrixTensor(blocks(x.data)[a,b])
+
+
+# For populating a BlockMatrixTensor
+function setindex!(A::BlockMatrixTensor, value, a::Int, b::Int, ::Colon, ::Colon)
+    blocks(A.data)[a, b] = value
 end
 
 function show(io::IO,m::MIME"text/plain", x::BlockMatrixTensor)
@@ -40,100 +39,37 @@ function show(io::IO,m::MIME"text/plain", x::BlockMatrixTensor)
 end
 show(io::IO, x::BlockMatrixTensor) = show(io, x.data)
 
-# Blockwise broadcast
 for op in (:*, :/)
     @eval $op(x::BlockMatrixTensor, y::Number) = BlockMatrixTensor($op(x.data, y))
     @eval $op(x::Number, y::BlockMatrixTensor) = BlockMatrixTensor($op(x, y.data))
 end
 
-function broadcasted(f::Function, x::BlockMatrixTensor{T}, y::AbstractMatrix) where {T}
-    ret = copy(x.data)
-    n, m = blocksize(x.data)
-    if blocksize(x.data) == size(y)
-        for i = 1:n, j = 1:m
-            view(ret, Block(i, j)) .= f.(y[i, j], x.data[Block(i, j)])
-        end
-    elseif size(y) == (1, 1)
-        f.(y[1, 1], ret)
-    elseif size(y) == (1, m)
-        for j = 1:m
-            view(ret, :, Block(j)) .= f.(y[1, j], x.data[:, Block(j)])
-        end
-    elseif size(y) == (n, 1)
-        for i = 1:n
-            view(ret, Block(i), :) .= f.(y[i, 1], x.data[Block(i), :])
-        end
-    else
-        a = size(x)
-        b = size(y)
-        throw(DimensionMismatch("Invalid dimensions for broadcasting. Got $a, $b."))
-    end
-    BlockMatrixTensor(ret)
-end
-
-function broadcasted(f::Function, x::AbstractMatrix, y::BlockMatrixTensor{T}) where {T}
-    ret = copy(y.data)
-    n, m = blocksize(y.data)
-    if blocksize(y.data) == size(x)
-        for i = 1:n, j = 1:m
-            view(ret, Block(i, j)) .= f.(x[i, j], y.data[Block(i, j)])
-        end
-    elseif size(x) == (1, 1)
-        f.(x[1, 1], ret)
-    elseif size(x) == (1, m)
-        for j = 1:m
-            view(ret, :, Block(j)) .= f.(x[1, j], y.data[:, Block(j)])
-        end
-    elseif size(x) == (n, 1)
-        for i = 1:n
-            view(ret, Block(i), :) .= f.(x[i, 1], y.data[Block(i), :])
-        end
-    else
-        a = size(y)
-        b = size(x)
-        throw(DimensionMismatch("Invalid dimensions for broadcasting. Got $a, $b."))
-    end
-    BlockMatrixTensor(ret)
-end
+#Block-wise broadcast 
+broadcasted(f::Function, x::BlockMatrixTensor, y::AbstractMatrix) = BlockMatrixTensor(f.(blocks(x.data), y))
+broadcasted(f::Function, x::BlockMatrixTensor, y::AbstractVector) = BlockMatrixTensor(f.(x, reshape(y, :, 1)))
+broadcasted(f::Function, x::AbstractVecOrMat, y::BlockMatrixTensor) = f.(y, x)
 
 function sum(x::BlockMatrixTensor; dims = Colon())
+    # Blockwise sum
     if dims == 1:2
-        n, m = size(x, 3), size(x, 4)
-        ret = similarzeros(x[1,1,:,:], n, m)
-        for i = 1:size(x, 1), j = 1:size(x, 2)
-            ret += x[i,j,:,:]
-        end
-        ret
-    elseif dims == 2
-        n, m, s, t = size(x)
-        # Indexing [:, Block(j)] does not preserve density
-        ret = BlockedArray(similarzeros(x[1,1,:,:], n * s, t), fill(s, n), [t])
-        for i = 1:n, j = 1:m
-            view(ret, Block(i, 1)) .+= x.data[Block(i, j)]
-        end
-        BlockMatrixTensor(ret)
+        sum(blocks(x.data))
+    elseif dims == 1 || dims == 2
+        BlockMatrixTensor(sum(blocks(x.data); dims))
+    else
+        # For now, treat all other cases as if summing the 4-Tensor
+        sum(Array(x); dims = dims)
     end
 end
 
-#Helper function for initialising a BlockedMatrix to be populated while preserving sparsity.
-similarzeros(x::Matrix, dims...) = zeros(eltype(x), dims...)
-similarzeros(x::AbstractMatrix, dims...) = spzeros(eltype(x), dims...)
-
 function reshape(x::BlockMatrixTensor, dims::Vararg{Union{Colon, Int}, 4})
-    n, m, s, t = size(x)
-    #Block-wise reshape
+    #Reshape block-wise
+    #TODO: Implement non-blockwise
     if dims[3] isa Colon && dims[4] isa Colon
-        ret = BlockedMatrix(similarzeros(x[1, 1, :, :], s * dims[1], t * dims[2]), fill(s, dims[1]), fill(t, dims[2]))
-        for j = 1:m, i = 1:n
-            c = (j - 1) * n + (i - 1)
-            a, b = c % dims[1] + 1, div(c, dims[1]) + 1
-            ret[Block(a,b)] = x.data[Block(i,j)]
-        end
-        BlockMatrixTensor(ret)
+        BlockMatrixTensor(reshape(blocks(x.data), dims[1], dims[2]))
     end
 end
 
-#'Flatten': converts 4-Tensor to matrix. For BlockMatrixTensors calls parent().
+#'Flatten': converts BlockMatrixTensor to Matrix by removing block structure
 # Mimics the reshape achieved by this for general 4-Tensors
-flatten(x::BlockMatrixTensor) = parent(x.data)
+flatten(x::BlockMatrixTensor) = hcat((vcat((x[i, j, :, :] for i = 1:size(x, 1))...) for j = 1:size(x, 2))...)
 flatten(x::AbstractArray{T, 4}) where {T} = reshape(x, size(x, 1) * size(x, 3), size(x, 2) * size(x, 4))
diff --git a/src/DualArray.jl b/src/DualArray.jl
index ccec2c3..32faf3e 100644
--- a/src/DualArray.jl
+++ b/src/DualArray.jl
@@ -259,7 +259,7 @@ _tomatrix(v::AbstractVector) = reshape(v, :, 1)
 
 function reshape(x::DualVector,dims::Vararg{Int, 2})
     val = reshape(x.value, dims...)
-    blocked_jac = BlockedMatrix(x.jacobian, fill(1, length(x)), [size(x.jacobian, 2)])
+    blocked_jac = BlockMatrix(x.jacobian, fill(1, length(x)), [size(x.jacobian, 2)])
     jac = reshape(BlockMatrixTensor(blocked_jac), dims..., :, :)
     DualMatrix(val, jac)
 end
@@ -269,7 +269,7 @@ _blockvec(x::BlockMatrixTensor) = vcat(blocks(x.data)...)
 
 function vec(x::DualArray)
     val = vec(x.value)
-    jac = parent(reshape(x.jacobian, length(val), 1, :, :).data)
+    jac = flatten(reshape(x.jacobian, length(val), 1, :, :))
     DualVector(val, jac)
 end
 
diff --git a/src/DualArrays.jl b/src/DualArrays.jl
index 830ba8f..bedc76f 100644
--- a/src/DualArrays.jl
+++ b/src/DualArrays.jl
@@ -5,7 +5,7 @@ export dropzeros
 
 import Base: +, ==, getindex, size, broadcast, axes, broadcasted, show, sum,
              vcat, convert, *, -, ^, /, ndims, hcat, vec, promote_rule, zero,
-             reshape
+             reshape, setindex!
 using LinearAlgebra, ArrayLayouts, BandedMatrices, FillArrays, ComponentArrays, SparseArrays
 import ChainRules: frule, ZeroTangent
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 8671332..2374b14 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -91,7 +91,7 @@ using Lux: relu
         z = x[5:6]
         @test (y*z).value == [2,2]
         @test (y*z).jacobian == [1 0 1 0 1 1; 0 1 0 1 1 1]
-        for t in (([1, 0, 0, 1, 0, 0], [1 0 -1 0 1 0;0 0 0 0 0 0]), ([1, 1, 1, 1, 1, 1], [1 0 -1 0 1 0;0 1 0 -1 0 1]), ([0, 1, 1, 0, -1, -1], zeros(2,6)))
+        for t in (([1, 0, 0, 1, 0, 0], [1 0 -1 0 1 0;0 0 0 0 0 0]), ([1, 1, 1, 1, 1, 1], [1 0 -1 0 1 0;0 1 0 -1 0 1]), ([0, 1, 1, 0, -1, -1], zeros(2, 6)))
             x = DualVector(t[1], I(6))
             y = reshape(x[1:4], 2, 2)
             z = x[5:6]
@@ -105,7 +105,7 @@ using Lux: relu
             y = BlockMatrixTensor(ones(2, 2, 2, 2))
             @test x isa BlockMatrixTensor
             @test y isa BlockMatrixTensor
-            @test x == y
+            @test x.data == y.data
         end
         @testset "broadcasting" begin
             y = BlockMatrixTensor(ones(2, 2, 2, 2))
@@ -115,10 +115,27 @@ using Lux: relu
                 [2, 2]
             )
             @test ([1 2;3 4] .* y).data == z
-            @test reshape([3], 1, 1) .* y == 3 * y
-            @test ([1 2] .* y).data == BlockArray([fill(1,4,2) fill(2,4,2)], [2, 2], [2, 2])
-            @test ([1 2]' .* y).data == BlockArray([fill(1,2,4); fill(2,2,4)], [2, 2], [2, 2])
+            @test (reshape([3], 1, 1) .* y).data == (3 * y).data
+            @test ([1 2] .* y).data == BlockArray([fill(1, 4, 2) fill(2, 4, 2)], [2, 2], [2, 2])
+            @test ([1 2]' .* y).data == BlockArray([fill(1, 2, 4); fill(2, 2, 4)], [2, 2], [2, 2])
+            @test (y .* [1, 2]).data == BlockArray([fill(1, 2, 4); fill(2, 2, 4)], [2, 2], [2, 2])
             @test_throws DimensionMismatch [1 2;3 4;5 6] .* y
         end
+        @testset "sum" begin
+            r = rand(2, 2, 2, 2)
+            b = BlockMatrixTensor(r)
+            #TODO: Fix traversal
+            for d in (1:2, 1, 2)
+                @test sum(r; dims = 1:2)[1, 1, :, :] ≈ sum(b; dims = 1:2)
+            end
+        end
+
+        @testset "reshape" begin
+            r= rand(3, 4, 2, 2)
+            b = BlockMatrixTensor(r)
+            for d in ((2, 6), (1, 12), (6, 2), (12, 1))
+                @test BlockMatrixTensor(reshape(r, d..., 2, 2)).data == reshape(b, d..., :, :).data
+            end
+        end
     end
 end
\ No newline at end of file

From 02c65e63db2eadca4b6746a5772aac693e402ca2 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Mon, 30 Sep 2024 11:38:33 +0100
Subject: [PATCH 09/10] fix getindex to allow accessing 4-tensor by Array()

---
 src/BlockMatrixTensor.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index 27aad6a..73623bd 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -22,7 +22,7 @@ BlockMatrixTensor(x::Matrix{T}) where {T <: AbstractMatrix} = BlockMatrixTensor(
 size(x::BlockMatrixTensor) = (blocksize(x.data)..., blocksizes(x.data)[1,1]...)
 
 #Indexing entire blocks
-getindex(x::BlockMatrixTensor, a::Int, b::Int, ::Colon, ::Colon) = blocks(x.data)[a, b]
+getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d) = blocks(x.data)[a, b][c, d]
 getindex(x::BlockMatrixTensor, a::Int, b, ::Colon, ::Colon) = BlockMatrixTensor(reshape(blocks(x.data)[a, b], 1, :))
 getindex(x::BlockMatrixTensor, a, b::Int, ::Colon, ::Colon) = BlockMatrixTensor(reshape(blocks(x.data)[a, b], :, 1))
 getindex(x::BlockMatrixTensor, a, b, ::Colon, ::Colon) = BlockMatrixTensor(blocks(x.data)[a,b])

From ea54133c81f9e407fe14ea3483181f5a0a56c1a9 Mon Sep 17 00:00:00 2001
From: maximvassiliev <vassiliev.max@gmail.com>
Date: Mon, 30 Sep 2024 13:28:42 +0100
Subject: [PATCH 10/10] fix ambiguity

---
 src/BlockMatrixTensor.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/BlockMatrixTensor.jl b/src/BlockMatrixTensor.jl
index 73623bd..3801337 100644
--- a/src/BlockMatrixTensor.jl
+++ b/src/BlockMatrixTensor.jl
@@ -23,6 +23,7 @@ size(x::BlockMatrixTensor) = (blocksize(x.data)..., blocksizes(x.data)[1,1]...)
 
 #Indexing entire blocks
 getindex(x::BlockMatrixTensor, a::Int, b::Int, c, d) = blocks(x.data)[a, b][c, d]
+getindex(x::BlockMatrixTensor, a::Int, b::Int, ::Colon, ::Colon) = blocks(x.data)[a, b]
 getindex(x::BlockMatrixTensor, a::Int, b, ::Colon, ::Colon) = BlockMatrixTensor(reshape(blocks(x.data)[a, b], 1, :))
 getindex(x::BlockMatrixTensor, a, b::Int, ::Colon, ::Colon) = BlockMatrixTensor(reshape(blocks(x.data)[a, b], :, 1))
 getindex(x::BlockMatrixTensor, a, b, ::Colon, ::Colon) = BlockMatrixTensor(blocks(x.data)[a,b])