From 0e1072d9e3c9a7f43b3eb02bf6a4b6ea60491e64 Mon Sep 17 00:00:00 2001
From: Nader-Rahhal <rahhalnader04@gmail.com>
Date: Tue, 2 Jun 2026 17:49:10 -0400
Subject: [PATCH 1/6] start svd

---
 lib/cunumeric_jl_wrapper/src/types.cpp |   1 +
 src/ndarray/linalg.jl                  | 114 +++++++++++++++++--------
 2 files changed, 81 insertions(+), 34 deletions(-)

diff --git a/lib/cunumeric_jl_wrapper/src/types.cpp b/lib/cunumeric_jl_wrapper/src/types.cpp
index f181dc2e..998a6d75 100644
--- a/lib/cunumeric_jl_wrapper/src/types.cpp
+++ b/lib/cunumeric_jl_wrapper/src/types.cpp
@@ -166,4 +166,5 @@ void wrap_binary_ops(jlcxx::Module& mod) {
 void wrap_linalg_ops(jlcxx::Module& mod) {
   mod.set_const("SOLVE", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_SOLVE});
   mod.set_const("MP_SOLVE", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_MP_SOLVE});
+  mod.set_const("SVD", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_SVD});
 }
\ No newline at end of file
diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl
index 1c01b44a..9fa31bb1 100644
--- a/src/ndarray/linalg.jl
+++ b/src/ndarray/linalg.jl
@@ -52,51 +52,97 @@ function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N}
     Legate.submit_manual_task(rt, task)
 end
 
-# solve runs in floating point:
-# int/bool inputs promote to Float64 (matching cupynumeric)
-const _SOLVE_PROMOTABLE = Union{SUPPORTED_INT_TYPES,Bool}
-const _SOLVE_ACCEPTED = Union{SUPPORTED_SOLVE_TYPES,_SOLVE_PROMOTABLE}
-_solve_eltype(::Type{T}) where {T<:_SOLVE_PROMOTABLE} = Float64
-_solve_eltype(::Type{T}) where {T<:SUPPORTED_SOLVE_TYPES} = T
-
-# Type/dim guards dispatch on one argument at a time, then forward to `_solve`.
-function solve(a::NDArray{<:_SOLVE_ACCEPTED}, b::NDArray{<:_SOLVE_ACCEPTED})
-    A, B = eltype(a), eltype(b)
-    O = promote_type(_solve_eltype(A), _solve_eltype(B))
-    # int/bool -> float is an implicit promotion, disallowed unless `allowpromotion`
-    A <: _SOLVE_PROMOTABLE && assertpromotion(solve, A, O)
-    B <: _SOLVE_PROMOTABLE && assertpromotion(solve, B, O)
-    return _solve_check_a_dims(unchecked_promote_arr(a, O), unchecked_promote_arr(b, O))
+function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
+    st_handle = cuNumeric.get_store(arr)
+    return Legate.LogicalArray{T,N}(st_handle, size(arr))
 end
 
-function solve(a::NDArray, b::NDArray)
-    bad = eltype(a) <: _SOLVE_ACCEPTED ? eltype(b) : eltype(a)
-    throw(ArgumentError("array type $bad is unsupported in solve"))
-end
+function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N}
+    rt = Legate.get_runtime();
+    lib = cuNumeric.get_lib();
 
-# `a` must be at least 2D, `b` at least 1D.
-function _solve_check_a_dims(a::NDArray{<:Any,0}, b::NDArray)
-    throw(ArgumentError("0-dimensional array given. Array must be at least two-dimensional"))
+    task = Legate.create_auto_task(rt, lib, cuNumeric.SVD);
+
+    l_a = nda_to_logical_array(a)
+    l_u = nda_to_logical_array(u)
+    l_s = nda_to_logical_array(s)
+    l_vh = nda_to_logical_array(vh)
+
+    Legate.add_input(task, l_a)
+    Legate.add_output(task, l_u)
+    Legate.add_output(task, l_s)
+    Legate.add_output(task, l_vh)
+
+    Legate.add_broadcast(task, l_a)
+    Legate.add_broadcast(task, l_u)
+    Legate.add_broadcast(task, l_s)
+    Legate.add_broadcast(task, l_vh)
+
+    Legate.submit_auto_task(rt, task)
 end
-function _solve_check_a_dims(a::NDArray{<:Any,1}, b::NDArray)
+
+# Dimension guards
+function solve(a::NDArray{T,1}, b::NDArray{S,M}) where {T,S,M}
     throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional"))
 end
-_solve_check_a_dims(a::NDArray, b::NDArray) = _solve_check_b_dims(a, b)
 
-function _solve_check_b_dims(a::NDArray, b::NDArray{<:Any,0})
+function solve(a::NDArray{T,0}, b::NDArray{S,M}) where {T,S,M}
+    throw(ArgumentError("0-dimensional array given. Array must be at least two-dimensional"))
+end
+
+function solve(a::NDArray{T,N}, b::NDArray{S,0}) where {T,N,S}
     throw(ArgumentError("0-dimensional array given. Array must be at least one-dimensional"))
 end
-_solve_check_b_dims(a::NDArray, b::NDArray) = _solve(a, b)
 
-# 2D case: (m,m),(m)->(m).
-# Backend needs rhs "b" to be 2D. We reshape b from (n,) to (n,1)
-function _solve(a::NDArray{T,2}, b::NDArray{S,1}) where {T,S}
-    m = size(b)[1]
-    return reshape(_solve(a, reshape(b, (m, 1))), (m,))
+# Float16 guards
+
+# @static if HAS_CUDA
+#     function solve(a::NDArray{Float16,N}, b::NDArray{S,M}) where {N,S,M}
+#         throw(ArgumentError("array type float16 is unsupported in linalg"))
+#     end
+
+#     function solve(a::NDArray{T,N}, b::NDArray{Float16,M}) where {T,N,M}
+#         throw(ArgumentError("array type float16 is unsupported in linalg"))
+#     end
+
+# 2D case: (m,m),(m)->( m)
+function solve(a::NDArray{T,2}, b::NDArray{S,1}) where {T,S}
+    size(a)[end - 1] != size(a)[end] &&
+        throw(ArgumentError("Last 2 dimensions of the array must be square"))
+    size(a)[2] != size(b)[1] &&
+        throw(
+            ArgumentError(
+                "Input operand 1 has a mismatch in its dimension 0, " *
+                "with signature (m,m),(m)->(m) (size $(size(b)[1]) " *
+                "is different from $(size(a)[2]))",
+            ),
+        )
+    prod(size(a)) == 0 || prod(size(b)) == 0 && return zeros(T, size(b)...)
+    x = zeros(T, size(b)...)
+    solve_batched(a, b, x)
+    return x
+end
+
+# 2D case: (m,m),(m,n)->(m,n)
+function solve(a::NDArray{T,2}, b::NDArray{S,2}) where {T,S}
+    size(a)[end - 1] != size(a)[end] &&
+        throw(ArgumentError("Last 2 dimensions of the array must be square"))
+    size(a)[2] != size(b)[1] &&
+        throw(
+            ArgumentError(
+                "Input operand 1 has a mismatch in its dimension 0, " *
+                "with signature (m,m),(m,n)->(m,n) (size $(size(b)[1]) " *
+                "is different from $(size(a)[2]))",
+            ),
+        )
+    prod(size(a)) == 0 || prod(size(b)) == 0 && return zeros(T, size(b)...)
+    x = zeros(T, size(b)...)
+    solve_batched(a, b, x)
+    return x
 end
 
-# 2D (m,m),(m,n)->(m,n) and batched (...,m,m),(...,m,n)->(...,m,n)
-function _solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N}
+# Batched case: (...,m,m),(...,m,n)->(...,m,n)
+function solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N}
     size(a)[end - 1] != size(a)[end] &&
         throw(ArgumentError("Last 2 dimensions of the array must be square"))
     size(a)[end] != size(b)[end - 1] &&
@@ -114,6 +160,6 @@ function _solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N}
 end
 
 # Mismatched batch dimensions
-function _solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M}
+function solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M}
     throw(ArgumentError("Batched matrices require signature (...,m,m),(...,m,n)->(...,m,n)"))
 end

From 73e840cbcb6cf1313a8eee9c4d3e63c764d165ff Mon Sep 17 00:00:00 2001
From: Nader-Rahhal <rahhalnader04@gmail.com>
Date: Tue, 2 Jun 2026 18:13:36 -0400
Subject: [PATCH 2/6] move nda_to_logical_array

---
 ext/CUDAExt/cuda.jl           | 5 -----
 src/ndarray/detail/ndarray.jl | 6 ++++++
 src/ndarray/linalg.jl         | 5 -----
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl
index 33421640..c8fd5c01 100644
--- a/ext/CUDAExt/cuda.jl
+++ b/ext/CUDAExt/cuda.jl
@@ -74,11 +74,6 @@ function check_sz(arr, maxshape)
     end
 end
 
-function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
-    st_handle = cuNumeric.get_store(arr)
-    return Legate.LogicalArray{T,N}(st_handle, size(arr))
-end
-
 function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
     outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads)
 
diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl
index 9d630fcc..926a0fe8 100644
--- a/src/ndarray/detail/ndarray.jl
+++ b/src/ndarray/detail/ndarray.jl
@@ -511,3 +511,9 @@ function nda_to_logical_store(arr::NDArray{T,N}) where {T,N}
     st_handle = Legate.data(Legate.LogicalArray{T,N}(la_handle, size(arr)))
     return Legate.LogicalStore{T,N}(st_handle, size(arr))
 end
+
+function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
+    st_handle = cuNumeric.get_store(arr)
+    return Legate.LogicalArray{T,N}(st_handle, size(arr))
+end
+
diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl
index 9fa31bb1..0788f2ba 100644
--- a/src/ndarray/linalg.jl
+++ b/src/ndarray/linalg.jl
@@ -52,11 +52,6 @@ function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N}
     Legate.submit_manual_task(rt, task)
 end
 
-function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
-    st_handle = cuNumeric.get_store(arr)
-    return Legate.LogicalArray{T,N}(st_handle, size(arr))
-end
-
 function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N}
     rt = Legate.get_runtime();
     lib = cuNumeric.get_lib();

From 993a9bd0991b0b7f13d22616b6acfadef34509b5 Mon Sep 17 00:00:00 2001
From: Nader-Rahhal <rahhalnader04@gmail.com>
Date: Tue, 2 Jun 2026 22:17:10 -0400
Subject: [PATCH 3/6] dynamic dispatch for error checking

---
 src/ndarray/linalg.jl | 94 +++++++++++++++++++++++++++++++++----------
 1 file changed, 73 insertions(+), 21 deletions(-)

diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl
index 0788f2ba..11187974 100644
--- a/src/ndarray/linalg.jl
+++ b/src/ndarray/linalg.jl
@@ -52,29 +52,8 @@ function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N}
     Legate.submit_manual_task(rt, task)
 end
 
-function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N}
-    rt = Legate.get_runtime();
-    lib = cuNumeric.get_lib();
-
-    task = Legate.create_auto_task(rt, lib, cuNumeric.SVD);
-
-    l_a = nda_to_logical_array(a)
-    l_u = nda_to_logical_array(u)
-    l_s = nda_to_logical_array(s)
-    l_vh = nda_to_logical_array(vh)
-
-    Legate.add_input(task, l_a)
-    Legate.add_output(task, l_u)
-    Legate.add_output(task, l_s)
-    Legate.add_output(task, l_vh)
 
-    Legate.add_broadcast(task, l_a)
-    Legate.add_broadcast(task, l_u)
-    Legate.add_broadcast(task, l_s)
-    Legate.add_broadcast(task, l_vh)
 
-    Legate.submit_auto_task(rt, task)
-end
 
 # Dimension guards
 function solve(a::NDArray{T,1}, b::NDArray{S,M}) where {T,S,M}
@@ -158,3 +137,76 @@ end
 function solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M}
     throw(ArgumentError("Batched matrices require signature (...,m,m),(...,m,n)->(...,m,n)"))
 end
+
+function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N}
+    rt = Legate.get_runtime();
+    lib = cuNumeric.get_lib();
+
+    task = Legate.create_auto_task(rt, lib, cuNumeric.SVD);
+
+    l_a = nda_to_logical_array(a)
+    l_u = nda_to_logical_array(u)
+    l_s = nda_to_logical_array(s)
+    l_vh = nda_to_logical_array(vh)
+
+    Legate.add_input(task, l_a)
+    Legate.add_output(task, l_u)
+    Legate.add_output(task, l_s)
+    Legate.add_output(task, l_vh)
+
+    Legate.add_broadcast(task, l_a)
+    Legate.add_broadcast(task, l_u)
+    Legate.add_broadcast(task, l_s)
+    Legate.add_broadcast(task, l_vh)
+
+    Legate.submit_auto_task(rt, task)
+end
+
+function _svd(a::NDArray{T,2}, full_matrices::Bool) where {T}
+    m, n = size(a)
+    k = min(m, n)
+    u  = full_matrices ? zeros(T, m, m) : zeros(T, m, k)
+    s  = zeros(T, k)
+    vh = full_matrices ? zeros(T, n, n) : zeros(T, k, n)
+    svd_single(a, u, s, vh)
+    return u, s, vh
+end
+
+
+function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
+    if N < 2
+        throw(LinAlgError("$(N)-dimensional array given. Array must be at least two-dimensional"))
+    end
+    if N > 2
+        throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays"))
+    end
+    if size(a)[1] < size(a)[2]
+        throw(ArgumentError("cuNumeric only supports M >= N"))
+    end
+    if T == Float16
+        throw(ArgumentError("array type float16 is unsupported in linalg"))
+    end
+    return _svd(a, full_matrices)
+end
+
+# Dimension guards
+function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
+    throw(ArgumentError("$(N)-dimensional array given. Array must be at least two-dimensional"))
+end
+
+# Stacked 2D guard
+function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
+    throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays"))
+end
+
+# Float16 guard
+# function svd(a::NDArray{Float16,2}, full_matrices::Bool=true)
+#    throw(ArgumentError("array type float16 is unsupported in linalg"))
+# end
+
+# M < N guard
+function svd(a::NDArray{T,2}, full_matrices::Bool=true) where {T}
+    size(a)[1] < size(a)[2] &&
+        throw(ArgumentError("cuNumeric only supports M >= N"))
+    return _svd(a, full_matrices)
+end

From 88b4945ee5b59046a4517a271f4caad771521244 Mon Sep 17 00:00:00 2001
From: Nader-Rahhal <rahhalnader04@gmail.com>
Date: Wed, 3 Jun 2026 14:48:40 -0400
Subject: [PATCH 4/6] add tests

---
 src/cuNumeric.jl      |   2 +-
 src/ndarray/linalg.jl |  32 ++-------
 test/tests/linalg.jl  | 147 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 152 insertions(+), 29 deletions(-)

diff --git a/src/cuNumeric.jl b/src/cuNumeric.jl
index b4f18637..1fe3c97f 100644
--- a/src/cuNumeric.jl
+++ b/src/cuNumeric.jl
@@ -74,7 +74,7 @@ const SUPPORTED_LINALG_TYPES = Union{
 
 # solve has no integer/Float16 backend kernel — float/complex only.
 const SUPPORTED_SOLVE_TYPES = Union{Float32,Float64,SUPPORTED_COMPLEX_TYPES}
-
+const SUPPORTED_SVD_TYPES = Union{Float32,Float64,SUPPORTED_COMPLEX_TYPES}
 const SUPPORTED_ARRAY_TYPES = Union{Bool,SUPPORTED_NUMERIC_TYPES}
 const SUPPORTED_TYPES = Union{SUPPORTED_ARRAY_TYPES,String}
 
diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl
index 11187974..e13ce0e3 100644
--- a/src/ndarray/linalg.jl
+++ b/src/ndarray/linalg.jl
@@ -172,41 +172,17 @@ function _svd(a::NDArray{T,2}, full_matrices::Bool) where {T}
     return u, s, vh
 end
 
-
-function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
-    if N < 2
-        throw(LinAlgError("$(N)-dimensional array given. Array must be at least two-dimensional"))
-    end
-    if N > 2
-        throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays"))
-    end
+function svd(a::NDArray{T,2}, full_matrices::Bool=true) where {T}
     if size(a)[1] < size(a)[2]
         throw(ArgumentError("cuNumeric only supports M >= N"))
     end
-    if T == Float16
-        throw(ArgumentError("array type float16 is unsupported in linalg"))
-    end
     return _svd(a, full_matrices)
 end
 
-# Dimension guards
-function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
-    throw(ArgumentError("$(N)-dimensional array given. Array must be at least two-dimensional"))
+function svd(a::NDArray{T,1}, full_matrices::Bool=true) where {T}
+    throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional"))
 end
 
-# Stacked 2D guard
 function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
     throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays"))
-end
-
-# Float16 guard
-# function svd(a::NDArray{Float16,2}, full_matrices::Bool=true)
-#    throw(ArgumentError("array type float16 is unsupported in linalg"))
-# end
-
-# M < N guard
-function svd(a::NDArray{T,2}, full_matrices::Bool=true) where {T}
-    size(a)[1] < size(a)[2] &&
-        throw(ArgumentError("cuNumeric only supports M >= N"))
-    return _svd(a, full_matrices)
-end
+end
\ No newline at end of file
diff --git a/test/tests/linalg.jl b/test/tests/linalg.jl
index 8f9c060d..666e56b4 100644
--- a/test/tests/linalg.jl
+++ b/test/tests/linalg.jl
@@ -185,3 +185,150 @@ end
         end
     end
 end
+
+function check_svd_reconstruction(ref_A::AbstractMatrix, u, s, vh, tol_a, tol_r)
+    U  = Array(u)
+    S  = Array(s)
+    Vh = Array(vh)
+    A_rec = U * Diagonal(S) * Vh
+    return isapprox(ref_A, A_rec; atol=tol_a, rtol=tol_r)
+end
+
+function check_svd_orthonormality(u, vh, tol_a, tol_r)
+    U  = Array(u)
+    Vh = Array(vh)
+    k  = size(U, 2)
+    ok_u  = isapprox(U'  * U,  Matrix{eltype(U)}(I, k, k);  atol=tol_a, rtol=tol_r)
+    ok_vh = isapprox(Vh * Vh', Matrix{eltype(Vh)}(I, k, k); atol=tol_a, rtol=tol_r)
+    return ok_u && ok_vh
+end
+
+@testset "svd square matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 5)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda)
+        allowscalar() do
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd tall matrix (m > n)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 6, 4)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda)
+        allowscalar() do
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd thin output shapes (full_matrices=false)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        m, n = 6, 4
+        k     = min(m, n)
+        A_ref = my_rand(T, m, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda, false)
+        allowscalar() do
+            @test size(Array(u))  == (m, k)
+            @test size(Array(s))  == (k,)
+            @test size(Array(vh)) == (k, n)
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd full output shapes (full_matrices=true)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        m, n = 6, 4
+        A_ref = my_rand(T, m, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda, true)
+        allowscalar() do
+            @test size(Array(u))  == (m, m)
+            @test size(Array(s))  == (min(m, n),)
+            @test size(Array(vh)) == (n, n)
+        end
+    end
+end
+
+@testset "svd singular values non-negative and sorted" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 5)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            sv = Array(s)
+            @test all(sv .>= 0)
+            @test issorted(sv; rev=true)
+        end
+    end
+end
+
+@testset "svd identity matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        n     = 4
+        A_ref = Matrix{T}(I, n, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            @test cuNumeric.compare(ones(T, n), s, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd rank-1 matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        # outer product of two vectors: exactly one nonzero singular value
+        # 5×4 satisfies the M >= N constraint
+        v1    = T.(collect(1:5))
+        v2    = T.(collect(1:4))
+        A_ref = v1 * v2'
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            sv = Array(s)
+            @test sv[1] > atol(T)
+            @test all(sv[2:end] .< atol(T) * 10)
+        end
+    end
+end
+
+@testset "svd agrees with LinearAlgebra.svd" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 4)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda, false)
+        ref   = LinearAlgebra.svd(A_ref)
+        allowscalar() do
+            # Singular values must match; signs of singular vectors may differ
+            @test cuNumeric.compare(ref.S, s, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd 1d guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        v = cuNumeric.NDArray(T.(collect(1:4)))
+        @test_throws ArgumentError cuNumeric.svd(v)
+    end
+end
+
+@testset "svd 3d guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A = cuNumeric.NDArray(my_rand(T, 3, 3, 3))
+        @test_throws ArgumentError cuNumeric.svd(A)
+    end
+end
+
+@testset "svd M < N guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A = cuNumeric.NDArray(my_rand(T, 3, 5))
+        @test_throws ArgumentError cuNumeric.svd(A)
+    end
+end
\ No newline at end of file

From b7f19d204f2a63f4b610944ba7c53693c66790cd Mon Sep 17 00:00:00 2001
From: Nader-Rahhal <rahhalnader04@gmail.com>
Date: Wed, 3 Jun 2026 15:07:37 -0400
Subject: [PATCH 5/6] split linalg into public API and implementation detail

Move internal helpers (choose_nd_color_shape, prepare_manual_task_for_batched_matrices, solve_batched, svd_single, _svd) into detail/linalg.jl, leaving only user-facing solve() and svd() overloads in ndarray/linalg.jl.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/cuNumeric.jl             |   1 +
 src/ndarray/detail/linalg.jl |  87 +++++++++++++++++++++
 src/ndarray/linalg.jl        | 104 +------------------------
 test/tests/linalg.jl         | 147 -----------------------------------
 4 files changed, 89 insertions(+), 250 deletions(-)
 create mode 100644 src/ndarray/detail/linalg.jl

diff --git a/src/cuNumeric.jl b/src/cuNumeric.jl
index 1fe3c97f..ecaa1abb 100644
--- a/src/cuNumeric.jl
+++ b/src/cuNumeric.jl
@@ -158,6 +158,7 @@ include("ndarray/broadcast.jl")
 include("ndarray/ndarray.jl")
 include("ndarray/unary.jl")
 include("ndarray/binary.jl")
+include("ndarray/detail/linalg.jl")
 include("ndarray/linalg.jl")
 
 # scoping macro
diff --git a/src/ndarray/detail/linalg.jl b/src/ndarray/detail/linalg.jl
new file mode 100644
index 00000000..64556e6d
--- /dev/null
+++ b/src/ndarray/detail/linalg.jl
@@ -0,0 +1,87 @@
+function choose_nd_color_shape(shape::NTuple{N,Int}) where {N}
+    color_shape = Base.ones(Int, N)
+    if N > 2
+        color_shape[1] = Legate.num_procs()
+        done = false
+        while !done && color_shape[1] % 2 == 0
+            weight_per_dim = [shape[i] / color_shape[i] for i in 1:(N - 2)]
+            max_weight, idx = findmax(weight_per_dim)
+            if weight_per_dim[idx] > 2 * weight_per_dim[1]
+                color_shape[1] ÷= 2
+                color_shape[idx] *= 2
+            else
+                done = true
+            end
+        end
+    end
+    return Tuple(color_shape)
+end
+
+function prepare_manual_task_for_batched_matrices(full_shape::NTuple{N,Int}) where {N}
+    initial_color_shape = choose_nd_color_shape(full_shape)
+    tilesize = Tuple(
+        (full_shape[i] + initial_color_shape[i] - 1) ÷ initial_color_shape[i] for i in 1:N
+    )
+    color_shape = Tuple((full_shape[i] + tilesize[i] - 1) ÷ tilesize[i] for i in 1:N)
+    return tilesize, color_shape
+end
+
+function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N}
+    nrhs = size(b)[end]
+    full_shape = size(a)
+    tilesize_a, color_shape = prepare_manual_task_for_batched_matrices(full_shape)
+    tilesize_b = (tilesize_a[1:(end - 1)]..., nrhs)
+
+    store_a = nda_to_logical_store(a)
+    store_b = nda_to_logical_store(b)
+    store_x = nda_to_logical_store(x)
+
+    tiled_a = Legate.partition_by_tiling(store_a, collect(tilesize_a))
+    tiled_b = Legate.partition_by_tiling(store_b, collect(tilesize_b))
+    tiled_x = Legate.partition_by_tiling(store_x, collect(tilesize_b))
+
+    rt = Legate.get_runtime()
+    domain = Legate.domain_from_shape(Legate.Shape(Legate.to_cxx_vector(color_shape)))
+    lib = cuNumeric.get_lib()
+    task = Legate.create_manual_task(rt, lib, cuNumeric.SOLVE, domain)
+
+    Legate.add_input(task, tiled_a)
+    Legate.add_input(task, tiled_b)
+    Legate.add_output(task, tiled_x)
+
+    Legate.submit_manual_task(rt, task)
+end
+
+function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N}
+    rt = Legate.get_runtime();
+    lib = cuNumeric.get_lib();
+
+    task = Legate.create_auto_task(rt, lib, cuNumeric.SVD);
+
+    l_a = nda_to_logical_array(a)
+    l_u = nda_to_logical_array(u)
+    l_s = nda_to_logical_array(s)
+    l_vh = nda_to_logical_array(vh)
+
+    Legate.add_input(task, l_a)
+    Legate.add_output(task, l_u)
+    Legate.add_output(task, l_s)
+    Legate.add_output(task, l_vh)
+
+    Legate.add_broadcast(task, l_a)
+    Legate.add_broadcast(task, l_u)
+    Legate.add_broadcast(task, l_s)
+    Legate.add_broadcast(task, l_vh)
+
+    Legate.submit_auto_task(rt, task)
+end
+
+function _svd(a::NDArray{T,2}, full_matrices::Bool) where {T}
+    m, n = size(a)
+    k = min(m, n)
+    u  = full_matrices ? zeros(T, m, m) : zeros(T, m, k)
+    s  = zeros(T, k)
+    vh = full_matrices ? zeros(T, n, n) : zeros(T, k, n)
+    svd_single(a, u, s, vh)
+    return u, s, vh
+end
diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl
index e13ce0e3..2943a6e3 100644
--- a/src/ndarray/linalg.jl
+++ b/src/ndarray/linalg.jl
@@ -1,60 +1,3 @@
-function choose_nd_color_shape(shape::NTuple{N,Int}) where {N}
-    color_shape = Base.ones(Int, N)
-    if N > 2
-        color_shape[1] = Legate.num_procs()
-        done = false
-        while !done && color_shape[1] % 2 == 0
-            weight_per_dim = [shape[i] / color_shape[i] for i in 1:(N - 2)]
-            max_weight, idx = findmax(weight_per_dim)
-            if weight_per_dim[idx] > 2 * weight_per_dim[1]
-                color_shape[1] ÷= 2
-                color_shape[idx] *= 2
-            else
-                done = true
-            end
-        end
-    end
-    return Tuple(color_shape)
-end
-
-function prepare_manual_task_for_batched_matrices(full_shape::NTuple{N,Int}) where {N}
-    initial_color_shape = choose_nd_color_shape(full_shape)
-    tilesize = Tuple(
-        (full_shape[i] + initial_color_shape[i] - 1) ÷ initial_color_shape[i] for i in 1:N
-    )
-    color_shape = Tuple((full_shape[i] + tilesize[i] - 1) ÷ tilesize[i] for i in 1:N)
-    return tilesize, color_shape
-end
-
-function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N}
-    nrhs = size(b)[end]
-    full_shape = size(a)
-    tilesize_a, color_shape = prepare_manual_task_for_batched_matrices(full_shape)
-    tilesize_b = (tilesize_a[1:(end - 1)]..., nrhs)
-
-    store_a = nda_to_logical_store(a)
-    store_b = nda_to_logical_store(b)
-    store_x = nda_to_logical_store(x)
-
-    tiled_a = Legate.partition_by_tiling(store_a, collect(tilesize_a))
-    tiled_b = Legate.partition_by_tiling(store_b, collect(tilesize_b))
-    tiled_x = Legate.partition_by_tiling(store_x, collect(tilesize_b))
-
-    rt = Legate.get_runtime()
-    domain = Legate.domain_from_shape(Legate.Shape(Legate.to_cxx_vector(color_shape)))
-    lib = cuNumeric.get_lib()
-    task = Legate.create_manual_task(rt, lib, cuNumeric.SOLVE, domain)
-
-    Legate.add_input(task, tiled_a)
-    Legate.add_input(task, tiled_b)
-    Legate.add_output(task, tiled_x)
-
-    Legate.submit_manual_task(rt, task)
-end
-
-
-
-
 # Dimension guards
 function solve(a::NDArray{T,1}, b::NDArray{S,M}) where {T,S,M}
     throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional"))
@@ -68,17 +11,6 @@ function solve(a::NDArray{T,N}, b::NDArray{S,0}) where {T,N,S}
     throw(ArgumentError("0-dimensional array given. Array must be at least one-dimensional"))
 end
 
-# Float16 guards
-
-# @static if HAS_CUDA
-#     function solve(a::NDArray{Float16,N}, b::NDArray{S,M}) where {N,S,M}
-#         throw(ArgumentError("array type float16 is unsupported in linalg"))
-#     end
-
-#     function solve(a::NDArray{T,N}, b::NDArray{Float16,M}) where {T,N,M}
-#         throw(ArgumentError("array type float16 is unsupported in linalg"))
-#     end
-
 # 2D case: (m,m),(m)->( m)
 function solve(a::NDArray{T,2}, b::NDArray{S,1}) where {T,S}
     size(a)[end - 1] != size(a)[end] &&
@@ -138,40 +70,6 @@ function solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M}
     throw(ArgumentError("Batched matrices require signature (...,m,m),(...,m,n)->(...,m,n)"))
 end
 
-function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N}
-    rt = Legate.get_runtime();
-    lib = cuNumeric.get_lib();
-
-    task = Legate.create_auto_task(rt, lib, cuNumeric.SVD);
-
-    l_a = nda_to_logical_array(a)
-    l_u = nda_to_logical_array(u)
-    l_s = nda_to_logical_array(s)
-    l_vh = nda_to_logical_array(vh)
-
-    Legate.add_input(task, l_a)
-    Legate.add_output(task, l_u)
-    Legate.add_output(task, l_s)
-    Legate.add_output(task, l_vh)
-
-    Legate.add_broadcast(task, l_a)
-    Legate.add_broadcast(task, l_u)
-    Legate.add_broadcast(task, l_s)
-    Legate.add_broadcast(task, l_vh)
-
-    Legate.submit_auto_task(rt, task)
-end
-
-function _svd(a::NDArray{T,2}, full_matrices::Bool) where {T}
-    m, n = size(a)
-    k = min(m, n)
-    u  = full_matrices ? zeros(T, m, m) : zeros(T, m, k)
-    s  = zeros(T, k)
-    vh = full_matrices ? zeros(T, n, n) : zeros(T, k, n)
-    svd_single(a, u, s, vh)
-    return u, s, vh
-end
-
 function svd(a::NDArray{T,2}, full_matrices::Bool=true) where {T}
     if size(a)[1] < size(a)[2]
         throw(ArgumentError("cuNumeric only supports M >= N"))
@@ -185,4 +83,4 @@ end
 
 function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
     throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays"))
-end
\ No newline at end of file
+end
diff --git a/test/tests/linalg.jl b/test/tests/linalg.jl
index 666e56b4..8f9c060d 100644
--- a/test/tests/linalg.jl
+++ b/test/tests/linalg.jl
@@ -185,150 +185,3 @@ end
         end
     end
 end
-
-function check_svd_reconstruction(ref_A::AbstractMatrix, u, s, vh, tol_a, tol_r)
-    U  = Array(u)
-    S  = Array(s)
-    Vh = Array(vh)
-    A_rec = U * Diagonal(S) * Vh
-    return isapprox(ref_A, A_rec; atol=tol_a, rtol=tol_r)
-end
-
-function check_svd_orthonormality(u, vh, tol_a, tol_r)
-    U  = Array(u)
-    Vh = Array(vh)
-    k  = size(U, 2)
-    ok_u  = isapprox(U'  * U,  Matrix{eltype(U)}(I, k, k);  atol=tol_a, rtol=tol_r)
-    ok_vh = isapprox(Vh * Vh', Matrix{eltype(Vh)}(I, k, k); atol=tol_a, rtol=tol_r)
-    return ok_u && ok_vh
-end
-
-@testset "svd square matrix" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        A_ref = my_rand(T, 5, 5)
-        nda   = cuNumeric.NDArray(A_ref)
-        u, s, vh = cuNumeric.svd(nda)
-        allowscalar() do
-            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
-            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
-        end
-    end
-end
-
-@testset "svd tall matrix (m > n)" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        A_ref = my_rand(T, 6, 4)
-        nda   = cuNumeric.NDArray(A_ref)
-        u, s, vh = cuNumeric.svd(nda)
-        allowscalar() do
-            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
-            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
-        end
-    end
-end
-
-@testset "svd thin output shapes (full_matrices=false)" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        m, n = 6, 4
-        k     = min(m, n)
-        A_ref = my_rand(T, m, n)
-        nda   = cuNumeric.NDArray(A_ref)
-        u, s, vh = cuNumeric.svd(nda, false)
-        allowscalar() do
-            @test size(Array(u))  == (m, k)
-            @test size(Array(s))  == (k,)
-            @test size(Array(vh)) == (k, n)
-            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
-        end
-    end
-end
-
-@testset "svd full output shapes (full_matrices=true)" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        m, n = 6, 4
-        A_ref = my_rand(T, m, n)
-        nda   = cuNumeric.NDArray(A_ref)
-        u, s, vh = cuNumeric.svd(nda, true)
-        allowscalar() do
-            @test size(Array(u))  == (m, m)
-            @test size(Array(s))  == (min(m, n),)
-            @test size(Array(vh)) == (n, n)
-        end
-    end
-end
-
-@testset "svd singular values non-negative and sorted" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        A_ref = my_rand(T, 5, 5)
-        nda   = cuNumeric.NDArray(A_ref)
-        _, s, _ = cuNumeric.svd(nda)
-        allowscalar() do
-            sv = Array(s)
-            @test all(sv .>= 0)
-            @test issorted(sv; rev=true)
-        end
-    end
-end
-
-@testset "svd identity matrix" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        n     = 4
-        A_ref = Matrix{T}(I, n, n)
-        nda   = cuNumeric.NDArray(A_ref)
-        _, s, _ = cuNumeric.svd(nda)
-        allowscalar() do
-            @test cuNumeric.compare(ones(T, n), s, atol(T), rtol(T))
-        end
-    end
-end
-
-@testset "svd rank-1 matrix" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        # outer product of two vectors: exactly one nonzero singular value
-        # 5×4 satisfies the M >= N constraint
-        v1    = T.(collect(1:5))
-        v2    = T.(collect(1:4))
-        A_ref = v1 * v2'
-        nda   = cuNumeric.NDArray(A_ref)
-        _, s, _ = cuNumeric.svd(nda)
-        allowscalar() do
-            sv = Array(s)
-            @test sv[1] > atol(T)
-            @test all(sv[2:end] .< atol(T) * 10)
-        end
-    end
-end
-
-@testset "svd agrees with LinearAlgebra.svd" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        A_ref = my_rand(T, 5, 4)
-        nda   = cuNumeric.NDArray(A_ref)
-        _, s, _ = cuNumeric.svd(nda, false)
-        ref   = LinearAlgebra.svd(A_ref)
-        allowscalar() do
-            # Singular values must match; signs of singular vectors may differ
-            @test cuNumeric.compare(ref.S, s, atol(T), rtol(T))
-        end
-    end
-end
-
-@testset "svd 1d guard" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        v = cuNumeric.NDArray(T.(collect(1:4)))
-        @test_throws ArgumentError cuNumeric.svd(v)
-    end
-end
-
-@testset "svd 3d guard" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        A = cuNumeric.NDArray(my_rand(T, 3, 3, 3))
-        @test_throws ArgumentError cuNumeric.svd(A)
-    end
-end
-
-@testset "svd M < N guard" begin
-    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
-        A = cuNumeric.NDArray(my_rand(T, 3, 5))
-        @test_throws ArgumentError cuNumeric.svd(A)
-    end
-end
\ No newline at end of file

From 8b710d33db868b8f2f327218c18926c83977e22c Mon Sep 17 00:00:00 2001
From: Nader-Rahhal <rahhalnader04@gmail.com>
Date: Wed, 3 Jun 2026 15:09:03 -0400
Subject: [PATCH 6/6] add back tests

---
 src/ndarray/linalg.jl |   2 +-
 test/tests/linalg.jl  | 147 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl
index 2943a6e3..719085df 100644
--- a/src/ndarray/linalg.jl
+++ b/src/ndarray/linalg.jl
@@ -83,4 +83,4 @@ end
 
 function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
     throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays"))
-end
+end
\ No newline at end of file
diff --git a/test/tests/linalg.jl b/test/tests/linalg.jl
index 8f9c060d..666e56b4 100644
--- a/test/tests/linalg.jl
+++ b/test/tests/linalg.jl
@@ -185,3 +185,150 @@ end
         end
     end
 end
+
+function check_svd_reconstruction(ref_A::AbstractMatrix, u, s, vh, tol_a, tol_r)
+    U  = Array(u)
+    S  = Array(s)
+    Vh = Array(vh)
+    A_rec = U * Diagonal(S) * Vh
+    return isapprox(ref_A, A_rec; atol=tol_a, rtol=tol_r)
+end
+
+function check_svd_orthonormality(u, vh, tol_a, tol_r)
+    U  = Array(u)
+    Vh = Array(vh)
+    k  = size(U, 2)
+    ok_u  = isapprox(U'  * U,  Matrix{eltype(U)}(I, k, k);  atol=tol_a, rtol=tol_r)
+    ok_vh = isapprox(Vh * Vh', Matrix{eltype(Vh)}(I, k, k); atol=tol_a, rtol=tol_r)
+    return ok_u && ok_vh
+end
+
+@testset "svd square matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 5)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda)
+        allowscalar() do
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd tall matrix (m > n)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 6, 4)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda)
+        allowscalar() do
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd thin output shapes (full_matrices=false)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        m, n = 6, 4
+        k     = min(m, n)
+        A_ref = my_rand(T, m, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda, false)
+        allowscalar() do
+            @test size(Array(u))  == (m, k)
+            @test size(Array(s))  == (k,)
+            @test size(Array(vh)) == (k, n)
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd full output shapes (full_matrices=true)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        m, n = 6, 4
+        A_ref = my_rand(T, m, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda, true)
+        allowscalar() do
+            @test size(Array(u))  == (m, m)
+            @test size(Array(s))  == (min(m, n),)
+            @test size(Array(vh)) == (n, n)
+        end
+    end
+end
+
+@testset "svd singular values non-negative and sorted" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 5)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            sv = Array(s)
+            @test all(sv .>= 0)
+            @test issorted(sv; rev=true)
+        end
+    end
+end
+
+@testset "svd identity matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        n     = 4
+        A_ref = Matrix{T}(I, n, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            @test cuNumeric.compare(ones(T, n), s, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd rank-1 matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        # outer product of two vectors: exactly one nonzero singular value
+        # 5×4 satisfies the M >= N constraint
+        v1    = T.(collect(1:5))
+        v2    = T.(collect(1:4))
+        A_ref = v1 * v2'
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            sv = Array(s)
+            @test sv[1] > atol(T)
+            @test all(sv[2:end] .< atol(T) * 10)
+        end
+    end
+end
+
+@testset "svd agrees with LinearAlgebra.svd" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 4)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda, false)
+        ref   = LinearAlgebra.svd(A_ref)
+        allowscalar() do
+            # Singular values must match; signs of singular vectors may differ
+            @test cuNumeric.compare(ref.S, s, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd 1d guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        v = cuNumeric.NDArray(T.(collect(1:4)))
+        @test_throws ArgumentError cuNumeric.svd(v)
+    end
+end
+
+@testset "svd 3d guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A = cuNumeric.NDArray(my_rand(T, 3, 3, 3))
+        @test_throws ArgumentError cuNumeric.svd(A)
+    end
+end
+
+@testset "svd M < N guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A = cuNumeric.NDArray(my_rand(T, 3, 5))
+        @test_throws ArgumentError cuNumeric.svd(A)
+    end
+end
\ No newline at end of file