diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl
index 33421640..c8fd5c01 100644
--- a/ext/CUDAExt/cuda.jl
+++ b/ext/CUDAExt/cuda.jl
@@ -74,11 +74,6 @@ function check_sz(arr, maxshape)
     end
 end
 
-function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
-    st_handle = cuNumeric.get_store(arr)
-    return Legate.LogicalArray{T,N}(st_handle, size(arr))
-end
-
 function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}},
     outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads)
 
diff --git a/lib/cunumeric_jl_wrapper/src/types.cpp b/lib/cunumeric_jl_wrapper/src/types.cpp
index f181dc2e..998a6d75 100644
--- a/lib/cunumeric_jl_wrapper/src/types.cpp
+++ b/lib/cunumeric_jl_wrapper/src/types.cpp
@@ -166,4 +166,5 @@ void wrap_binary_ops(jlcxx::Module& mod) {
 void wrap_linalg_ops(jlcxx::Module& mod) {
   mod.set_const("SOLVE", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_SOLVE});
   mod.set_const("MP_SOLVE", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_MP_SOLVE});
+  mod.set_const("SVD", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_SVD});
 }
\ No newline at end of file
diff --git a/src/cuNumeric.jl b/src/cuNumeric.jl
index 0da59605..213a4931 100644
--- a/src/cuNumeric.jl
+++ b/src/cuNumeric.jl
@@ -62,9 +62,13 @@ const SUPPORTED_NUMERIC_TYPES = Union{
     SUPPORTED_INT_TYPES,SUPPORTED_FLOAT_TYPES,SUPPORTED_COMPLEX_TYPES
 }
 
-# solve has no integer backend kernel
-const SUPPORTED_SOLVE_TYPES = Union{SUPPORTED_FLOAT_TYPES,SUPPORTED_COMPLEX_TYPES}
+const SUPPORTED_LINALG_TYPES = Union{
+    SUPPORTED_INT_TYPES,Float32,Float64,SUPPORTED_COMPLEX_TYPES
+}
 
+# solve has no integer/Float16 backend kernel — float/complex only.
+const SUPPORTED_SOLVE_TYPES = Union{Float32,Float64,SUPPORTED_COMPLEX_TYPES}
+const SUPPORTED_SVD_TYPES = Union{Float32,Float64,SUPPORTED_COMPLEX_TYPES}
 const SUPPORTED_ARRAY_TYPES = Union{Bool,SUPPORTED_NUMERIC_TYPES}
 const SUPPORTED_TYPES = Union{SUPPORTED_ARRAY_TYPES,String}
 
@@ -148,6 +152,7 @@ include("ndarray/broadcast.jl")
 include("ndarray/ndarray.jl")
 include("ndarray/unary.jl")
 include("ndarray/binary.jl")
+include("ndarray/detail/linalg.jl")
 include("ndarray/linalg.jl")
 
 # scoping macro
diff --git a/src/ndarray/detail/linalg.jl b/src/ndarray/detail/linalg.jl
new file mode 100644
index 00000000..64556e6d
--- /dev/null
+++ b/src/ndarray/detail/linalg.jl
@@ -0,0 +1,87 @@
+function choose_nd_color_shape(shape::NTuple{N,Int}) where {N}
+    color_shape = Base.ones(Int, N)
+    if N > 2
+        color_shape[1] = Legate.num_procs()
+        done = false
+        while !done && color_shape[1] % 2 == 0
+            weight_per_dim = [shape[i] / color_shape[i] for i in 1:(N - 2)]
+            max_weight, idx = findmax(weight_per_dim)
+            if weight_per_dim[idx] > 2 * weight_per_dim[1]
+                color_shape[1] ÷= 2
+                color_shape[idx] *= 2
+            else
+                done = true
+            end
+        end
+    end
+    return Tuple(color_shape)
+end
+
+function prepare_manual_task_for_batched_matrices(full_shape::NTuple{N,Int}) where {N}
+    initial_color_shape = choose_nd_color_shape(full_shape)
+    tilesize = Tuple(
+        (full_shape[i] + initial_color_shape[i] - 1) ÷ initial_color_shape[i] for i in 1:N
+    )
+    color_shape = Tuple((full_shape[i] + tilesize[i] - 1) ÷ tilesize[i] for i in 1:N)
+    return tilesize, color_shape
+end
+
+function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N}
+    nrhs = size(b)[end]
+    full_shape = size(a)
+    tilesize_a, color_shape = prepare_manual_task_for_batched_matrices(full_shape)
+    tilesize_b = (tilesize_a[1:(end - 1)]..., nrhs)
+
+    store_a = nda_to_logical_store(a)
+    store_b = nda_to_logical_store(b)
+    store_x = nda_to_logical_store(x)
+
+    tiled_a = Legate.partition_by_tiling(store_a, collect(tilesize_a))
+    tiled_b = Legate.partition_by_tiling(store_b, collect(tilesize_b))
+    tiled_x = Legate.partition_by_tiling(store_x, collect(tilesize_b))
+
+    rt = Legate.get_runtime()
+    domain = Legate.domain_from_shape(Legate.Shape(Legate.to_cxx_vector(color_shape)))
+    lib = cuNumeric.get_lib()
+    task = Legate.create_manual_task(rt, lib, cuNumeric.SOLVE, domain)
+
+    Legate.add_input(task, tiled_a)
+    Legate.add_input(task, tiled_b)
+    Legate.add_output(task, tiled_x)
+
+    Legate.submit_manual_task(rt, task)
+end
+
+function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N}
+    rt = Legate.get_runtime();
+    lib = cuNumeric.get_lib();
+
+    task = Legate.create_auto_task(rt, lib, cuNumeric.SVD);
+
+    l_a = nda_to_logical_array(a)
+    l_u = nda_to_logical_array(u)
+    l_s = nda_to_logical_array(s)
+    l_vh = nda_to_logical_array(vh)
+
+    Legate.add_input(task, l_a)
+    Legate.add_output(task, l_u)
+    Legate.add_output(task, l_s)
+    Legate.add_output(task, l_vh)
+
+    Legate.add_broadcast(task, l_a)
+    Legate.add_broadcast(task, l_u)
+    Legate.add_broadcast(task, l_s)
+    Legate.add_broadcast(task, l_vh)
+
+    Legate.submit_auto_task(rt, task)
+end
+
+function _svd(a::NDArray{T,2}, full_matrices::Bool) where {T}
+    m, n = size(a)
+    k = min(m, n)
+    u  = full_matrices ? zeros(T, m, m) : zeros(T, m, k)
+    s  = zeros(T, k)
+    vh = full_matrices ? zeros(T, n, n) : zeros(T, k, n)
+    svd_single(a, u, s, vh)
+    return u, s, vh
+end
diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl
index 9d630fcc..926a0fe8 100644
--- a/src/ndarray/detail/ndarray.jl
+++ b/src/ndarray/detail/ndarray.jl
@@ -511,3 +511,9 @@ function nda_to_logical_store(arr::NDArray{T,N}) where {T,N}
     st_handle = Legate.data(Legate.LogicalArray{T,N}(la_handle, size(arr)))
     return Legate.LogicalStore{T,N}(st_handle, size(arr))
 end
+
+function nda_to_logical_array(arr::NDArray{T,N}) where {T,N}
+    st_handle = cuNumeric.get_store(arr)
+    return Legate.LogicalArray{T,N}(st_handle, size(arr))
+end
+
diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl
index 1c01b44a..719085df 100644
--- a/src/ndarray/linalg.jl
+++ b/src/ndarray/linalg.jl
@@ -1,102 +1,54 @@
-function choose_nd_color_shape(shape::NTuple{N,Int}) where {N}
-    color_shape = Base.ones(Int, N)
-    if N > 2
-        color_shape[1] = Legate.num_procs()
-        done = false
-        while !done && color_shape[1] % 2 == 0
-            weight_per_dim = [shape[i] / color_shape[i] for i in 1:(N - 2)]
-            max_weight, idx = findmax(weight_per_dim)
-            if weight_per_dim[idx] > 2 * weight_per_dim[1]
-                color_shape[1] ÷= 2
-                color_shape[idx] *= 2
-            else
-                done = true
-            end
-        end
-    end
-    return Tuple(color_shape)
-end
-
-function prepare_manual_task_for_batched_matrices(full_shape::NTuple{N,Int}) where {N}
-    initial_color_shape = choose_nd_color_shape(full_shape)
-    tilesize = Tuple(
-        (full_shape[i] + initial_color_shape[i] - 1) ÷ initial_color_shape[i] for i in 1:N
-    )
-    color_shape = Tuple((full_shape[i] + tilesize[i] - 1) ÷ tilesize[i] for i in 1:N)
-    return tilesize, color_shape
-end
-
-function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N}
-    nrhs = size(b)[end]
-    full_shape = size(a)
-    tilesize_a, color_shape = prepare_manual_task_for_batched_matrices(full_shape)
-    tilesize_b = (tilesize_a[1:(end - 1)]..., nrhs)
-
-    store_a = nda_to_logical_store(a)
-    store_b = nda_to_logical_store(b)
-    store_x = nda_to_logical_store(x)
-
-    tiled_a = Legate.partition_by_tiling(store_a, collect(tilesize_a))
-    tiled_b = Legate.partition_by_tiling(store_b, collect(tilesize_b))
-    tiled_x = Legate.partition_by_tiling(store_x, collect(tilesize_b))
-
-    rt = Legate.get_runtime()
-    domain = Legate.domain_from_shape(Legate.Shape(Legate.to_cxx_vector(color_shape)))
-    lib = cuNumeric.get_lib()
-    task = Legate.create_manual_task(rt, lib, cuNumeric.SOLVE, domain)
-
-    Legate.add_input(task, tiled_a)
-    Legate.add_input(task, tiled_b)
-    Legate.add_output(task, tiled_x)
-
-    Legate.submit_manual_task(rt, task)
-end
-
-# solve runs in floating point:
-# int/bool inputs promote to Float64 (matching cupynumeric)
-const _SOLVE_PROMOTABLE = Union{SUPPORTED_INT_TYPES,Bool}
-const _SOLVE_ACCEPTED = Union{SUPPORTED_SOLVE_TYPES,_SOLVE_PROMOTABLE}
-_solve_eltype(::Type{T}) where {T<:_SOLVE_PROMOTABLE} = Float64
-_solve_eltype(::Type{T}) where {T<:SUPPORTED_SOLVE_TYPES} = T
-
-# Type/dim guards dispatch on one argument at a time, then forward to `_solve`.
-function solve(a::NDArray{<:_SOLVE_ACCEPTED}, b::NDArray{<:_SOLVE_ACCEPTED})
-    A, B = eltype(a), eltype(b)
-    O = promote_type(_solve_eltype(A), _solve_eltype(B))
-    # int/bool -> float is an implicit promotion, disallowed unless `allowpromotion`
-    A <: _SOLVE_PROMOTABLE && assertpromotion(solve, A, O)
-    B <: _SOLVE_PROMOTABLE && assertpromotion(solve, B, O)
-    return _solve_check_a_dims(unchecked_promote_arr(a, O), unchecked_promote_arr(b, O))
-end
-
-function solve(a::NDArray, b::NDArray)
-    bad = eltype(a) <: _SOLVE_ACCEPTED ? eltype(b) : eltype(a)
-    throw(ArgumentError("array type $bad is unsupported in solve"))
+# Dimension guards
+function solve(a::NDArray{T,1}, b::NDArray{S,M}) where {T,S,M}
+    throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional"))
 end
 
-# `a` must be at least 2D, `b` at least 1D.
-function _solve_check_a_dims(a::NDArray{<:Any,0}, b::NDArray)
+function solve(a::NDArray{T,0}, b::NDArray{S,M}) where {T,S,M}
     throw(ArgumentError("0-dimensional array given. Array must be at least two-dimensional"))
 end
-function _solve_check_a_dims(a::NDArray{<:Any,1}, b::NDArray)
-    throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional"))
-end
-_solve_check_a_dims(a::NDArray, b::NDArray) = _solve_check_b_dims(a, b)
 
-function _solve_check_b_dims(a::NDArray, b::NDArray{<:Any,0})
+function solve(a::NDArray{T,N}, b::NDArray{S,0}) where {T,N,S}
     throw(ArgumentError("0-dimensional array given. Array must be at least one-dimensional"))
 end
-_solve_check_b_dims(a::NDArray, b::NDArray) = _solve(a, b)
 
-# 2D case: (m,m),(m)->(m).
-# Backend needs rhs "b" to be 2D. We reshape b from (n,) to (n,1)
-function _solve(a::NDArray{T,2}, b::NDArray{S,1}) where {T,S}
-    m = size(b)[1]
-    return reshape(_solve(a, reshape(b, (m, 1))), (m,))
+# 2D case: (m,m),(m)->( m)
+function solve(a::NDArray{T,2}, b::NDArray{S,1}) where {T,S}
+    size(a)[end - 1] != size(a)[end] &&
+        throw(ArgumentError("Last 2 dimensions of the array must be square"))
+    size(a)[2] != size(b)[1] &&
+        throw(
+            ArgumentError(
+                "Input operand 1 has a mismatch in its dimension 0, " *
+                "with signature (m,m),(m)->(m) (size $(size(b)[1]) " *
+                "is different from $(size(a)[2]))",
+            ),
+        )
+    prod(size(a)) == 0 || prod(size(b)) == 0 && return zeros(T, size(b)...)
+    x = zeros(T, size(b)...)
+    solve_batched(a, b, x)
+    return x
 end
 
-# 2D (m,m),(m,n)->(m,n) and batched (...,m,m),(...,m,n)->(...,m,n)
-function _solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N}
+# 2D case: (m,m),(m,n)->(m,n)
+function solve(a::NDArray{T,2}, b::NDArray{S,2}) where {T,S}
+    size(a)[end - 1] != size(a)[end] &&
+        throw(ArgumentError("Last 2 dimensions of the array must be square"))
+    size(a)[2] != size(b)[1] &&
+        throw(
+            ArgumentError(
+                "Input operand 1 has a mismatch in its dimension 0, " *
+                "with signature (m,m),(m,n)->(m,n) (size $(size(b)[1]) " *
+                "is different from $(size(a)[2]))",
+            ),
+        )
+    prod(size(a)) == 0 || prod(size(b)) == 0 && return zeros(T, size(b)...)
+    x = zeros(T, size(b)...)
+    solve_batched(a, b, x)
+    return x
+end
+
+# Batched case: (...,m,m),(...,m,n)->(...,m,n)
+function solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N}
     size(a)[end - 1] != size(a)[end] &&
         throw(ArgumentError("Last 2 dimensions of the array must be square"))
     size(a)[end] != size(b)[end - 1] &&
@@ -114,6 +66,21 @@ function _solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N}
 end
 
 # Mismatched batch dimensions
-function _solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M}
+function solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M}
     throw(ArgumentError("Batched matrices require signature (...,m,m),(...,m,n)->(...,m,n)"))
 end
+
+function svd(a::NDArray{T,2}, full_matrices::Bool=true) where {T}
+    if size(a)[1] < size(a)[2]
+        throw(ArgumentError("cuNumeric only supports M >= N"))
+    end
+    return _svd(a, full_matrices)
+end
+
+function svd(a::NDArray{T,1}, full_matrices::Bool=true) where {T}
+    throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional"))
+end
+
+function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N}
+    throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays"))
+end
\ No newline at end of file
diff --git a/test/tests/linalg.jl b/test/tests/linalg.jl
index 8d5ededb..eb4cfd02 100644
--- a/test/tests/linalg.jl
+++ b/test/tests/linalg.jl
@@ -185,3 +185,150 @@ end
         end
     end
 end
+
+function check_svd_reconstruction(ref_A::AbstractMatrix, u, s, vh, tol_a, tol_r)
+    U  = Array(u)
+    S  = Array(s)
+    Vh = Array(vh)
+    A_rec = U * Diagonal(S) * Vh
+    return isapprox(ref_A, A_rec; atol=tol_a, rtol=tol_r)
+end
+
+function check_svd_orthonormality(u, vh, tol_a, tol_r)
+    U  = Array(u)
+    Vh = Array(vh)
+    k  = size(U, 2)
+    ok_u  = isapprox(U'  * U,  Matrix{eltype(U)}(I, k, k);  atol=tol_a, rtol=tol_r)
+    ok_vh = isapprox(Vh * Vh', Matrix{eltype(Vh)}(I, k, k); atol=tol_a, rtol=tol_r)
+    return ok_u && ok_vh
+end
+
+@testset "svd square matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 5)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda)
+        allowscalar() do
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd tall matrix (m > n)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 6, 4)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda)
+        allowscalar() do
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+            @test check_svd_orthonormality(u, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd thin output shapes (full_matrices=false)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        m, n = 6, 4
+        k     = min(m, n)
+        A_ref = my_rand(T, m, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda, false)
+        allowscalar() do
+            @test size(Array(u))  == (m, k)
+            @test size(Array(s))  == (k,)
+            @test size(Array(vh)) == (k, n)
+            @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd full output shapes (full_matrices=true)" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        m, n = 6, 4
+        A_ref = my_rand(T, m, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        u, s, vh = cuNumeric.svd(nda, true)
+        allowscalar() do
+            @test size(Array(u))  == (m, m)
+            @test size(Array(s))  == (min(m, n),)
+            @test size(Array(vh)) == (n, n)
+        end
+    end
+end
+
+@testset "svd singular values non-negative and sorted" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 5)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            sv = Array(s)
+            @test all(sv .>= 0)
+            @test issorted(sv; rev=true)
+        end
+    end
+end
+
+@testset "svd identity matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        n     = 4
+        A_ref = Matrix{T}(I, n, n)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            @test cuNumeric.compare(ones(T, n), s, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd rank-1 matrix" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        # outer product of two vectors: exactly one nonzero singular value
+        # 5×4 satisfies the M >= N constraint
+        v1    = T.(collect(1:5))
+        v2    = T.(collect(1:4))
+        A_ref = v1 * v2'
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda)
+        allowscalar() do
+            sv = Array(s)
+            @test sv[1] > atol(T)
+            @test all(sv[2:end] .< atol(T) * 10)
+        end
+    end
+end
+
+@testset "svd agrees with LinearAlgebra.svd" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A_ref = my_rand(T, 5, 4)
+        nda   = cuNumeric.NDArray(A_ref)
+        _, s, _ = cuNumeric.svd(nda, false)
+        ref   = LinearAlgebra.svd(A_ref)
+        allowscalar() do
+            # Singular values must match; signs of singular vectors may differ
+            @test cuNumeric.compare(ref.S, s, atol(T), rtol(T))
+        end
+    end
+end
+
+@testset "svd 1d guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        v = cuNumeric.NDArray(T.(collect(1:4)))
+        @test_throws ArgumentError cuNumeric.svd(v)
+    end
+end
+
+@testset "svd 3d guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A = cuNumeric.NDArray(my_rand(T, 3, 3, 3))
+        @test_throws ArgumentError cuNumeric.svd(A)
+    end
+end
+
+@testset "svd M < N guard" begin
+    @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES)
+        A = cuNumeric.NDArray(my_rand(T, 3, 5))
+        @test_throws ArgumentError cuNumeric.svd(A)
+    end
+end
\ No newline at end of file