diff --git a/ext/CUDAExt/cuda.jl b/ext/CUDAExt/cuda.jl index 33421640..c8fd5c01 100644 --- a/ext/CUDAExt/cuda.jl +++ b/ext/CUDAExt/cuda.jl @@ -74,11 +74,6 @@ function check_sz(arr, maxshape) end end -function nda_to_logical_array(arr::NDArray{T,N}) where {T,N} - st_handle = cuNumeric.get_store(arr) - return Legate.LogicalArray{T,N}(st_handle, size(arr)) -end - function Launch(kernel::cuNumeric.CUDATask, inputs::Tuple{Vararg{NDArray}}, outputs::Tuple{Vararg{NDArray}}, scalars::Tuple{Vararg{Any}}; blocks, threads) diff --git a/lib/cunumeric_jl_wrapper/src/types.cpp b/lib/cunumeric_jl_wrapper/src/types.cpp index f181dc2e..998a6d75 100644 --- a/lib/cunumeric_jl_wrapper/src/types.cpp +++ b/lib/cunumeric_jl_wrapper/src/types.cpp @@ -166,4 +166,5 @@ void wrap_binary_ops(jlcxx::Module& mod) { void wrap_linalg_ops(jlcxx::Module& mod) { mod.set_const("SOLVE", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_SOLVE}); mod.set_const("MP_SOLVE", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_MP_SOLVE}); + mod.set_const("SVD", legate::LocalTaskID{CuPyNumericOpCode::CUPYNUMERIC_SVD}); } \ No newline at end of file diff --git a/src/cuNumeric.jl b/src/cuNumeric.jl index 0da59605..213a4931 100644 --- a/src/cuNumeric.jl +++ b/src/cuNumeric.jl @@ -62,9 +62,13 @@ const SUPPORTED_NUMERIC_TYPES = Union{ SUPPORTED_INT_TYPES,SUPPORTED_FLOAT_TYPES,SUPPORTED_COMPLEX_TYPES } -# solve has no integer backend kernel -const SUPPORTED_SOLVE_TYPES = Union{SUPPORTED_FLOAT_TYPES,SUPPORTED_COMPLEX_TYPES} +const SUPPORTED_LINALG_TYPES = Union{ + SUPPORTED_INT_TYPES,Float32,Float64,SUPPORTED_COMPLEX_TYPES +} +# solve has no integer/Float16 backend kernel — float/complex only. +const SUPPORTED_SOLVE_TYPES = Union{Float32,Float64,SUPPORTED_COMPLEX_TYPES} +const SUPPORTED_SVD_TYPES = Union{Float32,Float64,SUPPORTED_COMPLEX_TYPES} const SUPPORTED_ARRAY_TYPES = Union{Bool,SUPPORTED_NUMERIC_TYPES} const SUPPORTED_TYPES = Union{SUPPORTED_ARRAY_TYPES,String} @@ -148,6 +152,7 @@ include("ndarray/broadcast.jl") include("ndarray/ndarray.jl") include("ndarray/unary.jl") include("ndarray/binary.jl") +include("ndarray/detail/linalg.jl") include("ndarray/linalg.jl") # scoping macro diff --git a/src/ndarray/detail/linalg.jl b/src/ndarray/detail/linalg.jl new file mode 100644 index 00000000..64556e6d --- /dev/null +++ b/src/ndarray/detail/linalg.jl @@ -0,0 +1,87 @@ +function choose_nd_color_shape(shape::NTuple{N,Int}) where {N} + color_shape = Base.ones(Int, N) + if N > 2 + color_shape[1] = Legate.num_procs() + done = false + while !done && color_shape[1] % 2 == 0 + weight_per_dim = [shape[i] / color_shape[i] for i in 1:(N - 2)] + max_weight, idx = findmax(weight_per_dim) + if weight_per_dim[idx] > 2 * weight_per_dim[1] + color_shape[1] ÷= 2 + color_shape[idx] *= 2 + else + done = true + end + end + end + return Tuple(color_shape) +end + +function prepare_manual_task_for_batched_matrices(full_shape::NTuple{N,Int}) where {N} + initial_color_shape = choose_nd_color_shape(full_shape) + tilesize = Tuple( + (full_shape[i] + initial_color_shape[i] - 1) ÷ initial_color_shape[i] for i in 1:N + ) + color_shape = Tuple((full_shape[i] + tilesize[i] - 1) ÷ tilesize[i] for i in 1:N) + return tilesize, color_shape +end + +function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N} + nrhs = size(b)[end] + full_shape = size(a) + tilesize_a, color_shape = prepare_manual_task_for_batched_matrices(full_shape) + tilesize_b = (tilesize_a[1:(end - 1)]..., nrhs) + + store_a = nda_to_logical_store(a) + store_b = nda_to_logical_store(b) + store_x = nda_to_logical_store(x) + + tiled_a = Legate.partition_by_tiling(store_a, collect(tilesize_a)) + tiled_b = Legate.partition_by_tiling(store_b, collect(tilesize_b)) + tiled_x = Legate.partition_by_tiling(store_x, collect(tilesize_b)) + + rt = Legate.get_runtime() + domain = Legate.domain_from_shape(Legate.Shape(Legate.to_cxx_vector(color_shape))) + lib = cuNumeric.get_lib() + task = Legate.create_manual_task(rt, lib, cuNumeric.SOLVE, domain) + + Legate.add_input(task, tiled_a) + Legate.add_input(task, tiled_b) + Legate.add_output(task, tiled_x) + + Legate.submit_manual_task(rt, task) +end + +function svd_single(a::NDArray{T,N}, u::NDArray, s::NDArray, vh::NDArray) where {T,N} + rt = Legate.get_runtime(); + lib = cuNumeric.get_lib(); + + task = Legate.create_auto_task(rt, lib, cuNumeric.SVD); + + l_a = nda_to_logical_array(a) + l_u = nda_to_logical_array(u) + l_s = nda_to_logical_array(s) + l_vh = nda_to_logical_array(vh) + + Legate.add_input(task, l_a) + Legate.add_output(task, l_u) + Legate.add_output(task, l_s) + Legate.add_output(task, l_vh) + + Legate.add_broadcast(task, l_a) + Legate.add_broadcast(task, l_u) + Legate.add_broadcast(task, l_s) + Legate.add_broadcast(task, l_vh) + + Legate.submit_auto_task(rt, task) +end + +function _svd(a::NDArray{T,2}, full_matrices::Bool) where {T} + m, n = size(a) + k = min(m, n) + u = full_matrices ? zeros(T, m, m) : zeros(T, m, k) + s = zeros(T, k) + vh = full_matrices ? zeros(T, n, n) : zeros(T, k, n) + svd_single(a, u, s, vh) + return u, s, vh +end diff --git a/src/ndarray/detail/ndarray.jl b/src/ndarray/detail/ndarray.jl index 9d630fcc..926a0fe8 100644 --- a/src/ndarray/detail/ndarray.jl +++ b/src/ndarray/detail/ndarray.jl @@ -511,3 +511,9 @@ function nda_to_logical_store(arr::NDArray{T,N}) where {T,N} st_handle = Legate.data(Legate.LogicalArray{T,N}(la_handle, size(arr))) return Legate.LogicalStore{T,N}(st_handle, size(arr)) end + +function nda_to_logical_array(arr::NDArray{T,N}) where {T,N} + st_handle = cuNumeric.get_store(arr) + return Legate.LogicalArray{T,N}(st_handle, size(arr)) +end + diff --git a/src/ndarray/linalg.jl b/src/ndarray/linalg.jl index 1c01b44a..719085df 100644 --- a/src/ndarray/linalg.jl +++ b/src/ndarray/linalg.jl @@ -1,102 +1,54 @@ -function choose_nd_color_shape(shape::NTuple{N,Int}) where {N} - color_shape = Base.ones(Int, N) - if N > 2 - color_shape[1] = Legate.num_procs() - done = false - while !done && color_shape[1] % 2 == 0 - weight_per_dim = [shape[i] / color_shape[i] for i in 1:(N - 2)] - max_weight, idx = findmax(weight_per_dim) - if weight_per_dim[idx] > 2 * weight_per_dim[1] - color_shape[1] ÷= 2 - color_shape[idx] *= 2 - else - done = true - end - end - end - return Tuple(color_shape) -end - -function prepare_manual_task_for_batched_matrices(full_shape::NTuple{N,Int}) where {N} - initial_color_shape = choose_nd_color_shape(full_shape) - tilesize = Tuple( - (full_shape[i] + initial_color_shape[i] - 1) ÷ initial_color_shape[i] for i in 1:N - ) - color_shape = Tuple((full_shape[i] + tilesize[i] - 1) ÷ tilesize[i] for i in 1:N) - return tilesize, color_shape -end - -function solve_batched(a::NDArray{T,N}, b::NDArray, x::NDArray) where {T,N} - nrhs = size(b)[end] - full_shape = size(a) - tilesize_a, color_shape = prepare_manual_task_for_batched_matrices(full_shape) - tilesize_b = (tilesize_a[1:(end - 1)]..., nrhs) - - store_a = nda_to_logical_store(a) - store_b = nda_to_logical_store(b) - store_x = nda_to_logical_store(x) - - tiled_a = Legate.partition_by_tiling(store_a, collect(tilesize_a)) - tiled_b = Legate.partition_by_tiling(store_b, collect(tilesize_b)) - tiled_x = Legate.partition_by_tiling(store_x, collect(tilesize_b)) - - rt = Legate.get_runtime() - domain = Legate.domain_from_shape(Legate.Shape(Legate.to_cxx_vector(color_shape))) - lib = cuNumeric.get_lib() - task = Legate.create_manual_task(rt, lib, cuNumeric.SOLVE, domain) - - Legate.add_input(task, tiled_a) - Legate.add_input(task, tiled_b) - Legate.add_output(task, tiled_x) - - Legate.submit_manual_task(rt, task) -end - -# solve runs in floating point: -# int/bool inputs promote to Float64 (matching cupynumeric) -const _SOLVE_PROMOTABLE = Union{SUPPORTED_INT_TYPES,Bool} -const _SOLVE_ACCEPTED = Union{SUPPORTED_SOLVE_TYPES,_SOLVE_PROMOTABLE} -_solve_eltype(::Type{T}) where {T<:_SOLVE_PROMOTABLE} = Float64 -_solve_eltype(::Type{T}) where {T<:SUPPORTED_SOLVE_TYPES} = T - -# Type/dim guards dispatch on one argument at a time, then forward to `_solve`. -function solve(a::NDArray{<:_SOLVE_ACCEPTED}, b::NDArray{<:_SOLVE_ACCEPTED}) - A, B = eltype(a), eltype(b) - O = promote_type(_solve_eltype(A), _solve_eltype(B)) - # int/bool -> float is an implicit promotion, disallowed unless `allowpromotion` - A <: _SOLVE_PROMOTABLE && assertpromotion(solve, A, O) - B <: _SOLVE_PROMOTABLE && assertpromotion(solve, B, O) - return _solve_check_a_dims(unchecked_promote_arr(a, O), unchecked_promote_arr(b, O)) -end - -function solve(a::NDArray, b::NDArray) - bad = eltype(a) <: _SOLVE_ACCEPTED ? eltype(b) : eltype(a) - throw(ArgumentError("array type $bad is unsupported in solve")) +# Dimension guards +function solve(a::NDArray{T,1}, b::NDArray{S,M}) where {T,S,M} + throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional")) end -# `a` must be at least 2D, `b` at least 1D. -function _solve_check_a_dims(a::NDArray{<:Any,0}, b::NDArray) +function solve(a::NDArray{T,0}, b::NDArray{S,M}) where {T,S,M} throw(ArgumentError("0-dimensional array given. Array must be at least two-dimensional")) end -function _solve_check_a_dims(a::NDArray{<:Any,1}, b::NDArray) - throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional")) -end -_solve_check_a_dims(a::NDArray, b::NDArray) = _solve_check_b_dims(a, b) -function _solve_check_b_dims(a::NDArray, b::NDArray{<:Any,0}) +function solve(a::NDArray{T,N}, b::NDArray{S,0}) where {T,N,S} throw(ArgumentError("0-dimensional array given. Array must be at least one-dimensional")) end -_solve_check_b_dims(a::NDArray, b::NDArray) = _solve(a, b) -# 2D case: (m,m),(m)->(m). -# Backend needs rhs "b" to be 2D. We reshape b from (n,) to (n,1) -function _solve(a::NDArray{T,2}, b::NDArray{S,1}) where {T,S} - m = size(b)[1] - return reshape(_solve(a, reshape(b, (m, 1))), (m,)) +# 2D case: (m,m),(m)->( m) +function solve(a::NDArray{T,2}, b::NDArray{S,1}) where {T,S} + size(a)[end - 1] != size(a)[end] && + throw(ArgumentError("Last 2 dimensions of the array must be square")) + size(a)[2] != size(b)[1] && + throw( + ArgumentError( + "Input operand 1 has a mismatch in its dimension 0, " * + "with signature (m,m),(m)->(m) (size $(size(b)[1]) " * + "is different from $(size(a)[2]))", + ), + ) + prod(size(a)) == 0 || prod(size(b)) == 0 && return zeros(T, size(b)...) + x = zeros(T, size(b)...) + solve_batched(a, b, x) + return x end -# 2D (m,m),(m,n)->(m,n) and batched (...,m,m),(...,m,n)->(...,m,n) -function _solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N} +# 2D case: (m,m),(m,n)->(m,n) +function solve(a::NDArray{T,2}, b::NDArray{S,2}) where {T,S} + size(a)[end - 1] != size(a)[end] && + throw(ArgumentError("Last 2 dimensions of the array must be square")) + size(a)[2] != size(b)[1] && + throw( + ArgumentError( + "Input operand 1 has a mismatch in its dimension 0, " * + "with signature (m,m),(m,n)->(m,n) (size $(size(b)[1]) " * + "is different from $(size(a)[2]))", + ), + ) + prod(size(a)) == 0 || prod(size(b)) == 0 && return zeros(T, size(b)...) + x = zeros(T, size(b)...) + solve_batched(a, b, x) + return x +end + +# Batched case: (...,m,m),(...,m,n)->(...,m,n) +function solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N} size(a)[end - 1] != size(a)[end] && throw(ArgumentError("Last 2 dimensions of the array must be square")) size(a)[end] != size(b)[end - 1] && @@ -114,6 +66,21 @@ function _solve(a::NDArray{T,N}, b::NDArray{S,N}) where {T,S,N} end # Mismatched batch dimensions -function _solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M} +function solve(a::NDArray{T,N}, b::NDArray{S,M}) where {T,N,S,M} throw(ArgumentError("Batched matrices require signature (...,m,m),(...,m,n)->(...,m,n)")) end + +function svd(a::NDArray{T,2}, full_matrices::Bool=true) where {T} + if size(a)[1] < size(a)[2] + throw(ArgumentError("cuNumeric only supports M >= N")) + end + return _svd(a, full_matrices) +end + +function svd(a::NDArray{T,1}, full_matrices::Bool=true) where {T} + throw(ArgumentError("1-dimensional array given. Array must be at least two-dimensional")) +end + +function svd(a::NDArray{T,N}, full_matrices::Bool=true) where {T,N} + throw(ArgumentError("cuNumeric does not yet support stacked 2d arrays")) +end \ No newline at end of file diff --git a/test/tests/linalg.jl b/test/tests/linalg.jl index 8d5ededb..eb4cfd02 100644 --- a/test/tests/linalg.jl +++ b/test/tests/linalg.jl @@ -185,3 +185,150 @@ end end end end + +function check_svd_reconstruction(ref_A::AbstractMatrix, u, s, vh, tol_a, tol_r) + U = Array(u) + S = Array(s) + Vh = Array(vh) + A_rec = U * Diagonal(S) * Vh + return isapprox(ref_A, A_rec; atol=tol_a, rtol=tol_r) +end + +function check_svd_orthonormality(u, vh, tol_a, tol_r) + U = Array(u) + Vh = Array(vh) + k = size(U, 2) + ok_u = isapprox(U' * U, Matrix{eltype(U)}(I, k, k); atol=tol_a, rtol=tol_r) + ok_vh = isapprox(Vh * Vh', Matrix{eltype(Vh)}(I, k, k); atol=tol_a, rtol=tol_r) + return ok_u && ok_vh +end + +@testset "svd square matrix" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + A_ref = my_rand(T, 5, 5) + nda = cuNumeric.NDArray(A_ref) + u, s, vh = cuNumeric.svd(nda) + allowscalar() do + @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T)) + @test check_svd_orthonormality(u, vh, atol(T), rtol(T)) + end + end +end + +@testset "svd tall matrix (m > n)" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + A_ref = my_rand(T, 6, 4) + nda = cuNumeric.NDArray(A_ref) + u, s, vh = cuNumeric.svd(nda) + allowscalar() do + @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T)) + @test check_svd_orthonormality(u, vh, atol(T), rtol(T)) + end + end +end + +@testset "svd thin output shapes (full_matrices=false)" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + m, n = 6, 4 + k = min(m, n) + A_ref = my_rand(T, m, n) + nda = cuNumeric.NDArray(A_ref) + u, s, vh = cuNumeric.svd(nda, false) + allowscalar() do + @test size(Array(u)) == (m, k) + @test size(Array(s)) == (k,) + @test size(Array(vh)) == (k, n) + @test check_svd_reconstruction(A_ref, u, s, vh, atol(T), rtol(T)) + end + end +end + +@testset "svd full output shapes (full_matrices=true)" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + m, n = 6, 4 + A_ref = my_rand(T, m, n) + nda = cuNumeric.NDArray(A_ref) + u, s, vh = cuNumeric.svd(nda, true) + allowscalar() do + @test size(Array(u)) == (m, m) + @test size(Array(s)) == (min(m, n),) + @test size(Array(vh)) == (n, n) + end + end +end + +@testset "svd singular values non-negative and sorted" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + A_ref = my_rand(T, 5, 5) + nda = cuNumeric.NDArray(A_ref) + _, s, _ = cuNumeric.svd(nda) + allowscalar() do + sv = Array(s) + @test all(sv .>= 0) + @test issorted(sv; rev=true) + end + end +end + +@testset "svd identity matrix" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + n = 4 + A_ref = Matrix{T}(I, n, n) + nda = cuNumeric.NDArray(A_ref) + _, s, _ = cuNumeric.svd(nda) + allowscalar() do + @test cuNumeric.compare(ones(T, n), s, atol(T), rtol(T)) + end + end +end + +@testset "svd rank-1 matrix" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + # outer product of two vectors: exactly one nonzero singular value + # 5×4 satisfies the M >= N constraint + v1 = T.(collect(1:5)) + v2 = T.(collect(1:4)) + A_ref = v1 * v2' + nda = cuNumeric.NDArray(A_ref) + _, s, _ = cuNumeric.svd(nda) + allowscalar() do + sv = Array(s) + @test sv[1] > atol(T) + @test all(sv[2:end] .< atol(T) * 10) + end + end +end + +@testset "svd agrees with LinearAlgebra.svd" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + A_ref = my_rand(T, 5, 4) + nda = cuNumeric.NDArray(A_ref) + _, s, _ = cuNumeric.svd(nda, false) + ref = LinearAlgebra.svd(A_ref) + allowscalar() do + # Singular values must match; signs of singular vectors may differ + @test cuNumeric.compare(ref.S, s, atol(T), rtol(T)) + end + end +end + +@testset "svd 1d guard" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + v = cuNumeric.NDArray(T.(collect(1:4))) + @test_throws ArgumentError cuNumeric.svd(v) + end +end + +@testset "svd 3d guard" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + A = cuNumeric.NDArray(my_rand(T, 3, 3, 3)) + @test_throws ArgumentError cuNumeric.svd(A) + end +end + +@testset "svd M < N guard" begin + @testset verbose=true for T in Base.uniontypes(cuNumeric.SUPPORTED_SVD_TYPES) + A = cuNumeric.NDArray(my_rand(T, 3, 5)) + @test_throws ArgumentError cuNumeric.svd(A) + end +end \ No newline at end of file