diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 2241fd9bb573..353ec61ed15c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1408,7 +1408,9 @@ RValue CIRGenFunction::emitCallExpr(const clang::CallExpr *E, if (const auto *CE = dyn_cast(E)) return emitCXXMemberCallExpr(CE, ReturnValue); - assert(!dyn_cast(E) && "CUDA NYI"); + if (const auto *CE = dyn_cast(E)) + return emitCUDAKernelCallExpr(CE, ReturnValue); + if (const auto *CE = dyn_cast(E)) if (const CXXMethodDecl *MD = dyn_cast_or_null(CE->getCalleeDecl())) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 5c0170644778..b597d751da9e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -376,6 +376,11 @@ CIRGenFunction::emitCXXOperatorMemberCallExpr(const CXXOperatorCallExpr *E, /*IsArrow=*/false, E->getArg(0)); } +RValue CIRGenFunction::emitCUDAKernelCallExpr(const CUDAKernelCallExpr *E, + ReturnValueSlot ReturnValue) { + return CGM.getCUDARuntime().emitCUDAKernelCallExpr(*this, E, ReturnValue); +} + static void emitNullBaseClassInitialization(CIRGenFunction &CGF, Address DestPtr, const CXXRecordDecl *Base) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index eef9fccb555c..25d4ee1767be 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -2068,6 +2068,9 @@ class CIRGenFunction : public CIRGenTypeCache { const CXXMethodDecl *MD, ReturnValueSlot ReturnValue); + RValue emitCUDAKernelCallExpr(const CUDAKernelCallExpr *E, + ReturnValueSlot ReturnValue); + RValue emitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *expr); void emitCXXTemporary(const CXXTemporary *Temporary, QualType TempType, diff --git a/clang/test/CIR/CodeGen/CUDA/destructor.cu b/clang/test/CIR/CodeGen/CUDA/destructor.cu index bf1ff0cd126f..001f27597e43 100644 --- a/clang/test/CIR/CodeGen/CUDA/destructor.cu +++ b/clang/test/CIR/CodeGen/CUDA/destructor.cu @@ -10,6 +10,26 @@ // RUN: %s -o %t.cir // RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s + // Make sure we do emit device-side kernel even if it's only referenced // by the destructor of a variable not present on device. template __global__ void f(T) {} @@ -19,11 +39,23 @@ template struct A { // CIR-HOST: module // CIR-DEVICE: module -// CIR-DEVICE-DISABLED: cir.func dso_local @_Z1fIiEvT_ +// CIR-DEVICE: cir.func dso_local @_Z1fIiEvT_ +// LLVM-DEVICE: define dso_local ptx_kernel void @_Z1fIiEvT_ +// OGCG-DEVICE: define ptx_kernel void @_Z1fIiEvT_ + +// CIR-HOST: cir.func {{.*}} @_ZN1AIiED2Ev{{.*}} { +// CIR-HOST: cir.call @__cudaPushCallConfiguration +// CIR-HOST: cir.call @_Z16__device_stub__fIiEvT_ +// CIR-HOST: } + +// LLVM-HOST: define linkonce_odr void @_ZN1AIiED2Ev +// LLVM-HOST: call i32 @__cudaPushCallConfiguration( +// LLVM-HOST: call void @_Z16__device_stub__fIiEvT_ + +// OGCG-HOST: define linkonce_odr void @_ZN1AIiED2Ev +// OGCG-HOST: call i32 @__cudaPushCallConfiguration( +// OGCG-HOST: call void @_Z16__device_stub__fIiEvT_ + -// CIR-HOST-DISABLED: cir.func {{.*}} @_ZN1AIiED2Ev{{.*}} { -// CIR-HOST-DISABLED: cir.call @__cudaPushCallConfiguration -// CIR-HOST-DISABLED: cir.call @_Z16__device_stub__fIiEvT_ -// CIR-HOST-DISABLED: } -// A a; +A a; diff --git a/clang/test/CIR/CodeGen/CUDA/simple.cu b/clang/test/CIR/CodeGen/CUDA/simple.cu index 16c49c8578b4..d4fa3a63d6a2 100644 --- a/clang/test/CIR/CodeGen/CUDA/simple.cu +++ b/clang/test/CIR/CodeGen/CUDA/simple.cu @@ -1,16 +1,35 @@ #include "../Inputs/cuda.h" -// TODO: host build is currently crashing. -// RUN-DISABLE: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ -// RUN-DISABLE: -x cuda -emit-cir -target-sdk-version=12.3 \ -// RUN-DISABLE: %s -o %t.cir -// RUN-DISABLE: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x cuda -emit-cir -target-sdk-version=12.3 \ +// RUN: %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \ // RUN: -fcuda-is-device -emit-cir -target-sdk-version=12.3 \ // RUN: %s -o %t.cir // RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -x cuda -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda \ +// RUN: -fcuda-is-device -emit-llvm -target-sdk-version=12.3 \ +// RUN: %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s + // Attribute for global_fn // CIR-HOST: [[Kernel:#[a-zA-Z_0-9]+]] = {{.*}}#cir.cu.kernel_name<_Z9global_fni>{{.*}} @@ -25,6 +44,7 @@ __device__ void device_fn(int* a, double b, float c) {} __global__ void global_fn(int a) {} // CIR-DEVICE: @_Z9global_fni({{.*}} cc(ptx_kernel) // LLVM-DEVICE: define dso_local ptx_kernel void @_Z9global_fni +// OGCG-DEVICE: define dso_local ptx_kernel void @_Z9global_fni // Check for device stub emission. @@ -38,10 +58,17 @@ __global__ void global_fn(int a) {} // LLVM-HOST: void @_Z24__device_stub__global_fni // LLVM-HOST: %[[#KernelArgs:]] = alloca [1 x ptr], i64 1, align 16 // LLVM-HOST: %[[#GEP1:]] = getelementptr ptr, ptr %[[#KernelArgs]], i32 0 -// LLVM-HOST: %[[#GEP2:]] = getelementptr ptr, ptr %[[#GEP1]], i64 0 +// LLVM-HOST: %[[#GEP2:]] = getelementptr [1 x ptr], ptr %[[#KernelArgs]], i32 0, i64 0 // LLVM-HOST: call i32 @__cudaPopCallConfiguration // LLVM-HOST: call i32 @cudaLaunchKernel(ptr @_Z24__device_stub__global_fni +// OGCG-HOST: void @_Z24__device_stub__global_fni +// OGCG-HOST: %kernel_args = alloca ptr, i64 1, align 16 +// OGCG-HOST: getelementptr ptr, ptr %kernel_args, i32 0 +// OGCG-HOST: call i32 @__cudaPopCallConfiguration +// OGCG-HOST: call noundef i32 @cudaLaunchKernel(ptr noundef @_Z24__device_stub__global_fni + + int main() { global_fn<<<1, 1>>>(1); } @@ -63,10 +90,29 @@ int main() { // LLVM-HOST: alloca %struct.dim3 // LLVM-HOST: call void @_ZN4dim3C1Ejjj // LLVM-HOST: call void @_ZN4dim3C1Ejjj -// LLVM-HOST: [[LLVMConfigOK:%[0-9]+]] = call i32 @__cudaPushCallConfiguration -// LLVM-HOST: br [[LLVMConfigOK]], label %[[#Good:]], label [[#Bad:]] +// LLVM-HOST: %[[#ConfigOK:]] = call i32 @__cudaPushCallConfiguration +// LLVM-HOST: %[[#ConfigCond:]] = icmp ne i32 %[[#ConfigOK]], 0 +// LLVM-HOST: br i1 %[[#ConfigCond]], label %[[#Good:]], label %[[#Bad:]] // LLVM-HOST: [[#Good]]: -// LLVM-HOST: br label [[#End:]] +// LLVM-HOST: br label %[[#End:]] // LLVM-HOST: [[#Bad]]: -// LLVM-HOST: call void @_Z24__device_stub__global_fni -// LLVM-HOST: br label [[#End]] +// LLVM-HOST: call void @_Z24__device_stub__global_fni(i32 1) +// LLVM-HOST: br label %[[#End:]] +// LLVM-HOST: [[#End]]: +// LLVM-HOST: %[[#]] = load i32 +// LLVM-HOST: ret i32 + +// OGCG-HOST: define dso_local noundef i32 @main +// OGCG-HOST: alloca %struct.dim3, align 4 +// OGCG-HOST: alloca %struct.dim3, align 4 +// OGCG-HOST: call void @_ZN4dim3C1Ejjj +// OGCG-HOST: call void @_ZN4dim3C1Ejjj +// OGCG-HOST: %call = call i32 @__cudaPushCallConfiguration +// OGCG-HOST: %tobool = icmp ne i32 %call, 0 +// OGCG-HOST: br i1 %tobool, label %kcall.end, label %kcall.configok +// OGCG-HOST: kcall.configok: +// OGCG-HOST: call void @_Z24__device_stub__global_fni(i32 noundef 1) +// OGCG-HOST: br label %kcall.end +// OGCG-HOST: kcall.end: +// OGCG-HOST: %{{[0-9]+}} = load i32, ptr %retval, align 4 +// OGCG-HOST: ret i32 \ No newline at end of file