diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index fe0b80f5c1af..11ac01468cbf 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -74,6 +74,7 @@ struct MissingFeatures {
   // GNU vectors are done, but other kinds of vectors haven't been implemented.
   static bool scalableVectors() { return false; }
   static bool vectorConstants() { return false; }
+  static bool vectorToX86AmxCasting() { return false; }
 
   // Address space related
   static bool addressSpace() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 975a8f10b15d..de6d27ff2ce7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -532,6 +532,38 @@ decodeFixedType(ArrayRef<llvm::Intrinsic::IITDescriptor> &infos,
 }
 
 // llvm::Intrinsics accepts only LLVMContext. We need to reimplement it here.
+/// Helper function to correct integer signedness for intrinsic arguments.
+/// IIT always returns signed integers, but the actual intrinsic may expect
+/// unsigned integers based on the AST FunctionDecl parameter types.
+static mlir::Type getIntrinsicArgumentTypeFromAST(mlir::Type iitType,
+                                                  const CallExpr *E,
+                                                  unsigned argIndex,
+                                                  mlir::MLIRContext *context) {
+  // If it's not an integer type, return as-is.
+  auto intTy = mlir::dyn_cast<cir::IntType>(iitType);
+  if (!intTy)
+    return iitType;
+
+  // Get the FunctionDecl from the CallExpr.
+  const FunctionDecl *FD = nullptr;
+  if (const auto *DRE =
+          dyn_cast<DeclRefExpr>(E->getCallee()->IgnoreImpCasts())) {
+    FD = dyn_cast<FunctionDecl>(DRE->getDecl());
+  }
+
+  // If we have a FunctionDecl and this argument exists, check its signedness.
+  if (FD && argIndex < FD->getNumParams()) {
+    QualType paramType = FD->getParamDecl(argIndex)->getType();
+    if (paramType->isUnsignedIntegerType()) {
+      // Create the unsigned version of the type.
+      return cir::IntType::get(context, intTy.getWidth(), /*isSigned=*/false);
+    }
+  }
+
+  // Default: keep the IIT type (signed).
+  return iitType;
+}
+
 static cir::FuncType getIntrinsicType(mlir::MLIRContext *context,
                                       llvm::Intrinsic::ID id) {
   using namespace llvm::Intrinsic;
@@ -2730,12 +2762,20 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
 
 
     SmallVector<mlir::Value> args;
     for (unsigned i = 0; i < E->getNumArgs(); i++) {
-      mlir::Value arg = emitScalarOrConstFoldImmArg(iceArguments, i, E);
-      mlir::Type argType = arg.getType();
-      if (argType != intrinsicType.getInput(i))
+      mlir::Value argValue = emitScalarOrConstFoldImmArg(iceArguments, i, E);
+      // If the intrinsic arg type is different from the builtin arg type
+      // we need to do a bit cast.
+      mlir::Type argType = argValue.getType();
+      mlir::Type expectedTy = intrinsicType.getInput(i);
+
+      // Use the helper to get the correct integer type based on AST signedness.
+      mlir::Type correctedExpectedTy =
+          getIntrinsicArgumentTypeFromAST(expectedTy, E, i, &getMLIRContext());
+
+      if (argType != correctedExpectedTy)
         llvm_unreachable("NYI");
-      args.push_back(arg);
+      args.push_back(argValue);
     }
 
     auto intrinsicCall = builder.create<cir::LLVMIntrinsicCallOp>(
diff --git a/clang/test/CIR/CodeGen/CUDA/builtins-nvptx-ptx60.cu b/clang/test/CIR/CodeGen/CUDA/builtins-nvptx-ptx60.cu
new file mode 100644
index 000000000000..2f195d547804
--- /dev/null
+++ b/clang/test/CIR/CodeGen/CUDA/builtins-nvptx-ptx60.cu
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_70 \
+// RUN:   -fcuda-is-device -target-feature +ptx60 \
+// RUN:   -emit-cir -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CIR %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \
+// RUN:   -fcuda-is-device -target-feature +ptx65 \
+// RUN:   -emit-cir -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CIR %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \
+// RUN:   -fcuda-is-device -target-feature +ptx70 \
+// RUN:   -emit-cir -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CIR %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_70 \
+// RUN:   -fcuda-is-device -target-feature +ptx60 \
+// RUN:   -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=LLVM %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \
+// RUN:   -fcuda-is-device -target-feature +ptx65 \
+// RUN:   -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=LLVM %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fclangir -target-cpu sm_80 \
+// RUN:   -fcuda-is-device -target-feature +ptx70 \
+// RUN:   -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=LLVM %s
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_70 \
+// RUN:   -fcuda-is-device -target-feature +ptx60 \
+// RUN:   -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=OGCHECK %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_80 \
+// RUN:   -fcuda-is-device -target-feature +ptx65 \
+// RUN:   -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=OGCHECK %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_80 \
+// RUN:   -fcuda-is-device -target-feature +ptx70 \
+// RUN:   -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=OGCHECK %s
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+typedef unsigned long long uint64_t;
+
+__device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b,
+                          bool pred, uint64_t i64) {
+
+  // CIR: cir.llvm.intrinsic "nvvm.bar.warp.sync" {{.*}} : (!u32i)
+  // LLVM: call void @llvm.nvvm.bar.warp.sync(i32
+  // OGCHECK: call void @llvm.nvvm.bar.warp.sync(i32
+  __nvvm_bar_warp_sync(mask);
+
+}