Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 63 additions & 25 deletions enzyme/Enzyme/AdjointGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -3996,43 +3996,81 @@ class AdjointGenerator : public llvm::InstVisitor<AdjointGenerator> {
(void)vdiff;

switch (ID) {
#if LLVM_VERSION_MAJOR < 22
#if LLVM_VERSION_MAJOR < 21
case Intrinsic::nvvm_barrier0_popc:
case Intrinsic::nvvm_barrier0_and:
Comment thread
minansys marked this conversation as resolved.
Outdated
case Intrinsic::nvvm_barrier0_or:
case Intrinsic::nvvm_barrier0_or: {
SmallVector<Value *, 1> args = {};
auto *Fn = getIntrinsicDeclaration(M, Intrinsic::nvvm_barrier0);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}
#elif LLVM_VERSION_MAJOR < 22
case Intrinsic::nvvm_barrier0_popc:
case Intrinsic::nvvm_barrier0_and:
case Intrinsic::nvvm_barrier0_or: {
SmallVector<Value *, 1> args = {
ConstantInt::get(Type::getInt32Ty(M->getContext()), 0)};
auto *Fn = getIntrinsicDeclaration(
M, Intrinsic::nvvm_barrier_cta_sync_aligned_all);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}
#else
case Intrinsic::nvvm_barrier_cta_red_and_aligned_all:
case Intrinsic::nvvm_barrier_cta_red_and_aligned_count:
case Intrinsic::nvvm_barrier_cta_red_or_aligned_all:
case Intrinsic::nvvm_barrier_cta_red_popc_aligned_all: {
SmallVector<Value *, 1> args = {I.getOperand(0)};
auto *Fn = getIntrinsicDeclaration(
M, Intrinsic::nvvm_barrier_cta_sync_aligned_all);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}
case Intrinsic::nvvm_barrier_cta_red_and_aligned_count:
case Intrinsic::nvvm_barrier_cta_red_or_aligned_count:
case Intrinsic::nvvm_barrier_cta_red_popc_aligned_all:
case Intrinsic::nvvm_barrier_cta_red_popc_aligned_count:
case Intrinsic::nvvm_barrier_cta_red_popc_aligned_count: {
SmallVector<Value *, 2> args = {I.getOperand(0), I.getOperand(1)};
auto *Fn = getIntrinsicDeclaration(
M, Intrinsic::nvvm_barrier_cta_sync_aligned_count);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}
#endif
{

#if LLVM_VERSION_MAJOR < 21
case Intrinsic::nvvm_barrier0: {
SmallVector<Value *, 1> args = {};
#if LLVM_VERSION_MAJOR > 20
auto cal = cast<CallInst>(Builder2.CreateCall(
getIntrinsicDeclaration(
M, Intrinsic::nvvm_barrier_cta_sync_aligned_all),
args));
cal->setCallingConv(getIntrinsicDeclaration(
M, Intrinsic::nvvm_barrier_cta_sync_aligned_all)
->getCallingConv());
#else
auto cal = cast<CallInst>(Builder2.CreateCall(
getIntrinsicDeclaration(M, Intrinsic::nvvm_barrier0), args));
cal->setCallingConv(getIntrinsicDeclaration(M, Intrinsic::nvvm_barrier0)
->getCallingConv());
#endif
auto *Fn = getIntrinsicDeclaration(M, ID);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}

#if LLVM_VERSION_MAJOR <= 20
case Intrinsic::nvvm_barrier0:
#else
case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
case Intrinsic::nvvm_barrier_cta_sync_aligned_count:
case Intrinsic::nvvm_barrier_cta_sync_aligned_all: {
SmallVector<Value *, 1> args = {I.getOperand(0)};
auto *Fn = getIntrinsicDeclaration(M, ID);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}
case Intrinsic::nvvm_barrier_cta_sync_aligned_count: {
SmallVector<Value *, 2> args = {I.getOperand(0), I.getOperand(1)};
auto *Fn = getIntrinsicDeclaration(M, ID);
auto cal = cast<CallInst>(Builder2.CreateCall(Fn, args));
cal->setCallingConv(Fn->getCallingConv());
cal->setDebugLoc(gutils->getNewFromOriginal(I.getDebugLoc()));
return false;
}
#endif
case Intrinsic::amdgcn_s_barrier:
case Intrinsic::nvvm_membar_cta:
Expand Down
9 changes: 8 additions & 1 deletion enzyme/Enzyme/EnzymeLogic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4605,10 +4605,17 @@ Function *EnzymeLogic::CreatePrimalAndGradient(
auto BarrierInst = Arch == Triple::amdgcn
? (llvm::Intrinsic::ID)Intrinsic::amdgcn_s_barrier
: (llvm::Intrinsic::ID)Intrinsic::nvvm_barrier0;
#endif
SmallVector<Value *, 1> BarrierArgs;
#if LLVM_VERSION_MAJOR > 20
if (Arch != Triple::amdgcn) {
BarrierArgs.push_back(ConstantInt::get(
Type::getInt32Ty(gutils->newFunc->getContext()), 0));
}
#endif
instbuilder.CreateCall(
getIntrinsicDeclaration(gutils->newFunc->getParent(), BarrierInst),
{});
BarrierArgs);
OldEntryInsts->moveAfter(entry);
sharedBlock->moveAfter(entry);
IRBuilder<> sbuilder(sharedBlock);
Expand Down
70 changes: 70 additions & 0 deletions enzyme/test/Enzyme/ReverseMode/cuda-barrier-sync.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
; RUN: split-file %s %t
; RUN: if [ %llvmver -le 20 ]; then %opt < %t/llvm20.ll %newLoadEnzyme -enzyme-preopt=false -enzyme-detect-readthrow=0 -passes="enzyme" -S | FileCheck %t/llvm20.ll; fi
; RUN: if [ %llvmver -gt 20 ]; then %opt < %t/llvm21plus.ll %newLoadEnzyme -enzyme-preopt=false -enzyme-detect-readthrow=0 -passes="enzyme" -S | FileCheck %t/llvm21plus.ll; fi

;--- llvm20.ll
target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.barrier0()
declare float @__enzyme_autodiff(float (float)*, ...)

; Primal function for the llvm20.ll half of this test (the half that is fed to
; opt when the LLVM version is at most 20). It executes one llvm.nvvm.barrier0
; and returns %x + 1.0.
define float @f_sync(float %x) {
entry:
; The barrier is the instruction under test: the FileCheck expectations at the
; end of this split file require two barrier0 calls inside @diffef_sync.
call void @llvm.nvvm.barrier0()
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver for @f_sync: hands @f_sync and %x to the variadic @__enzyme_autodiff
; declaration and returns the float that call produces.
; NOTE(review): presumably the enzyme pass rewrites this call to use the
; generated @diffef_sync; the expectations below only pin @diffef_sync itself.
define float @test(float %x) {
entry:
%r = call float (float (float)*, ...) @__enzyme_autodiff(float (float)* @f_sync, float %x)
ret float %r
}

; CHECK: define internal { float } @diffef_sync(float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier0()
; CHECK: call void @llvm.nvvm.barrier0()
; CHECK: ret { float }

;--- llvm21plus.ll
target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32)
declare void @llvm.nvvm.barrier.cta.sync.aligned.count(i32, i32)
declare float @__enzyme_autodiff(float (float)*, ...)

; Primal function for the llvm21plus.ll half of this test (the half that is fed
; to opt when the LLVM version is above 20). It executes one
; llvm.nvvm.barrier.cta.sync.aligned.all with operand i32 7 and returns
; %x + 1.0.
define float @f_sync_all(float %x) {
entry:
; The expectations at the end of this split file require two calls in
; @diffef_sync_all, both carrying the same i32 7 operand as this one.
call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 7)
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver for @f_sync_all: hands @f_sync_all and %x to the variadic
; @__enzyme_autodiff declaration and returns the float that call produces.
define float @test_all(float %x) {
entry:
%r = call float (float (float)*, ...) @__enzyme_autodiff(float (float)* @f_sync_all, float %x)
ret float %r
}

; Second primal function of the llvm21plus.ll half. It executes one
; llvm.nvvm.barrier.cta.sync.aligned.count with operands (i32 7, i32 16) and
; returns %x + 1.0.
define float @f_sync_count(float %x) {
entry:
; The expectations at the end of this split file require two calls in
; @diffef_sync_count, both carrying the same (i32 7, i32 16) operands.
call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 7, i32 16)
%res = fadd float %x, 1.000000e+00
ret float %res
}

; Driver for @f_sync_count: hands @f_sync_count and %x to the variadic
; @__enzyme_autodiff declaration and returns the float that call produces.
define float @test_count(float %x) {
entry:
%r = call float (float (float)*, ...) @__enzyme_autodiff(float (float)* @f_sync_count, float %x)
ret float %r
}

; CHECK: define internal { float } @diffef_sync_all(float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 7)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 7)
; CHECK: ret { float }

; CHECK: define internal { float } @diffef_sync_count(float %x, float %differeturn)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 7, i32 16)
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 7, i32 16)
; CHECK: ret { float }
Loading