Skip to content

Commit 3caabbf

Browse files
gbaraldi and claude committed
Add InferAddressSpaces + optimization passes after kernarg rewrite
Run InferAddressSpaces (with TargetMachine) after add_kernarg_address_spaces! to propagate addrspace(4) through addrspacecast chains. Follow up with SROA, InstCombine, EarlyCSE, and SimplifyCFG to clean up newly-exposed opportunities.

The earlier illegal address errors were caused by byref attribute loss in clone_into!, not by InferAddressSpaces itself.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 14e010b commit 3caabbf

1 file changed

Lines changed: 15 additions & 6 deletions

File tree

src/gcn.jl

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,20 @@ function finish_ir!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLV
5858
entry::LLVM.Function)
5959
if job.config.kernel
6060
entry = add_kernarg_address_spaces!(job, mod, entry)
61+
62+
# optimize after address space rewriting: propagate addrspace(4) through
63+
# the addrspacecast chains, then clean up newly-exposed opportunities
64+
tm = llvm_machine(job.config.target)
65+
@dispose pb=NewPMPassBuilder() tm begin
66+
add!(pb, NewPMFunctionPassManager()) do fpm
67+
add!(fpm, InferAddressSpacesPass())
68+
add!(fpm, SROAPass())
69+
add!(fpm, InstCombinePass())
70+
add!(fpm, EarlyCSEPass())
71+
add!(fpm, SimplifyCFGPass())
72+
end
73+
run!(pb, mod, tm)
74+
end
6175
end
6276
return entry
6377
end
@@ -159,12 +173,7 @@ function add_kernarg_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM.
159173
erase!(f)
160174
LLVM.name!(new_f, fn)
161175

162-
# clean up the extra conversion block.
163-
# NOTE: we do NOT run InferAddressSpaces here — the AMDGPU backend's
164-
# AMDGPULowerKernelArguments pass traces addrspacecast chains during codegen
165-
# and correctly produces s_load for addrspace(4) provenance. Running
166-
# InferAddressSpaces with a TargetMachine can over-propagate addrspace(4)
167-
# into pointer values loaded from the struct (which should remain flat/global).
176+
# clean up the extra conversion block
168177
@dispose pb=NewPMPassBuilder() begin
169178
add!(pb, NewPMFunctionPassManager()) do fpm
170179
add!(fpm, SimplifyCFGPass())

0 commit comments

Comments
 (0)