tile-ai · benenzhu · May 15, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/3rdparty/tvm b/3rdparty/tvm
diff --git a/src/backend/rocm/codegen/codegen_hip.cc b/src/backend/rocm/codegen/codegen_hip.cc
@@ -56,9 +56,12 @@ std::optional<DataType> GetAccessPtrElementType(const PrimExpr &expr) {
 }
 
 int GetTileLangCPAsyncTransferBytes(const CallNode *op) {
-  ICHECK(op->args.size() == 3 || op->args.size() == 4)
-      << "tl::ptx_cp_async expects 3 or 4 arguments (dst_access_ptr, "
-         "src_access_ptr, num_elems, [predicate])";
+  // Accepts ptx_cp_async / ptx_cp_async_lds (3 or 4 args: dst, src, num_elems,
+  // [predicate]) and ptx_cp_async_lds_rsrc (5 args: dst, src, num_elems,
+  // rsrc_var, base_var) -- only args[0..2] are read here.
+  ICHECK(op->args.size() == 3 || op->args.size() == 4 || op->args.size() == 5)
+      << "tl::ptx_cp_async family expects 3-5 arguments (dst_access_ptr, "
+         "src_access_ptr, num_elems, ...)";
   const auto *num_elems_imm = op->args[2].as<IntImmNode>();
   ICHECK(num_elems_imm) << "tl::ptx_cp_async num_elems must be IntImm, but got "
                         << op->args[2];
@@ -1169,9 +1172,31 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
     }
     this->stream << ");\n";
   };
-  if (op->op.same_as(builtin::ptx_cp_async())) {
-    // args[0] = dst_access_ptr, args[1] = src_access_ptr, args[2] = bytes,
-    // args[3] = predicate (optional)
+  if (op->op.same_as(tl::ptx_make_buffer_resource())) {
+    // Expression form: emits make_wave_buffer_resource((const void*)(ptr)).
+    // The BindNode statement visitor recognises this Call and emits `auto x =`.
+    ICHECK(op->args.size() == 1)
+        << "ptx_make_buffer_resource expects 1 argument (global_ptr)";
+    std::string ptr = this->PrintExpr(op->args[0]);
+    os << "make_wave_buffer_resource((const void*)(" << ptr << "))";
+  } else if (op->op.same_as(tl::ptx_cp_async_lds_rsrc())) {
+    // args = [dst, src, num_elems, rsrc_var, base_var]. arg 2 is the logical
+    // element count inherited from the ptx_cp_async_lds call that
+    // HoistBufferResource rewrote into this rsrc form -- the helper does the
+    // src/dst width-equality and {4,8,16} validation that the plain
+    // ptx_cp_async path also relies on.
+    ICHECK(op->args.size() == 5) << "ptx_cp_async_lds_rsrc expects 5 arguments";
+    std::string dst = this->PrintExpr(op->args[0]);
+    std::string src = this->PrintExpr(op->args[1]);
+    int total_bytes = GetTileLangCPAsyncTransferBytes(op);
+    std::string size = std::to_string(total_bytes);
+    std::string rsrc = this->PrintExpr(op->args[3]);
+    std::string base = this->PrintExpr(op->args[4]);
+    this->PrintIndent();
+    this->stream << "tl::cp_async_gs_lds_with_rsrc<" << size << ">(" << dst
+                 << ", " << src << ", " << rsrc << ", " << base << ");\n";
+  } else if (op->op.same_as(builtin::ptx_cp_async())) {
+    // builtin::ptx_cp_async stores byte width directly in arg 2.
     ICHECK(op->args.size() == 3 || op->args.size() == 4)
         << "ptx_cp_async expects 3 or 4 arguments (dst_access_ptr, "
            "src_access_ptr, bytes, [predicate])";
@@ -1189,7 +1214,18 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
       this->stream << "tl::cp_async_gs_conditional<" << size << ">(" << dst
                    << ", " << src << ", " << condition << ");\n";
     }
-  } else if (op->op.same_as(tl::ptx_cp_async())) {
+  } else if (op->op.same_as(tl::ptx_cp_async()) ||
+             op->op.same_as(tl::ptx_cp_async_lds())) {
+    // Both store logical element count in arg 2; convert to bytes via
+    // GetTileLangCPAsyncTransferBytes.
+    //
+    // tl::ptx_cp_async_lds is normally rewritten to ptx_cp_async_lds_rsrc
+    // by the HoistBufferResource pass. If a call survives the rewrite
+    // (e.g. an access_ptr shape _extract_buffer_var can't pattern-match,
+    // or the pass found nothing to hoist), fall back to the synchronous
+    // tl::cp_async_gs<bytes> path here -- correctness is preserved at
+    // the cost of giving up the buffer_load_dwordx4...lds fast path for
+    // that particular call. Treat both ops identically in codegen.
     int total_bytes = GetTileLangCPAsyncTransferBytes(op);
     std::string dst = this->PrintExpr(op->args[0]);
     std::string src = this->PrintExpr(op->args[1]);
@@ -1207,6 +1243,10 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
     print_extern_call_stmt("tl::cp_async_commit");
   } else if (op->op.same_as(builtin::ptx_wait_group())) {
     int n = Downcast<IntImm>(op->args[0])->value;
+    // AMDGPU s_waitcnt vmcnt field is 6-bit (max 63); clamp to keep the
+    // "n"(cnt) immediate constraint in tl::cp_async_wait valid.
+    if (n > 63)
+      n = 63;
     std::string func_name = "tl::cp_async_wait<" + std::to_string(n) + ">";
     print_extern_call_stmt(func_name, 1);
   } else if (op->op.same_as(builtin::create_barriers())) {
@@ -1693,7 +1733,33 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
 }
 
 void CodeGenTileLangHIP::VisitStmt_(const AttrStmtNode *op) {
-  if (op->attr_key == tl::attr::kLexicalAllocScope) {
+  if (op->attr_key == "buffer_resource_var") {
+    // Hoisted resource descriptor from the HoistBufferResource Python pass.
+    // Emits: auto {rsrc_var} = make_wave_buffer_resource((const
+    // void*)({buf_var}));
+    auto rsrc_var = Downcast<Var>(op->node);
+    std::string rsrc_vid = AllocVarID(rsrc_var.get());
+    std::string buf_ptr = PrintExpr(op->value);
+    this->PrintIndent();
+    this->stream << "auto " << rsrc_vid
+                 << " = make_wave_buffer_resource((const void*)(" << buf_ptr
+                 << "));\n";
+    this->VisitStmt(op->body);
+    return;
+  } else if (op->attr_key == "buffer_base_var") {
+    // Hoisted readfirstlane base address from the HoistBufferResource pass.
+    // Emits: uint32_t {base_var} = __builtin_amdgcn_readfirstlane(
+    //            (uint32_t)(uintptr_t)({buf_var}));
+    auto base_var = Downcast<Var>(op->node);
+    std::string base_vid = AllocVarID(base_var.get());
+    std::string buf_ptr = PrintExpr(op->value);
+    this->PrintIndent();
+    this->stream << "uint32_t " << base_vid
+                 << " = __builtin_amdgcn_readfirstlane("
+                 << "(uint32_t)(uintptr_t)(" << buf_ptr << "));\n";
+    this->VisitStmt(op->body);
+    return;
+  } else if (op->attr_key == tl::attr::kLexicalAllocScope) {
     PrintIndent();
     stream << "{\n";
     int scope = BeginScope();
@@ -1728,6 +1794,24 @@ void CodeGenTileLangHIP::VisitStmt_(const AttrStmtNode *op) {
   CodeGenC::VisitStmt_(op);
 }
 
+void CodeGenTileLangHIP::VisitStmt_(const BindNode *op) {
+  // Only Bind(var = ptx_make_buffer_resource(buf)) is special-cased; every
+  // other bind is forwarded to CodeGenC unchanged.
+  const auto *call = op->value.as<CallNode>();
+  if (call == nullptr || !call->op.same_as(tl::ptx_make_buffer_resource())) {
+    CodeGenC::VisitStmt_(op);
+    return;
+  }
+  // Declare the descriptor with `auto` instead of a spelled-out C type: the
+  // value is an int32x4_t, and letting make_wave_buffer_resource determine
+  // the type avoids hard-coding it here. Statements that follow the bind
+  // are emitted by the enclosing SeqStmt visitor, so nothing else is
+  // visited from this function.
+  const std::string rhs = PrintExpr(op->value);
+  PrintIndent();
+  stream << "auto " << AllocVarID(op->var.get()) << " = " << rhs << ";\n";
+}
+
 void CodeGenTileLangHIP::VisitStmt_(const AllocBufferNode *op) {
   std::string vid = AllocVarID(op->buffer->data.get());
 

diff --git a/src/backend/rocm/codegen/codegen_hip.h b/src/backend/rocm/codegen/codegen_hip.h
@@ -53,6 +53,7 @@ class CodeGenTileLangHIP final : public CodeGenC {
   void VisitExpr_(const ShuffleNode *op, std::ostream &os) final; // NOLINT(*)
   void VisitStmt_(const AllocBufferNode *op) final;
   void VisitStmt_(const AttrStmtNode *op) final;
+  void VisitStmt_(const BindNode *op) final;
   void VisitStmt_(const BufferStoreNode *op) final;
 
   // Override this as a work around for __grid_constant__ parameter

diff --git a/src/backend/rocm/op/copy.cc b/src/backend/rocm/op/copy.cc
@@ -133,7 +133,9 @@ struct Copy {
     auto inject_result =
         InjectPTXAsyncCopy(lowered_loop, /*enable_auto_async_copy=*/true,
                            /*async_without_async_commit_wait=*/
-                           no_implicit_commit_wait || GetIsAsyncCopy(op));
+                           no_implicit_commit_wait || GetIsAsyncCopy(op),
+                           /*enable_buffer_load_lds=*/
+                           TargetIsGfx950(T.target));
     Stmt cp_async_loop = inject_result.stmt;
     if (!inject_result.injected_ptx_async_copy) {
       DLOG(WARNING) << "cp.async rewrite miss for copy src=" << op.src->name

diff --git a/src/layout/gemm_layouts.cc b/src/layout/gemm_layouts.cc
@@ -457,7 +457,10 @@ static Layout MakeQuarterBankSwizzleLayout2D(int stride, int continuous,
   PrimExpr vec = FloorMod(j, vector_size);
   PrimExpr c_swizzle = xor2x2(c, FloorDiv(s, 4));
   PrimExpr index = vec + (c_swizzle + s * 2) * vector_size;
-  return Layout(Array<PrimExpr>{stride, continuous}, {tc, ts, index});
+  PrimExpr swizzle_delta = (c_swizzle - c) * vector_size;
+  Layout result(Array<PrimExpr>{stride, continuous}, {tc, ts, index});
+  const_cast<LayoutNode *>(result.get())->SetSwizzleDelta(swizzle_delta);
+  return result;
 }
 
 Layout makeQuarterBankSwizzleLayout(const Buffer &buffer) {
@@ -486,7 +489,10 @@ static Layout MakeHalfBankSwizzleLayout2D(int stride, int continuous,
   PrimExpr vec = FloorMod(j, vector_size);
   PrimExpr c_swizzle = xor4x4(c, FloorDiv(s, 2));
   PrimExpr index = vec + (c_swizzle + s * 4) * vector_size;
-  return Layout(Array<PrimExpr>{stride, continuous}, {tc, ts, index});
+  PrimExpr swizzle_delta = (c_swizzle - c) * vector_size;
+  Layout result(Array<PrimExpr>{stride, continuous}, {tc, ts, index});
+  const_cast<LayoutNode *>(result.get())->SetSwizzleDelta(swizzle_delta);
+  return result;
 }
 
 Layout makeHalfBankSwizzleLayout(const Buffer &buffer) {
@@ -515,7 +521,10 @@ static Layout MakeFullBankSwizzleLayout2D(int stride, int continuous,
   PrimExpr vec = FloorMod(j, vector_size);
   PrimExpr c_swizzle = xor8x8(c, s);
   PrimExpr index = vec + (c_swizzle + s * 8) * vector_size;
-  return Layout(Array<PrimExpr>{stride, continuous}, {tc, ts, index});
+  PrimExpr swizzle_delta = (c_swizzle - c) * vector_size;
+  Layout result(Array<PrimExpr>{stride, continuous}, {tc, ts, index});
+  const_cast<LayoutNode *>(result.get())->SetSwizzleDelta(swizzle_delta);
+  return result;
 }
 
 Layout makeFullBankSwizzleLayout(const Buffer &buffer) {

diff --git a/src/layout/layout.cc b/src/layout/layout.cc
@@ -504,7 +504,33 @@ Layout LayoutNode::Expand(const Array<PrimExpr> &leading_shape) const {
     new_forward_index.push_back(Substitute(e, vmap));
   }
 
-  return Layout(new_input_size, new_forward_index);
+  Layout result(new_input_size, new_forward_index);
+  // Propagate swizzle_delta_ through Expand: substitute placeholder
+  // indices so the delta keeps referring to the same physical input
+  // dimension after the leading-shape prefix is added.
+  if (swizzle_delta_.defined()) {
+    const_cast<LayoutNode *>(result.get())
+        ->SetSwizzleDelta(Substitute(swizzle_delta_.value(), vmap));
+  }
+  return result;
+}
+
+PrimExpr LayoutNode::SwizzleDelta(const Array<PrimExpr> &input_indices) const {
+  if (!swizzle_delta_.defined()) {
+    return IntImm(DataType::Int(32), 0);
+  }
+  // Bind the trailing InputDim() indices to the placeholders and substitute
+  // once: chained per-placeholder passes would re-rewrite placeholders that
+  // appear inside an index that was already substituted in.
+  const size_t ndim = InputDim();
+  ICHECK_GE(input_indices.size(), ndim)
+      << "SwizzleDelta requires at least " << ndim << " indices, but got "
+      << input_indices.size();
+  Map<Var, PrimExpr> vmap;
+  for (size_t i = 0, offset = input_indices.size() - ndim; i < ndim; ++i) {
+    vmap.Set(InputPlaceholder(i), input_indices[offset + i]);
+  }
+  return Substitute(swizzle_delta_.value(), vmap);
 }
 
 Fragment FragmentNode::Repeat(const Array<PrimExpr> &repeats,

diff --git a/src/layout/layout.h b/src/layout/layout.h
@@ -102,6 +102,24 @@ class LayoutNode : public Object {
 
   virtual bool IsEqual(const LayoutNode *other, bool skip_index = false) const;
 
+  /*!
+   * \brief Get the XOR swizzle column delta on the last input dimension.
+   *
+   * For swizzled layouts (quarter/half/full bank) returns the column
+   * delta caused by the XOR: delta = (c_swizzled - c) * vector_size,
+   * substituted against the supplied indices. For non-swizzle layouts
+   * returns 0. Used by the swizzle-swap optimisation in lower_tile_op.cc
+   * to move the XOR off the LDS-store side and onto the global-load
+   * side when the target supports buffer_load ... lds direct DMA.
+   */
+  virtual PrimExpr SwizzleDelta(const Array<PrimExpr> &input_indices) const;
+
+  /*! \brief Whether a swizzle delta has been set (it may still be zero). */
+  bool HasSwizzle() const { return swizzle_delta_.defined(); }
+
+  /*! \brief Set the swizzle delta expression (called by layout factories). */
+  void SetSwizzleDelta(PrimExpr delta) { swizzle_delta_ = delta; }
+
   static void RegisterReflection();
   TVM_FFI_DECLARE_OBJECT_INFO("tl.Layout", LayoutNode, Object);
   static constexpr TVMFFISEqHashKind _type_s_eq_hash_kind =
@@ -112,6 +130,11 @@ class LayoutNode : public Object {
   void UpdateAnalyzer(arith::Analyzer *analyzer) const;
   Array<PrimExpr> forward_index_;
   Array<PrimExpr> input_size_;
+  /*!
+   * \brief Optional XOR swizzle delta in terms of InputPlaceholders, set
+   * by swizzle layout factories and carried over by Expand().
+   */
+  Optional<PrimExpr> swizzle_delta_;
 };
 
 /*!

diff --git a/src/op/builtin.cc b/src/op/builtin.cc
@@ -306,6 +306,21 @@ TIR_DEFINE_TL_BUILTIN(ptx_cp_async)
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kOpaque));
 
+TIR_DEFINE_TL_BUILTIN(ptx_cp_async_lds)
+    .set_num_inputs(-1) // arity is ICHECKed in the HIP codegen instead
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_TL_BUILTIN(ptx_make_buffer_resource)
+    .set_num_inputs(1) // single argument: the global pointer
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_TL_BUILTIN(ptx_cp_async_lds_rsrc)
+    .set_num_inputs(-1) // HIP codegen ICHECKs for exactly 5 arguments
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kOpaque));
+
 TIR_DEFINE_TL_BUILTIN(fence_proxy_async)
     .set_num_inputs(0)
     .set_attr<TCallEffectKind>("TCallEffectKind",

diff --git a/src/op/builtin.h b/src/op/builtin.h
@@ -516,6 +516,58 @@ TVM_DLL const Op &ptx_cp_async_barrier_noinc();
  */
 TVM_DLL const Op &ptx_cp_async();
 
+/*!
+ * \brief Marker for an eligible async G2S copy on gfx950+.
+ *
+ * Emitted by LowerPTXAsyncCopy in place of ptx_cp_async for 16-byte
+ * non-predicated shared-memory writes whose LDS index is (post
+ * swizzle-swap, see lower_tile_op.cc) lane-contiguous. The
+ * HoistBufferResource pass then rewrites each call to
+ * ptx_cp_async_lds_rsrc with a pre-computed buffer resource
+ * descriptor + base address; that rsrc form is what codegen emits as
+ * the buffer_load_dwordx4 ... lds fast path.
+ *
+ * If a call survives the rewrite (e.g. an access_ptr the hoister
+ * can't pattern-match), codegen falls back to the synchronous
+ * tl::cp_async_gs<N> path -- correct, but no buffer_load_lds win.
+ *
+ * ptx_cp_async_lds(dst_access_ptr, src_access_ptr, num_elems)
+ *
+ * num_elems is the logical element count (NOT byte width). Lowering
+ * derives the {4, 8, 16} byte transfer width from the access-ptr dtype.
+ * Passing this as elements keeps vec-loop folding in vectorize_loop.cc
+ * (which multiplies the count when it widens a loop) consistent with
+ * the plain ptx_cp_async path.
+ */
+TVM_DLL const Op &ptx_cp_async_lds();
+
+/*!
+ * \brief Create a buffer resource descriptor for async G2S LDS copy (gfx950+).
+ *
+ * ptx_make_buffer_resource(global_ptr)
+ *
+ * Returns an int32x4_t buffer resource descriptor via
+ * make_wave_buffer_resource (defined in src/tl_templates/hip/copy.h).
+ */
+TVM_DLL const Op &ptx_make_buffer_resource();
+
+/*!
+ * \brief Truly async G2S copy with pre-computed buffer resource (gfx950+).
+ *
+ * Same as ptx_cp_async_lds but takes a pre-hoisted buffer resource
+ * descriptor + base address to avoid redundant readfirstlane /
+ * make_wave_buffer_resource calls inside unrolled loops. The
+ * HoistBufferResource Python pass rewrites ptx_cp_async_lds calls to this
+ * form once per kernel.
+ *
+ * ptx_cp_async_lds_rsrc(dst_access_ptr, src_access_ptr, num_elems, rsrc_var,
+ *                       base_var)
+ *
+ * num_elems uses the same convention as ptx_cp_async_lds -- logical
+ * element count, not bytes; lowering converts via the access-ptr dtype.
+ */
+TVM_DLL const Op &ptx_cp_async_lds_rsrc();
+
 /*!
  * \brief Pack two b16 value into a b32 value
  *