Skip to content

[RISCV] Lower the paadd/pasub intrinsics to existing ISD nodes. NFC#203646

Open
topperc wants to merge 1 commit into
llvm:mainfrom
topperc:pr/paadd
Open

[RISCV] Lower the paadd/pasub intrinsics to existing ISD nodes. NFC#203646
topperc wants to merge 1 commit into
llvm:mainfrom
topperc:pr/paadd

Conversation

@topperc

@topperc topperc commented Jun 12, 2026

Copy link
Copy Markdown
Contributor

This avoids having 2 different isel patterns for the same operation.

This avoids having 2 different isel patterns for the same operation.
@llvmorg-github-actions

Copy link
Copy Markdown

@llvm/pr-subscribers-backend-risc-v

Author: Craig Topper (topperc)

Changes

This avoids having 2 different isel patterns for the same operation.


Full diff: https://github.com/llvm/llvm-project/pull/203646.diff

2 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+40-2)
  • (modified) llvm/lib/Target/RISCV/RISCVInstrInfoP.td (-27)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 57af056c2d3c0..4a937fe2c6462 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -11736,6 +11736,29 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     unsigned Opc = IntNo == Intrinsic::riscv_clmulh ? ISD::CLMULH : ISD::CLMULR;
     return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2));
   }
+  case Intrinsic::riscv_paadd:
+  case Intrinsic::riscv_paaddu:
+  case Intrinsic::riscv_pasub:
+  case Intrinsic::riscv_pasubu: {
+    unsigned Opc;
+    switch (IntNo) {
+    case Intrinsic::riscv_paadd:
+      Opc = ISD::AVGFLOORS;
+      break;
+    case Intrinsic::riscv_paaddu:
+      Opc = ISD::AVGFLOORU;
+      break;
+    case Intrinsic::riscv_pasub:
+      Opc = RISCVISD::ASUB;
+      break;
+    case Intrinsic::riscv_pasubu:
+      Opc = RISCVISD::ASUBU;
+      break;
+    }
+
+    return DAG.getNode(Opc, DL, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2));
+  }
   case Intrinsic::experimental_get_vector_length:
     return lowerGetVectorLength(Op.getNode(), DAG, Subtarget);
   case Intrinsic::riscv_vmv_x_s: {
@@ -15648,14 +15671,29 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       if (!Subtarget.is64Bit() || (VT != MVT::v4i8 && VT != MVT::v2i16))
         return;
 
+      unsigned Opc;
+      switch (IntNo) {
+      case Intrinsic::riscv_paadd:
+        Opc = ISD::AVGFLOORS;
+        break;
+      case Intrinsic::riscv_paaddu:
+        Opc = ISD::AVGFLOORU;
+        break;
+      case Intrinsic::riscv_pasub:
+        Opc = RISCVISD::ASUB;
+        break;
+      case Intrinsic::riscv_pasubu:
+        Opc = RISCVISD::ASUBU;
+        break;
+      }
+
       EVT WideVT = VT == MVT::v4i8 ? MVT::v8i8 : MVT::v4i16;
       SDValue Undef = DAG.getUNDEF(VT);
       SDValue Op0 =
           DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, N->getOperand(1), Undef);
       SDValue Op1 =
           DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, N->getOperand(2), Undef);
-      SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WideVT,
-                                N->getOperand(0), Op0, Op1);
+      SDValue Res = DAG.getNode(Opc, DL, WideVT, Op0, Op1);
       Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                                     DAG.getVectorIdxConstant(0, DL)));
       return;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index fe1e459967e92..79ff90545b68a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1990,22 +1990,12 @@ let Predicates = [HasStdExtP] in {
   def : PatGprGpr<riscv_asub, PASUB_B, XLenVecI8VT>;
   def : PatGprGpr<riscv_asubu, PASUBU_B, XLenVecI8VT>;
 
-  def : PatGprGpr<int_riscv_paadd, PAADD_B, XLenVecI8VT>;
-  def : PatGprGpr<int_riscv_paaddu, PAADDU_B, XLenVecI8VT>;
-  def : PatGprGpr<int_riscv_pasub, PASUB_B, XLenVecI8VT>;
-  def : PatGprGpr<int_riscv_pasubu, PASUBU_B, XLenVecI8VT>;
-
   // 16-bit averaging patterns
   def : PatGprGpr<avgfloors, PAADD_H, XLenVecI16VT>;
   def : PatGprGpr<avgflooru, PAADDU_H, XLenVecI16VT>;
   def : PatGprGpr<riscv_asub, PASUB_H, XLenVecI16VT>;
   def : PatGprGpr<riscv_asubu, PASUBU_H, XLenVecI16VT>;
 
-  def : PatGprGpr<int_riscv_paadd, PAADD_H, XLenVecI16VT>;
-  def : PatGprGpr<int_riscv_paaddu, PAADDU_H, XLenVecI16VT>;
-  def : PatGprGpr<int_riscv_pasub, PASUB_H, XLenVecI16VT>;
-  def : PatGprGpr<int_riscv_pasubu, PASUBU_H, XLenVecI16VT>;
-
   // 8-bit absolute difference patterns
   def : Pat<(XLenVecI8VT (abs GPR:$rs1)), (PABD_B GPR:$rs1, (XLenVecI8VT X0))>;
   def : PatGprGpr<abds, PABD_B, XLenVecI8VT>;
@@ -2280,19 +2270,6 @@ let append Predicates = [IsRV32] in {
   def : PatGprPairGprPair<riscv_asub, PASUB_DW, v2i32>;
   def : PatGprPairGprPair<riscv_asubu, PASUBU_DW, v2i32>;
 
-  def : PatGprPairGprPair<int_riscv_paadd, PAADD_DB, v8i8>;
-  def : PatGprPairGprPair<int_riscv_paaddu, PAADDU_DB, v8i8>;
-  def : PatGprPairGprPair<int_riscv_pasub, PASUB_DB, v8i8>;
-  def : PatGprPairGprPair<int_riscv_pasubu, PASUBU_DB, v8i8>;
-  def : PatGprPairGprPair<int_riscv_paadd, PAADD_DH, v4i16>;
-  def : PatGprPairGprPair<int_riscv_paaddu, PAADDU_DH, v4i16>;
-  def : PatGprPairGprPair<int_riscv_pasub, PASUB_DH, v4i16>;
-  def : PatGprPairGprPair<int_riscv_pasubu, PASUBU_DH, v4i16>;
-  def : PatGprPairGprPair<int_riscv_paadd, PAADD_DW, v2i32>;
-  def : PatGprPairGprPair<int_riscv_paaddu, PAADDU_DW, v2i32>;
-  def : PatGprPairGprPair<int_riscv_pasub, PASUB_DW, v2i32>;
-  def : PatGprPairGprPair<int_riscv_pasubu, PASUBU_DW, v2i32>;
-
   // 8-bit absolute difference patterns
   def : Pat<(v8i8 (abs GPRPair:$rs1)), (PABD_DB GPRPair:$rs1, (v8i8 X0_Pair))>;
   def : PatGprPairGprPair<abds, PABD_DB, v8i8>;
@@ -2501,14 +2478,10 @@ let append Predicates = [IsRV64] in {
   // 32-bit averaging patterns
   def : PatGprGpr<avgfloors, PAADD_W, v2i32>;
   def : PatGprGpr<avgflooru, PAADDU_W, v2i32>;
-  def : PatGprGpr<int_riscv_paadd, PAADD_W, v2i32>;
-  def : PatGprGpr<int_riscv_paaddu, PAADDU_W, v2i32>;
 
   // 32-bit averaging-sub patterns
   def : PatGprGpr<riscv_asub, PASUB_W, v2i32>;
   def : PatGprGpr<riscv_asubu, PASUBU_W, v2i32>;
-  def : PatGprGpr<int_riscv_pasub, PASUB_W, v2i32>;
-  def : PatGprGpr<int_riscv_pasubu, PASUBU_W, v2i32>;
 
   // 32-bit multiply high patterns
   def : PatGprGpr<mulhs, PMULH_W, v2i32>;

@sihuan

sihuan commented Jun 13, 2026

Copy link
Copy Markdown
Contributor

I noticed DAGCombiner already folds the full-precision (a +/- b) >> 1 into ISD::AVGFLOORS/ISD::AVGFLOORU (for the add) and RISCVISD::ASUB/ASUBU (for the sub). Given that, should we maybe go a step further and drop the llvm ir intrinsics entirely? Clang could emit the plain (a +/- b) >> 1 directly, with no llvm.riscv.* in the IR.
I tried a header wrapper along these lines:

return __builtin_convertvector(
    (__builtin_convertvector(a, i16x4) + __builtin_convertvector(b, i16x4)) >> 1,
    i8x4);

which produces

%0 = sext <4 x i8> %a to <4 x i16>
%1 = sext <4 x i8> %b to <4 x i16>
%2 = add nsw <4 x i16> %1, %0
%3 = lshr <4 x i16> %2, splat (i16 1)
%4 = trunc <4 x i16> %3 to <4 x i8>

and selects to the expected instructions. As a side benefit, the RV64 widening for the 32-bit vector types would not be needed.

@topperc

topperc commented Jun 13, 2026

Copy link
Copy Markdown
Contributor Author

I noticed DAGCombiner already folds the full-precision (a +/- b) >> 1 into ISD::AVGFLOORS/ISD::AVGFLOORU (for the add) and RISCVISD::ASUB/ASUBU (for the sub). Given that, should we maybe go a step further and drop the llvm ir intrinsics entirely? Clang could emit the plain (a +/- b) >> 1 directly, with no llvm.riscv.* in the IR. I tried a header wrapper along these lines:

return __builtin_convertvector(
    (__builtin_convertvector(a, i16x4) + __builtin_convertvector(b, i16x4)) >> 1,
    i8x4);

which produces

%0 = sext <4 x i8> %a to <4 x i16>
%1 = sext <4 x i8> %b to <4 x i16>
%2 = add nsw <4 x i16> %1, %0
%3 = lshr <4 x i16> %2, splat (i16 1)
%4 = trunc <4 x i16> %3 to <4 x i8>

and selects to the expected instructions. As a side benefit, the RV64 widening for the 32-bit vector types would not be needed.

If the pattern gets broken by another transform, we won't produce the expected instruction for the intrinsic. That's not a good experience for intrinsic programmers.

@sihuan

sihuan commented Jun 13, 2026

Copy link
Copy Markdown
Contributor

That makes sense, thanks — a fragile IR pattern isn't a guarantee we can hand intrinsic users, so keeping the explicit intrinsics is the right tradeoff. LGTM.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants