[AArch64]Fix specification of NEON FP16 scalar CVT to/from fixed point#201425
[AArch64]Fix specification of NEON FP16 scalar CVT to/from fixed point#201425CarolineConcatto wants to merge 1 commit into
Conversation
This patch implements the changes proposed in[1] for SelectionDAG and GlobalISel [1]ARM-software/acle#435
|
@llvm/pr-subscribers-backend-aarch64 Author: CarolineConcatto ChangesThis patch implements the changes proposed in[1] for SelectionDAG and GlobalISel Full diff: https://github.com/llvm/llvm-project/pull/201425.diff 4 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 30b621bab41aa..e4e741f80ab2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9167,10 +9167,23 @@ def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
(SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
-// Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported.
+def fixedpoint_scalar_xform : SDNodeXForm<timm, [{
+ (void)N;
+ return V;
+}]>;
+def gi_fixedpoint_scalar_xform
+ : GICustomOperandRenderer<"renderFixedPointScalarXForm">,
+ GISDNodeXFormEquiv<fixedpoint_scalar_xform>;
+// Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported.
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), vecshiftR16:$imm)),
(SCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
+let AddedComplexity = 1 in {
+def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 GPR32:$Rn), vecshiftR16:$imm)),
+ (SCVTFSWHri GPR32:$Rn, (fixedpoint_scalar_xform vecshiftR16:$imm))>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 GPR64:$Rn), vecshiftR16:$imm)),
+ (SCVTFSXHri GPR64:$Rn, (fixedpoint_scalar_xform vecshiftR16:$imm))>;
+}
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)),
(SCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
@@ -9179,10 +9192,26 @@ def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp
(and FPR32:$Rn, (i32 65535)),
vecshiftR16:$imm)),
(UCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
+let AddedComplexity = 1 in {
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp GPR32:$Rn, vecshiftR16:$imm)),
+ (UCVTFSWHri GPR32:$Rn, (fixedpoint_scalar_xform vecshiftR16:$imm))>;
+def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 GPR64:$Rn), vecshiftR16:$imm)),
+ (UCVTFSXHri GPR64:$Rn, (fixedpoint_scalar_xform vecshiftR16:$imm))>;
+}
def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR16:$imm)),
(UCVTFh (f16 (EXTRACT_SUBREG FPR32:$Rn, hsub)), vecshiftR16:$imm)>;
def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR16:$imm)),
(UCVTFh (f16 (EXTRACT_SUBREG FPR64:$Rn, hsub)), vecshiftR16:$imm)>;
+let AddedComplexity = 1 in {
+def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
+ (FCVTZSSWHri FPR16:$Rn, (fixedpoint_scalar_xform vecshiftR32:$imm))>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR64:$imm)),
+ (FCVTZSSXHri FPR16:$Rn, (fixedpoint_scalar_xform vecshiftR64:$imm))>;
+def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR32:$imm)),
+ (FCVTZUSWHri FPR16:$Rn, (fixedpoint_scalar_xform vecshiftR32:$imm))>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)),
+ (FCVTZUSXHri FPR16:$Rn, (fixedpoint_scalar_xform vecshiftR64:$imm))>;
+}
def : Pat<(i32 (int_aarch64_neon_vcvtfp2fxs (f16 FPR16:$Rn), vecshiftR32:$imm)),
(i32 (INSERT_SUBREG
(i32 (IMPLICIT_DEF)),
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index cf650fd5c4e72..6ad1a061ddfef 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -488,6 +488,8 @@ class AArch64InstructionSelector : public InstructionSelector {
ComplexRendererFns
selectCVTFixedPointVecBase(const MachineOperand &Root,
bool isReciprocal = false) const;
+ void renderFixedPointScalarXForm(MachineInstrBuilder &MIB,
+ const MachineInstr &MI, int OpIdx) const;
void renderFixedPointXForm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx = -1) const;
void renderFixedPointRecipXForm(MachineInstrBuilder &MIB,
@@ -7941,6 +7943,13 @@ void AArch64InstructionSelector::renderFixedPointXForm(MachineInstrBuilder &MIB,
(Renderer->front())(MIB);
}
+void AArch64InstructionSelector::renderFixedPointScalarXForm(
+ MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
+ assert(OpIdx == 3 && MI.getOperand(OpIdx).isImm() &&
+ "Expected fixed-point conversion intrinsic immediate");
+ MIB.addImm(MI.getOperand(OpIdx).getImm());
+}
+
void AArch64InstructionSelector::renderFixedPointRecipXForm(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
InstructionSelector::ComplexRendererFns Renderer =
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index e814316b0f2ed..088107b5f8594 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -1444,17 +1444,28 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::aarch64_neon_vcvtfxs2fp:
- case Intrinsic::aarch64_neon_vcvtfxu2fp:
- case Intrinsic::aarch64_neon_vcvtfp2fxs:
- case Intrinsic::aarch64_neon_vcvtfp2fxu:
+ case Intrinsic::aarch64_neon_vcvtfxu2fp: {
// Override these intrinsics, because they would have a partial
// mapping. This is needed for 'half' types, which otherwise don't
// get legalised correctly.
OpRegBankIdx[0] = PMI_FirstFPR;
+ OpRegBankIdx[2] = MRI.getType(MI.getOperand(2).getReg()).isVector()
+ ? PMI_FirstFPR
+ : PMI_FirstGPR;
+ // OpRegBankIdx[1] is the intrinsic ID.
+ // OpRegBankIdx[3] is an integer immediate.
+ break;
+ }
+ case Intrinsic::aarch64_neon_vcvtfp2fxs:
+ case Intrinsic::aarch64_neon_vcvtfp2fxu: {
+ OpRegBankIdx[0] = MRI.getType(MI.getOperand(0).getReg()).isVector()
+ ? PMI_FirstFPR
+ : PMI_FirstGPR;
OpRegBankIdx[2] = PMI_FirstFPR;
// OpRegBankIdx[1] is the intrinsic ID.
// OpRegBankIdx[3] is an integer immediate.
break;
+ }
default: {
// Check if we know that the intrinsic has any constraints on its register
// banks. If it does, then update the mapping accordingly.
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
index da70599483a63..206d61e9e3b9a 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll
@@ -196,8 +196,7 @@ define dso_local half @test_vcvth_n_f16_s16_1(i16 %a) {
; CHECK-GI-LABEL: test_vcvth_n_f16_s16_1:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxth w8, w0
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: scvtf h0, h0, #1
+; CHECK-GI-NEXT: scvtf h0, w8, #1
; CHECK-GI-NEXT: ret
entry:
%sext = sext i16 %a to i32
@@ -215,8 +214,7 @@ define dso_local half @test_vcvth_n_f16_s16_16(i16 %a) {
; CHECK-GI-LABEL: test_vcvth_n_f16_s16_16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxth w8, w0
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: scvtf h0, h0, #16
+; CHECK-GI-NEXT: scvtf h0, w8, #16
; CHECK-GI-NEXT: ret
entry:
%sext = sext i16 %a to i32
@@ -227,8 +225,7 @@ entry:
define dso_local half @test_vcvth_n_f16_s32_1(i32 %a) {
; CHECK-LABEL: test_vcvth_n_f16_s32_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: scvtf h0, h0, #1
+; CHECK-NEXT: scvtf h0, w0, #1
; CHECK-NEXT: ret
entry:
%vcvth_n_f16_s32 = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 1)
@@ -238,8 +235,7 @@ entry:
define dso_local half @test_vcvth_n_f16_s32_16(i32 %a) {
; CHECK-LABEL: test_vcvth_n_f16_s32_16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: scvtf h0, h0, #16
+; CHECK-NEXT: scvtf h0, w0, #16
; CHECK-NEXT: ret
entry:
%vcvth_n_f16_s32 = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 16)
@@ -249,8 +245,7 @@ entry:
define dso_local i16 @test_vcvth_n_s16_f16_1(half %a) {
; CHECK-LABEL: test_vcvth_n_s16_f16_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0, #1
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzs w0, h0, #1
; CHECK-NEXT: ret
entry:
%fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 1)
@@ -261,8 +256,7 @@ entry:
define dso_local i16 @test_vcvth_n_s16_f16_16(half %a) {
; CHECK-LABEL: test_vcvth_n_s16_f16_16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0, #16
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzs w0, h0, #16
; CHECK-NEXT: ret
entry:
%fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 16)
@@ -273,8 +267,7 @@ entry:
define dso_local i32 @test_vcvth_n_s32_f16_1(half %a) {
; CHECK-LABEL: test_vcvth_n_s32_f16_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0, #1
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzs w0, h0, #1
; CHECK-NEXT: ret
entry:
%vcvth_n_s32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 1)
@@ -284,8 +277,7 @@ entry:
define dso_local i32 @test_vcvth_n_s32_f16_16(half %a) {
; CHECK-LABEL: test_vcvth_n_s32_f16_16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0, #16
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzs w0, h0, #16
; CHECK-NEXT: ret
entry:
%vcvth_n_s32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 16)
@@ -295,8 +287,7 @@ entry:
define dso_local i64 @test_vcvth_n_s64_f16_1(half %a) {
; CHECK-LABEL: test_vcvth_n_s64_f16_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0, #1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: fcvtzs x0, h0, #1
; CHECK-NEXT: ret
entry:
%vcvth_n_s64_f16 = tail call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 1)
@@ -306,8 +297,7 @@ entry:
define dso_local i64 @test_vcvth_n_s64_f16_32(half %a) {
; CHECK-LABEL: test_vcvth_n_s64_f16_32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzs h0, h0, #32
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: fcvtzs x0, h0, #32
; CHECK-NEXT: ret
entry:
%vcvth_n_s64_f16 = tail call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 32)
@@ -324,8 +314,7 @@ define dso_local half @test_vcvth_n_f16_u16_1(i16 %a) {
; CHECK-GI-LABEL: test_vcvth_n_f16_u16_1:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w0, #0xffff
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: ucvtf h0, h0, #1
+; CHECK-GI-NEXT: ucvtf h0, w8, #1
; CHECK-GI-NEXT: ret
entry:
%0 = zext i16 %a to i32
@@ -343,8 +332,7 @@ define dso_local half @test_vcvth_n_f16_u16_16(i16 %a) {
; CHECK-GI-LABEL: test_vcvth_n_f16_u16_16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: and w8, w0, #0xffff
-; CHECK-GI-NEXT: fmov s0, w8
-; CHECK-GI-NEXT: ucvtf h0, h0, #16
+; CHECK-GI-NEXT: ucvtf h0, w8, #16
; CHECK-GI-NEXT: ret
entry:
%0 = zext i16 %a to i32
@@ -355,8 +343,7 @@ entry:
define dso_local half @test_vcvth_n_f16_u32_1(i32 %a) {
; CHECK-LABEL: test_vcvth_n_f16_u32_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: ucvtf h0, h0, #1
+; CHECK-NEXT: ucvtf h0, w0, #1
; CHECK-NEXT: ret
entry:
%vcvth_n_f16_u32 = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 1)
@@ -366,8 +353,7 @@ entry:
define dso_local half @test_vcvth_n_f16_u32_16(i32 %a) {
; CHECK-LABEL: test_vcvth_n_f16_u32_16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: ucvtf h0, h0, #16
+; CHECK-NEXT: ucvtf h0, w0, #16
; CHECK-NEXT: ret
entry:
%vcvth_n_f16_u32 = tail call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 16)
@@ -377,8 +363,7 @@ entry:
define dso_local i16 @test_vcvth_n_u16_f16_1(half %a) {
; CHECK-LABEL: test_vcvth_n_u16_f16_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu h0, h0, #1
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzu w0, h0, #1
; CHECK-NEXT: ret
entry:
%fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 1)
@@ -389,8 +374,7 @@ entry:
define dso_local i16 @test_vcvth_n_u16_f16_16(half %a) {
; CHECK-LABEL: test_vcvth_n_u16_f16_16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu h0, h0, #16
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzu w0, h0, #16
; CHECK-NEXT: ret
entry:
%fcvth_n = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 16)
@@ -401,8 +385,7 @@ entry:
define dso_local i32 @test_vcvth_n_u32_f16_1(half %a) {
; CHECK-LABEL: test_vcvth_n_u32_f16_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu h0, h0, #1
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzu w0, h0, #1
; CHECK-NEXT: ret
entry:
%vcvth_n_u32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 1)
@@ -412,8 +395,7 @@ entry:
define dso_local i32 @test_vcvth_n_u32_f16_16(half %a) {
; CHECK-LABEL: test_vcvth_n_u32_f16_16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu h0, h0, #16
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: fcvtzu w0, h0, #16
; CHECK-NEXT: ret
entry:
%vcvth_n_u32_f16 = tail call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 16)
@@ -447,8 +429,7 @@ entry:
define dso_local half @vcvth_n_f16_s64_test(i64 %a) {
; CHECK-LABEL: vcvth_n_f16_s64_test:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: scvtf h0, h0, #16
+; CHECK-NEXT: scvtf h0, x0, #16
; CHECK-NEXT: ret
entry:
%vcvth_n_f16_s64 = tail call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64 %a, i32 16)
|
|
Hello, |
There was a problem hiding this comment.
Pull request overview
Updates AArch64 codegen to match the revised ACLE specification for NEON FP16 scalar fixed-point conversions, ensuring SelectionDAG and GlobalISel select the correct scalar forms that convert directly between GPRs and FP registers.
Changes:
- Update CodeGen checks to expect direct scalar
scvtf/ucvtfandfcvtzs/fcvtzuforms (removing intermediatefmovsequences). - Refine GlobalISel register bank mappings for
vcvtfx{u,s}2fpandvcvtfp2fx{u,s}so scalar integer operands/results use GPRs while FP operands remain in FPRs. - Add a GlobalISel custom operand renderer and TableGen xform plumbing to pass fixed-point immediates for scalar conversions.
Reviewed changes
Copilot reviewed 4 out of 4 changed files in this pull request and generated 1 comment.
| File | Description |
|---|---|
| llvm/test/CodeGen/AArch64/fp16_intrinsic_scalar_2op.ll | Updates expected assembly to validate direct scalar fixed-point convert instructions for both SDAG and GlobalISel. |
| llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp | Adjusts intrinsic operand bank assignments to correctly model scalar vs vector register bank requirements. |
| llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp | Adds a dedicated renderer to emit scalar fixed-point immediates for the new patterns. |
| llvm/lib/Target/AArch64/AArch64InstrInfo.td | Introduces scalar fixed-point xform + patterns selecting direct GPR↔FPR scalar conversion opcodes for FP16. |
| : GICustomOperandRenderer<"renderFixedPointScalarXForm">, | ||
| GISDNodeXFormEquiv<fixedpoint_scalar_xform>; | ||
|
|
||
| // Patterns for FP16 Intrinsics - requires reg copy to/from as i16s not supported. |
davemgreen
left a comment
There was a problem hiding this comment.
It looks like vcvth_n_f16_s16 will also generate llvm.aarch64.neon.vcvtfxs2fp.f16.i32. Should that be changed to generate a llvm.aarch64.neon.vcvtfxs2fp.f16.i16 now? It might require some extra legalization to deal with the i16, maybe a bitcast to f16 in SDAG and being treated like a FPR in GISel.
There was a problem hiding this comment.
Can this reuse the existing fixedpoint_vec_xform, maybe with a rename? AFAIU it is just to get the types correct.
There was a problem hiding this comment.
I did try that, but I think the scalar intrinsics don't need to use selectCVTFixedPointVecBase as we have for fixedpoint_vec_xform. Having a different render function for the scalar makes the code a bit better.
But I could add an if statement in renderFixedPointXForm, if you think that is better than having another render function.
This patch implements the changes proposed in[1] for SelectionDAG and GlobalISel
[1]ARM-software/acle#435