Skip to content

Commit b8127cc

Browse files
authored
[AMDGPU][True16][CodeGen] fix v_mov_b16_t16 index in folding pass (#161764)
In true16 mode, v_mov_b16_t16 is added as a new foldable copy instruction, but its source operand is at a different operand index than in the other foldable copies. Use the correct source operand index for v_mov_b16_t16.
1 parent 50285ea commit b8127cc

File tree

4 files changed

+55
-1
lines changed

4 files changed

+55
-1
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -932,7 +932,9 @@ static MachineOperand *lookUpCopyChain(const SIInstrInfo &TII,
932932
for (MachineInstr *SubDef = MRI.getVRegDef(SrcReg);
933933
SubDef && TII.isFoldableCopy(*SubDef);
934934
SubDef = MRI.getVRegDef(Sub->getReg())) {
935-
MachineOperand &SrcOp = SubDef->getOperand(1);
935+
unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
936+
MachineOperand &SrcOp = SubDef->getOperand(SrcIdx);
937+
936938
if (SrcOp.isImm())
937939
return &SrcOp;
938940
if (!SrcOp.isReg() || SrcOp.getReg().isPhysical())

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3433,6 +3433,32 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
34333433
}
34343434
}
34353435

3436+
/// Return the operand index that holds the source value of the foldable copy
/// \p MI.
///
/// The true16 16-bit moves (V_MOV_B16_t16_e32/_e64) return index 2; every
/// other opcode listed below returns index 1. Any opcode that is not listed
/// reaches llvm_unreachable, so callers are expected to check
/// isFoldableCopy(MI) first.
/// NOTE(review): the opcode list presumably mirrors the one in
/// isFoldableCopy() -- confirm the two stay in sync when either changes.
unsigned SIInstrInfo::getFoldableCopySrcIdx(const MachineInstr &MI) {
3437+
switch (MI.getOpcode()) {
3438+
case AMDGPU::V_MOV_B16_t16_e32:
3439+
case AMDGPU::V_MOV_B16_t16_e64:
3440+
// NOTE(review): operand 1 is presumably src0_modifiers for these moves,
// which pushes the actual source to operand 2 -- confirm against the
// instruction definition.
return 2;
3441+
case AMDGPU::V_MOV_B32_e32:
3442+
case AMDGPU::V_MOV_B32_e64:
3443+
case AMDGPU::V_MOV_B64_PSEUDO:
3444+
case AMDGPU::V_MOV_B64_e32:
3445+
case AMDGPU::V_MOV_B64_e64:
3446+
case AMDGPU::S_MOV_B32:
3447+
case AMDGPU::S_MOV_B64:
3448+
case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3449+
case AMDGPU::COPY:
3450+
case AMDGPU::WWM_COPY:
3451+
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3452+
case AMDGPU::V_ACCVGPR_READ_B32_e64:
3453+
case AMDGPU::V_ACCVGPR_MOV_B32:
3454+
case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
3455+
case AMDGPU::AV_MOV_B64_IMM_PSEUDO:
3456+
// For these opcodes the source directly follows the destination operand.
return 1;
3457+
default:
3458+
// Reaching here means the caller passed an opcode not handled above.
llvm_unreachable("MI is not a foldable copy");
3459+
}
3460+
}
3461+
34363462
static constexpr AMDGPU::OpName ModifierOpNames[] = {
34373463
AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
34383464
AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
417417
const MachineInstr &MIb) const override;
418418

419419
static bool isFoldableCopy(const MachineInstr &MI);
420+
static unsigned getFoldableCopySrcIdx(const MachineInstr &MI);
420421

421422
void removeModOperands(MachineInstr &MI) const;
422423

llvm/test/CodeGen/AMDGPU/true16-fold.mir

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ body: |
5757
%4:vgpr_16 = COPY %3:sgpr_lo16
5858
%5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec
5959
S_ENDPGM 0, implicit %5
60+
...
6061

6162
---
6263
name: fold_16bit_madmix_clamp
@@ -207,3 +208,27 @@ body: |
207208
$vgpr0 = COPY %4
208209
S_ENDPGM 0, implicit $vgpr0
209210
...
211+
212+
---
# Two V_MOV_B16_t16_e64 instructions each define a 16-bit half with the
# immediate -1; a REG_SEQUENCE joins them into one 32-bit register that feeds
# both sources of V_MAX_F32_e64. The CHECK lines expect both V_MAX_F32_e64
# sources to become the immediate -1, while the moves and the REG_SEQUENCE
# remain in the output.
213+
name: fold_imm16_across_reg_sequence
214+
tracksRegLiveness: true
215+
registers:
216+
body: |
217+
bb.0:
218+
liveins: $vgpr0, $vgpr1, $vgpr2
219+
; CHECK-LABEL: name: fold_imm16_across_reg_sequence
220+
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
221+
; CHECK-NEXT: {{ $}}
222+
; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
223+
; CHECK-NEXT: [[V_MOV_B16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
224+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[V_MOV_B16_t16_e64_1]], %subreg.hi16
225+
; CHECK-NEXT: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, -1, 0, -1, 0, 0, implicit $mode, implicit $exec
226+
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F32_e64_]]
227+
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
228+
%0:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
229+
%1:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec
230+
%2:vgpr_32 = REG_SEQUENCE %0, %subreg.lo16, %1, %subreg.hi16
231+
%3:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec
232+
$vgpr0 = COPY %3
233+
S_ENDPGM 0, implicit $vgpr0
234+
...

0 commit comments

Comments
 (0)