Commit 5c68a1cb authored by Matt Arsenault, committed by Tom Stellard

AMDGPU: Make various vector undefs legal

Surprisingly, these were getting legalized to zero-initialized
vectors.

This fixes an infinite loop when combining some vector types. It also
stops some undef values from being zero-initialized.

SimplifyDemandedVectorElts / SimplifyDemandedBits do not check whether
the undef values they substitute for unused operations are legal for
the target. This resulted in turning vectors into undefs that were
later re-legalized back into zero vectors.
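
A minimal sketch of the offending combine, assuming the usual shape of
TargetLowering::SimplifyDemandedVectorElts (hedged, not the verbatim
upstream code):

    // Hedged sketch, not verbatim LLVM code: when no lane of a node is
    // demanded, the combiner substitutes an undef of the same type.
    if (DemandedElts.isZero())
      // Note: no check that a vector UNDEF of this type is legal. If it
      // is not, a later legalization round expands the undef again
      // (historically to a zero vector) and the combine re-fires.
      return TLO.CombineTo(Op, TLO.DAG.getUNDEF(Op.getValueType()));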

(cherry picked from commit 7a846240)
parent 80a9fc84
@@ -249,6 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
case ISD::STORE:
case ISD::BUILD_VECTOR:
case ISD::BITCAST:
+case ISD::UNDEF:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
case ISD::EXTRACT_SUBVECTOR:
@@ -516,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
case ISD::STORE:
case ISD::BUILD_VECTOR:
case ISD::BITCAST:
+case ISD::UNDEF:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
case ISD::INSERT_SUBVECTOR:
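
These case labels sit in per-vector-type switches in the
SITargetLowering constructor, where any opcode that is not listed falls
through to be marked Expand; listing ISD::UNDEF keeps it Legal, so the
legalizer no longer rewrites vector undefs as zero vectors. A hedged
reconstruction of the surrounding pattern (assumed type set, not the
verbatim upstream code):

    // Hedged reconstruction, not verbatim SIISelLowering.cpp.
    for (MVT VT : {MVT::v8i64, MVT::v8f64}) { // assumed type set
      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
        switch (Op) {
        case ISD::LOAD:
        case ISD::STORE:
        case ISD::BUILD_VECTOR:
        case ISD::BITCAST:
        case ISD::UNDEF: // new: leave vector UNDEF Legal
        case ISD::EXTRACT_VECTOR_ELT:
        case ISD::INSERT_VECTOR_ELT:
        case ISD::INSERT_SUBVECTOR:
        case ISD::EXTRACT_SUBVECTOR:
          break; // keep the default (Legal) action
        default:
          setOperationAction(Op, VT, Expand);
          break;
        }
      }
    }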
@@ -5,14 +5,6 @@
define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
; SI-LABEL: main:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: s_mov_b32 s1, s0
-; SI-NEXT: s_mov_b32 s2, s0
-; SI-NEXT: s_mov_b32 s3, s0
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s0
-; SI-NEXT: s_mov_b32 s6, s0
-; SI-NEXT: s_mov_b32 s7, s0
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 7, v0
@@ -26,14 +18,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
;
; VI-LABEL: main:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_mov_b32 s1, s0
-; VI-NEXT: s_mov_b32 s2, s0
-; VI-NEXT: s_mov_b32 s3, s0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s0
-; VI-NEXT: s_mov_b32 s6, s0
-; VI-NEXT: s_mov_b32 s7, s0
; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: v_and_b32_e32 v0, 7, v0
@@ -213,7 +213,7 @@ if.else: ; preds = %entry
br label %if.end
if.end: ; preds = %if.else, %if.then
-%call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ]
+%call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
ret void
}
@@ -266,7 +266,7 @@ if.else: ; preds = %entry
br label %if.end
if.end: ; preds = %if.else, %if.then
-%call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ]
+%call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
ret void
}
@@ -4,16 +4,8 @@
define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GCN-LABEL: _amdgpu_ps_main:
; GCN: ; %bb.0: ; %.entry
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_mov_b32 s1, s0
-; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: s_mov_b32 s3, s0
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s0
-; GCN-NEXT: s_mov_b32 s6, s0
-; GCN-NEXT: s_mov_b32 s7, s0
; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
@@ -100,14 +100,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: s_branch .LBB0_4
; GFX9-NEXT: .LBB0_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: .LBB0_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -244,14 +237,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
; GFX9-NEXT: s_cbranch_execz .LBB1_3
; GFX9-NEXT: s_branch .LBB1_4
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: .LBB1_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -386,14 +372,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: s_branch .LBB2_4
; GFX9-NEXT: .LBB2_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_mov_b32_e32 v4, s10
-; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX9-NEXT: .LBB2_3: ; %T
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -567,22 +546,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
; GFX9-NEXT: s_cbranch_execz .LBB3_3
; GFX9-NEXT: s_branch .LBB3_4
; GFX9-NEXT: .LBB3_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s8
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB3_3: ; %T
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -759,22 +723,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: s_branch .LBB4_4
; GFX9-NEXT: .LBB4_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s8
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB4_3: ; %T
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -949,22 +898,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
; GFX9-NEXT: s_cbranch_execz .LBB5_3
; GFX9-NEXT: s_branch .LBB5_4
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, s8
-; GFX9-NEXT: s_mov_b32 s10, s8
-; GFX9-NEXT: s_mov_b32 s11, s8
-; GFX9-NEXT: s_mov_b32 s12, s8
-; GFX9-NEXT: s_mov_b32 s13, s8
-; GFX9-NEXT: s_mov_b32 s14, s8
-; GFX9-NEXT: s_mov_b32 s15, s8
-; GFX9-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
; GFX9-NEXT: .LBB5_3: ; %T
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -374,18 +374,10 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
; GCN-LABEL: insertelement_to_sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s12, 0
-; GCN-NEXT: s_mov_b32 s4, s12
-; GCN-NEXT: s_mov_b32 s5, s12
-; GCN-NEXT: s_mov_b32 s6, s12
-; GCN-NEXT: s_mov_b32 s7, s12
-; GCN-NEXT: s_mov_b32 s8, s12
-; GCN-NEXT: s_mov_b32 s9, s12
-; GCN-NEXT: s_mov_b32 s10, s12
-; GCN-NEXT: s_mov_b32 s11, s12
-; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
+; GCN-NEXT: s_mov_b32 s4, 0
+; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}select_undef_lhs:
; GCN: s_waitcnt
@@ -43,3 +43,220 @@ define void @select_undef_n2(float addrspace(1)* %a, i32 %c) {
}
declare float @llvm.amdgcn.rcp.f32(float)
; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v6f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <6 x float>, <6 x float> addrspace(3)* undef
%add = fadd <6 x float> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <6 x float> %add, <6 x float> addrspace(3)* undef
ret void
}
; GCN-LABEL: {{^}}undef_v6i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef
%add = add <6 x i32> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v5f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <5 x float>, <5 x float> addrspace(3)* undef
%add = fadd <5 x float> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <5 x float> %add, <5 x float> addrspace(3)* undef
ret void
}
; GCN-LABEL: {{^}}undef_v5i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef
%add = add <5 x i32> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v3f64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr
%add = fadd <3 x double> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr
ret void
}
; GCN-LABEL: {{^}}undef_v3i64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr
%add = add <3 x i64> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v4f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr
%add = fadd <4 x half> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr
ret void
}
; GCN-LABEL: {{^}}undef_v4i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr
%add = add <4 x i16> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v2f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr
%add = fadd <2 x half> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr
ret void
}
; GCN-LABEL: {{^}}undef_v2i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr
%add = add <2 x i16> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr
ret void
}
; We were expanding undef vectors into zero vectors. Optimizations
; would then see that no elements of the vector were used and reform
; the undef vector, resulting in a combiner loop.
; GCN-LABEL: {{^}}inf_loop_undef_vector:
; GCN: s_waitcnt
; GCN-NEXT: v_mad_u64_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_add3_u32
; GCN-NEXT: global_store_dwordx2
define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
%i = insertelement <6 x float> %arg, float %arg1, i64 2
%i3 = bitcast <6 x float> %i to <3 x i64>
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = mul i64 %i5, %arg2
%i7 = add i64 %i6, %i4
store volatile i64 %i7, i64 addrspace(1)* undef, align 4
ret void
}
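
In TargetLowering terms, the loop breaker is that the target now
reports vector UNDEF as Legal for these types, so legalization leaves
the undef alone instead of expanding it into zeros for the combiner to
strip again. A hedged sketch of the query involved (standard
TargetLowering API; the concrete type here is an assumption):

    // Hedged sketch: after this patch the query below is expected to
    // return true for the wide vector types exercised above.
    bool UndefIsLegal = TLI.isOperationLegal(ISD::UNDEF, MVT::v6f32);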
@@ -1397,28 +1397,20 @@ bb7: ; preds = %bb4
define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
; SI-LABEL: if_after_kill_block:
; SI: ; %bb.0: ; %bb
-; SI-NEXT: s_mov_b64 s[2:3], exec
+; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
-; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
-; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB13_3
; SI-NEXT: ; %bb.1: ; %bb3
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
-; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
; SI-NEXT: s_cbranch_scc0 .LBB13_6
; SI-NEXT: ; %bb.2: ; %bb3
; SI-NEXT: s_andn2_b64 exec, exec, vcc
; SI-NEXT: .LBB13_3: ; %bb4
-; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_mov_b32 s1, s0
-; SI-NEXT: s_mov_b32 s2, s0
-; SI-NEXT: s_mov_b32 s3, s0
-; SI-NEXT: s_mov_b32 s4, s0
-; SI-NEXT: s_mov_b32 s5, s0
-; SI-NEXT: s_mov_b32 s6, s0
-; SI-NEXT: s_mov_b32 s7, s0
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1439,28 +1431,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
;
; GFX10-WAVE64-LABEL: if_after_kill_block:
; GFX10-WAVE64: ; %bb.0: ; %bb
-; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec
; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1
-; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0
-; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3
; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3
; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc
+; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc
; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6
; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3
; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4
-; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s3, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s4, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s5, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s6, s0
-; GFX10-WAVE64-NEXT: s_mov_b32 s7, s0
+; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-WAVE64-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1479,28 +1463,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
;
; GFX10-WAVE32-LABEL: if_after_kill_block:
; GFX10-WAVE32: ; %bb.0: ; %bb
-; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo
+; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1
-; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0
-; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo
-; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2
+; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo
+; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1
; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3
; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3
; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0
-; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo
+; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo
; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6
; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3
; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4
-; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s3, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s4, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s5, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s6, s0
-; GFX10-WAVE32-NEXT: s_mov_b32 s7, s0
+; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-WAVE32-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0)
; GFX10-WAVE32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
@@ -1519,29 +1495,22 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
;
; GFX11-LABEL: if_after_kill_block:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_mov_b64 s[2:3], exec
+; GFX11-NEXT: s_mov_b64 s[0:1], exec
; GFX11-NEXT: s_wqm_b64 exec, exec
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_mov_b64 s[4:5], exec
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mov_b64 s[2:3], exec
; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1
-; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX11-NEXT: s_cbranch_execz .LBB13_3
; GFX11-NEXT: ; %bb.1: ; %bb3
; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc
+; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc
; GFX11-NEXT: s_cbranch_scc0 .LBB13_6
; GFX11-NEXT: ; %bb.2: ; %bb3
; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc
; GFX11-NEXT: .LBB13_3: ; %bb4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s0
-; GFX11-NEXT: s_mov_b32 s6, s0
-; GFX11-NEXT: s_mov_b32 s7, s0
+; GFX11-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
+; GFX11-NEXT: s_mov_b64 s[0:1], exec
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1584,19 +1553,11 @@ bb9: ; preds = %bb4
define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; SI-LABEL: cbranch_kill:
; SI: ; %bb.0: ; %.entry
-; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: v_mov_b32_e32 v2, v1
; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: s_mov_b32 s5, s4
-; SI-NEXT: s_mov_b32 s6, s4
-; SI-NEXT: s_mov_b32 s7, s4
-; SI-NEXT: s_mov_b32 s8, s4
-; SI-NEXT: s_mov_b32 s9, s4
-; SI-NEXT: s_mov_b32 s10, s4
-; SI-NEXT: s_mov_b32 s11, s4
-; SI-NEXT: image_sample_l v1, v[1:4], s[4:11], s[0:3] dmask:0x1 da
+; SI-NEXT: image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
@@ -1627,16 +1588,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
; GFX10-WAVE64-LABEL: cbranch_kill:
; GFX10-WAVE64: ; %bb.0: ; %.entry
; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, 0