; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefix=MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=FLATSCR %s

; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
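;
; Summary of the offset arithmetic exercised below: spill offsets up to 4095
; bytes fit in the instruction's immediate offset field (see the offset:4092
; spills), while offsets of 4096 and above have to be materialized in a
; register. Because MUBUF scratch accesses are swizzled, the MUBUF SGPR offset
; is a wave-level byte offset, i.e. the per-lane offset scaled by the wave
; size (4096 * 64 = 0x40000 for wave64), whereas flat-scratch addressing is
; per-lane and uses the unscaled byte offset (e.g. 0x1000 = 4096).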
define amdgpu_kernel void @test_inst_offset_kernel() {
; MUBUF-LABEL: test_inst_offset_kernel:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: test_inst_offset_kernel:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm
entry:
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
; the instruction offset field.
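; The %aptr access is at frame offset 8 (offset:8 in the checks), so the
; alloca starts at offset 4 and the spill slot lands at 4 + 4088 = 4092,
; which still fits in the immediate offset field (the offset:4092 spill and
; reload above).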
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

%a = load volatile i32, i32 addrspace(5)* %aptr

; Force %a to spill.
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

%outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
store volatile i32 %a, i32 addrspace(5)* %outptr

ret void
}

define amdgpu_kernel void @test_sgpr_offset_kernel() {
; MUBUF-LABEL: test_sgpr_offset_kernel:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_mov_b32 s4, 0x40000
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_mov_b32 s4, 0x40000
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: test_sgpr_offset_kernel:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
; fit in the instruction, and has to live in the SGPR offset.
%alloca = alloca i8, i32 4092, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
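; The FLATSCR variant does not need the wave scaling; it keeps the unscaled
; per-lane byte offset in an SGPR instead (s_movk_i32 s0, 0x1000 above,
; where 0x1000 = 4096).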
%a = load volatile i32, i32 addrspace(5)* %aptr
; Force %a to spill
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

%outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
store volatile i32 %a, i32 addrspace(5)* %outptr

ret void
}

define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004
; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004
; FLATSCR-NEXT: scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004
; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
; fit in the instruction, and has to live in the SGPR offset.
%alloca = alloca i8, i32 4096, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

%asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
%asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
%asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
%asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
%asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
%asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
%asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
%asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
%asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

; 0x40000 / 64 = 4096 (for wave64)
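; Here the SGPRs are occupied by the inline asm (see "Force %a to spill with
; no free SGPRs" below), so the MUBUF spill materializes the byte offset
; 0x1004 in v1 and addresses the slot with offen, while the FLATSCR spill
; adds 0x1004 to s32 in s8 (see the checks above).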
%a = load volatile i32, i32 addrspace(5)* %aptr
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

%asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
%asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
%asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
%asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
%asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
%asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
%asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
%asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
%asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
; Force %a to spill with no free SGPRs
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
ret void
}

define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004
; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s8, 0x1004
; FLATSCR-NEXT: scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_movk_i32 s8, 0x1004
; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_endpgm
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
; fit in the instruction, and has to live in the SGPR offset.
%alloca = alloca i8, i32 4096, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

%asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
%asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
%asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
%asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
%asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
%asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
%asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
%asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
%asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

; 0x40000 / 64 = 4096 (for wave64)
%a = load volatile i32, i32 addrspace(5)* %aptr
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

%asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
%asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
%asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
%asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
%asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
%asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
%asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
%asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
%asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
; Force %a to spill with no free SGPRs
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
ret void
}

define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
; MUBUF-LABEL: test_sgpr_offset_subregs_kernel:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; v[0:1]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, vcc_lo offset:12 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; v[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_endpgm
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
; the instruction offset field.
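; With the alloca again starting at frame offset 4 (the %aptr loads above use
; offset:12), the two halves of %a spill to 4088 and 4092; 0xff8 in the
; FLATSCR checks is 4088, the base of the 8-byte spill.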
%alloca = alloca i8, i32 4084, align 4, addrspace(5)
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

; Force %a to spill.
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

; Ensure the alloca sticks around.
%bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
%b = load volatile i32, i32 addrspace(5)* %bptr

; Ensure the spill is of the full super-reg.
call void asm sideeffect "; $0", "r"(<2 x i32> %a)

ret void
}

define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
; MUBUF-LABEL: test_inst_offset_subregs_kernel:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_add_u32 s0, s0, s7
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; v[0:1]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: test_inst_offset_subregs_kernel:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; FLATSCR-NEXT: s_mov_b32 vcc_lo, 0
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, vcc_lo offset:12 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; v[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_endpgm
entry:
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
; in the SGPR offset.
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

; 0x3ff00 / 64 = 4092 (for wave64)
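; That is, the MUBUF spill keeps the wave-scaled base in s4 (0x3ff00 =
; 4092 * 64) and stores the two halves at s4 and s4+4, while the FLATSCR
; variant keeps the unscaled base 0xffc (= 4092) in s0.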
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

; Force %a to spill.
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

; Ensure the alloca sticks around.
%bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
%b = load volatile i32, i32 addrspace(5)* %bptr

; Ensure the spill is of the full super-reg.
call void asm sideeffect "; $0", "r"(<2 x i32> %a)

ret void
}

define void @test_inst_offset_function() {
; MUBUF-LABEL: test_inst_offset_function:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: test_inst_offset_function:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Occupy enough bytes of scratch, so the offset of the spill of %a
; just fits in the instruction offset field when the emergency stack
; slot is added. It's hard to hit the actual limit since we're also
; going to insert the emergency stack slot for large frames.
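; In the checks above the resulting spill slot is at s32 + 4088, which still
; fits in the immediate offset field.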
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

%a = load volatile i32, i32 addrspace(5)* %aptr

; Force %a to spill.
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

%outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
store volatile i32 %a, i32 addrspace(5)* %outptr

ret void
}

define void @test_sgpr_offset_function() {
; MUBUF-LABEL: test_sgpr_offset_function:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: test_sgpr_offset_function:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
; fit in the instruction, and has to live in the SGPR offset.
%alloca = alloca i8, i32 4096, align 4, addrspace(5)
%buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

%aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
; 0x40000 / 64 = 4096 (for wave64)
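; The spill slot ends up at a per-lane offset of 4100: the MUBUF checks add
; the wave-scaled 0x40100 (= 4100 * 64) to s32, while the FLATSCR checks add
; the unscaled 0x1004 (= 4100).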
%a = load volatile i32, i32 addrspace(5)* %aptr

; Force %a to spill
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

%outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
store volatile i32 %a, i32 addrspace(5)* %outptr

ret void
}

define void @test_sgpr_offset_subregs_function() {
; MUBUF-LABEL: test_sgpr_offset_subregs_function:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; v[0:1]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: test_sgpr_offset_subregs_function:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; v[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; We want to test that the spill of the last subreg of %a uses the highest
; valid value for the immediate offset. We enable the emergency stack slot
; for large frames, so it's hard to get the frame layout exactly as we want
; to test it.
; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
; the instruction offset field.
%alloca = alloca i8, i32 4084, align 4, addrspace(5)
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

; Force %a to spill.
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

; Ensure the alloca sticks around.
%bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
%b = load volatile i32, i32 addrspace(5)* %bptr

; Ensure the spill is of the full super-reg.
call void asm sideeffect "; $0", "r"(<2 x i32> %a)

ret void
}

define void @test_inst_offset_subregs_function() {
; MUBUF-LABEL: test_inst_offset_subregs_function:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_nop 0
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; v[0:1]
; MUBUF-NEXT: ;;#ASMEND
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: test_inst_offset_subregs_function:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; v[0:1]
; FLATSCR-NEXT: ;;#ASMEND
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
; in the SGPR offset.
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
%bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
%bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
; 0x3ff00 / 64 = 4092 (for wave64)
%aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
%a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

; Force %a to spill.
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

; Ensure the alloca sticks around.
%bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
%b = load volatile i32, i32 addrspace(5)* %bptr

; Ensure the spill is of the full super-reg.
call void asm sideeffect "; $0", "r"(<2 x i32> %a)

ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }
attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }