; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512

%struct.__tile1024i_str = type <{ i16, i16, [60 x i8], <256 x i32> }>

@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
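; @test_api exercises the O0 lowering of the AMX tile intrinsics through a
; __tile1024i-style wrapper struct: both branches load tiles with
; @llvm.x86.tileloadd64.internal (from @buf or @buf2), the merged tiles feed
; @llvm.x86.tdpbssd.internal, and the result is stored back to @buf with
; @llvm.x86.tilestored64.internal.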
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) #0 {
; AVX512-LABEL: test_api:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT: subq $25600, %rsp # imm = 0x6400
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, %ax
; AVX512-NEXT: movw %si, %cx
; AVX512-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: xorl %esi, %esi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memset@PLT
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: xorl %esi, %esi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: callq memset@PLT
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: xorl %esi, %esi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: callq memset@PLT
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: cmpl $0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1: # %if.then
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: jmp .LBB0_3
; AVX512-NEXT: .LBB0_2: # %if.else
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf2, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf2, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movabsq $buf2, %rax
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw (%rax), %si
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw 2(%rax), %dx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: addq $64, %rdx
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: .LBB0_3: # %if.end
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: movl $1088, %edx # imm = 0x440
; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 64(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 128(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 192(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 256(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 320(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 384(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 448(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 512(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 576(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 640(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 704(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 768(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 832(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 896(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 960(%rax), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1024(%rax), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: movl $1024, %edx # imm = 0x400
; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: # kill: def $r8 killed $rax
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovaps %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm16
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm17
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm18
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm19
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm20
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm21
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm22
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm23
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm24
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm25
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm26
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm27
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm28
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm29
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm30
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm31
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovaps %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm31, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm30, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm29, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm28, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm27, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm26, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm25, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm24, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm23, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm22, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm21, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm20, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm19, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm18, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm17, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm16, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %r8d
; AVX512-NEXT: movw %r8w, %di
; AVX512-NEXT: shrl $2, %r8d
; AVX512-NEXT: movw %r8w, %r9w
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $r9b
; AVX512-NEXT: movb %r9b, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: tileloadd (%r10,%r8), %tmm0
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: tileloadd (%r10,%r8), %tmm1
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: tileloadd (%r10,%r8), %tmm2
; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: addq $64, %rdi
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq $buf, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $32, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq memcpy@PLT
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX512-NEXT: # kill: def $rdi killed $rax
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm0
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm1
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm3
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm4
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm5
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm6
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm7
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm8
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm9
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm10
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm11
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm12
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm13
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm14
; AVX512-NEXT: vmovdqa64 {{[0-9]+}}(%rsp), %zmm15
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm15, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm14, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm13, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm12, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm11, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm10, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm9, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm7, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm6, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm5, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm4, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm3, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %ax
; AVX512-NEXT: movw {{[0-9]+}}(%rsp), %cx
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512-NEXT: tileloadd (%rdi,%r8), %tmm0
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: tilerelease
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %m.addr.i85 = alloca i16, align 2
  %n.addr.i86 = alloca i16, align 2
  %base.addr.i87 = alloca ptr, align 8
  %stride.addr.i88 = alloca i64, align 8
  %tile.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp.i5284 = alloca <256 x i32>, align 1024
  %m.addr.i81 = alloca i16, align 2
  %n.addr.i82 = alloca i16, align 2
  %k.addr.i = alloca i16, align 2
  %dst.addr.i83 = alloca <256 x i32>, align 64
  %src1.addr.i = alloca <256 x i32>, align 64
  %src2.addr.i = alloca <256 x i32>, align 64
  %indirect-arg-temp5.i80 = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i79 = alloca <256 x i32>, align 1024
  %indirect-arg-temp.i78 = alloca <256 x i32>, align 1024
  %m.addr.i74 = alloca i16, align 2
  %n.addr.i75 = alloca i16, align 2
  %base.addr.i76 = alloca ptr, align 8
  %stride.addr.i77 = alloca i64, align 8
  %m.addr.i70 = alloca i16, align 2
  %n.addr.i71 = alloca i16, align 2
  %base.addr.i72 = alloca ptr, align 8
  %stride.addr.i73 = alloca i64, align 8
  %m.addr.i66 = alloca i16, align 2
  %n.addr.i67 = alloca i16, align 2
  %base.addr.i68 = alloca ptr, align 8
  %stride.addr.i69 = alloca i64, align 8
  %m.addr.i62 = alloca i16, align 2
  %n.addr.i63 = alloca i16, align 2
  %base.addr.i64 = alloca ptr, align 8
  %stride.addr.i65 = alloca i64, align 8
  %m.addr.i58 = alloca i16, align 2
  %n.addr.i59 = alloca i16, align 2
  %base.addr.i60 = alloca ptr, align 8
  %stride.addr.i61 = alloca i64, align 8
  %m.addr.i = alloca i16, align 2
  %n.addr.i = alloca i16, align 2
  %base.addr.i56 = alloca ptr, align 8
  %stride.addr.i57 = alloca i64, align 8
  %base.addr.i50 = alloca ptr, align 8
  %stride.addr.i51 = alloca i64, align 8
  %indirect-arg-temp.i52 = alloca <256 x i32>, align 1024
  %c49 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i44 = alloca ptr, align 8
  %indirect-arg-temp.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp4.i = alloca <256 x i32>, align 1024
  %indirect-arg-temp5.i = alloca <256 x i32>, align 1024
  %b43 = alloca %struct.__tile1024i_str, align 64
  %a42 = alloca %struct.__tile1024i_str, align 64
  %dst.addr.i35 = alloca ptr, align 8
  %base.addr.i36 = alloca ptr, align 8
  %stride.addr.i37 = alloca i64, align 8
  %dst.addr.i28 = alloca ptr, align 8
  %base.addr.i29 = alloca ptr, align 8
  %stride.addr.i30 = alloca i64, align 8
  %dst.addr.i21 = alloca ptr, align 8
  %base.addr.i22 = alloca ptr, align 8
  %stride.addr.i23 = alloca i64, align 8
  %dst.addr.i14 = alloca ptr, align 8
  %base.addr.i15 = alloca ptr, align 8
  %stride.addr.i16 = alloca i64, align 8
  %dst.addr.i7 = alloca ptr, align 8
  %base.addr.i8 = alloca ptr, align 8
  %stride.addr.i9 = alloca i64, align 8
  %dst.addr.i = alloca ptr, align 8
  %base.addr.i = alloca ptr, align 8
  %stride.addr.i = alloca i64, align 8
  %cond.addr = alloca i32, align 4
  %row.addr = alloca i16, align 2
  %col.addr = alloca i16, align 2
  %a = alloca %struct.__tile1024i_str, align 64
  %b = alloca %struct.__tile1024i_str, align 64
  %c = alloca %struct.__tile1024i_str, align 64
  store i32 %cond, ptr %cond.addr, align 4
  store i16 %row, ptr %row.addr, align 2
  store i16 %col, ptr %col.addr, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %a, i8 0, i64 1088, i1 false)
  %0 = load i16, ptr %row.addr, align 2
  store i16 %0, ptr %a, align 64
  %col2 = getelementptr inbounds %struct.__tile1024i_str, ptr %a, i32 0, i32 1
  store i16 8, ptr %col2, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %b, i8 0, i64 1088, i1 false)
  store i16 8, ptr %b, align 64
  %col4 = getelementptr inbounds %struct.__tile1024i_str, ptr %b, i32 0, i32 1
  %1 = load i16, ptr %col.addr, align 2
  store i16 %1, ptr %col4, align 2
  call void @llvm.memset.p0.i64(ptr align 64 %c, i8 0, i64 1088, i1 false)
  %2 = load i16, ptr %row.addr, align 2
  store i16 %2, ptr %c, align 64
  %col6 = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i32 0, i32 1
  %3 = load i16, ptr %col.addr, align 2
  store i16 %3, ptr %col6, align 2
  %4 = load i32, ptr %cond.addr, align 4
  %tobool = icmp ne i32 %4, 0
  br i1 %tobool, label %if.then, label %if.else

if.then:                                          ; preds = %entry
  store ptr %a, ptr %dst.addr.i35, align 8
  store ptr @buf, ptr %base.addr.i36, align 8
  store i64 32, ptr %stride.addr.i37, align 8
  %5 = load ptr, ptr %dst.addr.i35, align 8
  %6 = load i16, ptr %5, align 64
  %7 = load ptr, ptr %dst.addr.i35, align 8
  %col.i39 = getelementptr inbounds %struct.__tile1024i_str, ptr %7, i32 0, i32 1
  %8 = load i16, ptr %col.i39, align 2
  %9 = load ptr, ptr %base.addr.i36, align 8
  %10 = load i64, ptr %stride.addr.i37, align 8
  store i16 %6, ptr %m.addr.i, align 2
  store i16 %8, ptr %n.addr.i, align 2
  store ptr %9, ptr %base.addr.i56, align 8
  store i64 %10, ptr %stride.addr.i57, align 8
  %11 = load i16, ptr %m.addr.i, align 2
  %12 = load i16, ptr %n.addr.i, align 2
  %13 = load ptr, ptr %base.addr.i56, align 8
  %14 = load i64, ptr %stride.addr.i57, align 8
  %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %11, i16 %12, ptr %13, i64 %14) #2
  %16 = bitcast x86_amx %15 to <256 x i32>
  %17 = load ptr, ptr %dst.addr.i35, align 8
  %tile.i41 = getelementptr inbounds %struct.__tile1024i_str, ptr %17, i32 0, i32 3
  store <256 x i32> %16, ptr %tile.i41, align 64
  store ptr %b, ptr %dst.addr.i28, align 8
  store ptr @buf, ptr %base.addr.i29, align 8
  store i64 32, ptr %stride.addr.i30, align 8
  %18 = load ptr, ptr %dst.addr.i28, align 8
  %19 = load i16, ptr %18, align 64
  %20 = load ptr, ptr %dst.addr.i28, align 8
  %col.i32 = getelementptr inbounds %struct.__tile1024i_str, ptr %20, i32 0, i32 1
  %21 = load i16, ptr %col.i32, align 2
  %22 = load ptr, ptr %base.addr.i29, align 8
  %23 = load i64, ptr %stride.addr.i30, align 8
  store i16 %19, ptr %m.addr.i58, align 2
  store i16 %21, ptr %n.addr.i59, align 2
  store ptr %22, ptr %base.addr.i60, align 8
  store i64 %23, ptr %stride.addr.i61, align 8
  %24 = load i16, ptr %m.addr.i58, align 2
  %25 = load i16, ptr %n.addr.i59, align 2
  %26 = load ptr, ptr %base.addr.i60, align 8
  %27 = load i64, ptr %stride.addr.i61, align 8
  %28 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %24, i16 %25, ptr %26, i64 %27) #2
  %29 = bitcast x86_amx %28 to <256 x i32>
  %30 = load ptr, ptr %dst.addr.i28, align 8
  %tile.i34 = getelementptr inbounds %struct.__tile1024i_str, ptr %30, i32 0, i32 3
  store <256 x i32> %29, ptr %tile.i34, align 64
  store ptr %c, ptr %dst.addr.i21, align 8
  store ptr @buf, ptr %base.addr.i22, align 8
  store i64 32, ptr %stride.addr.i23, align 8
  %31 = load ptr, ptr %dst.addr.i21, align 8
  %32 = load i16, ptr %31, align 64
  %33 = load ptr, ptr %dst.addr.i21, align 8
  %col.i25 = getelementptr inbounds %struct.__tile1024i_str, ptr %33, i32 0, i32 1
  %34 = load i16, ptr %col.i25, align 2
  %35 = load ptr, ptr %base.addr.i22, align 8
  %36 = load i64, ptr %stride.addr.i23, align 8
  store i16 %32, ptr %m.addr.i62, align 2
  store i16 %34, ptr %n.addr.i63, align 2
  store ptr %35, ptr %base.addr.i64, align 8
  store i64 %36, ptr %stride.addr.i65, align 8
  %37 = load i16, ptr %m.addr.i62, align 2
  %38 = load i16, ptr %n.addr.i63, align 2
  %39 = load ptr, ptr %base.addr.i64, align 8
  %40 = load i64, ptr %stride.addr.i65, align 8
  %41 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %37, i16 %38, ptr %39, i64 %40) #2
  %42 = bitcast x86_amx %41 to <256 x i32>
  %43 = load ptr, ptr %dst.addr.i21, align 8
  %tile.i27 = getelementptr inbounds %struct.__tile1024i_str, ptr %43, i32 0, i32 3
  store <256 x i32> %42, ptr %tile.i27, align 64
  br label %if.end

if.else:                                          ; preds = %entry
  store ptr %a, ptr %dst.addr.i14, align 8
  store ptr @buf2, ptr %base.addr.i15, align 8
  store i64 32, ptr %stride.addr.i16, align 8
  %44 = load ptr, ptr %dst.addr.i14, align 8
  %45 = load i16, ptr %44, align 64
  %46 = load ptr, ptr %dst.addr.i14, align 8
  %col.i18 = getelementptr inbounds %struct.__tile1024i_str, ptr %46, i32 0, i32 1
  %47 = load i16, ptr %col.i18, align 2
  %48 = load ptr, ptr %base.addr.i15, align 8
  %49 = load i64, ptr %stride.addr.i16, align 8
  store i16 %45, ptr %m.addr.i66, align 2
  store i16 %47, ptr %n.addr.i67, align 2
  store ptr %48, ptr %base.addr.i68, align 8
  store i64 %49, ptr %stride.addr.i69, align 8
  %50 = load i16, ptr %m.addr.i66, align 2
  %51 = load i16, ptr %n.addr.i67, align 2
  %52 = load ptr, ptr %base.addr.i68, align 8
  %53 = load i64, ptr %stride.addr.i69, align 8
  %54 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %50, i16 %51, ptr %52, i64 %53) #2
  %55 = bitcast x86_amx %54 to <256 x i32>
  %56 = load ptr, ptr %dst.addr.i14, align 8
  %tile.i20 = getelementptr inbounds %struct.__tile1024i_str, ptr %56, i32 0, i32 3
  store <256 x i32> %55, ptr %tile.i20, align 64
  store ptr %b, ptr %dst.addr.i7, align 8
  store ptr @buf2, ptr %base.addr.i8, align 8
  store i64 32, ptr %stride.addr.i9, align 8
  %57 = load ptr, ptr %dst.addr.i7, align 8
  %58 = load i16, ptr %57, align 64
  %59 = load ptr, ptr %dst.addr.i7, align 8
  %col.i11 = getelementptr inbounds %struct.__tile1024i_str, ptr %59, i32 0, i32 1
  %60 = load i16, ptr %col.i11, align 2
  %61 = load ptr, ptr %base.addr.i8, align 8
  %62 = load i64, ptr %stride.addr.i9, align 8
  store i16 %58, ptr %m.addr.i70, align 2
  store i16 %60, ptr %n.addr.i71, align 2
  store ptr %61, ptr %base.addr.i72, align 8
  store i64 %62, ptr %stride.addr.i73, align 8
  %63 = load i16, ptr %m.addr.i70, align 2
  %64 = load i16, ptr %n.addr.i71, align 2
  %65 = load ptr, ptr %base.addr.i72, align 8
  %66 = load i64, ptr %stride.addr.i73, align 8
  %67 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %63, i16 %64, ptr %65, i64 %66) #2
  %68 = bitcast x86_amx %67 to <256 x i32>
  %69 = load ptr, ptr %dst.addr.i7, align 8
  %tile.i13 = getelementptr inbounds %struct.__tile1024i_str, ptr %69, i32 0, i32 3
  store <256 x i32> %68, ptr %tile.i13, align 64
  store ptr %c, ptr %dst.addr.i, align 8
  store ptr @buf2, ptr %base.addr.i, align 8
  store i64 32, ptr %stride.addr.i, align 8
  %70 = load ptr, ptr %dst.addr.i, align 8
  %71 = load i16, ptr %70, align 64
  %72 = load ptr, ptr %dst.addr.i, align 8
  %col.i = getelementptr inbounds %struct.__tile1024i_str, ptr %72, i32 0, i32 1
  %73 = load i16, ptr %col.i, align 2
  %74 = load ptr, ptr %base.addr.i, align 8
  %75 = load i64, ptr %stride.addr.i, align 8
  store i16 %71, ptr %m.addr.i74, align 2
  store i16 %73, ptr %n.addr.i75, align 2
  store ptr %74, ptr %base.addr.i76, align 8
  store i64 %75, ptr %stride.addr.i77, align 8
  %76 = load i16, ptr %m.addr.i74, align 2
  %77 = load i16, ptr %n.addr.i75, align 2
  %78 = load ptr, ptr %base.addr.i76, align 8
  %79 = load i64, ptr %stride.addr.i77, align 8
  %80 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %76, i16 %77, ptr %78, i64 %79) #2
  %81 = bitcast x86_amx %80 to <256 x i32>
  %82 = load ptr, ptr %dst.addr.i, align 8
  %tile.i = getelementptr inbounds %struct.__tile1024i_str, ptr %82, i32 0, i32 3
  store <256 x i32> %81, ptr %tile.i, align 64
  br label %if.end
if.end:                                           ; preds = %if.else, %if.then
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %b43, ptr align 1 %b, i64 1088, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %a42, ptr align 1 %a, i64 1088, i1 false) #2
  store ptr %c, ptr %dst.addr.i44, align 8
  %83 = load i16, ptr %a42, align 64
  %col.i46 = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 1
  %84 = load i16, ptr %col.i46, align 2
  %col1.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 1
  %85 = load i16, ptr %col1.i, align 2
  %86 = load ptr, ptr %dst.addr.i44, align 8
  %tile.i47 = getelementptr inbounds %struct.__tile1024i_str, ptr %86, i32 0, i32 3
  %87 = load <256 x i32>, ptr %tile.i47, align 64
  %tile2.i = getelementptr inbounds %struct.__tile1024i_str, ptr %a42, i32 0, i32 3
  %88 = load <256 x i32>, ptr %tile2.i, align 64
  %tile3.i = getelementptr inbounds %struct.__tile1024i_str, ptr %b43, i32 0, i32 3
  %89 = load <256 x i32>, ptr %tile3.i, align 64
  store <256 x i32> %87, ptr %indirect-arg-temp.i, align 1024
  store <256 x i32> %88, ptr %indirect-arg-temp4.i, align 1024
  store <256 x i32> %89, ptr %indirect-arg-temp5.i, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp5.i80, ptr align 1 %indirect-arg-temp5.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp4.i79, ptr align 1 %indirect-arg-temp4.i, i64 1024, i1 false) #2
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i78, ptr align 1 %indirect-arg-temp.i, i64 1024, i1 false) #2
  %dst.i = load <256 x i32>, ptr %indirect-arg-temp.i78, align 1024
  %src1.i = load <256 x i32>, ptr %indirect-arg-temp4.i79, align 1024
  %src2.i = load <256 x i32>, ptr %indirect-arg-temp5.i80, align 1024
  store i16 %83, ptr %m.addr.i81, align 2
  store i16 %84, ptr %n.addr.i82, align 2
  store i16 %85, ptr %k.addr.i, align 2
  store <256 x i32> %dst.i, ptr %dst.addr.i83, align 64
  store <256 x i32> %src1.i, ptr %src1.addr.i, align 64
  store <256 x i32> %src2.i, ptr %src2.addr.i, align 64
  %90 = load i16, ptr %m.addr.i81, align 2
  %91 = load i16, ptr %n.addr.i82, align 2
  %92 = load i16, ptr %k.addr.i, align 2
  %93 = load <256 x i32>, ptr %dst.addr.i83, align 64
  %94 = bitcast <256 x i32> %93 to x86_amx
  %95 = load <256 x i32>, ptr %src1.addr.i, align 64
  %96 = bitcast <256 x i32> %95 to x86_amx
  %97 = load <256 x i32>, ptr %src2.addr.i, align 64
  %98 = bitcast <256 x i32> %97 to x86_amx
  %99 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %90, i16 %91, i16 %92, x86_amx %94, x86_amx %96, x86_amx %98) #2
  %100 = bitcast x86_amx %99 to <256 x i32>
  %101 = load ptr, ptr %dst.addr.i44, align 8
  %tile6.i = getelementptr inbounds %struct.__tile1024i_str, ptr %101, i32 0, i32 3
  store <256 x i32> %100, ptr %tile6.i, align 64
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %c49, ptr align 1 %c, i64 1088, i1 false) #2
  store ptr @buf, ptr %base.addr.i50, align 8
  store i64 32, ptr %stride.addr.i51, align 8
  %102 = load i16, ptr %c49, align 64
  %col.i54 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 1
  %103 = load i16, ptr %col.i54, align 2
  %104 = load ptr, ptr %base.addr.i50, align 8
  %105 = load i64, ptr %stride.addr.i51, align 8
  %tile.i55 = getelementptr inbounds %struct.__tile1024i_str, ptr %c49, i32 0, i32 3
  %106 = load <256 x i32>, ptr %tile.i55, align 64
  store <256 x i32> %106, ptr %indirect-arg-temp.i52, align 1024
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %indirect-arg-temp.i5284, ptr align 1 %indirect-arg-temp.i52, i64 1024, i1 false) #2
  %tile.i89 = load <256 x i32>, ptr %indirect-arg-temp.i5284, align 1024
  store i16 %102, ptr %m.addr.i85, align 2
  store i16 %103, ptr %n.addr.i86, align 2
  store ptr %104, ptr %base.addr.i87, align 8
  store i64 %105, ptr %stride.addr.i88, align 8
  store <256 x i32> %tile.i89, ptr %tile.addr.i, align 64
  %107 = load i16, ptr %m.addr.i85, align 2
  %108 = load i16, ptr %n.addr.i86, align 2
  %109 = load ptr, ptr %base.addr.i87, align 8
  %110 = load i64, ptr %stride.addr.i88, align 8
  %111 = load <256 x i32>, ptr %tile.addr.i, align 64
  %112 = bitcast <256 x i32> %111 to x86_amx
  call void @llvm.x86.tilestored64.internal(i16 %107, i16 %108, ptr %109, i64 %110, x86_amx %112) #2
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #2
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3

attributes #0 = { noinline nounwind optnone }
attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
attributes #2 = { nounwind }
attributes #3 = { argmemonly nofree nosync nounwind willreturn }