/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
; generated by igemm_codegen.py (090d5c199adbaae1e85d5c8bad5a26564124d4ba)
;
.include "igemm_fwd_gtcn2_nchwc_cyxkc_fp16x4_utils.inc"

;----------------------------------------------------------
; start of kernel igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32
; tensor_layout              : 'nchwc_kcyxc'
; gemm_m_per_block           : 256
; gemm_n_per_block           : 128
; gemm_k_per_block           : 32
; lanegroup_tile_m           : 8
; lanegroup_wave_m           : 4
; lanegroup_repeat_m         : 2
; lanegroup_tile_n           : 8
; lanegroup_wave_n           : 2
; lanegroup_repeat_n         : 8
; tensor_a_thread_lengths    : [1, 1, 1, 32]
; tensor_a_cluster_lengths   : [1, 8, 1, 32]
; tensor_b_thread_lengths    : [1, 1, 4, 4]
; tensor_b_cluster_lengths   : [1, 8, 1, 32]
; direction                  : 'fwd'
; precision                  : 'fp16'
; nxb                        : 0
; nxe                        : 0
; vector_c                   : 4
; 
; block_size                 : 256
; lds_total                  : 32768
; lds_buffer_num             : 1
; 
; ---------------------------------------------------------------------------
; Kernel-argument offsets.
; Every k_* symbol is used as the immediate offset of an s_load_* whose base
; is the SGPR pair s[s_ka:s_ka+1] (see the loads at the kernel entry point).
; ---------------------------------------------------------------------------
; Tensor pointers: spaced 8 apart, each fetched with s_load_dwordx2.
.set k_p_in, 0
.set k_p_wei, 8
.set k_p_out, 16
; k_tile_hw .. k_group are eight consecutive dwords fetched by one
; s_load_dwordx8 into s[s_tile_hw:s_tile_hw+7].
; tile_hw / ntile_hw are each split by the kernel into a hi-16 part
; (s_lshr_b32 by 16 -> *_h) and a lo-16 part (s_and_b32 with 0xffff -> *_w).
.set k_tile_hw, 24
.set k_ntile_hw, 28
.set k_hi, 32
.set k_wi, 36
.set k_n, 40
.set k_k, 44
.set k_c, 48
.set k_group, 52
; Fetched on its own (s_load_dword) into s[s_gemmk_split].
.set k_ks, 56
; No s_load in the visible kernel prologue references k_ho .. k_move_slice_k.
; NOTE(review): presumably kept so the argument layout matches the other
; kernels sharing the same host-side struct -- confirm against the caller.
.set k_ho, 60
.set k_wo, 64
.set k_stride_hw, 68
.set k_dilation_hw, 72
.set k_pad_hw, 76
.set k_wei_hw, 80
.set k_move_slice_k, 84
; k_magic_0 .. k_magic_7 are fetched by one s_load_dwordx8 into
; s[s_magic_0:s_magic_0+7]; the kernel passes them as the magic operand of
; the .mdiv_u32_rem_ss / .mdiv_u32_rem_vs macros.
.set k_magic_0, 88
.set k_magic_1, 92
.set k_magic_2, 96
.set k_magic_3, 100
.set k_magic_4, 104
.set k_magic_5, 108
.set k_magic_6, 112
.set k_magic_7, 116
; Both shift_pack dwords are fetched together (s_load_dwordx2). The kernel
; extracts 8-bit fields from them with s_bfe_u32 (width 8, offset 0/8/16/24)
; and passes each field as the shift operand of the .mdiv_u32_rem_* macros.
.set k_shift_pack_0, 120
.set k_shift_pack_1, 124
; First offset past the last argument.
; NOTE(review): presumably the total kernarg size in bytes -- confirm against
; the kernel descriptor / metadata.
.set k_end, 128

; ---------------------------------------------------------------------------
; SGPR index map. Every s_* symbol is an index into the scalar register file
; and is used as s[s_xxx] (or s[s_xxx+i] for multi-register values).
; Several symbols intentionally share an index with another symbol; those
; aliases are called out below because their safety depends on the order of
; instructions in the kernel body.
; ---------------------------------------------------------------------------
; s[s_ka:s_ka+1] is the base pair of every kernel-argument s_load_*.
.set s_ka, 0
; s_bx / s_by are read but never written before their first use in the
; visible code, so they are inputs set up before entry.
; NOTE(review): presumably the workgroup ids -- confirm against the kernel
; descriptor.
.set s_bx, 2
.set s_by, 3
; Each pointer occupies four SGPRs: +0/+1 receive the 64-bit address from
; s_load_dwordx2, +2/+3 are filled in later by the kernel so that
; s[s_p_xxx:s_p_xxx+3] can be used as the resource operand of buffer_load_*.
.set s_p_in, 4
.set s_p_wei, 8
.set s_p_out, 12
; s_tile_hw .. s_group (indices 16..23) are filled by one s_load_dwordx8.
.set s_tile_hw, 16
.set s_ntile_hw, 17
.set s_hi, 18
.set s_wi, 19
.set s_n, 20
.set s_k, 21
.set s_c, 22
.set s_group, 23
.set s_gemmk_split, 24
; s_magic_0 .. s_magic_7 (indices 32..39) are filled by one s_load_dwordx8;
; s_shift_pack_0/1 by one s_load_dwordx2.
.set s_magic_0, 32
.set s_magic_1, 33
.set s_magic_2, 34
.set s_magic_3, 35
.set s_magic_4, 36
.set s_magic_5, 37
.set s_magic_6, 38
.set s_magic_7, 39
.set s_shift_pack_0, 40
.set s_shift_pack_1, 41
.set s_i_tile_h, 42
.set s_i_tile_w, 43
; ALIAS: s_tile_w shares index 16 with s_tile_hw and s_ntile_w shares index 17
; with s_ntile_hw. The kernel first extracts the hi-16 half into
; s_tile_h / s_ntile_h, then masks the packed register in place to obtain the
; lo-16 half, so the packed value is gone after that point.
.set s_tile_h, 44
.set s_tile_w, 16
.set s_ntile_h, 45
.set s_ntile_w, 17
.set s_sps_hi, 46
.set s_sps_wi, 47
.set s_tile_os_hi, 48
.set s_tile_os_wi, 49
.set s_in_stride_c, 50
.set s_in_stride_hi, 51
.set s_in_stride_n, 52
.set s_in_stride_nb0, 53
.set s_wei_stride_k, 54
.set s_out_stride_k, 55
.set s_out_stride_ho, 56
.set s_out_stride_n, 57
.set s_block_gtc_ig, 58
.set s_block_gtc_ik, 59
.set s_block_gtc_inb, 60
.set s_move_slice_k_stride_gemm_k, 61
.set s_move_slice_k_stride_c, 62
; ALIAS: s_knum shares index 3 with s_by. The kernel writes s_knum only after
; its last read of s_by in the visible prologue.
.set s_knum, 3
.set s_dim_br, 63
.set s_dim_mp, 64
.set s_dim_mr, 65
.set s_dim_np, 66
.set s_dim_nr, 67
.set s_move_slice_k_acc_c, 68
; ALIAS: s_kitr shares index 1 with the upper half of the s[s_ka:s_ka+1]
; argument base pair. It is written only after all argument loads have been
; issued and waited on; the prologue also uses s1 directly as a scratch
; constant before s_kitr is initialised.
.set s_kitr, 1
.set s_0xffff, 69
.set s_in_offset, 70
; s_wei_offset+0 .. s_wei_offset+5 (indices 71..76) hold 2x..7x of
; s_wei_stride_k and are used as the scalar offset of the weight
; buffer_load_dwordx2 instructions.
.set s_wei_offset, 71
; Scratch: the visible code uses s_tmp+0 .. s_tmp+5 (indices 78..83).
.set s_tmp, 78
; ALIAS: s_x_dilation_w / s_y_dilation_h share indices 48 / 49 with
; s_tile_os_hi / s_tile_os_wi. They are not referenced in the visible part of
; this kernel.
.set s_x_dilation_w, 48
.set s_y_dilation_h, 49
; First index past the last SGPR named above.
; NOTE(review): presumably feeds the SGPR count in the kernel descriptor --
; confirm.
.set s_end, 84

; ---------------------------------------------------------------------------
; VGPR index map. Every v_* symbol is an index into the vector register file
; and is used as v[v_xxx] (or v[v_xxx+i] for multi-register values).
; Registers needed only while computing load offsets are re-used under a
; different name for the output stage; those aliases are called out below.
; ---------------------------------------------------------------------------
; Accumulators: v[v_c .. v_c+127] are zeroed with ".v_clear_nc v_c, 128" and
; are the destination operand of the .v_lanegroup_dotx_fp16_8x8x4 macro.
.set v_c, 0
; Operand registers filled by ds_read_b64 in the FMA loop: v_a .. v_a+3 and
; v_b .. v_b+3.
.set v_a, 129
.set v_b, 134
; Global-load staging: 16 registers for weight (eight buffer_load_dwordx2),
; 8 registers for input (four buffer_load_dwordx2). Both are copied out with
; ds_write_b64.
.set v_gld_a, 138
.set v_gld_b, 154
; LDS store (sst) / load (sld) byte offsets for the two operands.
.set v_sst_a_os, 162
.set v_sld_a_os, 163
.set v_sst_b_os, 164
.set v_sld_b_os, 165
; Per-load input state, four entries each (one per input buffer_load):
;   v_in_os[i]        - offset operand of the i-th load
;   v_in_i_hw_list[i] - two indices packed as (hi-16 << 16) | lo-16
;   v_in_flag[i]      - load predicate; the load is issued only in lanes
;                       where the flag is >= 1 (v_cmpx_le_u32 1, flag)
; v_in_flag_n packs one bit per entry (bit i for entry i).
.set v_in_os, 166
.set v_in_i_hw_list, 170
.set v_in_flag, 174
.set v_in_flag_n, 178
.set v_wei_os, 179
.set v_out_os, 180
.set v_gtc_ic, 181
.set v_gtc_iec, 182
.set v_gtc_iy, 183
.set v_gtc_ix, 184
.set v_in_inb, 185
.set v_in_in, 186
.set v_wei_ik, 187
; ALIAS: v_co_sst shares index 186 with v_in_in.
.set v_co_sst, 186
.set v_co_sld, 188
; ALIAS: v_out_flag shares index 187 with v_wei_ik.
.set v_out_flag, 187
; ALIAS: v_out_inb and v_out_ik both share index 185 with v_in_inb.
.set v_out_inb, 185
.set v_out_ik, 185
.set v_gemm_in, 189
.set v_gemm_im, 190
; ALIAS: v_co_sub_m_index shares index 190 with v_gemm_im;
; v_co_sub_n_index, v_out_in and v_coalescing_store_index all share index 189
; with v_gemm_in. Writing any one of them destroys the others.
.set v_co_sub_m_index, 190
.set v_co_sub_n_index, 189
.set v_out_in, 189
.set v_coalescing_store_index, 189
; Scratch: the visible code uses v_tmp+0 .. v_tmp+5 (indices 192..197).
.set v_tmp, 192
; First index past the last VGPR named above.
; NOTE(review): presumably feeds the VGPR count in the kernel descriptor --
; confirm.
.set v_end, 198

.text
.globl igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32
.p2align 8
.type igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32,@function
igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32:
    s_load_dwordx2  s[s_p_in+0:s_p_in+1],   s[s_ka+0:s_ka+1],    0+k_p_in
    s_load_dwordx2  s[s_p_wei+0:s_p_wei+1],   s[s_ka+0:s_ka+1],    0+k_p_wei
    s_load_dwordx2  s[s_p_out+0:s_p_out+1],   s[s_ka+0:s_ka+1],    0+k_p_out
    s_load_dwordx8  s[s_tile_hw+0:s_tile_hw+7],   s[s_ka+0:s_ka+1],    0+k_tile_hw
    s_load_dword  s[s_gemmk_split],   s[s_ka+0:s_ka+1],    0+k_ks
    s_load_dwordx8  s[s_magic_0+0:s_magic_0+7],   s[s_ka+0:s_ka+1],  0+k_magic_0
    s_load_dwordx2  s[s_shift_pack_0+0:s_shift_pack_0+1],   s[s_ka+0:s_ka+1],  0+k_shift_pack_0
    ; wei(1, ce, 1, k-vec-c) thread_lengths: 1x1x1x32, cluster_length: 1x8x1x32, k_pack:4
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_wei_ik], 31, v[v_tmp]
    v_lshrrev_b32 v[v_tmp], 5, v[v_tmp]
    v_and_b32 v[v_gtc_iec], 7, v[v_tmp]

    ; inp(1, ce, nb0, nb1) thread_length: 1x1x4x4, cluster_length: 1x8x1x32, k_pack:4
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_in_inb], 31, v[v_tmp]
    s_mov_b32 s[s_0xffff], 65535
    s_mov_b32 s[s_tmp+1], 255
    s_waitcnt lgkmcnt(0)

    ; calculate index
    s_lshr_b32 s[s_tile_h], s[s_tile_hw], 16
    s_and_b32 s[s_tile_w], s[s_tile_hw], s[s_0xffff]
    s_lshr_b32 s[s_ntile_h], s[s_ntile_hw], 16
    s_and_b32 s[s_ntile_w], s[s_ntile_hw], s[s_0xffff]
    s_lshl_b32 s[s_in_stride_hi], s[s_wi], 2
    s_mul_i32 s[s_in_stride_c], s[s_hi], s[s_in_stride_hi]
    s_mul_i32 s[s_tmp], s[s_in_stride_c], s[s_c]
    s_mul_i32 s[s_in_stride_n], s[s_tmp], s[s_group]
    s_lshl_b32 s[s_out_stride_ho], s[s_wi], 2
    s_mul_i32 s[s_out_stride_k], s[s_hi], s[s_out_stride_ho]
    s_lshr_b32 s[s_tmp+1], s[s_k], 2
    s_mul_i32 s[s_tmp], s[s_tmp+1], s[s_out_stride_k]
    s_mul_i32 s[s_out_stride_n], s[s_tmp], s[s_group]
    s_mul_i32  s[s_tmp], s[s_n], s[s_in_stride_n]
    s_mul_i32  s[s_tmp+1], s[s_n], s[s_out_stride_n]
    s_lshl_b32 s[s_tmp+4], s[s_tmp], 1
    s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
    s_mov_b32 s[s_knum], s[s_c]
    s_mul_i32 s[s_dim_br], s[s_tile_h], s[s_tile_w]
    s_mul_i32 s[s_dim_nr], s[s_n], s[s_dim_br]
    s_add_u32 s[s_tmp+2], 127, s[s_dim_nr]
    s_lshl_b32 s[s_wei_stride_k], s[s_knum], 2
    s_lshr_b32 s[s_dim_np], s[s_tmp+2], 7
    s_add_u32 s[s_tmp], 255, s[s_k]
    s_lshr_b32 s[s_dim_mp], s[s_tmp], 8

    ; gemm_m_per_block:256, gemm_n_per_block:128, source_access_order:0
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_tmp+2,s_bx,s_magic_0,s_tmp+3,s_dim_np,s_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_ss s_tmp+5,s_bx,s_tmp+2,s_magic_1,s_tmp+3,s_dim_mp,s_tmp
    ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
    s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+4], 7
    s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+5], 8
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_ss s_i_tile_w,s_tmp+2,s_bx,s_magic_7,s_tmp+3,s_ntile_w,s_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_ss s_i_tile_h,s_block_gtc_ig,s_tmp+2,s_magic_6,s_tmp+3,s_ntile_h,s_tmp
    ; calculate spatial tiling
    s_mul_i32 s[s_tile_os_hi], s[s_i_tile_h], s[s_tile_h]
    s_sub_u32 s[s_sps_hi], s[s_hi], s[s_tile_os_hi]
    s_cmp_ge_u32 s[s_sps_hi], s[s_tile_h]
    s_cmov_b32 s[s_sps_hi], s[s_tile_h]
    ; calculate spatial tiling
    s_mul_i32 s[s_tile_os_wi], s[s_i_tile_w], s[s_tile_w]
    s_sub_u32 s[s_sps_wi], s[s_wi], s[s_tile_os_wi]
    s_cmp_ge_u32 s[s_sps_wi], s[s_tile_w]
    s_cmov_b32 s[s_sps_wi], s[s_tile_w]
    s_mul_i32 s[s_tmp], s[s_in_stride_hi], s[s_tile_os_hi]
    s_lshl_b32 s[s_tmp+1], s[s_tile_os_wi], 2
    s_add_u32 s[s_tmp], s[s_tmp], s[s_tmp+1]
    s_lshl_b32 s[s_tmp], s[s_tmp], 1
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp] ; accumulate tile offset for input
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], 0
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp] ; accumulate tile offset for output
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0
    v_add_nc_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_vs v_tmp,v_tmp+4,v_tmp+5,s_magic_3,s_tmp+3,s_tile_w,v_tmp+3
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_tmp+1,v_in_in,v_tmp+4,s_magic_2,s_tmp+3,s_tile_h,v_tmp+3
    v_mov_b32 v[v_gtc_ic], v[v_gtc_iec]
    v_cmp_gt_u32  s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+3], 0, 1
    v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp+3]
    s_lshl_b32 s[s_sps_hi], s[s_sps_hi], 16 ; shift to hi-16
    v_lshl_or_b32 v[v_in_i_hw_list], v[v_tmp+1], 16, v[v_tmp]

    ; calculate wei offset
    s_lshl_b32 s[s_tmp+5], s[s_k], 2
    s_mul_i32 s[s_tmp], s[s_tmp+5], s[s_knum]
    s_lshl_b32 s[s_tmp], s[s_tmp], 1
    s_mov_b32 s[s_p_wei+2], s[s_tmp]
    s_mov_b32 s[s_p_wei+3], 0x31014000
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]
    v_add_nc_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik]
    v_lshlrev_b32 v[v_tmp], 2, v[v_gtc_iec]
    v_mul_lo_u32 v[v_tmp+4], s[s_wei_stride_k], v[v_tmp+5]
    v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_tmp+4], 1
    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 5

    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 1

    s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k]
    s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k]
    .v_clear_nc v_gld_a, 16
    ; load weight
    buffer_load_dwordx2 v[v_gld_a+0:v_gld_a+0+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+0] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0

    ; calculate in offset
    s_mov_b32 s[s_in_offset], 0
    s_mov_b32 s[s_p_in+2], 0xffffffff
    s_mov_b32 s[s_p_in+3], 0x31014000
    s_mul_i32 s[s_tmp+2], s[s_c], s[s_in_stride_c]
    s_lshl_b32 s[s_tmp+2], s[s_tmp+2], 1
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]

    v_mul_lo_u32 v[v_tmp+5], s[s_in_stride_n], v[v_in_in]
    v_mul_lo_u32 v[v_tmp+4], s[s_in_stride_c], v[v_gtc_ic]
    v_lshrrev_b32 v[v_tmp], 16, v[v_in_i_hw_list]
    v_and_b32 v[v_tmp+1], s[s_0xffff], v[v_in_i_hw_list]
    v_mul_lo_u32 v[v_tmp+3], s[s_wi], v[v_tmp]
    v_add_nc_u32 v[v_tmp+3], v[v_tmp+1], v[v_tmp+3]
    v_add_nc_u32 v[v_tmp+4], v[v_tmp+4], v[v_tmp+5]
    v_lshlrev_b32 v[v_tmp+3], 2, v[v_tmp+3]
    v_add_lshl_u32 v[v_in_os], v[v_tmp+4], v[v_tmp+3], 1
    v_cmp_gt_u32 s[s_c], v[v_gtc_ic]
    v_cndmask_b32 v[v_tmp+4], 0, 1
    v_bfe_u32 v[v_tmp+1], v[v_in_flag_n],  0, 1
    v_and_b32 v[v_tmp+1], v[v_tmp+4], v[v_tmp+1]
    v_cmp_gt_u32  s[s_sps_hi], v[v_in_i_hw_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1]
    v_cmp_gt_u16  s[s_sps_wi], v[v_in_i_hw_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag]

    s_mov_b32 s1, 32
    v_add_nc_u32 v[v_tmp+3], s1, v[v_in_inb]
    v_add_nc_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp+3]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_vs v_tmp,v_tmp+4,v_tmp+5,s_magic_3,s_tmp+3,s_tile_w,v_tmp+3
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_tmp+1,v_in_in,v_tmp+4,s_magic_2,s_tmp+3,s_tile_h,v_tmp+3
    v_mul_lo_u32 v[v_tmp+5], s[s_in_stride_n], v[v_in_in]
    v_mul_lo_u32 v[v_tmp+4], s[s_in_stride_c], v[v_gtc_ic]
    v_mul_lo_u32 v[v_tmp+3], s[s_wi], v[v_tmp+1]
    v_add_nc_u32 v[v_tmp+3], v[v_tmp], v[v_tmp+3]
    v_add_nc_u32 v[v_tmp+4], v[v_tmp+4], v[v_tmp+5]
    v_lshlrev_b32 v[v_tmp+3], 2, v[v_tmp+3]
    v_add_lshl_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp+3], 1
    v_cmp_gt_u32 s[s_c], v[v_gtc_ic]
    v_cndmask_b32 v[v_tmp+4], 0, 1
    v_cmp_gt_u32  s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+3], 0, 1
    v_lshl_or_b32 v[v_in_i_hw_list+1], v[v_tmp+1], 16, v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp+3], 1, v[v_in_flag_n]
    v_and_b32 v[v_tmp+3], v[v_tmp+4], v[v_tmp+3]
    v_cmp_gt_u32  s[s_sps_hi], v[v_in_i_hw_list+1]
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+3]
    v_cmp_gt_u16  s[s_sps_wi], v[v_in_i_hw_list+1]
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1]
    s_mov_b32 s1, 64
    v_add_nc_u32 v[v_tmp+3], s1, v[v_in_inb]
    v_add_nc_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp+3]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_vs v_tmp,v_tmp+4,v_tmp+5,s_magic_3,s_tmp+3,s_tile_w,v_tmp+3
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_tmp+1,v_in_in,v_tmp+4,s_magic_2,s_tmp+3,s_tile_h,v_tmp+3
    v_mul_lo_u32 v[v_tmp+5], s[s_in_stride_n], v[v_in_in]
    v_mul_lo_u32 v[v_tmp+4], s[s_in_stride_c], v[v_gtc_ic]
    v_mul_lo_u32 v[v_tmp+3], s[s_wi], v[v_tmp+1]
    v_add_nc_u32 v[v_tmp+3], v[v_tmp], v[v_tmp+3]
    v_add_nc_u32 v[v_tmp+4], v[v_tmp+4], v[v_tmp+5]
    v_lshlrev_b32 v[v_tmp+3], 2, v[v_tmp+3]
    v_add_lshl_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp+3], 1
    v_cmp_gt_u32 s[s_c], v[v_gtc_ic]
    v_cndmask_b32 v[v_tmp+4], 0, 1
    v_cmp_gt_u32  s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+3], 0, 1
    v_lshl_or_b32 v[v_in_i_hw_list+2], v[v_tmp+1], 16, v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp+3], 2, v[v_in_flag_n]
    v_and_b32 v[v_tmp+3], v[v_tmp+4], v[v_tmp+3]
    v_cmp_gt_u32  s[s_sps_hi], v[v_in_i_hw_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+3]
    v_cmp_gt_u16  s[s_sps_wi], v[v_in_i_hw_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2]
    s_mov_b32 s1, 96
    v_add_nc_u32 v[v_tmp+3], s1, v[v_in_inb]
    v_add_nc_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp+3]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_vs v_tmp,v_tmp+4,v_tmp+5,s_magic_3,s_tmp+3,s_tile_w,v_tmp+3
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_tmp+1,v_in_in,v_tmp+4,s_magic_2,s_tmp+3,s_tile_h,v_tmp+3
    v_mul_lo_u32 v[v_tmp+5], s[s_in_stride_n], v[v_in_in]
    v_mul_lo_u32 v[v_tmp+4], s[s_in_stride_c], v[v_gtc_ic]
    v_mul_lo_u32 v[v_tmp+3], s[s_wi], v[v_tmp+1]
    v_add_nc_u32 v[v_tmp+3], v[v_tmp], v[v_tmp+3]
    v_add_nc_u32 v[v_tmp+4], v[v_tmp+4], v[v_tmp+5]
    v_lshlrev_b32 v[v_tmp+3], 2, v[v_tmp+3]
    v_add_lshl_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp+3], 1
    v_cmp_gt_u32 s[s_c], v[v_gtc_ic]
    v_cndmask_b32 v[v_tmp+4], 0, 1
    v_cmp_gt_u32  s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp+3], 0, 1
    v_lshl_or_b32 v[v_in_i_hw_list+3], v[v_tmp+1], 16, v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp+3], 3, v[v_in_flag_n]
    v_and_b32 v[v_tmp+3], v[v_tmp+4], v[v_tmp+3]
    v_cmp_gt_u32  s[s_sps_hi], v[v_in_i_hw_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+3]
    v_cmp_gt_u16  s[s_sps_wi], v[v_in_i_hw_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3]
    ; load input, nxe:0
    .v_clear_nc v_gld_b, 8
    v_cmpx_le_u32 1, v[v_in_flag]
    buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+1]
    buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+2]
    buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+3]
    buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1

    v_mov_b32 v[v_tmp+5], v0
    ; dotx mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:2
    v_and_b32 v[v_gemm_in], 7, v[v_tmp+5]           ; lanegroup_n index 
    v_and_b32 v[v_gemm_im], 7, v[v_tmp+5]           ; lanegroup_m index 
    v_lshrrev_b32 v[v_tmp+5], 3, v[v_tmp+5]
    v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5]          ; lanegroup_n_per_wave index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 3, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 1], 3, v[v_tmp+5]          ; lanegroup_m_per_wave index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 3, v[v_gemm_im]
    v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
    v_and_b32 v[v_tmp + 3], 3, v[v_tmp+5]  ; waves_per_m index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 5, v[v_gemm_im]

    v_mov_b32 v[v_tmp+5], v0
    ; dotx mapping, get dst matrix gemm index
    v_and_b32 v[v_tmp+0], 7, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 3, v[v_tmp+5]
    v_and_b32 v[v_tmp+2], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp+3], 3, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
    v_mov_b32 v[v_co_sst], v[v_tmp+0]
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 3, v[v_co_sst]
    v_lshlrev_b32 v[v_co_sld], 3, v[v_tmp+3]
    v_and_b32 v[v_tmp+1], 3, v[v_tmp+5]
    v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld]

    ; LDS store, wei: 1,ce,1,k: 1x1x1x32, 1x8x1x32, k_pack:4, k_pack_gld_a:4, fp16
    v_lshlrev_b32 v[v_tmp+2], 2, v[v_wei_ik]
    v_mad_u32_u24 v[v_tmp], v[v_gtc_iec], 1024, v[v_tmp+2]
    v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]

    v_lshlrev_b32 v[v_sld_a_os], 3, v[v_gemm_im] ; LDS load wei
    ; LDS store, input: 1,ce,nb_vec_c: 1x1x4x4, 1x8x1x32, k_pack:4, k_pack_gld_b:4, fp16
    v_lshlrev_b32 v[v_tmp+2], 2,  v[v_in_inb]
    v_mad_u32_u24 v[v_tmp], v[v_gtc_iec], 512, v[v_tmp+2]
    v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
    v_add_nc_u32 v[v_sst_b_os], 16384, v[v_sst_b_os]

    v_lshlrev_b32 v[v_sld_b_os], 3, v[v_gemm_in] ; LDS load input
    v_add_nc_u32 v[v_sld_b_os], 16384, v[v_sld_b_os]
    v_mov_b32 v[v_gemm_in], v[v_co_sst]
    v_mov_b32 v[v_gemm_im], v[v_co_sld]
    ; init_co_lds_offset for dotx
    v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]    ; shink m by 4
    v_lshlrev_b32 v[v_tmp + 1],  2, v[v_gemm_in]    ; expand n by 4
    v_mad_u32_u24 v[v_co_sst], v[v_tmp], 512, v[v_tmp + 1]    ; macro_tile_n:128, sld_vec:4
    v_lshlrev_b32 v[v_co_sld], 3, v[0]   ; sld vec:4 * byte:2
    v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
    ; init_co_sub_m_index for dotx
    v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0]
    v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] ; expand m by sld_vec:4
    v_lshrrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index] ; fold sub_m by 4
    ; init_co_sub_n_index dotx
    v_and_b32 v[v_co_sub_n_index], 127, v[0]

    ; output offset
    s_lshr_b32 s[s_tmp+3], s[s_k], 2
    s_mul_i32 s[s_tmp+3], s[s_block_gtc_ig],s[s_tmp+3]
    s_lshl_b32 s[s_tmp+4], s[s_out_stride_k], 1
    s_mul_i32 s[s_tmp], s[s_tmp+3], s[s_tmp+4]
    s_mul_hi_u32 s[s_tmp+1], s[s_tmp+3], s[s_tmp+4]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]

    s_lshr_b32 s[s_tmp+1], s[s_block_gtc_ik], 2
    s_lshl_b32 s[s_tmp+4], s[s_out_stride_k], 1
    s_mul_i32 s[s_tmp], s[s_tmp+1], s[s_tmp+4]
    s_mul_hi_u32 s[s_tmp+1], s[s_tmp+1], s[s_tmp+4]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]

    v_add_nc_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_n_index]   ; total n*ho*wo
    ;   compute from n1b
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_tmp+3,v_out_inb,s_magic_3,s_tmp+3,s_tile_w,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_tmp+5,v_out_in,v_tmp+3,s_magic_2,s_tmp+3,s_tile_h,v_tmp
    v_cmp_gt_u32  s[s_n], v[v_out_in]
    v_cndmask_b32 v[v_tmp+3], 0, 1
    v_lshl_or_b32 v[v_tmp], v[v_tmp+5], 16, v[v_tmp+4]
    v_cmp_gt_u32  s[s_sps_hi], v[v_tmp]
    v_cndmask_b32 v[v_out_flag], 0, v[v_tmp+3]
    v_cmp_gt_u16  s[s_sps_wi], v[v_tmp]
    v_cndmask_b32 v[v_out_flag], 0, v[v_out_flag]
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_tmp+5]
    v_add_nc_u32 v[v_tmp+4], v[v_tmp], v[v_tmp+4]
    v_mul_lo_u32 v[v_tmp], v[v_out_in], s[s_out_stride_n]
    v_lshlrev_b32 v[v_tmp], 1, v[v_tmp]
    v_lshl_add_u32 v[v_out_os], v[v_tmp+4], 3, v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_k], v[v_co_sub_m_index]
    v_lshlrev_b32 v[v_tmp], 1, v[v_tmp]
    v_add_nc_u32 v[v_out_os], v[v_out_os], v[v_tmp]
    ;    mask for coalescing store
    v_mov_b32 v[v_coalescing_store_index], v[0]
    ; move slice stride
    s_mul_i32 s[s_move_slice_k_stride_c], s[s_in_stride_c], 16
    s_mov_b32 s[s_move_slice_k_acc_c], 8
    s_mov_b32 s[s_move_slice_k_stride_gemm_k], 64

    s_mov_b32 s[s_p_out+2], 0xffffffff
    s_mov_b32 s[s_p_out+3], 0x31014000
    s_add_i32 s[s_knum], s[s_knum], 7
    s_lshr_b32 s[s_knum], s[s_knum], 3
    s_lshl_b32 s[s_knum], s[s_knum], 3

    ; start FMA loop
    s_waitcnt vmcnt(4)
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] 
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792
    s_waitcnt vmcnt(0)
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] 
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768
    .v_clear_nc v_c, 128
    s_sub_i32 s[s_kitr], s[s_knum], 8
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32_end
    
    v_add_nc_u32 v[v_gtc_ic], s[s_move_slice_k_acc_c], v[v_gtc_ic]
    s_add_u32 s[s_in_offset],  s[s_move_slice_k_stride_c], s[s_in_offset]
    v_add_nc_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os]
    v_cmp_gt_u32 s[s_c], v[v_gtc_ic]
    v_cndmask_b32 v[v_tmp], 0, 1
    v_and_b32 v[v_in_flag], v[v_in_flag], v[v_tmp]
    v_and_b32 v[v_in_flag+1], v[v_in_flag+1], v[v_tmp]
    v_and_b32 v[v_in_flag+2], v[v_in_flag+2], v[v_tmp]
    v_and_b32 v[v_in_flag+3], v[v_in_flag+3], v[v_tmp]

    s_waitcnt lgkmcnt(0)
    s_barrier
    ; load weight
    buffer_load_dwordx2 v[v_gld_a+0:v_gld_a+0+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+0] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0
    ; load input, nxe:0
    .v_clear_nc v_gld_b, 8
    v_cmpx_le_u32 1, v[v_in_flag]
    buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+1]
    buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+2]
    buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+3]
    buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
L_igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32_fma_body:
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:128
    
    
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+0*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+0*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+0*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+1*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+1*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+128
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+2*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+2*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+128
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+3*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+3*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+128
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+4*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+4*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+128
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+5*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+5*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+128
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+6*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+6*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+128
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+7*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+7*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+128
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+7*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+7*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+7*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    
    
    ; ---- loop tail: store the prefetched tile to LDS, advance the k-slice, prefetch the next tile ----
    ; Drain every outstanding LDS read (this also makes v_b+2, needed by the deferred
    ; dotx further down, valid) and synchronize the workgroup before LDS is written.
    ; NOTE(review): presumably the ds_write ops below reuse the LDS region that the
    ; ds_read ops above consumed (single buffer) -- confirm against the v_sst_*/v_sld_* setup.
    s_waitcnt lgkmcnt(0)
    s_barrier
    ; Allow at most 4 vector-memory ops in flight. With the issue order used at the
    ; bottom of this block (8 weight loads, then 4 input loads) this means v_gld_a is
    ; complete. NOTE(review): assumes the loads issued before the first iteration use
    ; the same order -- they are outside this view.
    s_waitcnt vmcnt(4)
    ; store the 8 weight dwordx2 fragments (v_gld_a+0 .. v_gld_a+15), 256 bytes apart
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+0:v_gld_a+0+1] 
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+2:v_gld_a+2+1] offset:256
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+4:v_gld_a+4+1] offset:512
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+6:v_gld_a+6+1] offset:768
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+8:v_gld_a+8+1] offset:1024
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+10:v_gld_a+10+1] offset:1280
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+12:v_gld_a+12+1] offset:1536
    ds_write_b64 v[v_sst_a_os], v[v_gld_a+14:v_gld_a+14+1] offset:1792
    ; every remaining global load (the input fragments in v_gld_b) must have landed
    s_waitcnt vmcnt(0)
    ; store the 4 input dwordx2 fragments (v_gld_b+0 .. v_gld_b+7), 256 bytes apart
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+0:v_gld_b+0+1] 
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+2:v_gld_b+2+1] offset:256
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+4:v_gld_b+4+1] offset:512
    ds_write_b64 v[v_sst_b_os], v[v_gld_b+6:v_gld_b+6+1] offset:768
    ; k-loop control: the counter drops by 8 per tile; once it is no longer > 0
    ; (scc0) skip the prefetch and jump to the finishing label.
    s_sub_i32 s[s_kitr], s[s_kitr], 8
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32_fma_finishing
    
    ; move the slice window: advance the channel index, the input offset and the weight offset
    v_add_nc_u32 v[v_gtc_ic], s[s_move_slice_k_acc_c], v[v_gtc_ic]
    s_add_u32 s[s_in_offset],  s[s_move_slice_k_stride_c], s[s_in_offset]
    v_add_nc_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os]
    ; v_tmp = (s_c > v_gtc_ic) ? 1 : 0 per lane (v_cndmask selects through the implicit vcc),
    ; then AND it into all four input flags so lanes past s_c stop loading.
    v_cmp_gt_u32 s[s_c], v[v_gtc_ic]
    v_cndmask_b32 v[v_tmp], 0, 1
    v_and_b32 v[v_in_flag], v[v_in_flag], v[v_tmp]
    v_and_b32 v[v_in_flag+1], v[v_in_flag+1], v[v_tmp]
    v_and_b32 v[v_in_flag+2], v[v_in_flag+2], v[v_tmp]
    v_and_b32 v[v_in_flag+3], v[v_in_flag+3], v[v_tmp]

    ; Only the 12 ds_write ops issued above can still be pending, so this wait lets
    ; them stay in flight while the first deferred dotx of the last k-step
    ; (accumulator v_c+56) executes.
    s_waitcnt lgkmcnt(12)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ; LDS writes must be complete in every wave before the next iteration reads LDS
    s_waitcnt lgkmcnt(0)
    s_barrier
    ; load weight
    ; 8 x dwordx2 into v_gld_a; the scalar offset selects the fragment
    ; (0, s_wei_stride_k, then s_wei_offset+0 .. s_wei_offset+5)
    buffer_load_dwordx2 v[v_gld_a+0:v_gld_a+0+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    buffer_load_dwordx2 v[v_gld_a+2:v_gld_a+2+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+4:v_gld_a+4+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+0] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+6:v_gld_a+6+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+8:v_gld_a+8+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+10:v_gld_a+10+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+12:v_gld_a+12+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0
    buffer_load_dwordx2 v[v_gld_a+14:v_gld_a+14+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0
    ; load input, nxe:0
    ; Each of the 4 loads is predicated: v_cmpx keeps only lanes whose flag is >= 1,
    ; and exec is reset to all lanes right after the load.
    ; NOTE(review): .v_clear_nc is a macro from outside this view; presumably it zeroes
    ; the 8 VGPRs of v_gld_b so masked-off lanes contribute zeros -- confirm.
    .v_clear_nc v_gld_b, 8
    v_cmpx_le_u32 1, v[v_in_flag]
    buffer_load_dwordx2 v[v_gld_b:v_gld_b+1], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+1]
    buffer_load_dwordx2 v[v_gld_b+2:v_gld_b+2+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+2]
    buffer_load_dwordx2 v[v_gld_b+4:v_gld_b+4+1], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 1, v[v_in_flag+3]
    buffer_load_dwordx2 v[v_gld_b+6:v_gld_b+6+1], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    ; second deferred dotx of the last k-step (accumulator v_c+120), issued after the
    ; global loads so it runs while they are in flight; then start the next iteration
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    s_branch L_igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32_fma_body
L_igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32_fma_finishing:
    ; Loop exit path, taken by the s_cbranch_scc0 above once s_kitr is no longer > 0.
    ; It issues the two dotx ops of the last k-step that the unrolled body left
    ; pending (accumulators v_c+56 and v_c+120), without moving the slice window or
    ; prefetching another tile, and then falls through to the _end label.
    ; On that path at most the 12 ds_write ops issued just before the branch are
    ; outstanding, so lgkmcnt(12) lets them stay in flight during the dotx ops.
    s_waitcnt lgkmcnt(12)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
L_igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32_end:
    ; Final pass over the tile currently in LDS: 8 fully unrolled k-steps, with no
    ; further LDS stores or global loads.
    ;
    ; Per k-step i (i = 0..7):
    ;   * A operands: v_a   <- v_sld_a_os + i*2048
    ;                 v_a+2 <- v_sld_a_os + i*2048 + 1024
    ;   * B operands: 8 fragments at v_sld_b_os + i*1024 + j*128 (j = 0..7), loaded
    ;     alternately into v_b and v_b+2 so one is consumed while the other is in flight
    ;     (the "+1024" read at the end of a step is fragment j=0 of step i+1)
    ;   * 16 dotx macro calls accumulate into v_c+{0,8,..,56} (with v_a) and
    ;     v_c+{64,72,..,120} (with v_a+2)
    ; The A/B operands for step i+1 are requested during the last two dotx ops of step i.
    ; "s_waitcnt lgkmcnt(1)" lets only the most recently issued ds_read stay pending
    ; (LDS operations return in issue order), which is what makes the operand consumed
    ; by the following dotx valid.
    ; NOTE(review): .v_lanegroup_dotx_fp16_8x8x4 is not defined in this view (presumably
    ; in the included utils .inc); its exact register footprint should be confirmed there.
    ;
    ; Wait for pending LDS traffic (e.g. the ds_write ops issued before fma_finishing)
    ; and synchronize the workgroup before reading the tile.
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:128
    
    
    ; ---- k-step 0: 4 reads in flight; lgkmcnt(2) => v_b and v_a ready, lgkmcnt(1) => v_a+2 ready ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+0*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+0*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+0*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+0*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+1*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+1*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+128
    ; ---- k-step 1 ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+1*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+1*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+2*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+2*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+128
    ; ---- k-step 2 ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+2*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+2*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+3*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+3*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+128
    ; ---- k-step 3 ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+3*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+3*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+4*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+4*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+128
    ; ---- k-step 4 ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+4*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+4*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+5*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+5*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+128
    ; ---- k-step 5 ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+5*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+5*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+6*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+6*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+128
    ; ---- k-step 6 ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+6*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+6*1024+1024
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+7*2048+0
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+7*2048+1024
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+128
    ; ---- k-step 7 (last): no operands are prefetched for a following step ----
    s_waitcnt lgkmcnt(2)
    .v_lanegroup_dotx_fp16_8x8x4 v_c,v_a,v_b
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+64,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+7*1024+256
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+8,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+72,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+384
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+16,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+80,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+7*1024+512
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+24,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+88,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+640
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+32,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+96,v_a+2,v_b
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+7*1024+768
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+40,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+104,v_a+2,v_b+2
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+7*1024+896
    s_waitcnt lgkmcnt(1)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+48,v_a,v_b
    .v_lanegroup_dotx_fp16_8x8x4 v_c+112,v_a+2,v_b
    ; nothing else is in flight after this: wait for the final v_b+2 fragment
    s_waitcnt lgkmcnt(0)
    .v_lanegroup_dotx_fp16_8x8x4 v_c+56,v_a,v_b+2
    .v_lanegroup_dotx_fp16_8x8x4 v_c+120,v_a+2,v_b+2
    
    
    
    
    
    ; coalescing store, mapping:c_m:8x1-1x4-1x4-2x1, c_n:1x8-1x2-1x1-8x1, a_m:1x8, a_n:1x8, a_k:2x1
    ; coalescing_groups:2, num_dword_per_group:64, block_size:256
    ; gemm_co_prev_desc:[1, 4, 4, 1, 8, 128], gemm_co_split_lengths:[1, 4, 4, 1, 8, 8, 1, 2, 8, 1], gemm_co_post_desc:[2, 32, 2, 128, 1]
    s_lshl_b32 s[s_out_stride_k], s[s_out_stride_k], 1
    s_barrier
    s_barrier
    v_cvt_f16_f32 v[v_c], v[v_c]
    v_cvt_f16_f32_sdwa v[v_c], v[v_c+1]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+1], v[v_c+2]
    v_cvt_f16_f32_sdwa v[v_c+1], v[v_c+3]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1] 
    v_cvt_f16_f32 v[v_c+4], v[v_c+4]
    v_cvt_f16_f32_sdwa v[v_c+4], v[v_c+5]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+5], v[v_c+6]
    v_cvt_f16_f32_sdwa v[v_c+5], v[v_c+7]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:1024
    v_cvt_f16_f32 v[v_c+8], v[v_c+8]
    v_cvt_f16_f32_sdwa v[v_c+8], v[v_c+9]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+9], v[v_c+10]
    v_cvt_f16_f32_sdwa v[v_c+9], v[v_c+11]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:128
    v_cvt_f16_f32 v[v_c+12], v[v_c+12]
    v_cvt_f16_f32_sdwa v[v_c+12], v[v_c+13]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+13], v[v_c+14]
    v_cvt_f16_f32_sdwa v[v_c+13], v[v_c+15]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:1152
    v_cvt_f16_f32 v[v_c+16], v[v_c+16]
    v_cvt_f16_f32_sdwa v[v_c+16], v[v_c+17]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+17], v[v_c+18]
    v_cvt_f16_f32_sdwa v[v_c+17], v[v_c+19]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+16:v_c+16+1] offset:256
    v_cvt_f16_f32 v[v_c+20], v[v_c+20]
    v_cvt_f16_f32_sdwa v[v_c+20], v[v_c+21]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+21], v[v_c+22]
    v_cvt_f16_f32_sdwa v[v_c+21], v[v_c+23]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+20:v_c+20+1] offset:1280
    v_cvt_f16_f32 v[v_c+24], v[v_c+24]
    v_cvt_f16_f32_sdwa v[v_c+24], v[v_c+25]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+25], v[v_c+26]
    v_cvt_f16_f32_sdwa v[v_c+25], v[v_c+27]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+24:v_c+24+1] offset:384
    v_cvt_f16_f32 v[v_c+28], v[v_c+28]
    v_cvt_f16_f32_sdwa v[v_c+28], v[v_c+29]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+29], v[v_c+30]
    v_cvt_f16_f32_sdwa v[v_c+29], v[v_c+31]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+28:v_c+28+1] offset:1408
    v_cvt_f16_f32 v[v_c+32], v[v_c+32]
    v_cvt_f16_f32_sdwa v[v_c+32], v[v_c+33]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+33], v[v_c+34]
    v_cvt_f16_f32_sdwa v[v_c+33], v[v_c+35]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+32:v_c+32+1] offset:512
    v_cvt_f16_f32 v[v_c+36], v[v_c+36]
    v_cvt_f16_f32_sdwa v[v_c+36], v[v_c+37]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+37], v[v_c+38]
    v_cvt_f16_f32_sdwa v[v_c+37], v[v_c+39]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+36:v_c+36+1] offset:1536
    v_cvt_f16_f32 v[v_c+40], v[v_c+40]
    v_cvt_f16_f32_sdwa v[v_c+40], v[v_c+41]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+41], v[v_c+42]
    v_cvt_f16_f32_sdwa v[v_c+41], v[v_c+43]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+40:v_c+40+1] offset:640
    v_cvt_f16_f32 v[v_c+44], v[v_c+44]
    v_cvt_f16_f32_sdwa v[v_c+44], v[v_c+45]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+45], v[v_c+46]
    v_cvt_f16_f32_sdwa v[v_c+45], v[v_c+47]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+44:v_c+44+1] offset:1664
    v_cvt_f16_f32 v[v_c+48], v[v_c+48]
    v_cvt_f16_f32_sdwa v[v_c+48], v[v_c+49]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+49], v[v_c+50]
    v_cvt_f16_f32_sdwa v[v_c+49], v[v_c+51]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+48:v_c+48+1] offset:768
    v_cvt_f16_f32 v[v_c+52], v[v_c+52]
    v_cvt_f16_f32_sdwa v[v_c+52], v[v_c+53]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+53], v[v_c+54]
    v_cvt_f16_f32_sdwa v[v_c+53], v[v_c+55]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+52:v_c+52+1] offset:1792
    v_cvt_f16_f32 v[v_c+56], v[v_c+56]
    v_cvt_f16_f32_sdwa v[v_c+56], v[v_c+57]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+57], v[v_c+58]
    v_cvt_f16_f32_sdwa v[v_c+57], v[v_c+59]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+56:v_c+56+1] offset:896
    v_cvt_f16_f32 v[v_c+60], v[v_c+60]
    v_cvt_f16_f32_sdwa v[v_c+60], v[v_c+61]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+61], v[v_c+62]
    v_cvt_f16_f32_sdwa v[v_c+61], v[v_c+63]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+60:v_c+60+1] offset:1920
    s_mov_b32 s[s_tmp], 0   ; i_m:0(i_m0:0,i_m1:0, fold_m:4)
    v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_co_sub_m_index]
    v_add_nc_u32 v[v_out_ik], s[s_block_gtc_ik], v[v_co_sub_m_index]
    v_mov_b32 v[v_tmp], v[v_out_ik]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_issues_per_ssgroup:8
    v_cmpx_gt_u32 256, v[v_coalescing_store_index]
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:0
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    ds_read_b64 v[v_c+8:v_c+8+1], v[v_co_sld] offset:8192
    ds_read_b64 v[v_c+10:v_c+10+1], v[v_co_sld] offset:10240
    ds_read_b64 v[v_c+12:v_c+12+1], v[v_co_sld] offset:12288
    ds_read_b64 v[v_c+14:v_c+14+1], v[v_co_sld] offset:14336
    v_cmp_eq_i32 1, v[v_out_flag]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    ;   store to global, m index start:0
    s_waitcnt lgkmcnt(7)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c:v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 2, s[s_out_stride_k]   ; i_m:8(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 8, v[v_out_ik]
    s_waitcnt lgkmcnt(6)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+2:v_c+2+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 4, s[s_out_stride_k]   ; i_m:16(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 16, v[v_out_ik]
    s_waitcnt lgkmcnt(5)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+4:v_c+4+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 6, s[s_out_stride_k]   ; i_m:24(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 24, v[v_out_ik]
    s_waitcnt lgkmcnt(4)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+6:v_c+6+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 8, s[s_out_stride_k]   ; i_m:32(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 32, v[v_out_ik]
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+8:v_c+8+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 10, s[s_out_stride_k]   ; i_m:40(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 40, v[v_out_ik]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+10:v_c+10+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 12, s[s_out_stride_k]   ; i_m:48(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 48, v[v_out_ik]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+12:v_c+12+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 14, s[s_out_stride_k]   ; i_m:56(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 56, v[v_out_ik]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+14:v_c+14+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 16, s[s_out_stride_k]   ; i_m:64(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 64, v[v_out_ik]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_issues_per_ssgroup:8
    v_cmpx_gt_u32 256, v[v_coalescing_store_index]
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:16384
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:18432
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:20480
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:22528
    ds_read_b64 v[v_c+8:v_c+8+1], v[v_co_sld] offset:24576
    ds_read_b64 v[v_c+10:v_c+10+1], v[v_co_sld] offset:26624
    ds_read_b64 v[v_c+12:v_c+12+1], v[v_co_sld] offset:28672
    ds_read_b64 v[v_c+14:v_c+14+1], v[v_co_sld] offset:30720
    v_cmp_eq_i32 1, v[v_out_flag]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    ;   store to global, m index start:0
    s_waitcnt lgkmcnt(7)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c:v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 18, s[s_out_stride_k]   ; i_m:72(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 72, v[v_out_ik]
    s_waitcnt lgkmcnt(6)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+2:v_c+2+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 20, s[s_out_stride_k]   ; i_m:80(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 80, v[v_out_ik]
    s_waitcnt lgkmcnt(5)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+4:v_c+4+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 22, s[s_out_stride_k]   ; i_m:88(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 88, v[v_out_ik]
    s_waitcnt lgkmcnt(4)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+6:v_c+6+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 24, s[s_out_stride_k]   ; i_m:96(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 96, v[v_out_ik]
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+8:v_c+8+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 26, s[s_out_stride_k]   ; i_m:104(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 104, v[v_out_ik]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+10:v_c+10+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 28, s[s_out_stride_k]   ; i_m:112(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 112, v[v_out_ik]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+12:v_c+12+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 30, s[s_out_stride_k]   ; i_m:120(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 120, v[v_out_ik]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+14:v_c+14+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    s_barrier
    v_cvt_f16_f32 v[v_c+64], v[v_c+64]
    v_cvt_f16_f32_sdwa v[v_c+64], v[v_c+65]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+65], v[v_c+66]
    v_cvt_f16_f32_sdwa v[v_c+65], v[v_c+67]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+64:v_c+64+1] 
    v_cvt_f16_f32 v[v_c+68], v[v_c+68]
    v_cvt_f16_f32_sdwa v[v_c+68], v[v_c+69]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+69], v[v_c+70]
    v_cvt_f16_f32_sdwa v[v_c+69], v[v_c+71]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+68:v_c+68+1] offset:1024
    v_cvt_f16_f32 v[v_c+72], v[v_c+72]
    v_cvt_f16_f32_sdwa v[v_c+72], v[v_c+73]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+73], v[v_c+74]
    v_cvt_f16_f32_sdwa v[v_c+73], v[v_c+75]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+72:v_c+72+1] offset:128
    v_cvt_f16_f32 v[v_c+76], v[v_c+76]
    v_cvt_f16_f32_sdwa v[v_c+76], v[v_c+77]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+77], v[v_c+78]
    v_cvt_f16_f32_sdwa v[v_c+77], v[v_c+79]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+76:v_c+76+1] offset:1152
    v_cvt_f16_f32 v[v_c+80], v[v_c+80]
    v_cvt_f16_f32_sdwa v[v_c+80], v[v_c+81]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+81], v[v_c+82]
    v_cvt_f16_f32_sdwa v[v_c+81], v[v_c+83]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+80:v_c+80+1] offset:256
    v_cvt_f16_f32 v[v_c+84], v[v_c+84]
    v_cvt_f16_f32_sdwa v[v_c+84], v[v_c+85]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+85], v[v_c+86]
    v_cvt_f16_f32_sdwa v[v_c+85], v[v_c+87]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+84:v_c+84+1] offset:1280
    v_cvt_f16_f32 v[v_c+88], v[v_c+88]
    v_cvt_f16_f32_sdwa v[v_c+88], v[v_c+89]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+89], v[v_c+90]
    v_cvt_f16_f32_sdwa v[v_c+89], v[v_c+91]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+88:v_c+88+1] offset:384
    v_cvt_f16_f32 v[v_c+92], v[v_c+92]
    v_cvt_f16_f32_sdwa v[v_c+92], v[v_c+93]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+93], v[v_c+94]
    v_cvt_f16_f32_sdwa v[v_c+93], v[v_c+95]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+92:v_c+92+1] offset:1408
    v_cvt_f16_f32 v[v_c+96], v[v_c+96]
    v_cvt_f16_f32_sdwa v[v_c+96], v[v_c+97]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+97], v[v_c+98]
    v_cvt_f16_f32_sdwa v[v_c+97], v[v_c+99]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+96:v_c+96+1] offset:512
    v_cvt_f16_f32 v[v_c+100], v[v_c+100]
    v_cvt_f16_f32_sdwa v[v_c+100], v[v_c+101]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+101], v[v_c+102]
    v_cvt_f16_f32_sdwa v[v_c+101], v[v_c+103]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+100:v_c+100+1] offset:1536
    v_cvt_f16_f32 v[v_c+104], v[v_c+104]
    v_cvt_f16_f32_sdwa v[v_c+104], v[v_c+105]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+105], v[v_c+106]
    v_cvt_f16_f32_sdwa v[v_c+105], v[v_c+107]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+104:v_c+104+1] offset:640
    v_cvt_f16_f32 v[v_c+108], v[v_c+108]
    v_cvt_f16_f32_sdwa v[v_c+108], v[v_c+109]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+109], v[v_c+110]
    v_cvt_f16_f32_sdwa v[v_c+109], v[v_c+111]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+108:v_c+108+1] offset:1664
    v_cvt_f16_f32 v[v_c+112], v[v_c+112]
    v_cvt_f16_f32_sdwa v[v_c+112], v[v_c+113]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+113], v[v_c+114]
    v_cvt_f16_f32_sdwa v[v_c+113], v[v_c+115]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+112:v_c+112+1] offset:768
    v_cvt_f16_f32 v[v_c+116], v[v_c+116]
    v_cvt_f16_f32_sdwa v[v_c+116], v[v_c+117]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+117], v[v_c+118]
    v_cvt_f16_f32_sdwa v[v_c+117], v[v_c+119]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+116:v_c+116+1] offset:1792
    v_cvt_f16_f32 v[v_c+120], v[v_c+120]
    v_cvt_f16_f32_sdwa v[v_c+120], v[v_c+121]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+121], v[v_c+122]
    v_cvt_f16_f32_sdwa v[v_c+121], v[v_c+123]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+120:v_c+120+1] offset:896
    v_cvt_f16_f32 v[v_c+124], v[v_c+124]
    v_cvt_f16_f32_sdwa v[v_c+124], v[v_c+125]  dst_sel:WORD_1
    v_cvt_f16_f32 v[v_c+125], v[v_c+126]
    v_cvt_f16_f32_sdwa v[v_c+125], v[v_c+127]  dst_sel:WORD_1
    ds_write_b64 v[v_co_sst], v[v_c+124:v_c+124+1] offset:1920
    s_mul_i32 s[s_tmp], 32, s[s_out_stride_k]   ; i_m:128(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 128, v[v_out_ik]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_issues_per_ssgroup:8
    v_cmpx_gt_u32 256, v[v_coalescing_store_index]
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:0
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    ds_read_b64 v[v_c+8:v_c+8+1], v[v_co_sld] offset:8192
    ds_read_b64 v[v_c+10:v_c+10+1], v[v_co_sld] offset:10240
    ds_read_b64 v[v_c+12:v_c+12+1], v[v_co_sld] offset:12288
    ds_read_b64 v[v_c+14:v_c+14+1], v[v_co_sld] offset:14336
    v_cmp_eq_i32 1, v[v_out_flag]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    ;   store to global, m index start:128
    s_waitcnt lgkmcnt(7)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c:v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 34, s[s_out_stride_k]   ; i_m:136(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 136, v[v_out_ik]
    s_waitcnt lgkmcnt(6)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+2:v_c+2+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 36, s[s_out_stride_k]   ; i_m:144(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 144, v[v_out_ik]
    s_waitcnt lgkmcnt(5)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+4:v_c+4+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 38, s[s_out_stride_k]   ; i_m:152(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 152, v[v_out_ik]
    s_waitcnt lgkmcnt(4)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+6:v_c+6+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 40, s[s_out_stride_k]   ; i_m:160(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 160, v[v_out_ik]
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+8:v_c+8+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 42, s[s_out_stride_k]   ; i_m:168(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 168, v[v_out_ik]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+10:v_c+10+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 44, s[s_out_stride_k]   ; i_m:176(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 176, v[v_out_ik]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+12:v_c+12+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 46, s[s_out_stride_k]   ; i_m:184(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 184, v[v_out_ik]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+14:v_c+14+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 48, s[s_out_stride_k]   ; i_m:192(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 192, v[v_out_ik]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_issues_per_ssgroup:8
    v_cmpx_gt_u32 256, v[v_coalescing_store_index]
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:16384
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:18432
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:20480
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:22528
    ds_read_b64 v[v_c+8:v_c+8+1], v[v_co_sld] offset:24576
    ds_read_b64 v[v_c+10:v_c+10+1], v[v_co_sld] offset:26624
    ds_read_b64 v[v_c+12:v_c+12+1], v[v_co_sld] offset:28672
    ds_read_b64 v[v_c+14:v_c+14+1], v[v_co_sld] offset:30720
    v_cmp_eq_i32 1, v[v_out_flag]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    ;   store to global, m index start:128
    s_waitcnt lgkmcnt(7)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c:v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 50, s[s_out_stride_k]   ; i_m:200(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 200, v[v_out_ik]
    s_waitcnt lgkmcnt(6)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+2:v_c+2+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 52, s[s_out_stride_k]   ; i_m:208(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 208, v[v_out_ik]
    s_waitcnt lgkmcnt(5)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+4:v_c+4+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 54, s[s_out_stride_k]   ; i_m:216(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 216, v[v_out_ik]
    s_waitcnt lgkmcnt(4)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+6:v_c+6+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 56, s[s_out_stride_k]   ; i_m:224(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 224, v[v_out_ik]
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+8:v_c+8+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 58, s[s_out_stride_k]   ; i_m:232(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 232, v[v_out_ik]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+10:v_c+10+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 60, s[s_out_stride_k]   ; i_m:240(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 240, v[v_out_ik]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+12:v_c+12+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 62, s[s_out_stride_k]   ; i_m:248(i_m0:0,i_m1:0, fold_m:4)
    v_add_nc_u32 v[v_tmp], 248, v[v_out_ik]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_dwordx2 v[v_c+14:v_c+14+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
L_igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32_out:
    s_endpgm
;----------------------------------------------------------
; HSA kernel descriptor for the kernel named on the .amdhsa_kernel line.
; The descriptor is placed in .rodata, aligned to 2^6 = 64 bytes.
; Values here are repeated in the code-object metadata that follows
; (.group_segment_fixed_size, .vgpr_count, .wavefront_size); keep them in sync.
;----------------------------------------------------------
.rodata
.p2align 6
.amdhsa_kernel igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32
    ; LDS (group segment) bytes reserved per workgroup: 32768 = 32 KiB.
    .amdhsa_group_segment_fixed_size 32768
    ; Request the kernarg segment pointer in the user SGPRs.
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    ; Enable the workgroup id system SGPRs for all three dimensions.
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_sgpr_workgroup_id_y 1
    .amdhsa_system_sgpr_workgroup_id_z 1
    ; 0 = only the X work-item id VGPR is provided
    ; (the metadata requires a [256, 1, 1] workgroup).
    .amdhsa_system_vgpr_workitem_id 0
    ; Register usage bounds. 198 equals .vgpr_count in the metadata.
    ; NOTE(review): the metadata reports .sgpr_count 90 versus 84 here;
    ; presumably the difference is additionally reserved SGPRs -- confirm.
    .amdhsa_next_free_vgpr 198
    .amdhsa_next_free_sgpr 84
    ; IEEE mode and DX10 clamp are both turned off.
    .amdhsa_ieee_mode 0
    .amdhsa_dx10_clamp 0
    ; Wave32 execution; matches .wavefront_size: 32 in the metadata.
    .amdhsa_wavefront_size32 1
    ; 1 = workgroup-processor (WGP) mode is enabled.
    .amdhsa_workgroup_processor_mode 1
.end_amdhsa_kernel

;----------------------------------------------------------
; Code-object metadata for the kernel: the text between .amdgpu_metadata and
; .end_amdgpu_metadata is a YAML document, not assembly. Do not put ';'
; comments between the '---' and '...' markers.
;
; Kernarg layout (.kernarg_segment_size: 128 bytes, 8-byte aligned):
;   offset   0 ..  23 : three 8-byte global-buffer pointers (p_in, p_wei, p_out)
;   offset  24 .. 127 : 26 by-value i32 scalars, 4 bytes each
;                       (tile_hw .. move_slice_k, magic_0..7, shift_pack_0..1)
;
; .group_segment_fixed_size (32768), .wavefront_size (32) and .vgpr_count (198)
; repeat the values of the matching .amdhsa_* directives in the kernel
; descriptor; keep both places in sync.
;
; NOTE(review): .sgpr_count is 90 while .amdhsa_next_free_sgpr is 84;
;   presumably the extra 6 are additionally reserved SGPRs -- confirm against
;   the generator.
; NOTE(review): the three pointers are declared .value_type f32 although the
;   kernel name says fp16x4 and the epilogue packs results with v_cvt_f16_f32;
;   confirm that this field is informational only.
;----------------------------------------------------------
.amdgpu_metadata
---
amdhsa.version: [ 1, 0 ]
amdhsa.kernels:
  - .name: igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32
    .symbol: igemm_fwd_gtcn2_nchwc_kcyxc_fp16x4_bx0_ex0_bt256x128x32_lt8x8_lw4x2_lr2x8_ta1x1x1x32_1x8x1x32_tb1x1x4x4_1x8x1x32.kd
    .sgpr_count: 90
    .vgpr_count: 198
    .kernarg_segment_align: 8
    .kernarg_segment_size: 128
    .group_segment_fixed_size: 32768
    .private_segment_fixed_size: 0
    .wavefront_size: 32
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in      , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_wei     , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_out     , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: tile_hw   , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: ntile_hw  , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: hi        , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: wi        , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: n_         , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: k         , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: c         , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: group     , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: gemm_k_split, .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: ho        , .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: wo        , .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: stride_hw , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_hw, .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: pad_hw    , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: wei_hw    , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: move_slice_k, .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
    - { .name: magic_0   , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
    - { .name: magic_1   , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
    - { .name: magic_2   , .size: 4, .offset:  96, .value_kind: by_value, .value_type: i32}
    - { .name: magic_3   , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
    - { .name: magic_4   , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
    - { .name: magic_5   , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
    - { .name: magic_6   , .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
    - { .name: magic_7   , .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
    - { .name: shift_pack_0, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
    - { .name: shift_pack_1, .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata
