diff --git a/common/inc/cpack_common.h b/common/inc/cpack_common.h index b19aaa6..ac998ff 100644 --- a/common/inc/cpack_common.h +++ b/common/inc/cpack_common.h @@ -343,11 +343,24 @@ namespace ckernel::packer TTI_REG2FLOP(2,0,0,0,THCON_SEC1_REG1_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO); TTI_REG2FLOP(2,0,0,0,THCON_SEC1_REG8_Row_start_section_size_ADDR32+2-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::TMP_LO); - uint32_t reconfig_PCK_DEST_RD_CTRL_Read_unsigned = 0; + dest_rd_ctrl_u dest_rd_ctrl; + dest_rd_ctrl.val = 0; + dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Read_32b_data = (pack_src_format == (uint)DataFormat::Int8) | + (pack_src_format == (uint)DataFormat::UInt8) | + (pack_src_format == (uint)DataFormat::Int32) | + (pack_src_format == (uint)DataFormat::Float32) | + (is_fp32_dest_acc_en ? 1 : 0); if (pack_dst_format == (uint)DataFormat::UInt8) { - reconfig_PCK_DEST_RD_CTRL_Read_unsigned = 1; + dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Read_unsigned = 1; + } + //Round to 10 bit mantissa from fp32 dest + if(is_fp32_dest_acc_en && (pack_src_format!=(uint)DataFormat::Float32)) { + dest_rd_ctrl.f.PCK_DEST_RD_CTRL_Round_10b_mant = 1; } - cfg_reg_rmw_tensix(reconfig_PCK_DEST_RD_CTRL_Read_unsigned); + cfg_reg_rmw_tensix + (dest_rd_ctrl.val); if (IS_BFP_FORMAT(pack_dst_format)) { // Override exp section size for packers 1,2,3 diff --git a/llk_lib/llk_math_eltwise_unary_datacopy.h b/llk_lib/llk_math_eltwise_unary_datacopy.h index ca532ad..f2541eb 100644 --- a/llk_lib/llk_math_eltwise_unary_datacopy.h +++ b/llk_lib/llk_math_eltwise_unary_datacopy.h @@ -121,8 +121,8 @@ inline void eltwise_unary_configure_mop(uint rows_per_inst, uint total_rows, con uint innerloop = (rows_per_inst == p_mova2d::MOV_1_ROW) ? total_rows : (total_rows >> 3); uint outerloop = num_faces; - if ((is_fp32_dest_acc_en || is_int_fpu_en) && !(dst_format == (uint)DataFormat::UInt16)) { - //use elwadd to handle unpacking data into src A as fp16, but dest is in fp32 mode + if (((is_fp32_dest_acc_en || is_int_fpu_en) && !(dst_format == (uint)DataFormat::UInt16)) || (dst_format == (uint)DataFormat::UInt8)) { + // use elwadd to handle unpacking data into src A as fp16, but dest is in fp32 mode OR to handle uint8 datums ckernel_template tmp(outerloop, innerloop, TT_OP_ELWADD(0, 0, p_elwise::SRCB_NO_BCAST, ADDR_MOD_2, 0)); tmp.set_end_op(TT_OP_SETRWC(p_setrwc::CLR_AB, 0, 0, 0, 0, p_setrwc::SET_AB)); tmp.program(instrn_buffer); diff --git a/llk_lib/llk_unpack_common.h b/llk_lib/llk_unpack_common.h index 924c21e..d07e6ec 100644 --- a/llk_lib/llk_unpack_common.h +++ b/llk_lib/llk_unpack_common.h @@ -82,6 +82,17 @@ inline void _llk_unpack_config_tile_dim_srcb_impl_(const std::uint32_t face_r_di inline void _llk_unpack_reconfig_data_format_srca_impl_(const std::uint32_t unpack_src_format, const std::uint32_t unpack_dst_format, const std::uint32_t tile_size) { + alu_config_u alu_payload = {.val = 0}; + alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcA = unpack_dst_format; + if ((uint)unpack_src_format == (uint)DataFormat::UInt8) { + alu_payload.f.ALU_FORMAT_SPEC_REG0_SrcAUnsigned = 1; + } + alu_payload.f.ALU_ACC_CTRL_INT8_math_enabled = ((uint)unpack_dst_format == (uint)DataFormat::Int8) || + ((uint)unpack_dst_format == (uint)DataFormat::UInt8) || + ((uint)unpack_dst_format == (uint)DataFormat::Int32); + constexpr uint alu_mask = ALU_FORMAT_SPEC_REG0_SrcA_MASK | ALU_FORMAT_SPEC_REG0_SrcAUnsigned_MASK | ALU_ACC_CTRL_INT8_math_enabled_MASK; + cfg_reg_rmw_tensix(alu_payload.val); + cfg_reg_rmw_tensix(unpack_src_format); cfg_reg_rmw_tensix(unpack_dst_format); TT_SETDMAREG(0, LOWER_HALFWORD(tile_size), 0, LO_16(p_gpr_unpack::TILE_SIZE_A)); // update gpr which holds tile size A