#ifndef RUN_MATADD_WRAPPER_CC
#define RUN_MATADD_WRAPPER_CC

#include "broadcast/add2d_bf16x16.hpp"
#include "broadcast/add2d_bf16x16_impl.hpp"
#include "common.hh"
#include "q/q_impl.hpp"
#include "dq/dq_impl.hpp"

// Per-layer parameter block for run_matadd_16(). The kernel obtains it by
// casting KernelArgs::params_data directly to this struct, so the field order,
// field widths and 1-byte packing form a binary layout contract with whatever
// produces that buffer — do not reorder, resize or insert fields.
#pragma pack(push,1)
struct matadd16_layer_params{
    // Byte distance from the start of operand A to the start of operand B
    // inside the channel-0 input buffer.
    uint32_t offset_bytes;
    // Byte offset, measured from the start of the BinaryQDQParams block, of the
    // scratch area used for the quantize coefficients.
    uint32_t core_qbuf_offset;
    // Byte offset, measured from the start of the BinaryQDQParams block, of the
    // scratch area used for the dequantize coefficients.
    uint32_t core_dqbuf_offset;
    // Copied into both KernelDqParam::inner_g and KernelQParam::inner_g.
    // NOTE(review): presumably an inner loop/group count for the QDQ kernels — confirm.
    uint32_t qdq_inner_g;
    // Forwarded as the last argument of dq_float16_v32() for both operands.
    uint32_t is_input_16_bit;
    // Forwarded as the last argument of q_float16_to_int16_v32().
    uint32_t is_output_16_bit;
    // Copied into KernelDqParam::sign_A (used when dequantizing both A and B).
    // NOTE(review): presumably the signedness flag of the inputs — confirm.
    uint32_t sign_A;
    // Copied into KernelQParam::sign_O.
    // NOTE(review): presumably the signedness flag of the output — confirm.
    uint32_t sign_O;
    // Only outer_loop, dims_x and dims_y are read by run_matadd_16(); the kernel
    // builds its own Add2dBf16x16Params from them with the fourth constructor
    // argument forced to 0.
    Add2dBf16x16Params add_params;
};
#pragma pack(pop)

/**
 * Kernel entry point: dequantize two operands, add them, quantize the result.
 *
 * Buffers taken from @p args:
 *   - params_data      : matadd16_layer_params describing this layer.
 *   - s2mm_ch0_port    : operand A at the start of the buffer, operand B
 *                        offset_bytes after it.
 *   - s2mm_ch1_port    : BinaryQDQParams (zero-points, scales, enable flags);
 *                        the quantize/dequantize coefficient scratch areas are
 *                        addressed relative to this block.
 *   - mm2s_ch0_port    : output buffer.
 *
 * All three stages work in place: each operand is dequantized over its own
 * storage, the sum is written to the output buffer, and the output buffer is
 * then quantized over itself.
 *
 * @param args Kernel argument bundle providing the ports and parameter block.
 */
void run_matadd_16(KernelArgs& args)
{
    matadd16_layer_params* cfg = static_cast<matadd16_layer_params*>(args.params_data);

    // Raw operand / result storage as delivered by the ports.
    uint16_t* lhs_raw = static_cast<uint16_t*>(args.s2mm_ch0_port->data());
    uint16_t* rhs_raw = byte_incr(lhs_raw, cfg->offset_bytes);
    uint16_t* out_raw = static_cast<uint16_t*>(args.mm2s_ch0_port->data());

    // The same storage viewed as the floating-point type the add kernel consumes.
    QDQFloatType* lhs_f = reinterpret_cast<QDQFloatType*>(lhs_raw);
    QDQFloatType* rhs_f = reinterpret_cast<QDQFloatType*>(rhs_raw);
    QDQFloatType* out_f = reinterpret_cast<QDQFloatType*>(out_raw);

    BinaryQDQParams* qdq = reinterpret_cast<BinaryQDQParams*>(args.s2mm_ch1_port->data());

    // Coefficient scratch areas, located at byte offsets from the QDQ block.
    v32accfloat* quant_coeff   = reinterpret_cast<v32accfloat*>(byte_incr(qdq, cfg->core_qbuf_offset));
    v32accfloat* dequant_coeff = reinterpret_cast<v32accfloat*>(byte_incr(qdq, cfg->core_dqbuf_offset));

    KernelDqParam dequant_cfg;
    dequant_cfg.inner_g = cfg->qdq_inner_g;
    dequant_cfg.sign_A  = cfg->sign_A;

    // --- Stage 1a: dequantize operand A in place -------------------------------
    // Zero-point goes to slot 0 and scale to slot 2 of the coefficient buffer.
    dequant_coeff[0] = broadcast_to_v32accfloat(qdq->dq_a_zp);
    dequant_coeff[2] = broadcast_to_v32accfloat(qdq->dq_a_sc);
    dq_float16_v32(reinterpret_cast<int8_t*>(lhs_raw),
                   reinterpret_cast<float*>(dequant_coeff),
                   lhs_f,
                   dequant_cfg,
                   qdq->dq_enable,
                   cfg->is_input_16_bit);

    // --- Stage 1b: dequantize operand B in place -------------------------------
    // The coefficient buffer is reused, so it must be reloaded only after the
    // call for operand A has finished.
    dequant_coeff[0] = broadcast_to_v32accfloat(qdq->dq_b_zp);
    dequant_coeff[2] = broadcast_to_v32accfloat(qdq->dq_b_sc);
    dq_float16_v32(reinterpret_cast<int8_t*>(rhs_raw),
                   reinterpret_cast<float*>(dequant_coeff),
                   rhs_f,
                   dequant_cfg,
                   qdq->dq_enable,
                   cfg->is_input_16_bit);

    // --- Stage 2: element-wise add ---------------------------------------------
    // Fourth constructor argument is is_sub; 0 selects addition.
    Add2dBf16x16Params add_cfg(cfg->add_params.outer_loop,
                               cfg->add_params.dims_x,
                               cfg->add_params.dims_y,
                               0);
    add2d_bf16x16_nontemplatized(0, lhs_f, rhs_f, out_f, add_cfg);

    // --- Stage 3: quantize the sum in place ------------------------------------
    quant_coeff[0] = broadcast_to_v32accfloat(qdq->q_zp);
    quant_coeff[2] = broadcast_to_v32accfloat(qdq->q_sc);

    KernelQParam quant_cfg;
    quant_cfg.inner_g = cfg->qdq_inner_g;
    quant_cfg.sign_O  = cfg->sign_O;

    q_float16_to_int16_v32(out_f,
                           reinterpret_cast<float*>(quant_coeff),
                           reinterpret_cast<int16*>(out_raw),
                           quant_cfg,
                           qdq->q_enable,
                           cfg->is_output_16_bit);
}

#endif
