#ifndef QDQ_INT16_BFLOAT16_HPP
#define QDQ_INT16_BFLOAT16_HPP

#include "qdq/qdq_kernel_helpers.h"
#include "qdq/qdq.cc"
#include <adf.h>
#include <aie_api/aie.hpp>
#include <aie_api/utils.hpp>

/*
 * Quantization wrapper around q_dynamic().
 *
 * dq_in      : input buffer, reinterpreted as bfloat16* before the call.
 * q_out      : output buffer, handed to q_dynamic() unchanged.
 * num_elems  : element count forwarded to q_dynamic().
 * zero_point : raw 16-bit zero point; reinterpreted as a signed 16-bit
 *              value when `sign` is true, used as unsigned otherwise.
 * inv_scale  : forwarded to q_dynamic() as-is (presumably the reciprocal
 *              of the quantization scale -- confirm against q_dynamic).
 * sign       : forwarded to q_dynamic(); also selects how zero_point is
 *              widened to int.
 * q_enable   : quantization runs by default; a wrapper may pass false to
 *              turn this call into a no-op.
 * is_int16   : forwarded to q_dynamic() as-is.
 */
void __attribute__((noinline)) quant_bf16_to_int16(
    int8_t* restrict dq_in, int8_t* restrict q_out, int num_elems,
    uint16 zero_point, bfloat16 inv_scale, bool sign = true, bool q_enable = true, bool is_int16 = true)
{
    // Nothing to do when the caller has switched quantization off.
    if (!q_enable) {
        return;
    }

    // Widen the zero point to int: sign-extend for signed data,
    // zero-extend otherwise.
    int zp = zero_point;
    if (sign) {
        zp = static_cast<int16_t>(zero_point);
    }

    q_dynamic((bfloat16 *)dq_in, inv_scale, zp, sign, is_int16, q_out, num_elems);
}

/*
 * Dequantization wrapper around dq_dynamic().
 *
 * q_in       : input buffer, handed to dq_dynamic() unchanged.
 * dq_out     : output buffer, reinterpreted as bfloat16* before the call.
 * num_elems  : element count forwarded to dq_dynamic().
 * zero_point : raw 16-bit zero point; reinterpreted as a signed 16-bit
 *              value when `sign` is true, used as unsigned otherwise.
 * scale      : forwarded to dq_dynamic() as-is.
 * sign       : forwarded to dq_dynamic(); also selects how zero_point is
 *              widened to int.
 * dq_enable  : dequantization runs by default; a wrapper may pass false
 *              to turn this call into a no-op.
 * is_int16   : forwarded to dq_dynamic() as-is.
 */
void  __attribute__((noinline)) dequant_int16_to_bf16(
    int8_t* q_in, int8_t* dq_out, int num_elems,
    uint16 zero_point, bfloat16 scale, bool sign = true, bool dq_enable = true, bool is_int16 = true)
{
    // Nothing to do when the caller has switched dequantization off.
    if (!dq_enable) {
        return;
    }

    // Widen the zero point to int: sign-extend for signed data,
    // zero-extend otherwise.
    int zp = zero_point;
    if (sign) {
        zp = static_cast<int16_t>(zero_point);
    }

    dq_dynamic(q_in, sign, is_int16, zp, scale, (bfloat16 *)dq_out, num_elems);
}

/*
 * Standlone OP for NEGATIVE
 */

void  neg_int16(
    int8_t* restrict dq_in, int8_t* restrict q_out, int num_elems)
{
    int loop_count = num_elems / 32;

    v32uint16*   v_in  = (v32uint16*)(dq_in);
    v32uint16*   v_out = (v32uint16*)(q_out);
    for (int i = 0; i < loop_count; ++i)
    chess_loop_range(6,)
    //chess_no_hw_loop
    chess_prepare_for_pipelining
    {
        *v_out++ = neg(*v_in++);
    }
}

#endif // QDQ_INT16_BFLOAT16_HPP