
#ifndef __QDQADD_H__
#define __QDQADD_H__

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <aie_api/aie.hpp>
#include <aie_api/utils.hpp>
#include <vector>

// Runtime parameters consumed by the qdqadd kernel.
// NOTE(review): field order/types define the memory layout; presumably this
// layout is shared with whoever fills the struct in - confirm before reordering.
struct qdqadd_params {
    // Shift passed to to_vector_sign() when the 64-bit accumulator is
    // converted to the int16 output vector.
    int32_t shft_o;
    // Constant broadcast into every accumulator lane before the two
    // multiply-accumulate steps.
    int64 c0;
    // Coefficient multiplied with the samples read from in1.
    int32_t c1;
    // Coefficient multiplied with the samples read from in0.
    int32_t c2;

    // Number of kernel loop iterations; each iteration stores 32 int16 values.
    int32_t inner_loop;
    // step_ci, step_xi, step_reset, num_ox and num_oc are forwarded as
    // dims_3d_from_steps(num_ox, step_xi, num_oc, step_ci, step_reset) to build
    // the 3D stepping descriptor that advances the in1 pointer.
    // NOTE(review): names suggest x-position / channel counts and byte steps -
    // confirm exact meaning against dims_3d_from_steps.
    int32_t step_ci;
    int32_t step_xi;
    int32_t step_reset;
    int32_t num_ox;
    int32_t num_oc;
    // Only read when qdqadd is instantiated with replication == false:
    // 0 selects a full 16-element load of in1, any other value loads 4 elements
    // and replicates them to 16.
    int8_t channel_v = 1;
    // Sign flag forwarded to mac_elem_16() for both inputs and to
    // to_vector_sign() for the output (presumably 1 = signed - TODO confirm).
    int8_t ifm_sign = 1;
};

/*
Builds a 32-lane acc64 accumulator in which every lane holds `val`.

The value is first broadcast to 8 lanes and reinterpreted as an 8-lane
accumulator; that accumulator is then inserted into each of the four
8-lane sections (indices 0..3) of the 32-lane result, so every lane is
written before the result is returned.

Marked `inline` because this is a non-template function defined in a
header: without it, including the header from more than one translation
unit yields multiple definitions of the same symbol.
*/
__attribute__((always_inline))
inline v32acc64 broadcast_acc64( int64 val ){
    v8acc64 lanes8 = (v8acc64) broadcast_s64(val);
    // Intentionally left uninitialized: all four sections are overwritten below.
    v32acc64 c0_accum;
    c0_accum = insert(c0_accum, 3, lanes8);
    c0_accum = insert(c0_accum, 2, lanes8);
    c0_accum = insert(c0_accum, 1, lanes8);
    c0_accum = insert(c0_accum, 0, lanes8);
    return c0_accum;
}

/*
__attribute__((always_inline))
v32acc64 broadcast_acc64_2( int64 val ){
    aie::accum<acc64,32> load_c0 = aie::broadcast<acc64,32>( val );
    return load_c0;
}
*/

/*
Returns a vector of `len` cint32 elements, each with real part `val` and
imaginary part 0.

The complex value is built through its own members instead of casting an
int32_t[2] array to cint32*: the pointer cast violated strict aliasing and
gave no guarantee that the array met the alignment of cint32.
*/
template <int len = 16>
__attribute__((always_inline))
auto broadcast_coeff( int32 val ){
    cint32 coeff;
    coeff.real = val;
    coeff.imag = 0;
    return aie::broadcast<cint32,len>( coeff );
}

/*
loop Range = minimum iterations inner loop (9 are optimal)
replication = if true, and input1 is a channel vector, C8 is assumed to be duplicated like this 
(C0...CN) -> (C0...C7,C0...C7,C0...C7,C0...C7,C8...C15,C8...C15,C8...C15,C8...C15,...,CN-8...CN,CN-8...CN,CN-8...CN,CN-8...CN)

*/
template <int loop_range = 9, bool replication = true>
void qdqadd(int* in0, int* in1, qdqadd_params &params, int* out)
{

    aie::vector<cint32,16> c1_vec = broadcast_coeff(params.c1);
    aie::vector<cint32,16> c2_vec = broadcast_coeff(params.c2);
    aie::accum<cacc64,16> c0_accum_cint = (v16cacc64) broadcast_acc64(params.c0);
    dims_3d_t ifm2_steps = dims_3d_from_steps(params.num_ox, params.step_xi, params.num_oc, params.step_ci, params.step_reset);
    cint16 * restrict in0_ptr = (cint16 * restrict ) in0;
    cint16 * restrict in1_ptr = (cint16 * restrict ) in1;
    int16  * restrict  out_ptr = (int16 * restrict ) out;
    const unsigned vsize = 16;
    auto loop_func = [&](bool rep) __attribute__(( always_inline )){
        for(unsigned i = 0; i < params.inner_loop; i++)
        //chess_prepare_for_pipelining
        //chess_loop_range( loop_range, )
        {
            aie::vector<cint16,16> actv0, actv1;
            actv0 = aie::load_v<vsize>( in0_ptr); in0_ptr += vsize;
            if (rep == true){
                actv1 = aie::load_v<vsize>( in1_ptr); in1_ptr = add_3d_byte(in1_ptr, ifm2_steps);
            } else {
                actv1 = aie::load_v<vsize/4>( in1_ptr).grow_replicate<16>(); in1_ptr = add_3d_byte(in1_ptr, ifm2_steps);
            }
            aie::accum<cacc64,16> result = mac_elem_16(c2_vec, true, actv0, params.ifm_sign, c0_accum_cint);
            result =  mac_elem_16(c1_vec, true, actv1, params.ifm_sign, result);
            aie::accum<acc64,32> add_accum = (v32acc64) result;
            aie::vector<int16,32> store_vector = add_accum.template to_vector_sign<int16>( params.ifm_sign, params.shft_o );
            aie::store_v(out_ptr, store_vector ); out_ptr += 32;
        }
    };

    if constexpr (replication == 1)
    {
        loop_func(true);
    } else {
        if(params.channel_v == 0)
            loop_func(true);
        else
            loop_func(false);
    }
}


#endif

