#ifndef __CONCAT_16B_IMPL_H__
#define __CONCAT_16B_IMPL_H__

#include "aie_api/utils.hpp"
#include "aie_api/aie.hpp"
#include "common/api_loop_pipe_helper.hpp"
#include "common/ml_params.h"

// Runtime parameters of the concat kernel.
struct ConcatParams {
    // Number of iterations of the main loop in concat_16b_c6463. Each iteration
    // writes two 32-lane 16-bit vectors (64 elements) to the output.
    // NOTE(review): presumably equal to B*Y*X of the tensors - confirm against the caller.
    uint32_t outer_loop;
};


/*
The Kernel is performing a concat of (B,Y,X,63) and (B,Y,X,1) to (B,Y,X,64) given a BYXC data order
in_ptr : input pointer for (B,Y,X,63), use the same Type T for bf16/int16/fp16 to avoid multiple instances of the kernel
in_ptr : output pointer for (B,Y,X,1),
out_ptr : output pointer for (B,Y,X,64),  needs to be 512 bit aligned
*/
template< unsigned loop_range=8, typename T>
__attribute__ ((always_inline))
void concat_16b_c6463(
        T * in1_ptr,
        T * in2_ptr,
        T * out_ptr,
        const ConcatParams &params
) {

v32int16 __aie_dm_resource_a * p_in1 = (v32int16 __aie_dm_resource_a *) in1_ptr;
int16 __aie_dm_resource_a * p_in2 = (int16 __aie_dm_resource_a *) in2_ptr;
v32int16 __aie_dm_resource_b * p_out = (v32int16 __aie_dm_resource_b *) out_ptr;

fifo_state_t fS;
fS.pos = 0;

for ( int i=0; i<params.outer_loop; i++ ) chess_prepare_for_pipelining chess_loop_range(loop_range,){
    fifo_ld_fill( p_in1, fS );
    aie::vector<int16,32> a0 = fifo_ld_pop( p_in1, fS );
    aie::vector<int16,32> a1 = fifo_ld_pop_1d_byte( p_in1, fS, -2 );
    a1 = aie::select( a1, *p_in2++, aie::mask<32>().from_uint32(  0x80000000 ));
    *p_out++ = a0;
    *p_out++ = a1;
}

}

#endif // __CONCAT_16B_IMPL_H__