#ifndef __SLICE_16B_IMPL_H__
#define __SLICE_16B_IMPL_H__

#include "aie_api/utils.hpp"
#include "aie_api/aie.hpp"
#include "common/api_loop_pipe_helper.hpp"
#include "common/ml_params.h"

// Runtime parameters handed to slice_16b_c6463.
struct SliceParams
{
    // Trip count of the kernel's main loop. Every iteration loads four
    // 32-element vectors from the input and stores one 32-bit word to the
    // second output.
    uint32_t outer_loop;
};

/*
This kernel slices a (B,Y,X,64) tensor into (B,Y,X,63) and (B,Y,X,1), given a BYXC data order.
in_ptr   : pointer to the input buffer; it needs to be 512-bit aligned. Use the same type T for bf16/int16/fp16 to avoid multiple instances of the kernel.
out1_ptr : output pointer for (B,Y,X,63). Due to the way the fifo stores work, the kernel writes 4 bytes more than necessary, so this buffer needs room in L1 for the B*Y*X*63 output elements plus 4 extra bytes.
           If the second sliced buffer is placed directly after the first, these 4 additional bytes fall into it and are written correctly, so no additional space is needed.
           Example: for (1,1,64,64) -> out_buf1 (1,1,64,63) and out_buf2 (1,1,64,1) can be placed one after another.
out2_ptr : output pointer for (B,Y,X,1); there are no special requirements.
*/
// Splits 64-channel 16-bit data into a 63-channel stream (out1) and a 1-channel stream (out2).
// Template parameters:
//   loop_range : first argument handed to chess_loop_range for the main loop
//                (presumably the guaranteed minimum trip count - TODO confirm), default 5
//   T          : element type of the three buffer pointers; the body reinterprets the buffers
//                as int16 / int32 and only uses sizeof(T) and aie::vector<T,32>
//                NOTE(review): vectors loaded through an int16 pointer are assigned to
//                aie::vector<T,32>, which looks like it requires T == int16 - confirm.
// Parameters:
//   in_ptr   : input buffer, read 32 elements at a time (4 vector loads per iteration)
//   out1_ptr : destination of the fifo stores (channels 0..62 of every pixel)
//   out2_ptr : destination of channel 63, written as one int32 (two pixels) per iteration
//   params   : params.outer_loop is the number of main-loop iterations
template< unsigned loop_range=5, typename T>
__attribute__ ((always_inline))
void slice_16b_c6463 (
        T * in_ptr,
        T * out1_ptr,
        T * out2_ptr,
        const SliceParams &params
) {

// Typed views of the buffers: input and out1 are accessed as 16-bit elements,
// out2 as 32-bit words because two 16-bit values are stored per iteration.
int16 __aie_dm_resource_a * p_in = (int16 __aie_dm_resource_a *) in_ptr;
int16 __aie_dm_resource_b * restrict p_out1 = (int16 __aie_dm_resource_b *) out1_ptr;
int32 __aie_dm_resource_b * restrict p_out2 = (int32 __aie_dm_resource_b *) out2_ptr;

// Second pointer to the start of out2; p_out2 is advanced inside the loop, this one is
// used for the final rewrite of the first word after the loop.
// NOTE(review): p_out2 and c_out2 are both restrict-qualified, hold the same address and
// both store to the first word of out2 - confirm this is safe with the compiler's alias analysis.
int32 __aie_dm_resource_b * restrict c_out2 = (int32 __aie_dm_resource_b *) out2_ptr;

// State shared by the fifo_st_push / fifo_st_flush_1d_byte calls, starting at position 0.
fifo_state_t fS;
fS.pos = 0;
// Scratch used to pack two 16-bit channel-63 values into one 32-bit store.
int32 tmp_val;
int16 tmp16_val[2];

//Loading a X2C64 and slicing it to X2C63 and C2
//Generating X2C63 for store interface of 32 bit
//Shuffles are used for the gap between X1C63 and X2C63
// The negated size wraps around in the unsigned variable (two elements of T, i.e. -4 for a 2-byte T).
// It is passed to every flush; presumably it steps the out1 store address back over the
// surplus bytes of the last push - TODO confirm against fifo_st_flush_1d_byte.
unsigned slice_addr = -sizeof(T)*2;
// NOTE(review): i is a signed int compared against the uint32_t params.outer_loop.
for ( int i=0; i<params.outer_loop; i++ ) chess_prepare_for_pipelining chess_loop_range(loop_range,){
    // Elements 0..31 of the first pixel go out unchanged.
    aie::vector<T,32> ld_v = aie::load_v<32>(p_in); p_in+=32;
    fifo_st_push( (v32int16 __aie_dm_resource_b *& restrict) p_out1, (v32int16) ld_v, fS );
    // Elements 32..63 of the first pixel; lane 31 is its channel 63, kept for out2.
    ld_v = aie::load_v<32>(p_in); p_in+=32;
    tmp16_val[0] = ld_v.get(31);
    // Elements 0..31 of the second pixel.
    aie::vector<T,32> ld2_v = aie::load_v<32>(p_in); p_in+=32;
    // Rotate up by one, then shift down by one while filling from ld2_v: assuming the
    // documented AIE API shuffle semantics this yields lanes 0..30 of ld_v followed by
    // lane 0 of ld2_v, i.e. channel 63 of the first pixel is replaced by channel 0 of the second.
    aie::vector<T,32> ld3_v = aie::shuffle_down_fill(aie::shuffle_up_rotate(ld_v,1), ld2_v,1);
    fifo_st_push( (v32int16 __aie_dm_resource_b *& restrict) p_out1, (v32int16) ld3_v, fS );
    // Elements 32..63 of the second pixel.
    ld_v = aie::load_v<32>(p_in); p_in+=32;
    // Second pixel shifted by one lane: lanes 1..31 of ld2_v followed by lane 0 of ld_v.
    ld3_v = aie::shuffle_down_fill(ld2_v, ld_v,1);
    fifo_st_push( (v32int16 __aie_dm_resource_b *& restrict) p_out1, (v32int16) ld3_v, fS );
    // Lanes 1..31 of ld_v followed by its own lane 0. The last two lanes (channel 63 of the
    // second pixel and the fill lane) are not part of the 63-channel output; presumably these
    // are the 4 surplus bytes covered by slice_addr - TODO confirm.
    ld3_v = aie::shuffle_down_fill(ld_v, ld_v ,1);
    fifo_st_push( (v32int16 __aie_dm_resource_b *& restrict) p_out1, (v32int16) ld3_v, fS );
    fifo_st_flush_1d_byte( (v32int16 __aie_dm_resource_b *& restrict) p_out1, fS, slice_addr );

    // Channel 63 of the second pixel; both saved values are stored to out2 as one 32-bit word.
    tmp16_val[1] = ld_v.get(31);
    // NOTE(review): reads the int16[2] array through an int32 pointer - relies on the array
    // being 4-byte aligned and on the compiler tolerating this type punning; confirm.
    tmp_val = *((int32*) tmp16_val);
    *p_out2++ = tmp_val;
}
//writing the output for the first two elements again
//this is done for the case when output_buf2 is located directly after output_buf1
// in_ptr[63] and in_ptr[127] are channel 63 of the first two pixels, the same values the
// first loop iteration stored to the first word of out2.
// NOTE(review): this runs even when params.outer_loop is 0, reading in_ptr[127] and writing
// 4 bytes to out2 - confirm callers never pass 0.
tmp16_val[0] = in_ptr[63];
tmp16_val[1] = in_ptr[127];
*c_out2 = *((int32*) tmp16_val);

}

#endif //__SLICE_16B_IMPL_H__