#ifndef __MAXPOOL_INT16X16_IMPL_HPP__
#define __MAXPOOL_INT16X16_IMPL_HPP__

#include "aie_api/aie.hpp"
#include "aie_api/utils.hpp"
#include "api_loop_pipe_helper.hpp"
#include "maxpool_int16x16.hpp"

using namespace aie;

inline void maxpool_int16x16( int * ifm, int * __restrict ofm, const MaxpoolInt16x16Params &params )
{
    int16 * pI = ( int16 * ) ifm;
    int16 * pO = ( int16 * ) ofm;

    const MaxpoolInt16x16Params::Control ctrl = params.ctrl;

    dims_3d_t dimsA = params.dimsA.instantiate( );

    for ( unsigned j = 0; j < params.outer_loop; j++ )
        chess_prepare_for_pipelining
        chess_loop_range( 2, )
    {
        vector<int16, 32> Xbuff0, Xbuff1, Xbuff2;
        vector<int16, 32> interleave_0, interleave_1;
        vector<int16, 32> shift_buff_0, shift_buff_1;
        vector<int16, 32> Obuff;

        Obuff = broadcast<int16, 32>( params.min_value );

        aie::pipelined_loop<2>( params.inner_loop, [&]( auto j ) __aie_inline {
            Xbuff0 = load_v<32>( pI    );
            Xbuff1 = load_v<32>( pI+32 );
            Xbuff2 = load_v<32>( pI+64 );

            interleave_0 = shuffle( Xbuff0, Xbuff1, params.shfl_0 );
            interleave_1 = shuffle( Xbuff0, Xbuff1, params.shfl_1 );

            shift_buff_0 = shuffle_down_fill( interleave_1, Xbuff1, params.shft_0 );
            shift_buff_1 = shuffle_down_fill( interleave_0, Xbuff2, params.shft_1 );
            shift_buff_1 = shuffle_down_fill( shift_buff_1, Xbuff1, params.shft_2 );

            Obuff = max( Obuff, interleave_0, ctrl.sign );
            Obuff = max( Obuff, shift_buff_0, ctrl.sign );
            Obuff = max( Obuff, shift_buff_1, ctrl.sign );

            pI = byte_incr( pI, params.step_Ky );
        });

        pI = add_3d_byte( pI, dimsA );

        store_v( pO, Obuff );
        pO += 32;
    }
}

#endif