#ifndef __MAXPOOL_INT8X8_IMPL_HPP__
#define __MAXPOOL_INT8X8_IMPL_HPP__

#include "aie_api/aie.hpp"
#include "aie_api/utils.hpp"
#include "api_loop_pipe_helper.hpp"
#include "maxpool_int8x8.hpp"

using namespace aie;

inline void maxpool_int8x8( int * ifm, int * __restrict ofm, const MaxpoolInt8x8Params &params )
{
    int8 * pI = ( int8 * ) ifm;
    int8 * pO = ( int8 * ) ofm;
    const MaxpoolInt8x8Params::Control ctrl = params.ctrl;

    dims_3d_t dimsA = params.dimsA.instantiate( );

    for ( unsigned j = 0; j < params.outer_loop; j++ )
#ifdef PROC_OPT
        chess_no_hw_loop
#else
        chess_prepare_for_pipelining
#endif
        chess_loop_range( 2, )
    {
        vector<int8, 64> Xbuff0, Xbuff1, Xbuff2;
        vector<int8, 64> interleave_0, interleave_1;
        vector<int8, 64> shift_buff_0, shift_buff_1;
        vector<int8, 64> Obuff;

        Obuff = broadcast<int8, 64>( params.min_value );

        aie::pipelined_loop<2>( params.inner_loop, [&]( auto j ) __aie_inline {
            Xbuff0 = load_v<64>( pI    );
            Xbuff1 = load_v<64>( pI+64 );
            Xbuff2 = load_v<64>( pI+128 );

            interleave_0 = shuffle( Xbuff0, Xbuff1, params.shfl_0 );
            interleave_1 = shuffle( Xbuff0, Xbuff1, params.shfl_1 );

            shift_buff_0 = shuffle_down_fill( interleave_1, Xbuff1, params.shft_0 );
            shift_buff_1 = shuffle_down_fill( interleave_0, Xbuff2, params.shft_1 );
            shift_buff_1 = shuffle_down_fill( shift_buff_1, Xbuff1, params.shft_2 );

            Obuff = max( Obuff, interleave_0, ctrl.sign );
            Obuff = max( Obuff, shift_buff_0, ctrl.sign );
            Obuff = max( Obuff, shift_buff_1, ctrl.sign );

            pI = byte_incr( pI, params.step_Ky );
        });

        pI = add_3d_byte( pI, dimsA );

        store_v( pO, Obuff );
        pO += 64;
    }
}

#endif
