#ifndef __QDQ_SUM_HPP__
#define __QDQ_SUM_HPP__

#include "qdq_helpers.hpp"

/*! \brief Generates ifm_sum_1xN for an int16 ifm, to be used for actv*actv asymmetric quantization.

    Every loaded Ntile x Ktile tile is transposed (8x8), split into two 32-element halves and each half is
    multiply-accumulated against an all-ones int8 operand with a 4x8x8 int16 x int8 -> acc64 mmul.
    The accumulators are seeded from ofm (partial sums, up-shifted by `shift`) unless `zero_init` is set.

    Compile-time restrictions enforced below: Kgran / Ktile == 1 and Ntile == 8.
    Note that the mmul operand type is hard-coded to int16_t; Ta only types the input buffer.

  @param[in] ifm        input volume (see the layout comment in front of add_3d_ifm)
  @param[out] ofm       output sums; also read back through pT as partial-sum buffer
  @param[in] K_g        Number of accumulation steps (size of the accumulated dimension in terms of Kgran)
  @param[in] N_g        Size for N dimension in terms of granularity Ngran
  @param[in] Y_g        Size for Y dimension; the outer loop runs N_g * Y_g times
  @param[in] zero_init  When set, the accumulators are cleared on the first accumulation step of each output group
  @param[in] shift      Shift used both when loading partial sums and when converting accumulators to Ts
 */
template<typename Ta, typename Ts, unsigned Kgran, unsigned Ngran, unsigned Ktile, unsigned Ntile, unsigned inner_lr_min, unsigned outer_lr_min>
void sum_inner_scale_sum( adf::input_buffer<Ta> &ifm, adf::output_buffer<Ts> &ofm, unsigned K_g, unsigned N_g, unsigned Y_g, bool zero_init, int shift)
{

    //unrolled iterations
    constexpr unsigned Ni = Ngran / Ntile;
    constexpr unsigned Ki = Kgran / Ktile;
    // number of output elements written per store
    constexpr unsigned Vs = 4;
    static_assert( Ki == 1, "Unrolled inner iterations not supported" );

    /*
        dim1 -> Accumulate over K
        Ngran * Kgran, we have CRC8, one inner loop processes for example R8C8, adding Ngran*Kgran should bring us to next R8C8
        size = K_g do it K_g times
        dim2 -> next N to be processed, we want to jump Ngran * Kgran* K_g to get there
        size = N_g iterate over N
        dim3 = Y dimension, Should jump to next channels in conv, that is jump over C16XX8C8, which should be Ngran * Kgran * N_g * K_g
    */
    Add3dElem add_3d_ifm( Ngran * Kgran, K_g, Ngran * Kgran* K_g, N_g, Ngran * Kgran * N_g * K_g );
    auto pI = ifm.data( );
    // pT used to load psums, pO for output
    auto pT = aie::begin_vector<Vs>( ofm );
    auto pO = aie::begin_restrict_vector<Vs>( ofm );
    // NOTE(review): pOdbg is never used in this function
    auto pOdbg = aie::begin_restrict_vector<Vs>( ofm );

    //16-bit x 8-bit = 64-bit	1	4x8	8x8	-> 4x8, is used for macs
    // NOTE(review): sum_outer is never used in this function
    constexpr unsigned sum_outer = 4;
    using sum_mul_t = aie::mmul<4, 8, 8, int16_t, int8_t,acc64>;

    using tdm_t = aie::accum<acc64, Vs>;
    using acc_t = aie::accum<acc64, 32>;
    // two 32-lane accumulators per 8x8 tile (one per transposed half)
    acc_t sum[2*Ni];
    static_assert( Ntile == 8, "This implementation assumes Ntile == 8" );

    //loading of psum along N
    #pragma unroll
    for (unsigned ni=0; ni<2*Ni; ni++) {
        sum[ni] = tdm_t( *pT++, shift ).template grow<32>( );
    }

    //outer loop iterates over output dimensions N_g and Y_g, Validated for N_g
    for ( unsigned o=0; o<N_g*Y_g; o++ )
        chess_prepare_for_pipelining
        chess_loop_range( outer_lr_min, )
    {
        //peeled loop iteration for performance
        //this is the only accumulation step that honours zero_init (via aie::op_zero)
        #pragma unroll
        for (unsigned nic=0; nic<Ni; nic++) {
            unsigned ni = Ni - 1 - nic;
            pI = chess_copy( pI );
            // loading an 8x8 explicitly, transpose it and then process

            // note: this local shadows the ifm function parameter
            auto ifm = load_index<Ntile*Ktile, aie_dm_resource::a>( pI, Ntile * Ktile * ni );

            auto trans_ifm = aie::transpose(ifm, 8, 8);

            auto s0 = sum_mul_t( aie::op_zero( sum[2*ni], zero_init ));
            auto s1 = sum_mul_t( aie::op_zero( sum[2*ni+1], zero_init ));
            // multiplying by an all-ones operand turns the mmul into a summation
            s0.mac( trans_ifm.template extract<32>(0), aie::broadcast<int8_t, 64>( 1 ));
            s1.mac( trans_ifm.template extract<32>(1), aie::broadcast<int8_t, 64>( 1 ));
            sum[2*ni] = s0.to_accum( );
            sum[2*ni+1] = s1.to_accum( );

        }
        pI = add_3d_ifm( pI );

        //Iterate over K_g here / Accumulation dimension
        //starts at 1 because the first step was peeled above
        for ( unsigned i=1; i<K_g; i++ )
            chess_prepare_for_pipelining
            chess_loop_range( inner_lr_min-1, )
            //chess_peel_pipelined_loop( 3 )
        {
            #pragma unroll
            for (unsigned nic=0; nic<Ni; nic++) {
                unsigned ni = Ni - 1 - nic;
                pI = chess_copy( pI );

                auto ifm = load_index<Ntile*Ktile, aie_dm_resource::a>( pI, Ntile * Ktile * ni );

                auto trans_ifm = aie::transpose(ifm, 8, 8);

                auto s0 = sum_mul_t( sum[2*ni]);
                auto s1 = sum_mul_t( sum[2*ni+1]);
                s0.mac( trans_ifm.template extract<32>(0), aie::broadcast<int8_t, 64>( 1 ));
                s1.mac( trans_ifm.template extract<32>(1), aie::broadcast<int8_t, 64>( 1 ));
                sum[2*ni] = s0.to_accum( );
                sum[2*ni+1] = s1.to_accum( );

            }
            pI = add_3d_ifm( pI );
            locate_in_register( sum );
        }

;
        //write back the finished sums and preload the partial sums of the next output group
        #pragma unroll
        for (unsigned nic=0; nic<Ni*2; nic++) {

            unsigned ni = nic;


            auto res_vec = sum[ni].template to_vector<Ts>( shift );
            auto res_vec_ex1 = res_vec.template extract<16>(0);
            auto res_vec_ex2 = res_vec.template extract<16>(1);

            //shuffle 4 input value of the 4x8 accum together
            aie::vector<Ts,16> res_vec_ex = shuffle(res_vec_ex1, res_vec_ex2, T32_4x8_lo);
            //I used this chess_report for validation together with work_aie2p/data/gemm_sumifmt.txt
            //as the testbench does not check that part automated
            //chess_report(res_vec_ex);
            *pO++ = res_vec_ex.template extract<Vs>( 0 );
            sum[ni] = tdm_t( *pT++, shift ).template grow<32>( );
        }
    }
}


/*! \brief Generates ifm_sum_Nx1 using an int16 ifm, to be used for 3 term qdq.

    Each Mtile x Ktile input tile is multiply-accumulated against an all-ones int8 operand with a
    4x8x8 Ta x int8 -> acc64 mmul, which turns the matrix multiply into a summation over K.
    The accumulators are seeded from ofm (partial sums, up-shifted by `shift`) unless `zero_init` is set.

  @param[in] ifm        input volume
  @param[out] ofm       output sums; also read back through pT as partial-sum buffer
  @param[in] M_g        Size for M dimension in terms of granularity Mgran
  @param[in] K_g        Size for K dimension in terms of granularity Kgran
  @param[in] Y_g        Size for Y dimension; the outer loop runs M_g * Y_g times
  @param[in] zero_init  When set, the accumulators are cleared at the start of each output group
  @param[in] shift      Shift used both when loading partial sums and when converting accumulators to Ts
 */
template<typename Ta, typename Ts, unsigned Mgran, unsigned Kgran, unsigned Mtile, unsigned Ktile, unsigned inner_lr_min, unsigned outer_lr_min>
void sum_inner_int16( adf::input_buffer<Ta> &ifm, adf::output_buffer<Ts> &ofm, unsigned M_g, unsigned K_g, unsigned Y_g, bool zero_init, int shift )
{
    // unrolled iterations per granule
    constexpr unsigned Mi = Mgran / Mtile;
    constexpr unsigned Ki = Kgran / Ktile;
    // number of output elements written per store
    constexpr unsigned Vs = sum_write_garbage_stride<Ts>() * Mtile;

    Add3dElem add_3d_ifm( Mgran * Kgran * M_g, K_g, Mgran * Ktile, M_g, Mgran * Kgran * M_g * K_g );
    auto pI = ifm.data( );
    // pT is used to load partial sums, pO to store the results
    auto pT = aie::begin_vector<Vs>( ofm );
    auto pO = aie::begin_restrict_vector<Vs>( ofm );

    using sum_mul_t = aie::mmul<4, 8, 8, Ta, int8_t,acc64>;

    using tdm_t = aie::accum<acc64, Vs>;
    using acc_t = aie::accum<acc64, 32>;
    acc_t sum[Mi];

    // preload the partial sums of the first output group
    #pragma unroll
    for (unsigned mi=0; mi<Mi; mi++) {
        sum[mi] = tdm_t( *pT++, shift ).template grow<32>( );
    }

    for ( unsigned o=0; o<M_g*Y_g; o++ )
        chess_prepare_for_pipelining
        chess_loop_range( outer_lr_min, )
    {

        // peeled first accumulation step: the only one that honours zero_init
        #pragma unroll
        for (unsigned mic=0; mic<Mi; mic++) {
            unsigned mi = Mi - 1 - mic;
            pI = chess_copy( pI );

            #pragma unroll
            for (unsigned ki=0; ki<Ki; ki++) {
                auto ifm = load_index<Mtile*Ktile, aie_dm_resource::a>( pI, Mtile * Ktile * mi + M_g * Mgran * Ktile * ki );
                // Clear only in front of the first K sub-tile; clearing again for ki > 0
                // would throw away the sub-tiles that were already accumulated.
                const bool clear_acc = zero_init && ( ki == 0 );
                auto s = sum_mul_t( aie::op_zero( sum[mi], clear_acc ));
                // multiplying by an all-ones operand turns the mmul into a summation
                s.mac( ifm, aie::broadcast<int8_t, 64>( 1 ));
                sum[mi] = s.to_accum( );

            }
        }
        pI = add_3d_ifm( pI );
        locate_in_register( sum );

        // remaining accumulation steps (starts at 1 because the first step was peeled above)
        for ( unsigned i=1; i<K_g; i++ )
            chess_prepare_for_pipelining
            chess_loop_range( inner_lr_min-1, )
            chess_peel_pipelined_loop( 3 )
        {
            #pragma unroll
            for (unsigned mic=0; mic<Mi; mic++) {
                unsigned mi = Mi - 1 - mic;
                pI = chess_copy( pI );
                #pragma unroll
                for (unsigned ki=0; ki<Ki; ki++) {
                    auto ifm = load_index<Mtile*Ktile, aie_dm_resource::a>( pI, Mtile * Ktile * mi + M_g * Mgran * Ktile * ki );
                    auto s = sum_mul_t( sum[mi] );
                    s.mac( ifm, aie::broadcast<int8_t, 64>( 1 ));
                    sum[mi] = s.to_accum( );
                }
            }
            pI = add_3d_ifm( pI );
        }

        // write back the finished sums and preload the partial sums of the next output group
        #pragma unroll
        for (unsigned mic=0; mic<Mi; mic++) {
            unsigned mi = mic;

            auto res_vec = sum[mi].template to_vector<Ts>( shift );
            auto res_vec_ex1 = res_vec.template extract<16>(0);
            auto res_vec_ex2 = res_vec.template extract<16>(1);

            // gather the values of the 4x8 accumulator so that the leading lanes hold the results
            aie::vector<Ts,16> res_vec_ex = shuffle(res_vec_ex1, res_vec_ex2, T32_4x8_lo);
            // NOTE(review): an unused aie::filter_even( res_vec_ex, 1 ) was computed here before;
            // confirm the unfiltered vector is really the intended output.
            *pO++ = res_vec_ex.template extract<Vs>( 0 );
            sum[mi] = tdm_t( *pT++, shift ).template grow<32>( );
        }
    }
}


/*! \brief Sum across columns in K M Ktile data structure. Kernel implementation is tested and optimized for Ta=int8, Ts=int16, Mgran=16, Kgran=8, Mtile=8, Ktile=8, inner_lr_min=8, outer_lr_min=3
    Compute: ofm^{Mx1} = srs( sum_K( ifm^{MxK} ) + !zero_init * ( ofm^{Mx1} << shift ), shift );

    The summation is implemented as a matrix multiply of an all-ones operand with each input tile.
    When Kgran / Ktile > 1, zero_init clears the accumulator only in front of the first K sub-tile.

  @param[in] ifm        input volume. Data order: C R C8 ( K M K8 )
  @param[out] ofm       output sum (also used as partial sum buffer)
  @param[in] M_g        Size for M dimension in terms of granularity Mgran
  @param[in] K_g        Size for K dimension in terms of granularity Kgran
  @param[in] Y_g        Size for Y dimension to model YCXC8 data order used in convolution kernel. Set to 1 if unused
  @param[in] zero_init  Zero init flag to clear state (beginning of summation for multiple iterations. To be set to 1 for single iteration
  @param[in] shift      Shift factor for output
  @param[in] sign_in    Signedness applied to the input tiles; defaults to the signedness of Ta
  @param[in] transpose  Only honoured when has_transpose is set: selects the alternative address strides and bypasses
                        the tile transpose (presumably for an input that is already stored transposed - TODO confirm)
 */
template<typename Ta, typename Ts, unsigned Mgran, unsigned Kgran, unsigned Mtile, unsigned Ktile, bool has_transpose=0, unsigned inner_lr_min=6, unsigned outer_lr_min=2>
inline __attribute__((always_inline)) void sum_inner( adf::input_buffer<Ta> &ifm, adf::output_buffer<Ts> &ofm, unsigned M_g, unsigned K_g, unsigned Y_g, bool zero_init, int shift, bool sign_in=std::is_signed<Ta>(), bool transpose=0 )
{

    // unrolled iterations per granule
    constexpr unsigned Mi = Mgran / Mtile;
    constexpr unsigned Ki = Kgran / Ktile;
    // number of output elements written per store
    constexpr unsigned Vs = sum_write_garbage_stride<Ts>() * Mtile;

    // Runtime transpose selection is only available when compiled in via has_transpose.
    bool bypass_transpose = has_transpose && transpose;
    // Stride multipliers: the regular layout steps over all M tiles to reach the next K tile,
    // the bypass layout steps over all K tiles to reach the next M tile.
    unsigned M1_g = bypass_transpose ? 1 : M_g * Mi;
    unsigned K2_g = bypass_transpose ? K_g * Ki : 1;
    Add3dElem add_3d_ifm( Kgran * Mtile * M1_g, K_g, Mgran * Ktile * K2_g, M_g, Mgran * Kgran * M_g * K_g );
    auto pI = ifm.data( );
    // pT is used to load partial sums, pO to store the results
    auto pT = aie::begin_vector<Vs>( ofm );
    auto pO = aie::begin_restrict_vector<Vs>( ofm );

    constexpr unsigned sum_outer = __AIE_ARCH__ <= 20 ? 4 : 64 / Mtile;
    using sum_mul_t = aie::mmul<sum_outer, Ktile, Mtile, Ta, Ta>;
    using tdm_t = aie::accum<aie::detail::accum_tag_for_mul_types<Ta, Ta>, Vs>;
    using acc_t = aie::accum<aie::detail::accum_tag_for_mul_types<Ta, Ta>, sum_outer * Mtile>;
    acc_t sum[Mi];

#ifdef __DEBUG__
    if (((get_coreid() >> 16)==0) && ((get_coreid() & 0xF)==4))
        printf("sum: sum_outer:%d, Mi:%d, Ki:%d, Vs:%d, Mgran:%d, Kgran:%d, Mtile:%d, Ktile:%d\n",sum_outer, Mi, Ki, Vs, Mgran, Kgran, Mtile, Ktile);
#endif

    // preload the partial sums of the first output group
    #pragma unroll
    for (unsigned mi=0; mi<Mi; mi++) {
        sum[mi] = tdm_t( *pT++, shift ).template grow<sum_outer * Mtile>( );
    }

    #if __AIE_ARCH__ >= 21
        // operand container for the mac_4x16_16x16T path below; both halves get replaced per tile
        v256int8_sparse sparse_in;
        sparse_in = insert( sparse_in, 0, get_sparse( 5 ));
        sparse_in = insert( sparse_in, 1, get_sparse( 5 ));
    #endif

    for_with_dynamic_pipeline<outer_lr_min>( M_g*Y_g, [&]( unsigned o ) __attribute__(( always_inline ))
    {
        //locate_in_register<2>( sum );
        // peeled first accumulation step: the only one that honours zero_init
        #pragma unroll
        for (unsigned mic=0; mic<Mi; mic++) {
            unsigned mi = Mi - 1 - mic;
            pI = chess_copy( pI );

            #pragma unroll
            for (unsigned ki=0; ki<Ki; ki++) {
                auto ifm = load_index<Mtile*Ktile, aie_dm_resource::a>( pI, Mtile * Ktile * K2_g * mi + M1_g * Mtile * Ktile * ki );
                // Clear only in front of the first K sub-tile; clearing again for ki > 0
                // would throw away the sub-tiles that were already accumulated.
                const bool clear_acc = zero_init && ( ki == 0 );
    #if __AIE_ARCH__ >= 21
                if constexpr( __AIE_ARCH__ >= 21 && Mtile == 16 && Ktile == 8 && !has_transpose ) {
                    sparse_in = insert( sparse_in, 1, ifm.template extract<64>( 1 ));
                    sparse_in = insert( sparse_in, 0, ifm.template extract<64>( 0 ));
                    sum[mi] = mac_4x16_16x16T_conf( aie::broadcast<Ta, sum_outer * Ktile * 2>( 1 ), sign_in, sparse_in, 0, sum[mi], clear_acc, 0, 0, 0 );
                } else {
    #endif
                    auto s = sum_mul_t( aie::op_zero( sum[mi], chess_copy( clear_acc )));
                    decltype( ifm ) ifm2 = aie::transpose( ifm, Mtile, Ktile );
                    if constexpr( has_transpose && sizeof( Ta ) == 1 && Mtile == 8 && Ktile == 8 ) {
                        // a single shuffle whose mode is picked at runtime covers both cases
                        ifm2 = shuffle( ifm, bypass_transpose ? T512_1x2_lo : T8_8x8 );
                    } else if ( bypass_transpose ) {
                        ifm2 = ifm;
                    }
                    // multiplying an all-ones operand with the tile turns the mmul into a summation
                    s.mac( aie::broadcast<Ta, sum_outer * Ktile>( 1 ), aie::op_sign( ifm2, sign_in ));
                    sum[mi] = s.to_accum( );
    #if __AIE_ARCH__ >= 21
                }
    #endif
            }
            //sum[mi] = locate_dm<2>( sum[mi] );
        }
        pI = add_3d_ifm( pI );
        locate_in_register( sum );

        // remaining accumulation steps (starts at 1 because the first step was peeled above)
        for ( unsigned i=1; i<K_g; i++ )
            chess_prepare_for_pipelining
            chess_loop_range( inner_lr_min-1, )
          #if __AIE_ARCH__ >= 20
            chess_peel_pipelined_loop( 3 * (outer_lr_min > 1 ))
          #endif
        {
            #pragma unroll
            for (unsigned mic=0; mic<Mi; mic++) {
                unsigned mi = Mi - 1 - mic;
                pI = chess_copy( pI );
                #pragma unroll
                for (unsigned ki=0; ki<Ki; ki++) {
                    auto ifm = load_index<Mtile*Ktile, aie_dm_resource::a>( pI, Mtile * Ktile * K2_g * mi + M1_g * Mtile * Ktile * ki );
    #if __AIE_ARCH__ >= 21
                    if constexpr( __AIE_ARCH__ >= 21 && Mtile == 16 && Ktile == 8 && !has_transpose ) {
                        sparse_in = insert( sparse_in, 1, ifm.template extract<64>( 1 ));
                        sparse_in = insert( sparse_in, 0, ifm.template extract<64>( 0 ));
                        sum[mi] = mac_4x16_16x16T( aie::broadcast<Ta, sum_outer * Ktile * 2>( 1 ), sign_in, sparse_in, 0, sum[mi] );
                    } else {
    #endif
                        auto s = sum_mul_t( sum[mi] );
                        decltype( ifm ) ifm2 = aie::transpose( ifm, Mtile, Ktile );
                        if constexpr( has_transpose && sizeof( Ta ) == 1 && Mtile == 8 && Ktile == 8 ) {
                            ifm2 = shuffle( ifm, bypass_transpose ? T512_1x2_lo : T8_8x8 );
                        } else if ( bypass_transpose ) {
                            ifm2 = ifm;
                        }
                        s.mac( aie::broadcast<Ta, sum_outer * Ktile>( 1 ), aie::op_sign( ifm2, sign_in ));
                        sum[mi] = s.to_accum( );
    #if __AIE_ARCH__ >= 21
                    }
    #endif
                }
            }
            pI = add_3d_ifm( pI );
        }

        // write back the finished sums (forward tile order) and preload the next partial sums
        #pragma unroll
        for (unsigned mic=0; mic<Mi; mic++) {
            unsigned mi = mic;
            if constexpr( std::is_same_v<Ts, int32> && Mtile == 8 && __AIE_ARCH__ >= 21 ) {
                // int32 fast path: moves the accumulator contents with intrinsics.
                // NOTE(review): this path does not reference `shift` - confirm callers pass shift == 0.
                v64acc32 acc = sum[mi];
                *pO++ = extract_v8int32(( v16int32 )extract_v16acc32( acc, 0 ), 0 );
                v8int32 tdm = *pT++;
                sum[mi] = set_v64acc32( 0, ( v16acc32 )set_v16int32( 0, tdm ));
            } else {
                auto acc = sum[mi];
                *pO++ = acc.template extract<Vs>( 0 ).template to_vector<Ts>( shift );
                sum[mi] = tdm_t( *pT++, shift ).template grow<sum_outer * Mtile>( );
            }
        }
    });
}

/*! \brief Sum across columns in K M Ktile data structure. Kernel implementation is tested and optimized for Ta=int8, Ts=int16, Mgran=16, Kgran=8, Mtile=8, Ktile=8, inner_lr_min=8, outer_lr_min=3
    Compute: ofm^{Mx1} = srs( sum_K( ifm^{MxK} ) + !zero_init * ( ofm^{Mx1} << shift ) + casc_en * casc_in, shift );

    Cascade-input variant: the locally accumulated sum is added to a value fetched with get_cascade()
    before it is written to ofm.

  @param[in] ifm        input volume. Data order: C R C8 ( K M K8 )
  @param[in] casc_in    cascade input from previous core in cascade chain (not referenced by name in the body;
                        the cascade value is obtained through get_cascade())
  @param[out] ofm       output sum (also used as partial sum buffer)
  @param[in] M_g        Size for M dimension in terms of granularity Mgran
  @param[in] K_g        Size for K dimension in terms of granularity Kgran
  @param[in] Y_g        Size for Y dimension to model YCXC8 data order used in convolution kernel. Set to 1 if unused
  @param[in] zero_init  Zero init flag to clear state (beginning of summation for multiple iterations). To be set to 1 for single iteration
  @param[in] casc_en    Dynamic enable of cascade input (1 to read from cascade, 0 to omit)
  @param[in] shift      Shift factor for output
 */
template<typename Ta, typename Ts, typename Tcasc, unsigned Mgran, unsigned Kgran, unsigned Mtile, unsigned Ktile, unsigned inner_lr_min, unsigned outer_lr_min>
void sum_inner( adf::input_buffer<Ta> &ifm, input_cascade<Tcasc> &casc_in, adf::output_buffer<Ts> &ofm, unsigned M_g, unsigned K_g, unsigned Y_g, bool zero_init, bool casc_en, int shift )
{
    // unrolled iterations per granule
    constexpr unsigned Mi = Mgran / Mtile;
    constexpr unsigned Ki = Kgran / Ktile;
    // number of output elements accessed per load/store on ofm
    constexpr unsigned Vs = sum_write_garbage_stride<Ts>() * Mtile;

    // increments are expressed in units of Mtile*Ktile vectors because pI is a vector iterator
    Add3dPtr add_3d_ifm( Mi * Ki * M_g, K_g, Mi, M_g, Mi * Ki * M_g * K_g );
    auto pI = aie::begin_vector<Mtile*Ktile>( ifm );
    // pT is used to load partial sums, pO to store the results
    auto pT = aie::begin_vector<Vs>( ofm );
    auto pO = aie::begin_restrict_vector<Vs>( ofm );

    constexpr unsigned sum_outer = __AIE_ARCH__ <= 20 ? 4 : 8;
    using sum_mul_t = aie::mmul<sum_outer, Ktile, Mtile, Ta, Ta>;
    using acc_t = aie::accum<aie::detail::accum_tag_for_mul_types<Ta, Ta>, sum_mul_t::size_C>;

    for ( unsigned o=0; o<M_g*Y_g; o++ )
        chess_prepare_for_pipelining
        chess_loop_range( outer_lr_min, )
    {
        sum_mul_t sum[Mi];

        // peeled first accumulation step: seed each accumulator from the partial sum in ofm
        // (cleared when zero_init is set), then add the first K tile(s)
        #pragma unroll
        for (unsigned mic=0; mic<Mi; mic++) {
            unsigned mi = Mi - 1 - mic;
            sum[mi] = sum_mul_t( aie::op_zero( acc_t(( *pT++ ).template grow<sum_mul_t::size_C>( ), shift ), zero_init ));
            #pragma unroll
            for (unsigned ki=0; ki<Ki; ki++) {
                // multiplying an all-ones operand with the transposed tile turns the mmul into a summation
                sum[mi].mac( aie::broadcast<Ta,sum_outer*Ktile>( 1 ), aie::transpose( pI[mi + M_g * Mi * ki], Mtile, Ktile ));
            }
        }
        pI = add_3d_ifm( pI );

        // remaining accumulation steps (starts at 1 because the first step was peeled above)
        for ( unsigned i=1; i<K_g; i++ )
            chess_prepare_for_pipelining
            chess_peel_pipelined_loop( 1 )
            chess_loop_range( inner_lr_min-1, )
        {
            #pragma unroll
            for (unsigned mic=0; mic<Mi; mic++) {
                unsigned mi = Mi - 1 - mic;
                #pragma unroll
                for (unsigned ki=0; ki<Ki; ki++) {
                    sum[mi].mac( aie::broadcast<Ta,sum_outer*Ktile>( 1 ), aie::transpose( pI[mi + M_g * Mi * ki], Mtile, Ktile ));
                }
            }
            pI = add_3d_ifm( pI );
        }

        // add the cascade contribution and store the shifted result
        // NOTE(review): partial sums are loaded and results stored in reversed tile order (mi = Mi-1 .. 0),
        // whereas the buffer-only sum_inner overload stores in forward order - confirm the ordering for Mi > 1.
        #pragma unroll
        for (unsigned mic=0; mic<Mi; mic++) {
            unsigned mi = Mi - 1 - mic;
            auto acc = aie::add( sum[mi].to_accum( ), get_cascade<aie::accum<Tcasc,std::max( 16u, Mtile )>>( casc_en ).template grow<sum_mul_t::size_C>( ));
            *pO++ = acc.template to_vector<Ts>( shift ).template extract<Vs>( 0 );
        }
    }
}

/*! \brief Sum across columns in K M Ktile data structure. Kernel implementation is tested and optimized for Ta=int8, Ts=int16, Mgran=16, Kgran=8, Mtile=8, Ktile=8, inner_lr_min=8, outer_lr_min=3
    Compute: casc_out = sum_K( ifm^{MxK} ) + casc_en * casc_in;

    Cascade-to-cascade variant: no partial-sum buffer is involved, the result is forwarded with writeincr().

  @param[in] ifm        input volume. Data order: C R C8 ( K M K8 )
  @param[in] casc_in    cascade input from previous core in cascade chain (not referenced by name in the body;
                        the cascade value is obtained through get_cascade())
  @param[out] casc_out  output to next core in cascade chain
  @param[in] M_g        Size for M dimension in terms of granularity Mgran
  @param[in] K_g        Size for K dimension in terms of granularity Kgran
  @param[in] Y_g        Size for Y dimension to model YCXC8 data order used in convolution kernel. Set to 1 if unused
  @param[in] zero_init  Zero init flag. NOTE(review): not referenced in the body; every output group starts from a
                        freshly constructed accumulator regardless of this flag
  @param[in] casc_en    Dynamic enable of cascade input (1 to read from cascade, 0 to omit)
 */
template<typename Ta, typename Tcasc, unsigned Mgran, unsigned Kgran, unsigned Mtile, unsigned Ktile, unsigned inner_lr_min, unsigned outer_lr_min>
void sum_inner( adf::input_buffer<Ta> &ifm, input_cascade<Tcasc> &casc_in, output_cascade<Tcasc> &casc_out, unsigned M_g, unsigned K_g, unsigned Y_g, bool zero_init, bool casc_en )
{
    // unrolled iterations per granule
    constexpr unsigned Mi = Mgran / Mtile;
    constexpr unsigned Ki = Kgran / Ktile;

    // increments are expressed in units of Mtile*Ktile vectors because pI is a vector iterator
    Add3dPtr add_3d_ifm( Mi * Ki * M_g, K_g, Mi, M_g, Mi * Ki * M_g * K_g );
    auto pI = aie::begin_vector<Mtile*Ktile>( ifm );

    constexpr unsigned sum_outer = __AIE_ARCH__ <= 20 ? 4 : 8;
    using sum_mul_t = aie::mmul<sum_outer, Ktile, Mtile, Ta, Ta>;

    for ( unsigned o=0; o<M_g*Y_g; o++ )
        chess_prepare_for_pipelining
        chess_loop_range( outer_lr_min, )
    {
        // assumes a default-constructed aie::mmul behaves as a zero accumulator on its first mac() - TODO confirm
        sum_mul_t sum[Mi];

        // accumulate over the whole K dimension (no peeled first step in this variant)
        for ( unsigned i=0; i<K_g; i++ )
            chess_prepare_for_pipelining
            chess_peel_pipelined_loop( 1 )
            chess_loop_range( inner_lr_min, )
        {
            #pragma unroll
            for (unsigned mic=0; mic<Mi; mic++) {
                unsigned mi = Mi - 1 - mic;
                #pragma unroll
                for (unsigned ki=0; ki<Ki; ki++) {
                    // multiplying an all-ones operand with the transposed tile turns the mmul into a summation
                    sum[mi].mac( aie::broadcast<Ta,sum_outer*Ktile>( 1 ), aie::transpose( pI[mi + M_g * Mi * ki], Mtile, Ktile ));
                }
            }
            pI = add_3d_ifm( pI );
        }

        // add the incoming cascade value and forward the result, tiles in reversed order (mi = Mi-1 .. 0)
        #pragma unroll
        for (unsigned mic=0; mic<Mi; mic++) {
            unsigned mi = Mi - 1 - mic;
            auto acc = aie::add( sum[mi].to_accum( ), get_cascade<aie::accum<Tcasc,std::max( 16u, Mtile )>>( casc_en ).template grow<sum_mul_t::size_C>( ));
            writeincr( &casc_out, acc.template extract<std::max( 16u, Mtile )>( 0 ));
        }
    }
}

/*! \brief Convenience wrapper: computes the activation sum of a uint8 matrix with sum_inner.

    Wraps the raw pointers into adf buffers and calls sum_inner with Mgran=16, Kgran=8, Mtile=8, Ktile=8,
    shift 0 and unsigned input. Declared inline because it is defined in a header.

  @param[in] matA          uint8 input matrix holding M*K elements
  @param[out] ifm_sum_addr int32 output buffer; sized for K elements when transpose is set, M otherwise
  @param[in] M             M dimension in elements (divided by Mgran to obtain M_g)
  @param[in] K             K dimension in elements (divided by Kgran to obtain K_g)
  @param[in] zero_init     Forwarded to sum_inner
  @param[in] transpose     Forwarded to sum_inner
 */
inline void compute_act_sum(void* matA, void* ifm_sum_addr, uint32_t M = 64, uint32_t K = 128, bool zero_init = 1, bool transpose = 0){
    constexpr unsigned Mgran = 16;
    constexpr unsigned Kgran = 8;
    constexpr unsigned Mtile = 8;
    constexpr unsigned Ktile = 8;
    // bool, so that it matches the type of the sum_inner template parameter
    constexpr bool has_transpose = true;
    constexpr unsigned inner_lr_min = 8;
    constexpr unsigned outer_lr_min = 2;

    const uint32_t ofm_len = transpose ? K : M;
    auto ifm = adf::input_buffer<uint8> ({static_cast<uint8*>(matA), M*K, 0, M*K});
    auto ofm = adf::output_buffer<int32>({static_cast<int32*>(ifm_sum_addr), ofm_len*sizeof(int32_t), 0, ofm_len*sizeof(int32_t)});

    // integer division: M and K are expected to be multiples of the granularities
    const unsigned M_g = M / Mgran;
    const unsigned K_g = K / Kgran;
    const unsigned Y_g = 1;
    sum_inner<uint8, int32, Mgran, Kgran, Mtile, Ktile, has_transpose, inner_lr_min, outer_lr_min>( ifm, ofm, M_g, K_g, Y_g, zero_init, 0, std::is_signed<uint8>(), transpose);
}

/*! \brief Convenience wrapper: computes the activation sum of a uint16 matrix with sum_inner_int16.

    Wraps the raw pointers into adf buffers and calls sum_inner_int16 with Mgran=16, Kgran=8, Mtile=4, Ktile=8
    and shift 0. Declared inline because it is defined in a header.

  @param[in] matA          uint16 input matrix holding M*K elements
  @param[out] ifm_sum_addr int32 output buffer; sized for K elements when transpose is set, M otherwise
  @param[in] M             M dimension in elements (divided by Mgran to obtain M_g)
  @param[in] K             K dimension in elements (divided by Kgran to obtain K_g)
  @param[in] zero_init     Forwarded to sum_inner_int16
  @param[in] transpose     Only selects the size of the output buffer wrapper.
                           NOTE(review): it is not forwarded to the kernel - confirm this is intended.
 */
inline void compute_act_sum_int16(void* matA, void* ifm_sum_addr, uint32_t M = 64, uint32_t K = 128, bool zero_init = 1, bool transpose = 0){
    constexpr unsigned Mgran = 16;
    constexpr unsigned Kgran = 8;
    constexpr unsigned Mtile = 4;
    constexpr unsigned Ktile = 8;
    constexpr unsigned inner_lr_min = 8;
    constexpr unsigned outer_lr_min = 2;

    const uint32_t ofm_len = transpose ? K : M;
    auto ifm = adf::input_buffer<uint16> ({static_cast<uint16*>(matA), (M*K*sizeof(uint16)), 0, (M*K*sizeof(uint16)) });
    auto ofm = adf::output_buffer<int32>({static_cast<int32*>(ifm_sum_addr), ofm_len*sizeof(int32_t), 0, ofm_len*sizeof(int32_t)});

    // integer division: M and K are expected to be multiples of the granularities
    const unsigned M_g = M / Mgran;
    const unsigned K_g = K / Kgran;
    const unsigned Y_g = 1;
    sum_inner_int16<uint16, int32, Mgran, Kgran, Mtile, Ktile, inner_lr_min, outer_lr_min>( ifm, ofm, M_g, K_g, Y_g, zero_init, 0);
}
#endif

