/*  (c) Copyright 2019 - 2022 Xilinx, Inc. All rights reserved.

    This file contains confidential and proprietary information
    of Xilinx, Inc. and is protected under U.S. and
    international copyright and other intellectual property
    laws.

    DISCLAIMER
    This disclaimer is not a license and does not grant any
    rights to the materials distributed herewith. Except as
    otherwise provided in a valid license issued to you by
    Xilinx, and to the maximum extent permitted by applicable
    law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
    WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
    AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
    BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
    INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
    (2) Xilinx shall not be liable (whether in contract or tort,
    including negligence, or under any other theory of
    liability) for any loss or damage of any kind or nature
    related to, arising under or in connection with these
    materials, including for any direct, or any indirect,
    special, incidental, or consequential loss or damage
    (including loss of data, profits, goodwill, or any type of
    loss or damage suffered as a result of any action brought
    by a third party) even if such damage or loss was
    reasonably foreseeable or Xilinx had been advised of the
    possibility of the same.

    CRITICAL APPLICATIONS
    Xilinx products are not designed or intended to be fail-
    safe, or for use in any application requiring fail-safe
    performance, such as life-support or safety devices or
    systems, Class III medical devices, nuclear facilities,
    applications related to the deployment of airbags, or any
    other applications that could lead to death, personal
    injury, or severe property or environmental damage
    (individually and collectively, "Critical
    Applications"). Customer assumes the sole risk and
    liability of any use of Xilinx products in Critical
    Applications, subject only to applicable laws and
    regulations governing limitations on product liability.

    THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
    PART OF THIS FILE AT ALL TIMES.                       */

#ifndef __ACTIVATED_MMULT_QDQ_INT16X4_IMPL_HPP__
#define __ACTIVATED_MMULT_QDQ_INT16X4_IMPL_HPP__

#include "aie_api/aie.hpp"
#include "aie_api/utils.hpp"
#include "assert.h"
#include "common.hh"
#include "access_helpers.hpp"
// #include "kernel_helpers.h"
#ifdef DEBUG_KERNEL
#include "stdio.h"
#endif

// Converts a v128int4 weight vector into a v128int8 vector by forwarding
// both arguments to unpack(); `sign` is passed through unchanged.
inline v128int8 unpack_wght(v128int4 v, uint1_t sign)
{
    const v128int8 widened = unpack(v, sign);
    return widened;
}
// Overload for weights that are already v128int8: the vector is returned
// as-is and `sign` is ignored.
inline v128int8 unpack_wght(v128int8 v, uint1_t sign)
{
    return v;
}


/*
 * Matrix-multiply kernel with a float quantize/dequantize (QDQ) output stage.
 *
 * What the code below does, in order of data flow:
 *   - Weights are read as v128int4 vectors (from bufB or from a stream),
 *     unpacked to int8, collected granM (32) at a time with insert_staging()
 *     and committed with staging_to_matrix_m64x64int8().
 *   - Each 128-byte activation vector is split into two 64-lane halves and
 *     de-interleaved into a0 / a1 (presumably the low and high byte of each
 *     int16 activation - the 256.0f recombination factor below is consistent
 *     with that). a0 feeds acc0 as unsigned, a1 feeds acc1 with sign_A.
 *   - Optionally (has_actv_sum) a running sum of the activation bytes is kept
 *     in accumulator slot 16, which is temporarily swapped out to spill_buf.
 *   - On output, acc0 + 256 * acc1 is scaled by c2, offset by c0, optionally
 *     extended with c1 * activation_sum, converted to int16 and written to
 *     bufO.
 *
 * Template parameters:
 *   has_actv_sum  non-zero enables the activation-sum (c1) path.
 *   vector_coeff  >= 1 gives c1 64 entries, >= 2 gives c2 64 entries;
 *                 otherwise they hold a single entry.
 *   T_cf          coefficient type; write_output only emits data when this
 *                 is float.
 *   Ba/Bb/Bc/Bo   buffer types providing acquire(), release() and data().
 *
 * Parameters:
 *   bufA          activation input buffer.
 *   bufB          weight buffer; the runtime parameter struct is located
 *                 wgt_size + coeff_size bytes past its data pointer.
 *   bufC          coefficient buffer (c0, then c1, then c2).
 *   bufO          output buffer (int16 results).
 *   spill_buf     scratch memory addressed as v32int32 slots 0..7:
 *                 [0,1] sum low, [2,3] sum high, [4,5] gemm low, [6,7] gemm
 *                 high. The converted float sum later overwrites slot 6.
 *   actv_buf      base address used for pAI; only referenced when the
 *                 activations are streamed.
 *   cf_cache      storage that is reinterpreted as coeff_cache_type.
 *   mmult_params  static parameters: inner_loop, inner_time_iters,
 *                 outer_time_iters, dimsA, dimsB, dimsQ, wgt_size, coeff_size.
 */
template<unsigned has_actv_sum, unsigned vector_coeff, typename T_cf, class Ba, class Bb, class Bc, class Bo>
void activated_mmult_qdq_int16x4
(
        Ba & restrict bufA,
        Bb & restrict bufB,
        Bc & restrict bufC,
        Bo & restrict bufO,
        int8 * spill_buf,
        int8 * actv_buf,
        int8 * cf_cache,
        const MMultQdQInt16x4Params &mmult_params
) {

    // Element types are fixed here instead of being derived from the buffers.
    using Ta = int16; //buffer_element_t<Ba>;
    using Tb = int8; //buffer_element_t<Bb>;
    using To = int16; //buffer_element_t<Bo>;
    using Tc = float; //buffer_element_t<Bc>;

    // Coefficient cache geometry: c0 always has 64 entries; c1 / c2 have 64
    // entries only when vector_coeff selects a per-column coefficient.
    using T_c0 = get_next_type_t<T_cf>;
    constexpr unsigned V_qdq = ( std::is_same_v<T_cf, float> ) ? 32 : 16;
    constexpr unsigned N_c0 = 64;
    constexpr unsigned N_c1 = vector_coeff >= 1 ? 64 : 1;
    constexpr unsigned N_c2 = vector_coeff >= 2 ? 64 : 1;
    constexpr unsigned V_c1 = std::min( V_qdq, N_c1 );
    constexpr unsigned V_c2 = std::min( V_qdq, N_c2 );

    // Number of vectors handled per block (weight rows staged / activation
    // vectors multiplied / output rows written).
    constexpr unsigned granM = 32;
    // NOTE(review): these flags test the hard-coded aliases Ta/Tb/To rather
    // than Ba/Bb/Bo, so they are presumably always false - confirm against
    // the definition of is_stream_type_v. stream_ofm is not used below.
    constexpr bool stream_ifm = is_stream_type_v<Ta>;
    constexpr bool stream_wgt = is_stream_type_v<Tb>;
    constexpr bool stream_ofm = is_stream_type_v<To>;

    constexpr bool stream_kernel = stream_ifm && stream_wgt;
    constexpr bool stream_ifm_actsum = stream_ifm && has_actv_sum;

    // il_lr selects the schedule at the bottom of this function: <= 1 takes
    // the simple non-pipelined branch, otherwise the pipelined branch is used.
    #ifdef DEBUG_KERNEL
    constexpr unsigned il_lr = 1;
    constexpr unsigned il_peel = 0;
    #else
    constexpr unsigned il_lr = 4;
    constexpr unsigned il_peel = 2;
    #endif

    // Not referenced anywhere else in this function.
    constexpr unsigned ol_lr = 1;

    // const MMultQdQInt16x4Params::Control ctrl = mmult_params.ctrl;
    // Runtime parameters (sign_W, sign_A, sign_O, shift_res); the pointer is
    // assigned in weight_acquire().
    MMultQdQInt16x8_RT_Params* mmult_rt_params;

    int il_bound = mmult_params.inner_loop;
    int ol_bound = mmult_params.inner_time_iters;

    // With both inputs streamed the two loop levels are fused into one.
    if constexpr( stream_kernel ) {
        il_bound *= ol_bound;
        ol_bound = 1;
    }

    unsigned sum_bound = il_bound;

    int iters = mmult_params.outer_time_iters;

    // keep_sum_iters is fixed at 1, so keep_sum_cnt is reset to 0 after every
    // outer iteration and the activation sum is recomputed each time.
    unsigned keep_sum_iters = 1;
    unsigned keep_sum_cnt = 0;

    // 1 = the next multiply starts a fresh accumulation (mul instead of mac).
    int zero_acc = 1;
    int zero_sum = 1;

    // Working pointers. pA / pB / pC / pO are set by the *_acquire lambdas;
    // pAs is the read cursor of the activation-sum path; pATdm is the write
    // cursor used to keep a copy of streamed activations.
    int8 * pAI = actv_buf;
    Ta * pA;
    int8 * pAs;
    int8 * pATdm;
    int8 * pB;
    Tc * pC;
    To * pO;

    if constexpr( stream_ifm ){
        pAs = pAI;
        pATdm = pAI;
    }

    // Address-stepping descriptors. dimsQ is instantiated but not used again
    // in this function; dimsAs is a separate copy for the sum read cursor.
    dims_3d_t dimsA = mmult_params.dimsA.instantiate();
    dims_3d_t dimsB = mmult_params.dimsB.instantiate();
    dims_2d_t dimsQ = mmult_params.dimsQ.instantiate_step();
    dims_3d_t dimsAs = dimsA;

    // Layout imposed on cf_cache: c0 (bias term), c1 (activation-sum factor),
    // c2 (scale) and c2_shift (c2[0] * 256, used for the high-byte product).
    struct coeff_cache_type {
        alignas( 128 ) T_c0 c0[N_c0];
        T_cf c1[N_c1];
        T_cf c2[N_c2];
        T_cf c2_shift[1];
    };

    coeff_cache_type *coeff_cache_ptr = (coeff_cache_type*) cf_cache;
    coeff_cache_type &coeff_cache = *coeff_cache_ptr;

    // Reads two words from the weight stream, each reinterpreted as 32 floats.
    // vec >= 1: both words are stored consecutively at ptr (64 floats).
    // vec == 0: only element 0 of each word is written to ptr[0], so the
    //           value from the second word is the one that remains.
    auto store_stream_coeff = [&]( int8* ptr, unsigned vec) __attribute__(( always_inline )) {
        #pragma unroll
        for ( unsigned l = 0; l < 2; l++ ) {
            aie::vector<float,32> load_stream =  (v32float) get_ss_v128int8_weight( 1 );
            if ( vec >= 1 ){
                *( (chess_protect_access v32float*) ptr+l) = load_stream;
            } else{
                *( (chess_protect_access float*) ptr) = extract_elem(extract_v16float(load_stream,0), 0);
            }
        }
    };

    // Fills coeff_cache either from the coefficient buffer (pC) or, when the
    // weights are streamed, from the weight stream, then derives c2_shift.
    auto coeff_fetch = [&]( ) __attribute__(( always_inline )) {
        if(!stream_wgt)
        {
            // Buffer layout: 64 c0 entries, then c1 (present only when
            // has_actv_sum is set), then c2.
            T_c0 * c0 = ( T_c0* ) pC;
            T_cf * c1 = ( T_cf* )( c0 + 64);
            T_cf * c2 = c1 + 64 * ( has_actv_sum > 0 );
            vector_copy( coeff_cache.c0, c0, N_c0 );
            if constexpr( has_actv_sum > 0)
                vector_copy( coeff_cache.c1, c1, N_c1 );
            vector_copy( coeff_cache.c2, c2, N_c2 );

        } else {
            store_stream_coeff((int8*)coeff_cache.c0, 1);
            if(has_actv_sum){
                store_stream_coeff((int8*)coeff_cache.c1, vector_coeff >= 1);
            }
            store_stream_coeff((int8*)coeff_cache.c2, vector_coeff >= 2);
        }
        // Only the first c2 entry is scaled; write_output reads c2_shift
        // solely on the vector_coeff <= 1 path.
        *( (T_cf*) coeff_cache.c2_shift) = aie::mul( *((T_cf*) coeff_cache.c2), 256.0f);
    };

    // The two de-interleaved halves of the current activation vector.
    aie::vector<int8,64> a0, a1;

    //Declaring pointers for the used accum spill buffers
    // The cast binds before the addition, so the offsets are in units of
    // v32int32. sum_ifm_qdq aliases gemm_tdm_h.
    v32int32* sum_tdm_ifm_l = (v32int32*) spill_buf;
    v32int32* sum_tdm_ifm_h = (v32int32*) spill_buf + 2;
    v32int32* gemm_tdm_l = (v32int32*) spill_buf + 2*2;
    v32int32* gemm_tdm_h = (v32int32*) spill_buf + 2*3;
    v32float* sum_ifm_qdq = (v32float*) gemm_tdm_h;


    // Accumulator banks: acc0 collects the a0 products, acc1 the a1 products.
    m32x64acc32 chess_storage( em0 ) acc0 = chess_dont_care( m32x64acc32 );
    m32x64acc32 chess_storage( em1 ) acc1 = chess_dont_care( m32x64acc32 );

    // ib: staging slot index; im0 / im1: accumulator row indices.
    // uint5_t presumably wraps modulo 32, matching granM - TODO confirm.
    // io is not used below.
    uint5_t ib = 0;
    uint5_t im0 = 0;
    uint5_t im1 = 0;
    uint5_t io = 0;



    // Acquires the weight and coefficient buffers and locates the runtime
    // parameter struct wgt_size + coeff_size bytes past the weight data.
    auto weight_acquire = [&]( ) __attribute__(( always_inline )) {
        bufB.acquire( );
        pB = (int8*) bufB.data();
        bufC.acquire( );
        pC = (Tc*) bufC.data();
        mmult_rt_params = (MMultQdQInt16x8_RT_Params*)byte_incr(pB, mmult_params.wgt_size + mmult_params.coeff_size);
    };

    // Releases both the weight and the coefficient buffer.
    auto weight_release = [&]( ) __attribute__(( always_inline )) {
        bufB.release( );
        bufC.release( );
    };

    // Acquires the activation buffer; the sum read cursor is reset to its
    // start unless the activations are streamed.
    auto ifm_acquire = [&]( ) __attribute__(( always_inline )) {
        bufA.acquire( );
        pA = (Ta*) bufA.data();
        if constexpr( !stream_ifm ) {
            pAs = (int8*)bufA.data();
        }
    };

    auto ifm_release = [&]( ) __attribute__(( always_inline )) {
        bufA.release( );
    };

    // Acquires the output buffer and resets the output write pointer.
    auto ofm_acquire = [&]( ) __attribute__(( always_inline )) {
        bufO.acquire( );
        pO = (To*) bufO.data( );
    };

    auto ofm_release = [&]( ) __attribute__(( always_inline )) {
        bufO.release( );
    };

    // Loads one v128int4 weight vector (64 bytes) from memory or the stream,
    // unpacks it to int8 and places it in staging slot ib.
    // NOTE(review): the meaning of the third insert_staging argument
    // (2 + sign) is not visible in this file - confirm with its declaration.
    auto weight_fetch = [&]( ) __attribute__(( always_inline )) {
        aie::vector<int8,128> weight_v;
        if constexpr( !stream_wgt )
            weight_v = unpack_wght( *( v128int4 __aie_dm_resource_a * ) pB, mmult_rt_params->sign_W );
        else
            weight_v = unpack_wght( get_ss_v128int4( 1 ), mmult_rt_params->sign_W );
        insert_staging( weight_v, ib++, 2 + mmult_rt_params->sign_W );   
        pB = add_byte( pB, 64*sizeof(int8) );
        ib = chess_copy( ib );
    };

    // Commits the staged weights as the active matrix and applies the 3D
    // address step to the weight pointer.
    auto compute_prepare = [&]( ) __attribute__(( always_inline )) {
        staging_to_matrix_m64x64int8( );
        pB = add_3d_byte( pB, dimsB );
    };

    // Multiplies one 128-byte activation vector against the active matrix.
    //   fix_reg      pin the operands to specific vector registers.
    //   pass_stream  (streamed activations only) read the copy kept at pAs
    //                instead of consuming a new word from the stream.
    auto compute_execute = [&]( bool fix_reg = 1, bool pass_stream = 0 ) __attribute__(( always_inline )) {
        aie::vector<int8, 128>  a;
        if constexpr(stream_ifm){
            if(!pass_stream ){
                //This degrades performance for A, or will cause a crash when run as AW
                //a = read_v<128>( pA );
                a = get_ss_v128int8(0);
                // Keep a copy so the activation-sum path can re-read it.
                if(has_actv_sum){
                    write_v(pATdm, a);
                    pATdm+=128;
                }
            } else {
                a = read_v<128>( pAs );pAs+=128;
            }
        } else {
            a = read_v<128>( (int8 *) pA );
        }

        if ( fix_reg && has_actv_sum ) a = locate_in_register<7>( a );
        // Split into two 64-lane halves, then de-interleave them into a0 / a1.
        std::tie( a0, a1 ) = a.template split<64>( );
        std::tie( a0, a1 ) = aie::interleave_unzip( a0, a1, 1 );

        if ( fix_reg ) {
            a0 = locate_in_register<6>( a0 );
            a1 = locate_in_register<7>( a1 );
        }

        // a0 is always treated as unsigned; a1 carries the activation sign.
        if ( chess_manifest( zero_acc == 1 )) {
            acc0[im0] = mul( a0, false,       mmult_rt_params->sign_W );
            acc1[im1] = mul( a1, mmult_rt_params->sign_A, mmult_rt_params->sign_W );
        } else {
            acc0[im0] = mac_conf( a0, false,       mmult_rt_params->sign_W, acc0[im0], zero_acc );
            acc1[im1] = mac_conf( a1, mmult_rt_params->sign_A, mmult_rt_params->sign_W, acc1[im1], zero_acc );
        }
    };

    // Advances to the next accumulator row and the next 128 activation bytes.
    auto compute_incr = [&]( ) __attribute__(( always_inline )) {
        im0++; im1++;
        pA = add_byte( pA, 128*sizeof(int8) );
    };

    // Ends one block: later multiplies accumulate, and the activation pointer
    // takes its 3D address step.
    auto compute_finalize = [&]( ) __attribute__(( always_inline )) {
        zero_acc = 0;
        pA = add_3d_byte( pA, dimsA );
        chess_separator();
    };

    // Loads one 128-byte activation vector into staging slot ib for the
    // activation-sum computation. Note that pass_stream defaults to 1 here
    // (re-read the stored copy), the opposite of compute_execute.
    auto sum_fetch = [&]( bool pass_stream = 1 ) __attribute__(( always_inline )) {
        aie::vector<int8, 128> a;
        if constexpr(stream_ifm){
            if(!pass_stream){
                //This degrades performance for A, or will cause a crash when run as AW
                //a = read_v<128>( pA );
                a = get_ss_v128int8(0);
                insert_staging( a, ib++, 2 + mmult_rt_params->sign_A );
                aie::store_v(pATdm, a); pATdm+=128;
            } else {
                a = read_v<128>(pAs);pAs += 128;
                insert_staging( a, ib++, 2 + mmult_rt_params->sign_A );
            }
        } else {
            a = read_v<128>(pAs);
            insert_staging( a, ib++, 2 + mmult_rt_params->sign_A );     pAs += 128;
        }
    };

    // Swaps accumulator slot 16 over to the activation sum: the matmul
    // partials of acc0[16] / acc1[16] are saved to gemm_tdm_l / gemm_tdm_h and
    // the previously saved sum is loaded from sum_tdm_ifm_l / sum_tdm_ifm_h.
    auto sum_start = [&]( ) __attribute__(( always_inline )) {
        //Accumulators are spilled and restored to create the ifm_sum, choosing accum 16
        const unsigned spill_idx = 16;
        *(gemm_tdm_l) = (v32int32) extract_v32acc32( acc0[spill_idx],0);
        *(gemm_tdm_l + 1) = (v32int32) extract_v32acc32( acc0[spill_idx],1);
        *(gemm_tdm_h) = (v32int32) extract_v32acc32( acc1[spill_idx],0);
        *(gemm_tdm_h + 1) = (v32int32) extract_v32acc32( acc1[spill_idx],1);
        acc0 = insert( acc0, spill_idx, 0, *((chess_protect_access v32acc32*) sum_tdm_ifm_l));
        acc0 = insert( acc0, spill_idx, 1, *((chess_protect_access v32acc32*) sum_tdm_ifm_l+1));
        acc1 = insert( acc1, spill_idx, 0, *((chess_protect_access v32acc32*) sum_tdm_ifm_h) );
        acc1 = insert( acc1, spill_idx, 1, *((chess_protect_access v32acc32*) sum_tdm_ifm_h+1) );
    };

    // Inverse of sum_start: stores the activation sum held in slot 16 and
    // restores the matmul partials that were saved there.
    auto sum_end_int16 = [&]( ) __attribute__(( always_inline )) {
        //Accumulators are spilled and restored to create the ifm_sum, choosing accum 16
        const unsigned spill_idx = 16;
        sum_tdm_ifm_l[0] = (v32int32) extract_v32acc32(acc0[spill_idx], 0);
        sum_tdm_ifm_l[1] = (v32int32) extract_v32acc32(acc0[spill_idx], 1);
        sum_tdm_ifm_h[0] = (v32int32) extract_v32acc32(acc1[spill_idx], 0);
        sum_tdm_ifm_h[1] = (v32int32) extract_v32acc32(acc1[spill_idx], 1);
        acc0 = insert( acc0, spill_idx, 0, *((chess_protect_access v32acc32*) gemm_tdm_l));
        acc0 = insert( acc0, spill_idx, 1, *((chess_protect_access v32acc32*) gemm_tdm_l+1));
        acc1 = insert( acc1, spill_idx, 0, *((chess_protect_access v32acc32*) gemm_tdm_h));
        acc1 = insert( acc1, spill_idx, 1, *((chess_protect_access v32acc32*) gemm_tdm_h+1));
    };

    // Adds one block of staged activations to the running sum in slot 16.
    // The staged activations become the matrix operand; multiplying by an
    // alternating 1,0,... mask (acc0) and 0,1,... mask (acc1) sums the even
    // and odd bytes separately.
    // The parameter shadows the outer zero_acc; it is cleared before the
    // return, so the returned value is always 0.
    auto sum_block_int16 = [&]( bool zero_acc = 0 ) __attribute__(( always_inline )) {
        //Accumulators are spilled and restored to create the ifm_sum, choosing accum 16
        const unsigned spill_idx = 16;
        staging_to_matrix_m64x64int8( );
        auto low_mask = chess_duplicate(aie::interleave_zip(aie::broadcast<int8,64>( 1 ),aie::broadcast<int8,64>( 0 ),1).first);
        auto high_mask = chess_duplicate(aie::interleave_zip(aie::broadcast<int8,64>( 0 ),aie::broadcast<int8,64>( 1 ),1).first);

        if ( chess_manifest( zero_acc == 1 )) {
            acc0[spill_idx] = mul( low_mask, false, false );
            acc1[spill_idx] = mul( high_mask, false, mmult_rt_params->sign_A );
        } else {
            acc0[spill_idx] = mac_conf( low_mask, false, false, acc0[spill_idx], zero_acc );
            acc1[spill_idx] = mac_conf( high_mask, false, mmult_rt_params->sign_A, acc1[spill_idx], zero_acc );
        }

        zero_acc = 0;
        // Streamed activations are re-read linearly, so no 3D step is needed.
        if constexpr(!stream_ifm_actsum)
            pAs = add_3d_byte( pAs, dimsAs );
        return zero_acc;
    };

    // Converts the spilled integer sums into one float vector at sum_ifm_qdq:
    // for both the low and the high sum, the two de-interleaved halves of the
    // stored pair are added together, then low * 1.0 + high * 256.0 is formed.
    // The local `acc` is declared but never used.
    auto sum_int16_post_cache_rev_unroll = [&]( ) __attribute__(( always_inline )) {
            aie::accum<accfloat,32> acc;
            aie::vector<int32,32> vec_high, vec_low;
            vec_low = aie::interleave_unzip( aie::vector<int32,32>(sum_tdm_ifm_l[0]), aie::vector<int32,32>(sum_tdm_ifm_l[1]), 1).first;
            vec_low = aie::add(vec_low,aie::interleave_unzip( aie::vector<int32,32>(sum_tdm_ifm_l[0]), aie::vector<int32,32>(sum_tdm_ifm_l[1]), 1).second);
            vec_high = aie::interleave_unzip( aie::vector<int32,32>(sum_tdm_ifm_h[0]), aie::vector<int32,32>(sum_tdm_ifm_h[1]), 1).first;
            vec_high = aie::add(vec_high, aie::interleave_unzip( aie::vector<int32,32>(sum_tdm_ifm_h[0]), aie::vector<int32,32>(sum_tdm_ifm_h[1]), 1).second);
            aie::vector<float,32>  vec_scale = (v32float) mul_elem_32( vec_low, 1.0f);
            vec_scale = (v32float) mac_elem_32(vec_high, 256.0f, (v32accfloat) vec_scale);
            *((v32float*) sum_ifm_qdq ) = vec_scale;
    };

    // Dequantizes half `i` (0 or 1) of accumulator row `l` and writes 32 int16
    // results to pO. Computed value per lane:
    //     (acc0 + 256 * acc1) * c2 + c0  [+ c1 * (*sum_ifm)]
    // converted with sign_O and a shift of -shift_res.
    // Nothing is emitted unless T_cf is float.
    auto write_output = [&]( auto l , auto i, float* sum_ifm) __attribute__(( always_inline )) {
        if constexpr ( std::is_same_v<T_cf, float> ){

            v32int32 gemm_low = (v32int32 )  extract_v32acc32(acc0[l],i);
            v32int32 gemm_high = ( v32int32 ) extract_v32acc32(acc1[l],i);
            aie::vector<float,32>  c0_bias_vector;
            decltype(access<V_c2>( coeff_cache.c2, i ))  qdq_coeff2, c2_shifted;
            aie::accum<accfloat,32> qdq_acc, qdq_acc1;

            // Per-column c2 is loaded as a vector; otherwise c2 and its
            // pre-multiplied c2_shift are fetched through access<>.
            if constexpr ( vector_coeff > 1 ){
                decltype(access<V_c2>( coeff_cache.c2, i )) qdq_coeff2_pin = locate_in_register<5>(*((chess_protect_access v32float __aie_dm_resource_b *)coeff_cache.c2 + i));
                qdq_coeff2 = qdq_coeff2_pin;
            } else {
                qdq_coeff2 = access<V_c2>( coeff_cache.c2, i );
                c2_shifted =  access<V_c2>( coeff_cache.c2_shift, i );
            }
            c0_bias_vector = locate_in_register<4>(*((chess_protect_access v32float __aie_dm_resource_b*)coeff_cache.c0 + i));

            // The three branches compute the same expression; they differ in
            // register placement and in whether the 256x factor is applied to
            // the accumulator (vector c2) or folded into c2_shift.
            if constexpr ( vector_coeff > 1 && has_actv_sum){
                qdq_acc =  locate_in_register<0>( mul_elem_32(gemm_low, 1.0f ) );
                qdq_acc1 = locate_in_register<1>( mac_elem_32(gemm_high, 256.0f, (v32accfloat) qdq_acc ));
                if( i == 0){
                    qdq_acc1 = locate_in_register<2>( mac_elem_32((v32float) qdq_acc1, qdq_coeff2, (v32accfloat) c0_bias_vector));
                } else {
                    qdq_acc1 = locate_in_register<6>( mac_elem_32((v32float) qdq_acc1, qdq_coeff2, (v32accfloat) c0_bias_vector));
                }
            }
            else if constexpr ( vector_coeff > 1 ){
                qdq_acc =  locate_in_register<1>( mul_elem_32(gemm_low, 1.0f ) );
                qdq_acc1 = mac_elem_32(gemm_high, 256.0f, (v32accfloat) qdq_acc );
                qdq_acc1 = mac_elem_32((v32float) qdq_acc1, qdq_coeff2, (v32accfloat) c0_bias_vector);
            } else {
                qdq_acc =  locate_in_register<0>( aie::accum<accfloat,32>( mac_elem_32(gemm_low, qdq_coeff2, (v32accfloat) c0_bias_vector) ));
                qdq_acc1 = locate_in_register<1>( aie::accum<accfloat,32>( mac_elem_32(gemm_high, c2_shifted, (v32accfloat) qdq_acc ) ));
            }

            // Activation-sum term: c1 (vector or broadcast scalar) times the
            // single float that sum_ifm points to.
            if constexpr( has_actv_sum ) {
                v32float qdq_coeff1;
                if constexpr ( vector_coeff >= 1 ){
                    qdq_coeff1 = locate_in_register<4 + (vector_coeff == 1)>(*((chess_protect_access __aie_dm_resource_b v32float*)coeff_cache.c1 + i));
                } else {
                    qdq_coeff1 = chess_duplicate(aie::broadcast<float,32>(access<V_c1>( coeff_cache.c1, i )));
                }
                if( i == 0)
                    qdq_acc1 = locate_in_register<2>(mac_elem_32(qdq_coeff1, *(float *)sum_ifm, (v32accfloat) qdq_acc1));
                else
                    qdq_acc1 = locate_in_register<6>(mac_elem_32(qdq_coeff1, *(float __aie_dm_resource_a *)sum_ifm, (v32accfloat) qdq_acc1));
            }

            auto out = qdq_acc1.template to_vector_sign<int16>( mmult_rt_params->sign_O, aie::neg(mmult_rt_params->shift_res) );
            write_v(pO, out); pO = add_byte( pO, 2*V_qdq );
        }
    };

    // is_fileio is 0 in both configurations, so only il_lr selects the branch.
    #ifdef FILE_IO
    constexpr bool is_fileio = 0;//!( src_cfg == DSC_STREAM );
    #else
    constexpr bool is_fileio = 0;
    #endif

    if constexpr( il_lr <= 1 || is_fileio ) {
        // ---- Non-pipelined schedule (DEBUG_KERNEL) -------------------------
        // Each phase runs to completion before the next starts: optional
        // activation sum, weight fetch + multiply, coefficient fetch, output.
        for ( int it=0; it < iters; it++ )
        chess_loop_range(1,)
        {
            acc0 = chess_dont_care( m32x64acc32 );
            acc1 = chess_dont_care( m32x64acc32 );
            zero_acc = 1;
            for ( int j=0; j < ol_bound; j++ )
            chess_loop_range(1,)
            {
                weight_acquire( );
                ifm_acquire( );
                bool do_actv_sum = has_actv_sum && ( keep_sum_cnt == 0 );

                if ( do_actv_sum ) {
                    sum_start( );
                    // z mirrors zero_acc for the first block only;
                    // sum_block_int16 returns 0 afterwards.
                    auto z = zero_acc;

                    for ( int i=0; i < sum_bound; i++ )
                    chess_loop_range(1,)
                    {
                        for ( unsigned l = 0; l < granM; l++ ) {
                            sum_fetch( );
                        }
                        z = sum_block_int16(  z );
                    }

                    sum_end_int16();

                }

                for ( int i=0; i < il_bound; i++ )
                chess_loop_range(1,)
                {

                    for ( unsigned l = 0; l < granM; l++ ) {

                        weight_fetch( );
                    }

                    compute_prepare( );

                    for ( unsigned l = 0; l < granM; l++ ) {
                        compute_execute( );
                        compute_incr( );
                    }

                    compute_finalize( );

                }
                // Coefficients are cached while bufC is still held.
                coeff_fetch( );
                weight_release( );
                ifm_release( );
            }

            im0=0;
            im1=0;

            ofm_acquire( );
            keep_sum_cnt = keep_sum_cnt == 0 ? keep_sum_iters - 1 : 0;
            if constexpr( has_actv_sum ) {
                sum_int16_post_cache_rev_unroll();
            }
            uint5_t iPo = 0;
            uint5_t iPoo = 0;
            float* sum_ifm = (float *) sum_ifm_qdq;
            // Two output halves per accumulator row; both share one sum entry.
            for ( unsigned l =0; l < granM; l++ )
            chess_loop_range(1,)
            {
                write_output(iPo++, 0, sum_ifm);
                write_output(iPoo++, 1, sum_ifm++);
            }
            ofm_release( );
        }

    } else {
        // ---- Software-pipelined schedule -----------------------------------
        // Weight fetches for the next block are interleaved with the
        // multiplies of the current block, and output writes of one outer
        // iteration overlap with the first multiplies of the next one.

        // Prologue: stage the first weight block, then run the first multiply
        // block while staging the second weight block.
        weight_acquire( );
        for ( unsigned l = 0; l < granM; l++ ) {
            weight_fetch( );
        }

        compute_prepare( );
        ifm_acquire( );
        zero_acc = 1;

        for ( unsigned l = 0; l < granM; l++ ) {
            compute_execute( );
            compute_incr( );
            weight_fetch( );
        }
        // These two values are not read anywhere below.
        v32int32 dbg_gemm_lo = (v32int32 )  extract_v32acc32(acc0[0],0);
        v32int32 dbg_gemm_hi = (v32int32 )  extract_v32acc32(acc1[0],0);
        compute_finalize( );
        compute_prepare( );
        for ( int it=0; it < iters; it++ )
        chess_loop_range( 1, )
        {
            for ( int j=0; j < ol_bound; j++ )
            chess_loop_range( 1, )
            {
                // Number of inner iterations handled outside the
                // pipelined_loop call below (presumably the blocks peeled into
                // the prologue / sum section / outer-loop tail - TODO confirm).
                constexpr unsigned il_code_fold = 2 + ( has_actv_sum > 0 );
                pipelined_loop<il_lr - il_code_fold, std::max( il_code_fold, il_peel ) - il_code_fold>( il_bound - il_code_fold, [&]( auto i ) __attribute__((always_inline))
                {
                    pipelined_loop<8, 4>( granM, [&]( auto l ) __attribute__((always_inline))
                    {
                        compute_execute( );
                        compute_incr( );
                        weight_fetch( );
                    });

                    compute_finalize( );
                    compute_prepare( );
                });
                if constexpr( has_actv_sum ) {
                    // One multiply block whose spare slots stage activations
                    // for the sum instead of the next weights.
                    pipelined_loop<8, 4>( granM, [&]( auto l ) __attribute__((always_inline))
                    {
                        compute_execute( );
                        compute_incr( );
                        sum_fetch( );
                    });

                    compute_finalize( );
                    sum_start( );
                    zero_sum = sum_block_int16( zero_sum );

                    // Remaining sum blocks; with streamed activations the
                    // last block is handled separately below.
                    for ( int i=1; i < il_bound - stream_ifm_actsum; i++ )
                        chess_prepare_for_pipelining
                        chess_loop_range( il_lr - 1 - stream_ifm_actsum, )
                    {
                        for ( unsigned l = 0; l < granM ; l++ )
                            chess_peel_pipelined_loop( 10 )
                            chess_pipeline_adjust_preamble( -1 )
                        {
                            sum_fetch( );
                        }
                        sum_block_int16( 0 );
                    }
                    if constexpr(stream_ifm){
                        // Final sum block is read directly from the stream
                        // (and copied to pATdm by sum_fetch).
                        for ( unsigned l = 0; l < granM ; l++ )
                            chess_peel_pipelined_loop( 10 )
                            chess_pipeline_adjust_preamble( -1 )
                        {
                            sum_fetch( false );
                        }
                        sum_block_int16( 0 );
                    }
                    sum_end_int16();

                    // The staging slots were used for activations, so the
                    // next weight block is staged here.
                    for ( unsigned l = 0; l < granM; l++ ) {
                        weight_fetch( );
                    }
                    compute_prepare( );
                }
                weight_release( );

                if ( j >= ol_bound - 1 )
                    break;

                weight_acquire( );

                for ( unsigned l = 0; l < granM; l++ ) {
                    compute_execute( false, stream_ifm_actsum );
                    compute_incr( );
                    weight_fetch( );
                }

                // Rewind the stored-activation cursors for the next pass.
                if constexpr(stream_ifm_actsum){
                    pATdm = pAI;
                    pAs =  pAI;
                }
                ifm_release( );

                compute_finalize( );
                compute_prepare( );
                ifm_acquire( );

                for ( unsigned l = 0; l < granM; l++ ) {
                    compute_execute( );
                    compute_incr( );
                    weight_fetch( );
                }

                compute_finalize( );
                compute_prepare( );
            }

            ofm_acquire( );
            // NOTE(review): unlike the non-pipelined branch, coeff_fetch()
            // runs here after weight_release() has already released bufC; for
            // non-streamed coefficients it reads through pC - confirm that the
            // buffer contents are still valid at this point.
            coeff_fetch( );
            im0=0;
            im1=0;
            if constexpr( has_actv_sum ) {
                sum_int16_post_cache_rev_unroll();
            }
            // The last outer iteration writes its output in the epilogue
            // after this loop.
            if ( it >= iters - 1 )
                break;
            //using two indexing variables to improve scheduling
            uint5_t iPo = 0;
            uint5_t iPoo = 0;

            weight_acquire( );
            float* sum_ifm = (float *) sum_ifm_qdq;
            // First half of the output rows, overlapped with the multiplies
            // and weight fetches of the next outer iteration.
            for ( unsigned l = 0; l < granM / 2; l++ )
            chess_allocate(Y : 8)
            {
                compute_execute( 1, stream_ifm_actsum );
                compute_incr( );
                write_output(iPo++, 0, sum_ifm);
                compute_execute( 1, stream_ifm_actsum  );
                write_output(iPoo++, 1, sum_ifm++);
                compute_incr( );
                weight_fetch( );
                weight_fetch( );
            }

            if constexpr(stream_ifm_actsum){
                pATdm =  pAI;
                pAs =  pAI;
            }

            ifm_release( );

            compute_finalize( );
            compute_prepare( );
            ifm_acquire( );
            zero_acc = 1;
            zero_sum = 1;
            //Peeling two calls to improve loop scheduling
            write_output(iPo++, 0, sum_ifm);
            write_output(iPoo++, 1, sum_ifm++);
            // Second half of the output rows; the two peeled multiplies
            // follow the loop.
            for ( unsigned l = 0; l < (granM / 2)-1; l++ )
            chess_allocate(Y : 8)
            {
                compute_execute( 1 );
                compute_incr( );
                write_output(iPo++, 0, sum_ifm);
                compute_execute( 1  );
                write_output(iPoo++, 1, sum_ifm++);
                compute_incr( );
                weight_fetch( );
                weight_fetch( );
            }
            compute_execute( 1 );
            compute_incr( );
            compute_execute( 1  );
            compute_incr( );
            weight_fetch( );
            weight_fetch( );

            ofm_release( );

            compute_finalize( );
            compute_prepare( );
        }

        // Epilogue: write the output of the final outer iteration.
        uint5_t iPo = 0;
        uint5_t iPoo = 0;
        float* sum_ifm = (float *) sum_ifm_qdq;
        for ( unsigned l = 0; l < granM; l++ ) {
            compute_execute( 1 , stream_ifm_actsum);
            write_output(iPo++, 0, sum_ifm);
            write_output(iPoo++, 1, sum_ifm++);
            compute_incr( );
        }

        ifm_release( );
        ofm_release( );
        compute_finalize( );

    }

    event1( );
}

#endif // __ACTIVATED_MMULT_QDQ_INT16X4_IMPL_HPP__

    