/*  (c) Copyright 2019 - 2024 Xilinx, Inc. All rights reserved.

    This file contains confidential and proprietary information
    of Xilinx, Inc. and is protected under U.S. and
    international copyright and other intellectual property
    laws.

    DISCLAIMER
    This disclaimer is not a license and does not grant any
    rights to the materials distributed herewith. Except as
    otherwise provided in a valid license issued to you by
    Xilinx, and to the maximum extent permitted by applicable
    law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
    WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
    AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
    BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
    INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
    (2) Xilinx shall not be liable (whether in contract or tort,
    including negligence, or under any other theory of
    liability) for any loss or damage of any kind or nature
    related to, arising under or in connection with these
    materials, including for any direct, or any indirect,
    special, incidental, or consequential loss or damage
    (including loss of data, profits, goodwill, or any type of
    loss or damage suffered as a result of any action brought
    by a third party) even if such damage or loss was
    reasonably foreseeable or Xilinx had been advised of the
    possibility of the same.

    CRITICAL APPLICATIONS
    Xilinx products are not designed or intended to be fail-
    safe, or for use in any application requiring fail-safe
    performance, such as life-support or safety devices or
    systems, Class III medical devices, nuclear facilities,
    applications related to the deployment of airbags, or any
    other applications that could lead to death, personal
    injury, or severe property or environmental damage
    (individually and collectively, "Critical
    Applications"). Customer assumes the sole risk and
    liability of any use of Xilinx products in Critical
    Applications, subject only to applicable laws and
    regulations governing limitations on product liability.

    THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
    PART OF THIS FILE AT ALL TIMES.                       */


#ifndef __COMMON_HH__
#define __COMMON_HH__

#include <stdint.h>
#include <stdlib.h>

// ALWAYS_INLINE: function qualifier used to request inlining.
// A definition supplied by the build takes precedence. Otherwise clang-based
// compilers get `inline` plus the always_inline attribute, and every other
// compiler falls back to plain `inline`.
#if !defined( ALWAYS_INLINE )
#  if defined( __clang__ )
#    define ALWAYS_INLINE inline __attribute__(( always_inline ))
#  else
#    define ALWAYS_INLINE inline
#  endif
#endif

// NO_INLINE: function qualifier that forbids inlining.
// A definition supplied by the build takes precedence. The attribute spelling
// is the same for clang and non-clang compilers, so no compiler check is made.
#if !defined( NO_INLINE )
#define NO_INLINE __attribute__(( noinline ))
#endif

// __IS_QDQ_FP16__ selects the 16-bit floating point type behind QDQFloatType.
// It defaults to 0 when the build does not define it.
#if !defined( __IS_QDQ_FP16__ )
#define __IS_QDQ_FP16__ 0
#endif

// QDQFloatType is bfloat16 unless __IS_QDQ_FP16__ is non-zero, in which case
// it is float16.
#if !__IS_QDQ_FP16__
    using QDQFloatType = bfloat16;
#else
    using QDQFloatType = float16;
#endif


// Forward declarations of the scalar int-by-float multiply and
// multiply-accumulate helpers. Both are defined further down in this file,
// after the mul_elem_16 / mac_elem_16 wrappers they are built on.
inline float mul( int   a, float b );
inline float mac( int   a, float b, float c );

// Divides `val` by `div` using an arithmetic right shift of
// ( 31 - clb( div )) bits. No check is made on `div`.
// NOTE(review): presumably `div` must be a positive power of two so that the
// shift amount equals log2( div ) - confirm against the clb intrinsic.
inline int fast_div_p2( int val, int div ) {
    const int shift_amount = 31 - clb( div );
    return val >> shift_amount;
}

// Shift-based division like fast_div_p2, but ( div - 1 ) is added to `val`
// before shifting so that the result is rounded up rather than down.
// NOTE(review): presumably `div` must be a positive power of two - confirm.
inline int fast_div_ceil_p2( int val, int div ) {
    const int round_up_bias = div - 1;
    const int shift_amount  = 31 - clb( div );
    return ( val + round_up_bias ) >> shift_amount;
}

// Plain integer division kept in a separate function marked cold.
// Returns val / div with the usual C++ truncation toward zero; `div` must be
// non-zero.
[[gnu::cold]]
static int div_wrap( int val, int div ) {
    const int quotient = val / div;
    return quotient;
}

// Forward declaration of the v32int32-by-scalar-float multiply that
// fast_div( int, int ) below calls; its definition is not in this part of
// the file.
inline v32accfloat mul_elem_32( v32int32 a, float b );

// Integer division val / div computed through the floating point vector
// multiplier, with a fallback to the native `/` operator.
//
// Fast path: `val` is placed in lane 0 of an otherwise undefined vector,
// multiplied by inv( fix2float(( div << 15 ) - div, 15 )), converted back to
// integers with to_v32int32 and lane 0 is read out.
// NOTE(review): ( div << 15 ) - div equals div * 32767, so the reciprocal is
// presumably taken of a value slightly below `div` to bias the result - confirm.
//
// Fallback: when clb( div ) < 16 or clb( val ) < 8 the result is replaced by
// val / div. The commented-out code records earlier variants of the algorithm.
inline int fast_div( int val, int div ) {
    //int ret;
    //if ( chess_copy(( population_count( div ) == 1 ) & ( div > 0 ))) [[likely]] {
    //    ret = val >> ( 31 - clb( div ));
    //} else {
    //    int di = inv(( div << 16 ) - div, 16, 16 );
    //    ret = ( val * di + di / 2 ) >> 16;
    //    if ( chess_copy(( clb( div ) < 16 ) | ( ret * div > val ) | ( ret * div <= val - div ))) {
        //if ( chess_copy(( clb( div ) < 16 ) | ( clb( val ) < 16 ))) {
    //        [[unlikely]]
            //ret = div_wrap( val, div );
    //        ret = val / div;
    //    }
        /*
        int16_t v = val;
        int8_t  d = div;
        int ret;
        if (( v != val ) + ( d != div )) {
            [[unlikely]]
            ret = val / div;
        } else {
            ret = ( val * inv( div, 0, 16 ) + 32768 ) >> 16;
        }*/
    //}
    //int ret = float2fix( mac( val, inv( fix2float( div, 0 )), 0.5 ), 0 );
    //int ret = float2fix( mul( val, inv( fix2float( div, 0 ))), 0 );
    // Fast path: reciprocal multiply carried out in lane 0 of a vector.
    int ret = extract_elem( extract_v16int32( to_v32int32( mul_elem_32( set_v32int32( 0, insert( undef_v16int32( ), 0, val )), inv( fix2float(( div << 15 ) - div, 15 )))), 0 ), 0 );
    // Operands with too few leading bits (as reported by clb) take the exact division.
    if ( chess_copy(( clb( div ) < 16 ) | ( clb( val ) < 8 ))) [[unlikely]] {
        // NOTE(review): chess_copy( 0 ) presumably hides the constant 0 from the
        // optimizer so that this stays a select between `ret` and val / div - confirm.
        ret = chess_copy( 0 ) ? ret : val / div;
    }
    return ret;
}

// 16-bit signed division by reciprocal multiplication.
// The reciprocal comes from inv(( div << 16 ) - 1, 16, 16 ); the product is
// rounded by adding half of the reciprocal's magnitude and then shifted right
// by 16 bits. Returns -1 when `div` is zero.
inline int16_t fast_div( int16_t val, int16_t div ) {
    // The reciprocal is requested before the zero check, as in the fast path.
    const int reciprocal = inv(( div << 16 ) - 1, 16, 16 );
    if ( div == 0 ) {
        return -1;
    }
    const int rounded_product = val * reciprocal + abs( reciprocal ) / 2;
    return rounded_product >> 16;
}

// 16-bit unsigned division by reciprocal multiplication.
// The reciprocal comes from inv(( div << 16 ) - 1, 16, 16 ); the product is
// rounded by adding half of the reciprocal and then shifted right by 16 bits.
// Returns 0xFFFF (all bits set) when `div` is zero.
inline uint16_t fast_div( uint16_t val, uint16_t div ) {
    // The reciprocal is requested before the zero check, as in the fast path.
    const unsigned reciprocal = inv(( div << 16 ) - 1, 16, 16 );
    if ( div == 0 ) {
        return static_cast<uint16_t>( -1 );
    }
    const unsigned rounded_product = val * reciprocal + reciprocal / 2;
    return rounded_product >> 16;
}


// Returns the increment for one addressing dimension and updates the running
// reset value for the next one.
//   reset : in/out; the returned increment is ( reset + step ) using the value
//           on entry, after which count * step is subtracted from it.
//   count : number of steps whose distance is removed from `reset`.
//   step  : step of this dimension.
inline int add_dimension( int &reset, int count, int step ) property( no_debug ) {
    const int reset_on_entry = reset;
    reset = reset_on_entry - count * step;
    return reset_on_entry + step;
}

// Builds a dims_2d_t from a starting reset value, the wrap count of the inner
// dimension and the steps of both dimensions. The inner increment comes from
// add_dimension (which also rewinds `reset`); the outer increment is the
// rewound reset plus step1.
inline dims_2d_t dims_2d_from_steps_reset( int reset, int wrap0, int step0, int step1 ) {
    const int last0      = wrap0 - 1;
    const int inner_incr = add_dimension( reset, last0, step0 );
    const int outer_incr = reset + step1;
    return dims_2d_t( last0, inner_incr, outer_incr );
}

// Builds a dims_3d_t from a starting reset value, the wrap counts of the two
// inner dimensions and the steps of all three. Each inner increment comes from
// add_dimension, which rewinds `reset` before the next dimension is handled;
// the outermost increment is the final reset plus step2.
inline dims_3d_t dims_3d_from_steps( int reset, int wrap0, int step0, int wrap1, int step1, int step2 ) {
    const int last0 = wrap0 - 1;
    const int last1 = wrap1 - 1;
    // Order matters: each call consumes the reset left by the previous one.
    const int incr0 = add_dimension( reset, last0, step0 );
    const int incr1 = add_dimension( reset, last1, step1 );
    const int incr2 = reset + step2;
    return dims_3d_t( last0, incr0, last1, incr1, incr2 );
}


// Returns ( 127 + offset ) multiplied by 0x01010101, i.e. the same byte value
// repeated in all four bytes of the result as long as 127 + offset fits in
// one byte.
inline int get_expo( int offset ) property( no_debug ) {
    const int biased = 127 + offset;
    return biased * 0x1010101;
}
// Builds a sparsity_t from `mask`: the value 0x11 * mask is broadcast with
// broadcast_s8, the first v16int8 part is extracted and its storage is
// reinterpreted as a sparsity_t.
inline sparsity_t get_sparse( int mask ) property( no_debug ) {
    const int replicated = 0x11 * mask;
    v16int8 sparse = extract_v16int8( broadcast_s8( replicated ), 0 );
    return *( sparsity_t* )&sparse; // TODO: create CRVO to get this enabled as a move
}




// Applies the scalar inv( ) to each of the two lanes of `vec` and returns the
// results in the same lane positions.
inline v2float inv( v2float vec ) property( no_debug ) {
    const auto lane0 = inv( extract_elem( vec, 0 ));
    const auto lane1 = inv( extract_elem( vec, 1 ));
    v2float ret = set_v2float( 0, lane0 );
    return insert( ret, 1, lane1 );
}
// Applies the scalar invsqrt( ) to each of the two lanes of `vec` and returns
// the results in the same lane positions.
inline v2float invsqrt( v2float vec ) property( no_debug ) {
    const auto lane0 = invsqrt( extract_elem( vec, 0 ));
    const auto lane1 = invsqrt( extract_elem( vec, 1 ));
    v2float ret = set_v2float( 0, lane0 );
    return insert( ret, 1, lane1 );
}
// Extracts the v2float at position `idx` of `in`, applies invsqrt to it and
// inserts the result at position `idx` of `out`. Returns the updated vector.
inline v16float insert_invsqrt_extract_v2float( v16float out, int idx, v16float in ) property( no_debug ) {
    const auto pair = extract_v2float( in, idx );
    return insert( out, idx, invsqrt( pair ));
}
// Accumulator variant: idx / 16 selects the v16accfloat part of `in`, and
// idx & 15 is used both as the extract position within that part and as the
// insert position in `out`.
inline v16float insert_invsqrt_extract_v2float( v16float out, int idx, v32accfloat in ) property( no_debug ) {
    const int      slot = idx & 15;
    const v16float part = ( v16float ) extract_v16accfloat( in, idx/16 );
    return insert( out, slot, invsqrt( extract_v2float( part, slot )));
}
// Scalar insert for wide vectors and accumulators. For types wider than 16
// lanes, idx / 16 selects the 16-lane part and idx & 15 the lane inside it;
// the part is extracted, updated and written back.

// v32int32: replaces lane `idx` with `s`.
inline v32int32 insert( v32int32 v, int idx, int s ) property( no_debug ) {
    const int part = idx / 16;
    const int lane = idx & 15;
    const auto updated = insert( extract_v16int32( v, part ), lane, s );
    return insert( v, part, updated );
}
// v16acc32: performs the insert on the v16int32 view of the accumulator.
inline v16acc32 insert( v16acc32 v, int idx, int s ) property( no_debug ) {
    const v16int32 as_vector = ( v16int32 ) v;
    return ( v16acc32 )insert( as_vector, idx, s );
}
// v32acc32: replaces lane `idx` with `s`.
inline v32acc32 insert( v32acc32 v, int idx, int s ) property( no_debug ) {
    const int part = idx / 16;
    const int lane = idx & 15;
    const auto updated = insert( extract_v16acc32( v, part ), lane, s );
    return insert( v, part, updated );
}
// v64acc32: replaces lane `idx` with `s`.
inline v64acc32 insert( v64acc32 v, int idx, int s ) property( no_debug ) {
    const int part = idx / 16;
    const int lane = idx & 15;
    const auto updated = insert( extract_v16acc32( v, part ), lane, s );
    return insert( v, part, updated );
}


// Extracts a v16acc32 from an m32x64acc32: first the v32acc32 addressed by
// ( i, p / 2 ) is taken, then its part ( p & 1 ).
inline v16acc32 extract_v16acc32( m32x64acc32 a, int i, int p ) {
    const auto wide_part = extract_v32acc32( a, i, p / 2 );
    return extract_v16acc32( wide_part, p & 1 );
}


// Reads lane `idx` of a float accumulator as a float: idx / 16 selects the
// v16accfloat part, which is viewed as v16int32; lane idx & 15 is extracted
// and its bits are reinterpreted with as_float.
inline float extract_elem( v64accfloat v, int idx ) property( no_debug ) {
    const v16int32 part_bits = ( v16int32 )extract_v16accfloat( v, idx/16 );
    return as_float( extract_elem( part_bits, idx&15 ));
}
// Same as above for a v32accfloat.
inline float extract_elem( v32accfloat v, int idx ) property( no_debug ) {
    const v16int32 part_bits = ( v16int32 )extract_v16accfloat( v, idx/16 );
    return as_float( extract_elem( part_bits, idx&15 ));
}

// Concatenates two v32float vectors into a v64float by casting both to
// v32accfloat, concatenating those, and casting the result back.
inline v64float concat( v32float a, v32float b ) property( no_debug ) {
    const v32accfloat first  = ( v32accfloat ) a;
    const v32accfloat second = ( v32accfloat ) b;
    return ( v64float ) concat( first, second );
}

// 32-lane float broadcasts, provided here only when __AIE_ARCH__ < 40.
// Each one broadcasts `s` to a 16-lane value and concatenates it with itself.
#if __AIE_ARCH__ < 40
inline v32float broadcast_to_v32float( float s ) property( no_debug ) {
    const v16float half = broadcast_to_v16float( s );
    return concat( half, half );
}
inline v32accfloat broadcast_to_v32accfloat( float s ) property( no_debug ) {
    const v16accfloat half = broadcast_to_v16accfloat( s );
    return concat( half, half );
}
#endif

// 64-lane float broadcasts: `s` is broadcast to a 32-lane value which is then
// concatenated with itself.
inline v64float broadcast_to_v64float( float s ) property( no_debug ) {
    const v32float half = broadcast_to_v32float( s );
    return concat( half, half );
}
inline v64accfloat broadcast_to_v64accfloat( float s ) property( no_debug ) {
    const v32accfloat half = broadcast_to_v32accfloat( s );
    return concat( half, half );
}

// Calls broadcast_elem_128 on the v16int32 view of `v` with index `idx` and
// returns the result as v32bfloat16.
inline v32bfloat16 broadcast_extract_v8bfloat16_to_v32bfloat16( v32bfloat16 v, int idx ) property( no_debug ) {
    const v16int32 as_words = ( v16int32 )v;
    return ( v32bfloat16 ) broadcast_elem_128( as_words, idx );
}
// Shuffles `v` with itself using mode T256_2x2_lo + ( idx & 1 ), so only the
// lowest bit of `idx` is used.
inline v32bfloat16 broadcast_extract_v16bfloat16_to_v32bfloat16( v32bfloat16 v, int idx ) property( no_debug ) {
    const auto mode = T256_2x2_lo+( idx&1 );
    return shuffle( v, v, mode );
}


// Single-argument shuffles for double-width vectors: the two halves of `a`
// are passed as the two operands of the two-vector shuffle with `mode`.
inline v64int8 shuffle( v128int8 a, int mode ) property( no_debug ) {
    const auto first  = extract_v64int8( a, 0 );
    const auto second = extract_v64int8( a, 1 );
    return shuffle( first, second, mode );
}
inline v32int16 shuffle( v64int16 a, int mode ) property( no_debug ) {
    const auto first  = extract_v32int16( a, 0 );
    const auto second = extract_v32int16( a, 1 );
    return shuffle( first, second, mode );
}
inline v16int32 shuffle( v32int32 a, int mode ) property( no_debug ) {
    const auto first  = extract_v16int32( a, 0 );
    const auto second = extract_v16int32( a, 1 );
    return shuffle( first, second, mode );
}

// Full-width shuffle of a v64bfloat16: part 0 of the result is the two-vector
// shuffle of the halves of `x` with `mode`, part 1 is the same shuffle with
// mode + 1.
inline v64bfloat16 shuffle( v64bfloat16 x, int mode ) property( no_debug ) {
    const auto first  = extract_v32bfloat16( x, 0 );
    const auto second = extract_v32bfloat16( x, 1 );
    v64bfloat16 result = set_v64bfloat16( 0, shuffle( first, second, mode ));
    return insert( result, 1, shuffle( first, second, mode+1 ));
}


// Applies two single-vector shuffles in sequence: first mode T16_8x4, then
// mode T16_4x2 on the intermediate result.
inline v32int16 interleave_T16_4x2x4( v32int16 a ) property( no_debug ) {
    const auto after_8x4 = shuffle( a, T16_8x4 );
    return shuffle( after_8x4, T16_4x2 );
}


// Two-vector shuffle for v8acc64, performed on the v16int32 views of both
// operands and cast back.
inline v8acc64 shuffle( v8acc64 a, v8acc64 b, int mode ) property( no_debug ) {
    const v16int32 lhs = ( v16int32 )a;
    const v16int32 rhs = ( v16int32 )b;
    return ( v8acc64 ) shuffle( lhs, rhs, mode );
}

// Shuffles for v16accfloat, performed on the v16int32 view and cast back.
// Two-operand form:
inline v16accfloat shuffle( v16accfloat a, v16accfloat b, int mode ) property( no_debug ) {
    const v16int32 lhs = ( v16int32 )a;
    const v16int32 rhs = ( v16int32 )b;
    return ( v16accfloat ) shuffle( lhs, rhs, mode );
}
// Single-operand form:
inline v16accfloat shuffle( v16accfloat a, int mode ) property( no_debug ) {
    const v16int32 as_words = ( v16int32 )a;
    return ( v16accfloat ) shuffle( as_words, mode );
}

// Shuffles for v64float8, performed on the v64int8 view and cast back.
// Two-operand form:
inline v64float8 shuffle( v64float8 a, v64float8 b, int mode ) {
    const v64int8 lhs = ( v64int8 )a;
    const v64int8 rhs = ( v64int8 )b;
    return ( v64float8 ) shuffle( lhs, rhs, mode );
}
// Single-operand form:
inline v64float8 shuffle( v64float8 a, int mode ) {
    const v64int8 as_bytes = ( v64int8 )a;
    return ( v64float8 ) shuffle( as_bytes, mode );
}
// Shuffles for v32float16, performed on the v32bfloat16 view and cast back.
// Two-operand form:
inline v32float16 shuffle( v32float16 a, v32float16 b, int mode ) {
    const v32bfloat16 lhs = ( v32bfloat16 )a;
    const v32bfloat16 rhs = ( v32bfloat16 )b;
    return ( v32float16 ) shuffle( lhs, rhs, mode );
}
// Single-operand form:
inline v32float16 shuffle( v32float16 a, int mode ) {
    const v32bfloat16 as_bf16 = ( v32bfloat16 )a;
    return ( v32float16 ) shuffle( as_bf16, mode );
}


// shift( ) for v32float16, performed on the v32bfloat16 views of both
// operands and cast back.
inline v32float16 shift( v32float16 a, v32float16 b, int shft ) {
    const v32bfloat16 lhs = ( v32bfloat16 )a;
    const v32bfloat16 rhs = ( v32bfloat16 )b;
    return ( v32float16 ) shift( lhs, rhs, shft );
}

// sel( ) overloads for float vector types. Each one casts its operands to a
// type that already has a sel( ) with mask `m`, and casts the result back.

// v32float16 goes through v32bfloat16.
inline v32float16 sel( v32float16 a, v32float16 b, int m ) {
    const v32bfloat16 lhs = ( v32bfloat16 ) a;
    const v32bfloat16 rhs = ( v32bfloat16 ) b;
    return ( v32float16 ) sel( lhs, rhs, m );
}
// v16float goes through v16int32.
inline v16float sel( v16float a, v16float b, int m ) {
    const v16int32 lhs = ( v16int32 ) a;
    const v16int32 rhs = ( v16int32 ) b;
    return ( v16float ) sel( lhs, rhs, m );
}
// v32float goes through v32int32.
inline v32float sel( v32float a, v32float b, int m ) {
    const v32int32 lhs = ( v32int32 ) a;
    const v32int32 rhs = ( v32int32 ) b;
    return ( v32float ) sel( lhs, rhs, m );
}
// v32accfloat goes through v32float.
inline v32accfloat sel( v32accfloat a, v32accfloat b, int m ) {
    const v32float lhs = ( v32float ) a;
    const v32float rhs = ( v32float ) b;
    return ( v32accfloat ) sel( lhs, rhs, m );
}

// band( ) of a v32int32 with a v16int32: the same `b` is applied to each
// 16-lane half of `a`, and the two results are concatenated.
inline v32int32 band( v32int32 a, v16int32 b ) property( no_debug ) {
    const auto first  = band( extract_v16int32( a, 0 ), b );
    const auto second = band( extract_v16int32( a, 1 ), b );
    return concat( first, second );
}


// For model versions up to 4900, exposes me_primitive::exp2_bf20_hw under the
// unqualified name.
#if __AIE_MODEL_VERSION__ <= 4900
inline v32accfloat exp2_bf20_hw( v32accfloat a ) {
    const v32accfloat result = me_primitive::exp2_bf20_hw( a );
    return result;
}
#endif


// Converts a v64int16 to a v64acc32 by forwarding all arguments to
// to_v64acc32. `sign` defaults to 1.
inline v64acc32 ups_to_v64acc32( v64int16 vec, int shift, int sign=1 ) property( no_debug ) {
    const v64acc32 widened = to_v64acc32( vec, shift, sign );
    return widened;
}


// One-argument convenience form: calls the two-argument to_v64float8 with 0
// as the second argument.
inline v64float8 to_v64float8( v64accfloat acc ) property( no_debug ) {
    const v64float8 narrowed = to_v64float8( acc, 0 );
    return narrowed;
}

// For model versions below 11200: 32-lane multiply-accumulate assembled from
// two 16-lane mac_elem_16 calls. For each half h, mac_elem_16 receives half h
// of `b`, half h of `a`, and half h of `c` converted with to_v16acc64( .., 0 ).
// The two results become parts 0 and 1 of the returned v32acc64.
#if __AIE_MODEL_VERSION__ < 11200
inline v32acc64 mac_elem_32( v32int16 a, v32int32 b, v32int32 c ) {
    const auto part0 = mac_elem_16( extract_v16int32( b, 0 ), extract_v16int16( a, 0 ), to_v16acc64( extract_v16int32( c, 0 ), 0 ));
    const auto part1 = mac_elem_16( extract_v16int32( b, 1 ), extract_v16int16( a, 1 ), to_v16acc64( extract_v16int32( c, 1 ), 0 ));
    v32acc64 acc = set_v32acc64( 0, part0 );
    return insert( acc, 1, part1 );
}
#endif

// 16-lane multiply / multiply-accumulate by a scalar float, built on the
// 32-lane operations: `a` (and `acc`) are placed in part 0 of a 32-lane value,
// `b` is broadcast to 32 lanes, and part 0 of the 32-lane result is returned.

// v16float * b
inline v16accfloat mul_elem_16( v16float a, float b ) {
    const auto wide_a = set_v32float( 0, a );
    const auto wide_b = broadcast_to_v32float( b );
    return extract_v16accfloat( mul_elem_32( wide_a, wide_b ), 0 );
}
// v16float * b combined with acc through mac_elem_32
inline v16accfloat mac_elem_16( v16float a, float b, v16accfloat acc ) {
    const auto wide_a   = set_v32float( 0, a );
    const auto wide_b   = broadcast_to_v32float( b );
    const auto wide_acc = set_v32accfloat( 0, acc );
    return extract_v16accfloat( mac_elem_32( wide_a, wide_b, wide_acc ), 0 );
}
// v16int32 * b
inline v16accfloat mul_elem_16( v16int32 a, float b ) {
    const auto wide_a = set_v32int32( 0, a );
    const auto wide_b = broadcast_to_v32float( b );
    return extract_v16accfloat( mul_elem_32( wide_a, wide_b ), 0 );
}
// v16int32 * b combined with acc through mac_elem_32
inline v16accfloat mac_elem_16( v16int32 a, float b, v16accfloat acc ) {
    const auto wide_a   = set_v32int32( 0, a );
    const auto wide_b   = broadcast_to_v32float( b );
    const auto wide_acc = set_v32accfloat( 0, acc );
    return extract_v16accfloat( mac_elem_32( wide_a, wide_b, wide_acc ), 0 );
}

inline float mul( int a, float b ) {
    return extract_elem(( v16float ) mul_elem_16( insert( undef_v16int32( ), 0, a ), b ), 0 );
}
inline float mac( int a, float b, float c ) {
    return extract_elem(( v16float ) mac_elem_16( insert( undef_v16int32( ), 0, a ), b, ( v16accfloat ) insert( undef_v16float( ), 0, c )), 0 );
}



// Single-operand 8x8 multiply through me_primitive::mac: the control word is
// built by mac_control from the two sign flags, dtype8x8 and a final argument
// of 1, and the accumulator input is undefined.
inline v64acc32 mul( v64int8 x, int sgn_x, int sgn_y ) {
    const int control = me_primitive::mac_control( sgn_x, sgn_y, dtype8x8, 1 );
    return me_primitive::mac( undef_v64acc32( ), x, control );
}

// Single-operand 8x8 multiply-accumulate through me_primitive::mac: the
// control word is built by mac_control from the two sign flags, dtype8x8 and
// a final argument of 0, and `acc` is passed as the accumulator input.
inline v64acc32 mac( v64int8 x, int sgn_x, int sgn_y, v64acc32 acc ) {
    const int control = me_primitive::mac_control( sgn_x, sgn_y, dtype8x8, 0 );
    return me_primitive::mac( acc, x, control );
}

// Scalar-operand form of mac_elem_16: `x` is broadcast to a v16int32 and the
// call is forwarded with `y` and `acc` unchanged.
inline v16acc64 mac_elem_16( int x, int y, v16acc64 acc ) {
    const auto broadcast_x = broadcast_to_v16int32( x );
    return mac_elem_16( broadcast_x, y, acc );
}


/*
// 2d/3d params declarations
template<typename T>
inline T *add_2d_ptr_helper( T *a, dims_2d_t &params ) {
    addr_t c;
    T *r = ( T* )add_2d_int( a, params.inc2, params.inc1, params.num1, params.count1, c );
    params.count1 = c;
    return r;
}

template<typename T>
inline T *add_2d_byte_helper( T *a, dims_2d_t &params ) {
    addr_t c;
    T *r = ( T* )add_2d_byte_int( a, params.inc2, params.inc1, params.num1, params.count1, c );
    params.count1 = c;
    return r;
}
*/

// access<N>( p, i ): reads the i-th group of N elements starting at `p`.
// The generic version loads an N-element vector from p + N * i with
// aie::load_v<N>.
template<unsigned N, typename T> inline __attribute__(( always_inline ))
auto access( T * p, int i ) {
    T * const group_start = p + N * i;
    return aie::load_v<N>( group_start );
}
// Scalar specializations (N == 1) return the element `p` points at.
// NOTE(review): `i` is ignored in these - presumably scalar operands are not
// indexed; confirm against callers.
template<> inline __attribute__(( always_inline ))
auto access<1,int>( int * p, int i ) {
    return p[0];
}
template<> inline __attribute__(( always_inline ))
auto access<1,int64>( int64 * p, int i ) {
    return p[0];
}
// 16 x int64 is read as the i-th v16acc64 at `p`.
template<> inline __attribute__(( always_inline ))
auto access<16,int64>( int64 * p, int i ) {
    v16acc64 * const as_acc = ( v16acc64* ) p;
    return as_acc[i];
}
template<> inline __attribute__(( always_inline ))
auto access<1,float>( float * p, int i ) {
    return p[0];
}

// Copies `cnt` elements of type T from `src` to `dst`.
//
// The byte count sizeof( T ) * cnt is copied in 128-byte chunks first; the
// remainder is then copied with one access per set bit of the byte count
// (64, 32, 16, 8, 4, 2, 1 bytes), each placed directly after the bytes already
// copied.
//
// Each tail access starts at a byte offset of the form ( elem & ~( 2 * size - 1 )).
// That offset is divided by the access size to obtain the element index of
// the correspondingly typed pointer; using the byte offset directly as the
// index would address memory far beyond the buffers.
//
// NOTE(review): the pointer casts assume both buffers are suitably aligned
// for the widest access used - confirm at call sites.
template<typename T>
inline __attribute__(( always_inline ))
void vector_copy( T * restrict dst, T * src, int cnt ) {
    int elem = sizeof( T ) * cnt;
    for ( int i = 0; i < elem / 128; i++ ) (( v128int8* )dst )[i] = aie::utils::locate_in_register<0>((( v128int8* )src )[i]);
    if ( elem & 64 ) (( v64int8* )dst )[( elem & ~127 ) / 64] = (( v64int8* )src )[( elem & ~127 ) / 64];
    if ( elem & 32 ) (( v32int8* )dst )[( elem & ~63 ) / 32]  = (( v32int8* )src )[( elem & ~63 ) / 32];
    if ( elem & 16 ) (( v16int8* )dst )[( elem & ~31 ) / 16]  = (( v16int8* )src )[( elem & ~31 ) / 16];
    if ( elem &  8 ) ((   int64* )dst )[( elem & ~15 ) / 8]   = ((   int64* )src )[( elem & ~15 ) / 8];
    if ( elem &  4 ) ((   int32* )dst )[( elem & ~7 ) / 4]    = ((   int32* )src )[( elem & ~7 ) / 4];
    if ( elem &  2 ) ((   int16* )dst )[( elem & ~3 ) / 2]    = ((   int16* )src )[( elem & ~3 ) / 2];
    if ( elem &  1 ) ((    int8* )dst )[elem & ~1]            = ((    int8* )src )[elem & ~1];
}

// get_next_type<T>::type is aie::Utils::get_next_integer_type_t<T> for every
// T except float, which maps to itself.
template<typename T> struct get_next_type { using type = aie::Utils::get_next_integer_type_t<T>; };
template<> struct get_next_type<float> { using type = float; };
// `typename` is spelled out because get_next_type<T>::type is a dependent
// type; this matches the other alias templates in this file and keeps the
// alias valid for language modes older than C++20.
template<typename T> using get_next_type_t = typename get_next_type<T>::type;


#ifdef __ndl__

// Fallback definition for __ndl__ builds. ALWAYS_INLINE is already defined
// near the top of this file whenever the build did not provide it, so this
// guard is not expected to take effect here.
#ifndef ALWAYS_INLINE
#define ALWAYS_INLINE inline
#endif

#else

// Fallback definition for all other builds; as with the __ndl__ branch, the
// macro is normally defined already by the time this point is reached.
#ifndef ALWAYS_INLINE
#define ALWAYS_INLINE inline __attribute__(( always_inline ))
#endif

#include <cstdlib>
#include <utility>
#include <algorithm>


// Signedness trait. Every type counts as signed unless it is one of the
// unsigned vector types listed below.
template<typename T> struct Signed { static constexpr bool value = true; };
template<> struct Signed<v256uint4> { static constexpr bool value = false; };
template<> struct Signed<v128uint4> { static constexpr bool value = false; };
template<> struct Signed<v128uint8> { static constexpr bool value = false; };
template<> struct Signed<v64uint8>  { static constexpr bool value = false; };
template<> struct Signed<v64uint16> { static constexpr bool value = false; };
template<> struct Signed<v32uint16> { static constexpr bool value = false; };
template<> struct Signed<v32uint32> { static constexpr bool value = false; };
template<> struct Signed<v16uint32> { static constexpr bool value = false; };

// Shorthand for Signed<T>::value.
template<typename T>
constexpr bool Signed_v = Signed<T>::value;


// AccessType<T>::type is the type used for each individual memory access when
// a value of type T is read or written by load( ) / store( ). It is T itself
// by default; v64acc32 is accessed as v32acc32 pieces. The commented-out lines
// are mappings that are currently disabled.
template<typename T> struct AccessType { using type = T; };
//template<> struct AccessType<v64float8> { using type = v32float8; };
//template<> struct AccessType<v64float16> { using type = v32float16; };
//template<> struct AccessType<v64bfloat16> { using type = v32bfloat16; };
//template<> struct AccessType<v64int16> { using type = v64int16; };
template<> struct AccessType<v64acc32> { using type = v32acc32; };

// AccessType_t<T>: the data-memory resource annotation of T is removed before
// the lookup and re-applied to the resulting access type.
template<typename T>
using AccessType_t = aie_dm_resource_set_t<typename AccessType<aie_dm_resource_remove_t<T>>::type, aie_dm_resource_get_v<T>>;


// AccumulatorType<T>::type maps a vector type to an accumulator type.
// Types without an entry below map to v64accfloat.
template<typename T> struct AccumulatorType    { using type = v64accfloat; };
// 32-lane float types -> v32accfloat
template<> struct AccumulatorType<v32float8>   { using type = v32accfloat; };
template<> struct AccumulatorType<v32float16>  { using type = v32accfloat; };
template<> struct AccumulatorType<v32bfloat16> { using type = v32accfloat; };
template<> struct AccumulatorType<v32float>    { using type = v32accfloat; };
// 64-lane integer types -> v64acc32
template<> struct AccumulatorType<v64int4>     { using type = v64acc32; };
template<> struct AccumulatorType<v64uint4>    { using type = v64acc32; };
template<> struct AccumulatorType<v64int8>     { using type = v64acc32; };
template<> struct AccumulatorType<v64uint8>    { using type = v64acc32; };
template<> struct AccumulatorType<v64int16>    { using type = v64acc32; };
template<> struct AccumulatorType<v64uint16>   { using type = v64acc32; };
// 32-lane integer types up to 16 bits -> v32acc32
template<> struct AccumulatorType<v32int4>     { using type = v32acc32; };
template<> struct AccumulatorType<v32uint4>    { using type = v32acc32; };
template<> struct AccumulatorType<v32int8>     { using type = v32acc32; };
template<> struct AccumulatorType<v32uint8>    { using type = v32acc32; };
template<> struct AccumulatorType<v32int16>    { using type = v32acc32; };
template<> struct AccumulatorType<v32uint16>   { using type = v32acc32; };
// 32-lane 32-bit integer types -> v32acc64
template<> struct AccumulatorType<v32int32>    { using type = v32acc64; };
template<> struct AccumulatorType<v32uint32>   { using type = v32acc64; };
// Scalar element types, available only when the AIE API header guard macro is defined.
#ifdef __AIE_API_AIE__HPP__
template<> struct AccumulatorType<int16_t>     { using type = acc32; };
template<> struct AccumulatorType<int32_t>     { using type = acc64; };
template<> struct AccumulatorType<float>       { using type = accfloat; };
#endif

// AccumulatorType_t<T>: lookup performed on T with its data-memory resource
// annotation removed.
template<typename T>
using AccumulatorType_t = typename AccumulatorType<aie_dm_resource_remove_t<T>>::type;


// SubVector<T, div>::type is the type of one part of T when T is split into
// `div` equal parts. The primary template only accepts div == 1 (the type
// itself); every supported split is listed explicitly below.
template<typename T, unsigned div> struct SubVector { static_assert( div == 1 ); using type = T; };
template<> struct SubVector<v64int16,    2> { using type = v32int16; };
template<> struct SubVector<v64uint16,   2> { using type = v32uint16; };
template<> struct SubVector<v32int32,    2> { using type = v16int32; };
template<> struct SubVector<v32uint32,   2> { using type = v16uint32; };
template<> struct SubVector<v64acc32,    2> { using type = v32acc32; };
template<> struct SubVector<v64acc32,    4> { using type = v16acc32; };
template<> struct SubVector<v64accfloat, 2> { using type = v32accfloat; };
template<> struct SubVector<v64accfloat, 4> { using type = v16accfloat; };

// Shorthand for SubVector<T, div>::type.
template<typename T, unsigned div> using SubVector_t = typename SubVector<T, div>::type;

// undefined<T>( ) returns an undefined (uninitialised) value of type T by
// calling the matching undef_<type>( ) intrinsic. Only the types specialised
// below are supported; the primary template is declared but not defined.
// NOTE(review): no specialisation is provided here for the 64-bit accumulator
// types (e.g. v32acc64) - confirm none is needed by load( ).
template<typename T> inline T undefined( );
// Signed integer vectors
template<> inline v256int4  undefined( ) { return undef_v256int4( ); }
template<> inline v128int4  undefined( ) { return undef_v128int4( ); }
template<> inline v128int8  undefined( ) { return undef_v128int8( ); }
template<> inline v64int8   undefined( ) { return undef_v64int8(  ); }
template<> inline v64int16  undefined( ) { return undef_v64int16( ); }
template<> inline v32int16  undefined( ) { return undef_v32int16( ); }
template<> inline v32int32  undefined( ) { return undef_v32int32( ); }
template<> inline v16int32  undefined( ) { return undef_v16int32( ); }
// Unsigned integer vectors
template<> inline v256uint4 undefined( ) { return undef_v256uint4( ); }
template<> inline v128uint4 undefined( ) { return undef_v128uint4( ); }
template<> inline v128uint8 undefined( ) { return undef_v128uint8( ); }
template<> inline v64uint8  undefined( ) { return undef_v64uint8(  ); }
template<> inline v64uint16 undefined( ) { return undef_v64uint16( ); }
template<> inline v32uint16 undefined( ) { return undef_v32uint16( ); }
template<> inline v32uint32 undefined( ) { return undef_v32uint32( ); }
template<> inline v16uint32 undefined( ) { return undef_v16uint32( ); }
// 32-bit integer accumulators
template<> inline v64acc32  undefined( ) { return undef_v64acc32( ); }
template<> inline v32acc32  undefined( ) { return undef_v32acc32( ); }
template<> inline v16acc32  undefined( ) { return undef_v16acc32( ); }
// Floating point vectors
template<> inline v64float16  undefined( ) { return undef_v64float16( ); }
template<> inline v32float16  undefined( ) { return undef_v32float16( ); }
template<> inline v16float16  undefined( ) { return undef_v16float16( ); }
template<> inline v64bfloat16 undefined( ) { return undef_v64bfloat16( ); }
template<> inline v32bfloat16 undefined( ) { return undef_v32bfloat16( ); }
template<> inline v16bfloat16 undefined( ) { return undef_v16bfloat16( ); }
template<> inline v64float    undefined( ) { return undef_v64float( ); }
template<> inline v32float    undefined( ) { return undef_v32float( ); }
template<> inline v16float    undefined( ) { return undef_v16float( ); }
// Floating point accumulators
template<> inline v64accfloat undefined( ) { return undef_v64accfloat( ); }
template<> inline v32accfloat undefined( ) { return undef_v32accfloat( ); }
template<> inline v16accfloat undefined( ) { return undef_v16accfloat( ); }

// Explicit specialisations of extract<Result>( vec, idx ).
// NOTE(review): the primary template is not declared in the visible part of
// this file - presumably it comes from an earlier header; confirm.
//
// Sub-vector forms: `idx` selects which part of `vec` is returned.
template<> inline v32int16 extract( v64int16 v, int idx ) { return extract_v32int16( v, idx ); }
template<> inline v16acc32 extract( v32acc32 v, int idx ) { return extract_v16acc32( v, idx ); }
template<> inline v16acc32 extract( v64acc32 v, int idx ) { return extract_v16acc32( v, idx ); }
// Same-size case: the input is returned unchanged and `idx` is ignored.
template<> inline v32acc32 extract( v32acc32 v, int idx ) { return v; }
template<> inline v32acc32 extract( v64acc32 v, int idx ) { return extract_v32acc32( v, idx ); }
template<> inline v16accfloat extract( v32accfloat v, int idx ) { return extract_v16accfloat( v, idx ); }
template<> inline v16accfloat extract( v64accfloat v, int idx ) { return extract_v16accfloat( v, idx ); }
template<> inline v32accfloat extract( v64accfloat v, int idx ) { return extract_v32accfloat( v, idx ); }
// Scalar forms: idx / 16 selects the 16-lane part and idx & 15 the lane in it.
template<> inline int extract( v32int32 v, int idx ) { return extract_elem( extract_v16int32( v, idx / 16 ), idx & 15 ); }
template<> inline int extract( v16acc32 v, int idx ) { return extract_elem(( v16int32 ) v, idx ); }
template<> inline int extract( v32acc32 v, int idx ) { return extract_elem(( v16int32 ) extract_v16acc32( v, idx / 16 ), idx & 15 ); }
template<> inline int extract( v64acc32 v, int idx ) { return extract_elem(( v16int32 ) extract_v16acc32( v, idx / 16 ), idx & 15 ); }
template<> inline float extract( v16accfloat v, int idx ) { return extract_elem(( v16float ) v, idx ); }
template<> inline float extract( v32accfloat v, int idx ) { return extract_elem(( v16float ) extract_v16accfloat( v, idx / 16 ), idx & 15 ); }
template<> inline float extract( v64accfloat v, int idx ) { return extract_elem(( v16float ) extract_v16accfloat( v, idx / 16 ), idx & 15 ); }


// Returns part `idx` of `vec` when `vec` is viewed as `parts` equal pieces.
// With parts == 1 the whole vector is returned and `idx` is ignored; otherwise
// the call is forwarded to extract<Te>. Te defaults to SubVector_t<Tv, parts>.
template<unsigned parts, typename Tv, typename Te=SubVector_t<Tv, parts>>
inline Te extract_w( Tv vec, int idx ) {
    if constexpr( parts != 1 ) {
        return extract<Te>( vec, idx );
    } else {
        return vec;
    }
}

// convert<To>( in, shift, sign ): converts `in` to type To.
// The generic version relies on an implicit conversion from Ti to To and
// ignores `shift` and `sign`. `sign` defaults to true only when both the input
// and the output type are signed according to Signed_v.
template<typename To, typename Ti>
inline To convert( Ti in, int shift=0, bool sign=Signed_v<Ti>&&Signed_v<To> ) {
    return in;
}
// Accumulator -> vector. The integer 16-bit forms pass `shift` and `sign` on;
// the remaining forms in this group ignore both.
template<> inline v32int16 convert( v32acc32 in, int shift, bool sign ) { return to_v32int16( in, shift, sign ); }
template<> inline v64int16 convert( v64acc32 in, int shift, bool sign ) { return to_v64int16( in, shift, sign ); }
template<> inline v32int32 convert( v32acc32 in, int shift, bool sign ) { return ( v32int32 )in; }
#if __AIE_ARCH__ > 21
template<> inline v32float8   convert( v32accfloat in, int shift, bool sign ) { return to_v32float8( in, 0 ); }
template<> inline v64float8   convert( v64accfloat in, int shift, bool sign ) { return to_v64float8( in, 0 ); }
template<> inline v32float16  convert( v32accfloat in, int shift, bool sign ) { return to_v32float16( in ); }
template<> inline v64float16  convert( v64accfloat in, int shift, bool sign ) { return to_v64float16( in ); }
#endif
template<> inline v32bfloat16 convert( v32accfloat in, int shift, bool sign ) { return to_v32bfloat16( in ); }
template<> inline v64bfloat16 convert( v64accfloat in, int shift, bool sign ) { return to_v64bfloat16( in ); }
template<> inline v32float    convert( v32accfloat in, int shift, bool sign ) { return ( v32float )in; }
// v32float -> 16-bit float vectors, going through the v32accfloat view.
#if __AIE_ARCH__ > 21
template<> inline v32float16  convert( v32float    in, int shift, bool sign ) { return to_v32float16(( v32accfloat ) in ); }
#endif
template<> inline v32bfloat16 convert( v32float    in, int shift, bool sign ) { return to_v32bfloat16(( v32accfloat ) in ); }

// Vector -> accumulator (and 16-bit float -> v32float). Only the integer
// forms use `shift` and `sign`.
template<> inline v32acc32 convert( v32int16 in, int shift, bool sign ) { return to_v32acc32( in, shift, sign ); }
template<> inline v64acc32 convert( v64int16 in, int shift, bool sign ) { return to_v64acc32( in, shift, sign ); }
template<> inline v32accfloat convert( v32bfloat16 in, int shift, bool sign ) { return to_v32accfloat( in ); }
template<> inline v64accfloat convert( v64bfloat16 in, int shift, bool sign ) { return to_v64accfloat( in ); }
template<> inline v32accfloat convert( v32float16  in, int shift, bool sign ) { return to_v32accfloat( in ); }
template<> inline v64accfloat convert( v64float16  in, int shift, bool sign ) { return to_v64accfloat( in ); }
template<> inline v32accfloat convert( v32float    in, int shift, bool sign ) { return ( v32accfloat )in; }
template<> inline v32float convert( v32float16  in, int shift, bool sign ) { return ( v32float ) to_v32accfloat( in ); }
template<> inline v32float convert( v32bfloat16 in, int shift, bool sign ) { return ( v32float ) to_v32accfloat( in ); }
// v32float -> integer vectors, provided only when __AIE_ARCH__ >= 40.
#if __AIE_ARCH__ >= 40
template<> inline v32int8  convert( v32float in, int shift, bool sign ) { return to_v32int8( in, shift, sign ); }
template<> inline v32int16 convert( v32float in, int shift, bool sign ) { return to_v32int16( in, shift, sign ); }
template<> inline v32int32 convert( v32float in, int shift, bool sign ) { return to_v32int32( in, shift, sign ); }
#endif


template<typename Tr, typename Tm>
inline Tr load( Tm * ptr, int shift=0, bool sign=Signed_v<Tm> ) {
    using Tp = AccessType_t<Tm>;
    constexpr unsigned words = sizeof( Tm ) / sizeof( Tp );
    using Tc = SubVector_t<Tr, words>;
    Tp * p = ( Tp* ) ptr;
    Tr reg = undefined<Tr>( );
    if constexpr( words == 1 ) {
        reg = convert<Tc>( *p, shift, sign );
    } else {
        #pragma unroll
        for ( unsigned i = 0; i < words; i++ ) {
            reg = insert( reg, i, convert<Tc>( p[i], shift, sign ));
        }
    }
    return reg;
}


template<typename Tm, typename Tr>
inline void store( Tm * ptr, Tr reg, int shift=0, bool sign=Signed_v<Tm> ) {
    using Tp = AccessType_t<Tm>;
    constexpr unsigned words = sizeof( Tm ) / sizeof( Tp );
    Tp * p = ( Tp* ) ptr;
    #pragma unroll
    for ( unsigned i = 0; i < words; i++ ) {
        p[i] = convert<Tp>( extract_w<words>( reg, i ), shift, sign );
    }
}


// a - b for v32float16, computed as a + ( -b ) in v32float.
// `b` is negated by XOR-ing every 16-bit lane with 0x8000; both operands are
// then converted to v32float, added, and the sum is converted back.
inline v32float16 sub( v32float16 a, v32float16 b ) {
    const auto sign_mask       = broadcast_to_v32int16( 0x8000 );
    const v32float16 negated_b = ( v32float16 ) bxor(( v32int16 ) b, sign_mask );
    const auto sum = add( convert<v32float>( a ), convert<v32float>( negated_b ));
    return convert<v32float16>( sum );
}


// Register file selector passed as a template argument to
// locate_in_register. AIE_RegFile_default leaves the choice to the callee.
enum AIE_RegFile {
    AIE_RegFile_default = 0,
    AIE_RegFile_R       = 1,
    AIE_RegFile_P       = 2,
    AIE_RegFile_M       = 3,
    AIE_RegFile_DC      = 4,
    AIE_RegFile_DJ      = 5,
    AIE_RegFile_DN      = 6,
    AIE_RegFile_Vector  = 7,
    AIE_RegFile_Accum   = 8,
};



/*
 * Copies val through a temporary that carries a chess register annotation and returns the copy.
 *
 *   reg - register index inside the register group chosen from T. The default 999 uses
 *         __aie_register_keep( ) without naming a register. Indices beyond the individually
 *         listed registers fall back to the group names X ( reg >= 16 ), Y ( reg >= 8 ),
 *         DM ( reg >= 64 ) or R ( reg >= 32 ); index values that match none of the
 *         if constexpr lines of a branch leave val untouched.
 *   rf  - register file selector; it is checked by the static_assert of each branch and
 *         routes v32acc32 / v32accfloat to the dm* branch when it is AIE_RegFile_Accum
 *         ( otherwise they take the y* branch ).
 *   T   - type of val; selects the branch and therefore the register names.
 *
 * Without __chess__ the whole body is compiled out and val is returned unchanged.
 * Presumably the annotation makes the compiler keep the value in the named register - the
 * exact semantics of __aie_register / __aie_copy are defined by the compiler, not here.
 */
template<unsigned reg=999, AIE_RegFile rf=AIE_RegFile_default, typename T>
ALWAYS_INLINE T locate_in_register( T val ) {
  #ifdef __chess__
   #ifdef __AIE_API_AIE__HPP__
    // AIE API vector / accumulator wrappers: recurse on the native value and re-wrap the result.
    if constexpr( aie::detail::is_vector_v<T> || aie::detail::is_accum_v<T> ) {
        val = T( locate_in_register<reg, rf>( val.to_native( )));
    } else
   #endif
    if constexpr( reg == 999 ) {
        // No register index given: only __aie_register_keep( ) is applied.
        auto __aie_register_keep( ) tmp = val; val = __aie_copy( tmp );
    } else if constexpr( std::is_same_v<T, v128int4> ||
                         std::is_same_v<T, v64int8>  ||
                         std::is_same_v<T, v32int16> ||
                         std::is_same_v<T, v16int32> ||
                         std::is_same_v<T, v64uint8>  ||
                         std::is_same_v<T, v32uint16> ||
                         std::is_same_v<T, v16uint32> ||
                    #if __AIE_ARCH__ > 21
                         std::is_same_v<T, v64float8>  ||
                         std::is_same_v<T, v32float16> ||
                    #endif
                         std::is_same_v<T, v32bfloat16> ||
                         std::is_same_v<T, v16float> ) {
        // These vector types use x0..x11 ( plus x12..x15 for __AIE_ARCH__ >= 40 ) or the group X.
        // For __AIE_ARCH__ < 40, reg 12..15 matches no line below and val stays as it is.
        static_assert( rf==AIE_RegFile_default || rf==AIE_RegFile_Vector, "locate_in_register not yet implemented for this type and register file" );
        if constexpr( reg ==  0 ) { auto __aie_register( x0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  1 ) { auto __aie_register( x1  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  2 ) { auto __aie_register( x2  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  3 ) { auto __aie_register( x3  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  4 ) { auto __aie_register( x4  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  5 ) { auto __aie_register( x5  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  6 ) { auto __aie_register( x6  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  7 ) { auto __aie_register( x7  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  8 ) { auto __aie_register( x8  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  9 ) { auto __aie_register( x9  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 10 ) { auto __aie_register( x10 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 11 ) { auto __aie_register( x11 ) tmp = val; val = __aie_copy( tmp ); }
      #if __AIE_ARCH__ >= 40
        if constexpr( reg == 12 ) { auto __aie_register( x12 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 13 ) { auto __aie_register( x13 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 14 ) { auto __aie_register( x14 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 15 ) { auto __aie_register( x15 ) tmp = val; val = __aie_copy( tmp ); }
      #endif
        if constexpr( reg >= 16 ) { auto __aie_register( X   ) tmp = val; val = __aie_copy( tmp ); }
    } else if constexpr( std::is_same_v<T, v256int4> ||
                         std::is_same_v<T, v128int8> ||
                         std::is_same_v<T, v64int16> ||
                         std::is_same_v<T, v32int32> ||
                         std::is_same_v<T, v128uint8> ||
                         std::is_same_v<T, v64uint16> ||
                         std::is_same_v<T, v32uint32> ||
                       ( std::is_same_v<T, v32acc32> && rf!=AIE_RegFile_Accum ) ||
                         std::is_same_v<T, v16acc64> ||
                    #if __AIE_ARCH__ > 21
                         std::is_same_v<T, v128float8> ||
                         std::is_same_v<T, v64float16> ||
                    #endif
                         std::is_same_v<T, v64bfloat16> ||
                         std::is_same_v<T, v32float> ||
                       ( std::is_same_v<T, v32accfloat> && rf!=AIE_RegFile_Accum )) {
        // These types use y0..y5 ( plus y6, y7 for __AIE_ARCH__ >= 40 ) or the group Y.
        // v32acc32 / v32accfloat only land here when rf is not AIE_RegFile_Accum.
        static_assert( rf==AIE_RegFile_default || rf==AIE_RegFile_Vector, "locate_in_register not yet implemented for this type and register file" );
        if constexpr( reg ==  0 ) { auto __aie_register( y0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  1 ) { auto __aie_register( y1  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  2 ) { auto __aie_register( y2  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  3 ) { auto __aie_register( y3  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  4 ) { auto __aie_register( y4  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  5 ) { auto __aie_register( y5  ) tmp = val; val = __aie_copy( tmp ); }
      #if __AIE_ARCH__ >= 40
        if constexpr( reg ==  6 ) { auto __aie_register( y6  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  7 ) { auto __aie_register( y7  ) tmp = val; val = __aie_copy( tmp ); }
      #endif
        if constexpr( reg >=  8 ) { auto __aie_register( Y   ) tmp = val; val = __aie_copy( tmp ); }
    } else if constexpr( std::is_same_v<T, v32acc32> || std::is_same_v<T, v64acc32> || std::is_same_v<T, v32accfloat> || std::is_same_v<T, v64accfloat> ) {
        // Accumulator types: reg 0..31 -> dma0..dma31, reg 32..63 -> dmb0..dmb31, reg >= 64 -> group DM.
        // NOTE(review): the #if below is only closed by the #endif after the uint5_t branch. For
        // __AIE_ARCH__ < 40 this branch is therefore empty and the uint5_t branch does not exist
        // ( uint5_t then ends in the final chess_error ) - confirm that this is intended.
      #if __AIE_ARCH__ >= 40
        static_assert( rf==AIE_RegFile_default || rf==AIE_RegFile_Accum, "locate_in_register not yet implemented for this type and register file" );
        if constexpr( reg ==  0 ) { auto __aie_register( dma0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  1 ) { auto __aie_register( dma1  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  2 ) { auto __aie_register( dma2  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  3 ) { auto __aie_register( dma3  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  4 ) { auto __aie_register( dma4  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  5 ) { auto __aie_register( dma5  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  6 ) { auto __aie_register( dma6  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  7 ) { auto __aie_register( dma7  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  8 ) { auto __aie_register( dma8  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  9 ) { auto __aie_register( dma9  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 10 ) { auto __aie_register( dma10 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 11 ) { auto __aie_register( dma11 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 12 ) { auto __aie_register( dma12 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 13 ) { auto __aie_register( dma13 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 14 ) { auto __aie_register( dma14 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 15 ) { auto __aie_register( dma15 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 16 ) { auto __aie_register( dma16 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 17 ) { auto __aie_register( dma17 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 18 ) { auto __aie_register( dma18 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 19 ) { auto __aie_register( dma19 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 20 ) { auto __aie_register( dma20 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 21 ) { auto __aie_register( dma21 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 22 ) { auto __aie_register( dma22 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 23 ) { auto __aie_register( dma23 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 24 ) { auto __aie_register( dma24 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 25 ) { auto __aie_register( dma25 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 26 ) { auto __aie_register( dma26 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 27 ) { auto __aie_register( dma27 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 28 ) { auto __aie_register( dma28 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 29 ) { auto __aie_register( dma29 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 30 ) { auto __aie_register( dma30 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 31 ) { auto __aie_register( dma31 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 32 ) { auto __aie_register( dmb0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 33 ) { auto __aie_register( dmb1  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 34 ) { auto __aie_register( dmb2  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 35 ) { auto __aie_register( dmb3  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 36 ) { auto __aie_register( dmb4  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 37 ) { auto __aie_register( dmb5  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 38 ) { auto __aie_register( dmb6  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 39 ) { auto __aie_register( dmb7  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 40 ) { auto __aie_register( dmb8  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 41 ) { auto __aie_register( dmb9  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 42 ) { auto __aie_register( dmb10 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 43 ) { auto __aie_register( dmb11 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 44 ) { auto __aie_register( dmb12 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 45 ) { auto __aie_register( dmb13 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 46 ) { auto __aie_register( dmb14 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 47 ) { auto __aie_register( dmb15 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 48 ) { auto __aie_register( dmb16 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 49 ) { auto __aie_register( dmb17 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 50 ) { auto __aie_register( dmb18 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 51 ) { auto __aie_register( dmb19 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 52 ) { auto __aie_register( dmb20 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 53 ) { auto __aie_register( dmb21 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 54 ) { auto __aie_register( dmb22 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 55 ) { auto __aie_register( dmb23 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 56 ) { auto __aie_register( dmb24 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 57 ) { auto __aie_register( dmb25 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 58 ) { auto __aie_register( dmb26 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 59 ) { auto __aie_register( dmb27 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 60 ) { auto __aie_register( dmb28 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 61 ) { auto __aie_register( dmb29 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 62 ) { auto __aie_register( dmb30 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 63 ) { auto __aie_register( dmb31 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg >= 64 ) { auto __aie_register( DM    ) tmp = val; val = __aie_copy( tmp ); }

    } else if constexpr( std::is_same_v<T, uint5_t> ) {
        // uint5_t uses i0..i7; other reg values leave val untouched.
        static_assert( rf==AIE_RegFile_default, "locate_in_register not yet implemented for this type and register file" );
        if constexpr( reg ==  0 ) { auto __aie_register( i0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  1 ) { auto __aie_register( i1  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  2 ) { auto __aie_register( i2  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  3 ) { auto __aie_register( i3  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  4 ) { auto __aie_register( i4  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  5 ) { auto __aie_register( i5  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  6 ) { auto __aie_register( i6  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  7 ) { auto __aie_register( i7  ) tmp = val; val = __aie_copy( tmp ); }
      #endif

    } else if constexpr( std::is_same_v<T, addr_t> ) {
        // addr_t uses dc0..dc7; other reg values leave val untouched.
        static_assert( rf==AIE_RegFile_default, "locate_in_register not yet implemented for this type and register file" );
        if constexpr( reg ==  0 ) { auto __aie_register( dc0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  1 ) { auto __aie_register( dc1  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  2 ) { auto __aie_register( dc2  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  3 ) { auto __aie_register( dc3  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  4 ) { auto __aie_register( dc4  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  5 ) { auto __aie_register( dc5  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  6 ) { auto __aie_register( dc6  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  7 ) { auto __aie_register( dc7  ) tmp = val; val = __aie_copy( tmp ); }

    } else if constexpr( std::is_same_v<T, int> || std::is_same_v<T, unsigned> || std::is_same_v<T, bool> ) {
        // int / unsigned / bool use r0..r31 or the group R.
        static_assert( rf==AIE_RegFile_default || rf==AIE_RegFile_R, "locate_in_register not yet implemented for this type and register file" );
        if constexpr( reg ==  0 ) { auto __aie_register( r0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  1 ) { auto __aie_register( r1  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  2 ) { auto __aie_register( r2  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  3 ) { auto __aie_register( r3  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  4 ) { auto __aie_register( r4  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  5 ) { auto __aie_register( r5  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  6 ) { auto __aie_register( r6  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  7 ) { auto __aie_register( r7  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  8 ) { auto __aie_register( r8  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  9 ) { auto __aie_register( r9  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 10 ) { auto __aie_register( r10 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 11 ) { auto __aie_register( r11 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 12 ) { auto __aie_register( r12 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 13 ) { auto __aie_register( r13 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 14 ) { auto __aie_register( r14 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 15 ) { auto __aie_register( r15 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 16 ) { auto __aie_register( r16 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 17 ) { auto __aie_register( r17 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 18 ) { auto __aie_register( r18 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 19 ) { auto __aie_register( r19 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 20 ) { auto __aie_register( r20 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 21 ) { auto __aie_register( r21 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 22 ) { auto __aie_register( r22 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 23 ) { auto __aie_register( r23 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 24 ) { auto __aie_register( r24 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 25 ) { auto __aie_register( r25 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 26 ) { auto __aie_register( r26 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 27 ) { auto __aie_register( r27 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 28 ) { auto __aie_register( r28 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 29 ) { auto __aie_register( r29 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 30 ) { auto __aie_register( r30 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg == 31 ) { auto __aie_register( r31 ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg >= 32 ) { auto __aie_register( R   ) tmp = val; val = __aie_copy( tmp ); }

      #if __AIE_ARCH__ >= 40
    } else if constexpr( std::is_same_v<T, m32x64acc32> || std::is_same_v<T, m32x64accfloat> ) {
        // m32x64acc32 / m32x64accfloat use em0 or em1; other reg values leave val untouched.
        static_assert( rf==AIE_RegFile_default || rf==AIE_RegFile_Accum, "locate_in_register not yet implemented for this type and register file" );
        if constexpr( reg ==  0 ) { auto __aie_register( em0  ) tmp = val; val = __aie_copy( tmp ); }
        if constexpr( reg ==  1 ) { auto __aie_register( em1  ) tmp = val; val = __aie_copy( tmp ); }
      #endif

    } else {
        // Any other type with an explicit register index is rejected.
        chess_error( "locate_in_register not yet implemented for this type" );
    }
  #endif
    return val;
}


// Implementation detail of the array overload of locate_in_register:
// for every index I of the given sequence, arr[I] is replaced by
// locate_in_register<reg_start+I>( arr[I] ). The sequence argument only carries the indices.
template<typename T, unsigned size, unsigned reg_start, unsigned ... Indices>
ALWAYS_INLINE void locate_in_register_helper(
        T ( &arr )[size],
        std::integer_sequence<unsigned, Indices...> const & )
{
    (( arr[Indices] = locate_in_register<reg_start + Indices>( arr[Indices] )), ... );
}

// Array overload: applies locate_in_register to every element of arr in place,
// element i being handled with register index reg_start+i.
template<unsigned reg_start=0, unsigned size, typename T>
ALWAYS_INLINE void locate_in_register( T ( &arr )[size] ) {
    using element_indices = std::make_integer_sequence<unsigned, size>;
    locate_in_register_helper<T, size, reg_start>( arr, element_indices{} );
}


#ifndef __AIE_API_DETAIL_UTILS__HPP__
// Compile-time description of one iteration of an unrolled loop:
// Start and End are reported by min( ) and max( ), It is the current index.
// An object converts implicitly to T, yielding the current index.
template <typename T, T Start, T End, T It>
struct iteration_dim
{
    constexpr T min( )     const { return Start; }
    constexpr T max( )     const { return End; }
    constexpr T current( ) const { return It; }

    constexpr operator T( ) const { return It; }
};

// One step of the compile-time loop behind unroll_for.
// execute( fn ) calls fn for the index It ( if It has not reached End yet ) and then recurses
// with It + Step. fn is called with an iteration_dim object when it accepts one, otherwise
// without arguments.
template <typename T, T Start, T End, T It, T Step>
struct unroll_for_helper
{
    static_assert( Step != 0, "0 is not a valid step" );

    template <typename Fn>
    ALWAYS_INLINE
    static void execute( Fn && fn )
    {
        constexpr bool ascending_in_range  = Step > 0 && It < End;
        constexpr bool descending_in_range = Step < 0 && It > End;

        if constexpr ( ascending_in_range || descending_in_range ) {
            constexpr iteration_dim<T, Start, End, It> index{};
            constexpr bool takes_index = std::is_invocable_v<Fn, decltype( index )>;

            static_assert( takes_index || std::is_invocable_v<Fn> );

            if constexpr ( takes_index ) {
                fn( index );
            } else {
                fn( );
            }

            constexpr T following = It + Step;

            // The next index must move in the direction of Step, otherwise the range wrapped around
            static_assert(( Step > 0 ) || ( following < It ), "The unrolled loop range wraps around" );
            static_assert(( Step < 0 ) || ( following > It ), "The unrolled loop range wraps around" );

            unroll_for_helper<T, Start, End, following, Step>::execute( std::forward<Fn>( fn ));
        }
    }
};

/*
 * Calls fn once for every index of the sequence Start, Start + Step, ... that lies before End ( in the direction
 * of Step ). The sequence is expanded at compile time; each call receives its index, or no argument if fn does not
 * accept one.
 */
template <typename T, T Start, T End, T Step = 1, typename Fn>
ALWAYS_INLINE
void unroll_for( Fn &&fn )
{
    using first_iteration = unroll_for_helper<T, Start, End, Start, Step>;
    first_iteration::execute( std::forward<Fn>( fn ));
}

/*
 * Calls fn Times times, expanded at compile time: the indices of the sequence [0, Times ) are passed to the
 * successive calls, one index per call.
 */
template <unsigned Times, typename Fn>
ALWAYS_INLINE
void unroll_times( Fn &&fn )
{
    constexpr unsigned first = 0;
    constexpr unsigned step  = 1;
    unroll_for<unsigned, first, Times, step>( std::forward<Fn>( fn ));
}
#else
using aie::unroll_times;
#endif


// Returns its argument unchanged; usable in constant expressions.
constexpr unsigned constexpr_unsigned_tester( unsigned a )
{
    return a;
}

//template<typename T> struct pipelined_loop_is_peeled { static constexpr bool value = std::is_base_of_v<struct iteration_dim, T> || std::is_const_v<T> ); };
//template<typename T> struct pipelined_loop_is_peeled { static constexpr bool value = std::is_invocable_v<decltype( constexpr_unsigned_tester ), T>; };// || std::is_const_v<T> ); };
// Trait on the type of a loop index: value is false only when T is exactly unsigned and true for
// every other type ( including cv-qualified or reference variants of unsigned ).
// Presumably meant for loop bodies to tell an unrolled, compile-time index from the plain unsigned
// counter of the main loop - confirm against the callers.
template<typename T>
struct pipelined_loop_is_peeled {
    static constexpr bool value = !std::is_same_v<unsigned, T>;
};

// Shorthand for pipelined_loop_is_peeled<T>::value
template<typename T>
constexpr bool pipelined_loop_is_peeled_v = pipelined_loop_is_peeled<T>::value;


/*
 * Runs body( o ) for o = 0 .. count-1, asking the chess compiler to software-pipeline the loop.
 *
 *   lr       - handed to the min_loop_count attribute; lr > 1 selects the variant with
 *              prepare_for_pipelining and peeling, otherwise a plain loop with
 *              min_loop_count( lr ) is emitted
 *   peel     - number of iterations that are unrolled outside of the loop
 *   rot_peel - how many of the peeled iterations are placed behind the loop:
 *              peel_back = min( rot_peel, peel ), peel_front = peel - peel_back
 *   count    - number of iterations
 *   body     - callable taking the iteration index. Inside the loop the index is an unsigned;
 *              in the front peel it is the compile-time index object produced by unroll_times.
 *
 * When the compiler can prove count == 1 the body is called once; when it can prove
 * count <= peel the loop is fully unrolled.
 *
 * NOTE(review): if count is not manifest, the peeled variant assumes count > peel
 * ( count - peel_back is computed in unsigned arithmetic ) - confirm that all callers guarantee this.
 */
template<unsigned lr, unsigned peel=0, unsigned rot_peel=0, typename Fn>
[[clang::always_inline]] void pipelined_loop( unsigned count, Fn &&body ) {
    if ( chess_manifest( count == 1 )) {
        body( 0 );
    } else if ( chess_manifest( count <= peel )) {
        // Fully unrolled: peel slots are generated, but only the first count of them may run.
        // ( Previously all peel slots ran whenever count > 0, i.e. too many iterations for count < peel. )
        unroll_times<peel>( [&]( auto o ) __attribute__(( always_inline )) { if ( o < count ) body( o ); } );
    } else {
        //static_assert( peel + 1 < lr, "Peeling not feasible" );
        if constexpr( lr > 1 ) {
            //chess_separator_scheduler_local();
            constexpr unsigned peel_front = std::max( rot_peel, peel ) - rot_peel;
            constexpr unsigned peel_back  = std::min( rot_peel, peel );
            if constexpr( peel_front > 0 ) {
                unroll_times<peel_front>( body );
                chess_separator( );
            }
            [[using chess: prepare_for_pipelining, min_loop_count( lr-peel )]]
            for ( unsigned o=peel_front; o<count-peel_back; o++ ) {
                body( o );
            }
            if constexpr( peel_back > 0 ) {
                chess_separator( );
                // the back peel continues where the loop stopped: indices count-peel_back .. count-1
                unroll_times<peel_back>( [&]( auto o ) __attribute__(( always_inline )) { body( count - peel_back + o ); } );
            }
        } else {
            //#pragma chess min_loop_count(lr)
            [[using chess: min_loop_count( lr )]]
            for ( unsigned o=0; o<count; o++ ) {
                body( o );
            }
        }
    }
}


/*
 * Same as pipelined_loop, with the additional loop attribute pipeline_adjust_preamble( rot )
 * on the pipelined loop.
 *
 *   lr       - handed to the min_loop_count attribute; lr > 1 selects the variant with
 *              prepare_for_pipelining and peeling, otherwise a plain loop with
 *              min_loop_count( lr ) is emitted ( rot is not used there )
 *   rot      - argument of pipeline_adjust_preamble
 *   peel     - number of iterations that are unrolled outside of the loop
 *   rot_peel - how many of the peeled iterations are placed behind the loop:
 *              peel_back = min( rot_peel, peel ), peel_front = peel - peel_back
 *   count    - number of iterations
 *   body     - callable taking the iteration index ( unsigned inside the loop, the compile-time
 *              index object of unroll_times in the front peel )
 *
 * NOTE(review): if count is not manifest, the peeled variant assumes count > peel
 * ( count - peel_back is computed in unsigned arithmetic ) - confirm that all callers guarantee this.
 */
template<unsigned lr, int rot=0, unsigned peel=0, unsigned rot_peel=0, typename Fn>
[[clang::always_inline]] void pipelined_loop_rotate( unsigned count, Fn &&body ) {
    if ( chess_manifest( count == 1 )) {
        body( 0 );
    } else if ( chess_manifest( count <= peel )) {
        // Fully unrolled: peel slots are generated, but only the first count of them may run.
        // ( Previously all peel slots ran whenever count > 0, i.e. too many iterations for count < peel. )
        unroll_times<peel>( [&]( auto o ) __attribute__(( always_inline )) { if ( o < count ) body( o ); } );
    } else {
        //static_assert( peel + 1 < lr, "Peeling not feasible" );
        if constexpr( lr > 1 ) {
            //chess_separator_scheduler_local();
            constexpr unsigned peel_front = std::max( rot_peel, peel ) - rot_peel;
            constexpr unsigned peel_back  = std::min( rot_peel, peel );
            if constexpr( peel_front > 0 ) {
                unroll_times<peel_front>( body );
                chess_separator( );
            }
            [[using chess: prepare_for_pipelining, min_loop_count( lr-peel ), pipeline_adjust_preamble( rot )]]
            for ( unsigned o=peel_front; o<count-peel_back; o++ ) {
                body( o );
            }
            if constexpr( peel_back > 0 ) {
                chess_separator( );
                // the back peel continues where the loop stopped: indices count-peel_back .. count-1
                unroll_times<peel_back>( [&]( auto o ) __attribute__(( always_inline )) { body( count - peel_back + o ); } );
            }
        } else {
            //#pragma chess min_loop_count(lr)
            [[using chess: min_loop_count( lr )]]
            for ( unsigned o=0; o<count; o++ ) {
                body( o );
            }
        }
    }
}

// Phase tags of a peeled loop. pipelined_loop_innerpeel and peeled_loop pass one of them to the
// loop body, wrapped in std::integral_constant<int, ...>, so that the body can tell whether it is
// called from the front peel, the main loop or the back peel.
#define LOOP_PEEL_FRONT 0
#define LOOP_PEEL_MAIN  1
#define LOOP_PEEL_BACK  2

/*
 * Pipelined loop over count iterations that is split into a front peel, a main loop and a back peel.
 *
 * The body is called as body( o, base, phase, inner_front, inner_back ):
 *   o           - index relative to the start of the phase ( a compile-time index object in the
 *                 peels, an int in the main loop )
 *   base        - index of the first iteration of the phase: 0 for the front peel, peel_front for
 *                 the main loop, count - peel_back for the back peel
 *   phase       - std::integral_constant<int, LOOP_PEEL_FRONT / LOOP_PEEL_MAIN / LOOP_PEEL_BACK>
 *   inner_front - std::integral_constant<int, inner_peel_front> in the main loop, <int, 0> in the peels
 *   inner_back  - std::integral_constant<int, inner_peel_back> in the main loop, <int, 0> in the peels
 *
 *   lr    - used for min_loop_count( lr - ( peel_front + peel_back )); peel_front + peel_back
 *           must be smaller than lr
 *   count - total number of iterations, peeled iterations included
 *
 * Fixed against the previous revision:
 *   - the front peel is unrolled peel_front times ( it was unrolled peel_back times )
 *   - the back peel starts at count - peel_back ( it was count + peel_front, which lies behind the
 *     range covered by the main loop; the disabled reference code below uses count - peel_back )
 */
template<int lr, int peel_front=0, int peel_back=0, int inner_peel_front=0, int inner_peel_back=0, typename Fn>
[[clang::always_inline]] void pipelined_loop_innerpeel( int count, Fn &&body ) {
    static_assert( peel_front + peel_back < lr, "Peeling not feasible" );
#if 0
    if ( chess_manifest( count-( peel_front+peel_back )<3 )) { // unroll main 0..2 times if count is manifest ( this will result in smaller code )
        unroll_times<peel_front>( [&]( auto o ) __attribute__(( always_inline )) { body( o, 0,               std::integral_constant<int, LOOP_PEEL_FRONT>( ), std::integral_constant<int, 0>( ), std::integral_constant<int, 0>( )); } );
        if ( count-( peel_front+peel_back )>0 )                    /* main iter0 */    body( 0, peel_front,      std::integral_constant<int, LOOP_PEEL_MAIN>( ),  std::integral_constant<int, 0>( ), std::integral_constant<int, 0>( ));
        if ( count-( peel_front+peel_back )>1 )                    /* main iter1 */    body( 1, peel_front,      std::integral_constant<int, LOOP_PEEL_MAIN>( ),  std::integral_constant<int, 0>( ), std::integral_constant<int, 0>( ));
        unroll_times<peel_back >( [&]( auto o ) __attribute__(( always_inline )) { body( o, count-peel_back, std::integral_constant<int, LOOP_PEEL_BACK>( ),  std::integral_constant<int, 0>( ), std::integral_constant<int, 0>( )); } );
    } else
#endif
    {
        // front peel: iterations 0 .. peel_front-1
        if constexpr( peel_front > 0 ) {
            //printf("peel_front=%d\n", peel_front);
            unroll_times<peel_front>( [&]( auto o ) __attribute__(( always_inline )) { body( o, 0, std::integral_constant<int, LOOP_PEEL_FRONT>( ),  std::integral_constant<int, 0>( ), std::integral_constant<int, 0>( )); } );
            // chess_separator( );
        }

        // main loop: iterations peel_front .. count-peel_back-1, o is relative to peel_front
        // printf("loop=%d .. %d\n", peel_front, count-peel_back);
        [[using chess: prepare_for_pipelining, min_loop_count( lr-( peel_front+peel_back ))/*, pipeline_adjust_preamble( rot )*/]]
        for ( int o=0; o<count-peel_front-peel_back; o++ ) {
            body( o, peel_front, std::integral_constant<int, LOOP_PEEL_MAIN>( ), std::integral_constant<int, inner_peel_front>( ), std::integral_constant<int, inner_peel_back>( ));
        }

        // back peel: iterations count-peel_back .. count-1
        if constexpr( peel_back > 0 ) {
            // printf("peel_back=%d\n", peel_back);
            // chess_separator( );
            unroll_times<peel_back>( [&]( auto o ) __attribute__(( always_inline )) { body( o, count - peel_back, std::integral_constant<int, LOOP_PEEL_BACK>( ),  std::integral_constant<int, 0>( ), std::integral_constant<int, 0>( )); } );
        }
    }
}

/*
 * Loop over count iterations that is split into a front peel, a main loop and a back peel
 * ( the pipelining attributes of the main loop are currently commented out, so lr is only used
 * by the static_assert and rot is unused ).
 *
 * The body is called as body( phase, base, o ):
 *   phase - std::integral_constant<int, LOOP_PEEL_FRONT / LOOP_PEEL_MAIN / LOOP_PEEL_BACK>
 *   base  - 0 for the front peel, peel_front for the main loop, count - peel_back for the back peel
 *   o     - compile-time index object 0 .. n-1 in the peels; in the main loop an int running
 *           from peel_front to count-peel_back-1
 *
 *   count - total number of iterations, peeled iterations included
 *
 * Fixed against the previous revision:
 *   - the front peel is unrolled peel_front times ( it was unrolled peel_back times )
 *   - the back peel gets the base count - peel_back ( it was count + peel_front; the disabled
 *     reference code below uses count - peel_back )
 *
 * NOTE(review): in the main loop o is an absolute index while base is peel_front; the disabled
 * reference code passes a main loop index that starts at 0. Left unchanged - confirm which
 * convention the loop bodies expect.
 */
template<int lr, int rot=0, int peel_front=0, int peel_back=0, typename Fn>
[[clang::always_inline]] void peeled_loop( int count, Fn &&body ) {
    static_assert( peel_front + peel_back < lr, "Peeling not feasible" );
#if 0
    if ( chess_manifest( count-( peel_front+peel_back )<3 )) { // unroll main 0..2 times if count is manifest ( this will result in smaller code )
        unroll_times<peel_front>( [&]( auto o ) __attribute__(( always_inline )) { body( std::integral_constant<int, LOOP_PEEL_FRONT>( ), 0,               o ); } );
        if ( count-( peel_front+peel_back )>0 )                    /* main iter0 */    body( std::integral_constant<int, LOOP_PEEL_MAIN>( ),  peel_front,      0 );
        if ( count-( peel_front+peel_back )>1 )                    /* main iter1 */    body( std::integral_constant<int, LOOP_PEEL_MAIN>( ),  peel_front,      1 );
        unroll_times<peel_back >( [&]( auto o ) __attribute__(( always_inline )) { body( std::integral_constant<int, LOOP_PEEL_BACK>( ),  count-peel_back, o ); } );
    } else
#endif
    {
        // front peel: peel_front unrolled calls
        if constexpr( peel_front > 0 ) {
            //printf("peel_front=%d\n", peel_front);
            unroll_times<peel_front>( [&]( auto o ) __attribute__(( always_inline )) { body( std::integral_constant<int, LOOP_PEEL_FRONT>( ), 0, o ); } );
            // chess_separator( );
        }

        // printf("loop=%d .. %d\n", peel_front, count-peel_back);
        //[[using chess: prepare_for_pipelining, min_loop_count( lr-(peel_front+peel_back) ), pipeline_adjust_preamble( rot )]]
        for ( int o=peel_front; o<count-peel_back; o++ ) {
            body( std::integral_constant<int, LOOP_PEEL_MAIN>( ), peel_front, o );
        }

        // back peel: peel_back unrolled calls
        if constexpr( peel_back > 0 ) {
            // printf("peel_back=%d\n", peel_back);
            // chess_separator( );
            unroll_times<peel_back>( [&]( auto o ) __attribute__(( always_inline )) { body( std::integral_constant<int, LOOP_PEEL_BACK>( ), count - peel_back, o ); } );
        }
    }
}


/*
 * Variant of pipelined_loop whose loop additionally carries the chess attribute
 * allocate( R:<allocR> ).
 *
 * The attribute needs a literal, so one loop per supported value is generated with the ITER
 * macro and selected with if constexpr( allocR == i ): FOR_1_31 ( defined elsewhere, presumably
 * expanding ITER for 1..31 ) plus an explicit case for 32.
 * NOTE(review): for any other allocR no main loop is emitted at all - confirm that callers only
 * use the supported values.
 *
 *   allocR   - value used in allocate( R:... )
 *   lr       - handed to the min_loop_count attribute; lr > 1 selects the variant with
 *              prepare_for_pipelining and peeling
 *   peel     - number of iterations that are unrolled outside of the loop
 *   rot_peel - how many of the peeled iterations are placed behind the loop:
 *              peel_back = min( rot_peel, peel ), peel_front = peel - peel_back
 *   count    - number of iterations
 *   body     - callable taking the iteration index ( unsigned inside the loop, the compile-time
 *              index object of unroll_times in the front peel )
 *
 * NOTE(review): if count is not manifest, the peeled variant assumes count > peel
 * ( count - peel_back is computed in unsigned arithmetic ) - confirm that all callers guarantee this.
 */
template<unsigned allocR, unsigned lr, unsigned peel=0, unsigned rot_peel=0, typename Fn>
[[clang::always_inline]] void pipelined_loop_allocR( unsigned count, Fn &&body ) {
    if ( chess_manifest( count == 1 )) {
        body( 0 );
    } else if ( chess_manifest( count <= peel )) {
        // Fully unrolled: peel slots are generated, but only the first count of them may run.
        // ( Previously all peel slots ran whenever count > 0, i.e. too many iterations for count < peel. )
        unroll_times<peel>( [&]( auto o ) __attribute__(( always_inline )) { if ( o < count ) body( o ); } );
    } else {
        //static_assert( peel + 1 < lr, "Peeling not feasible" );
        if constexpr( lr > 1 ) {
            constexpr unsigned peel_front = std::max( rot_peel, peel ) - rot_peel;
            constexpr unsigned peel_back  = std::min( rot_peel, peel );
            if constexpr( peel_front > 0 ) {
                unroll_times<peel_front>( body );
                chess_separator( );
            }
            #define ITER( i ) if constexpr( allocR == i ) \
                [[using chess: prepare_for_pipelining, min_loop_count( lr-peel ), allocate( R:i )]] \
                for ( unsigned o=peel_front; o<count-peel_back; o++ ) body( o );
            FOR_1_31( ITER, SEP_EMPT )
            if constexpr( allocR == 32 )
                [[using chess: prepare_for_pipelining, min_loop_count( lr-peel ), allocate( R:32 )]]
                for ( unsigned o=peel_front; o<count-peel_back; o++ ) body( o );
            #undef ITER
            if constexpr( peel_back > 0 ) {
                chess_separator( );
                // the back peel continues where the loop stopped: indices count-peel_back .. count-1
                unroll_times<peel_back>( [&]( auto o ) __attribute__(( always_inline )) { body( count - peel_back + o ); } );
            }

        } else {
            #define ITER( i ) if constexpr( allocR == i ) \
                [[using chess: min_loop_count( lr ), allocate( R:i )]] \
                for ( unsigned o=0; o<count; o++ ) body( o );
            FOR_1_31( ITER, SEP_EMPT )
            if constexpr( allocR == 32 )
                [[using chess: min_loop_count( lr ), allocate( R:32 )]]
                for ( unsigned o=0; o<count; o++ ) body( o );
            #undef ITER
        }
    }
}

/* Unroll lambda helper */

#include <type_traits>

// Satisfied when an Fn object can be called with an unsigned index followed by a
// std::integral_constant<unsigned, 0>, i.e. when the callable accepts the additional
// compile-time unroll index that UnrolledLambda can pass as second argument.
template<typename Fn>
concept callable_with_extra_constant =
    requires( Fn callee, unsigned runtime_index, std::integral_constant<unsigned, 0> unroll_tag ) {
        callee( runtime_index, unroll_tag );
    };

// Callable wrapper that turns one call with index j into Factor unrolled calls of the wrapped
// callable, using the indices j * Factor + 0 .. j * Factor + Factor-1.
// If the wrapped callable satisfies callable_with_extra_constant it additionally receives the
// position inside the unrolled group as std::integral_constant<unsigned, position>.
template<typename Fn, unsigned Factor>
struct UnrolledLambda {

    // Stores a copy of the callable
    UnrolledLambda(const Fn &callee) : fn(callee) {}

    __aie_inline auto operator()(auto j) {
        if constexpr(!callable_with_extra_constant<Fn>)
        {
            // the callable only takes the index
            aie::unroll_times<Factor>([&](auto position) __aie_inline {
                fn(position + j * Factor);
            });
        } else {
            // the callable also takes the position inside the unrolled group
            aie::unroll_times<Factor>([&]<unsigned position>( std::integral_constant<unsigned,position> position_tag ) __aie_inline {
                fn(position + j * Factor, position_tag);
            });
        }
    }

    Fn fn;
};

/* Wraps fn in an UnrolledLambda<Fn, Unroll> (which stores a copy of fn) and returns it. */
template<unsigned Unroll, typename Fn>
__aie_inline auto unroll_fn( const Fn &fn )
{
    return UnrolledLambda<Fn, Unroll>( fn );
}

template<typename T>
inline T *add_2d_byte_rewrite( T *a, dims_2d_t &params, int rst ) {
    addr_t c1 = params.count1;
    T *r = ( T* )add_2d_byte(( char* )a-rst, params.inc2+rst, params.num1, c1, params.inc1+rst );
    params.count1 = c1;
    return r;
}

template<typename T>
inline T *add_3d_byte_rewrite( T *a, dims_3d_t &params, int rst ) {
    addr_t c1 = params.count1;
    addr_t c2 = params.count2;
    T *r = ( T* )add_3d_byte(( char* )a-rst, params.inc3+rst, params.num1, c1, params.inc1+rst, params.num2, c2, params.inc2+rst );
    params.count1 = c1;
    params.count2 = c2;
    return r;
}


template<typename T>
inline T *add_2d_ptr_rewrite( T *a, dims_2d_t &params, int rst ) {
    addr_t c1 = params.count1;
    T *r = add_2d_ptr( a-rst, params.inc2+rst, params.num1, c1, params.inc1+rst );
    params.count1 = c1;
    return r;
}

template<typename T>
inline T *add_3d_ptr_rewrite( T *a, dims_3d_t &params, int rst ) {
    addr_t c1 = params.count1;
    addr_t c2 = params.count2;
    T *r = add_3d_ptr( a-rst, params.inc3+rst, params.num1, c1, params.inc1+rst, params.num2, c2, params.inc2+rst );
    params.count1 = c1;
    params.count2 = c2;
    return r;
}


/* Convenience overload: unpacks the 3D descriptor `params` and forwards to the
   fifo_ld_pop_3d_byte overload that takes the individual fields.
   params.count1 and params.count2 are passed directly (no local copies). */
template<typename T>
inline aie_dm_resource_remove_t<T> fifo_ld_pop_3d_byte( T * &p, fifo_state_t &f, dims_3d_t &params ) {
    return fifo_ld_pop_3d_byte( p, f, params.inc3, params.num1, params.count1, params.inc1, params.num2, params.count2, params.inc2 );
}
/* Convenience overload: unpacks the 2D descriptor `params` and forwards to the
   fifo_ld_pop_2d_byte overload that takes the individual fields. */
template<typename T>
inline aie_dm_resource_remove_t<T> fifo_ld_pop_2d_byte( T * &p, fifo_state_t &f, dims_2d_t &params ) {
    return fifo_ld_pop_2d_byte( p, f, params.inc2, params.num1, params.count1, params.inc1 );
}


// 2D "popx" wrapper taking a dims_2d_t descriptor.
// Active branch (#if 1): forwards to the field-wise fifo_ld_popx_3d_byte overload, placing
// dims.inc2 in the outermost-increment slot and filling the second dimension with
// num2 = 0, inc2 = 0 and a scratch counter.
// Disabled branch (#else): alternative built from fifo_ld_fill() + fifo_ld_pop_2d_byte(),
// which does not use `step` / `mask`.
template<typename T>
inline aie_dm_resource_remove_t<T> fifo_ld_popx_2d_byte( T * &ptr, fifo_state_t &fifo, int step, int mask, dims_2d_t &dims ) {
  #if 1
    // Scratch counter for the unused second dimension.
    // NOTE(review): tmp is passed uninitialised — confirm the callee never reads it when num2 == 0.
    addr_t tmp;
    aie_dm_resource_remove_t<T> ret = fifo_ld_popx_3d_byte( ptr, fifo, step, mask, dims.inc2, dims.num1, dims.count1, dims.inc1, 0, tmp, 0 );
  #else
    fifo_ld_fill( ptr, fifo );
    aie_dm_resource_remove_t<T> ret = fifo_ld_pop_2d_byte( ptr, fifo, dims.inc2, dims.num1, dims.count1, dims.inc1 );
  #endif
    return ret;
}
// 3D "popx" wrapper taking a dims_3d_t descriptor.
// Active branch (#if 1): unpacks `dims` and forwards to the field-wise
// fifo_ld_popx_3d_byte overload together with `step` and `mask`.
// Disabled branch (#else): alternative built from fifo_ld_fill() + fifo_ld_pop_3d_byte(),
// which does not use `step` / `mask`.
template<typename T>
inline aie_dm_resource_remove_t<T> fifo_ld_popx_3d_byte( T * &ptr, fifo_state_t &fifo, int step, int mask, dims_3d_t &dims ) {
  #if 1
    aie_dm_resource_remove_t<T> ret = fifo_ld_popx_3d_byte( ptr, fifo, step, mask, dims.inc3, dims.num1, dims.count1, dims.inc1, dims.num2, dims.count2, dims.inc2 );
  #else
    fifo_ld_fill( ptr, fifo );
    aie_dm_resource_remove_t<T> ret = fifo_ld_pop_3d_byte( ptr, fifo, dims.inc3, dims.num1, dims.count1, dims.inc1, dims.num2, dims.count2, dims.inc2 );
  #endif
    return ret;
}

// Resets a FIFO state: the position is set to 0 and both data members are assigned
// chess_dont_care values, i.e. their previous contents are not preserved.
// NOTE(review): this function spells the property `no_debug`, while the dims_*_param
// structs in this file use `nodebug` — confirm which spelling the compiler expects.
inline void fifo_reset( fifo_state_t &f ) property( no_debug ) {
    f.pos = 0;
    f.fifo = chess_dont_care( sparse_fifo_t );
    f.extra = chess_dont_care( v32int32 );
}


class my_uint5_t {
  public:
    uint5_t val;
    my_uint5_t( uint5_t v ) : val( v ) {}
    my_uint5_t( int v ) : val( v ) {}
    inline auto operator=( uint5_t v ) { val = v; return *this; }
    inline uint5_t operator++( int ) {
        uint5_t r = val;
        val = val + 1;
        return r;
    }
    inline operator uint5_t( ) { return val; }
};


#endif //__ndl__



// Base lock id; may be overridden by defining LOCK_OFFSET before this point.
#ifndef LOCK_OFFSET
#define LOCK_OFFSET 48
#endif

// Lock ids expressed relative to LOCK_OFFSET.
// Each expansion is fully parenthesised so that the macros can be used inside larger
// expressions (e.g. `2 * INPUT_LOCK( 0 )` or `x - CASC_IN_LOCK`) without the
// surrounding operators binding to only part of the sum.
#define INPUT_LOCK( X )       (( LOCK_OFFSET )+( X )+8 )
#define WEIGHT_LOCK( X )      (( LOCK_OFFSET )+( X )+2 )
#define OUTPUT_LOCK( X )      (( LOCK_OFFSET )+( X )+4 )
#define PARAM_LOCK( X )       (( LOCK_OFFSET )+( X )+6 )
#define CASC_IN_LOCK        (( LOCK_OFFSET )+10 )
#define CASC_OUT_LOCK       (( LOCK_OFFSET )+11 )

// Lock state values.
#define EMPTY_LOCK 0
#define FULL_LOCK  1


// Fallback definition of NULL, used only when no header has defined it yet.
#ifndef NULL
#define NULL (( void* )0 )
#endif

// Returns a when a < b, otherwise b (both arguments must have the same type T).
template<typename T> inline T min( T a, T b ) {
    if ( a < b ) return a;
    return b;
}
// Returns a when a > b, otherwise b (both arguments must have the same type T).
template<typename T> inline T max( T a, T b ) {
    if ( a > b ) return a;
    return b;
}


// One bit per buffer; these are the masks tested against KERNEL_STREAM_CFG and
// KERNEL_SYNC_CFG by decode_kernel_cfg() / decode_stream_cfg().
#define KEY_IFM  1
#define KEY_WGHT 2
#define KEY_OFM  4
// Bit mask of KEY_* values; defaults to 0 (no bit set) unless defined by the build.
#ifndef KERNEL_STREAM_CFG
#define KERNEL_STREAM_CFG 0
#endif
// Bit mask of KEY_* values; defaults to 7 (= KEY_IFM | KEY_WGHT | KEY_OFM) when
// SUPER_KERNEL is defined and to 0 otherwise, unless defined by the build.
#ifndef KERNEL_SYNC_CFG
#ifdef SUPER_KERNEL
#define KERNEL_SYNC_CFG 7
#else
#define KERNEL_SYNC_CFG 0
#endif
#endif

// Per-buffer synchronisation selector, produced by decode_kernel_cfg() /
// decode_stream_cfg() and stored in KernelDataSyncConfig.
// NOTE(review): the names suggest external / internal / stream handling; the exact
// semantics are defined by the code that consumes these values — confirm there.
enum DataSyncConfig {
    DSC_EXTERN,
    DSC_INTERN,
    DSC_STREAM
};

// Identifies one of the three kernel buffers.
// The enumerator order matters: BID_to_KEY( x ) = 1 << x maps BID_IFM / BID_WGHT / BID_OFM
// (0 / 1 / 2) onto KEY_IFM / KEY_WGHT / KEY_OFM (1 / 2 / 4).
enum BufferID {
    BID_IFM,
    BID_WGHT,
    BID_OFM
};

// Converts a BufferID value into the matching KEY_* bit (1 << id).
#define BID_to_KEY( x ) ( 1 << ( x ))
// Returns the first lock id (index 0) of the lock group that belongs to `bid`:
// INPUT_LOCK for BID_IFM, WEIGHT_LOCK for BID_WGHT, OUTPUT_LOCK for anything else.
constexpr int get_lock_id( BufferID bid ) {
    switch ( bid ) {
        case BID_IFM:
            return INPUT_LOCK( 0 );
        case BID_WGHT:
            return WEIGHT_LOCK( 0 );
        default:
            return OUTPUT_LOCK( 0 );
    }
}

struct KernelDataSyncConfig {
    DataSyncConfig ifm;
    DataSyncConfig wght;
    DataSyncConfig ofm;
    constexpr KernelDataSyncConfig( DataSyncConfig a=DSC_EXTERN ) : ifm( a ), wght( a ), ofm( a ) { }
    constexpr KernelDataSyncConfig( DataSyncConfig a, DataSyncConfig w, DataSyncConfig o ) : ifm( a ), wght( w ), ofm( o ) { }
    constexpr bool operator==( const DataSyncConfig a ) const {
        return ( ifm == a ) && ( wght == a ) && ( ofm == a );
    }
    constexpr bool any( const DataSyncConfig a ) const {
        return ( ifm == a ) || ( wght == a ) || ( ofm == a );
    }
    constexpr DataSyncConfig operator[]( const BufferID bid ) const {
        if ( bid == BID_IFM  ) return ifm;
        if ( bid == BID_WGHT ) return wght;
        return ofm;
    }
};

// decode_kernel_cfg( key ) maps one KEY_* bit onto a DataSyncConfig using the
// KERNEL_STREAM_CFG and KERNEL_SYNC_CFG masks:
//   with __chess__:    stream bit set -> DSC_STREAM; else sync bit set -> DSC_INTERN; else DSC_EXTERN.
//   without __chess__: stream bit set -> DSC_INTERN (not DSC_STREAM); otherwise as above.
//                      The stream selection is exposed separately through
//                      decode_stream_cfg() / stream_cfg, and open_stream() / close_stream()
//                      are declared (their definitions are not in this part of the file).
#ifdef __chess__
constexpr DataSyncConfig decode_kernel_cfg( unsigned key ) { return ( KERNEL_STREAM_CFG & key ) ? DSC_STREAM : (( KERNEL_SYNC_CFG & key ) ? DSC_INTERN : DSC_EXTERN ); }
#else
constexpr DataSyncConfig decode_kernel_cfg( unsigned key ) { return ( KERNEL_STREAM_CFG & key ) ? DSC_INTERN : (( KERNEL_SYNC_CFG & key ) ? DSC_INTERN : DSC_EXTERN ); }
// DSC_STREAM when the stream bit for `key` is set, DSC_INTERN otherwise.
constexpr DataSyncConfig decode_stream_cfg( unsigned key ) { return ( KERNEL_STREAM_CFG & key ) ? DSC_STREAM : DSC_INTERN; }
constexpr KernelDataSyncConfig stream_cfg( decode_stream_cfg( KEY_IFM ), decode_stream_cfg( KEY_WGHT ), decode_stream_cfg( KEY_OFM ));
void open_stream( BufferID, bool );
void close_stream( BufferID );
#endif
// Compile-time per-buffer configuration derived from the two masks.
constexpr KernelDataSyncConfig kernel_src_cfg( decode_kernel_cfg( KEY_IFM ), decode_kernel_cfg( KEY_WGHT ), decode_kernel_cfg( KEY_OFM ));

// Kernel configuration selector. Enumerators carry implicit values: KC_ZERO is 0 and
// each following entry is one larger, so inserting or reordering entries changes the
// numeric values.
// NOTE(review): the meaning of the individual modes is defined by the kernels that
// consume this enum, not here.
enum KernelConfig {
    KC_ZERO,
    KC_RESULT4,
    KC_RESULT6,
    KC_RESULT8,
    KC_RESULT13,
    KC_RESULT16,
    KC_RESULT32,
    KC_CASC,
    KC_TDM16,
    KC_TDM32,
    KC_TDM64,
    KC_TDM16_CASC,
    KC_TDM32_CASC,
    KC_TDM64_CASC,
    KC_RESULT32_CASC,
    KC_CASC2,
    KC_TDM16_CASC2,
    KC_TDM32_CASC2,
};

// Plain 2D addressing parameter record: one count (num0) and two increments, all 32 bit.
// When __AIENGINE__ is defined it can be converted into a dims_2d_t.
struct dims_2d_param {
    uint32_t num0;
    int32_t inc0;
    int32_t inc1;

  #ifdef __AIENGINE__
    // Builds dims_2d_t( num0, inc0, inc1 ) directly from the stored values.
    inline dims_2d_t instantiate( ) const property( nodebug ) {
        return dims_2d_t( num0, inc0, inc1 );
    }
    // Same fields, but passed through dims_2d_from_steps() instead of the constructor.
    inline dims_2d_t instantiate_step() const property( nodebug ) {
        return dims_2d_from_steps( num0, inc0, inc1 );
    }
  #endif
};

// Plain 3D addressing parameter record: two counts (num0, num1) and three increments,
// all 32 bit. When __AIENGINE__ is defined it can be converted into a dims_3d_t.
struct dims_3d_param {
    uint32_t num0;
    uint32_t num1;
    int32_t inc0;
    int32_t inc1;
    int32_t inc2;

  #ifdef __AIENGINE__
    // Builds dims_3d_t( num0, inc0, num1, inc1, inc2 ) — note the interleaved argument order.
    inline dims_3d_t instantiate() const property( nodebug ) {
        return dims_3d_t( num0, inc0, num1, inc1, inc2 );
    }
    // Same fields, but passed through dims_3d_from_steps() instead of the constructor.
    inline dims_3d_t instantiate_step() const property( nodebug ) {
        return dims_3d_from_steps( num0, inc0, num1, inc1, inc2 );
    }
  #endif
};

// Plain 4D addressing parameter record: three counts and four increments, all 32 bit.
struct dims_4d_param {
    uint32_t num0;
    uint32_t num1;
    uint32_t num2;
    int32_t inc0;
    int32_t inc1;
    int32_t inc2;
    int32_t inc3;

  #ifdef __AIENGINE__
    // Splits the record into a std::pair: first = dims_3d_t built from num0/inc0/num1/inc1/inc2,
    // second = dims_2d_t( num2, 0, inc3 ) (the middle argument is a constant 0).
    inline auto instantiate() const property( nodebug ) {
        return std::pair( dims_3d_t( num0, inc0, num1, inc1, inc2 ), dims_2d_t( num2, 0, inc3 ));
    }
  #endif
};

// Plain 5D addressing parameter record: four counts and five increments, all 32 bit.
struct dims_5d_param {
    uint32_t num0;
    uint32_t num1;
    uint32_t num2;
    uint32_t num3;
    int32_t inc0;
    int32_t inc1;
    int32_t inc2;
    int32_t inc3;
    int32_t inc4;

  #ifdef __AIENGINE__
    // Splits the record into a std::pair of two dims_3d_t: first built from
    // num0/inc0/num1/inc1/inc2, second = dims_3d_t( num2, 0, num3, inc3, inc4 )
    // (its first increment is a constant 0).
    inline auto instantiate() const property( nodebug ) {
        return std::pair( dims_3d_t( num0, inc0, num1, inc1, inc2 ), dims_3d_t( num2, 0, num3, inc3, inc4 ));
    }
  #endif
};


// 16-bit variant of dims_2d_param: one count and two increments stored in 16 bits each.
struct dims_2d_param_s16 {
    uint16_t num0;
    int16_t inc0;
    int16_t inc1;

  #ifdef __AIENGINE__
    // Builds a dims_2d_t; both increments are multiplied by `scale` (default 1),
    // the count is passed unchanged.
    inline dims_2d_t instantiate( int scale = 1 ) const property( nodebug ) {
        return dims_2d_t( num0, inc0 * scale, inc1 * scale );
    }
  #endif
};

// 16-bit variant of dims_3d_param: two counts and three increments stored in 16 bits each.
struct dims_3d_param_s16 {
    uint16_t num0;
    uint16_t num1;
    int16_t inc0;
    int16_t inc1;
    int16_t inc2;

  #ifdef __AIENGINE__
    // Builds a dims_3d_t; all three increments are multiplied by `scale` (default 1),
    // the counts are passed unchanged.
    inline dims_3d_t instantiate( int scale = 1 ) const property( nodebug ) {
        return dims_3d_t( num0, inc0 * scale, num1, inc1 * scale, inc2 * scale );
    }
  #endif
};

// 16-bit variant of dims_4d_param: three counts and four increments stored in 16 bits each.
struct dims_4d_param_s16 {
    uint16_t num0;
    uint16_t num1;
    uint16_t num2;
    int16_t inc0;
    int16_t inc1;
    int16_t inc2;
    int16_t inc3;

  #ifdef __AIENGINE__
    // Returns a std::pair ( dims_3d_t, dims_2d_t ) like dims_4d_param::instantiate(),
    // with every increment multiplied by `scale` (default 1).
    inline auto instantiate( int scale = 1 ) const property( nodebug ) {
        return std::pair( dims_3d_t( num0, inc0 * scale, num1, inc1 * scale, inc2 * scale ), dims_2d_t( num2, 0, inc3 * scale ));
    }
  #endif
};

// 16-bit variant of dims_5d_param: four counts and five increments stored in 16 bits each.
struct dims_5d_param_s16 {
    uint16_t num0;
    uint16_t num1;
    uint16_t num2;
    uint16_t num3;
    int16_t inc0;
    int16_t inc1;
    int16_t inc2;
    int16_t inc3;
    int16_t inc4;

  #ifdef __AIENGINE__
    // Returns a std::pair of two dims_3d_t like dims_5d_param::instantiate(),
    // with every increment multiplied by `scale` (default 1).
    inline auto instantiate( int scale = 1 ) const property( nodebug ) {
        return std::pair( dims_3d_t( num0, inc0 * scale, num1, inc1 * scale, inc2 * scale ), dims_3d_t( num2, 0, num3, inc3 * scale, inc4 * scale ));
    }
  #endif
};

// KIR is placed between `struct MLKernelControl` and its body; it expands to nothing
// unless the build defines it before this point.
#ifndef KIR
#define KIR
#endif

// Parameter record for a two-input ("binary") operation with optional
// dequantisation / quantisation steps.
// NOTE(review): presumably zp = zero point and sc = scale for inputs a, b (dq_*) and
// for the output (q_*) — confirm against the kernels that read this struct.
struct BinaryQDQParams {
    float dq_a_zp;
    float dq_a_sc;
    float dq_b_zp;
    float dq_b_sc;
    float q_zp;
    float q_sc;
    bool dq_enable;
    bool q_enable;
};


// Parameter record with a 16-bit inner_g value and a sign flag for "A".
// NOTE(review): name suggests it configures a dequantise kernel — confirm at the use site.
struct KernelDqParam {
    uint16_t inner_g;
    uint8_t sign_A;
};


// Parameter record with a 16-bit inner_g value and a sign flag for "O".
// NOTE(review): name suggests it configures a quantise kernel — confirm at the use site.
struct KernelQParam {
    uint16_t inner_g;
    uint8_t sign_O;
};


// Control word made of bit-fields whose widths add up to 32 bits
// (1+1+1+3+1+1+1+1+14+8). The numeric suffix of each reserved field matches the
// running bit count in front of it (3 and 10).
// NOTE(review): the actual bit positions depend on the compiler's bit-field allocation;
// confirm they match whatever produces this word on the host side.
struct MLKernelControl KIR {
    uint32_t zero_init:1;
    uint32_t sign_N:1;
    uint32_t sign_O:1;
    uint32_t reserved3:3;
    uint32_t skip_casc_in:1;
    uint32_t skip_casc_out:1;
    uint32_t sign_W:1;
    uint32_t sign_A:1;
    uint32_t reserved10:14;
    uint32_t norm_ch_g:8;
};


// Kernel parameter record read by the setup_parameters_* helpers in this file
// (they use S_g, Kx_g, Ky_g, Ci_g, X_g, Y_g, Co_g, inner_g and the step_* fields).
// Field order and widths are part of the record layout — do not reorder.
struct MLKernelParams {
    // Group counts (8 bit each; S_g is signed).
    uint8_t Kx_g;
    uint8_t Ky_g;
    uint8_t Ci_g;
    int8_t  S_g;
    uint8_t N_g;
    uint8_t X_g;
    uint8_t Y_g;
    uint8_t Co_g;
    uint16_t inner_g;
    uint16_t outer_g;
    // Signed shift amounts.
    int8_t shift_tdm;
    int8_t shift_res;
    int8_t shift_norm;
    int8_t shift_bias;

    // Step sizes per dimension (unsigned 16 bit).
    uint16_t step_Kx;
    uint16_t step_Ky;
    uint16_t step_Ci;
    uint16_t step_Xi;
    uint16_t step_Yi;
    uint16_t step_Xo;
    uint16_t step_Yo;
    uint16_t step_Co;
    int param_value;
    MLKernelControl ctrl;
};

// Layer-level parameter record: embeds the kernel parameters and adds iteration
// counters, cascade setup and buffer offsets.
// Field order and widths are part of the record layout — do not reorder.
struct MLLayerParams {
    MLKernelParams kernel;
    uint16_t iter_cnt;
    uint8_t tdm_cnt;    //0=no tdm ( 1 iter ); 1=accumulate once ( 2 iters ), ...
    uint8_t keep_cnt;   //0=use one time; 1=use two time, ...
    uint8_t keep_data;  //1=keep data; 0=keep weights
    uint8_t casc_setup; //0=no cascade; 1=start; 2=middle; 3=end of cascade
    uint8_t kernel_family;
    uint8_t reserved1;
    /* uint8_t reserved2; */
    // Offsets for the activation, weight, output and intermediate buffers.
    int offset_actv;
    int offset_wght;
    int offset_out;
    int offset_interm;
    int mode;
};

// Parameter record with the same field sequence as MLKernelParams, except that the
// third and fourth shift fields are named shift_in / shift_in1 instead of
// shift_norm / shift_bias.
// Field order and widths are part of the record layout — do not reorder.
struct MLAdd2dParams {
    uint8_t Kx_g;
    uint8_t Ky_g;
    uint8_t Ci_g;
    int8_t  S_g;
    uint8_t N_g;
    uint8_t X_g;
    uint8_t Y_g;
    uint8_t Co_g;
    uint16_t inner_g;
    uint16_t outer_g;
    int8_t shift_tdm;
    int8_t shift_res;
    int8_t shift_in;
    int8_t shift_in1;
    uint16_t step_Kx;
    uint16_t step_Ky;
    uint16_t step_Ci;
    uint16_t step_Xi;
    uint16_t step_Yi;
    uint16_t step_Xo;
    uint16_t step_Yo;
    uint16_t step_Co;
    int param_value;
    MLKernelControl ctrl;
};

// Activation selector stored in a single byte.
// NOTE(review): AC_SRS (implicit 0) and AC_RELU (explicit 0) share the value 0, so the
// two cannot be told apart at run time; AC_RELU6, AC_LRELU and AC_HSWISH follow as
// 1, 2 and 3. Confirm the aliasing is intended.
enum ActivationConfig : uint8_t {
    AC_SRS,
    AC_RELU=0,
    AC_RELU6,
    AC_LRELU,
    AC_HSWISH
};

// New style params for conv_int8x8_idx kernels
// Parameter record: loop/mode words, weight and bias sizes, iteration counts and two
// addressing descriptors (dims_A2, dims_A3).
// Field order and widths are part of the record layout — do not reorder.
struct BiasedConvInt8x8IdxParams {
    uint32_t hardened_loop;
    uint32_t mode;
    uint32_t wgt_size;              // Size of pure weights in bytes
    uint32_t bias_size;             // Size of bias in bytes
    uint16_t outer_time_iters;
    uint16_t inner_time_iters;
    uint16_t inner_loop;
    uint8_t step_align;
    uint8_t norm_ch_g;
    dims_2d_param dims_A2;
    dims_3d_param dims_A3;
};

// Parameter record: buffer sizes/offsets, channel masks, iteration counts, several
// addressing descriptors, kernel/stride group values and two float coefficients.
// Field order, widths and the reserved bytes are part of the record layout — do not reorder.
struct ActivatedConvInt16x8Params {
    uint32_t mode;
    uint32_t wgt_size;              // Size of pure weights in bytes
    uint32_t coeff_size;            // Size of coefficients in bytes (comment previously said "bias" — NOTE(review): confirm)
    uint32_t spill_buf;
    uint32_t cf_cache;

    uint64_t mask_Ci_low;
    uint64_t mask_Ci_high;
    int8_t Co_blk;
    int8_t Co_shft;
    int8_t reserved1;
    int8_t reserved2;   
    uint16_t outer_time_iters;
    uint16_t inner_time_iters;
    uint16_t inner_loop;
    uint16_t step_Ci;
    dims_2d_param dims_A2;
    dims_3d_param dims_A3;
    dims_2d_param dims_conv2d_sum_inner;
    dims_2d_param dims_conv2d_sum_outer;
    dims_3d_param dims_sum_actv;
    uint8_t step_align;
    uint8_t step_align_sum;
    int8_t reserved3;
    int8_t reserved4;   
    int8_t Sx_g;
    int8_t Sy_g;
    int8_t Kx_g;
    int8_t Ky_g;
    int8_t sum_outer;
    int8_t sum_bound;
    int8_t n_accus;
    int8_t max_accus;
    float cf_AxB;
    float cf_Asum;
};


// Quantization parameters for conv_int8x8_idx kernels are runtime configurable
// Quantisation record: a 16-bit lrelu_alpha, an 8-bit max_value, three signed 8-bit
// shift amounts and a nested Control struct with three 1-bit sign flags (A, W, O).
struct Quantization {
    int16_t lrelu_alpha;
    int8_t max_value;
    int8_t shift_bias;
    int8_t shift_lrelu_in;
    int8_t shift_out;
    // Sign flags packed as single-bit fields of a uint8_t.
    struct Control {
        uint8_t sign_A:1;
        uint8_t sign_W:1;
        uint8_t sign_O:1;
    } ctrl;
};

/*
NOTE: This params struct is used to override the values in param.bin for the convolutional layer.
It is packed as part of the WGT/consts buffer to support runtime configuration.
Refer to the testbench and host formatting for the correct packing of this struct.
*/
// Run-time convolution parameters: activation type, two signed shifts and three
// byte-wide sign flags (six one-byte members in total).
struct conv_rt_params{
    ActivationConfig act_type;
    int8_t shift_res;
    int8_t shift_bias;
    uint8_t sign_A;
    uint8_t sign_W;
    uint8_t sign_O;
};

// Set of addressing descriptors and scalar increments filled in by the
// setup_parameters_* helpers and marked keep_in_registers.
struct MLIncrements property( keep_in_registers ) {

    // Note: Only #dims_2d + 2 * #dims_3d <= 8 can be placed without spilling
    dims_2d_t dims_A2;
    dims_3d_t dims_A3;
    dims_3d_t dims_AO;
    dims_2d_t dims_W2;
    dims_3d_t dims_W3;
    dims_2d_t dims_O2;
    dims_3d_t dims_O3;

    // Scalar increments ("A" group).
    int incA_0;
    int incA_1;
    int incA_2;
    int incA_3;

    // Scalar increments ("S" group).
    int incS_0;
    int incS_1;
    int incS_2;

    // Alignment, shift and shuffle settings.
    int step_align;
    int shft_0;
    int shft_1;
    int shfl_0;
    int shfl_1;
    int shfl_2;
    int shfl_3;

    uint8_t numBN;
};



// Batch-norm related parameter set marked keep_in_registers: a count, three shift
// amounts, a parameter value, a memory offset, a sign configuration word and a
// num/inc set (numBN1, incBN1, numBN2, incBN2, incBN3).
// NOTE(review): the num/inc set has the same shape as a 3D addressing descriptor
// (two counts, three increments) — confirm how it is consumed.
struct BatchNormParams property( keep_in_registers ) {
    int count;
    int shift_norm;
    int shift_bias;
    int shift_res;
    int param_value;
    int mem_offset;
    int sign_config;

    int numBN1;
    int incBN1;
    int numBN2;
    int incBN2;
    int incBN3;
};

// Four 1-bit sign flags (A, W, O, N) declared as uint32_t bit-fields.
struct SignConfigNorm {
    uint32_t A:1;
    uint32_t W:1;
    uint32_t O:1;
    uint32_t N:1;
};

// Parameter record: buffer words and sizes, iteration counts, X/Y group values,
// step sizes and three addressing descriptors (dimsA, dimsB, dimsQ).
// The shift/sign fields are commented out below; MMultQdQInt16x8_RT_Params declares
// fields with the same names.
// Field order and widths are part of the record layout — do not reorder.
struct MMultQdQInt16x8Params {
    uint32_t spill_buff;
    uint32_t ifm_tmp_buf;
    uint32_t coeff_tmp_buf;
    uint32_t wgt_size;  // Size of pure weights in bytes
    uint32_t coeff_size; // Size of Coeff in bytes
    uint32_t mode;

    uint16_t outer_time_iters;
    uint16_t inner_time_iters;
    uint16_t inner_loop;

    uint8_t Y_g;
    uint8_t X_g;
    uint16_t step_Xi;
    uint16_t step_Yi;
    uint16_t step_Kx;
    uint16_t step_Ky;
    // int16_t shift_res;

    // struct Control {
    //     uint8_t sign_A:1;
    //     uint8_t sign_W:1;
    //     uint8_t sign_O:1;
    // } ctrl;

    // uint8_t reserved; // (reserved) - There was a byte alignment issue in the original code, so we added a reserved byte

    dims_3d_param dimsA;
    dims_3d_param dimsB;
    dims_2d_param dimsQ;
};

// Parameter record with the same active fields, in the same order, as
// MMultQdQInt16x8Params.
// Field order and widths are part of the record layout — do not reorder.
struct MMultQdQInt16x4Params {

    uint32_t spill_buff;
    uint32_t ifm_tmp_buf;
    uint32_t coeff_tmp_buf;
    uint32_t wgt_size;  // Size of pure weights in bytes
    uint32_t coeff_size; // Size of Coeff in bytes
    uint32_t mode;
    
    uint16_t outer_time_iters;
    uint16_t inner_time_iters;
    uint16_t inner_loop;

    uint8_t Y_g;
    uint8_t X_g;
    uint16_t step_Xi;
    uint16_t step_Yi;
    uint16_t step_Kx;
    uint16_t step_Ky;
    // uint16_t shift_res;

    // struct Control {
    //     uint8_t sign_A:1;
    //     uint8_t sign_W:1;
    //     uint8_t sign_O:1;
    // } ctrl;

    dims_3d_param dimsA;
    dims_3d_param dimsB;
    dims_2d_param dimsQ;
};


// Run-time parameter record padded to 128 bytes:
// 2 (shift_res) + 4 x 1 (flags) + 2 (pad) + 4 x 4 (floats) + 104 (reserved) = 128.
struct GemmQdqint16x16_RT_Params
{
    int16_t shift_res; // unused here
    uint8_t sign_A;  // unused here
    uint8_t sign_W;  // unused here
    uint8_t sign_O;  // unused here
    // 0: scalar, 1: vectorized C1, 2: vectorized C2
    uint8_t vector_coeff; // unused here; kernel input only
    uint8_t pad[2]; 
    // Four float coefficients.
    float c0;
    float c1;
    float c2;
    float c3;
    // NOTE: Used to align the struct size to 128 bytes.
    uint8_t reserved[104];
};


// Run-time parameters: a 16-bit result shift and three byte-wide sign flags (A, W, O).
struct MMultQdQInt16x8_RT_Params {
    int16_t shift_res;
    uint8_t sign_A;
    uint8_t sign_W;
    uint8_t sign_O;
};

// Run-time parameters with the same fields as MMultQdQInt16x8_RT_Params:
// a 16-bit result shift and three byte-wide sign flags (A, W, O).
struct ConvQdQInt16x8_RT_Params {
    int16_t shift_res;
    uint8_t sign_A;
    uint8_t sign_W;
    uint8_t sign_O;
};

// Max-pool layer parameters: the generic kernel parameter record plus a 16-bit mode.
struct maxpool_layer_params{
    MLKernelParams kernel_params;
    uint16_t mode;
};

// One sign byte followed by 127 reserved bytes (128 bytes in total).
struct maxpool_noqdq_params{
    uint8_t sign;
    uint8_t reserved[127];
};

// Parameter record: three buffer words, iteration counts, X/Y group values, step
// sizes, result shift, mode and four addressing descriptors.
// Field order and widths are part of the record layout — do not reorder.
struct ActivatedMMultKernelParams {
    uint32_t tdm_buf;
    uint32_t cfqdq_buf;
    uint32_t qdq_buf;
    uint16_t outer_time_iters;
    uint16_t inner_time_iters;
    uint16_t inner_g;
    uint8_t Y_g;
    uint8_t X_g;
    uint16_t step_Xi;
    uint16_t step_Yi;
    uint16_t step_Kx;
    uint16_t step_Ky;
    uint16_t shift_res;
    uint16_t mode;
    dims_3d_param dimsA;
    dims_3d_param dimsB;
    dims_2d_param dimsQ;
    dims_2d_param dimsAs;
};

// Extended form of ActivatedMMultKernelParams: adds wght_transpose_sb, tsl_bound,
// transpose, is_split and the dimsW descriptor. The additional members are
// interleaved with the common ones, so the two layouts are not prefix-compatible.
// Field order and widths are part of the record layout — do not reorder.
struct ActivatedMMultKernelParamsTranspose {
    uint32_t tdm_buf;
    uint32_t wght_transpose_sb;
    uint32_t cfqdq_buf;
    uint32_t qdq_buf;
    uint16_t outer_time_iters;
    uint16_t inner_time_iters;
    uint16_t inner_g;
    uint16_t tsl_bound;
    uint8_t transpose;
    uint8_t is_split;
    uint8_t Y_g;
    uint8_t X_g;
    uint16_t step_Xi;
    uint16_t step_Yi;
    uint16_t step_Kx;
    uint16_t step_Ky;
    uint16_t shift_res;
    uint16_t mode;
    dims_3d_param dimsA;
    dims_3d_param dimsB;
    dims_2d_param dimsQ;
    dims_2d_param dimsAs;
    dims_3d_param dimsW;
};

/* Derives the address-increment descriptors in `incrs` from the kernel geometry in `param`.
 *
 * Writes incrs.step_align, incrs.dims_A2, incrs.dims_A3, incrs.dims_AO, incrs.dims_O3 and
 * incrs.dims_W2; no other member of `incrs` is assigned here.
 *
 *   param   kernel parameters (read only)
 *   incrs   increment set to fill in
 *   granXY  granularity value; combined with step_Yo / step_Xo to obtain XoL and YoL
 *   bits_O  not used by this function; kept so existing callers stay valid
 *
 * NOTE(review): `reset` is handed to add_dimension() before each derived increment and is
 * read again right afterwards, so add_dimension() presumably updates it in place.
 * Keep the statement order unless that is confirmed otherwise.
 */
ALWAYS_INLINE void setup_parameters_dc_int8x8_idx ( const MLKernelParams &param, MLIncrements &incrs, const uint16_t granXY, const uint16_t bits_O=8 ) {
    int reset;

    // Channel granularity; steps smaller than this are rounded up to it.
    uint16_t granCi = 64;
    uint16_t granCo = 64;

    int step_Ci = max( param.step_Ci, granCi );
    int step_Co = max( param.step_Co, granCo );

    incrs.step_align = 28 - clb( param.step_Xi ^ ( param.step_Xi-1 ));  // 8->0, 16->1, 32->2
    //incrs.shfl_0 = (SN0_g ? T512_1x2_lo : (param.N_g == 1 ? T64_8x2_lo : (param.N_g == 2 ? T128_4x2_lo : T256_2x2_lo)));
    //incrs.shfl_1 = (SN0_g ? T512_1x2_hi : (param.N_g == 1 ? T64_8x2_lo : (param.N_g == 2 ? T128_4x2_lo : T256_2x2_lo)));

    //int XoL = min( granXY, param.step_Co / granCo );
    //int XoL = min( granXY, ( param.step_Co < granCo ? param.step_Yo : param.step_Co ) / granCo );
    // Split granXY into an X part (XoL) and a Y part (YoL).
    int XoL = min(( int ) granXY, param.step_Yo / param.step_Xo );
    int YoL = fast_div_p2( granXY, XoL );

    //printf( "XoL=%i(%i), YoL=%i, step_Xo=%i, step_Yo=%i, step_Co=%i\n", XoL, min( granXY, param.step_Co / granCo ), YoL, param.step_Xo, param.step_Yo, param.step_Co );
    // dims_A2: XoL-1 count with X / Y input steps.
    int incA_0 = XoL - 1;
    reset = -granCi;
    int incA_1 = add_dimension( reset, incA_0, param.step_Xi );
    int incA_2 = add_dimension( reset, YoL-1, param.step_Yi );
    incrs.dims_A2 = dims_2d_t( incA_0, incA_1, incA_2 );

    //printf( "step_Kx=%i, step_Xi=%i, step_Ky=%i, step_Yi=%i\n", param.step_Kx, param.step_Xi, param.step_Ky, param.step_Yi );
    // dims_A3: Kx, Ky, then Ci. `reset` continues from the dims_A2 computation above.
    int incKx = add_dimension( reset, param.Kx_g-1, max( param.step_Kx, granCi ));
    int incKy = add_dimension( reset, param.Ky_g-1, param.step_Ky );
    int incCi = reset + step_Ci;
    incrs.dims_A3 = dims_3d_t( param.Kx_g-1, incKx, param.Ky_g-1, incKy, incCi );

    // dims_AO: X, Y, then the step back over all Ci groups.
    reset = -param.Ci_g * step_Ci;
    int incXi     = add_dimension( reset, param.X_g-1, granXY * param.step_Xi );
    int incYi     = add_dimension( reset, param.Y_g-1, YoL * param.step_Yi );
    int incCo_rev = reset;
    incrs.dims_AO = dims_3d_t( param.X_g-1, incXi, param.Y_g-1, incYi, incCo_rev );

    // dims_O3: output X, Y, then Co.
    reset = 0;
    int incXo = add_dimension( reset, XoL * param.X_g - 1, param.step_Xo );
    int incYo = add_dimension( reset, YoL * param.Y_g - 1, param.step_Yo );
    int incCo = reset + step_Co;
    incrs.dims_O3 = dims_3d_t( XoL * param.X_g - 1, incXo, YoL * param.Y_g - 1, incYo, incCo );

    // dims_W2: X_g * Y_g - 1 count, stepping back by inner_g * 4096 with a zero outer increment.
    int numW = param.X_g * param.Y_g - 1;
    int incW_Ci_rev = -param.inner_g * 4096;
    incrs.dims_W2 = dims_2d_t( numW, incW_Ci_rev, 0 );
}

/* Derives the address-increment descriptors in `incrs` from the kernel geometry in `param`.
 *
 * Writes incrs.step_align, incrs.dims_A2, incrs.dims_A3, incrs.dims_AO, incrs.dims_O3 and
 * incrs.dims_W2; no other member of `incrs` is assigned here.
 *
 *   param   kernel parameters (read only)
 *   incrs   increment set to fill in
 *   granXY  granularity value; combined with step_Yo / step_Xo to obtain XoL and YoL
 *   bits_O  not used by this function; kept so existing callers stay valid
 *
 * NOTE(review): `reset` is handed to add_dimension() before each derived increment and is
 * read again right afterwards, so add_dimension() presumably updates it in place.
 * Keep the statement order unless that is confirmed otherwise.
 */
ALWAYS_INLINE void setup_parameters_bc_int8x8( const MLKernelParams &param, MLIncrements &incrs, const uint16_t granXY, const uint16_t bits_O=8 ) {
    int reset;

    // Channel granularity; steps smaller than this are rounded up to it.
    uint16_t granCi = 64;
    uint16_t granCo = 64;

    int step_Ci = max( param.step_Ci, granCi );
    int step_Co = max( param.step_Co, granCo );

    incrs.step_align = 28 - clb( param.step_Xi ^ ( param.step_Xi-1 ));  // 8->0, 16->1, 32->2
    //incrs.shfl_0 = (SN0_g ? T512_1x2_lo : (param.N_g == 1 ? T64_8x2_lo : (param.N_g == 2 ? T128_4x2_lo : T256_2x2_lo)));
    //incrs.shfl_1 = (SN0_g ? T512_1x2_hi : (param.N_g == 1 ? T64_8x2_lo : (param.N_g == 2 ? T128_4x2_lo : T256_2x2_lo)));

    //int XoL = min( granXY, param.step_Co / granCo );
    //int XoL = min( granXY, ( param.step_Co < granCo ? param.step_Yo : param.step_Co ) / granCo );
    // Split granXY into an X part (XoL) and a Y part (YoL).
    int XoL = min(( int ) granXY, param.step_Yo / param.step_Xo );
    int YoL = fast_div_p2( granXY, XoL );

    //printf( "XoL=%i(%i), YoL=%i, step_Xo=%i, step_Yo=%i, step_Co=%i\n", XoL, min( granXY, param.step_Co / granCo ), YoL, param.step_Xo, param.step_Yo, param.step_Co );
    // dims_A2: XoL-1 count with X / Y input steps.
    int incA_0 = XoL - 1;
    reset = -granCi;
    int incA_1 = add_dimension( reset, incA_0, param.step_Xi );
    int incA_2 = add_dimension( reset, YoL-1, param.step_Yi );
    incrs.dims_A2 = dims_2d_t( incA_0, incA_1, incA_2 );

    //printf( "step_Kx=%i, step_Xi=%i, step_Ky=%i, step_Yi=%i\n", param.step_Kx, param.step_Xi, param.step_Ky, param.step_Yi );
    // dims_A3: Kx, Ky, then Ci. `reset` continues from the dims_A2 computation above.
    int incKx = add_dimension( reset, param.Kx_g-1, max( param.step_Kx, granCi ));
    int incKy = add_dimension( reset, param.Ky_g-1, param.step_Ky );
    int incCi = reset + step_Ci;
    incrs.dims_A3 = dims_3d_t( param.Kx_g-1, incKx, param.Ky_g-1, incKy, incCi );

    // dims_AO: X, Y, then the step back over all Ci groups.
    reset = -param.Ci_g * step_Ci;
    int incXi     = add_dimension( reset, param.X_g-1, granXY * param.step_Xi );
    int incYi     = add_dimension( reset, param.Y_g-1, YoL * param.step_Yi );
    int incCo_rev = reset;
    incrs.dims_AO = dims_3d_t( param.X_g-1, incXi, param.Y_g-1, incYi, incCo_rev );

    // dims_O3: output X, Y, then Co.
    reset = 0;
    int incXo = add_dimension( reset, XoL * param.X_g - 1, param.step_Xo );
    int incYo = add_dimension( reset, YoL * param.Y_g - 1, param.step_Yo );
    int incCo = reset + step_Co;
    incrs.dims_O3 = dims_3d_t( XoL * param.X_g - 1, incXo, YoL * param.Y_g - 1, incYo, incCo );

    // dims_W2: X_g * Y_g - 1 count, stepping back by inner_g * 4096 with a zero outer increment.
    int numW = param.X_g * param.Y_g - 1;
    int incW_Ci_rev = -param.inner_g * 4096;
    incrs.dims_W2 = dims_2d_t( numW, incW_Ci_rev, 0 );
}

/* Run-time sanity check of loop counts against compile-time limits (all via assert()).
 *
 *   il / il_lr   inner count and its lower bound; always checked against [il_lr, max_bound]
 *   ol / ol_lr   outer count and its lower bound; checked only when ol_lr > 0
 *   it / it_lr   third count and its lower bound; checked only when it_lr > 0
 *   additional_condition   extra caller-supplied condition that must hold
 *
 * The function is never inlined. With DEBUG_TESTBENCH defined it prints a
 * confirmation line once all checks have passed.
 */
template<unsigned il_lr, unsigned ol_lr=0, unsigned max_bound=4096, unsigned it_lr=0>
__attribute__(( noinline )) void check_bounds( unsigned il, unsigned ol=0, bool additional_condition=1, unsigned it=0 ) {
    assert( il >= il_lr );
    assert( il <= max_bound );

    if constexpr ( ol_lr > 0 ) {
        assert( ol >= ol_lr );
        assert( ol <= max_bound );
    }

    assert( additional_condition );

    if constexpr ( it_lr > 0 ) {
        assert( it >= it_lr );
        assert( it <= max_bound );
    }

  #ifdef DEBUG_TESTBENCH
    printf( "[Info]: check bounds done\n" );
  #endif
}

// Parameter record: an L1 weight offset, loop/iteration counts, two 16-bit weight
// related values, a reserved half-word, a 32-bit Y step and two addressing descriptors.
// Field order and widths are part of the record layout — do not reorder.
struct ResizeNearestParams {
    uint32_t wgt_l1_offset;
    uint16_t outer_loop;
    uint16_t inner_loop;
    uint16_t time_iters;
    uint16_t wts_x_offset;
    uint16_t num_wi_c64;
    uint16_t reserved;
    uint32_t step_yi;
    dims_3d_param dimsA;
    dims_2d_param dimsW;
};

// Converts a 32-bit address value into a pointer by adding the fixed offset 0xE0000.
// Declared inline because the definition lives in a header: without it, every
// translation unit including this file would emit its own external definition and
// linking more than one of them would fail with a multiple-definition error.
inline void* conv_to_local_ptr(uint32_t addr)
{
    uint32_t constexpr core_local_offset = 0xE0000;
    return reinterpret_cast<void*>(core_local_offset + addr);
}

// Typed view on a BufferPort: forwards acquire() / release() to the referenced port
// and returns its data pointer cast to T*. Only a reference is stored, so the port
// must outlive this wrapper.
template<typename T>
class TypedBufferPort {
    BufferPort &bp;

  public:
    TypedBufferPort( BufferPort &port ) : bp( port ) { }

    void acquire( ) { bp.acquire( ); }
    void release( ) { bp.release( ); }

    T * data( ) {
        return (T*) bp.data( );
    }
};

// Typed view on a logical_BufferPort: forwards acquire() / release() to the referenced
// port and returns its data pointer cast to T*. Only a reference is stored, so the
// port must outlive this wrapper.
template<typename T>
class Typedlogical_BufferPort {
    logical_BufferPort &bp;

  public:
    Typedlogical_BufferPort( logical_BufferPort &port ) : bp( port ) { }

    void acquire( ) { bp.acquire( ); }
    void release( ) { bp.release( ); }

    T * data( ) {
        return (T*) bp.data( );
    }
};

// Overload set that selects the matching 32-lane broadcast by scalar type:
// float16 -> broadcast_to_v32float16, bfloat16 -> broadcast_to_v32bfloat16.
inline v32float16 broadcast_to_v32(float16 x) { return broadcast_to_v32float16( x ); }
inline v32bfloat16 broadcast_to_v32(bfloat16 x) { return broadcast_to_v32bfloat16( x ); }

// NOTE: DWC S1 and S2 params are identical, using the same struct for both the kernels
// Parameter record: stride/mode words, weight and coefficient sizes, an outer loop
// count, two 16-bit increments, six addressing descriptors and a nested Control
// struct with three 1-bit sign flags (A, W, O).
// Field order and widths are part of the record layout — do not reorder.
struct ActivatedDwcQdqInt16x8Params {
    uint32_t stride;                // Stride dictates a different kernel
    uint32_t mode;
    uint32_t wgt_size;              // Size of pure weights in bytes
    uint32_t coeff_size;            // Size of coeffs in bytes
    uint16_t outer_loop;
    uint16_t reserved1;
    int16_t incS_0;
    int16_t incA_0;
    dims_3d_param dims_A3;
    dims_2d_param dims_A2;
    dims_3d_param dims_O3;
    dims_2d_param dims_O2;
    dims_2d_param dims_W2;
    dims_2d_param dims_C2;
    // Sign flags packed as single-bit fields of a uint8_t.
    struct Control {
        uint8_t sign_A:1;
        uint8_t sign_W:1;
        uint8_t sign_O:1;
    } ctrl;
};

// Run-time parameters: two 32-bit values followed by 30 reserved words
// (4 + 4 + 30 * 4 = 128 bytes in total).
// NOTE(review): zp_w presumably is the weight zero point — confirm at the use site.
struct dwcint16x8_RT_params {
    int32_t shift;
    int32_t zp_w;
    uint32_t reserved[30];
};

#endif // __COMMON_HH__
