#ifndef __GLOBAL_REDUCE_IMPL_HPP__
#define __GLOBAL_REDUCE_IMPL_HPP__

/*
 * Memory-mapped control word pinned by chess_storage to TM address
 * 0xA1060 - 0x58000. The global_reduce routines in this header write
 * 0x00000003 to it before their column-dependent cascade step (commented
 * there as "input cascade to West and output cascade to East") and write
 * 0x00000000 once that step is finished.
 * NOTE(review): presumably the tile's cascade-direction configuration
 * register -- confirm the address against the device register map.
 */
volatile int chess_storage( TM:0xA1060 - 0x58000 ) casc_mode;

/**
 * Two-buffer reduction step over the cascade stream.
 *
 * Each of Xbuff / Xbuff2 is accumulated (mac_elem_32 with a 1.0f multiplier)
 * onto a partial result read from the input cascade, and the pair is then
 * either forwarded on the output cascade or stored through `output`,
 * depending on the tile position derived from get_coreid() and on
 * `split_mode`.
 *
 * @tparam To         Element type the `output` pointer is reinterpreted as.
 * @tparam is_vector  Non-zero: the accumulators are assigned to the output
 *                    directly. Zero: only extract_v2float( extract_v16float(
 *                    ..., 0 ), 0 ) of each result is stored.
 * @param Xbuff       First local contribution.
 * @param Xbuff2      Second local contribution.
 * @param output      Destination for the two results on tiles that store.
 * @param split_mode  0, 1 or 2; selects which rows read, forward and store
 *                    (see the conditions in the body).
 */
template<typename To, unsigned is_vector>
void global_reduce
(
    v32float Xbuff,
    v32float Xbuff2,
    float * restrict output,
    int split_mode=0
) {
    To * dst = ( To * ) output;

    auto tile_row = get_coreid( ) & 7;
    auto tile_col = get_coreid( ) >> 16;

    /* Rows 0..4 read the incoming cascade, except row 3 when split_mode is 2. */
    const bool read_cascade = tile_row < 5 && !( split_mode == 2 && tile_row == 3 );
    /* Rows above 2 forward on the cascade, except row 4 when split_mode is 2. */
    const bool forward_cascade = tile_row > 2 && !( split_mode == 2 && tile_row == 4 );

    /* Pull both partial results first, then fold the local buffers in. */
    v32accfloat partial_a = get_scd_v32accfloat( read_cascade );
    v32accfloat partial_b = get_scd_v32accfloat( read_cascade );

    v32accfloat acc_a = mac_elem_32( Xbuff,  1.0f, partial_a );
    v32accfloat acc_b = mac_elem_32( Xbuff2, 1.0f, partial_b );

    /* An explicit branch is used instead of the predicated put_mcd( value, en )
       form, which was disabled in the original code with a reference to
       CRVO-12854. */
    if ( forward_cascade ) {
        put_mcd( acc_a );
        put_mcd( acc_b );
    }

    /* Set on exactly those paths that end with a store to `output`. */
    bool store_result = false;

    if ( split_mode == 0 && tile_row == 2 ) {
        /* Input cascade from West, output cascade to East. */
        casc_mode = 0x00000003;
        chess_separator_scheduler( 2 );

        /* Column 0 has the cascade read disabled. */
        partial_a = get_scd_v32accfloat( tile_col > 0 );
        partial_b = get_scd_v32accfloat( tile_col > 0 );

        acc_a = mac_elem_32( v32float( acc_a ), 1.0f, partial_a );
        acc_b = mac_elem_32( v32float( acc_b ), 1.0f, partial_b );

        if ( tile_col == 2 ) {
            /* Column 2: restore the cascade mode, then store. */
            casc_mode = 0x00000000;
            store_result = true;
        } else {
            /* Other columns: forward, then restore the cascade mode. */
            put_mcd( acc_a );
            put_mcd( acc_b );
            chess_separator_scheduler( 4 );
            casc_mode = 0x00000000;
        }
    } else {
        const bool split1_store = ( split_mode == 1 && tile_row == 2 );
        const bool split2_store = ( split_mode == 2 && ( tile_row == 2 || tile_row == 4 ));
        store_result = split1_store || split2_store;
    }

    if ( store_result ) {
        /* Write to L1 */
        if constexpr( is_vector ) {
            dst[0] = acc_a;
            dst[1] = acc_b;
        } else {
            dst[0] = extract_v2float( extract_v16float( v32float( acc_a ), 0 ), 0 );
            dst[1] = extract_v2float( extract_v16float( v32float( acc_b ), 0 ), 0 );
        }
    }
}

/**
 * Single-buffer reduction step over the cascade stream.
 *
 * Combines Xbuff with a partial result read from the input cascade and then
 * forwards, stores and/or streams the result depending on the tile position
 * derived from get_coreid() and on `split_mode`.
 *
 * Declared `inline` because this is a non-template function defined in a
 * header: without it, including the header from more than one translation
 * unit yields multiple definitions of the same symbol.
 *
 * @param Xbuff       Local contribution.
 * @param output      Destination, accessed as a v32float, on tiles that store.
 * @param split_mode  0, 1 or 2; selects which rows read, forward and store
 *                    (see the conditions in the body).
 * @param mode        0: accumulate with mac_elem_32 (1.0f multiplier).
 *                    Any other value: combine with max().
 */
inline void global_reduce
(
    v32float Xbuff,
    float * restrict output,
    int split_mode=0,
    const int mode=0
) {
    v32float * pO = ( v32float * ) output;

    auto row = get_coreid( ) & 7;
    auto col = get_coreid( ) >> 16;

    /* Receive the partial result from the cascade stream: enabled for rows
       0..4, except row 3 when split_mode is 2. */
    v32accfloat psum = get_scd_v32accfloat( row < 5 && ( split_mode != 2 || row != 3 ));

    v32accfloat res;
    if ( mode == 0 ) {
        res = mac_elem_32( Xbuff, 1.0f, psum );
    } else {
        /* NOTE(review): when the cascade read above is disabled, the max
           depends on whatever value the intrinsic returns in that case --
           confirm this is the intended identity for a max reduction. */
        res = v32accfloat( max( Xbuff, v32float( psum )));
    }

    /* Write values to output cascade. An explicit branch replaces the
       predicated form below, which is disabled with reference to CRVO-12854. */
    // put_mcd( res, row > 2 && ( split_mode != 2 || row !=4 )); // CRVO-12854
    if ( row > 2 && ( split_mode != 2 || row !=4 )) {
        put_mcd( res );
    }

    if ( split_mode == 0 && row == 2 ) {
        /* Set input cascade to West and output cascade to East */
        casc_mode = 0x00000003;
        chess_separator_scheduler( 2 );

        /* Column 0 has the cascade read disabled. */
        psum = get_scd_v32accfloat( col > 0 );

        if ( mode == 0 ) {
            res = mac_elem_32( v32float( res ), 1.0f, psum );
        } else {
            res = v32accfloat( max( v32float( res ), v32float( psum )));
        }

        if ( col != 2 ) {
            /* Forward along the row, then restore the cascade mode. */
            put_mcd( res );
            chess_separator_scheduler( 4 );
            casc_mode = 0x00000000;
        } else {
            casc_mode = 0x00000000;

            /* NOTE(review): the store to L1 is disabled on this path, so the
               combined result is not written anywhere here. The other store
               path below writes *pO = v32float( res ) -- confirm whether
               split_mode 0 is meant to be supported by this overload. */
            //*pO = extract_v4float( extract_v16float( v32float( res ), 0 ), 0 );
        }
    } else {
        if (( split_mode == 1 && row == 2 ) || ( split_mode == 2 && ( row == 2 || row == 4 ))) {
            /* Write to L1, then push both v16float parts (index 0 and 1) of
               the result to the output stream. */
            *pO = v32float( res );
            put_ms(v16int32(extract_v16float(v32float(res),0)));
            put_ms(v16int32(extract_v16float(v32float(res),1)));
        }
    }

    /* In split_mode 1, rows above 2 take a v32float from the input stream and
       store it to the output. */
    if(split_mode == 1 && row > 2){
        *pO = get_ss_v32float();
    }
}

#endif