#ifndef __GLOBAL_REDUCE_TEMPLATE_H__
#define __GLOBAL_REDUCE_TEMPLATE_H__


#include "../include/access_casc_stream.h"
#include "../include/access_core_stream.h"


// Returns max(in1, in2) for two v32bfloat16 vectors; thin always-inlined
// wrapper so callers have a typed entry point for the bfloat16 max.
inline __attribute__((always_inline))
v32bfloat16 max_v32bfloat (v32bfloat16 in1, v32bfloat16 in2)
{
    return max(in1, in2);
}


// Combines the caller's partial value with values received over the cascade
// stream and returns the final combined value, which every caller except the
// one at (rowIdx == 2, colIdx == 3) obtains from the core stream.
//
// Parameters:
//   local_sums  partial value contributed by this caller.
//   rowIdx      selects the first-stage behaviour: a cascade read is requested
//               when rowIdx < 5, and callers with rowIdx > 2 only forward their
//               intermediate result on the cascade stream.
//               (presumably the tile's row in the array -- TODO confirm with caller)
//   colIdx      selects the second-stage behaviour for callers with rowIdx <= 2:
//               a cascade read is requested when colIdx >= 1, and colIdx == 3
//               publishes the result on the core stream instead of forwarding it.
//               (presumably the tile's column -- TODO confirm with caller)
//   mode        0: combine with '+'. Any other value: combine with max() after
//               converting both operands to v32bfloat16.
//
// Returns:
//   glob_sum -- the locally computed result when rowIdx == 2 && colIdx == 3,
//   otherwise the value read from the core stream.
//
// NOTE(review): the -inf substitution below is applied only when mode == 1,
// while the max path is taken for every mode != 0. For other non-zero modes the
// value returned by a disabled read_casc_stream() is used as-is -- confirm that
// only modes 0 and 1 are ever passed.
// NOTE(review): a caller with rowIdx < 2 and colIdx == 3 writes the core stream
// and then also reads it (see the final condition) -- confirm rowIdx < 2 never
// occurs, or that this is intended.
v16accfloat global_add_reduce( v16accfloat local_sums, const unsigned rowIdx, const unsigned colIdx, const int mode=0) clobbers( dm1, dm4, qex11, r2, r3, r4, r5, r18,r19,r20,r21 ) {

    // 0xff80 is the bfloat16 bit pattern for negative infinity (sign bit set,
    // exponent all ones, mantissa zero). It is the identity element for max.
    // NOTE(review): the pointer cast is type punning through an incompatible
    // pointer type; presumably accepted by this toolchain -- confirm.
    uint16 neg_inf_bits = 0xff80;
    bfloat16 neg_inf = *(bfloat16*)&neg_inf_bits;
    v16accfloat v_neg_inf = v16accfloat(broadcast_bfloat16(neg_inf));
    
    // First stage: request the upstream partial result. The boolean argument
    // presumably enables the read (see access_casc_stream.h -- TODO confirm
    // what is returned when it is false; the '+' path below relies on it).
    v16accfloat rec_sum = read_casc_stream( rowIdx < 5 ); 
    if (mode == 1 && !(rowIdx < 5))
    {
        // No read was requested: use -inf so the max below yields local_sums.
        rec_sum = v_neg_inf;
    }    

    v16accfloat write_inter_sum;
    v16accfloat glob_sum;
    v32bfloat16 maxop1, maxop2;
    if (mode == 0) {
        write_inter_sum = local_sums + rec_sum;
    } else {
        // The max is computed on v32bfloat16 conversions of the accumulators
        // and the result is converted back to v16accfloat.
        maxop1   = v32bfloat16(local_sums);
        maxop2   = v32bfloat16(rec_sum);
        write_inter_sum = v16accfloat(max_v32bfloat(maxop1, maxop2));
    }

    if ( rowIdx > 2 ) {
        // Forward the first-stage result on the cascade stream; nothing else
        // is done here until the final core-stream read.
        write_casc_stream(write_inter_sum);
    } else {
        // Second stage (rowIdx <= 2). casc_mode is set to 3 for the duration
        // of this stage and restored to 0 afterwards.
        // NOTE(review): the meaning of value 3 is not visible in this file;
        // presumably it changes the cascade direction -- confirm.
        casc_mode = 0x00000003;
        // Presumably keeps the scheduler from moving the following stream
        // access ahead of the casc_mode write -- TODO confirm argument meaning.
        chess_separator_scheduler(2);

        rec_sum = read_casc_stream( colIdx >= 1 );
        if (mode == 1 && !(colIdx >= 1))
        {
            // No read was requested for colIdx == 0: use the max identity.
            rec_sum = v_neg_inf;
        }

        // Same combine as the first stage, applied to the running result.
        if (mode == 0) {
            write_inter_sum = write_inter_sum + rec_sum;
        } else {
            maxop1   = v32bfloat16(write_inter_sum);
            maxop2   = v32bfloat16(rec_sum);
            write_inter_sum = v16accfloat(max_v32bfloat(maxop1, maxop2));
        }

        if ( colIdx != 3 ) {
            // Forward the running result, then restore casc_mode after the
            // scheduler separator.
            write_casc_stream(write_inter_sum);
            chess_separator_scheduler(4);
            casc_mode = 0x00000000;
        } else {
            // colIdx == 3 ends the chain: restore casc_mode and publish the
            // fully combined value.
            casc_mode = 0x00000000;

            glob_sum = write_inter_sum;
            write_core_stream_inline(glob_sum); // Push global reduced sum to core stream
        }
    }
    // Every caller except (rowIdx == 2, colIdx == 3) takes its return value
    // from the core stream.
    if ( rowIdx != 2 || colIdx != 3 ) {
        //event0();
        glob_sum = read_core_stream_inline();
    }

    return glob_sum;
}


#endif
