#include "elemwise_qdq.hpp"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <string>
#include <type_traits>
#ifndef __TXNRT__
#include <adf.h>
#include <adf/adf_api/AIERuntimeControl.h>
#include "super.hh"
#include "graph.hpp"
#endif // __TXNRT__
#if defined(__AIESIM__) || defined(__TXNRT__)
#include "dma.hpp"
#endif // __AIESIM__ || __TXNRT__

// using Telem = uint16_t;

// Compile-time selection of the input (Tin) and output (Tout) element types.
// IS_INT16 == 1 forces both to uint16_t. Otherwise the choice follows QDQ_MODE
// (0: dequant, 1: quant, 2: both, 3: none — as listed in main()):
//   mode 0: uint8_t  -> uint16_t
//   mode 1: uint16_t -> uint8_t
//   mode 2: uint8_t  -> uint8_t
//   mode 3: uint8_t  -> uint8_t
// Any other QDQ_MODE value is rejected at preprocessing time.
// #if IS_SIGNED == 0
#if IS_INT16 == 1
        using Tin = uint16_t;
        using Tout = uint16_t;
#else
    #if QDQ_MODE == 3
        using Tin = uint8_t;
        using Tout = uint8_t;
    #elif QDQ_MODE == 0
        using Tin = uint8_t;
        using Tout = uint16_t;
    #elif QDQ_MODE == 1
        using Tin = uint16_t;
        using Tout = uint8_t;
    #elif QDQ_MODE == 2
        using Tin = uint8_t;
        using Tout = uint8_t;
    #else
        #error "INVALID QDQ_MODE"
    #endif

#endif
// #else
//     #if IS_INT16 == 1
//         #if QDQ_MODE == 0
//             using Tin = int16_t;
//             using Tout = uint16_t;
//         #elif QDQ_MODE == 1
//             using Tin = uint16_t;
//             using Tout = int16_t;
//         #else
//             using Tin = int16_t;
//             using Tout = int16_t;
//         #endif
//     #else
//     #if QDQ_MODE == 3
//         using Tin = int8_t;
//         using Tout = int8_t;
//     #elif QDQ_MODE == 0
//         using Tin = int8_t;
//         using Tout = int16_t;
//     #elif QDQ_MODE == 1
//         using Tin = int16_t;
//         using Tout = int8_t;
//     #elif QDQ_MODE == 2
//         using Tin = int8_t;
//         using Tout = int8_t;
//     #else
//         #error "INVALID QDQ_MODE"
//     #endif
//     #endif
// #endif


// Element type of the QDQ parameter buffer (qdq_param / aie_qdq in main()).
using Welem = int32_t;


/// Non-owning 4-D view over a caller-supplied buffer of T.
///
/// Elements are addressed as (n, y, x, c) with c varying fastest, i.e.
/// idx = ((n * Y + y) * X + x) * C + c. The view never allocates or frees
/// `data`; the caller keeps ownership of the buffer.
template<typename T>
struct ActTensor
{
    int const C;
    int const Y;
    int const X;
    int const N;
    T* const data;

    /// Wraps `data` (reinterpreted as T*) as a C x Y x X x N tensor.
    ActTensor(int C, int Y, int X, int N, void* data)
        : C(C)
        , Y(Y)
        , X(X)
        , N(N)
        , data(static_cast<T*>(data))
    {}

    /// Returns a reference to element (c, y, x, n); bounds are checked with
    /// assert() only, so release builds perform no checking.
    T& at(int c, int y, int x, int n)
    {
        assert(n >= 0 && n < N);
        assert(c >= 0 && c < C);
        assert(y >= 0 && y < Y);
        assert(x >= 0 && x < X);
        int idx = (n * Y * X * C) + (y * X * C) + (x * C) + c;
        assert(idx < C * Y * X * N);
        return data[idx];
    }

    /// Prints the tensor to std::cout, one X-row per line, optionally preceded
    /// by `msg`. Integral element types are printed as numbers (never as
    /// characters).
    void print(char const* msg = nullptr)
    {
        if (msg != nullptr) {
            std::cout << msg;
        }
        for (int n = 0; n < N; ++n) {
            for (int c = 0; c < C; ++c) {
                for (int y = 0; y < Y; ++y) {
                    for (int x = 0; x < X; ++x) {
                        if constexpr (std::is_integral<T>::value) {
                            std::cout << static_cast<int64_t>(at(c, y, x, n)) << " ";
                        } else {
                            std::cout << at(c, y, x, n) << " ";
                        }
                    }
                    std::cout << "\n";
                }
                std::cout << "\n";
            }
        }
    }

    /// Fills the tensor with pseudo-random values drawn from [min_f, max_f]
    /// using rand(). When QDQ_MODE == 1 each value is stored via
    /// float_to_bfloat16(); otherwise it is stored via quantize<T>(value,
    /// scale, zp). The n/c/y/x visiting order fixes the rand() sequence.
    void init_random(uint16_t zp, float scale,
                     float min_f = 0.0f, float max_f = 128.0f)
    {
        float rnd_data;
        for (int n = 0; n < N; ++n) {
            for (int c = 0; c < C; ++c) {
                for (int y = 0; y < Y; ++y) {
                    for (int x = 0; x < X; ++x) {
                        rnd_data = ((max_f - min_f) * (rand() / float(RAND_MAX))) + min_f;
                        T tmp = (QDQ_MODE== 1) ? float_to_bfloat16(rnd_data) : quantize<T>(rnd_data, scale, zp );
                        at(c, y, x, n) = tmp;
                    }
                }
            }
        }
    }

    /// Writes one averaging weight per tensor position into the region that
    /// starts N*Y*X*C elements past `data` (the caller must have allocated
    /// that extra space). Each weight is 1 / (rows * cols), stored via
    /// float_to_bfloat16(), where rows/cols are the kernel extents reduced by
    /// the padding that applies at the border:
    ///
    ///   first row    : rows = Ky - Py_b      first column : cols = Kx - Px_b
    ///   last row     : rows = Ky - Py_a      last column  : cols = Kx - Px_a
    ///   elsewhere    : rows = Ky             elsewhere    : cols = Kx
    ///
    /// Corners therefore combine both reductions, e.g. the left-upper corner
    /// gets 1 / ((Ky-Py_b) * (Kx-Px_b)). When Y == 1 (or X == 1) the "first"
    /// rule wins.
    void init_wgt(int Ky, int Kx, int Py_b, int Px_b, int Py_a, int Px_a)
    {
        int const wgt_offset = N * Y * X * C;
        for (int n = 0; n < N; ++n) {
            for (int c = 0; c < C; ++c) {
                for (int y = 0; y < Y; ++y) {
                    int const rows = (y == 0)     ? (Ky - Py_b)
                                   : (y == Y - 1) ? (Ky - Py_a)
                                                  : Ky;
                    for (int x = 0; x < X; ++x) {
                        int const cols = (x == 0)     ? (Kx - Px_b)
                                       : (x == X - 1) ? (Kx - Px_a)
                                                      : Kx;
                        int const idx = (n * Y * X * C) + (y * X * C) + (x * C) + c;
                        // Floating-point division: an integer `1 / k` would
                        // truncate every weight with k > 1 to zero.
                        float const wgt = 1.0f / static_cast<float>(rows * cols);
                        data[wgt_offset + idx] = float_to_bfloat16(wgt);
                    }
                }
            }
        }
    }


    /// Size in bytes of a C x Y x X x N tensor of T.
    static int size(int C, int Y, int X, int N)
    {
        return C * Y * X * N * sizeof(T);
    }
};



/// Quantizes N*H*W*C bfloat16 values (held in `in_data` as 16-bit patterns)
/// into `out_data`.
///
/// Each value is converted with bfloat16_to_float(), multiplied by the inverse
/// scale read from qdq_param[3], rounded, offset by the zero point in
/// qdq_param[2] and saturated to the representable range of Tout.
/// Tout must be one of int8_t, uint8_t, int16_t or uint16_t.
template<typename Tin, typename Tout>
void quant_bfloat16(const Tin* in_data, Tout* out_data,
                     int N, int H, int W, int C,
                     const Welem* qdq_param)
{
    static_assert(std::is_same_v<Tout, int8_t>  || std::is_same_v<Tout, uint8_t> ||
                  std::is_same_v<Tout, int16_t> || std::is_same_v<Tout, uint16_t>,
                  "Unsupported output type");

    // Saturation bounds of the destination type, expressed as floats.
    constexpr float lowest  = static_cast<float>(std::numeric_limits<Tout>::min());
    constexpr float highest = static_cast<float>(std::numeric_limits<Tout>::max());

    const float inverse_scale = bfloat16_to_float(qdq_param[3]);
    const int32_t zero_point = qdq_param[2];  // kept wide so the add cannot wrap

    const int count = N * H * W * C;
    for (int idx = 0; idx < count; ++idx) {
        const float as_float = bfloat16_to_float(static_cast<uint16_t>(in_data[idx]));
        const float shifted = std::round(as_float * inverse_scale) + zero_point;
        out_data[idx] = static_cast<Tout>(std::clamp(shifted, lowest, highest));
    }
}


/// Fills the first six entries of the QDQ parameter buffer `data`:
///
///   data[0] : dequant zero point            data[1] : dequant scale (bfloat16)
///   data[2] : quant zero point              data[3] : 1 / quant scale (bfloat16)
///   data[4] : dequant enable (0/1)          data[5] : quant enable (0/1)
///
/// `qdq_mode` selects the enables: 0 = dequant only, 1 = quant only,
/// 2 = both, 3 = neither. Any other value prints "ERROR MODE!" and leaves both
/// stages disabled so the buffer never carries indeterminate values.
/// The zero point and scale are currently fixed at 0 and 1.0 (the
/// compute_scale_and_zp-based derivation is disabled).
template<typename T>
void init_qdq_mat(T* data, int qdq_mode) {
    int32_t const zp = 0;
    float const scale = 1.0f;

    // Dequant parameters.
    data[0] = zp;
    data[1] = float_to_bfloat16 (scale);
    // Quant parameters; the scale is stored inverted.
    data[2] = zp;
    data[3] = float_to_bfloat16 (1/scale);

    bool dq_enable = false;
    bool q_enable = false;
    switch (qdq_mode) {
    case 0:
        dq_enable = true;
        break;
    case 1:
        q_enable = true;
        break;
    case 2:
        dq_enable = true;
        q_enable = true;
        break;
    case 3:
        break;
    default:
        std::cout << "ERROR MODE!" << std::endl;
        break;
    }
    data[4] = dq_enable ? 1 : 0;
    data[5] = q_enable ? 1 : 0;
}

// template<typename Tin, typename Tout>
// void cpu_pooling_2d(
//     ActTensor<Tin> ifm,
//     ActTensor<Tout> ofm,
//     int max_or_avg,  //0: max, 1: avg
//     int Ky, int Kx,
//     int Sy, int Sx,
//     int Py_b, int Px_b, int Py_a, int Px_a,
//     int qdq_mode,
//     Welem* qdq_param
//     )
// {

//     float q_inv_or_s = (1/bfloat16_to_float(qdq_param[3]));
//     uint16_t q_z = qdq_param[2];
//     float dq_inv_or_s = bfloat16_to_float(qdq_param[1]);
//     uint16_t dq_z = qdq_param[0];
//     float tmp_f;
//     Tin tmp_int_in;
//     Tout tmp_int_out;

//     for (int co = 0; co < ofm.C; ++co){  //Ci==Co
//         for (int yo= 0; yo < ofm.Y; ++yo){
//             for (int xo = 0; xo < ofm.X; ++xo){
//                 int acc = (max_or_avg == 0) ? std::numeric_limits<int>::min() : 0;
//                 int count = 0;
//                 for (int ky = 0; ky < Ky; ++ky){
//                     for (int kx = 0; kx < Kx; ++kx){
//                         int yi = yo * Sy + ky - Py_b;
//                         int xi = xo * Sx + kx - Px_b;
//                         if (yi >= 0 && yi < ifm.Y && xi >= 0 && xi < ifm.X){ //zero-padding
//                             tmp_int_in = ifm.at(co, yi, xi);
//                             if (qdq_mode == 0 || qdq_mode == 2) {//dq
//                                     acc = std::max(static_cast<Tin>(acc), tmp_int_in); //maxpool
//                                     tmp_f = dequantize<Tout>(static_cast<Tin>(acc), dq_inv_or_s, dq_z);
//                                     tmp_int_out = float_to_bfloat16(tmp_f);
//                             }
//                             if (qdq_mode == 1){//q only
//                                 tmp_int_out = quantize<Tin>(bfloat16_to_float(tmp_int_in), q_inv_or_s, q_z);
//                                 acc = std::max(static_cast<Tout>(acc), tmp_int_out);//maxpool
//                                 tmp_int_out = static_cast<Tout>acc;
//                             }
//                             if (qdq_mode == 2) { //dq then q
//                                 tmp_int_out = quantize<Tout>(tmp_f, q_inv_or_s, q_z);
//                             }
//                             if (qdq_mode == 3){
//                                 acc = std::max(static_cast<Tin>(acc), tmp_int_in);
//                                 tmp_int_out = static_cast<Tout>acc;
//                             }
//                             // if (max_or_avg == 0){
//                             // acc = std::max(static_cast<Tin>(acc), tmp_int_in);
//                             // }else{
//                             //     acc += a;
//                             //     ++count;
//                             // }
//                         }
//                     }
//                 }
//                 // if (max_or_avg == 1 && count > 0){
//                 //     acc /= count;
//                 // }
//                 scratch.at(co, yo, xo) = tmp_int_out;
//             }
//         }
//     }
// }

/// CPU reference for 2-D max pooling with optional dequantize / quantize.
///
/// For every output position the Ky x Kx window (strides Sy/Sx, leading
/// padding Py_b/Px_b) is scanned; taps that fall outside `ifm` are skipped.
/// `qdq_mode` selects the data path:
///   0 : dequantize each tap, take the max, store it as bfloat16
///   1 : taps are bfloat16; convert to float, take the max, quantize to Tout
///   2 : dequantize each tap, take the max, quantize to Tout
///   other (3) : max over the raw integer values, stored unchanged
///
/// qdq_param layout as read here: [0] dequant zero point, [1] dequant scale
/// (bfloat16), [2] quant zero point, [3] inverse quant scale (bfloat16).
///
/// NOTE(review): a window that lies entirely in the padding keeps the initial
/// accumulator (-1e30f / INT_MIN); callers are assumed to avoid such shapes.
template<typename Tin, typename Tout>
void cpu_maxpool_2d(
    ActTensor<Tin> ifm,
    ActTensor<Tout> ofm,
    int Ky, int Kx,
    int Sy, int Sx,
    int Py_b, int Px_b,
    int qdq_mode,
    Welem* qdq_param
) {
    float const dq_s = bfloat16_to_float(qdq_param[1]);        // dequant scale
    uint16_t const dq_z = qdq_param[0];                        // dequant zero point
    // qdq_param[3] holds 1/scale; invert it to recover the scale that
    // quantize() is given elsewhere in this file.
    float const q_s = 1.0f / bfloat16_to_float(qdq_param[3]);
    uint16_t const q_z = qdq_param[2];                         // quant zero point

    bool const dequant_input = (qdq_mode == 0 || qdq_mode == 2);
    bool const bfloat_input  = (qdq_mode == 1);
    bool const quant_output  = (qdq_mode == 1 || qdq_mode == 2);

    for (int n = 0; n < ofm.N; ++n) {
        for (int c = 0; c < ofm.C; ++c) {
            for (int yo = 0; yo < ofm.Y; ++yo) {
                for (int xo = 0; xo < ofm.X; ++xo) {

                    float acc_f = -1e30f;
                    int acc_i = std::numeric_limits<int>::min();

                    for (int ky = 0; ky < Ky; ++ky) {
                        int const yi = yo * Sy + ky - Py_b;
                        if (yi < 0 || yi >= ifm.Y)
                            continue;
                        for (int kx = 0; kx < Kx; ++kx) {
                            int const xi = xo * Sx + kx - Px_b;
                            if (xi < 0 || xi >= ifm.X)
                                continue;

                            Tin const val_q = ifm.at(c, yi, xi, n);

                            if (dequant_input) {
                                acc_f = std::max(acc_f, dequantize(val_q, dq_s, dq_z));
                            } else if (bfloat_input) {
                                // Compare real values, not bit patterns:
                                // the raw bfloat16 encoding is not ordered
                                // like an integer for negative numbers.
                                acc_f = std::max(
                                    acc_f,
                                    bfloat16_to_float(static_cast<uint16_t>(val_q)));
                            } else {
                                acc_i = std::max(acc_i, static_cast<int>(val_q));
                            }
                        }
                    }

                    if (qdq_mode == 0) {
                        // Dequant only: keep the float result, stored as bfloat16.
                        ofm.at(c, yo, xo, n) = float_to_bfloat16(acc_f);
                    } else if (quant_output) {
                        // Quantize the pooled float value into the output type.
                        ofm.at(c, yo, xo, n) = quantize<Tout>(acc_f, q_s, q_z);
                    } else {
                        // No QDQ: pass the integer max through unchanged.
                        ofm.at(c, yo, xo, n) = static_cast<Tout>(acc_i);
                    }
                }
            }
        }
    }
}



/// Writes `size` bytes starting at `data` to `filename` in binary mode,
/// replacing any existing file. Failures to open or write are reported on
/// std::cerr instead of being silently ignored; the function still returns
/// normally so callers are unaffected.
void write_bin_file(std::string filename, char* data, size_t size)
{
    std::ofstream file(filename, std::ios::out | std::ios::binary);
    if (!file) {
        std::cerr << "ERROR: cannot open '" << filename << "' for writing\n";
        return;
    }
    file.write(data, static_cast<std::streamsize>(size));
    if (!file) {
        std::cerr << "ERROR: failed to write " << size << " bytes to '"
                  << filename << "'\n";
    }
}

/// Compares `received` against `expected` element by element.
///
/// An element whose absolute difference exceeds `epsilon` is reported as an
/// ERROR and counted; a non-zero difference within `epsilon` is reported as a
/// WARNING only. A summary ("Error Count", "DI: PASS"/"DI: FAIL" plus the
/// tensor shape) is printed to std::cout.
///
/// Values are printed numerically: narrow integer types are promoted before
/// streaming so that 8-bit elements do not show up as characters.
/// The index tuple in each message is printed in the order n, c, x, y.
///
/// @return the number of elements that differ by more than `epsilon`.
template<typename T>
int check_result(
    ActTensor<T> expected,
    ActTensor<T> received,
    int epsilon = 0)
{
    assert(expected.C == received.C);
    assert(expected.Y == received.Y);
    assert(expected.X == received.X);
    assert(expected.N == received.N);

    int err_count = 0;
    for (int n = 0; n < expected.N; ++n) {
        for (int c = 0; c < expected.C; ++c) {
            for (int y = 0; y < expected.Y; ++y) {
                for (int x = 0; x < expected.X; ++x) {
                    // Unary plus applies the integral promotions, so uint8_t /
                    // int8_t become int both for the subtraction and printing.
                    auto const want = +expected.at(c, y, x, n);
                    auto const got = +received.at(c, y, x, n);

                    int diff = want - got;
                    diff = (diff < 0) ? -diff : diff;

                    auto const report = [&](char const* severity) {
                        std::cout << severity << ": [" << n << ", " << c << ", " << x << ", " << y << "]: "
                                  << "Expected: " << want << ", "
                                  << "Received: " << got << "\n";
                    };

                    if (diff > epsilon) {
                        err_count += 1;
                        report("ERROR");
                    } else if (diff > 0) {
                        report("WARNING");
                    }
                }
            }
        }
    }
    std::cout << "Error Count = " << err_count << "\n";
    std::cout << ((err_count > 0) ? "DI: FAIL " : "DI: PASS ")
              << expected.C << "x" << expected.Y << "x" << expected.X << "x" << expected.N <<  "\n";

    return err_count;
}



#ifndef __TXNRT__
// Global graph instance for non-TXNRT builds; main() calls init() on it, passes
// it to run_dma_layer_config() and then calls end().
ComputeGraph g_compute_graph;
#endif // __TXNRT__

int main(void)
{
    srand(0xABCD);

    int Ni = N_IN;
    int Ci = C_IN;
    int Yi = Y_IN;
    int Xi = X_IN;
    int No = N_OUT;
    int Co = C_OUT;
    int Yo = Y_OUT;
    int Xo = X_OUT;
    int Ky = KERNEL_Y;
    int Kx = KERNEL_X;
    int Sy = STRIDE_Y;
    int Sx = STRIDE_X;
    int Py_b = PAD_Y_BEFORE;
    int Px_b = PAD_X_BEFORE;
    int Py_a = PAD_Y_AFTER;
    int Px_a = PAD_X_AFTER;
    // int constexpr Cis = C_IN_SUBV;
    // int constexpr Cos = C_OUT_SUBV;
    // int constexpr Co_split = C_OUT_SPLIT;
    int constexpr max_or_avg = MAX_OR_AVG;

    if (Ci != Co){
        std::cerr << "Error: Ci and Co should be equal!\n";
        assert (false);
    }


    int qdq_size  = QDQ_SIZE;
    int qdq_mode = QDQ_MODE; //0: DEQUANT; 1: QUANT; 2: BOTH; 3: NONE

    int ifm_size = ActTensor<Tin>::size(Ci, Yi, Xi, Ni);
    int wgt_size = ActTensor<Welem>::size(Ci, Yi, Xi, Ni);
    int ofm_size = ActTensor<Tout>::size(Co, Yo, Xo, No);
    int ifm_combined_size = max_or_avg ? (ifm_size + wgt_size) : ifm_size;

#ifdef __TXNRT__
    ActTensor<Tin> aie_ifm(Ci, Yi, Xi, Ni, malloc(ifm_combined_size));
    auto aie_qdq = static_cast<Welem*>(malloc(qdq_size));
    ActTensor<Tout> aie_ofm(Co, Yo, Xo, No, malloc(ofm_size));
#else
    ActTensor<Tin> aie_ifm(Ci, Yi, Xi, Ni, adf::GMIO::malloc(ifm_combined_size));
    auto aie_qdq = static_cast<Welem*>(adf::GMIO::malloc(qdq_size));
    ActTensor<Tout> aie_ofm(Co, Yo, Xo, No, adf::GMIO::malloc(ofm_size));
#endif // __TXNRT__
    ActTensor<Tout> cpu_ofm(Co, Yo, Xo, No, malloc(ofm_size));

    Welem qdq_param[QDQ_SIZE];
    if (QDQ_MODE == 0 || QDQ_MODE == 1 || QDQ_MODE == 2 || QDQ_MODE == 3){
        init_qdq_mat<Welem>(qdq_param, QDQ_MODE);
        printf("qdq_mode : %d\n", QDQ_MODE);
        memcpy(aie_qdq, (void*)qdq_param, qdq_size);
    }

    uint16_t zp = qdq_param[0];
    float scale = bfloat16_to_float(qdq_param[1]);
    aie_ifm.init_random(zp, scale);
    if (max_or_avg){
        aie_ifm.init_wgt(Ky, Kx, Py_b, Px_b, Py_a, Px_a);
    }

    /*
    sequence:
    1.  maxpool -> dq -> q
    2.  dq -> avgpool -> q
    */
    int N = 1;
    ActTensor<Tout> cpu_out_mat = cpu_ofm;

    // if (max_or_avg){
    //     if (QDQ_MODE == 0 || QDQ_MODE ==2)  {
    //         dequant<Tin, uint16_t>(scratch_buf.data, dq_out_buf.data, N, Yo, Xo, Co, qdq_param);
    //     }
    // }

    // cpu_pooling_2d<Tin, Tout>(aie_ifm, cpu_ofm , max_or_avg, Ky, Kx, Sy, Sx, Py_b, Px_b, Py_a, Px_a, qdq_mode, qdq_param);
    cpu_maxpool_2d<Tin, Tout>(aie_ifm, cpu_ofm, Ky, Kx, Sy, Sx, Py_b, Px_b, /*Py_a, Px_a,*/ qdq_mode, qdq_param);

    // if (max_or_avg == 0){
    //     if (QDQ_MODE == 0) {
    //         dequant<Tin, uint16_t>(scratch_buf.data, dq_out_buf.data, N, Yo, Xo, Co, qdq_param);
    //     }
    //     else if (QDQ_MODE == 1) {
    //         quant_bfloat16<Tin, Tout>(scratch_buf.data, cpu_out_mat.data, N, Yo, Xo, Co, qdq_param);
    //     }
    //     else if (QDQ_MODE == 2) {
    //         dequant<Tin, uint16_t>(scratch_buf.data, dq_out_buf.data, N, Yo, Xo, Co, qdq_param);
    //         quant_bfloat16<uint16_t, Tout>(dq_out_buf.data, cpu_out_mat.data, N, Yo, Xo, Co, qdq_param);
    //     }else if (QDQ_MODE == 3){
    //         memcpy(cpu_out_mat.data, scratch_buf.data, ofm_size);
    //     }
    // } else if (max_or_avg == 1){
    //     if (QDQ_MODE == 0 || QDQ_MODE ==2)  {
    //         quant_bfloat16<uint16_t, Tout>(dq_out_buf.data, cpu_out_mat.data, N, Yo, Xo, Co, qdq_param);
    //     }
    //     else if (QDQ_MODE == 1){
    //         quant_bfloat16<Tin, Tout>(scratch_buf.data, cpu_out_mat.data, N, Yo, Xo, Co, qdq_param);
    //     }
    //     else if (QDQ_MODE == 3){
    //         memcpy(cpu_out_mat.data, scratch_buf.data, ofm_size);
    //     }
    // } else{ // not supported
    //     std::cerr << "Error: only support max(0) and avg(1) pooling\n";
    //     assert (false);
    // }


#if defined(__AIESIM__) || defined(__TXNRT__)
    #ifdef __TXNRT__
            DmaBins bins = run_dma_layer_config();
            bins.save();
            write_bin_file("ifm.bin", reinterpret_cast<char*>(aie_ifm.data), ifm_combined_size);
            write_bin_file("wgt.bin", reinterpret_cast<char*>(aie_qdq), qdq_size);
            write_bin_file("ofm.bin", reinterpret_cast<char*>(cpu_ofm.data), ofm_size);
    #else
    aie_ifm.print("AIE IFM =\n");
    cpu_ofm.print("CPU OFM =\n");
    g_compute_graph.init();
    run_dma_layer_config(g_compute_graph, aie_ofm.data, aie_ifm.data, aie_qdq);
    g_compute_graph.end();
    aie_ofm.print("AIE OFM =\n");
    int err_count = check_result(cpu_out_mat, aie_ofm);
    if (err_count == 0) {
        printf("DI: PASS\n");
    } else {
        printf("DI: FAIL\n");
    }
    printf("Error Count = %d\n", err_count);
    #endif // __TXNRT__
#endif // __AIESIM__ || __TXNRT__

    #ifdef __TXNRT__
        free(aie_ifm.data);
        free(aie_qdq);
        free(aie_ofm.data);
		// free(scratch_buf.data);
		// free(dq_out_buf.data);
    #else
        adf::GMIO::free(aie_ifm.data);
        adf::GMIO::free(aie_qdq);
        adf::GMIO::free(aie_ofm.data);
		// adf::GMIO::free(scratch_buf.data);
		// adf::GMIO::free(dq_out_buf.data);
    #endif // __TXNRT__
        free(cpu_ofm.data);

    #ifndef __TXNRT__
        assert(false);
    #endif // __TXNRT__
    return 0;
}
