/******************************************************************************
 * Subgraph CERT Sim
 *
 * Purpose:
 *   Minimal host-side driver to run CERT Sim on a *single compiled subgraph*
 *   and (optionally) compare AIE-produced OFM vs a reference CPU OFM.
 *
 * Typical flow:
 *   1) Allocate host buffers for IFM/WGT/OFM
 *   2) Load model-produced reference binaries into those buffers (IFM/WGT/CPU OFM)
 *   3) Invoke run_cert_sim() to execute the subgraph in CERT Sim (AIE-Sim)
 *   4) Optionally compare AIE OFM against CPU OFM (supports reading AIE OFM at an offset)
 *
 * Build modes:
 *   - !ASM_MODE: uses adf::GMIO::malloc/free (preferred in AIE-Sim / runtime integration)
 *   -  ASM_MODE: uses malloc/free and can dump buffers to .bin files for inspection
 *
 * Notes:
 *   - Tensor dims (N/Y/X/C) are only needed for print/compare helpers.
 *   - Sizes (bytes) must match the exact binary files produced for the subgraph.
 ******************************************************************************/

#include <cstddef>
#include <cstdio>
#include <cstdlib>

#if !ASM_MODE
#include <adf.h>
#include <adf/adf_api/AIERuntimeControl.h>
#include "super.hh"
#include "graph.hpp"
#endif // !ASM_MODE
#ifdef __AIESIM__
#if !ASM_MODE
#include "dma.hpp"
#endif // !ASM_MODE
#endif // __AIESIM__

#include "common.hpp"
#include "pdi.hpp"

#if !ASM_MODE
// Graph object handed to run_cert_sim() from main(); only compiled when
// ASM_MODE is off.
// NOTE(review): ComputeGraph is presumably declared in graph.hpp — confirm.
ComputeGraph g_compute_graph;
#endif // !ASM_MODE

namespace {

// Allocates `bytes` bytes for one of the IFM/WGT/OFM buffers.
// Uses adf::GMIO::malloc when ASM_MODE is off and std::malloc when it is on,
// so the allocator choice lives in exactly one place.
// Returns nullptr for a negative size (which would otherwise wrap to a huge
// size_t request) or when the underlying allocator returns nullptr.
void* alloc_io_buffer(int bytes)
{
    if (bytes < 0) {
        return nullptr;
    }
#if !ASM_MODE
    return adf::GMIO::malloc(static_cast<std::size_t>(bytes));
#else
    return std::malloc(static_cast<std::size_t>(bytes));
#endif // !ASM_MODE
}

// Releases a buffer obtained from alloc_io_buffer(). A null pointer is ignored.
void free_io_buffer(void* buffer)
{
    if (buffer == nullptr) {
        return;
    }
#if !ASM_MODE
    adf::GMIO::free(buffer);
#else
    std::free(buffer);
#endif // !ASM_MODE
}

// True when a non-empty buffer was requested but no memory was obtained.
// A null result for a zero-byte request is not treated as a failure, since
// malloc(0) is allowed to return a null pointer.
bool alloc_failed(const void* buffer, int bytes)
{
    return bytes != 0 && buffer == nullptr;
}

} // namespace

// Driver entry point: allocates the IFM/WGT/OFM buffers, optionally dumps them
// (ASM_MODE) or runs the subgraph through run_cert_sim() (AIE-Sim build with
// USE_CERT_LIBRARY), then releases everything.
//
// Returns 0 on success and 1 if any non-empty buffer could not be allocated.
int main(void)
{
    // Number of ops included in the compiled subgraph. Used to size the param blob.
    // If the subgraph includes K ops and each contributes 12288 bytes, set K here.
    const int NUM_OPS_IN_SUBGRAPH = 1;
    const std::size_t PRM_BYTES_PER_OP = 12288;
    // Multiply in size_t so a large op count cannot overflow int arithmetic.
    std::size_t prm_size = PRM_BYTES_PER_OP * static_cast<std::size_t>(NUM_OPS_IN_SUBGRAPH);
    (void)prm_size; // only consumed when the run_cert_sim() call below is compiled in

    // (Optional) Tensor shapes for debugging/verification helpers.
    // Only required if you enable print_tensor_nyxc() or cmp_tensor_nyxc().
    //
    // int Nin = 0, Yin = 0, Xin = 0, Cin = 0;
    // int Nout = 0, Yout = 0, Xout = 0, Cout = 0;

    // Buffer sizes in BYTES. Must exactly match the binary files you load.
    int ifm_size = 0;       // bytes in input activation binary
    int wgt_size = 0;       // bytes in weight binary
    int aie_ofm_size = 0;   // bytes written by subgraph into AIE OFM buffer
    int cpu_ofm_size = 0;   // bytes in reference output binary (CPU/golden)

    void* aie_ifm = alloc_io_buffer(ifm_size);
    void* aie_wgt = alloc_io_buffer(wgt_size);
    void* aie_ofm = alloc_io_buffer(aie_ofm_size);
    // The reference OFM is only read on the host, so it always uses std::malloc.
    void* cpu_ofm = (cpu_ofm_size < 0)
                        ? nullptr
                        : std::malloc(static_cast<std::size_t>(cpu_ofm_size));

    int exit_code = 0;
    const bool any_alloc_failed = alloc_failed(aie_ifm, ifm_size) ||
                                  alloc_failed(aie_wgt, wgt_size) ||
                                  alloc_failed(aie_ofm, aie_ofm_size) ||
                                  alloc_failed(cpu_ofm, cpu_ofm_size);

    if (any_alloc_failed) {
        // Bail out before any buffer is handed to file I/O or the simulator.
        std::fprintf(stderr,
                     "buffer allocation failed (ifm=%d, wgt=%d, aie_ofm=%d, cpu_ofm=%d bytes)\n",
                     ifm_size, wgt_size, aie_ofm_size, cpu_ofm_size);
        exit_code = 1;
    } else {
        // Load binaries generated from your model/debug pipeline.
        // - IFM: subgraph input activation
        // - WGT: subgraph weights (or fused weights for that subgraph)
        // - CPU OFM: golden/reference output for correctness comparison
        //
        // read_bin_file(std::string("/abs/path/to/ifm.bin"), reinterpret_cast<char*>(aie_ifm), ifm_size);
        // read_bin_file(std::string("/abs/path/to/wgt.bin"), reinterpret_cast<char*>(aie_wgt), wgt_size);
        // read_bin_file(std::string("/abs/path/to/cpu_ofm.bin"), reinterpret_cast<char*>(cpu_ofm), cpu_ofm_size);

#if ASM_MODE
        // Dump the buffers for inspection. Note that "ofm.bin" holds the CPU
        // reference output, not an AIE-produced one.
        // NOTE(review): until the read_bin_file() calls above are enabled, the
        // dumped bytes are whatever the allocator returned (uninitialized).
        write_bin_file("ifm.bin", static_cast<char*>(aie_ifm), ifm_size);
        write_bin_file("wgt.bin", static_cast<char*>(aie_wgt), wgt_size);
        write_bin_file("ofm.bin", static_cast<char*>(cpu_ofm), cpu_ofm_size);
#endif // ASM_MODE

#ifdef __AIESIM__
#if !ASM_MODE
        // Optional: print IFM in NYXC order for debugging (enable only when dims are set).
        // print_tensor_nyxc<uint16_t>(aie_ifm, Nin, Yin, Xin, Cin);
#if USE_CERT_LIBRARY
        // Run the compiled subgraph in CERT Sim (AIE-Sim).
        // aie_ofm: output buffer produced by the subgraph
        // aie_ifm: input activation buffer
        // aie_wgt: weight buffer
        // prm_size: parameter blob size for the subgraph (depends on NUM_OPS_IN_SUBGRAPH)
        run_cert_sim(g_compute_graph,
                     aie_ofm, aie_ofm_size,
                     aie_ifm, ifm_size,
                     aie_wgt, wgt_size,
                     prm_size);
#endif // USE_CERT_LIBRARY

        // Optional correctness check:
        // - threshold: absolute tolerance per element (tune for datatype / quantization)
        // - read_aie_ofm_from_offset: if the subgraph writes OFM into a larger BO, set byte offset here
        //
        // int threshold = 5000;
        // int read_aie_ofm_from_offset = 0; // bytes
        // int err = cmp_tensor_nyxc<uint16_t>(
        //     cpu_ofm,
        //     static_cast<const void*>(static_cast<const uint8_t*>(aie_ofm) + read_aie_ofm_from_offset),
        //     Nout, Yout, Xout, Cout,
        //     threshold
        // );
        // printf(err == 0 ? "DI_PASS\n" : "DI_FAIL\n");

#endif // !ASM_MODE
#endif // __AIESIM__
    }

    // Single cleanup path shared by the success and failure cases.
    free_io_buffer(aie_ifm);
    free_io_buffer(aie_wgt);
    free_io_buffer(aie_ofm);
    std::free(cpu_ofm);
    return exit_code;
}