#ifndef TXN_PM_BIN_GEN_HPP
#define TXN_PM_BIN_GEN_HPP

#include <stdint.h>
#include <limits.h>
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include <vector>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <typeinfo>
#include <type_traits>

#if defined(__AIESIM__) || defined(__TXNRT__)
#include "xaiengine.h"
extern XAie_DevInst DevInst;

#define DEV_ROW_SHIFT (DevInst.DevProp.RowShift)
#define DEV_COL_SHIFT (DevInst.DevProp.ColShift)
#else

// Header placed at the start of an op record: it is the first member of
// PatchShimOp. No code visible in this file sets these fields individually.
struct op_base_st
{
    int temp;                   // meaning not visible in this file — TODO confirm with the consumer
    unsigned int size_in_bytes; // presumably the byte size of the enclosing op record — TODO confirm
};

// Argument record for the custom "patch" transaction op. A pointer to an
// instance plus sizeof(PatchShimOp) is handed to XAie_AddCustomTxnOp, so the
// raw bytes of this struct (layout included) are what the driver receives.
struct PatchShimOp
{
    op_base_st op;      // leading header, see op_base_st
    uint32_t action;    // caller-supplied action code (dump_txn_pm passes 0)
    uint64_t regAddr;   // shim BD register address, as produced by ComputePatchRegAddr
    uint64_t extBufId;  // caller-supplied buffer id (dump_txn_pm passes 0)
    uint64_t argplus;   // offset in bytes (AddDDRCustomOp stores offsetInBytes here)
};

uint64_t ComputePatchRegAddr(uint8_t shimColumn, uint8_t bdId)
{
    // get tile info
    XAie_LocType tileLoc = XAie_TileLoc(shimColumn, 0); //XAie_TileLoc(Col, Row)
    XAie_DmaDesc dmaInst;
    XRT_ERRCHK(XAie_DmaDescInit(&DevInst, &dmaInst, tileLoc));

    // compute reg addr
    //https://gitenterprise.xilinx.com/ai-engine/aie-rt/blob/main/driver/src/dma/xaie_dma_aieml.c#L1308
    // _XAie_GetTileAddr: https://gitenterprise.xilinx.com/ai-engine/aie-rt/blob/main/driver/src/common/xaie_helper.h
    u64 regAddr = dmaInst.DmaMod->BaseAddr 
        + bdId * dmaInst.DmaMod->IdxOffset 
        + _XAie_GetTileAddr(&DevInst, tileLoc.Row, tileLoc.Col) 
        + dmaInst.DmaMod->BdProp->Buffer->ShimDmaBuff.AddrLow.Idx * 4U;

    return regAddr;
}

/**
 * @brief Append a patch custom op for one shim BD to the current transaction.
 *
 * Fills a PatchShimOp and hands its raw bytes to XAie_AddCustomTxnOp.
 *
 * Declared `inline` because this definition lives in a header; without it,
 * including the header from more than one translation unit is an ODR
 * violation (duplicate symbol at link time).
 *
 * @param action        Action code stored in the op.
 * @param bufId         External buffer id stored in the op.
 * @param offsetInBytes Byte offset stored in the op's `argplus` field.
 * @param shimCol       Shim tile column whose BD is patched.
 * @param bdId          Buffer descriptor index within that shim tile.
 * @param patch_op_code Custom op code passed to XAie_AddCustomTxnOp.
 */
inline void AddDDRCustomOp(uint32_t action, uint64_t bufId, uint64_t offsetInBytes, uint8_t shimCol, uint8_t bdId, int32_t patch_op_code)
{
    PatchShimOp op;
    // The whole record is passed as sizeof(op) raw bytes below. Zero it first so
    // the header fields that are never assigned here (and any padding) are not
    // indeterminate, which would make the generated transaction non-deterministic.
    memset(&op, 0, sizeof(op));

    op.action = action;
    op.regAddr = ComputePatchRegAddr(shimCol, bdId);
    op.extBufId = bufId;
    op.argplus = offsetInBytes;
    XAie_AddCustomTxnOp(&DevInst, patch_op_code, static_cast<void *>(&op), sizeof(op));
}

#endif

// Row layout used by dump_txn_pm.
// M_AIE_CORE_ROWS is the number of shim buffer descriptors written per column.
// Core tiles are visited with `r = START; r < END`, i.e. the half-open range
// [M_AIE_START_CORE_ROW, M_AIE_END_CORE_ROW) = rows 2..5, which is
// M_AIE_CORE_ROWS rows.
#define M_AIE_CORE_ROWS 4
#define M_AIE_START_CORE_ROW 2
#define M_AIE_END_CORE_ROW 6

std::vector<uint8_t> dump_txn_pm(int nCols, int bin_offset, int pm_id)
{
    XAie_LocType ShimDma;

    XAie_StartTransaction(&DevInst, XAIE_TRANSACTION_DISABLE_AUTO_FLUSH);
    XAie_Txn_PmLoadStart(&DevInst, pm_id);
    // Reset Core tiles and Tile DMAs
    for (int c = 0; c < nCols; c++) {
       for (int r = M_AIE_START_CORE_ROW; r < M_AIE_END_CORE_ROW; r++) {
            XAie_CoreDisable(&DevInst, XAie_TileLoc(c, r));
            XAie_CoreReset(&DevInst, XAie_TileLoc(c, r));
            XAie_CoreUnreset(&DevInst, XAie_TileLoc(c, r));
            XAie_DmaChannelResetAll(&DevInst, XAie_TileLoc(c, r), DMA_CHANNEL_RESET);
            XAie_DmaChannelResetAll(&DevInst, XAie_TileLoc(c, r), DMA_CHANNEL_UNRESET);
       }
    }

    uint64_t offset = 0;
    for (int c = 0; c < nCols; c++) {
        ShimDma = XAie_TileLoc(c, 0);
        for (int r = 0; r < M_AIE_CORE_ROWS; r++) {
            int bd = r;
            {
               XAie_DmaDesc bd_shim_0_0_id1;
               XAie_DmaDimDesc bd_shim_0_0_id1_dims[1];
               XAie_DmaTensor bd_shim_0_0_id1_tensor;
               u64 bd_shim_0_0_id1_addr = 0;
               u32 bd_shim_0_0_id1_len = bin_offset;
               bd_shim_0_0_id1_dims[0].AieMlDimDesc.StepSize = 1;
               bd_shim_0_0_id1_dims[0].AieMlDimDesc.Wrap = 0;
               bd_shim_0_0_id1_tensor.NumDim = 1;
               bd_shim_0_0_id1_tensor.Dim = bd_shim_0_0_id1_dims;
               XAie_DmaDescInit(&DevInst, &bd_shim_0_0_id1, ShimDma);    
               XAie_DmaSetMultiDimAddr(&bd_shim_0_0_id1, &bd_shim_0_0_id1_tensor,
                                       bd_shim_0_0_id1_addr, bd_shim_0_0_id1_len);
               XAie_DmaEnableBd(&bd_shim_0_0_id1);
               XAie_DmaSetAxi(&bd_shim_0_0_id1, 0, 32, 0, 2, 0);
               if(r != M_AIE_CORE_ROWS - 1){
                    XAie_DmaSetNextBd(&bd_shim_0_0_id1, r+1, 1);
               }
               XAie_DmaWriteBd(&DevInst, &bd_shim_0_0_id1, ShimDma, bd);
            }    
            XAie_DmaDesc dmaInst;
            XAie_DmaDescInit(&DevInst, &dmaInst, ShimDma);   
            PatchShimOp op;
            op.action = 0;
            op.regAddr = dmaInst.DmaMod->BaseAddr + bd * dmaInst.DmaMod->IdxOffset +
                         _XAie_GetTileAddr(&DevInst, ShimDma.Row, ShimDma.Col) +
                         dmaInst.DmaMod->BdProp->Buffer->ShimDmaBuff.AddrLow.Idx * 4U;   
            op.extBufId = 0;
            op.argplus = offset;
            XAie_AddCustomTxnOp(&DevInst, (int32_t)XAIE_IO_CUSTOM_OP_DDR_PATCH,
                                (void *)&op, sizeof(op));    
            offset += bin_offset;
        }
        XAie_DmaChannelSetStartQueue(&DevInst, ShimDma, 0, DMA_MM2S, 0, 1, XAIE_DISABLE);
    }

    // Poll for completition after all BD Writes are done
    for (int c = 0; c < nCols; c++) {
        ShimDma = XAie_TileLoc(c, 0);
        XAie_DmaWaitForDone(&DevInst, ShimDma, 0, DMA_MM2S, 0);
    }

    // Enable all cores
    for (int c = 0; c < nCols; c++) {
        for (int r = M_AIE_START_CORE_ROW; r < M_AIE_END_CORE_ROW; r++) {
            XAie_CoreEnable(&DevInst, XAie_TileLoc(c, r));
        }
    }
    XAie_Txn_PmLoadEnd(&DevInst);
    uint8_t *txn_ptr = XAie_ExportSerializedTransaction(&DevInst, 0, 0);
    XAie_TxnHeader *Hdr = (XAie_TxnHeader *)txn_ptr;
    auto size = Hdr->TxnSize;

    std::vector<uint8_t> txn(size, 0);
    memcpy((void *)txn.data(), (void *)txn_ptr, size);

    // check if there is an API to free txn pointer
    free(txn_ptr);
    XAie_Finish(&DevInst);
    
    return txn;
}

#endif
