import os
import sys
import re
import math
import subprocess
import copy
import json
from typing import List
import shutil

# Check if GCC is available
# (True when a "gcc" executable can be found on PATH; used later to choose the build path.)
gcc_available = shutil.which("gcc") is not None

# Make the parent directory importable so that config_loader can be resolved.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from config_loader import waic_config


# Absolute directory of this file; used as the anchor for include paths below.
CURRDIR = os.path.dirname(os.path.abspath(__file__))
# Make the sibling "dmacompiler" directory importable.
sys.path.append(os.path.join(CURRDIR, "..", "dmacompiler"))

# To Unblock CI pylint checks error
# (every name is bound at module level up front so both platform branches leave them defined)
XILINX_VITIS_AIETOOLS = None
include_path = None
include_path_xaiengine = None
lib_path_driver = None
lib_path_cdo = None

if os.name == "nt":
    # Since aie-rt driver is included as submodule, no need of adding explicit path for xaiengine include headers.
    # NOTE: "{}".format(None) yields the literal string "None", so an unset
    # environment variable produces the path "None" here rather than None.
    include_path = r"{}".format(os.getenv("XAIENGINE_PATH"))
    include_path_xaiengine = os.path.join(include_path, "xaiengine")

    # Building xaiengine driver as part of submodule.
    lib_path_driver = r"{}".format(os.getenv("LIBRARY_PATH_XAIENGINE"))
    lib_path_cdo = r"{}".format(os.getenv("LIBRARY_PATH_CDO"))
else:
    # Non-Windows: the tool root comes from the environment (None when unset).
    XILINX_VITIS_AIETOOLS = os.environ.get("XILINX_VITIS_AIETOOLS")

from dmacompiler import (
    DmaConnection,
    DmaDir,
    OverlayShape,
    shim_dma,
    memtile_dma,
    core_dma,
    BackEnd,
    CoreConnection,
    AieTile,
    TileType,
    set_dev_gen,
    DevGen,
    config,
)

# Import-time side effect: select the Aie2p device generation in dmacompiler.
set_dev_gen(DevGen.Aie2p)


# Generate the tiling.json for each op
def tiling_json_gen(tiling: dict, path: str):
    """Write the tiling description of one op to ``path`` as JSON.

    The file has three top-level sections:

    * ``original_dimensions``: one ``inputN`` / ``outputN`` entry per shape
      found under ``tiling["orig_input"]`` / ``tiling["orig_output"]``.
    * ``host_layer_padding``: the same entries with ``dims[3]`` rounded up to
      a multiple of 8 and a ``values`` list whose last element is ``0`` when
      no rounding was needed, otherwise a zero-point placeholder name.
    * ``dma_padding``: ``host_layer_padding`` plus a ``channels`` list.

    Args:
        tiling: Mapping with ``orig_input`` and ``orig_output``; each is
            either a single shape or a list of shapes (a list of lists).
        path: Destination file, overwritten if it exists.
    """

    def as_shape_list(shapes):
        # A list whose elements are all lists is already a list of shapes;
        # anything else is treated as one shape and wrapped.
        nested = isinstance(shapes, list) and all(
            isinstance(item, list) for item in shapes
        )
        return shapes if nested else [shapes]

    # --- original_dimensions: inputs first, then outputs ---
    orig_dim = {}
    for prefix, field in (("input", "orig_input"), ("output", "orig_output")):
        for position, shape in enumerate(as_shape_list(tiling[field])):
            orig_dim[f"{prefix}{position}"] = {"dims": shape}

    # --- host_layer_padding: pad the last dimension to a multiple of 8 ---
    host_layer_padding = copy.deepcopy(orig_dim)
    for position, (name, entry) in enumerate(host_layer_padding.items()):
        last_dim = entry["dims"][3]
        aligned = last_dim % 8 == 0
        if aligned:
            pad_value = 0
        elif "input" in name:
            # position counts across the whole dict; inputs come first, so it
            # equals the input index here.
            pad_value = f"zp_i{position}"
        else:
            pad_value = "zp_o0"
        entry["values"] = [0, 0, 0, pad_value]
        entry["dims"][3] = last_dim if aligned else iceil(last_dim, 8)

    # --- dma_padding: same as host padding plus the channel list ---
    dma_padding = copy.deepcopy(host_layer_padding)
    for name, entry in dma_padding.items():
        entry["channels"] = [0, 1, 2, 3] if "input" in name else [5]

    # Wrap in a root-level dictionary and save to JSON.
    document = {
        "original_dimensions": orig_dim,
        "host_layer_padding": host_layer_padding,
        "dma_padding": dma_padding,
    }
    with open(path, "w") as out_file:
        json.dump(document, out_file, indent=4)


def disable_fast_pm_backend() -> None:
    """Set ``ENABLE_FAST_PM`` to ``False`` on the imported dmacompiler ``config`` object."""
    config.ENABLE_FAST_PM = False


def overlay_stack_addr() -> int:
    """Return the overlay stack address (60352, i.e. 0xEBC0)."""
    stack_address = 0xEBC0
    return stack_address


def overlay_stack_size() -> int:
    """Return the overlay stack size passed to the compiler (2048)."""
    return 2 * 1024


def overlay_heap_size() -> int:
    """Return the overlay heap size passed to the compiler (3072)."""
    return 3 * 1024


def update_len_to_4(data: List[int]):
    """Left-pad ``data`` with 1s so it has four dimensions (N, H, W, C).

    The list is modified in place and also returned. Lists that already have
    four or more entries are returned unchanged.
    """
    missing = 4 - len(data)
    if missing > 0:
        # Slice assignment at the front keeps this an in-place update.
        data[:0] = [1] * missing
    return data


class ShimAlloc:
    """Plain holder for the four shim buffer ids (ifm, wgt, ofm, prm)."""

    def __init__(
        self,
        ifm_buffer_id: int,
        wgt_buffer_id: int,
        ofm_buffer_id: int,
        prm_buffer_id: int,
    ):
        # Store each id under the attribute of the same name.
        (
            self.ifm_buffer_id,
            self.wgt_buffer_id,
            self.ofm_buffer_id,
            self.prm_buffer_id,
        ) = (ifm_buffer_id, wgt_buffer_id, ofm_buffer_id, prm_buffer_id)


def ceildiv(x: int, d: int) -> int:
    """Return ``x / d`` rounded towards positive infinity, using integer math."""
    quotient, remainder = divmod(x, d)
    # divmod floors, so any non-zero remainder means the ceiling is one higher.
    return quotient + (1 if remainder else 0)


def floordiv(x: int, d: int) -> int:
    """Return ``x / d`` rounded towards negative infinity."""
    quotient, _ = divmod(x, d)
    return quotient


def iceil(x: int, d: int) -> int:
    """Round ``x`` up to the nearest multiple of ``d``."""
    multiples_needed = ceildiv(x, d)
    return multiples_needed * d


def ifloor(x: int, d: int) -> int:
    """Round ``x`` down to the nearest multiple of ``d``."""
    # For integers, x == (x // d) * d + (x % d), so removing the remainder
    # leaves the largest multiple of d that does not exceed x.
    return x - (x % d)


def count_dims(shape: List[int]):
    """Return how many dimensions remain after dropping leading 1s.

    Args:
        shape: Sequence of dimension sizes.

    Returns:
        ``len(shape)`` minus the number of leading entries equal to 1. A shape
        that is empty or consists only of 1s yields 0.
    """
    idx = 0
    # Bound the scan by the list length: without it an all-ones (or empty)
    # shape runs off the end and raises IndexError.
    while idx < len(shape) and shape[idx] == 1:
        idx += 1

    return len(shape) - idx


def prm_shim_mm2s(col: int) -> str:
    """Return the shim MM2S parameter access string selecting column ``col``."""
    col_end = col + 1
    return "Col:{}:{} Row Param".format(col, col_end)


def prm_memtile_s2mm() -> str:
    """Return the memtile S2MM parameter access string (a fixed value)."""
    return "Row Param"


def prm_memtile_mm2s(row: int) -> str:
    """Return the memtile MM2S parameter access string selecting row ``row``."""
    row_end = row + 1
    return "Row:{}:{} Param".format(row, row_end)


def prm_shim_memory(dims) -> str:
    """Return the shim parameter memory format string.

    ``dims`` must expose ``aie_cols``, ``aie_rows`` and ``param_subv_size``.
    """
    cols, rows, param_size = dims.aie_cols, dims.aie_rows, dims.param_subv_size
    return "Col:{} Row:{} Param:{}".format(cols, rows, param_size)


def conv_kernel_prm_shim_memory(dims, size_in_bytes: int) -> str:
    """Return the shim parameter memory format for a single column and row.

    ``dims`` is accepted for signature parity with ``prm_shim_memory`` but is
    not read.
    """
    return "Col:1 Row:1 Param:{}".format(size_in_bytes)


def prm_memtile_memory(dims) -> str:
    """Return the memtile parameter memory format string.

    ``dims`` must expose ``aie_rows`` and ``param_subv_size``.
    """
    rows, param_size = dims.aie_rows, dims.param_subv_size
    return "Row:{} Param:{}".format(rows, param_size)


def conv_kernel_prm_memtile_memory(dims, size_in_bytes: int) -> str:
    """Return the memtile parameter memory format for a single row.

    ``dims`` is accepted for signature parity with ``prm_memtile_memory`` but
    is not read.
    """
    return "Row:1 Param:{}".format(size_in_bytes)


def shim_alloc() -> ShimAlloc:
    """Return the fixed shim buffer id assignment (ifm=1, wgt=2, ofm=0, prm=3)."""
    return ShimAlloc(
        ifm_buffer_id=1,
        wgt_buffer_id=2,
        ofm_buffer_id=0,
        prm_buffer_id=3,
    )


def overlay_4x4_dma_connections() -> List[DmaConnection]:
    """Build the DMA connection list for the 4-column by 4-row overlay.

    Connections are appended in this order:

    1. shim MM2S channel 0 -> memtile S2MM channel 0, per column
    2. shim MM2S channel 1 -> memtile S2MM channel 1, per column
    3. memtile MM2S channel 4 -> core S2MM channel 1
    4. memtile MM2S channel ``row`` -> core S2MM channel 0, per (col, row)
    5. core MM2S channel 0 -> memtile S2MM channel ``2 + row``, per (col, row)
    6. memtile MM2S channel 5 -> shim S2MM channel 0, per column
    """
    aie_cols = 4
    aie_rows = 4
    assert aie_rows == aie_cols
    columns = range(aie_cols)
    rows = range(aie_rows)

    connections = []

    # Groups 1 and 2: both shim MM2S channels feed the same-numbered memtile
    # S2MM channel in the same column.
    for channel in (0, 1):
        for col in columns:
            connections.append(
                DmaConnection(
                    shim_dma(col, DmaDir.MM2S, channel),
                    memtile_dma(col, DmaDir.S2MM, channel),
                )
            )

    # Group 3: the first two core_dma arguments are passed as (row, col), the
    # reverse of every other group.
    # NOTE(review): presumably an intentional fan-out pattern (it mirrors the
    # 8x4 overlay's channel-4 wiring) -- confirm against core_dma's signature.
    for col in columns:
        for row in rows:
            connections.append(
                DmaConnection(
                    memtile_dma(col, DmaDir.MM2S, 4),
                    core_dma(row, col, DmaDir.S2MM, 1),
                )
            )

    # Group 4: one memtile MM2S channel per row feeds core S2MM channel 0.
    for col in columns:
        for row in rows:
            connections.append(
                DmaConnection(
                    memtile_dma(col, DmaDir.MM2S, row),
                    core_dma(col, row, DmaDir.S2MM, 0),
                )
            )

    # Group 5: each core writes back to memtile S2MM channels 2..5.
    for col in columns:
        for row in rows:
            connections.append(
                DmaConnection(
                    core_dma(col, row, DmaDir.MM2S, 0),
                    memtile_dma(col, DmaDir.S2MM, 2 + row),
                )
            )

    # Group 6: memtile MM2S channel 5 returns data to the shim.
    for col in columns:
        connections.append(
            DmaConnection(
                memtile_dma(col, DmaDir.MM2S, 5),
                shim_dma(col, DmaDir.S2MM, 0),
            )
        )

    return connections


def overlay_8x4_dma_connections() -> List[DmaConnection]:
    """Build the DMA connection list for the 8-column by 4-row overlay.

    Connections are appended in this order:

    1. shim MM2S channel 0 -> memtile S2MM channel 0, per column
    2. shim MM2S channel 1 -> memtile S2MM channel 1, per column
    3. memtile MM2S channel ``row`` -> core S2MM channel 0, per (col, row)
    4. memtile MM2S channel 4 of every even column ``mem_col`` -> core S2MM
       channel 1 of ``core_dma(core_col, mem_col // 2)`` for every core column
    5. core MM2S channel 0 -> memtile S2MM channel ``2 + row``, per (col, row)
    6. memtile MM2S channel 5 -> shim S2MM channel 0, per column
    """
    aie_cols = 8
    aie_rows = 4
    assert 2 * aie_rows == aie_cols
    columns = range(aie_cols)
    rows = range(aie_rows)

    connections = []

    # Groups 1 and 2: shim MM2S channel N -> memtile S2MM channel N.
    for channel in (0, 1):
        for col in columns:
            connections.append(
                DmaConnection(
                    shim_dma(col, DmaDir.MM2S, channel),
                    memtile_dma(col, DmaDir.S2MM, channel),
                )
            )

    # Group 3: one memtile MM2S channel per row feeds core S2MM channel 0.
    for col in columns:
        for row in rows:
            connections.append(
                DmaConnection(
                    memtile_dma(col, DmaDir.MM2S, row),
                    core_dma(col, row, DmaDir.S2MM, 0),
                )
            )

    # Group 4: only even memtile columns drive channel 4; each one reaches
    # every core column, with mem_col // 2 as the second core_dma argument.
    for mem_col in range(0, aie_cols, 2):
        for core_col in columns:
            connections.append(
                DmaConnection(
                    memtile_dma(mem_col, DmaDir.MM2S, 4),
                    core_dma(core_col, (mem_col // 2), DmaDir.S2MM, 1),
                )
            )

    # Group 5: each core writes back to memtile S2MM channels 2..5.
    for col in columns:
        for row in rows:
            connections.append(
                DmaConnection(
                    core_dma(col, row, DmaDir.MM2S, 0),
                    memtile_dma(col, DmaDir.S2MM, 2 + row),
                )
            )

    # Group 6: memtile MM2S channel 5 returns data to the shim.
    for col in columns:
        connections.append(
            DmaConnection(
                memtile_dma(col, DmaDir.MM2S, 5),
                shim_dma(col, DmaDir.S2MM, 0),
            )
        )

    return connections


def overlay_8x4_core_stream_bdcast() -> List[CoreConnection]:
    """Build core-to-core connections for the 8x4 overlay as two 4-column halves.

    Each half has one source tile: ``AieTile(TileType.Core, last_col, 0)``
    where ``last_col`` is the highest column index of that half (3 and 7).
    The source is connected to every other tile of its own half; the source
    tile itself is skipped.
    """
    aie_cols = 8
    Num4x4 = 2
    aie_rows = 4
    half_width = aie_cols // Num4x4

    # (source tile, destination columns) for the first and second half.
    halves = (
        (AieTile(TileType.Core, half_width - 1, 0), range(half_width)),
        (AieTile(TileType.Core, aie_cols - 1, 0), range(half_width, aie_cols)),
    )

    core_connections = []
    for source, dest_cols in halves:
        for col in dest_cols:
            for row in range(aie_rows):
                if source != AieTile(TileType.Core, col, row):
                    core_connections.append(
                        CoreConnection(source, AieTile(TileType.Core, col, row))
                    )

    return core_connections


def overlay_4x4_core_stream_bdcast() -> List[CoreConnection]:
    """Build core-to-core connections for the 4x4 overlay.

    The source tile is ``AieTile(TileType.Core, 3, 0)``; it is connected to
    every other tile of the 4x4 grid (the source itself is skipped).
    """
    aie_cols = 4
    aie_rows = 4
    source = AieTile(TileType.Core, aie_cols - 1, 0)

    core_connections = []
    for col in range(aie_cols):
        for row in range(aie_rows):
            if source != AieTile(TileType.Core, col, row):
                core_connections.append(
                    CoreConnection(source, AieTile(TileType.Core, col, row))
                )
    return core_connections


def verbose_run(command: str):
    """Echo ``command`` and run it through the shell.

    The child gets a copy of the current environment plus a
    ``BASH_FUNC_make%%`` entry whose body runs ``patch_aiecompiler_make.py``
    before ``/usr/bin/make``.
    NOTE(review): presumably this makes bash subshells pick up a ``make``
    function wrapper -- confirm against the target shell.

    The command's exit status is not checked and nothing is returned.
    """
    print(command)
    make_wrapper = "() { patch_aiecompiler_make.py $*; /usr/bin/make $@; }"
    child_env = {**os.environ, "BASH_FUNC_make%%": make_wrapper}
    subprocess.run(command, shell=True, env=child_env)


def clean_overlay():
    """Delete build and simulation artefacts from the current directory.

    Runs a single ``rm -rf`` over the fixed list of paths and glob patterns
    below via ``verbose_run``.
    """
    targets = [
        # simulator / compiler work directories and logs
        "aiesimulator_output", "Work", ".Xil", "*.log", "*.vpd", "*.vcd",
        ".AIE*", "pl_*", "temp", "hw_package",
        "x86simulator_output", "libadf.a", "Map_Report.csv", "sol.db",
        "DVEfiles", "ISS_RPC_SERVER_PORT",
        # generated sources
        "dma.hpp", "graph.hpp", "super.cc", "super.hh",
        # generated binaries
        "param.bin", "txn.bin", "ifm.bin", "wgt.bin", "ofm.bin",
        "txn_pm.bin", "pm.bin", "*.bin",
        # hardware build outputs
        "HW_build_txn/BOOT.BIN", "HW_build_txn/_x",
        "HW_build_txn/boot_image.bif", "HW_build_txn/full.pdi",
        "out.xclbin", "out_transformed_pdi.xclbin", "../*_xclbin",
    ]
    verbose_run(" ".join(["rm", "-rf"] + targets))


# Aligns the calculate_row_split for a valid pong address
def valid_row_split(lower_bound, second_input):
    """Return the smallest integer above ``lower_bound`` dividing ``second_input // 8``.

    Args:
        lower_bound: Candidates start at ``lower_bound + 1``.
        second_input: Value whose integer quotient by 8 must be evenly
            divisible by the result.

    Returns:
        The first candidate ``x > lower_bound`` with
        ``(second_input // 8) % x == 0``.

    Raises:
        ValueError: If no such divisor exists. Previously this case looped
            forever.
    """
    num = second_input // 8
    limit = abs(num)
    x = lower_bound + 1
    while True:
        if num % x == 0:
            return x
        # A non-zero number has no divisor larger than its own magnitude, so
        # searching past |num| can never succeed.
        if x >= limit:
            raise ValueError(
                f"no divisor of {num} (= {second_input} // 8) is greater than {lower_bound}"
            )
        x += 1


def calculate_row_split(
    only_memtile: bool,
    input_rows: int,
    input_cols: int,
    input_chs: int,
    ifm_bits: int,
    wgt_size: int,
    memory_space: int,
    aie_cols=8,
    aie_rows=4,
    enable_ifm_pingpong=True,
    enable_ofm_pingpong=True,
):
    """Compute the number of row splits needed to fit the input in memory.

    The per-column input size is divided by the buffer capacity available
    after reserving layer parameters and the (single-buffered) weights; the
    rounded-up result is then passed to ``valid_row_split`` together with
    ``input_rows``.

    Args:
        only_memtile: When True only the IFM buffer(s) share the memory and
            the capacity is a single sub-volume; otherwise OFM buffer(s) are
            counted too and the capacity is multiplied by ``aie_rows``.
        input_rows, input_cols, input_chs: Input shape.
        ifm_bits: Bits per input element.
        wgt_size: Bytes reserved for weights.
        memory_space: Total bytes available.
        aie_cols, aie_rows: Overlay dimensions.
        enable_ifm_pingpong, enable_ofm_pingpong: Count two buffers instead
            of one for the respective tensor.

    Returns:
        The split count returned by ``valid_row_split``.
    """
    # Bytes of input handled by one column when nothing is split.
    max_input_in_bytes = input_rows * input_cols * input_chs * ifm_bits // 8 // aie_cols

    # NOTE: WGT is not ping pong it is just ping. wgt_ping_addr = 0
    reserved_for_params = config.MAX_CORE_LAYER_PARAM_SIZE * aie_rows
    usable_bytes = memory_space - reserved_for_params - wgt_size

    # Number of equally sized buffers that must share the usable bytes.
    buffer_count = 2 if enable_ifm_pingpong else 1
    if not only_memtile:
        buffer_count += 2 if enable_ofm_pingpong else 1
    max_subv_in_bytes = usable_bytes // buffer_count

    capacity = max_subv_in_bytes if only_memtile else max_subv_in_bytes * aie_rows
    lower_bound = math.ceil(max_input_in_bytes / capacity)

    return valid_row_split(lower_bound, input_rows)


def sizeof(datatype: str) -> int:
    """Return the width of ``datatype`` in bits (not bytes).

    Raises:
        KeyError: If ``datatype`` is not one of the known type names.
    """
    names_by_width = {
        8: ("uint8", "int8"),
        16: ("uint16", "int16"),
        32: ("float32", "int32"),
    }
    bits_of = {
        name: width for width, names in names_by_width.items() for name in names
    }
    return bits_of[datatype]


def elem_size(fix_point_bits: int, qdq_mode: int):
    """Return ``(ifm_bits, ofm_bits)`` for a fixed-point width and QDQ mode.

    For ``fix_point_bits == 8``: mode 0 gives (8, 16), mode 1 gives (16, 8)
    and any other mode gives (8, 8). For every other ``fix_point_bits`` the
    result is (16, 16) regardless of ``qdq_mode``.
    """
    if fix_point_bits != 8:
        return 16, 16
    if qdq_mode == 0:
        return 8, 16
    if qdq_mode == 1:
        return 16, 8
    return 8, 8


def core_loop_count(
    ifm_subv_elem: int,
    ifm_bits: int,
    param_subv_size: int,
    wgt_size: int,
    core_ifm_ping_pong: bool,
    core_ofm_ping_pong: bool,
):
    """Return the smallest power-of-two loop count that fits the core buffers.

    The total buffer size is ``ifm_subv_elem * (ifm_bits // 8)`` multiplied by
    the number of IFM plus OFM buffers (two each when ping-pong is enabled,
    one otherwise). The loop count is doubled until that total divided by the
    loop count no longer exceeds the memory left below the stack address
    after reserving ``param_subv_size`` and ``wgt_size``.

    Raises:
        ValueError: If parameters and weights alone exceed the stack address.
            The budget is then negative and no loop count can satisfy it;
            previously this case looped forever.
    """
    usable_core_memory = overlay_stack_addr() - param_subv_size - wgt_size
    if usable_core_memory < 0:
        # The floor quotient below never drops under 0, so a negative budget
        # can never be met.
        raise ValueError(
            "param_subv_size + wgt_size exceeds the core memory below the stack "
            f"({param_subv_size} + {wgt_size} > {overlay_stack_addr()})"
        )

    num_ifm_ofm_banks = 0
    num_ifm_ofm_banks += 2 if core_ifm_ping_pong else 1
    num_ifm_ofm_banks += 2 if core_ofm_ping_pong else 1

    core_loop = 1

    # Despite the name, this covers the IFM and OFM buffers together.
    ifm_core_size = ifm_subv_elem * (ifm_bits // 8) * num_ifm_ofm_banks
    while (ifm_core_size // core_loop) > usable_core_memory:
        core_loop *= 2

    return core_loop


def ifm_split_cost_function(
    aie_cols: int,
    aie_rows: int,
    dataflow_op: bool,
    ifm_rows: int,
    ifm_cols: int,
    ifm_chs: int,
    ofm_rows: int,
    ofm_cols: int,
    ofm_chs: int,
    ifm_bits: int,
    wgt_split_size_in_coretile: int = 0,  # If WGT exists, then size of WGT split not in ping-pong mode for CORETILE
    wgt_split_size_in_memtile: int = 0,  # If WGT exists, then size of WGT split not in ping-pong mode for MEMTILE
    ofm_ifm_subv_ratio: int = 1,  # Default IFM and OFM size is same
    blockSize: int = 1,  # Only used for Permute op
):
    """Choose a row split and the ping-pong configuration for IFM/OFM buffers.

    Candidate splits are the powers of two up to
    ``ceildiv(ifm_rows, aie_cols)`` whose combined IFM+OFM size fits both the
    memtile and the core tiles of a column. Candidates are then visited in
    increasing order; the first one that allows full ping-pong buffering is
    selected, otherwise the last candidate that was scored is kept.

    Args:
        aie_cols, aie_rows: Overlay dimensions.
        dataflow_op: True when only the memtile is involved (no core tiles).
        ifm_rows, ifm_cols, ifm_chs: Input shape.
        ofm_rows: Unused.
        ofm_cols, ofm_chs: Output shape used for the per-split OFM size.
        ifm_bits: Bits per element; also used for the OFM size.
        wgt_split_size_in_coretile: Bytes reserved for weights in a core tile.
        wgt_split_size_in_memtile: Bytes reserved for weights in the memtile.
        ofm_ifm_subv_ratio: OFM/IFM size ratio used in the initial fit check.
        blockSize: Multiplier applied to the OFM size.

    Returns:
        ``(split, memtile_ifm_pingpong, memtile_ofm_pingpong,
        core_ifm_pingpong, core_ofm_pingpong, core_loop)``. For a dataflow op
        both core flags are False and ``core_loop`` is 1.

    Raises:
        AssertionError: If a single row cannot fit, if a dataflow split
            exceeds the memtile, or if no split was selected.
    """
    available_memtile_size = (
        config.MAX_MEMTILE_ADDR
        - (config.MAX_CORE_LAYER_PARAM_SIZE * aie_rows)
        - wgt_split_size_in_memtile
    )
    available_coretile_size = overlay_stack_addr() - wgt_split_size_in_coretile

    # TODO:   The current limitation is the split from SHIM to MEMTILE
    #         is only implemented on ROWS, so the granularity is as below.
    #         Even though a split is available, it might not be efficient.

    # NOTE: Assumption is ifm_rows = 1
    min_ifm_in_bytes = 1 * ifm_cols * ifm_chs * ifm_bits // 8
    if dataflow_op:
        assert (
            min_ifm_in_bytes * (1 + ofm_ifm_subv_ratio) < available_memtile_size
        ), "N*H*W*C(N=1, H=1) has to fit in MEMTILE, with IFM and OFM not in ping-pong mode"
    else:
        assert (
            min_ifm_in_bytes // aie_rows * (1 + ofm_ifm_subv_ratio)
        ) // aie_cols < available_coretile_size, "N*H*W*C(N=1, H=1) has to fit in the 4-CORETILES of each column, with IFM and OFM not in ping-pong mode"

    max_splits = ceildiv(ifm_rows, aie_cols)

    # Generate all possible splits (powers of two) for input to the cost function
    num_splits = []
    split = 1
    while split <= max_splits:
        row_in_splits = ceildiv(ifm_rows, aie_cols * split)
        ifm_size_in_bytes = (
            row_in_splits * ifm_cols * ifm_chs * ifm_bits // 8 // aie_cols
        )
        # NOTE(review): the OFM size is computed from the IFM dimensions here,
        # exactly as before -- confirm this is intended.
        ofm_size_in_bytes = (
            row_in_splits * ifm_cols * ifm_chs * ifm_bits // 8 // aie_cols
        )

        if (ifm_size_in_bytes + ofm_size_in_bytes) <= available_memtile_size and (
            ifm_size_in_bytes + ofm_size_in_bytes
        ) <= available_coretile_size * aie_rows:
            num_splits.append(split)

        split *= 2  # Move to the next power of 2

    # NOTE:   The following cost function judges whether CORETILE can fit IFM/OFM in ping-pong mode
    #             1. If CORETILE is involved we can ignore MEMTILE:
    #                 TODO:   MEMTILE can be still in ping-pong mode because config.MAX_MEMTILE_ADDR =  MAX_CORETILE_ADDR *4
    #             2. If only MEMTILE we check individually

    # Bind every flag up front so all paths reach the return statement with
    # defined values. The dataflow path never sets the core flags.
    memtile_ifm_pingpong = False
    memtile_ofm_pingpong = False
    core_ifm_pingpong = False
    core_ofm_pingpong = False

    # Loop-invariant budgets (same values as the sizes computed above).
    available_memtile_mem_size = available_memtile_size
    available_coretile_addr = available_coretile_size

    # Cost function
    core_loop = 1
    split_curr = 0
    split_score_curr = 0
    split_score_past = 0
    if not dataflow_op:
        for idx, split in enumerate(num_splits):
            rows_per_split = max(1, ifm_rows // aie_cols // split)
            ifm_min_memory_per_memtile = (
                rows_per_split * ifm_cols * ifm_chs * ifm_bits // 8
            )
            ofm_min_memory_per_memtile = (
                blockSize * rows_per_split * ofm_cols * ofm_chs * ifm_bits // 8
            )
            ifm_min_memory_per_core = ifm_min_memory_per_memtile // aie_rows
            ofm_min_memory_per_core = ofm_min_memory_per_memtile // aie_rows
            if (
                ifm_min_memory_per_core + ofm_min_memory_per_core
                > available_coretile_addr
            ):
                print(
                    "The problem size (W and C) exceeds MAX_CORETILE_ADDR with current split (H only across all columns)"
                )
                if idx == len(num_splits) - 1:
                    # Last candidate and it still does not fit: keep it and
                    # loop inside the core instead.
                    ifm_subv_elem = (
                        ifm_rows * ifm_cols * ifm_chs // aie_cols // split // aie_rows
                    )
                    core_loop = core_loop_count(
                        ifm_subv_elem,
                        ifm_bits,
                        config.MAX_CORE_LAYER_PARAM_SIZE,
                        wgt_split_size_in_coretile,
                        True,
                        True,
                    )
                    split_curr = split
                    core_ifm_pingpong = True
                    core_ofm_pingpong = True
                    if (
                        2 * ifm_min_memory_per_memtile + 2 * ofm_min_memory_per_memtile
                        <= available_memtile_mem_size
                    ):
                        memtile_ifm_pingpong = True
                        memtile_ofm_pingpong = True
                        break
                    elif (
                        2 * ifm_min_memory_per_memtile + ofm_min_memory_per_memtile
                        <= available_memtile_mem_size
                    ):
                        memtile_ifm_pingpong = True
                        memtile_ofm_pingpong = False
                    else:
                        memtile_ifm_pingpong = False
                        memtile_ofm_pingpong = False
                else:
                    continue
            elif (
                2 * ifm_min_memory_per_core + 2 * ofm_min_memory_per_core
                <= available_coretile_addr
            ):
                core_ifm_pingpong = True
                core_ofm_pingpong = True
                memtile_ifm_pingpong = True
                memtile_ofm_pingpong = True
                split_curr = split
                break
            elif (
                2 * ifm_min_memory_per_core + ofm_min_memory_per_core
                <= available_coretile_addr
            ):
                core_ifm_pingpong = True
                core_ofm_pingpong = False
                memtile_ifm_pingpong = True
                memtile_ofm_pingpong = True
                split_score_curr += 2
            else:
                core_ifm_pingpong = False
                core_ofm_pingpong = False
                memtile_ifm_pingpong = False
                memtile_ofm_pingpong = False
                split_score_curr += 1
            if split_score_curr > split_score_past:
                split_curr = split
                split_score_past = split_score_curr
    else:  # only dataflow_op
        for split in num_splits:
            # The per-split memtile sizes were previously read here without
            # ever being computed on this path (UnboundLocalError).
            # NOTE(review): they are computed with the same formulas as the
            # core-tile branch -- confirm this matches the intended model.
            rows_per_split = max(1, ifm_rows // aie_cols // split)
            ifm_min_memory_per_memtile = (
                rows_per_split * ifm_cols * ifm_chs * ifm_bits // 8
            )
            ofm_min_memory_per_memtile = (
                blockSize * rows_per_split * ofm_cols * ofm_chs * ifm_bits // 8
            )
            if (
                ifm_min_memory_per_memtile + ofm_min_memory_per_memtile
                > available_memtile_mem_size
            ):
                # Explicit raise so the check survives ``python -O``.
                raise AssertionError(
                    "The problem size (W and C) exceeds config.MAX_MEMTILE_ADDR with current split (H only across all columns)"
                )
            elif (
                2 * ifm_min_memory_per_memtile + 2 * ofm_min_memory_per_memtile
                <= available_memtile_mem_size
            ):
                memtile_ifm_pingpong = True
                memtile_ofm_pingpong = True
                split_curr = split
                break
            elif (
                2 * ifm_min_memory_per_memtile + ofm_min_memory_per_memtile
                <= available_memtile_mem_size
            ):
                memtile_ifm_pingpong = True
                memtile_ofm_pingpong = False
                split_score_curr += 2
            else:
                memtile_ifm_pingpong = False
                memtile_ofm_pingpong = False
                split_score_curr += 1
            if split_score_curr > split_score_past:
                split_curr = split
                split_score_past = split_score_curr

    if split_curr == 0:
        # Explicit raise so the check survives ``python -O``.
        raise AssertionError("INCORRECT SPLIT!")

    return (
        split_curr,
        memtile_ifm_pingpong,
        memtile_ofm_pingpong,
        core_ifm_pingpong,
        core_ofm_pingpong,
        core_loop,
    )


def extract_simulation_time(sim_log_content: str) -> float:
    """Extract the total simulation time, in nanoseconds, from a simulator log.

    The text following the "Total Simulation time" marker is expected to be
    ``<value> <unit>`` with unit ``ps``, ``ns``, ``us``, ``ms`` or ``s``. Any
    further text on the line is ignored. An unrecognised unit leaves the
    value unconverted.

    Args:
        sim_log_content: Full text of the simulator log.

    Returns:
        The simulation time in nanoseconds, or ``0.0`` when the marker is
        missing, the value is not a number, or no unit follows it.
    """
    sim_time_extract_string = (
        "[INFO] : Simulation Finished, Sim result: 0 Total Simulation time "
    )
    match = re.search(
        re.escape(sim_time_extract_string) + r"\s*([^\n]+)", sim_log_content
    )
    if match is None:
        return 0.0

    # Only the first two tokens matter; unpacking the whole split used to
    # raise ValueError whenever the line carried extra text.
    tokens = match.group(1).split()
    if len(tokens) < 2:
        return 0.0
    try:
        sim_time = float(tokens[0])
    except ValueError:
        return 0.0

    unit = tokens[1]
    if unit == "ps":
        return sim_time / 1000
    ns_per_unit = {"ns": 1, "us": 1000, "ms": 1000000, "s": 1000000000}
    return sim_time * ns_per_unit.get(unit, 1)


def process_simulation_results(
    sim_log: str, shape_index: int, results_list: list, simtime_list: list
) -> None:
    """Record the outcome of one simulation run from its AIESimulator log.

    ``results_list[shape_index]`` is set to:

    * ``"COMPILE FAIL"`` when the log file does not exist (``simtime_list``
      is left untouched in that case),
    * ``"DI PASS"`` / ``"DI FAIL"`` when the log contains the matching
      ``"DI: PASS"`` / ``"DI: FAIL"`` marker (PASS is checked first),
    * ``"SIM INCOMPLETE"`` otherwise.

    When the log exists, ``simtime_list[shape_index]`` receives the value
    returned by ``extract_simulation_time``.
    """
    if not os.path.exists(sim_log):
        results_list[shape_index] = "COMPILE FAIL"
        return

    with open(sim_log, "r", encoding="utf-8") as log_file:
        sim_log_content = log_file.read()

    # First matching marker wins; fall back to "incomplete" when neither is present.
    status = "SIM INCOMPLETE"
    for marker, label in (("DI: PASS", "DI PASS"), ("DI: FAIL", "DI FAIL")):
        if marker in sim_log_content:
            status = label
            break
    results_list[shape_index] = status

    simtime_list[shape_index] = extract_simulation_time(sim_log_content)


def build_sim_overlay(
    backend: BackEnd,
    host_filename: str,
    compile_flags: List[str],
    dump_trace: bool = False,
    kernel_debug: bool = False,
):
    """Dispatch the build to the flow that matches ``backend``.

    * ``BackEnd.Adf``: runs ``build_sim_aiecompiler`` with all arguments.
    * ``BackEnd.TxnHostPatch``: runs ``build_txn_aiert``. The host file is
      replaced by ``"main_common.cpp"`` on Windows without gcc, or when gcc is
      available and the configured mode is ``"release"``. ``dump_trace`` and
      ``kernel_debug`` are not used on this path.
    * Any other backend: nothing happens.
    """
    if backend == BackEnd.Adf:
        build_sim_aiecompiler(host_filename, compile_flags, dump_trace, kernel_debug)
    elif backend == BackEnd.TxnHostPatch:
        windows_without_gcc = os.name == "nt" and not gcc_available
        gcc_release_build = gcc_available and waic_config.mode == "release"
        if windows_without_gcc or gcc_release_build:
            host_filename = "main_common.cpp"

        build_txn_aiert(host_filename, compile_flags)


def build_sim_aiecompiler(
    host_filename: str,
    compile_flags: List[str],
    dump_trace: bool = False,
    kernel_debug: bool = False,
):
    """Compile with ``aiecompiler``, rebuild the SystemC host, and simulate.

    Four shell commands are run in order through ``verbose_run``: the
    compiler invocation, a ``sed`` edit that adds ``-ladf_rt_ctrl_api`` to the
    generated SystemC Makefile, ``make`` on that Makefile, and
    ``aiesimulator``.

    Args:
        host_filename: Source file passed to ``aiecompiler``.
        compile_flags: Extra arguments appended to the compiler command.
        dump_trace: Adds ``--dump-vcd=trace`` to the simulator command.
        kernel_debug: Adds the ``LOG_CORE_COL`` / ``LOG_CORE_ROW`` defines and
            enables large program memory.
    """
    # Include directories, relative to the parent of this file's directory.
    include_dirs = (
        ("kernels",),
        ("dataflow",),
        ("kernels", "conv"),
        ("kernels", "mmult_qdq_blocked_int16x2"),
        ("kernels", "qdq"),
        ("kernels", "include"),
        ("kernels", "common"),
    )
    include_flags = [
        f"--include={os.path.join(CURRDIR, '..', *parts)}" for parts in include_dirs
    ]

    compile_args = (
        [
            "aiecompiler",
            host_filename,
            "--target=hw",
            "--part=xc10AIE2P_ML-die-0x-e-S-es1",
        ]
        + include_flags
        + [
            "--adf-api-log-level=3",
            f"--stacksize={overlay_stack_size()}",
            f"--heapsize={overlay_heap_size()}",
            "--enable-core-processor-bus=true",
            "--disable-dma-autostart=true",
            '--Xchess="main:backend.mist2.pnll=off"',
            '--Xchess="main:backend.amnesia.rls=on"',
            '--Xpreproc="-D__AIE_API_WORKAROUND_CR_1223259__=1"',
            '--Xpreproc="-D_main_init=_waic_main_init"',
        ]
        + compile_flags
    )
    if kernel_debug:
        compile_args.extend(
            [
                '--Xpreproc="-DLOG_CORE_COL=0"',
                '--Xpreproc="-DLOG_CORE_ROW=2"',
                "--large-program-memory=true",
            ]
        )

    sim_args = [
        "aiesimulator",
        "--profile",
        "--gmio-throughput-global-read=28",
        "--gmio-throughput-global-write=28",
    ]
    if dump_trace:
        sim_args.append("--dump-vcd=trace")

    pipeline = (
        " ".join(compile_args),
        "sed -i 's/-ladf_api/-ladf_rt_ctrl_api -ladf_api/g' Work/ps/c_rts/systemC/Makefile",
        "make -C Work/ps/c_rts/systemC/ all",
        " ".join(sim_args),
    )
    for shell_command in pipeline:
        verbose_run(shell_command)


def build_txn_aiert(host_filename: str, compile_flags: list):
    """Build the transaction generator host program and run it.

    A ``g++`` command line is assembled from platform-specific flags, include
    directories and libraries, followed by ``compile_flags``.

    * Non-Windows: compile, then run ``./txn_dma``.
    * Windows with gcc: print which binaries the configured mode produces,
      compile, then run ``txn_dma.exe``.
    * Windows without gcc: pass the assembled command line to
      ``cmake_generator.py`` (one directory above this file); a failing
      script is reported on stdout and not re-raised.

    Args:
        host_filename: Host source file to compile.
        compile_flags: Extra arguments appended to the command line.
    """
    mode = waic_config.mode
    on_windows = os.name == "nt"

    compile_args = [
        "g++",
        "-std=c++20" if on_windows else "-std=c++17",
        "-Wall" if on_windows else "-Wall -Wextra",
        "-D__AIECONTROLCODE__",
        "-D__TXNRT__=1" if on_windows else "-D__TXNRT__",
        "-o txn_dma",
        "-w",
        f"{host_filename}",
    ]

    # Include directories shared by both platforms.
    shared_include_dirs = (
        ("kernels",),
        ("dataflow",),
        ("kernels", "conv"),
        ("kernels", "qdq"),
        ("kernels", "include"),
        ("kernels", "common"),
    )
    compile_args.extend(
        f"-I{os.path.join(CURRDIR, '..', *parts)}" for parts in shared_include_dirs
    )

    if on_windows:
        platform_args = [
            # NOTE(review): the directory name carries a trailing space
            # ("dataflow "); kept as-is to leave the command line unchanged.
            f'-I{os.path.join(CURRDIR, "..", "dataflow ")}',
            f"-I{include_path}",
            f"-I{include_path_xaiengine}",
            f"-L{lib_path_driver}",
            f"-L{lib_path_cdo}",
            "-lxaiengine",
            "-lLibCdo",
        ]
    else:
        tools_lib_dir = f"{XILINX_VITIS_AIETOOLS}/lib/lnx64.o"
        platform_args = [
            f"-I{XILINX_VITIS_AIETOOLS}/include/drivers/aiengine_aig",
            f"-L{tools_lib_dir}",
            f"-Wl,-rpath,{tools_lib_dir}",
            "-lxaiengine_aig",
            "-lcdo_driver",
            "-lstdc++",
        ]
    compile_args += platform_args

    compile_args += compile_flags
    compile_command = " ".join(compile_args)

    if not on_windows:
        verbose_run(compile_command)
        verbose_run("./txn_dma")
        return

    if gcc_available:
        if mode == "dev":
            print(
                "[INFO] Building: ctrl.bin, ifm.bin, ofm.bin, param.bin, txn.bin, wgt.bin"
            )
        elif mode == "release":
            print("[INFO] Building: ctrl.bin, param.bin, txn.bin")
        verbose_run(compile_command)
        verbose_run("txn_dma.exe")
        return

    # Windows without gcc: hand the command line to the CMake generator script.
    try:
        root_dir = os.path.abspath(os.path.join(CURRDIR, ".."))
        cmake_script = os.path.join(root_dir, "cmake_generator.py")
        print("------- Using MSVC compiler instead of gcc ---------")
        subprocess.run(["python", cmake_script, compile_command], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing cmake_generator.py: {e}")


def main() -> None:
    """Script entry point; only prints the string "graph_hpp"."""
    print("graph_hpp")


# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
