"""Utilities for Build System"""

import io
import csv
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import traceback
from contextlib import contextmanager, redirect_stderr, redirect_stdout
from glob import glob
from typing import Any, Dict, Generator, List, Optional, Set, Iterable, Tuple, Union
from pathlib import Path

from cert_sim.build_cert_makefile import patch_makefile
from cert_sim.build_aiebu_json import generate_aiebu_json, CfgItem
from dmacompiler import BackEnd, config
from utils.get_pdi_combination import find_suitable_pdi_variant
from utils.unique_pdi_variants import pdi_variants as PDI_VARIANTS
from utils.utils_common import (
    log,
    is_log_enabled,
    overlay_3x4_core_heap_size,
    overlay_3x4_core_stack_size,
    ReadBins,
)

# Turn on the dmacompiler ENABLE_MULTI_UC flag at import time; several
# functions in this module branch on it.
config.ENABLE_MULTI_UC = True

# Repository root and Vitis AIE tools location, both read from the environment.
CURRDIR = os.environ.get("AIE4_ROOT_DIR")
XILINX_VITIS_AIETOOLS = os.environ.get("XILINX_VITIS_AIETOOLS")

# Check mandatory environment variables
missing_envs = []
if CURRDIR is None:
    missing_envs.append("AIE4_ROOT_DIR")
# XILINX_VITIS_AIETOOLS is only treated as mandatory when os.name is not "nt".
if os.name != "nt" and os.getenv("XILINX_VITIS_AIETOOLS") is None:
    missing_envs.append("XILINX_VITIS_AIETOOLS")

# Importing this module fails with RuntimeError if anything mandatory is missing.
if missing_envs:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(missing_envs)}\n"
        "Please source settings.sh before running this script."
    )

# Paths derived from the repository root.
HOSTDIR = os.path.join(CURRDIR, "host")
CERT_SIM = os.path.join(CURRDIR, "cert_sim")
CERT_AIESIM_OPTIONS = os.path.join(CERT_SIM, "CERT_AIESIM_OPTIONS.txt")
# The CERT binary is taken from cert_sim/ when os.name is "nt", otherwise from
# a fixed location inside the Vitis AIE tools tree.
if os.name == "nt":
    CERT_BINARY = os.path.join(CERT_SIM, "cert_medusa_info_console.bin")
else:
    CERT_BINARY = os.path.join(
        XILINX_VITIS_AIETOOLS,
        "data/simmodels/osci/2.3.1/lnx64/8.3.0/systemc/protected/aiesim_cert_interface/bin/cert_medusa_info_console.bin",
    )
AIESIM_OPTIONS = os.path.join(CURRDIR, "utils", "AIESIM_OPTIONS.txt")
"""
    START OF COMPILATION FUNCTIONS
    NOTE:   The following compilation functions have been tested and verified to work
            with the sourced TOOLS_VERSION. These functions are considered stable and
            should not be modified to ensure compatibility and reliability.
"""


def run_subprocess(cmd: List[str], error_prefix: str = "Command") -> None:
    """
    Execute `cmd` with its output captured, reporting details when it fails.

    Args:
        cmd: Command and arguments to run.
        error_prefix: Label used in the failure message
            (e.g., "Compilation", "Execution").

    Raises:
        subprocess.CalledProcessError: Re-raised after the captured
            stdout/stderr of the failed command have been printed.
    """
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        log(f"[INFO] Command successful {cmd}")
    except subprocess.CalledProcessError as failure:
        print(f"[ERR] {error_prefix} failed: {failure}")
        captured_out = failure.stdout
        captured_err = failure.stderr
        # Only echo the streams that actually produced output.
        if captured_out:
            print(f"[STDOUT] {captured_out}")
        if captured_err:
            print(f"[STDERR] {captured_err}", file=sys.stderr)
        raise


def build_data_bins(
    host_filename: str,
    layer_id_dir: str,
) -> None:
    """
    Build the testbench host source with g++ and run it to generate data bins.

    Args:
        host_filename: Host C++ source passed to g++.
        layer_id_dir: Directory added to the include path.

    Raises:
        FileNotFoundError: If the executable is absent after compilation.
        subprocess.CalledProcessError: If compilation or execution fails.
    """
    binary_name = "data_bins_gen" if os.name != "nt" else "data_bins_gen.exe"

    gxx_cmd = ["g++", "-DASM_MODE=1"]
    # Conditionally force-include cpp_log_control.h (only when logging is off)
    log_header = os.path.join(HOSTDIR, "cpp_log_control.h").replace("\\", "/")
    if not is_log_enabled():
        gxx_cmd += ["-include", log_header]
    gxx_cmd += [
        host_filename,
        "-w",
        "-o",
        binary_name,
        f"-I{HOSTDIR}",
        f"-I{layer_id_dir}",
        f"-D__IS_QDQ_FP16__={is_qdq_fp16()}",
    ]

    log(f"[INFO] Compiling {binary_name}...")
    run_subprocess(gxx_cmd, error_prefix="Compilation")

    # The binary is emitted into the current working directory.
    binary_path = os.path.join(os.getcwd(), binary_name)
    if not os.path.exists(binary_path):
        raise FileNotFoundError(
            f"Missing required executable: {binary_path}. {host_filename} failed to compile."
        )

    log(f"[INFO] Running {binary_name}...")
    run_subprocess([binary_path], error_prefix="Execution")


def gen_pdi_asm(layer_id_dir: str, prefix_path: Optional[str] = None) -> None:
    """
    Write a `pdi.asm` wrapper that includes the elfs/init/enable ASM files.

    Kept for backward compatibility after TA builds updated the PDI format.

    Args:
        layer_id_dir: Base directory. With `prefix_path`, the file is written
            to `<layer_id_dir>/pdi.asm`; otherwise to
            `<layer_id_dir>/Work/ps/asm/pdi.asm`.
        prefix_path: Optional directory prepended to each `.include` target.
            When falsy, the bare file names are used.
    """
    if prefix_path:
        pdi_path = os.path.join(layer_id_dir, "pdi.asm")
        include_dir = prefix_path
    else:
        pdi_path = os.path.join(layer_id_dir, "Work", "ps", "asm", "pdi.asm")
        # os.path.join("", name) == name, so includes stay bare file names.
        include_dir = ""

    elfs_asm = os.path.join(include_dir, "aie_asm_elfs.asm")
    init_asm = os.path.join(include_dir, "aie_asm_init.asm")
    enable_asm = os.path.join(include_dir, "aie_asm_enable.asm")
    pdi = f'''pdi:
.include {elfs_asm}
.eop
.include {init_asm}
.eop
.include {enable_asm}
.eop
.endl pdi
'''
    with open(pdi_path, "w", encoding="utf-8") as file:
        # `pdi` is one string: write() emits it in one call, whereas
        # writelines() would iterate it character by character.
        file.write(pdi)


def build_aie_compiler(
    host_filename: str, layer_id_dir: str, device: str = "mds"
) -> None:
    """
    Run `aiecompiler` on the host application, then regenerate `pdi.asm`.

    Args:
        host_filename: Host source file handed to aiecompiler.
        layer_id_dir: Extra include directory; also the base directory passed
            to `gen_pdi_asm`.
        device: "mds" selects the MDS part name; any other value selects
            "xc10SWV1".
    """
    if device == "mds":
        part_name = "xc10MDS1-die-0x-e-S-es1"
    else:
        part_name = "xc10SWV1"

    include_dirs = (
        HOSTDIR,
        os.path.join(CURRDIR, 'kernel'),
        os.path.join(CURRDIR, 'kernel/common'),
        layer_id_dir,
    )

    tokens = [
        "aiecompiler",
        host_filename,
        "-v",
        "--disable-multirate-analysis",
        f"--part={part_name}",
        "--adf-api-log-level=5",
        "--disable-dma-autostart=true",
        "--enable-core-processor-bus=true",
        "-log-level=5",
        "--workdir=./Work",
        "--Xelfgen=-j32",
        f"--stacksize={overlay_3x4_core_stack_size()}",
        f"--heapsize={overlay_3x4_core_heap_size()}",
    ]
    tokens.extend(f"--include={directory}" for directory in include_dirs)
    tokens.append(f"--Xpreproc=-D__IS_QDQ_FP16__={is_qdq_fp16()}")

    # The command is executed through the shell as one space-joined string.
    verbose_run(" ".join(tokens))
    # To ensure backward compatibility after TA builds change
    gen_pdi_asm(layer_id_dir)


def build_sim(dump_vcd: bool = False) -> None:
    """
    Patch and rebuild the systemC Makefile, then run `aiesimulator` with profiling.

    Args:
        dump_vcd: When True, `--dump-vcd=trace` is appended to the simulator
            command.
    """
    # Step 1: add -ladf_rt_ctrl_api in front of -ladf_api in the generated Makefile.
    # pylint: disable-next=C0301,W1401
    makefile_link_fix = "sed -i 's/-ladf_api/-ladf_rt_ctrl_api -ladf_api/g' Work/ps/c_rts/systemC/Makefile"  # noqa: E501
    # Step 2: rebuild the systemC target.
    makefile_build = "make -C Work/ps/c_rts/systemC/ all"

    # NOTE: the note previously attached here described --dpm-state=legacy as
    # disabling DDR performance modeling (ideal BW/freq); the command passes
    # --dpm-state=7 -- TODO confirm which is intended.
    simulator_flags = [
        "aiesimulator",
        "--profile",
        "--mt-model=false",
        "--dpm-state=7",
        f"-f {AIESIM_OPTIONS}",
    ]
    if dump_vcd:
        simulator_flags += ["--dump-vcd=trace"]

    for shell_step in (makefile_link_fix, makefile_build, " ".join(simulator_flags)):
        verbose_run(shell_step)


def generate_ctrl_elf(cfg: List[CfgItem], out_path: str):
    """
    Write `config.json` for AIEBU and assemble `control.elf` in `out_path`.

    Failures of the assembler (binary missing or non-zero exit) are printed
    and not re-raised.

    Args:
        cfg: Config items forwarded to `generate_aiebu_json`.
        out_path: Output directory; created if it does not exist.
    """
    # Ensure output directory exists
    os.makedirs(out_path, exist_ok=True)

    config_json = os.path.join(out_path, "config.json")
    generate_aiebu_json(cfg, config_json)

    # Pick the aiebu-asm binary for the current platform
    assembler = (
        os.path.join(CURRDIR, "prebuilt", "aiebu-asm.exe")
        if os.name == "nt"
        else os.path.join(CERT_SIM, "aiebu-asm")
    )
    elf_target = os.path.join(out_path, "control.elf")
    argv = [
        assembler,
        "-t", "aie4_config",
        "-j", config_json,
        "-o", elf_target,
        "-f", "disabledump",
    ]
    try:
        # Assembler stdout is discarded unless logging is enabled.
        child_stdout = None if is_log_enabled() else subprocess.DEVNULL
        subprocess.run(argv, check=True, stdout=child_stdout)
        log(f"[INFO] Generated control.elf at {elf_target}")
    except FileNotFoundError:
        print(f"[ERR] aiebu-asm not found at {assembler}")
    except subprocess.CalledProcessError as failure:
        print(f"[ERR] aiebu-asm execution failed: {failure}")


def build_cert_sim(layer_id_dir: str = "", is_cert_sim: bool = False) -> None:
    """
    Prepare and launch a CERT simulation with profiling and a VCD dump.

    Args:
        layer_id_dir: Directory holding `Work` and `Work_AIE4`.
        is_cert_sim: Forwarded to `build_cert`.
    """
    # Patch the link line of the generated systemC Makefile first.
    # pylint: disable-next=C0301,W1401
    makefile_link_fix = "sed -i 's/-ladf_api/-ladf_rt_ctrl_api -ladf_api/g' Work/ps/c_rts/systemC/Makefile"  # noqa: E501
    verbose_run(makefile_link_fix)

    # Build `control.asm`
    build_cert(is_cert_sim=is_cert_sim)

    # Generate `control.elf` using AIEBU
    aie4_work = os.path.join(layer_id_dir, "Work_AIE4")
    generate_ctrl_elf([CfgItem(id="aie4_models", path=aie4_work)], aie4_work)

    # Copy CERT Firmware binary to `Work`
    shutil.copy(CERT_BINARY, os.path.join(layer_id_dir, "Work"))

    # Update `Work/ps/c_rts/systemC/Makefile` to adapt to CERT Sim
    patch_makefile(layer_id_dir)

    # Rebuild the systemC target after the Makefile has been patched
    verbose_run("make -C Work/ps/c_rts/systemC/ all")

    # Start CERT Sim
    simulator_flags = (
        "aiesimulator",
        "--profile",
        "--dump-vcd=trace",
        "--dpm-state=7",
        f"-f {CERT_AIESIM_OPTIONS}",
        "--mt-model=false",
    )
    verbose_run(" ".join(simulator_flags))


def generate_multi_uc_control_asm(work_aie4_dir: str) -> None:
    """
    Write `control.asm` for Multi-uC into `work_aie4_dir`.

    The generated file includes `uc0.asm`, `uc2.asm` and `uc4.asm` (each
    referenced by its path under `work_aie4_dir` and followed by `.eop`).

    Args:
        work_aie4_dir: Output directory; created if it does not exist.
    """
    include_sections = "".join(
        f".include {os.path.join(work_aie4_dir, uc_file)}\n.eop\n"
        for uc_file in ("uc0.asm", "uc2.asm", "uc4.asm")
    )
    control_asm = f"control:\n{include_sections}.endl control\n"

    os.makedirs(work_aie4_dir, exist_ok=True)
    control_asm_file_name = os.path.join(work_aie4_dir, "control.asm")
    with open(control_asm_file_name, 'w', encoding="utf-8") as f:
        f.write(control_asm)


def create_nop_pdi(output_path: str, nop_pdi: str) -> None:
    """
    Write `pdi{nop_pdi}.asm` under `output_path` containing a single NOP job.

    All uCs must issue LOAD_PDI; only uC_0 performs the real LOAD_PDI,
    while the others use a dummy (NOP) LOAD_PDI to satisfy CERT firmware
    requirements.

    Args:
        output_path: Directory the file is written to.
        nop_pdi: Suffix inserted into the file name.
    """
    template_lines = (
        ";",
        "; Code",
        ";",
        "START_JOB 0",
        "  NOP",
        "END_JOB",
        "",
        "EOF",
        "",  # yields the trailing newline after EOF
    )
    target = f"{output_path}/pdi{nop_pdi}.asm"
    with open(target, "w", encoding="utf-8") as handle:
        handle.write("\n".join(template_lines))


def process_asm_file(is_standalone_op: bool, input_path: str, output_path: str, nop_pdi: str = '') -> None:
    """
    Rewrite an ASM file so that its first job loads the PDI.

    The transformation:
    1. Increments every `START_JOB <n>` index by 1.
    2. Inserts a new `START_JOB 0` / `LOAD_PDI 0, @pdi{nop_pdi}` block
       immediately before each job that originally had index 0.
    3. Extends the last `.eop` line with a `pdi{nop_pdi}` label that
       `.include`s `pdi{nop_pdi}.asm`.

    Args:
        is_standalone_op: When True the included PDI path is resolved next to
            `output_path`; otherwise relative to the current working directory.
        input_path (str): Path to the input ASM file.
        output_path (str): Path to save the modified ASM file.
        nop_pdi: PDI suffix. For "2" or "4" a NOP `pdi{nop_pdi}.asm` is also
            created next to `input_path`.

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Error: '{input_path}' not found.")

    with open(input_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    if nop_pdi in ("2", "4"):
        create_nop_pdi(os.path.dirname(input_path), nop_pdi)

    job_pattern = re.compile(r"^(START_JOB)\s+(\d+)$")
    # Job inserted ahead of the original job 0. Each entry is one output line
    # (the separating blank line is its own element).
    load_pdi_job = [
        "START_JOB 0\n",
        f"LOAD_PDI 0, @pdi{nop_pdi}\n",
        "END_JOB\n",
        ".eop\n",
        "\n",
    ]

    # Steps 1 and 2: shift job indices and insert the PDI block as first job
    updated_lines = []
    for line in lines:
        match = job_pattern.match(line)
        if match is None:
            updated_lines.append(line)
            continue
        updated_index = int(match.group(2)) + 1
        if updated_index == 1:
            updated_lines.extend(load_pdi_job)
        updated_lines.append(f"START_JOB {updated_index}\n")

    # Step 3: locate the last ".eop" and append the PDI include after it
    last_eop_index = next(
        (
            i
            for i in range(len(updated_lines) - 1, -1, -1)
            if updated_lines[i].strip() == ".eop"
        ),
        None,
    )
    if last_eop_index is not None:
        pdi_dir = os.path.dirname(output_path) if is_standalone_op else os.getcwd()
        pdi_path = os.path.join(pdi_dir, f"pdi{nop_pdi}.asm")
        updated_lines[last_eop_index] = f".eop\npdi{nop_pdi}:\n.include {pdi_path}\n.endl pdi{nop_pdi}\n"

    # Write the updated content to the output file
    with open(output_path, "w", encoding="utf-8") as file:
        file.writelines(updated_lines)


def split_aie4_dma_by_uc() -> Union[str, Tuple[str, str, str]]:
    """
    Streaming split of aie4_dma.cpp (read from the current working directory)
    into per-UC files.

    If config.ENABLE_MULTI_UC:
      Creates (relative to the current working directory):
        - aie4_dma_uc0.cpp
        - aie4_dma_uc2.cpp
        - aie4_dma_uc4.cpp
      Each output = same prefix + ONE UC block (Open..Close inclusive) + same suffix.
      Any line seen after a Close marker and outside a UC block is treated as
      suffix and written to all three outputs.

    If not ENABLE_MULTI_UC:
      No-op; nothing is read or written.

    Returns:
        - Multi-UC: the tuple of output stems
          ("aie4_dma_uc0", "aie4_dma_uc2", "aie4_dma_uc4").
        - Otherwise: the single string "aie4_dma" (the source stem, without
          the ".cpp" extension). NOTE: this is a plain `str`, not a tuple.

    Raises:
        ValueError: If any of the three UC Open markers was not found in the
            source (the output files have already been written at that point).
    """
    aie4_dma_cpp_path = os.path.join(os.getcwd(), "aie4_dma.cpp")
    src_path = Path(aie4_dma_cpp_path)
    if not config.ENABLE_MULTI_UC:
        return src_path.stem

    # Exact marker strings searched for (substring match) in each source line.
    uc0, uc2, uc4 = "uc0.asm", "uc2.asm", "uc4.asm"
    uc_open_0 = f'XAie_OpenControlCodeFile(&DevInst, "Work_AIE4/{uc0}", 8192);'
    uc_open_2 = f'XAie_OpenControlCodeFile(&DevInst, "Work_AIE4/{uc2}", 8192);'
    uc_open_4 = f'XAie_OpenControlCodeFile(&DevInst, "Work_AIE4/{uc4}", 8192);'
    uc_close = "XAie_CloseControlCodeFile(&DevInst);"

    # Output files are opened before the source; they are closed in the
    # `finally` below.
    out0, out2, out4 = Path("aie4_dma_uc0.cpp"), Path("aie4_dma_uc2.cpp"), Path("aie4_dma_uc4.cpp")
    w0 = out0.open("w", encoding="utf-8")  # pylint: disable=consider-using-with
    w2 = out2.open("w", encoding="utf-8")  # pylint: disable=consider-using-with
    w4 = out4.open("w", encoding="utf-8")  # pylint: disable=consider-using-with

    # Prefix lines are buffered until the first Open marker is seen.
    prefix_lines: list[str] = []
    state = "prefix"     # prefix | block | suffix
    active = None        # "uc0.asm" | "uc2.asm" | "uc4.asm" | None
    seen0 = seen2 = seen4 = False

    def write_suffix_to_all(line: str) -> None:
        # Shared trailing code goes to every per-UC output.
        w0.write(line)
        w2.write(line)
        w4.write(line)

    def flush_prefix() -> None:
        # Emit the buffered prefix to all outputs once; later calls are no-ops
        # because the buffer is cleared.
        if prefix_lines:
            pre = "".join(prefix_lines)
            w0.write(pre)
            w2.write(pre)
            w4.write(pre)
            prefix_lines.clear()

    # For very large control-code blobs (millions of lines), we avoid scanning with generic loops
    # and instead use unrolled checks for each uC (uc0 / uc2 / uc4) for minimizing post-processing latency.

    # Terminology:
    # - prefix: HW configuration + helper/utility code before any UC Open marker
    # - block : UC control-code region between XAie_OpenControlCodeFile and XAie_CloseControlCodeFile
    # - suffix: trailing code after the UC block (e.g., return statements and closing braces)
    try:
        with src_path.open("r", encoding="utf-8") as reader:
            for line in reader:
                # Detect Open markers
                if uc_open_0 in line:
                    seen0 = True
                    flush_prefix()
                    active, state = uc0, "block"
                    w0.write(line)
                    continue
                if uc_open_2 in line:
                    seen2 = True
                    flush_prefix()
                    active, state = uc2, "block"
                    w2.write(line)
                    continue
                if uc_open_4 in line:
                    seen4 = True
                    flush_prefix()
                    active, state = uc4, "block"
                    w4.write(line)
                    continue

                # Detect Close marker when inside a UC block
                if state == "block" and active and (uc_close in line):
                    if active == uc0:
                        w0.write(line)
                    elif active == uc2:
                        w2.write(line)
                    else:
                        w4.write(line)
                    active, state = None, "suffix"
                    continue

                # Route normal lines based on state
                if state == "prefix":
                    prefix_lines.append(line)
                elif state == "block" and active:
                    if active == uc0:
                        w0.write(line)
                    elif active == uc2:
                        w2.write(line)
                    else:
                        w4.write(line)
                else:
                    write_suffix_to_all(line)
    finally:
        w0.close()
        w2.close()
        w4.close()

    # Every UC must have contributed a block; report all missing ones together.
    missing = []
    if not seen0:
        missing.append(uc0)
    if not seen2:
        missing.append(uc2)
    if not seen4:
        missing.append(uc4)
    if missing:
        raise ValueError(f"Missing UC block(s) in {src_path.name}: {', '.join(missing)}")

    return (out0.stem, out2.stem, out4.stem)


def build_cert(copy_pdi: bool = True,
               is_cert_sim: bool = False,
               fused_layer_ids: Optional[List[int]] = None,
               is_standalone_op: bool = False) -> None:
    """
    Builds CERT ASMs and copies the required ASM files to Work_AIE4.

    Steps: clean `Work_AIE4` and stale `aie4_dma` binaries, optionally inject
    ML-timeline timestamps, compile and run each DMA source returned by
    `split_aie4_dma_by_uc`, post-process the generated ASM files and
    optionally copy the PDI ASM files.

    Args:
        copy_pdi: Copy the PDI-related ASM files into `Work_AIE4`.
        is_cert_sim: Take the PDI ASM files from the local `Work/ps/asm`
            instead of `../op_pdi_shape_input_0/Work/ps/asm`.
        fused_layer_ids: Forwarded to `inject_timestamps` when the ML timeline
            log level is above 0.
        is_standalone_op: Forwarded to `process_asm_file`.

    Raises:
        subprocess.CalledProcessError: If compiling or running a DMA target fails.
        Exception: Re-raised if copying an existing PDI ASM file fails.
    """
    # Clean previous outputs
    work_aie4_dir = os.path.join(os.getcwd(), "Work_AIE4")
    try:
        shutil.rmtree(work_aie4_dir, ignore_errors=True)
    except Exception:  # pylint: disable=broad-exception-caught
        log(
            f"[WARN] Failed to remove directory {work_aie4_dir}: {traceback.format_exc()}"
        )

    # Inject ML Timeline TimeStamps
    if get_ml_timeline_log_level() > 0:
        generate_xrt_ini()
        inject_timestamps("aie4_dma.cpp", fused_layer_ids)

    for artifact in ("aie4_dma", "aie4_dma.exe"):
        try:
            os.remove(os.path.join(os.getcwd(), artifact))
        except FileNotFoundError:
            pass
        except Exception:  # pylint: disable=broad-exception-caught
            log(f"[WARN] Failed to remove {artifact}: {traceback.format_exc()}")

    # Recreate output dir
    os.makedirs(work_aie4_dir, exist_ok=True)

    # split_aie4_dma_by_uc() returns a bare string in single-uC mode; wrap it
    # so the loop sees one target instead of iterating over its characters.
    dma_targets = split_aie4_dma_by_uc()
    if isinstance(dma_targets, str):
        dma_targets = (dma_targets,)

    is_windows = os.name == "nt"
    for dma_cpp_file in dma_targets:
        if is_windows:
            compile_args = [
                "g++",
                "-w",
                "-D__AIECONTROLCODE__",
                "-o",
                f"{dma_cpp_file}.exe",
                f"{dma_cpp_file}.cpp",
                f"-I{os.environ['XAIENGINE_PATH']}",
                f"-I{os.environ['XAIENGINE_HEADER_PATH']}",
                f"-L{os.environ['LIBRARY_PATH_XAIENGINE']}",
                "-lxaiengine",
                "-Wl,--enable-auto-import",
                "-lstdc++",
                f"-D__IS_QDQ_FP16__={is_qdq_fp16()}",
            ]
        else:
            compile_args = [
                "g++",
                "-w",
                "-D__AIECONTROLCODE__",
                "-o",
                f"{dma_cpp_file}",
                f"{dma_cpp_file}.cpp",
                f"-I{os.getenv('AIE_RT_LINUX_INCLUDE_DIR')}",
                f"-L{os.getenv('AIE_RT_LINUX_LIB_DIR')}",
                "-lxaiengine",
                f"-Wl,-rpath,{os.getenv('AIE_RT_LINUX_LIB_DIR')}",
                "-lstdc++",
                f"-D__IS_QDQ_FP16__={is_qdq_fp16()}",
            ]

        run_subprocess(compile_args, error_prefix="Compile")

        # Run target (handle .exe on Windows)
        exe_path = os.path.join(
            os.getcwd(), f"{dma_cpp_file}.exe" if is_windows else dma_cpp_file
        )
        run_subprocess([exe_path], error_prefix="Target run")

    # Process ASM files (best effort: failures are reported, not raised)
    try:
        if config.ENABLE_MULTI_UC:
            generate_multi_uc_control_asm(work_aie4_dir)
            # Each uC file is rewritten in place; uc2/uc4 use a NOP PDI suffix.
            for uc_file, nop_suffix in (("uc0.asm", ""), ("uc2.asm", "2"), ("uc4.asm", "4")):
                uc_path = os.path.join(work_aie4_dir, uc_file)
                process_asm_file(is_standalone_op, uc_path, uc_path, nop_suffix)
        else:
            process_asm_file(
                is_standalone_op,
                os.path.join(work_aie4_dir, "test.asm"),
                os.path.join(work_aie4_dir, "control.asm")
                )
    except Exception:   # pylint: disable=W0718
        print(f"[ERR] Error in processing ASM files: {traceback.format_exc()}", file=sys.stderr)

    # Copy PDI-related ASM files
    if copy_pdi:
        pdi_path = (
            os.path.join("Work", "ps", "asm")
            if is_cert_sim
            else os.path.join(
                os.getcwd(), "..", "op_pdi_shape_input_0", "Work", "ps", "asm"
            )
        )
        asm_files = [
            "aie_asm_elfs.asm",
            "aie_asm_init.asm",
            "aie_asm_enable.asm",
            "pdi.asm",
        ]

        # Ensure destination exists
        os.makedirs(work_aie4_dir, exist_ok=True)

        for fname in asm_files:
            src = os.path.join(pdi_path, fname)
            dst = os.path.join(work_aie4_dir, fname)
            if os.path.isfile(src):
                try:
                    shutil.copy2(src, dst)
                    log(f"[INFO] Copied {fname}")
                except Exception:
                    print(
                        f"[ERR] Failed to copy {fname}: {traceback.format_exc()}")
                    raise
            else:
                log(f"[WARN] Missing ASM file: {fname} (expected at {src})")


def compile_backend(
    back_end: BackEnd,
    host_filename: str,
    layer_id_dir: str = "",
    is_cert_sim: bool = False,
    device: str = "mds",
    dump_vcd: bool = False,
    is_standalone_op: bool = False,
) -> None:
    """
    Manages the AI Engine build process based on the selected backend.
    - If using Adf, compiles the AI Engine model and runs either the CERT
      simulation (`is_cert_sim`) or the plain AIE simulation.
    - If using CertAsm, builds the data bins and compiles the CERT ASM.

    Args:
        back_end: Backend selector.
        host_filename: Host source file name, resolved relative to HOSTDIR.
        layer_id_dir: Layer directory forwarded to the build steps.
        is_cert_sim: Adf only; run the CERT simulation instead of `build_sim`.
        device: Forwarded to `build_aie_compiler`.
        dump_vcd: Forwarded to `build_sim`.
        is_standalone_op: Forwarded to `build_cert` (CertAsm only).

    Raises:
        ValueError: For an invalid backend selection.
    """
    host_filename = os.path.join(HOSTDIR, host_filename)
    if back_end == BackEnd.Adf:
        build_aie_compiler(host_filename, layer_id_dir, device=device)
        if is_cert_sim:
            # build_cert_sim accepts (layer_id_dir, is_cert_sim) only; passing
            # a third positional argument raised TypeError.
            build_cert_sim(layer_id_dir, is_cert_sim)
        else:
            build_sim(dump_vcd)
    elif back_end == BackEnd.CertAsm:
        build_data_bins(host_filename, layer_id_dir)
        build_cert(is_standalone_op=is_standalone_op)
    else:
        raise ValueError("Invalid BackEnd!")


# END OF COMPILATION FUNCTIONS


def capture_prints_to_file(fn, *args, filename="prints.txt", **kwargs):
    """
    Call `fn(*args, **kwargs)` with stdout and stderr redirected to `filename`.

    The file is opened for writing (truncating any previous content) and the
    return value of `fn` is passed through.
    """
    with open(filename, "w", encoding="utf-8") as sink:
        with redirect_stdout(sink), redirect_stderr(sink):
            result = fn(*args, **kwargs)
    return result


def unified_di_sim(infolder: str, device: str = "mds"):
    """
    Simulate the L2 fused dma.hpp.

    Changes the working directory to `infolder`, compiles
    `unified_di_sim.cpp` from HOSTDIR there and runs the simulation.
    """
    os.chdir(infolder)
    build_aie_compiler(
        os.path.join(HOSTDIR, "unified_di_sim.cpp"),
        os.getcwd(),
        device=device,
    )
    build_sim()


# Utility functions
def verbose_run(command: str) -> None:
    """Echo a shell command, then execute it; its exit status is not checked."""
    print(command)
    subprocess.run(args=command, check=False, shell=True)


def clean_overlay(backend: BackEnd = BackEnd.Adf) -> None:
    """
    Remove files and directories generated by a previous compilation.

    Args:
        backend: `BackEnd.Adf` removes the full set of build/simulation
            artifacts; any other backend removes only the generated sources
            and the `aie4_dma` binary.
    """
    adf_targets = (
        "aiesimulator_output",
        "Work",
        ".Xil",
        ".AIE*",
        "pl_*",
        "temp",
        "hw_package",
        "x86simulator_output",
        "libadf.a",
        "Map_Report.csv",
        "sol.db",
        "DVEfiles",
        "ISS_RPC_SERVER_PORT",
        "dma.hpp",
        "graph.hpp",
        "super.cc",
        "super.hh",
        "aie4_dma.cpp",
        "param.bin",
        "txn.bin",
        "ifm.bin",
        "wgt.bin",
        "ofm.bin",
        "HW_build_txn",
        "out.xclbin",
        "../*_xclbin",
        "dataflow_tiling.txt",
        "Work_AIE4",
        "aie4_dma",
    )
    other_targets = (
        "dma.hpp",
        "graph.hpp",
        "super.cc",
        "super.hh",
        "aie4_dma.cpp",
        "aie4_dma",
    )
    targets = adf_targets if backend == BackEnd.Adf else other_targets

    # Expand wildcards lazily, pattern by pattern, and delete each match
    matches = (hit for pattern in targets for hit in glob(pattern))
    for path in matches:
        try:
            if os.path.isdir(path):
                shutil.rmtree(path, ignore_errors=True)
                log(f"[INFO] Removed directory: {path}")
            elif os.path.isfile(path):
                os.remove(path)
                log(f"[INFO] Removed file: {path}")
        except Exception:  # pylint: disable=broad-exception-caught
            log(f"[WARN] Failed to remove {path}: {traceback.format_exc()}")


def out_dir_name_from_dict(d: dict) -> str:
    """
    Build a directory name from the shape-describing entries of `d`.

    For each known key (in a fixed order) whose value is a list or tuple, the
    `str`/`int` elements are joined as `<key>_<v1>_<v2>...`. A `group_<n>`
    part is appended when `d["group"]` converts to an int greater than 1.
    All parts are joined with underscores.
    """
    ordered_keys = (
        "input",
        "input0",
        "input1",
        "output",
        "kernel",
        "stride",
        "pad",
        "dataflow_type",
        "qdq_mode",
        "b_on_wgt",
        "a_on_wgt",
    )

    parts = []
    for key in ordered_keys:
        values = d.get(key)
        if not isinstance(values, (list, tuple)):
            continue
        # Elements that are neither str nor int are silently skipped.
        tokens = [str(item) for item in values if isinstance(item, (str, int))]
        if tokens:
            parts.append(key + "_" + "_".join(tokens))

    # Only add group if it exists and is greater than 1 - this distinguishes
    # CONV and DWC output folders
    if "group" in d:
        group_val = int(d["group"])
        if group_val > 1:
            parts.append(f"group_{group_val}")

    return "_".join(parts)


def create_output_folder(
    operator: str,
    shape: Dict,
    out_folder: str,
    is_json: bool = False,
    json_block_id: int = None,
) -> str:
    """
    Create (if needed) and return the operator's output folder under `out_folder`.

    The folder is named `op_<operator>_layer_id_<json_block_id>` when
    `is_json` is set, otherwise `op_<operator>_shape_<name derived from shape>`.
    """
    if is_json:
        folder_name = f"op_{operator}_layer_id_{json_block_id}"
    else:
        folder_name = f"op_{operator}_shape_{out_dir_name_from_dict(shape)}"

    op_folder = os.path.join(out_folder, folder_name)
    os.makedirs(op_folder, exist_ok=True)
    return op_folder


def copy_pdi_for_win(combined_kernels_names: dict, combined_kernel_includes: list[str]) -> str:
    """
    Select the prebuilt PDI directory that covers the required kernels.

    Returns the path `{CURRDIR}/prebuilt/<fp16|bf16>_pdi_with_<variant>`;
    this function itself only prints and returns the selection.
    """
    # Find PDI variant that has all required kernels
    variant = find_suitable_pdi_variant(
        PDI_VARIANTS, combined_kernels_names, combined_kernel_includes
    )

    # The directory name is prefixed with the QDQ data type
    if is_qdq_fp16():
        precision = "fp16"
    else:
        precision = "bf16"
    selected_dir = os.path.join(CURRDIR, "prebuilt", f"{precision}_pdi_with_{variant}")
    print(f"[INFO] Selected prebuilt PDI: {selected_dir}")
    return selected_dir


@contextmanager
def capture_logging(log_path: str) -> Generator[io.StringIO, None, None]:
    """
    Context manager to capture stdout and stderr while logging the output
    to a custom log file (not relying on basicConfig).

    While the context is active, `sys.stdout` and `sys.stderr` point at an
    in-memory buffer, which is yielded to the caller. On exit the original
    streams are restored, the captured text is written to `log_path` as one
    log record, and the file handler is detached and closed.

    Args:
        log_path: Destination log file; its base name also names the logger.
    """
    captured_output = io.StringIO()
    original_stdout = sys.stdout
    original_stderr = sys.stderr

    # Set up per-layer file logger
    logger = logging.getLogger(f"logger_{os.path.basename(log_path)}")
    logger.setLevel(logging.INFO)
    logger.propagate = False  # Prevent double logging

    # Drop (and close) handlers left over from a previous use of this logger.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(file_handler)

    try:
        sys.stdout = captured_output
        sys.stderr = captured_output
        yield captured_output
    finally:
        sys.stdout = original_stdout
        sys.stderr = original_stderr
        try:
            logger.info(captured_output.getvalue())
        finally:
            # Release the log file so handles do not accumulate per layer.
            logger.removeHandler(file_handler)
            file_handler.close()


def parse_json_to_dict_with_op(json_obj):
    """
    Recursively normalise a decoded JSON object.

    - Dict keys made only of digits (with optional leading "-") become ints;
      keys that still fail integer conversion are kept as strings.
    - "true"/"false" strings (case-insensitive) become bools.
    - Other strings are converted to int, else float, else left unchanged.
    - Lists and dicts are processed recursively; other values pass through.

    Returns:
        A tuple `(parsed, op)` where `op` is the value of the top-level "op"
        field, or None when it is absent or the top level is not a dict.
    """

    def _convert_key(key):
        if isinstance(key, str) and key.lstrip("-").isdigit():
            try:
                return int(key)
            except ValueError:
                # e.g. "--5" passes the digit check but is not an integer
                return key
        return key

    def _convert_str(text):
        lowered = text.lower()
        if lowered == "true":
            return True
        if lowered == "false":
            return False
        for caster in (int, float):
            try:
                return caster(text)
            except ValueError:
                continue
        return text

    def _parse(obj):
        if isinstance(obj, dict):
            return {_convert_key(key): _parse(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [_parse(item) for item in obj]
        if isinstance(obj, str):
            return _convert_str(obj)
        return obj

    parsed = _parse(json_obj)
    # Only a dict can carry an "op" field; lists/scalars simply have none.
    op = parsed.get("op", None) if isinstance(parsed, dict) else None
    return parsed, op


def merge_control_code_blocks_to_output(file_paths, output_path, mode):
    """
    Merge the control-code regions of several files into `output_path`.

    A region is everything between
        XAie_OpenControlCodeFile(&DevInst, "Work_AIE4/<uc>", 8192);
    and
        XAie_CloseControlCodeFile(&DevInst);

    The prefix and suffix (code outside the regions) come from the first file
    only; every other file contributes just its in-between content. The
    Open/Close lines themselves are re-emitted once per UC. In multi-UC mode
    (`mode == "cpp"` with ENABLE_MULTI_UC) the UCs are uc0/uc2/uc4 and
    `XAie_AttachToGroup` lines from the non-first files are dropped;
    otherwise the single UC is `test.asm`.

    Raises:
        FileNotFoundError: If one of the non-first input files is missing.
    """
    multi_uc_cpp = mode == "cpp" and config.ENABLE_MULTI_UC
    uc_list = ["uc0.asm", "uc2.asm", "uc4.asm"] if multi_uc_cpp else ["test.asm"]

    # Markers for each UC
    start_block = {}
    end_block = {}
    for uc_name in uc_list:
        start_block[uc_name] = f'XAie_OpenControlCodeFile(&DevInst, "Work_AIE4/{uc_name}", 8192);'
        end_block[uc_name] = 'XAie_CloseControlCodeFile(&DevInst);'

    def is_attach_to_group(line: str) -> bool:
        return "XAie_AttachToGroup(&DevInst" in line

    def split_sections(lines):
        """Return (prefix, per-UC control code, suffix) for one file's lines."""
        head = []
        tail = []
        bodies = {uc_name: [] for uc_name in uc_list}
        section = "prefix"
        open_uc = None

        for line in lines:
            # An Open marker switches to that UC's region (marker line dropped)
            opened = None
            for candidate in uc_list:
                if start_block[candidate] in line:
                    opened = candidate
                    break
            if opened:
                open_uc, section = opened, "control_code"
                continue

            # A Close marker ends the current region (marker line dropped)
            if open_uc and end_block[open_uc] in line:
                open_uc, section = None, "suffix"
                continue

            if section == "prefix":
                head.append(line)
            elif section == "control_code" and open_uc:
                bodies[open_uc].append(line)
            else:
                tail.append(line)

        return head, bodies, tail

    # --- First file: supplies prefix, suffix and the initial regions ---
    with open(file_paths[0], "r", encoding="utf-8") as first:
        prefix, merged_control_code, suffix = split_sections(first.readlines())

    # --- Remaining files: only their regions are appended ---
    for path in file_paths[1:]:
        try:
            with open(path, "r", encoding="utf-8") as extra:
                extra_lines = extra.readlines()
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Input file not found: {path}") from e

        _, extra_bodies, _ = split_sections(extra_lines)

        for uc_name in uc_list:
            body = extra_bodies[uc_name]
            if not body:
                continue
            kept = []
            for body_line in body:
                if config.ENABLE_MULTI_UC and is_attach_to_group(body_line):
                    continue
                kept.append(body_line)
            merged_control_code[uc_name].extend(kept)

    # --- Compose and write the final output ---
    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(prefix)
        for uc_name in uc_list:
            out.write(f"    {start_block[uc_name]}\n")
            out.writelines(merged_control_code[uc_name])
            out.write(f"    {end_block[uc_name]}\n")
        out.writelines(suffix)

    log(f"[INFO] Merged {len(file_paths)} files into: {output_path}")


def make_l2_fused_subdir(outfolder, subgraph_suffix: str = None):
    """Create (if needed) and return the directory for fused files.

    The directory is named ``fused_hw_package`` and lives directly under
    ``outfolder``. When ``subgraph_suffix`` is truthy it is appended to the
    directory name as ``_<subgraph_suffix>``.
    """
    name_parts = ["fused_hw_package"]
    if subgraph_suffix:
        name_parts.append(f"{subgraph_suffix}")
    fused_dir = os.path.join(outfolder, "_".join(name_parts))
    os.makedirs(fused_dir, exist_ok=True)
    return fused_dir


def copy_pdi_asm_files(root_output_dir: str, fused_path: str):
    """Copy the PDI operator's .asm files into ``fused_path``.

    Files are looked up under
    ``<root_output_dir>/op_pdi_shape_input_0/Work/ps/asm``. A file that is
    not present is reported with a warning and skipped; it is not an error.
    """
    asm_dir = os.path.join(
        root_output_dir, "op_pdi_shape_input_0", "Work", "ps", "asm"
    )
    expected_names = (
        "aie_asm_elfs.asm",
        "aie_asm_init.asm",
        "aie_asm_enable.asm",
        "pdi.asm",
    )

    for fname in expected_names:
        src = os.path.join(asm_dir, fname)
        if not os.path.isfile(src):
            log(f"[WARN] Missing ASM file: {fname} (expected at {src})")
            continue
        shutil.copy2(src, os.path.join(fused_path, fname))
        log(f"[INFO] Copied {fname}")


def restructure_fused_hw_package(fused_path: str, dma_hpp_paths: list[str]) -> None:
    """Rearrange the fused build output into the hw_package directory layout.

    Does nothing (besides a warning) when ``dma_hpp_paths`` is empty; the
    list is otherwise not inspected. In order, this:

      1. deletes ``<fused_path>/param.bin`` if it exists,
      2. moves the control-code files out of ``<fused_path>/Work_AIE4`` into
         ``fused_path`` (the multi-UC set when ``config.ENABLE_MULTI_UC`` is
         set, otherwise ``control.asm``),
      3. deletes the ``Work_AIE4`` directory.

    Missing files/directories are logged as warnings, never raised.
    """
    if not dma_hpp_paths:
        log("[WARN] No DMA HPP paths provided. Skipping restructure.")
        return

    work_dir = os.path.join(fused_path, "Work_AIE4")

    # --- Step 1: drop the param.bin left in the fused folder ---
    stale_param_bin = os.path.join(fused_path, "param.bin")
    if not os.path.isfile(stale_param_bin):
        log(f"[WARN] param.bin not found in {fused_path}")
    else:
        os.remove(stale_param_bin)
        log("[INFO] Removed old param.bin")

    # --- Step 2: pull the control-code files up next to the package ---
    if not config.ENABLE_MULTI_UC:
        control_files = ["control.asm"]
    else:
        control_files = ["uc0.asm", "uc2.asm", "uc4.asm", "pdi2.asm", "pdi4.asm"]
        # Runs before any file is moved out of Work_AIE4.
        # NOTE(review): its outputs are not visible from here - confirm.
        generate_multi_uc_control_asm(fused_path)

    for name in control_files:
        src = os.path.join(work_dir, name)
        dst = os.path.join(fused_path, name)
        if not os.path.isfile(src):
            log(f"[WARN] {name} not found in {work_dir}")
            continue
        shutil.move(src, dst)
        log(f"[INFO] Moved {name} -> {dst}")

    # --- Step 3: discard the now-unneeded Work_AIE4 tree ---
    if not os.path.isdir(work_dir):
        log(f"[WARN] Work_AIE4 not found in {fused_path}")
    else:
        shutil.rmtree(work_dir, ignore_errors=True)
        log(f"[INFO] Deleted directory: {work_dir}")


def fuse_bins(
    src_paths: list[str], dest_path: str, offsets: Optional[list[int]] = None
) -> int:
    """Fuse bin files in order with optional padding/offsets.

    Args:
        src_paths: Source files, concatenated in the given order. Must not
            be empty.
        dest_path: Output file; created or truncated.
        offsets: Optional start offset per source (same length as
            ``src_paths``). Offsets are taken relative to ``offsets[0]``, so
            the first source always starts at byte 0; gaps between sources
            are zero-filled.

    Returns:
        Total number of bytes written to ``dest_path``.

    Raises:
        AssertionError: ``src_paths`` is empty, ``offsets`` has the wrong
            length, or an offset would make a source overlap the data
            already written. Raised explicitly (not via ``assert``) so the
            checks still run under ``python -O``.
    """
    if len(src_paths) == 0:
        raise AssertionError("fuse_bins needs at least one source file")

    rel_offsets = None
    if offsets is not None:
        if len(offsets) != len(src_paths):
            raise AssertionError(
                f"Got {len(offsets)} offsets for {len(src_paths)} source files"
            )
        base = offsets[0]
        rel_offsets = [off - base for off in offsets]

    current_size = 0
    with open(dest_path, "wb") as dest:
        for idx, src_path in enumerate(src_paths):
            with open(src_path, "rb") as src:
                if rel_offsets is not None:
                    padding = rel_offsets[idx] - current_size
                    if padding < 0:
                        raise AssertionError(
                            f"Offset for {src_path} overlaps the "
                            f"{current_size} bytes already written"
                        )
                    dest.write(b"\x00" * padding)

                # Stream the source in chunks rather than loading it whole.
                shutil.copyfileobj(src, dest)
                # dest was opened fresh, so its position is the running total.
                current_size = dest.tell()
    return current_size


def prep_read_bins(
    curr_layer_name: str,
    prev_layer_names: list[str],
    intermediate_bin_dir,
    read_bins: ReadBins,
) -> None:
    """
    Stage previously saved bins inside ``intermediate_bin_dir`` under fixed names.

    When ``read_bins.read_ifm == 1`` each ``ofm<prev_layer>.bin`` is copied to
    ``ifm<N>.bin``, N being the 1-based position of the layer in
    ``prev_layer_names``. When ``read_bins.read_wgt == 1``,
    ``wgt<curr_layer_name>.bin`` is copied to ``wgt.bin``.
    """
    os.makedirs(intermediate_bin_dir, exist_ok=True)

    def in_bin_dir(filename):
        """Path of `filename` inside the intermediate bin directory."""
        return os.path.join(intermediate_bin_dir, filename)

    if read_bins.read_ifm == 1:
        for position, producer in enumerate(prev_layer_names, start=1):
            shutil.copyfile(
                in_bin_dir(f"ofm{producer}.bin"),
                in_bin_dir(f"ifm{position}.bin"),
            )

    if read_bins.read_wgt == 1:
        shutil.copyfile(
            in_bin_dir(f"wgt{curr_layer_name}.bin"),
            in_bin_dir("wgt.bin"),
        )


def save_intermediate_bin(
    curr_layer_dir: str, intermediate_bin_dir: str, layer_output_name: str
) -> None:
    """
    Copy a layer's ofm/ifm/wgt bins into the intermediate directory.

    ``<curr_layer_dir>/<kind>.bin`` is copied to
    ``<intermediate_bin_dir>/<kind><layer_output_name>.bin`` for each kind,
    in the order ofm, ifm, wgt. A missing source raises from
    ``shutil.copyfile``; earlier copies are left in place.
    """
    os.makedirs(intermediate_bin_dir, exist_ok=True)
    for kind in ("ofm", "ifm", "wgt"):
        shutil.copyfile(
            os.path.join(curr_layer_dir, f"{kind}.bin"),
            os.path.join(intermediate_bin_dir, f"{kind}{layer_output_name}.bin"),
        )


def join_paths_with_suffix(path_list, suffix) -> List[str]:
    """Return a new list holding ``os.path.join(entry, suffix)`` for every
    entry of `path_list`, in the same order."""
    return [os.path.join(parent_dir, suffix) for parent_dir in path_list]


def merge_dicts(dict_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Fold a list of dictionaries into one new dictionary.

    Entries are applied in list order, so for a key present in several
    inputs the last value wins. The inputs are not modified.
    """
    combined: Dict[str, Any] = {}
    for part in dict_list:
        combined |= part
    return combined


def __copy_read_bins(config_dict, intermediate_bin_dir, ifm_path, wgt_path) -> None:
    """Copy the first block's read bins into ``intermediate_bin_dir``.

    Each existing path in ``ifm_path`` is copied to ``ofm<name>.bin`` where
    ``<name>`` comes from ``config_dict["input_name"]`` (single input) or
    ``config_dict["input<N>_name"]`` (several inputs, N 1-based), with ``/``
    replaced by ``_``. A truthy, existing ``wgt_path`` is copied to
    ``wgt<name>.bin`` using ``config_dict["output_name"]`` the same way.
    Paths that do not exist are skipped with a warning.
    """
    os.makedirs(intermediate_bin_dir, exist_ok=True)

    def flat_name(key):
        """Configured name for `key`, made safe for use in a file name."""
        return config_dict[key].replace("/", "_")

    for position, src in enumerate(ifm_path, start=1):
        if not os.path.exists(src):
            log(f"[WARN] Skip copying first block ifm bin {src}.")
            continue
        # Ops with more than one ifm use numbered input-name keys.
        key = f"input{position}_name" if len(ifm_path) > 1 else "input_name"
        shutil.copyfile(
            src,
            os.path.join(intermediate_bin_dir, f"ofm{flat_name(key)}.bin"),
        )

    if not wgt_path:
        return
    if not os.path.exists(wgt_path):
        log(f"[WARN] Skip copying first block wgt bin {wgt_path}.")
        return
    shutil.copyfile(
        wgt_path,
        os.path.join(intermediate_bin_dir, f"wgt{flat_name('output_name')}.bin"),
    )


def create_fused_package(
    op_output_dirs: List[str],
    outfolder: str,
    subgraph_suffix: str,
    shim_prm_wgt_offset: list,
    enable_chained_di: bool = False,
    block_ids: Optional[List[int]] = None,
    combined_pdi_path: str = "pdi.asm",
) -> str:
    """Fuse JSON block ids to create HW package

    Steps, in order:
      1. Merge every op's ``aie4_dma.cpp`` into one fused ``aie4_dma.cpp``.
      2. Run ``build_cert`` with the fused directory as the working directory
         (the previous working directory is always restored).
      3. Restructure the fused directory, then concatenate the per-op
         ``param.bin`` / ``wgt.bin`` files into fused ones (no offsets/padding).
      4. Generate the PDI ASM and the control ELF.
      5. Write ``shim_prm_wgt_offsets.csv``.
      6. Optionally copy the first op's ``ifm.bin`` and last op's ``ofm.bin``.

    Args:
        op_output_dirs: Per-op output directories, in fuse order.
        outfolder: Folder under which the fused package directory is created.
        subgraph_suffix: Suffix for the fused package directory name.
        shim_prm_wgt_offset: Rows written verbatim to the CSV after the header
            (presumably one [layer_id, operator, shim_prm_offset,
            shim_wgt_offset] row per layer - confirm against caller).
        enable_chained_di: When true, copy through ifm.bin / ofm.bin.
        block_ids: Forwarded to ``build_cert`` as ``fused_layer_ids``.
        combined_pdi_path: Forwarded to ``gen_pdi_asm``.

    Returns:
        Path of the fused package directory.
    """
    # Join expected file paths across per-op output dirs
    all_dma_cpp_paths = join_paths_with_suffix(op_output_dirs, "aie4_dma.cpp")
    all_prm_bin_paths = join_paths_with_suffix(op_output_dirs, "param.bin")
    all_wgt_bin_paths = join_paths_with_suffix(op_output_dirs, "wgt.bin")

    # Prepare L2 fused output folder
    fused_dir = make_l2_fused_subdir(
        outfolder, subgraph_suffix=subgraph_suffix)
    # os.path.join keeps fused_dir unchanged when it is already absolute;
    # a relative fused_dir is anchored at CURRDIR.
    fused_path = os.path.join(CURRDIR, fused_dir)
    fused_dma_cpp = os.path.join(fused_path, "aie4_dma.cpp")
    fused_prm_bin = os.path.join(fused_path, "param.bin")
    fused_wgt_bin = os.path.join(fused_path, "wgt.bin")

    # Merge control code and build
    merge_control_code_blocks_to_output(
        all_dma_cpp_paths, fused_dma_cpp, "cpp")

    # build_cert runs with the fused folder as cwd; restore cwd even on failure
    prev_cwd = os.getcwd()
    try:
        os.chdir(fused_path)
        build_cert(copy_pdi=False, fused_layer_ids=block_ids)
    finally:
        os.chdir(prev_cwd)

    # Restructure package and fuse bins
    # (restructure deletes any param.bin in fused_path; the fused one is
    # written right after by fuse_bins)
    restructure_fused_hw_package(fused_path, op_output_dirs)
    fuse_bins(all_prm_bin_paths, fused_prm_bin)
    fuse_bins(all_wgt_bin_paths, fused_wgt_bin)

    # Generate PDI ASM
    gen_pdi_asm(fused_path, combined_pdi_path)

    # Generate ELF
    cfg = [CfgItem(id="aie4_models", path=fused_path)]
    generate_ctrl_elf(cfg, fused_path)

    # Metadata
    log(f"[INFO] Saved control.elf to: {fused_path}")
    log(f"[INFO] Saved Fused hw_package to: {fused_path}")

    # Write Shim Offsets to CSV
    csv_path = os.path.join(fused_path, "shim_prm_wgt_offsets.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["layer_id", "operator", "shim_prm_offset", "shim_wgt_offset"])
        writer.writerows(shim_prm_wgt_offset)
    log(f"[INFO] Saved shim offsets log to: {csv_path}")

    if enable_chained_di:
        # Capture first/last IFM/OFM bins if present
        IFM_BIN_PATH = os.path.join(op_output_dirs[0], "ifm.bin")
        OFM_BIN_PATH = os.path.join(op_output_dirs[-1], "ofm.bin")
        fused_ifm_bin = os.path.join(fused_path, "ifm.bin")
        fused_ofm_bin = os.path.join(fused_path, "ofm.bin")
        # Copy through IFM/OFM if they exist
        if os.path.exists(IFM_BIN_PATH):
            shutil.copyfile(IFM_BIN_PATH, fused_ifm_bin)
        if os.path.exists(OFM_BIN_PATH):
            shutil.copyfile(OFM_BIN_PATH, fused_ofm_bin)

    return fused_path


def gen_model_elf(cfg: List[CfgItem], out_path: str):
    """Generate the whole-model ELF into ``<out_path>/model_elf``.

    The ``model_elf`` directory is created when missing; ``cfg`` is passed
    through to ``generate_ctrl_elf`` untouched.
    """
    model_elf_dir = os.path.join(out_path, "model_elf")
    os.makedirs(model_elf_dir, exist_ok=True)
    generate_ctrl_elf(cfg, model_elf_dir)


def get_os_core_count() -> int:
    """Return the number of CPU cores available to this process.

    Uses the size of the scheduler affinity set where ``os`` provides
    ``sched_getaffinity``; otherwise falls back to ``os.cpu_count()``, or 1
    when that is undetermined.
    """
    affinity_of = getattr(os, "sched_getaffinity", None)
    if affinity_of is None:
        return os.cpu_count() or 1
    return len(affinity_of(0))


def default_L3_mappings(alloc_json: dict[str, Any], block_id: str) -> None:
    """
    Update the L3 dictionary to Default for a given block in-place.

    Each L3 entry is a list ``[xrt_id, xrt_offset, size]``; ``size`` is kept
    and the other two fields are rewritten.

    Rules:
      1. xrt_id for all IFMs (keys starting with "ifm") -> 1
      2. xrt_id for OFM (keys starting with "ofm") -> 0 and xrt_offset -> 0
      3. xrt_offset for IFMs is the running sum of the preceding IFM sizes:
           ifm0.offset = 0
           ifm1.offset = ifm0.size
           ifm2.offset = ifm0.size + ifm1.size

    IFMs are visited in natural (numeric-aware) key order, so ``ifm10``
    comes after ``ifm9`` rather than between ``ifm1`` and ``ifm2``.

    Raises:
        KeyError: the block is missing/empty or has no "L3" section.
        ValueError: an ifm/ofm entry is not a 3-element list.
    """
    block = alloc_json.get(block_id)
    if not block or "L3" not in block:
        raise KeyError(f"Block '{block_id}' missing or has no 'L3' section")

    l3_map = block["L3"]

    def natural_key(key: str) -> list:
        """Sort key that compares embedded digit runs as integers."""
        # re.split with a capture group alternates text / digits, so odd
        # positions are always the digit runs (types line up when compared).
        tokens = re.split(r"([0-9]+)", key)
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)]

    def entry_size(key: str):
        """Validate the entry stored under `key` and return its size field."""
        entry = l3_map[key]
        if not isinstance(entry, list) or len(entry) != 3:
            raise ValueError(
                f"L3[{key}] must be a list of [xrt_id, xrt_offset, size]")
        return entry[2]

    # Separate IFM and OFM keys
    ifm_keys = sorted((k for k in l3_map if k.startswith("ifm")), key=natural_key)
    ofm_keys = [k for k in l3_map if k.startswith("ofm")]

    # --- Update IFMs: packed back-to-back in buffer 1 ---
    offset = 0
    for key in ifm_keys:
        size = entry_size(key)
        l3_map[key] = [1, offset, size]
        offset += size

    # --- Update OFMs: each at the start of buffer 0 ---
    for key in ofm_keys:
        l3_map[key] = [0, 0, entry_size(key)]


def values_for_keys(
    mapping: Dict[int, str], keys: Optional[List[int]] = None
) -> List[str]:
    """
    Look up `keys` in `mapping` and return the values found, in `keys` order.

    Keys absent from `mapping` are skipped. When `keys` is ``None`` or
    empty, every value of `mapping` is returned instead.
    """
    if not keys:
        return list(mapping.values())

    found: List[str] = []
    for wanted in keys:
        if wanted in mapping:
            found.append(mapping[wanted])
    return found


def mark_block_compilable(raw_json: dict, bids: list[str], compilable: bool) -> None:
    """Set ``is_compilable`` on each listed block of `raw_json`, in place.

    A block id whose entry is missing or is not a dict only produces a
    warning on stdout. Any exception hit while marking a block is likewise
    reported and swallowed, so the remaining ids are still processed.
    """
    state = "compilable" if compilable else "un-compilable"
    for bid in bids:
        try:
            entry_is_dict = bool(raw_json) and isinstance(raw_json.get(bid), dict)
            if not entry_is_dict:
                print(f"[WARN] Block {bid} not found or invalid type.")
                continue
            raw_json[bid]["is_compilable"] = compilable
        except Exception as mark_err:  # pylint: disable=broad-exception-caught
            print(f"[WARN] Could not mark block {bid} as {state}: {mark_err}")


def is_chained_di():
    """Return 1 when the CHAINED_DI environment variable is enabled, else 0.

    Enabled means its value, compared case-insensitively, is one of
    "1", "true" or "yes"; an unset variable counts as disabled.
    """
    flag = os.getenv("CHAINED_DI", "false").lower()
    return 1 if flag in ("1", "true", "yes") else 0


def validate_paths(path_vars: dict, *, keys: Iterable[str]) -> None:
    """Check that every configured path named by `keys` exists.

    Keys that are absent from `path_vars`, or whose value is falsy
    (``None`` / ``""``), are ignored.

    Raises:
        RuntimeError: for the first non-empty path that does not exist.
    """
    for key in keys:
        candidate = path_vars.get(key)
        if candidate and not os.path.exists(candidate):
            raise RuntimeError(f"The following path does not exist: {candidate}")


def generate_subgraph_nodelists(
    outfolder: str,
    subgraphs: Dict[int, list[int]],
    alloc_json: dict,
) -> None:
    """
    Write one node list file per subgraph, then terminate via SystemExit(0).

    ``{outfolder}/nodelists`` is wiped and recreated. For every subgraph
    index a file ``nodelist_subgraph_<idx>.txt`` is written holding, one per
    line and in subgraph order, the ``name`` of each layer. Layer ids are
    looked up in ``alloc_json`` by their string form.

    This function never returns normally: it always ends by raising
    ``SystemExit(0)``.
    """
    nodelist_dir = os.path.join(outfolder, "nodelists")

    # Start from an empty output directory
    shutil.rmtree(nodelist_dir, ignore_errors=True)
    os.makedirs(nodelist_dir, exist_ok=True)

    # Resolve every block's name up front (id -> layer name)
    layer_names = {}
    for block_key, block in alloc_json.items():
        layer_names[block_key] = block["name"]

    for sg_idx, layer_ids in subgraphs.items():
        target = os.path.join(nodelist_dir, f"nodelist_subgraph_{sg_idx}.txt")
        with open(target, "w", encoding="utf-8", newline="\n") as handle:
            for layer in layer_ids:
                handle.write(f"{layer_names[str(layer)]}\n")

    # Deliberate early exit once the node lists are on disk
    raise SystemExit(0)


def get_ml_timeline_log_level():
    """Return 1 when the ML_TIMER_LOG_LEVEL environment variable is enabled, else 0.

    The variable is read as an on/off flag: enabled means its value,
    compared case-insensitively, is "1", "true" or "yes". Unset reads as "0".
    """
    setting = os.getenv("ML_TIMER_LOG_LEVEL", "0").lower()
    return 1 if setting in ("1", "true", "yes") else 0


def inject_timestamps(
    aie4_dma_cpp_path: str | Path,
    block_ids: Optional[List[int]] = None,
) -> None:
    """
    In-place inject timestamps into aie4_dma.cpp and write ml_timeline_metadata.json.

    Inside each UC Open/Close region:
      - after "    // RECORD TIMER START" -> start timestamp
      - after "    // RECORD TIMER STOP"  -> end timestamp

    Non-fused (block_ids is None): exactly 1 START/STOP per UC, metadata key "0".
    Fused: exactly len(block_ids) START/STOP per UC, keys are block_ids in encounter order.
    Timestamp indices reset to 0 per UC.

    The metadata file is written next to the .cpp and has the shape
    ``{uc: {block_id: {"start": idx, "end": idx}}}``, with block ids in
    ascending numeric order within each UC.

    Raises:
        FileNotFoundError: the .cpp file does not exist.
        ValueError: a STOP precedes its START, a UC region has the wrong
            number of markers, a block has no end timestamp, or the file
            ends inside a UC region.
        IndexError: fused mode only - a UC region contains more START (or
            STOP) markers than there are block_ids.
    """
    def sort_metadata_numeric(meta: dict) -> dict:
        """
        Return a copy of the metadata with each UC's block ids sorted numerically.
        Example of metadata:
            metadata = { "uc2.asm": { "3": {...}, "10": {...} }, ... }
        """
        return {
            uc: dict(sorted(blocks.items(), key=lambda kv: int(kv[0])))
            for uc, blocks in meta.items()
        }

    cpp_path = Path(aie4_dma_cpp_path)
    if not cpp_path.exists():
        raise FileNotFoundError(f"aie4_dma.cpp not found: {cpp_path}")

    ids = list(block_ids or [])
    is_fused = block_ids is not None

    uc_list = ["uc0.asm", "uc2.asm", "uc4.asm"] if config.ENABLE_MULTI_UC else ["test.asm"]
    open_lines = {uc: f'XAie_OpenControlCodeFile(&DevInst, "Work_AIE4/{uc}", 8192);' for uc in uc_list}

    CLOSE = "XAie_CloseControlCodeFile(&DevInst);"
    TSTART = "    // RECORD TIMER START"
    TSTOP = "    // RECORD TIMER STOP"
    OPEN_HEAD = "XAie_OpenControlCodeFile(&DevInst"
    TS_FMT = "\n{ind}XRT_ERRCHK(XAie_ControlCodeSaveTimestamp(&DevInst, {i}));\n"

    lines = cpp_path.read_text(encoding="utf-8").splitlines(keepends=True)

    out: List[str] = []
    meta: Dict[str, Dict[str, Dict[str, int]]] = {uc: {} for uc in uc_list}

    # per-UC state (active UC only)
    in_uc: Optional[str] = None
    ts_idx = 0
    # Markers seen so far in the active UC; also the index of the next
    # block id to pair with a START / STOP marker in fused mode.
    seen_start = 0
    seen_stop = 0
    exp = len(ids) if is_fused else 1

    def match_open(ln: str) -> Optional[str]:
        """
        Return the UC whose open-control-code-file statement appears in `ln`,
        or None. The OPEN_HEAD test is a cheap pre-filter before comparing
        against each UC's full open line.
        """
        if OPEN_HEAD not in ln:
            return None
        return next((uc for uc in uc_list if open_lines[uc] in ln), None)

    # Walk the control code: every input line is copied to the output. An
    # XAie_OpenControlCodeFile line for a known UC starts a region, which
    # runs until XAie_CloseControlCodeFile is encountered. Timestamp calls
    # are injected right after the marker lines found inside a region.
    for ln in lines:
        out.append(ln)

        if in_uc is None:
            uc = match_open(ln)
            if uc is not None:
                in_uc = uc
                ts_idx = seen_start = seen_stop = 0
            continue

        # inside UC
        s = ln.rstrip("\n")

        if s == TSTART:
            ind = ln[: len(ln) - len(ln.lstrip())]
            out.append(TS_FMT.format(ind=ind, i=ts_idx))
            key = str(ids[seen_start]) if is_fused else "0"
            meta[in_uc].setdefault(key, {"start": -1, "end": -1})["start"] = ts_idx
            ts_idx += 1
            seen_start += 1
            continue

        if s == TSTOP:
            ind = ln[: len(ln) - len(ln.lstrip())]
            out.append(TS_FMT.format(ind=ind, i=ts_idx))
            key = str(ids[seen_stop]) if is_fused else "0"
            if key not in meta[in_uc] or meta[in_uc][key]["start"] == -1:
                raise ValueError(f"[{in_uc}] STOP before START for block_id={key}.")
            meta[in_uc][key]["end"] = ts_idx
            ts_idx += 1
            seen_stop += 1
            continue

        if CLOSE in ln:
            # UC end: validate marker counts + all ends set
            if seen_start != exp or seen_stop != exp:
                raise ValueError(
                    f"[{in_uc}] Expected {exp} START/STOP markers, got {seen_start}/{seen_stop}."
                )
            for bid, se in meta[in_uc].items():
                if se.get("end", -1) == -1:
                    raise ValueError(f"[{in_uc}] Missing end timestamp for block_id={bid}.")
            in_uc = None

    # If file ended while still inside a UC, that's malformed
    if in_uc is not None:
        raise ValueError(f"[{in_uc}] Unterminated UC block (missing CloseControlCodeFile).")

    cpp_path.write_text("".join(out), encoding="utf-8")

    meta = sort_metadata_numeric(meta)
    # No sort_keys=True here: it would re-sort the block ids as strings
    # ("10" before "3") and undo the numeric ordering applied just above.
    (cpp_path.parent / "ml_timeline_metadata.json").write_text(
        json.dumps(meta, indent=2),
        encoding="utf-8",
    )


def set_datatype(is_qdq_dtype_fp16: bool):
    """Publish the QDQ datatype choice through the IS_QDQ_FP16 env variable.

    The variable is set to "1" when `is_qdq_dtype_fp16` is truthy and to
    "0" otherwise.
    """
    os.environ["IS_QDQ_FP16"] = "1" if is_qdq_dtype_fp16 else "0"


def is_qdq_fp16() -> int:
    """Is QDQ FP16 Enabled?

    Returns 1 when the IS_QDQ_FP16 environment variable is exactly "1" and
    0 for any other value. An unset variable reads as "1", i.e. enabled.
    The result is an int flag (not a bool).
    """
    return 1 if os.environ.get("IS_QDQ_FP16", "1") == "1" else 0


def get_unique_ops(d: dict[str, Any]) -> Set[str]:
    """
    Collect the distinct 'op_type' strings of a dict-of-dicts JSON structure:
      { "Add_0": {"op_type": "...", ...}, "Add_1": {"op_type": "...", ...}, ... }

    Values that are not dicts, and dicts whose 'op_type' is missing or not
    a string, are ignored.
    """
    return {
        node["op_type"]
        for node in d.values()
        if isinstance(node, dict) and isinstance(node.get("op_type"), str)
    }


def get_skip_ops(json_path: str, include_ops: str) -> str:
    """
    Work out which op types found in a JSON file are not in the include list.

    `include_ops` is a comma-separated list; entries are stripped and empty
    ones dropped. A summary of both sets and the result is printed.

    Returns:
      - skip_ops_str: the skipped op types, sorted, as a comma-separated
        string with no spaces (empty when nothing is skipped)
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    unique_ops = get_unique_ops(data)

    include_set = set()
    for raw_op in include_ops.split(","):
        op_name = raw_op.strip()
        if op_name:
            include_set.add(op_name)

    skip_ops = sorted(unique_ops - include_set)

    # Log
    print("Include ops provided:")
    print(f"  {sorted(include_set)}\n")

    print("Unique ops found in JSON:")
    print(f"  {sorted(unique_ops)}\n")

    print(
        "The above unique ops were compared against the include ops list.\n"
        "The following ops are unique nodes not present in the include list "
        "and will therefore be skipped:"
    )

    if not skip_ops:
        print("  (None — no skip ops)")
    for op_name in skip_ops:
        print(f"  - {op_name}")

    return ",".join(skip_ops)


def map_subgraphs_to_json_keys(json_path: str, subgraphs: Dict[int, List[str]]) -> Dict[int, List[int]]:
    """
    Translate each subgraph's node names into the integer JSON keys they live under.

    Input:  subgraph_id -> [node_name1, node_name2, ...]
    Output: subgraph_id -> [json_key_int1, json_key_int2, ...]
    where json_key is the key in the json whose value["name"] matches node_name.
    Node names with no matching JSON entry are dropped from the result.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        blocks = json.load(f)

    # Reverse lookup: node_name -> json_key(int); only dict entries with a name count
    key_by_name = {
        entry["name"]: int(json_key)
        for json_key, entry in blocks.items()
        if isinstance(entry, dict) and "name" in entry
    }

    return {
        sg_id: [key_by_name[node] for node in node_names if node in key_by_name]
        for sg_id, node_names in subgraphs.items()
    }


def generate_xrt_ini() -> None:
    """
    Write the xrt.ini used for ML Timeline into the current working directory.

    The file gets a ``[Debug]`` section with ``ml_timeline=true``. The static
    ``aie_trace_config.json`` from ``<CURRDIR>/HW_requirements`` is copied
    next to it.
    """
    run_dir = os.getcwd()

    xrt_ini_path = os.path.join(run_dir, "xrt.ini")
    with open(xrt_ini_path, "w", encoding="utf-8") as ini_file:
        ini_file.write("[Debug]\nml_timeline=true\n")
    print(f"[ML-Timeline] Generated {xrt_ini_path}")

    # Copy static aie_trace_config.json to enable ML Timeline on device
    shutil.copy2(
        os.path.join(CURRDIR, "HW_requirements", "aie_trace_config.json"),
        os.path.join(run_dir, "aie_trace_config.json"),
    )
