"""
Utilities for deduplicating and pruning PDI kernel variants.

This module:
  - Finds unique PDI variants from many (includes + kernel-name) combinations
    while treating ordering as irrelevant.
  - Removes redundant variants by dropping any variant whose kernel-name set is
    a strict subset of another variant's, so only the covering supersets remain
    (only kernel names are compared; includes and kernel IDs are ignored).
  - Exports the final minimal set of variants in a paste-ready Python format.

Primary output:
  - unique_pdi_variants.py (pdi_variants dict)
"""
from __future__ import annotations

from typing import Any, Dict, Tuple, List, Set
import json
from pathlib import Path
from datetime import date


def prune_by_kernel_superset(
    unique_pdi_map: Dict[str, Dict[str, Any]],
) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, str]]:
    """
    Drop every combo whose kernel-name set is a strict subset of another combo's.

    Only the keys of ``combined_kernel_names`` are compared; kernel ID values
    and ``combined_kernel_includes`` play no part in the decision. Combos with
    identical kernel-name sets are all kept, because neither is a *strict*
    subset of the other.

    Args:
        unique_pdi_map: Mapping of unique key -> combo dict. A combo without
            ``combined_kernel_names`` is treated as having no kernels.

    Returns:
        A ``(pruned_map, removed_to_kept)`` tuple where ``pruned_map`` holds the
        surviving entries in their original order and ``removed_to_kept`` maps
        each removed key to the surviving key that covers it.

    Raises:
        TypeError: If a combo's ``combined_kernel_names`` is not a dict.
    """
    kernel_sets: Dict[str, Set[str]] = {}
    for ukey, combo in unique_pdi_map.items():
        names = combo.get("combined_kernel_names", {})
        if not isinstance(names, dict):
            raise TypeError(f"{ukey}: combined_kernel_names must be a dict")
        kernel_sets[ukey] = set(names)

    # Visit the largest kernel sets first so a candidate superset is always
    # seen before anything it could cover. sorted() is stable, so combos of
    # equal size keep their original relative order.
    by_size_desc = sorted(
        unique_pdi_map, key=lambda k: len(kernel_sets[k]), reverse=True
    )

    removed_to_kept: Dict[str, str] = {}
    for pos, big_key in enumerate(by_size_desc):
        if big_key in removed_to_kept:
            # Already covered by an earlier (larger) combo; it cannot be a keeper.
            continue

        big_kernels = kernel_sets[big_key]
        for small_key in by_size_desc[pos + 1:]:
            if small_key in removed_to_kept:
                continue
            # "<" on sets is the strict-subset test: equal sets do not match.
            if kernel_sets[small_key] < big_kernels:
                removed_to_kept[small_key] = big_key

    pruned_unique_map = {
        k: v for k, v in unique_pdi_map.items() if k not in removed_to_kept
    }
    return pruned_unique_map, removed_to_kept


def print_final_unique_combinations(
    unique_pdi_mapping: Dict[str, Dict[str, Any]],
    duplicate_pdi_groups: Dict[str, List[str]],
) -> None:
    """
    Prune subset combos, print a report of the survivors, and write them to
    ``unique_pdi_variants.py`` (next to this module) as a paste-ready dict:

    pdi_variants = {
        "pdi_combination_0": {
            "combined_kernel_names": {...},
            "combined_kernel_includes": [...],
        },
        ...
    }

    Args:
        unique_pdi_mapping: Mapping of unique key -> representative combo dict.
        duplicate_pdi_groups: Mapping of unique key -> original combination IDs
            that collapsed onto that key.

    Returns:
        None. The function prints to stdout and overwrites the output file.
    """
    pruned_unique_map, removed_to_kept = prune_by_kernel_superset(unique_pdi_mapping)

    # Start every surviving combo with its own duplicate-group IDs.
    final_groups: Dict[str, List[str]] = {
        k: list(duplicate_pdi_groups.get(k, [])) for k in pruned_unique_map
    }

    # A pruned combo hands its original IDs over to the superset that covers it.
    for removed_key, kept_key in removed_to_kept.items():
        if kept_key in final_groups:
            final_groups[kept_key].extend(duplicate_pdi_groups.get(removed_key, []))

    # Deduplicate IDs and sort
    for k in final_groups:
        final_groups[k] = sorted(set(final_groups[k]))

    # Printing order: by (#kernels desc, #includes desc); ties keep the
    # order of pruned_unique_map because sorted() is stable.
    def sort_key(ukey: str) -> tuple:
        combo = pruned_unique_map[ukey]
        kernels = combo.get("combined_kernel_names", {})
        includes = combo.get("combined_kernel_includes", [])
        return (-len(kernels), -len(includes))

    ordered_keys = sorted(pruned_unique_map, key=sort_key)

    print("\n" + "=" * 120)
    print("FINAL UNIQUE COMBINATIONS (after superset pruning)")
    print(f"Before pruning: {len(unique_pdi_mapping)} | After pruning: {len(pruned_unique_map)}")
    print("=" * 120)

    # Logging
    for idx, ukey in enumerate(ordered_keys, start=1):
        combo = pruned_unique_map[ukey]

        includes = sorted(set(combo.get("combined_kernel_includes", [])))
        kernel_names = sorted(combo.get("combined_kernel_names", {}))
        matched_combo_ids = final_groups.get(ukey, [])

        print("\n" + "-" * 120)
        print(f"[FINAL #{idx}] kernels={len(kernel_names)} includes={len(includes)}")
        print(f"Matches original combos: {matched_combo_ids}")

        print("\nKernel Names:")
        for kn in kernel_names:
            print(f"  - {kn}")

        print("\nIncludes:")
        for inc in includes:
            print(f"  - {inc}")

    print("\n" + "=" * 120)

    def quoted(text: Any) -> str:
        # Emit a double-quoted literal with quotes/backslashes escaped so the
        # generated file stays valid Python even for unusual names or paths.
        return json.dumps(str(text), ensure_ascii=False)

    # Write the pdi_variants dict with deterministic ordering and formatting
    out_path = Path(__file__).resolve().parent / "unique_pdi_variants.py"

    with out_path.open("w", encoding="utf-8") as f:
        f.write('"""\nAuto-generated unique PDI variants (for operators supported on TOT).')
        f.write(f' Generated on {date.today().isoformat()}.\n"""\n\n')
        f.write("pdi_variants = {\n")

        for idx, ukey in enumerate(ordered_keys):
            combo = pruned_unique_map[ukey]

            # "super.hh" is always emitted first (even if the combo did not
            # list it); the remaining includes follow in sorted order.
            includes = set(combo.get("combined_kernel_includes", []))
            includes_sorted = ["super.hh"] + sorted(x for x in includes if x != "super.hh")

            names_dict = combo.get("combined_kernel_names", {})

            f.write(f'    "pdi_combination_{idx}": {{\n')

            f.write('        "combined_kernel_names": {\n')
            for k in sorted(names_dict):
                # repr() keeps non-int IDs (e.g. strings) as valid literals.
                f.write(f'            {quoted(k)}: {names_dict[k]!r},\n')
            f.write("        },\n")

            f.write('        "combined_kernel_includes": [\n')
            for inc in includes_sorted:
                f.write(f'            {quoted(inc)},\n')
            f.write("        ],\n")

            f.write("    },\n")

        f.write("}\n")

    print(f"\nSaved PDI variant dictionary to: {out_path}\n")


def _canon_combo(combo: Dict[str, Any]) -> Tuple[Tuple[str, ...], Tuple[Tuple[str, int], ...]]:
    """
    Convert a combo dict into a canonical (hashable) representation where:
      - combined_kernel_includes order doesn't matter  -> sorted tuple
      - combined_kernel_names order doesn't matter     -> sorted tuple of (name, id)
    """
    includes = combo.get("combined_kernel_includes", [])
    names = combo.get("combined_kernel_names", {})

    canon_includes = tuple(sorted(includes))
    canon_names = tuple(sorted(names.items(), key=lambda x: x[0]))  # sort by kernel name

    return canon_includes, canon_names


def find_unique_combinations(
    pdi_combinations: Dict[str, Dict[str, Any]]
) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, List[str]]]:
    """
    Group combinations that are identical once ordering is ignored.

    Each combo is reduced to its canonical form and serialized to a JSON
    string, which serves as the unique key.

    Returns:
      unique_map:
        { unique_key -> first combo dict seen with that key }

      duplicate_groups:
        { unique_key -> combination_ids sharing that key, in input order }
    """
    representatives: Dict[str, Dict[str, Any]] = {}
    members: Dict[str, List[str]] = {}

    for combo_id, combo in pdi_combinations.items():
        includes_part, names_part = _canon_combo(combo)

        # Tuples become lists so the canonical form is JSON-serializable.
        payload = {
            "includes": list(includes_part),
            "names": [[name, kernel_id] for name, kernel_id in names_part],
        }
        fingerprint = json.dumps(payload, sort_keys=True)

        # The first combo seen for a fingerprint stays the representative.
        representatives.setdefault(fingerprint, combo)

        if fingerprint in members:
            members[fingerprint].append(combo_id)
        else:
            members[fingerprint] = [combo_id]

    return representatives, members


if __name__ == "__main__":
    # Sample input used when the module is run as a script: 22 combinations
    # ("combination_0" .. "combination_21"). Each entry lists the include files
    # and a kernel-name -> kernel-ID mapping. Several entries repeat the same
    # includes/kernels (some with a different ordering), which is what the
    # deduplication below collapses.
    combinations = {
        "combination_0": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp', 'dq/dq.hpp',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'dq/dq_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x8': 4,
                                      'run_gemm_int16x16_transpose': 16, 'run_bdcastadd_16': 17, 'run_softmax_fp16x16': 6,
                                      'run_lut_fp16x16': 20, 'run_dequant': 10}
            },
        "combination_1": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp', 'dq/dq.hpp',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'dq/dq_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x8': 4, 'run_gemm_int16x16_transpose': 16,
                                      'run_bdcastadd_16': 17, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20, 'run_dequant': 10}
            },
        "combination_2": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'conv_qdq_int16x8/conv_qdq_a16w8_wrapper.cc',
                                         'dq/dq.hpp', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp', 'groupnorm/norm.cc', 'dq/dq_wrapper.cc',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_conv_qdq_a16w8': 15, 'run_lut_fp16x16': 20, 'run_gemm_int16x8': 4, 'run_bdcastadd_16': 17,
                                      'run_group_norm_qdq': 22, 'run_dequant': 10, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x16_transpose': 16,
                                      'run_softmax_fp16x16': 6, 'run_bdcastmul_16': 18}
            },
        "combination_3": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'conv_qdq_int16x8/conv_qdq_a16w8_wrapper.cc', 'dq/dq.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp', 'groupnorm/norm.cc', 'dq/dq_wrapper.cc',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_conv_qdq_a16w8': 15, 'run_lut_fp16x16': 20, 'run_gemm_int16x8': 4, 'run_bdcastadd_16': 17,
                                      'run_group_norm_qdq': 22, 'run_dequant': 10, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x16_transpose': 16,
                                      'run_softmax_fp16x16': 6, 'run_bdcastmul_16': 18}
            },
        "combination_4": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'dq/dq.hpp', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc',
                                         'conv_qdq_int16x8/conv_qdq_a16w8_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'groupnorm/norm.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc', 'dq/dq_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_lut_fp16x16': 20, 'run_conv_qdq_a16w8': 15, 'run_gemm_int16x8': 4, 'run_group_norm_qdq': 22,
                                      'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6,
                                      'run_bdcastmul_16': 18, 'run_dequant': 10}
            },
        "combination_5": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_gemm_int16x8': 4,
                                      'run_bdcastadd_16': 17, 'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            },
        "combination_6": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_gemm_int16x8': 4,
                                      'run_bdcastadd_16': 17, 'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            },
        "combination_7": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp',
                                         'dq/dq_impl.hpp', 'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'dq/dq.hpp', 'dq/dq_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc',
                                         'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x8': 4, 'run_dequant': 10,
                                      'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20, 'run_l2norm_fp16x16': 7}
            },
        "combination_8": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'dq/dq.hpp',
                                         'dq/dq_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc',
                                         'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x8': 4, 'run_dequant': 10,
                                      'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20, 'run_l2norm_fp16x16': 7}
            },
        "combination_9": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'dq/dq.hpp',
                                         'dq/dq_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc',
                                         'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x8': 4, 'run_dequant': 10,
                                      'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20, 'run_l2norm_fp16x16': 7}
            },
        "combination_10": {
            "combined_kernel_includes": ['super.hh', 'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'q/q.hpp', 'q/q_wrapper.cc',
                                         'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'dq/dq.hpp', 'dq/dq_wrapper.cc',
                                         'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc', 'softmax_fp16x16/softmax_fp16x16_wrapper.cc',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc'],
            "combined_kernel_names": {'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14, 'run_quant': 11, 'run_gemm_int16x8': 4, 'run_dequant': 10,
                                      'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20}
            },
        "combination_11": {
            "combined_kernel_includes": ['super.hh', 'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'q/q.hpp', 'q/q_wrapper.cc',
                                         'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'dq/dq.hpp', 'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc',
                                         'dq/dq_wrapper.cc'],
            "combined_kernel_names": {'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14, 'run_quant': 11, 'run_gemm_int16x8': 4,
                                      'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20, 'run_dequant': 10}
            },
        "combination_12": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'dq/dq.hpp', 'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc',
                                         'dq/dq_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_gemm_int16x8': 4, 'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14,
                                      'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20, 'run_dequant': 10}
            },
        "combination_13": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc', 'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp',
                                         'dq/dq_impl.hpp', 'dq/dq.hpp', 'dq/dq_wrapper.cc', 'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x8': 4, 'run_bdcastadd_16': 17, 'run_dequant': 10,
                                      'run_gemm_int16x16_transpose': 16, 'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18, 'run_softmax_fp16x16': 6,
                                      'run_l2norm_fp16x16': 7}
            },
        "combination_14": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc', 'dq/dq.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'dq/dq_wrapper.cc', 'broadcast/run_bdcastdiv_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_bdcastadd_16': 17, 'run_layernorm_fp16x16': 14, 'run_gemm_int16x8': 4,
                                      'run_gemm_int16x16_transpose': 16, 'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18, 'run_softmax_fp16x16': 6,
                                      'run_dequant': 10, 'run_bdcastdiv_16': 23}
            },
        "combination_15": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'q/q_wrapper.cc', 'conv_qdq_int16x8/conv_qdq_a16w8_wrapper.cc',
                                         'layer_norm_fp16x16/layer_norm_fp16x16_wrapper.cc', 'broadcast/run_bdcastadd_wrapper.cc',
                                         'q/q_impl.hpp', 'dq/dq_impl.hpp', 'gemm_qdq_int16x8/gemm_int16x8_wrapper.cc',
                                         'gemm_qdq_int16x16_transpose/gemm_int16x16_transpose_wrapper.cc', 'dq/dq.hpp',
                                         'softmax_fp16x16/softmax_fp16x16_wrapper.cc', 'linear_approx_bf16/linear_approx_bf16_wrapper.cc',
                                         'dq/dq_wrapper.cc', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc'],
            "combined_kernel_names": {'run_quant': 11, 'run_conv_qdq_a16w8': 15, 'run_layernorm_fp16x16': 14, 'run_bdcastadd_16': 17,
                                      'run_gemm_int16x8': 4, 'run_gemm_int16x16_transpose': 16, 'run_softmax_fp16x16': 6, 'run_lut_fp16x16': 20,
                                      'run_dequant': 10, 'run_l2norm_fp16x16': 7}
            },
        "combination_16": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'q/q_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_quant': 11, 'run_bdcastadd_16': 17,
                                      'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            },
        "combination_17": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'q/q_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_quant': 11, 'run_bdcastadd_16': 17,
                                      'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            },
        "combination_18": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'q/q_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_quant': 11, 'run_bdcastadd_16': 17,
                                      'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            },
        "combination_19": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'q/q_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_quant': 11, 'run_bdcastadd_16': 17,
                                      'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            },
        "combination_20": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'q/q_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_quant': 11, 'run_bdcastadd_16': 17,
                                      'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            },
        "combination_21": {
            "combined_kernel_includes": ['super.hh', 'q/q.hpp', 'dq/dq.hpp', 'l2norm_fp16x16/l2norm_fp16x16_wrapper.cc',
                                         'gemm_qdq_int16x4/gemm_int16x4_wrapper.cc', 'dq/dq_wrapper.cc', 'q/q_wrapper.cc',
                                         'broadcast/run_bdcastadd_wrapper.cc', 'q/q_impl.hpp', 'dq/dq_impl.hpp',
                                         'linear_approx_bf16/linear_approx_bf16_wrapper.cc', 'broadcast/run_bdcastmul_wrapper.cc'],
            "combined_kernel_names": {'run_l2norm_fp16x16': 7, 'run_gemm_int16x4': 5, 'run_dequant': 10, 'run_quant': 11, 'run_bdcastadd_16': 17,
                                      'run_lut_fp16x16': 20, 'run_bdcastmul_16': 18}
            }
        }

    # Step 1: collapse combinations that are identical up to ordering.
    unique_map, duplicate_groups = find_unique_combinations(combinations)
    # Step 2: prune kernel-set subsets, print the report, and write
    # unique_pdi_variants.py next to this module.
    print_final_unique_combinations(unique_map, duplicate_groups)
