#pragma once
#include <fstream>
#include <iostream>
#include <common.hpp>
#include <nlohmann/json.hpp>

namespace waic_runtime_aie4 {
	using json = nlohmann::json;

	/*
	 * Load and parse the JSON document stored in `node_file`.
	 *
	 * If the file cannot be opened, "File not found : <path>" is written to
	 * stdout and the process exits with status 1. Parse failures are reported
	 * by json::parse itself.
	 *
	 * On Windows the file is opened through wfopen_long (declared elsewhere;
	 * presumably a long-path aware fopen wrapper -- confirm) and read into a
	 * string before parsing. On other platforms the ifstream is handed to the
	 * parser directly.
	 */
	static json read_json(const std::string& node_file) {
#ifdef _WIN32
		FILE* f = wfopen_long(node_file, L"rb");
		if (!f) {
			std::cout << "File not found : " << node_file << std::endl;
			std::exit(1);
		}

		// Determine the file size; n stays negative if seeking/telling fails.
		long n = -1;
		if (std::fseek(f, 0, SEEK_END) == 0) {
			n = std::ftell(f);
		}

		std::string s;
		if (n > 0 && std::fseek(f, 0, SEEK_SET) == 0) {
			s.resize(static_cast<size_t>(n));
			// fread may deliver fewer bytes than requested; keep only what
			// was actually read so the parser never sees zero padding.
			const size_t got = std::fread(s.data(), 1, s.size(), f);
			s.resize(got);
		}
		std::fclose(f);

		return json::parse(s);

#else
		std::ifstream f(node_file);
		if (!f.is_open()) {
			std::cout << "File not found : " << node_file << std::endl;
			exit(1);
		}
		// The stream is closed by its destructor once parsing is done.
		return json::parse(f);
#endif
	}

	// Sanitise a node name in place: every '/', '.', '#' and ':' becomes '_'.
	inline void replace_symbols(std::string& node_name) {
		for (char& ch : node_name) {
			switch (ch) {
			case '/':
			case '.':
			case '#':
			case ':':
				ch = '_';
				break;
			default:
				break;
			}
		}
	}

	inline std::vector<json>
		get_scale_zp_vector(const std::string& data_folder,
			std::string& node_name) {
		replace_symbols(node_name);
		std::string node_file = (std::filesystem::path(data_folder) / node_name / "graph_params.json").string();

		json json_data = read_json(node_file);
		std::vector<json> scale_zp;


		if (json_data.contains("qdq") && json_data["qdq"].is_array()) {
			for (const auto& item : json_data["qdq"]) {
				if (!item.empty()) {
					auto it = item.begin();
					scale_zp.push_back(it.value());
				}
			}
		}

		return scale_zp;
	}

  // Rewrites every element of `ws` in place from its two 4-bit halves.
  // The low nibble is sign-extended (values 8..15 have 16 subtracted), the
  // high nibble is shifted back up, and the two are OR-ed into one byte.
  // NOTE(review): once the low nibble has been sign-extended its upper bits
  // are all ones, so the OR yields 0xF0 | low for such elements and the
  // original high nibble is lost -- confirm this is the intended packing.
  template <typename T>
  inline void unpack_wgt_values_int4(std::vector<T>& ws) {
    for (T& packed : ws) {
      const T hi_nibble = (packed >> 4) & 0x0F;
      T lo_nibble = packed & 0x0F;
      if ((lo_nibble & 0x8) != 0) {
        lo_nibble = lo_nibble - 16;
      }
      const uint8_t merged = (hi_nibble << 4) | lo_nibble;
      packed = static_cast<T>(merged);
    }
  }

	/*
	 * Reshape the flat buffer `ws` into a row-major rows x cols matrix, where
	 * rows = shape[0] and cols = shape[1].
	 *
	 * When wgt_type is "int4" the (by-value) copy of `ws` is first passed
	 * through unpack_wgt_values_int4.
	 *
	 * Defensive behaviour:
	 *  - a shape with fewer than two entries yields an empty matrix;
	 *  - non-positive dimensions are treated as 0;
	 *  - at most rows * cols elements are copied, so an over-long `ws` can no
	 *    longer write past the end of the result; cells not covered by `ws`
	 *    stay value-initialised.
	 */
	template <typename T>
	std::vector<std::vector<T>> fold2D(std::vector<T> ws,
		const std::vector<int64_t>& shape, std::string wgt_type="int8") {
		if (wgt_type == "int4") {
			unpack_wgt_values_int4(ws);
		}
		if (shape.size() < 2) {
			return {};
		}

		const size_t rows = shape[0] > 0 ? static_cast<size_t>(shape[0]) : 0;
		const size_t cols = shape[1] > 0 ? static_cast<size_t>(shape[1]) : 0;
		std::vector<std::vector<T>> ret(rows, std::vector<T>(cols));
		if (cols == 0) {
			// Nothing can be placed; also avoids dividing by zero below.
			return ret;
		}

		const size_t capacity = rows * cols;
		const size_t count = ws.size() < capacity ? ws.size() : capacity;
		for (size_t i = 0; i < count; ++i) {
			ret[i / cols][i % cols] = ws[i];
		}

		return ret;
	}

	/*
	 * QDQ coefficients for a matmul with per-tensor quantised weights and no
	 * bias. With K = weights.size() and N = weights[0].size():
	 *
	 *   c2    = (a_sc * w_sc) / q_sc
	 *   C0[n] = (-a_zp * c2 * sum_k weights[k][n] + q_zp)
	 *           + (-c2 * w_zp) * (-a_zp * K)
	 *   C1[n] = -c2 * w_zp
	 *   C2[n] = c2
	 *
	 * Returns (C0, C1, C2), each of length N. Empty `weights` yields three
	 * empty vectors. The column count is taken from the first row; shorter
	 * rows contribute 0 to the missing columns and extra entries are ignored.
	 * Tb is not used here; it stays in the template list so existing explicit
	 * instantiations keep compiling.
	 */
	template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
		calculate_matmul_qdq_params_no_bias(
			const std::vector<std::vector<Tw>>& weights, float a_sc, uint16_t a_zp,
			float w_sc, uint16_t w_zp, float q_sc, uint16_t q_zp) {
		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp_i64 = static_cast<int64_t>(a_zp);
		const int64_t w_zp_i64 = static_cast<int64_t>(w_zp);
		const int64_t q_zp_i64 = static_cast<int64_t>(q_zp);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		const float c2_coeff = (a_sc * w_sc) / q_sc;
		const float c3_coeff_scale = -c2_coeff * static_cast<float>(w_zp_i64);
		const float c3_coeff_offset =
			static_cast<float>(-a_zp_i64) * static_cast<float>(in_ch);
		const float temp = c3_coeff_scale * c3_coeff_offset;

		std::vector<float> C0(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const float c1 =
				static_cast<float>(-a_zp_i64) * c2_coeff * static_cast<float>(col_sum[n]) +
				static_cast<float>(q_zp_i64);
			C0[n] = c1 + temp;
		}

		// C1 and C2 are the same value for every output channel.
		std::vector<float> C1(out_ch, c3_coeff_scale);
		std::vector<float> C2(out_ch, c2_coeff);

		return std::make_tuple(C0, C1, C2);
	}

	/*
	 * QDQ coefficients for a matmul with per-tensor quantised weights and a
	 * quantised bias. With K = weights.size() and N = weights[0].size():
	 *
	 *   c2    = (a_dq_xscale * w_dq_xscale) / a_q_yscale
	 *   c4    = b_dq_xscale / a_q_yscale
	 *   C0[n] = -a_zp * c2 * sum_k weights[k][n] + a_q_yzero_pt
	 *           + (bias[n] - b_dq_xzero_pt) * c4
	 *           + (-c2 * w_zp) * (-a_zp * K)
	 *   C1[n] = -c2 * w_zp
	 *   C2[n] = c2
	 *
	 * Returns (C0, C1, C2), each of length N. Empty `weights` yields three
	 * empty vectors. The column count is taken from the first row; shorter
	 * rows contribute 0 to the missing columns and extra entries are ignored.
	 *
	 * `bias` must hold at least N entries; a shorter bias makes bias.at()
	 * throw std::out_of_range instead of reading out of bounds.
	 */
	template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
		dq_uint16A_uint8W_bias_matmul_q_param_gen(
			float a_dq_xscale, uint16_t a_dq_xzero_pt,
			const std::vector<std::vector<Tw>>& weights, float w_dq_xscale,
			uint16_t w_dq_xzero_pt, const std::vector<Tb>& bias, float b_dq_xscale,
			uint16_t b_dq_xzero_pt, float a_q_yscale, uint16_t a_q_yzero_pt) {

		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp = static_cast<int64_t>(a_dq_xzero_pt);
		const int64_t w_zp = static_cast<int64_t>(w_dq_xzero_pt);
		const int64_t b_zp = static_cast<int64_t>(b_dq_xzero_pt);
		const int64_t y_zp = static_cast<int64_t>(a_q_yzero_pt);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		const float c2_coeff = (a_dq_xscale * w_dq_xscale) / a_q_yscale;
		const float c4_coeff = b_dq_xscale / a_q_yscale;
		const float c3_coeff_scale = -c2_coeff * static_cast<float>(w_zp);
		const float c3_coeff_offset =
			static_cast<float>(-a_zp) * static_cast<float>(in_ch);
		const float temp = c3_coeff_scale * c3_coeff_offset;

		std::vector<float> C0(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const int64_t bias_min_zp = static_cast<int64_t>(bias.at(n)) - b_zp;
			const float c1 =
				static_cast<float>(-a_zp) * c2_coeff * static_cast<float>(col_sum[n]) +
				static_cast<float>(y_zp) +
				static_cast<float>(bias_min_zp) * c4_coeff;
			C0[n] = c1 + temp;
		}

		// C1 and C2 are the same value for every output channel.
		std::vector<float> C1(out_ch, c3_coeff_scale);
		std::vector<float> C2(out_ch, c2_coeff);

		return std::make_tuple(C0, C1, C2);
	}

	/*
	 * QDQ coefficients for a matmul whose weight and bias quantisation
	 * parameters are given per output channel. With K = weights.size() and
	 * N = weights[0].size():
	 *
	 *   c2[n] = (a_dq_xscale * w_dq_xscale[n]) / a_q_yscale
	 *   c4[n] = b_dq_xscale[n] / a_q_yscale
	 *   C0[n] = -a_zp * c2[n] * sum_k weights[k][n] + a_q_yzero_pt
	 *           + B[n] + (-c2[n] * w_zp[n]) * (-a_zp * K)
	 *   C1[n] = -c2[n] * w_zp[n]
	 *   where B[n] = (bias[n] - b_dq_xzero_pt[n]) * c4[n] when bias is
	 *   non-empty, and B[n] = c4[n] when bias is empty.
	 *   NOTE(review): adding c4[n] itself for an empty bias is what the code
	 *   has always done -- confirm that this is intended.
	 *
	 * Returns (C0, C1, c2). C0 and C1 have length N; c2 has length
	 * w_dq_xscale.size(). Empty `weights` gives N = 0. The column count is
	 * taken from the first row; shorter rows contribute 0 to the missing
	 * columns and extra entries are ignored.
	 *
	 * Every per-channel vector that is read must hold at least N entries; a
	 * shorter one makes .at() throw std::out_of_range instead of reading out
	 * of bounds.
	 */
	template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
		dq_uint16A_int8W_bias_matmul_q_param_gen_chwise(
			float a_dq_xscale, uint16_t a_dq_xzero_pt,
			const std::vector<std::vector<Tw>>& weights,
			const std::vector<float> w_dq_xscale,
			const std::vector<uint16_t> w_dq_xzero_pt, const std::vector<Tb>& bias,
			const std::vector<float> b_dq_xscale,
			const std::vector<uint16_t> b_dq_xzero_pt,
			float a_q_yscale, uint16_t a_q_yzero_pt) {

		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp = static_cast<int64_t>(a_dq_xzero_pt);
		const int64_t y_zp = static_cast<int64_t>(a_q_yzero_pt);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		// c2 is returned as-is, so it keeps the length of w_dq_xscale.
		std::vector<float> c2_coeff(w_dq_xscale.size());
		for (size_t n = 0; n < w_dq_xscale.size(); ++n) {
			c2_coeff[n] = (a_dq_xscale * w_dq_xscale[n]) / a_q_yscale;
		}

		const float c3_coeff_offset =
			static_cast<float>(-a_zp) * static_cast<float>(in_ch);
		const bool has_bias = !bias.empty();

		std::vector<float> C0(out_ch);
		std::vector<float> C1(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const float c2 = c2_coeff.at(n);
			const float c4 = b_dq_xscale.at(n) / a_q_yscale;

			float acc = static_cast<float>(-a_zp) * c2 * static_cast<float>(col_sum[n]);
			acc += static_cast<float>(y_zp);
			if (has_bias) {
				const int64_t bias_min_zp = static_cast<int64_t>(bias.at(n)) -
					static_cast<int64_t>(b_dq_xzero_pt.at(n));
				acc += static_cast<float>(bias_min_zp) * c4;
			} else {
				acc += c4;
			}

			C1[n] = -c2 * static_cast<float>(static_cast<int64_t>(w_dq_xzero_pt.at(n)));
			acc += C1[n] * c3_coeff_offset;
			C0[n] = acc;
		}

		return std::make_tuple(C0, C1, c2_coeff);
	}

	/*
	 * QDQ coefficients for a matmul with per-channel weight scale / zero
	 * point and no bias. With K = weights.size() and N = weights[0].size():
	 *
	 *   c2[n] = (a_sc * w_sc[n]) / q_sc
	 *   C0[n] = -a_zp * c2[n] * sum_k weights[k][n] + q_zp
	 *           + (-c2[n] * w_zp[n]) * (-a_zp * K)
	 *   C1[n] = -c2[n] * w_zp[n]
	 *   C2[n] = c2[n]
	 *
	 * Returns (C0, C1, C2), each of length N. Empty `weights` yields three
	 * empty vectors. The column count is taken from the first row; shorter
	 * rows contribute 0 to the missing columns and extra entries are ignored.
	 *
	 * The correction term (-c2 * w_zp) * (-a_zp * K) is kept in float, as in
	 * the sibling generators in this file; it used to be stored in an int64_t,
	 * which silently dropped its fractional part.
	 *
	 * w_sc and w_zp must hold at least N entries; a shorter vector makes
	 * .at() throw std::out_of_range instead of reading out of bounds.
	 * Tb is not used here; it stays in the template list so existing explicit
	 * instantiations keep compiling.
	 */
	template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
		calculate_matmul_qdq_params_uint16_int4(
			const std::vector<std::vector<Tw>>& weights, float a_sc, uint16_t a_zp,
			const std::vector<float> w_sc, std::vector<uint16_t> w_zp, float q_sc,
			uint16_t q_zp) {
		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp_i64 = static_cast<int64_t>(a_zp);
		const int64_t q_zp_i64 = static_cast<int64_t>(q_zp);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		const float c3_coeff_offset =
			static_cast<float>(-a_zp_i64) * static_cast<float>(in_ch);

		std::vector<float> C0(out_ch);
		std::vector<float> C1(out_ch);
		std::vector<float> C2(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const float c2 = (a_sc * w_sc.at(n)) / q_sc;
			const float c1 =
				static_cast<float>(-a_zp_i64) * c2 * static_cast<float>(col_sum[n]) +
				static_cast<float>(q_zp_i64);
			const float c3_scale =
				-c2 * static_cast<float>(static_cast<int64_t>(w_zp.at(n)));
			const float temp = c3_scale * c3_coeff_offset;

			C0[n] = c1 + temp;
			C1[n] = c3_scale;
			C2[n] = c2;
		}

		return std::make_tuple(C0, C1, C2);
	}

        /*
         * QDQ coefficients for a matmul with per-channel weight scale / zero
         * point and a per-tensor quantised bias. With K = weights.size() and
         * N = weights[0].size():
         *
         *   c2[n] = (a_dq_xscale * w_dq_xscale[n]) / a_q_yscale
         *   c4    = b_dq_xscale / a_q_yscale
         *   C0[n] = -a_zp * c2[n] * sum_k weights[k][n]
         *           + (a_q_yzero_pt + (bias[n] - b_dq_xzero_pt) * c4)
         *           + (-c2[n] * w_zp[n]) * (-a_zp * K)
         *   C1[n] = -c2[n] * w_zp[n]
         *
         * Returns (C0, C1, c2). C0 and C1 have length N; c2 has length
         * w_dq_xscale.size(). Empty `weights` gives N = 0. The column count
         * is taken from the first row; shorter rows contribute 0 to the
         * missing columns and extra entries are ignored.
         *
         * bias, w_dq_xscale and w_dq_xzero_pt must hold at least N entries; a
         * shorter vector makes .at() throw std::out_of_range instead of
         * reading out of bounds.
         */
        template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
		dq_uint16A_int4W_bias_matmul_q_param_gen(
			float a_dq_xscale, uint16_t a_dq_xzero_pt,
			const std::vector<std::vector<Tw>>& weights,
			const std::vector<float> w_dq_xscale,
			const std::vector<uint16_t> w_dq_xzero_pt, const std::vector<Tb>& bias,
			float b_dq_xscale, uint16_t b_dq_xzero_pt, float a_q_yscale,
			uint16_t a_q_yzero_pt) {

		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp = static_cast<int64_t>(a_dq_xzero_pt);
		const int64_t b_zp = static_cast<int64_t>(b_dq_xzero_pt);
		const int64_t y_zp = static_cast<int64_t>(a_q_yzero_pt);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		// c2 is returned as-is, so it keeps the length of w_dq_xscale.
		std::vector<float> c2_coeff(w_dq_xscale.size());
		for (size_t n = 0; n < w_dq_xscale.size(); ++n) {
			c2_coeff[n] = (a_dq_xscale * w_dq_xscale[n]) / a_q_yscale;
		}
		const float c4_coeff = b_dq_xscale / a_q_yscale;

		// The offset is formed as an integer product before conversion.
		const float c3_coeff_offset = static_cast<float>(-a_zp * in_ch);

		std::vector<float> C0(out_ch);
		std::vector<float> C1(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const float c2 = c2_coeff.at(n);
			const int64_t bias_min_zp = static_cast<int64_t>(bias.at(n)) - b_zp;

			const float weight_term =
				static_cast<float>(-a_zp) * c2 * static_cast<float>(col_sum[n]);
			const float bias_term =
				static_cast<float>(y_zp) + static_cast<float>(bias_min_zp) * c4_coeff;

			C1[n] = -c2 * static_cast<float>(static_cast<int64_t>(w_dq_xzero_pt.at(n)));
			const float c1 = weight_term + bias_term;
			C0[n] = c1 + C1[n] * c3_coeff_offset;
		}

		return std::make_tuple(C0, C1, c2_coeff);
	}

	/*
	 * Permute a 4-D weight tensor from [KX][KY][CI][CO] element order (`in`)
	 * to [CO][CI][KY][KX] element order (`out`):
	 *
	 *   out[((l * CI + k) * KY + j) * KX + i] = in[((i * KY + j) * CI + k) * CO + l]
	 *
	 * `in` must point to at least KX * KY * CI * CO elements. `out` is grown
	 * to that many elements when it is smaller; it is never shrunk and any
	 * elements beyond that count are left untouched.
	 */
	template <typename T>
	void transpose_conv_wgt(const T* in,
							std::vector<T> &out,
							size_t KX,
							size_t KY,
							size_t CI,
							size_t CO)
	{
		const size_t total = KX * KY * CI * CO;
		if (total == 0) {
			return;
		}
		if (out.size() < total) {
			out.resize(total);
		}

		// Walk the destination in storage order so writes are sequential.
		size_t dst = 0;
		for (size_t l = 0; l < CO; ++l) {
			for (size_t k = 0; k < CI; ++k) {
				for (size_t j = 0; j < KY; ++j) {
					for (size_t i = 0; i < KX; ++i) {
						out[dst++] = in[((i * KY + j) * CI + k) * CO + l];
					}
				}
			}
		}
	}

	/*
	 * QDQ coefficients for a convolution whose weight and bias quantisation
	 * parameters are given per output channel (Sw, Zw, Sb, Zb are vectors).
	 * With K = weights.size() and N = weights[0].size():
	 *
	 *   c2[n] = (sa * sw[n]) / so
	 *   c4[n] = sb[n] / so
	 *   C0[n] = -za * c2[n] * sum_k weights[k][n] + zo
	 *           + B[n] + (-c2[n] * zw[n]) * (-za * K)
	 *   C1[n] = -c2[n] * zw[n]
	 *   where B[n] = (bias[n] - zb[n]) * c4[n] when bias is non-empty, and
	 *   B[n] = c4[n] when bias is empty.
	 *   NOTE(review): adding c4[n] itself for an empty bias is what the code
	 *   has always done -- confirm that this is intended.
	 *
	 * Returns (C0, C1, c2). C0 and C1 have length N; c2 has length
	 * w_dq_xscale.size(). Empty `weights` gives N = 0. The column count is
	 * taken from the first row; shorter rows contribute 0 to the missing
	 * columns and extra entries are ignored.
	 *
	 * Every per-channel vector that is read must hold at least N entries; a
	 * shorter one makes .at() throw std::out_of_range instead of reading out
	 * of bounds.
	 */
	template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
	dq_uint16A_int8W_bias_conv_q_param_gen_chwise(
			float a_dq_xscale, uint16_t a_dq_xzero_pt,
			const std::vector<std::vector<Tw>>& weights,
			const std::vector<float> w_dq_xscale,
			const std::vector<uint16_t> w_dq_xzero_pt, const std::vector<Tb>& bias,
			const std::vector<float> b_dq_xscale,
			const std::vector<uint16_t> b_dq_xzero_pt,
			float a_q_yscale, uint16_t a_q_yzero_pt) {

		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp = static_cast<int64_t>(a_dq_xzero_pt);
		const int64_t y_zp = static_cast<int64_t>(a_q_yzero_pt);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		// c2 is returned as-is, so it keeps the length of w_dq_xscale.
		std::vector<float> c2_coeff(w_dq_xscale.size());
		for (size_t n = 0; n < w_dq_xscale.size(); ++n) {
			c2_coeff[n] = (a_dq_xscale * w_dq_xscale[n]) / a_q_yscale;
		}

		const float c3_coeff_offset =
			static_cast<float>(-a_zp) * static_cast<float>(in_ch);
		const bool has_bias = !bias.empty();

		std::vector<float> C0(out_ch);
		std::vector<float> C1(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const float c2 = c2_coeff.at(n);
			const float c4 = b_dq_xscale.at(n) / a_q_yscale;

			float acc = static_cast<float>(-a_zp) * c2 * static_cast<float>(col_sum[n]);
			acc += static_cast<float>(y_zp);
			if (has_bias) {
				const int64_t bias_min_zp = static_cast<int64_t>(bias.at(n)) -
					static_cast<int64_t>(b_dq_xzero_pt.at(n));
				acc += static_cast<float>(bias_min_zp) * c4;
			} else {
				acc += c4;
			}

			C1[n] = -c2 * static_cast<float>(static_cast<int64_t>(w_dq_xzero_pt.at(n)));
			acc += C1[n] * c3_coeff_offset;
			C0[n] = acc;
		}

		return std::make_tuple(C0, C1, c2_coeff);
	}

	/*
	 * Scalar QDQ coefficients for an activation x activation matmul.
	 * With c2 = (a_dq_xscale * w_dq_xscale) / a_q_yscale the returned tuple is
	 *
	 *   get<0> = a_q_yzero_pt + (a_dq_xzero_pt * w_dq_xzero_pt) * (in_ch_dim * c2)
	 *   get<1> = -c2 * w_dq_xzero_pt
	 *   get<2> = c2
	 *   get<3> = -a_dq_xzero_pt * c2
	 */
	inline std::tuple<float, float, float, float>
        qdq_act_matmul_uint16_uint16_cstm(float a_dq_xscale, int64_t a_dq_xzero_pt,
        int64_t in_ch_dim, float w_dq_xscale,
        int64_t w_dq_xzero_pt, float a_q_yscale,
        int64_t a_q_yzero_pt) {
      const float scale_ratio = (a_dq_xscale * w_dq_xscale) / a_q_yscale;

      // The zero-point product is formed in integers before conversion.
      const float zp_product = static_cast<float>(a_dq_xzero_pt * w_dq_xzero_pt);
      const float scaled_depth = static_cast<float>(in_ch_dim) * scale_ratio;

      const float out0 = static_cast<float>(a_q_yzero_pt) + zp_product * scaled_depth;
      const float out1 = -scale_ratio * static_cast<float>(w_dq_xzero_pt);
      const float out2 = scale_ratio;
      const float out3 = static_cast<float>(-a_dq_xzero_pt) * scale_ratio;

      return std::make_tuple(out0, out1, out2, out3);
    }
	/*
	 * QDQ coefficients for a convolution with per-tensor quantised weights
	 * and a quantised bias. With K = weights.size() and N = weights[0].size():
	 *
	 *   c2    = (sa * sw) / so
	 *   c4    = sb / so
	 *   C0[n] = -za * c2 * sum_k weights[k][n] + zo + (bias[n] - zb) * c4
	 *           + (-c2 * zw) * (-za * K)
	 *   C1[n] = -c2 * zw
	 *   C2[n] = c2
	 *
	 * Returns (C0, C1, C2), each of length N. Empty `weights` yields three
	 * empty vectors. The column count is taken from the first row; shorter
	 * rows contribute 0 to the missing columns and extra entries are ignored.
	 *
	 * `bias` must hold at least N entries; a shorter bias makes bias.at()
	 * throw std::out_of_range instead of reading out of bounds.
	 */
	template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
	dq_uint16A_uint8W_bias_conv_q_param_gen(
			float a_dq_xscale, uint16_t a_dq_xzero_pt,
			const std::vector<std::vector<Tw>>& weights, float w_dq_xscale,
			uint16_t w_dq_xzero_pt, const std::vector<Tb>& bias, float b_dq_xscale,
			uint16_t b_dq_xzero_pt, float a_q_yscale, uint16_t a_q_yzero_pt)
	{
		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp = static_cast<int64_t>(a_dq_xzero_pt);
		const int64_t w_zp = static_cast<int64_t>(w_dq_xzero_pt);
		const int64_t b_zp = static_cast<int64_t>(b_dq_xzero_pt);
		const int64_t y_zp = static_cast<int64_t>(a_q_yzero_pt);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		const float c2_coeff = (a_dq_xscale * w_dq_xscale) / a_q_yscale;
		const float c4_coeff = b_dq_xscale / a_q_yscale;
		const float c3_coeff_scale = -c2_coeff * static_cast<float>(w_zp);
		const float c3_coeff_offset =
			static_cast<float>(-a_zp) * static_cast<float>(in_ch);
		const float temp = c3_coeff_scale * c3_coeff_offset;

		std::vector<float> C0(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const int64_t bias_min_zp = static_cast<int64_t>(bias.at(n)) - b_zp;
			const float c1 =
				static_cast<float>(-a_zp) * c2_coeff * static_cast<float>(col_sum[n]) +
				static_cast<float>(y_zp) +
				static_cast<float>(bias_min_zp) * c4_coeff;
			C0[n] = c1 + temp;
		}

		// C1 and C2 are the same value for every output channel.
		std::vector<float> C1(out_ch, c3_coeff_scale);
		std::vector<float> C2(out_ch, c2_coeff);

		return std::make_tuple(C0, C1, C2);
	}

	/*
	 * QDQ coefficients for a convolution with per-tensor quantised weights
	 * and no bias. With K = weights.size() and N = weights[0].size():
	 *
	 *   c2    = (sa * sw) / so
	 *   C0[n] = -za * c2 * sum_k weights[k][n] + zo + (-c2 * zw) * (-za * K)
	 *   C1[n] = -c2 * zw
	 *   C2[n] = c2
	 *
	 * Returns (C0, C1, C2), each of length N. Empty `weights` yields three
	 * empty vectors. The column count is taken from the first row; shorter
	 * rows contribute 0 to the missing columns and extra entries are ignored.
	 * Tb is not used here; it stays in the template list so existing explicit
	 * instantiations keep compiling.
	 */
	template <typename Tw, typename Tb>
	std::tuple<std::vector<float>, std::vector<float>, std::vector<float>>
	calculate_conv_qdq_params_no_bias(
			const std::vector<std::vector<Tw>>& weights, float a_sc, uint16_t a_zp,
			float w_sc, uint16_t w_zp, float q_sc, uint16_t q_zp)
	{
		const size_t out_ch = weights.empty() ? 0 : weights[0].size();
		const int64_t in_ch = static_cast<int64_t>(weights.size());
		const int64_t a_zp_i64 = static_cast<int64_t>(a_zp);
		const int64_t w_zp_i64 = static_cast<int64_t>(w_zp);
		const int64_t q_zp_i64 = static_cast<int64_t>(q_zp);

		// Column sums in 64-bit integers, accumulated row by row so the
		// matrix is walked in memory order and never copied.
		std::vector<int64_t> col_sum(out_ch, 0);
		for (const auto& row : weights) {
			const size_t usable = row.size() < out_ch ? row.size() : out_ch;
			for (size_t n = 0; n < usable; ++n) {
				col_sum[n] += static_cast<int64_t>(row[n]);
			}
		}

		const float c2_coeff = (a_sc * w_sc) / q_sc;
		const float c3_coeff_scale = -c2_coeff * static_cast<float>(w_zp_i64);
		const float c3_coeff_offset =
			static_cast<float>(-a_zp_i64) * static_cast<float>(in_ch);
		const float temp = c3_coeff_scale * c3_coeff_offset;

		std::vector<float> C0(out_ch);
		for (size_t n = 0; n < out_ch; ++n) {
			const float c1 =
				static_cast<float>(-a_zp_i64) * c2_coeff * static_cast<float>(col_sum[n]) +
				static_cast<float>(q_zp_i64);
			C0[n] = c1 + temp;
		}

		// C1 and C2 are the same value for every output channel.
		std::vector<float> C1(out_ch, c3_coeff_scale);
		std::vector<float> C2(out_ch, c2_coeff);

		return std::make_tuple(C0, C1, C2);
	}

	/*
	 * Dequantise the first `num_channels` elements of in_buf into out_buf as
	 * (in_buf[i] - zp[*]) * scale[*].
	 *
	 * Selection of zero point / scale per element:
	 *  - scale.size() == 1           -> zp[0], scale[0]
	 *  - zp.size() == scale.size()   -> zp[i], scale[i]
	 *  - otherwise                   -> zp[0], scale[i]
	 *
	 * When TDQ is two bytes wide the float result is converted with
	 * float32_to_float16 (if __IS_QDQ_FP16__ is set) or float_to_bfloat16
	 * otherwise; for any other TDQ the float is assigned directly.
	 *
	 * A non-positive num_channels is a no-op (a negative count used to be
	 * converted to a huge unsigned loop bound). in_buf, out_buf and the
	 * selected zp/scale entries must already hold at least num_channels
	 * elements; this is not checked here.
	 */
	template <typename TI, typename TDQ>
	void dequantize_data(
		const std::vector<TI>& in_buf,
		std::vector<int64_t>& zp,
		std::vector<float>& scale,
		int num_channels,
		std::vector<TDQ>& out_buf)
	{
		if (num_channels <= 0) {
			return;
		}
		const size_t count = static_cast<size_t>(num_channels);

		// The per-channel / per-tensor decision does not depend on i.
		const bool scale_per_channel = scale.size() != 1;
		const bool zp_per_channel = scale_per_channel && zp.size() == scale.size();

		// Perform the calculation (input - zp) * scale
		for (size_t i = 0; i < count; ++i) {
			const int64_t zero_point = zp[zp_per_channel ? i : 0];
			const float step = scale[scale_per_channel ? i : 0];
			const float out_dq = (static_cast<int32_t>(in_buf[i]) - zero_point) * step;

			//
			// A 16-bit TDQ stores the value as fp16 or bfloat16 bits
			//
			if constexpr (sizeof(TDQ) == sizeof(uint16_t))
			{
#if (__IS_QDQ_FP16__)
				out_buf[i] = float32_to_float16(out_dq);
#else
				out_buf[i] = float_to_bfloat16(out_dq).value;
#endif
			}
			else
			{
				out_buf[i] = out_dq;
			}
		}
	}

	/*
	 * Read the quantised scale (gamma) and bias (beta) binaries of a node and
	 * dequantise them into gamma_dq / beta_dq.
	 *
	 * File names under <const_path>/<node_name>/ (node_name after
	 * replace_symbols):
	 *   function == "layernorm" -> Scale.bin and B.bin
	 *   function == "groupnorm" -> mul_B.bin and add_B.bin
	 * For any other `function` both paths are left empty.
	 *
	 * gamma_size / beta_size are in-out: on entry the number of elements to
	 * request, on exit the value returned by read_bin_file divided by the
	 * element size. Note that b_dq_s / b_dq_zp are taken by value while the
	 * s_dq_* counterparts are references.
	 */
	template <typename TG, typename TB, typename TGDQ, typename TBDQ>
	inline void read_scale_bias_bins(std::string function,
									 std::string const_path,
									 std::string node_name,
									 std::vector<float>& s_dq_s,
									 std::vector<int64_t>& s_dq_zp,
									 std::vector<float> b_dq_s,
									 std::vector<int64_t> b_dq_zp,
									 std::vector<TGDQ>& gamma_dq,
									 uint32_t& gamma_size,
									 std::vector<TBDQ>& beta_dq,
									 uint32_t& beta_size 
									 )
	{
		replace_symbols(node_name);

		// Pick the two file names for the requested normalisation flavour.
		const char* scale_leaf = nullptr;
		const char* bias_leaf = nullptr;
		if (function == "layernorm") {
			scale_leaf = "/Scale.bin";
			bias_leaf = "/B.bin";
		} else if (function == "groupnorm") {
			scale_leaf = "/mul_B.bin";
			bias_leaf = "/add_B.bin";
		}

		std::filesystem::path node_scale_file;
		std::filesystem::path node_bias_file;
		if (scale_leaf != nullptr) {
			const std::string node_dir = const_path + "/" + node_name;
			node_scale_file = node_dir + scale_leaf;
			node_bias_file = node_dir + bias_leaf;
		}

		// Staging buffers for the raw (still quantised) values.
		std::vector<TG> gamma(gamma_size);
		std::vector<TB> beta(beta_size);

		// Read gamma (scale), then beta (bias); each size is converted from
		// the value returned by read_bin_file to an element count.
		gamma_size = read_bin_file(node_scale_file, reinterpret_cast<char *>(gamma.data()), gamma_size * sizeof(TG));
		gamma_size = gamma_size / sizeof(TG);
		beta_size = read_bin_file(node_bias_file, reinterpret_cast<char *>(beta.data()), beta_size * sizeof(TB));
		beta_size = beta_size / sizeof(TB);

		// Dequantise both buffers into the caller-provided outputs.
		dequantize_data<TG, TGDQ>(gamma, s_dq_zp, s_dq_s, gamma_size, gamma_dq);
		dequantize_data<TB, TBDQ>(beta, b_dq_zp, b_dq_s, beta_size, beta_dq);
	}
}
