Source code for quark.onnx.quantization.config.algorithm

#
# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: MIT
#
"""Quark Quantization Algorithm Config API for ONNX"""

from abc import ABC
from typing import Any

from quark.shares.utils.log import ScreenLogger

logger = ScreenLogger(__name__)


class AlgoConfig(ABC):

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        raise NotImplementedError()

class SmoothQuantConfig(AlgoConfig):
    """Configuration for the Smooth Quant algorithm, which was originally proposed in the following paper:
    "Guangxuan Xiao et al., SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models, arXiv:2211.10438, 2022."

    SmoothQuant is a PTQ algorithm designed to reduce the accuracy drop when quantizing large language models (LLMs),
    especially for transformer architectures. It tackles one of the key issues in activation quantization: the
    mismatch in dynamic ranges between weights and activations across different layers. The core idea is to smooth
    out the activation and weight ranges by inserting a scaling factor that shifts some of the variation in
    activations into the weights.

    SmoothQuant requires only a small set of calibration data and no model retraining. By aligning the quantization
    ranges, it minimizes information loss in layers like attention or MLP, leading to much better accuracy retention.
    It has proven particularly effective for large models such as OPT, BLOOM, and GPT-like architectures under INT8
    quantization.

    :param float alpha: A parameter in SmoothQuant that controls the trade-off between shifting activation range into
        weights and preserving the original distribution, enabling optimal balancing for quantization accuracy.
        Defaults to 0.5.
    """

    def __init__(self, alpha: float = 0.5):
        self.name: str = "smooth_quant"
        self.alpha = alpha

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        smooth_quant_config: dict[str, Any] = dict()
        if "SmoothAlpha" not in extra_options:
            smooth_quant_config["SmoothAlpha"] = self.alpha
        return smooth_quant_config
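
A minimal usage sketch (illustration only, not part of the module): the internal ``_get_config`` helper above emits defaults only for keys the user has not already set in ``extra_options``.

# Illustration only, not part of the module.
sq = SmoothQuantConfig(alpha=0.7)
print(sq._get_config({}))                      # {'SmoothAlpha': 0.7}
print(sq._get_config({"SmoothAlpha": 0.3}))    # {} -- the user's value takes precedence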

class CLEConfig(AlgoConfig):
    """Configuration for the CLE algorithm, which was originally proposed in the following paper:
    "Markus Nagel et al., Data-Free Quantization Through Weight Equalization and Bias Correction, arXiv:1906.04721, 2019."

    CLE (Cross-Layer Equalization) is a pre-processing technique used in PTQ that improves the quantization robustness
    of deep neural networks by reducing the range imbalance across layers. It operates by scaling the weights of
    adjacent layers in such a way that their output distributions become more uniform, minimizing the dynamic range
    mismatch that often causes quantization errors.

    The core idea behind CLE is that certain operations (like ReLU activations) are scale-invariant, meaning you can
    scale the output of one layer and inversely scale the next without affecting the final output. CLE leverages this
    property to propagate scale adjustments across consecutive layers, typically convolutional or linear layers
    followed by batch norm or ReLU.

    CLE does not require retraining, and it is particularly effective when applied to networks that have large
    layer-wise scale imbalances. By smoothing out these differences before quantization, CLE helps preserve accuracy
    and stabilizes quantized inference in a lightweight, calibration-only pipeline.

    :param str cle_balance_method: The balance method of CLE. Defaults to "max".
    :param int cle_steps: The number of steps for CrossLayerEqualization execution. When set to -1, an adaptive
        CrossLayerEqualization is conducted. Defaults to 1.
    :param float cle_weight_threshold: The threshold of the scale of the weights when calculating them.
        Defaults to 0.5.
    :param bool cle_scale_append_bias: Whether the bias should be included when calculating the scale of the weights.
        Defaults to True.
    :param bool cle_scale_use_threshold: Whether to use the threshold when calculating the scale of the weights.
        Defaults to True.
    :param float cle_total_layer_diff_threshold: The threshold representing the sum of mean CrossLayerEqualization
        transformations across all layers. Defaults to 1.9e-7.
    """

    def __init__(
        self,
        cle_balance_method: str = "max",
        cle_steps: int = 1,
        cle_weight_threshold: float = 0.5,
        cle_scale_append_bias: bool = True,
        cle_scale_use_threshold: bool = True,
        cle_total_layer_diff_threshold: float = 1.9e-7,
    ) -> None:
        self.name: str = "cle"
        self.cle_balance_method = cle_balance_method
        self.cle_steps = cle_steps
        self.cle_weight_threshold = cle_weight_threshold
        self.cle_scale_append_bias = cle_scale_append_bias
        self.cle_scale_use_threshold = cle_scale_use_threshold
        self.cle_total_layer_diff_threshold = cle_total_layer_diff_threshold

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        cle_config: dict[str, Any] = dict()
        if "CLEBalanceMethod" not in extra_options:
            cle_config["CLEBalanceMethod"] = self.cle_balance_method
        if "CLESteps" not in extra_options:
            cle_config["CLESteps"] = self.cle_steps
        if "CLEWeightThreshold" not in extra_options:
            cle_config["CLEWeightThreshold"] = self.cle_weight_threshold
        if "CLEScaleAppendBias" not in extra_options:
            cle_config["CLEScaleAppendBias"] = self.cle_scale_append_bias
        if "CLEScaleUseThreshold" not in extra_options:
            cle_config["CLEScaleUseThreshold"] = self.cle_scale_use_threshold
        if "CLETotalLayerDiffThreshold" not in extra_options:
            cle_config["CLETotalLayerDiffThreshold"] = self.cle_total_layer_diff_threshold
        return cle_config
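
A usage sketch (illustration only): any CLE option already present in ``extra_options`` is kept, and the remaining ``CLE*`` keys are filled from the config's defaults or constructor arguments.

# Illustration only, not part of the module.
cle = CLEConfig(cle_weight_threshold=0.4)
opts = {"CLESteps": 3}            # user-provided value; _get_config will not override it
opts.update(cle._get_config(opts))
# opts -> {'CLESteps': 3, 'CLEBalanceMethod': 'max', 'CLEWeightThreshold': 0.4,
#          'CLEScaleAppendBias': True, 'CLEScaleUseThreshold': True,
#          'CLETotalLayerDiffThreshold': 1.9e-07}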

class BiasCorrectionConfig(AlgoConfig):
    """Configuration for the Bias Correction algorithm, which was originally proposed in the following paper:
    "Markus Nagel et al., Data-Free Quantization Through Weight Equalization and Bias Correction, arXiv:1906.04721, 2019."

    Bias Correction is a PTQ technique designed to reduce the quantization-induced shift in a neural network's output
    by adjusting the bias terms in layers like convolution or linear. It computes the difference (bias error) between
    the original float model and the quantized model outputs using a small calibration dataset. It then adjusts the
    biases of the affected layers so that the quantized model better matches the float model's behavior, particularly
    at the layer output level.

    This method is simple, data-efficient (requiring no retraining), and effective at improving accuracy, especially
    for models that are sensitive to quantization noise, such as those with small activations or low-bit quantization
    like INT8.
    """

    def __init__(self) -> None:
        self.name: str = "bias_correction"

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        bias_correction_config: dict[str, Any] = dict()
        bias_correction_config["BiasCorrection"] = True
        return bias_correction_config
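
A usage sketch (illustration only): Bias Correction has no tunable parameters, so its config simply enables the flag.

# Illustration only, not part of the module.
bc = BiasCorrectionConfig()
print(bc._get_config({}))   # {'BiasCorrection': True}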

class GPTQConfig(AlgoConfig):
    """Configuration for the GPTQ algorithm, which was originally proposed in the following paper:
    "Elias Frantar et al., GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers, arXiv:2210.17323, 2022."

    GPTQ is an efficient PTQ algorithm for compressing LLMs. It quantizes weights layer-by-layer and column-by-column
    within each layer. Crucially, when quantizing one column, it calculates the error and updates subsequent
    unquantized columns using an approximate Hessian matrix to minimize output distortion. This error correction step
    preserves accuracy far better than simple rounding. The result is near-original model accuracy at ultra-low
    precision (e.g., 4-bit) with fast, single-GPU quantization. This makes GPTQ a key technique for efficient LLM
    deployment.

    :param int bits: The quantization bits used in GPTQ. Defaults to 8.
    :param int block_size: The block size in GPTQ determines how many columns of weights are quantized in one update.
        Defaults to 128.
    :param int group_size: The group size in GPTQ determines how many columns of weights share one set of scale and
        zero-point. Defaults to -1.
    :param float perc_damp: Percent of the average Hessian diagonal to use for dampening. Defaults to 0.01.
    :param bool act_order: Whether to re-order the Hessian matrix according to its diagonal values. Defaults to False.
    :param bool per_channel: Whether to perform per-channel quantization in GPTQ. Defaults to False.
    :param bool mse: Whether to use the MSE method for data calibration in GPTQ. Defaults to False.
    :param bool weight_symmetric: Whether to use symmetric quantization for the weights. Defaults to True.
    """

    def __init__(
        self,
        bits: int = 8,
        block_size: int = 128,
        group_size: int = -1,
        perc_damp: float = 0.01,
        act_order: bool = False,
        per_channel: bool = False,
        mse: bool = False,
        weight_symmetric: bool = True,
    ) -> None:
        self.name: str = "gptq"
        self.bits = bits
        self.block_size = block_size
        self.group_size = group_size
        self.perc_damp = perc_damp
        self.act_order = act_order
        self.per_channel = per_channel
        self.mse = mse
        self.weight_symmetric = weight_symmetric

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        gptq_config: dict[str, Any] = dict()
        gptq_config["UseGPTQ"] = True
        gptq_config["GPTQParams"] = {}
        if "GPTQParams" not in extra_options:
            extra_options["GPTQParams"] = {}
        if "Bits" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["Bits"] = self.bits
        if "BlockSize" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["BlockSize"] = self.block_size
        if "PercDamp" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["PercDamp"] = self.perc_damp
        if "GroupSize" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["GroupSize"] = self.group_size
        if "ActOrder" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["ActOrder"] = self.act_order
        if "PerChannel" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["PerChannel"] = self.per_channel
        if "WeightSymmetric" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["WeightSymmetric"] = self.weight_symmetric
        if "MSE" not in extra_options["GPTQParams"]:
            gptq_config["GPTQParams"]["MSE"] = self.mse
        return gptq_config
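
A usage sketch (illustration only): GPTQ options are grouped under the nested ``GPTQParams`` key, and any nested key the user has already supplied is left untouched.

# Illustration only, not part of the module.
gptq = GPTQConfig(bits=4, group_size=128, act_order=True)
opts = {"GPTQParams": {"PercDamp": 0.05}}      # user overrides one nested key
cfg = gptq._get_config(opts)
# cfg -> {'UseGPTQ': True,
#         'GPTQParams': {'Bits': 4, 'BlockSize': 128, 'GroupSize': 128, 'ActOrder': True,
#                        'PerChannel': False, 'WeightSymmetric': True, 'MSE': False}}
# PercDamp is not emitted here because the user's value in opts takes precedence.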

class AutoMixprecisionConfig(AlgoConfig):
    """Configuration for automatic mixed precision.

    Mixed precision is a highly effective technique in the field of quantization. When low-bit quantization leads to
    poor accuracy, quantizing part of the tensors or layers with a higher bit-width can often significantly improve
    the overall quantization accuracy. Automatic mixed-precision algorithms can automatically identify tensors or
    layers that suffer from low-bit quantization errors and replace them with higher-bit quantization, thereby
    enhancing the final model performance.

    :param int data_size: The size of the data used for mixed precision. Defaults to 10000000.
    :param Tuple[str, ...] target_op_type: The user-defined op type set for mixed precision. Defaults to
        ("Conv", "ConvTranspose", "Gemm", "MatMul").
    :param QuantType target_quant_type: Activation data type to be mixed into the model if 'act_target_quant_type' is
        not given. An error will be raised if 'target_quant_type', 'act_target_quant_type' and
        'weight_target_quant_type' are not given.
    :param QuantType act_target_quant_type: Activation data type to be mixed into the model. If neither
        'act_target_quant_type' nor 'weight_target_quant_type' is specified, 'act_target_quant_type' will be the same
        as 'target_quant_type'. If only 'act_target_quant_type' is not specified, it will be the original
        activation_type.
    :param QuantType weight_target_quant_type: Weight data type to be mixed into the model. If neither
        'act_target_quant_type' nor 'weight_target_quant_type' is specified, 'weight_target_quant_type' will be the
        same as 'target_quant_type'. If only 'weight_target_quant_type' is not specified, it will be the original
        weight_type.
    :param QuantType bias_target_quant_type: Bias data type to be mixed into the model. If 'bias_target_quant_type' is
        not specified and Int32Bias is True, 'bias_target_quant_type' will be int32. If 'bias_target_quant_type' is
        not specified and Int32Bias is False, 'bias_target_quant_type' will be the same as
        'weight_target_quant_type'.
    :param bool dual_quant_nodes: Some backend compilers require that two types of quantization nodes exist
        simultaneously on tensors connecting two nodes of different precision; for example, they require the tensor
        connecting a BFP16 Conv and a BF16 Reshape to have both a BFP node and a QDQ pair. Defaults to False.
    :param int output_index: The index of the model output used to calculate the loss. Defaults to 0.
    :param float l2_target: The L2 metric used as a target. Defaults to 0.5.
    :param Optional[float] top1_acc_target: The Top-1 accuracy used as a target. Defaults to None.
    :param Any evaluate_function: The function used to measure the Top-1 accuracy loss. The input of the function is
        the model output (numpy tensor) and its output is the Top-1 accuracy (between 0 and 1). An error will be
        raised if 'evaluate_function' is not specified while 'top1_acc_target' is given.
    :param int num_target: The number of nodes for mixed precision to minimize the loss. Defaults to 0.
    :param List[str] target_tensors: The names of nodes to mix into the target quant type. Defaults to [].
    :param List[str] target_indices: The indices (based on sensitivity analysis results) of the nodes to mix into the
        target quant type. Defaults to [].
    :param List[str] exclude_indices: The indices (based on sensitivity analysis results) of the nodes not to mix into
        the target quant type. Defaults to [].
    :param bool no_input_qdq_shared: Whether to skip nodes that share the input Q/DQ pair with other nodes.
        Defaults to True.
    :param bool auto_mix_use_fast_ft: Whether to perform fast finetuning to improve accuracy after mixing a layer.
        Defaults to False.
    """

    def __init__(
        self,
        data_size: int = 10000000,
        target_op_type: tuple[str, ...] = ("Conv", "ConvTranspose", "Gemm", "MatMul"),
        target_quant_type: Any = None,
        act_target_quant_type: Any = None,
        weight_target_quant_type: Any = None,
        bias_target_quant_type: Any = None,
        dual_quant_nodes: bool = False,
        output_index: int = 0,
        l2_target: float = 0.5,
        top1_acc_target: float | None = None,
        evaluate_function: Any = None,
        num_target: int = 0,
        target_tensors: list[str] = [],
        target_indices: list[Any] = [],
        exclude_indices: list[Any] = [],
        no_input_qdq_shared: bool = True,
        auto_mix_use_fast_ft: bool = False,
    ) -> None:
        self.name: str = "auto_mixprecision"
        self.data_size = data_size
        self.target_op_type = target_op_type
        self.target_quant_type = target_quant_type
        self.act_target_quant_type = act_target_quant_type
        self.weight_target_quant_type = weight_target_quant_type
        self.bias_target_quant_type = bias_target_quant_type
        self.dual_quant_nodes = dual_quant_nodes
        self.output_index = output_index
        self.l2_target = l2_target
        self.top1_acc_target = top1_acc_target
        self.evaluate_function = evaluate_function
        self.num_target = num_target
        self.target_tensors = target_tensors
        self.target_indices = target_indices
        self.exclude_indices = exclude_indices
        self.no_input_qdq_shared = no_input_qdq_shared
        self.auto_mix_use_fast_ft = auto_mix_use_fast_ft

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        auto_mixprecision_config: dict[str, Any] = dict()
        auto_mixprecision_config["AutoMixprecision"] = {}
        if "AutoMixprecision" not in extra_options:
            extra_options["AutoMixprecision"] = {}
        if "DataSize" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["DataSize"] = self.data_size
        if "TargetOpType" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["TargetOpType"] = self.target_op_type
        if "TargetQuantType" not in extra_options["AutoMixprecision"]:
            if self.target_quant_type is not None:
                auto_mixprecision_config["AutoMixprecision"]["TargetQuantType"] = self.target_quant_type.map_onnx_format
            else:
                auto_mixprecision_config["AutoMixprecision"]["TargetQuantType"] = self.target_quant_type
        if "ActTargetQuantType" not in extra_options["AutoMixprecision"]:
            if self.act_target_quant_type is not None:
                auto_mixprecision_config["AutoMixprecision"]["ActTargetQuantType"] = (
                    self.act_target_quant_type.map_onnx_format
                )
            else:
                auto_mixprecision_config["AutoMixprecision"]["ActTargetQuantType"] = self.act_target_quant_type
        if "WeightTargetQuantType" not in extra_options["AutoMixprecision"]:
            if self.weight_target_quant_type is not None:
                auto_mixprecision_config["AutoMixprecision"]["WeightTargetQuantType"] = (
                    self.weight_target_quant_type.map_onnx_format
                )
            else:
                auto_mixprecision_config["AutoMixprecision"]["WeightTargetQuantType"] = self.weight_target_quant_type
        if "BiasTargetQuantType" not in extra_options["AutoMixprecision"]:
            if self.bias_target_quant_type is not None:
                auto_mixprecision_config["AutoMixprecision"]["BiasTargetQuantType"] = (
                    self.bias_target_quant_type.map_onnx_format
                )
            else:
                auto_mixprecision_config["AutoMixprecision"]["BiasTargetQuantType"] = self.bias_target_quant_type
        if "DualQuantNodes" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["DualQuantNodes"] = self.dual_quant_nodes
        if "OutputIndex" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["OutputIndex"] = self.output_index
        if "L2Target" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["L2Target"] = self.l2_target
        if "Top1AccTarget" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["Top1AccTarget"] = self.top1_acc_target
        if "EvaluateFunction" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["EvaluateFunction"] = self.evaluate_function
        if "NumTarget" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["NumTarget"] = self.num_target
        if "TargetTensors" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["TargetTensors"] = self.target_tensors
        if "TargetIndices" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["TargetIndices"] = self.target_indices
        if "ExcludeIndices" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["ExcludeIndices"] = self.exclude_indices
        if "NoInputQDQShared" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["NoInputQDQShared"] = self.no_input_qdq_shared
        if "AutoMixUseFastFT" not in extra_options["AutoMixprecision"]:
            auto_mixprecision_config["AutoMixprecision"]["AutoMixUseFastFT"] = self.auto_mix_use_fast_ft
        return auto_mixprecision_config

class AdaRoundConfig(AlgoConfig):
    """Configuration for the AdaRound algorithm, which was originally proposed in the following paper:
    "Markus Nagel et al., Up or Down? Adaptive Rounding for Post-Training Quantization, arXiv:2004.10568, 2020."

    AdaRound (Adaptive Rounding) is a post-training quantization method that aims to mitigate the accuracy degradation
    caused by rounding during quantization. Traditional quantization methods often use a simple rounding scheme
    (e.g., round-to-nearest) to convert floating-point values to their quantized integer representation. This can
    lead to a significant loss of information, especially in deep neural networks.

    AdaRound addresses this by treating the rounding decision as a learnable parameter. Instead of deterministically
    rounding up or down, it introduces a soft rounding function and optimizes the rounding direction for each weight.
    The optimization is performed using a limited amount of unlabeled data (calibration data) to minimize the
    difference between the floating-point model's output and the quantized model's output.

    The objective function typically includes a reconstruction loss term to minimize the L2 distance between the
    original and quantized weight tensors, and a regularization term that encourages the soft rounding parameters to
    converge to either 0 or 1, corresponding to rounding down or up, respectively. The key idea behind AdaRound is to
    find the optimal rounding decisions for each weight, such that the overall model's performance is preserved after
    quantization.

    :param str optim_device: The device for optimization. Defaults to "cpu".
    :param str infer_device: The device for inference. Defaults to "cpu".
    :param int fixed_seed: A fixed seed for reproducibility. Defaults to 1705472343.
    :param int data_size: The total size of the dataset. Defaults to 1000000000.
    :param int batch_size: The batch size for optimization. Defaults to 1.
    :param int num_batches: The number of batches for optimization. Defaults to 1.
    :param int num_iterations: The number of optimization iterations. Defaults to 1000.
    :param float learning_rate: The learning rate for optimization. Defaults to 1e-1.
    :param bool early_stop: Whether to use early stopping. Defaults to False.
    :param int output_index: The index of the model's output to use for loss calculation. Defaults to 0.
    :param Optional[Tuple[float, float]] lr_adjust: Learning rate adjustment parameters. Defaults to None.
    :param List[str] target_op_type: List of operator types to be quantized. Defaults to
        ["Conv", "ConvTranspose", "Gemm", "MatMul", "InstanceNormalization", "LayerNormalization"].
    :param bool selective_update: Whether to selectively update weights. Defaults to False.
    :param bool update_bias: Whether to update the bias terms. Defaults to False.
    :param bool output_qdq: Whether to output QDQ format. Defaults to False.
    :param float drop_ratio: The ratio of weights to drop. Defaults to 1.0.
    :param int mem_opt_level: Memory optimization level. Defaults to 1.
    :param Optional[str] cache_dir: Directory for caching. Defaults to None.
    :param int log_period: Logging period. Defaults to 100.
    :param Optional[str] ref_model_path: Path to the reference model. Defaults to None.
    :param bool dynamic_batch: Whether to use dynamic batching. Defaults to False.
    :param bool parallel: Whether to use parallel processing. Defaults to False.
    :param float reg_param: The regularization parameter for the rounding loss. This controls the trade-off between
        minimizing the reconstruction error and forcing the rounding parameters to be binary. Defaults to 0.01.
    :param Tuple[float, float] beta_range: The range of the temperature parameter 'beta', which controls the sharpness
        of the soft rounding function. It is annealed from the first value to the second value over the course of
        optimization. A high 'beta' at the beginning allows for more exploration, while a low 'beta' at the end
        encourages convergence to a binary solution. Defaults to (20, 2).
    :param float warm_start: The fraction of total iterations for the "warm start" phase. During this phase, only the
        reconstruction loss is used, and the regularization term is gradually introduced. This helps to find a good
        initial state before forcing the rounding decisions to be binary. Defaults to 0.2.
    :param bool select_max_mem_layer: Whether to select the layer with the largest estimated memory usage to run.
    :param int num_workers: Number of subprocesses used for data loading.

        - 0 means the data will be loaded in the main process.
        - >0 enables multi-process data loading, which can significantly speed up the data pipeline when the dataset
          and transforms are heavy. Note: using multiple workers increases CPU usage and may require careful handling
          of worker-safe code.

    :param bool pin_memory: If True, the DataLoader will copy tensors into CUDA pinned memory before returning them.
    """

    def __init__(
        self,
        optim_device: str = "cpu",
        infer_device: str = "cpu",
        fixed_seed: int = 1705472343,
        data_size: int = 1000000000,
        batch_size: int = 1,
        num_batches: int = 1,
        num_iterations: int = 1000,
        learning_rate: float = 1e-1,
        early_stop: bool = False,
        output_index: int = 0,
        lr_adjust: tuple[float, float] | None = None,
        target_op_type: list[str] = [
            "Conv",
            "ConvTranspose",
            "Gemm",
            "MatMul",
            "InstanceNormalization",
            "LayerNormalization",
        ],
        selective_update: bool = False,
        update_bias: bool = False,
        output_qdq: bool = False,
        drop_ratio: float = 1.0,
        mem_opt_level: int = 1,
        cache_dir: str | None = None,
        log_period: int = 100,
        ref_model_path: str | None = None,
        dynamic_batch: bool = False,
        parallel: bool = False,
        reg_param: float = 0.01,
        beta_range: tuple[float, float] = (20, 2),
        warm_start: float = 0.2,
        select_max_mem_layer: bool = False,
        num_workers: int = 1,
        pin_memory: bool = False,
    ) -> None:
        self.name: str = "adaround"
        self.optim_device = optim_device
        self.infer_device = infer_device
        self.fixed_seed = fixed_seed
        self.data_size = data_size
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.num_iterations = num_iterations
        self.learning_rate = learning_rate
        self.early_stop = early_stop
        self.output_index = output_index
        self.lr_adjust = lr_adjust
        self.target_op_type = target_op_type
        self.selective_update = selective_update
        self.update_bias = update_bias
        self.output_qdq = output_qdq
        self.drop_ratio = drop_ratio
        self.mem_opt_level = mem_opt_level
        self.cache_dir = cache_dir
        self.log_period = log_period
        self.ref_model_path = ref_model_path
        self.dynamic_batch = dynamic_batch
        self.parallel = parallel
        self.reg_param = reg_param
        self.beta_range = beta_range
        self.warm_start = warm_start
        self.select_max_mem_layer = select_max_mem_layer
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        adaround_config: dict[str, Any] = dict()
        adaround_config["FastFinetune"] = {}
        if "FastFinetune" not in extra_options:
            extra_options["FastFinetune"] = {}
        if "OptimAlgorithm" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["OptimAlgorithm"] = self.name
        if "OptimDevice" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["OptimDevice"] = self.optim_device
        if "InferDevice" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["InferDevice"] = self.infer_device
        if "FixedSeed" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["FixedSeed"] = self.fixed_seed
        if "DataSize" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["DataSize"] = self.data_size
        if "BatchSize" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["BatchSize"] = self.batch_size
        if "NumBatches" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["NumBatches"] = self.num_batches
        if "NumIterations" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["NumIterations"] = self.num_iterations
        if "LearningRate" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["LearningRate"] = self.learning_rate
        if "EarlyStop" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["EarlyStop"] = self.early_stop
        if "LRAdjust" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["LRAdjust"] = self.lr_adjust
        if "TargetOpType" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["TargetOpType"] = self.target_op_type
        if "SelectiveUpdate" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["SelectiveUpdate"] = self.selective_update
        if "UpdateBias" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["UpdateBias"] = self.update_bias
        if "OutputQDQ" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["OutputQDQ"] = self.output_qdq
        if "DropRatio" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["DropRatio"] = self.drop_ratio
        if "MemOptLevel" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["MemOptLevel"] = self.mem_opt_level
        if "CacheDir" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["CacheDir"] = self.cache_dir
        if "LogPeriod" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["LogPeriod"] = self.log_period
        if "SelectMaxMemLayer" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["SelectMaxMemLayer"] = self.select_max_mem_layer
        if "NumWorkers" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["NumWorkers"] = self.num_workers
        if "PinMemory" not in extra_options["FastFinetune"]:
            adaround_config["FastFinetune"]["PinMemory"] = self.pin_memory
        return adaround_config

class AdaQuantConfig(AlgoConfig):
    """Configuration for the AdaQuant algorithm, which was originally proposed in the following paper:
    "Itay Hubara et al., Improving Post Training Neural Quantization: Layer-wise Calibration and Integer Programming, arXiv:2006.10518, 2020."

    AdaQuant (Adaptive Quantization) is a PTQ algorithm that adaptively adjusts quantization parameters based on
    calibration data. Rather than relying on fixed statistics, it performs lightweight optimization to minimize the
    difference between the original and quantized model activations, leading to better accuracy retention.

    The core idea is to minimize loss metrics such as the L2 distance between the original and quantized activation
    distributions. Like AdaRound, AdaQuant does not require labeled data or full retraining, making it suitable for
    deployment-time optimization. Its adaptive nature makes it more robust than static quantization, especially when
    quantizing large or sensitive models.

    :param str optim_device: The device for optimization. Defaults to "cpu".
    :param str infer_device: The device for inference. Defaults to "cpu".
    :param int fixed_seed: A fixed seed for reproducibility. Defaults to 1705472343.
    :param int data_size: The total size of the dataset. Defaults to 1000000000.
    :param int batch_size: The batch size for optimization. Defaults to 1.
    :param int num_batches: The number of batches for optimization. Defaults to 1.
    :param int num_iterations: The number of optimization iterations. Defaults to 3000.
    :param float learning_rate: The learning rate for optimization. Defaults to 1e-5.
    :param bool early_stop: Whether to use early stopping. Defaults to False.
    :param int output_index: The index of the model's output to use for loss calculation. Defaults to 0.
    :param Optional[Tuple[float, float]] lr_adjust: Learning rate adjustment parameters. Defaults to None.
    :param List[str] target_op_type: List of operator types to be quantized. Defaults to
        ["Conv", "ConvTranspose", "Gemm", "MatMul", "InstanceNormalization", "LayerNormalization"].
    :param bool selective_update: Whether to selectively update weights. Defaults to False.
    :param bool update_bias: Whether to update the bias terms. Defaults to False.
    :param bool output_qdq: Whether to output QDQ format. Defaults to False.
    :param float drop_ratio: The ratio of weights to drop. Defaults to 1.0.
    :param int mem_opt_level: Memory optimization level. Defaults to 1.
    :param Optional[str] cache_dir: Directory for caching. Defaults to None.
    :param int log_period: Logging period. Defaults to 100.
    :param Optional[str] ref_model_path: Path to the reference model. Defaults to None.
    :param bool dynamic_batch: Whether to use dynamic batching. Defaults to False.
    :param bool parallel: Whether to use parallel processing. Defaults to False.
    :param float reg_param: The regularization parameter for the rounding loss. This controls the trade-off between
        minimizing the reconstruction error and forcing the rounding parameters to be binary. Defaults to 0.01.
    :param Tuple[float, float] beta_range: The range of the temperature parameter 'beta', which controls the sharpness
        of the soft rounding function. It is annealed from the first value to the second value over the course of
        optimization. A high 'beta' at the beginning allows for more exploration, while a low 'beta' at the end
        encourages convergence to a binary solution. Defaults to (20, 2).
    :param float warm_start: The fraction of total iterations for the "warm start" phase. During this phase, only the
        reconstruction loss is used, and the regularization term is gradually introduced. This helps to find a good
        initial state before forcing the rounding decisions to be binary. Defaults to 0.2.
    :param bool select_max_mem_layer: Whether to select the layer with the largest estimated memory usage to run.
    :param int num_workers: Number of subprocesses used for data loading.

        - 0 means the data will be loaded in the main process.
        - >0 enables multi-process data loading, which can significantly speed up the data pipeline when the dataset
          and transforms are heavy. Note: using multiple workers increases CPU usage and may require careful handling
          of worker-safe code.

    :param bool pin_memory: If True, the DataLoader will copy tensors into CUDA pinned memory before returning them.
    """

    def __init__(
        self,
        optim_device: str = "cpu",
        infer_device: str = "cpu",
        fixed_seed: int = 1705472343,
        data_size: int = 1000000000,
        batch_size: int = 1,
        num_batches: int = 1,
        num_iterations: int = 3000,
        learning_rate: float = 1e-5,
        early_stop: bool = False,
        output_index: int = 0,
        lr_adjust: tuple[float, float] | None = None,
        target_op_type: list[str] = [
            "Conv",
            "ConvTranspose",
            "Gemm",
            "MatMul",
            "InstanceNormalization",
            "LayerNormalization",
        ],
        selective_update: bool = False,
        update_bias: bool = False,
        output_qdq: bool = False,
        drop_ratio: float = 1.0,
        mem_opt_level: int = 1,
        cache_dir: str | None = None,
        log_period: int = 100,
        ref_model_path: str | None = None,
        dynamic_batch: bool = False,
        parallel: bool = False,
        reg_param: float = 0.01,
        beta_range: tuple[float, float] = (20, 2),
        warm_start: float = 0.2,
        select_max_mem_layer: bool = False,
        num_workers: int = 1,
        pin_memory: bool = False,
    ) -> None:
        self.name: str = "adaquant"
        self.optim_device = optim_device
        self.infer_device = infer_device
        self.fixed_seed = fixed_seed
        self.data_size = data_size
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.num_iterations = num_iterations
        self.learning_rate = learning_rate
        self.early_stop = early_stop
        self.output_index = output_index
        self.lr_adjust = lr_adjust
        self.target_op_type = target_op_type
        self.selective_update = selective_update
        self.update_bias = update_bias
        self.output_qdq = output_qdq
        self.drop_ratio = drop_ratio
        self.mem_opt_level = mem_opt_level
        self.cache_dir = cache_dir
        self.log_period = log_period
        self.ref_model_path = ref_model_path
        self.dynamic_batch = dynamic_batch
        self.parallel = parallel
        self.reg_param = reg_param
        self.beta_range = beta_range
        self.warm_start = warm_start
        self.select_max_mem_layer = select_max_mem_layer
        self.num_workers = num_workers
        self.pin_memory = pin_memory

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        adaquant_config: dict[str, Any] = dict()
        adaquant_config["FastFinetune"] = {}
        if "FastFinetune" not in extra_options:
            extra_options["FastFinetune"] = {}
        if "OptimAlgorithm" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["OptimAlgorithm"] = self.name
        if "OptimDevice" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["OptimDevice"] = self.optim_device
        if "InferDevice" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["InferDevice"] = self.infer_device
        if "FixedSeed" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["FixedSeed"] = self.fixed_seed
        if "DataSize" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["DataSize"] = self.data_size
        if "BatchSize" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["BatchSize"] = self.batch_size
        if "NumBatches" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["NumBatches"] = self.num_batches
        if "NumIterations" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["NumIterations"] = self.num_iterations
        if "LearningRate" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["LearningRate"] = self.learning_rate
        if "EarlyStop" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["EarlyStop"] = self.early_stop
        if "LRAdjust" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["LRAdjust"] = self.lr_adjust
        if "TargetOpType" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["TargetOpType"] = self.target_op_type
        if "SelectiveUpdate" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["SelectiveUpdate"] = self.selective_update
        if "UpdateBias" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["UpdateBias"] = self.update_bias
        if "OutputQDQ" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["OutputQDQ"] = self.output_qdq
        if "DropRatio" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["DropRatio"] = self.drop_ratio
        if "MemOptLevel" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["MemOptLevel"] = self.mem_opt_level
        if "CacheDir" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["CacheDir"] = self.cache_dir
        if "LogPeriod" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["LogPeriod"] = self.log_period
        if "SelectMaxMemLayer" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["SelectMaxMemLayer"] = self.select_max_mem_layer
        if "NumWorkers" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["NumWorkers"] = self.num_workers
        if "PinMemory" not in extra_options["FastFinetune"]:
            adaquant_config["FastFinetune"]["PinMemory"] = self.pin_memory
        return adaquant_config
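
A usage sketch (illustration only): AdaQuant mirrors AdaRound and is also emitted under the ``FastFinetune`` key, differing mainly in its name and defaults (3000 iterations, learning rate 1e-5).

# Illustration only, not part of the module.
aq = AdaQuantConfig()
ft = aq._get_config({})["FastFinetune"]
print(ft["OptimAlgorithm"], ft["NumIterations"], ft["LearningRate"])
# adaquant 3000 1e-05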

class QuarotConfig(AlgoConfig):
    """Configuration for the QuaRot algorithm, which was originally proposed in the following paper:
    "Saleh Ashkboos et al., QuaRot: Outlier-Free 4-Bit Inference in Rotated LLMs, arXiv:2404.00456, 2024."

    QuaRot is a PTQ algorithm that enhances model robustness and accuracy by applying a rotation to the weight
    matrices before quantization. Instead of quantizing weights directly in their original basis, QuaRot learns an
    optimal rotation that aligns the weights with a more quantization-friendly direction. This process reduces the
    quantization error without requiring full retraining.

    The algorithm works by factorizing a rotation matrix (e.g., using SVD or low-rank approximations) and optimizing
    it on unlabeled calibration data. The rotated weights are quantized, and the inverse rotation is fused back
    cleverly so that the final computation remains efficient and accurate.

    By leveraging the structure of the weight distribution and introducing minimal additional overhead, QuaRot
    significantly improves quantization performance, especially in low-bit regimes such as INT4. It is particularly
    effective for transformer-based models or MLPs, where preserving fine-grained relationships between weights is
    crucial for maintaining performance.

    :param int r_matrix_dim: The dimension used when constructing the rotation matrix. Defaults to 4096.
    :param bool use_random_had: If True, the rotation matrix will be generated by the random Hadamard scheme.
        Defaults to False.
    :param Optional[str] r_config_path: The path of the rotation config file. This is necessary when using QuaRot.
        Defaults to None.
    """

    def __init__(
        self, r_matrix_dim: int = 4096, use_random_had: bool = False, r_config_path: str | None = None
    ) -> None:
        self.name: str = "quarot"
        self.r_matrix_dim = r_matrix_dim
        self.use_random_had = use_random_had
        self.r_config_path = r_config_path

    def _get_config(self, extra_options: dict[str, Any]) -> dict[str, Any]:
        quarot_config: dict[str, Any] = dict()
        if "RMatrixDim" not in extra_options:
            quarot_config["RMatrixDim"] = self.r_matrix_dim
        if "UseRandomHad" not in extra_options:
            quarot_config["UseRandomHad"] = self.use_random_had
        if "RConfigPath" not in extra_options:
            quarot_config["RConfigPath"] = self.r_config_path
        return quarot_config
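
A usage sketch (illustration only; the rotation config file name below is hypothetical): QuaRot options are emitted as top-level keys unless already supplied by the user.

# Illustration only, not part of the module; "rotation_config.json" is a hypothetical path.
qr = QuarotConfig(r_matrix_dim=2048, use_random_had=True, r_config_path="rotation_config.json")
print(qr._get_config({}))
# {'RMatrixDim': 2048, 'UseRandomHad': True, 'RConfigPath': 'rotation_config.json'}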

def _algo_flag(algorithms: list[AlgoConfig], algo_config: type[AlgoConfig]) -> bool:
    return any(isinstance(algo, algo_config) for algo in algorithms)


def _resolove_algo_conflict(algorithms: list[AlgoConfig]) -> list[AlgoConfig]:
    new_algorithms = set()
    ada_count = 0
    for algo in algorithms:
        if isinstance(algo, AdaRoundConfig) or isinstance(algo, AdaQuantConfig):
            ada_count += 1
            if ada_count >= 2:
                logger.warning(f"Only one of AdaRound and AdaQuant can be selected. {algo.name} has been removed.")  # type: ignore
                ada_count -= 1
                continue
        new_algorithms.add(algo)
    return list(new_algorithms)
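
A sketch of the conflict-resolution behavior (illustration only; ``_resolove_algo_conflict`` is a private helper): when both AdaRound and AdaQuant configs are supplied, only the first of the two is kept and a warning is logged.

# Illustration only, not part of the module.
algos = [AdaRoundConfig(), AdaQuantConfig(), GPTQConfig()]
resolved = _resolove_algo_conflict(algos)
# The AdaQuant entry is dropped with a warning; AdaRound and GPTQ remain.
print(sorted(a.name for a in resolved))   # ['adaround', 'gptq']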