#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import namedtuple
import torch
from torch import nn
import subprocess
from importlib.util import find_spec
import time
import numpy as np
from copy import deepcopy
from typing import Dict, Callable, Tuple
from torch.utils.data import DataLoader
from torchmetrics.metric import Metric
from bigdl.nano.utils.log4Error import invalidInputError, invalidOperationError
from bigdl.nano.pytorch.amp import BF16Model
from bigdl.nano.deps.openvino.openvino_api import PytorchOpenVINOModel, load_openvino_model
from bigdl.nano.deps.ipex.ipex_api import create_IPEXAccelerator, create_IPEXAccelerator_1_9, \
PytorchIPEXJITModel, PytorchIPEXJITBF16Model, load_ipexjit_model
from bigdl.nano.deps.onnxruntime.onnxruntime_api import PytorchONNXRuntimeModel, \
load_onnxruntime_model
from bigdl.nano.deps.neural_compressor.inc_api import load_inc_model, quantize as inc_quantize
from bigdl.nano.utils.inference.pytorch.model import AcceleratedLightningModule
from bigdl.nano.utils.inference.pytorch.model_utils import get_forward_args, get_input_example
from bigdl.nano.pytorch.utils import TORCH_VERSION_LESS_1_10
import warnings
# Filter out useless UserWarnings
warnings.filterwarnings('ignore', category=UserWarning, module='pytorch_lightning')
warnings.filterwarnings('ignore', category=DeprecationWarning, module='pytorch_lightning')
warnings.filterwarnings('ignore', category=UserWarning, module='torch')
warnings.filterwarnings('ignore', category=DeprecationWarning, module='torch')
import os
os.environ['LOGLEVEL'] = 'ERROR'  # suppress partial output of INC (Intel Neural Compressor)
_whole_acceleration_options = ["inc", "ipex", "onnxruntime", "openvino", "pot",
"bf16", "jit", "channels_last"]
CompareMetric = namedtuple("CompareMetric", ["method_name", "latency", "accuracy"])
class AccelerationOption(object):
    def __init__(self, *args, **kwargs):
        '''
        Initialize an optimization option.
        '''
for option in _whole_acceleration_options:
setattr(self, option, kwargs.get(option, False))
self.method = kwargs.get("method", None)
def get_precision(self):
if self.inc or self.pot:
return "int8"
if self.bf16:
return "bf16"
return "fp32"
def get_accelerator(self):
if self.onnxruntime:
return "onnxruntime"
if self.openvino:
return "openvino"
if self.jit:
return "jit"
return None
# acceleration method combinations; developers may want to register new
# combinations here (see the commented sketch after the table below)
ALL_INFERENCE_ACCELERATION_METHOD = \
{
"original": AccelerationOption(),
"fp32_ipex": AccelerationOption(ipex=True),
"bf16": AccelerationOption(bf16=True),
"bf16_ipex": AccelerationOption(bf16=True, ipex=True),
"int8": AccelerationOption(inc=True),
"jit_fp32": AccelerationOption(jit=True),
"jit_fp32_ipex": AccelerationOption(jit=True, ipex=True),
"jit_fp32_ipex_channels_last": AccelerationOption(jit=True, ipex=True,
channels_last=True),
"openvino_fp32": AccelerationOption(openvino=True),
"openvino_int8": AccelerationOption(openvino=True, pot=True),
"onnxruntime_fp32": AccelerationOption(onnxruntime=True),
"onnxruntime_int8_qlinear": AccelerationOption(onnxruntime=True, inc=True,
method="qlinear"),
"onnxruntime_int8_integer": AccelerationOption(onnxruntime=True, inc=True,
method="integer"),
}
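# A hypothetical registration sketch (commented out; the "jit_fp32_channels_last"
# name and its option combination are illustrative, not part of the shipped table):
# ALL_INFERENCE_ACCELERATION_METHOD["jit_fp32_channels_last"] = \
#     AccelerationOption(jit=True, channels_last=True)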
class InferenceOptimizer:
def __init__(self):
'''
        InferenceOptimizer for Pytorch models.
        It can be used to accelerate an inference pipeline with very few code changes.
'''
# optimized_model_dict handles the optimized model and some metadata
# in {"method_name": {"latency": ..., "accuracy": ..., "model": ...}}
self.optimized_model_dict = {}
self._optimize_result = None
    def optimize(self, model: nn.Module,
training_data: DataLoader,
validation_data: DataLoader = None,
metric: Callable = None,
direction: str = "max",
thread_num: int = None,
logging: bool = False,
latency_sample_num: int = 100) -> None:
        '''
        This function will give all available inference acceleration methods a try
        and record the latency, accuracy and model instance inside the optimizer
        for future usage. All model instances are set to eval mode.

        :param model: An nn.Module to be optimized.
        :param training_data: A pytorch dataloader for the training dataset.
               Users should be careful with this parameter since this dataloader
               might be exposed to the model, which could cause data leakage. The
               batch_size of this dataloader is important as well: users may want
               to set it to the same batch size they intend to use in the real
               deployment environment, e.g. batch size 1 if the accelerated model
               will serve an online service.
        :param validation_data: (optional) A pytorch dataloader for accuracy evaluation.
               This is only needed when users care about the possible accuracy drop.
        :param metric: (optional) A callable object which takes prediction and target
               and returns an accuracy value when called as `metric(pred, target)`.
        :param direction: (optional) A string that indicates whether higher or lower
               is better for the metric: "min" for the lower the better and "max" for
               the higher the better. Default value is "max".
        :param thread_num: (optional) An int representing how many threads (cores)
               to use for inference.
        :param logging: Whether to log detailed information of model conversion.
               Default: False.
        :param latency_sample_num: (optional) An int representing the number of
               repetitions used to calculate the average latency. The default value is 100.
        '''
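        # A minimal usage sketch (model/dataloader/metric names are illustrative,
        # not part of this module):
        #     opt = InferenceOptimizer()
        #     opt.optimize(my_model, training_data=train_loader,
        #                  validation_data=val_loader, metric=my_metric)
        #     opt.summary()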
# TODO: may support accuracy_criterion
# check if model is a nn.Module or inherited from a nn.Module
        invalidInputError(isinstance(model, nn.Module), "model should be an nn.Module.")
invalidInputError(direction in ['min', 'max'],
"Only support direction 'min', 'max'.")
# get the available methods whose dep is met
available_dict: Dict = _available_acceleration_combination()
self._direction: str = direction # save direction as attr
# record whether calculate accuracy in optimize by this attr
if validation_data is not None and metric is not None:
self._calculate_accuracy = True
else:
self._calculate_accuracy = False
default_threads: int = torch.get_num_threads()
thread_num: int = default_threads if thread_num is None else int(thread_num)
result_map: Dict[str, Dict] = {}
model.eval() # change model to eval mode
forward_args = get_forward_args(model)
input_sample = get_input_example(model, training_data, forward_args)
st = time.perf_counter()
try:
with torch.no_grad():
model(*input_sample)
except Exception:
invalidInputError(False,
"training_data is incompatible with your model input.")
baseline_time = time.perf_counter() - st
print("==========================Start Optimization==========================")
start_time = time.perf_counter()
for idx, (method, available) in enumerate(available_dict.items()):
result_map[method] = {}
if available is False:
result_map[method]["status"] = "lack dependency"
else:
print(f"----------Start test {method} model "
f"({idx+1}/{len(ALL_INFERENCE_ACCELERATION_METHOD)})----------")
option: AccelerationOption = ALL_INFERENCE_ACCELERATION_METHOD[method]
use_ipex: bool = option.ipex
use_channels_last: bool = option.channels_last
accelerator: str = option.get_accelerator()
precision: str = option.get_precision()
# if precision is fp32, then we will use trace method
if precision == "fp32":
try:
if accelerator is None and use_ipex is False:
acce_model = model
else:
if accelerator in ("jit", None):
acce_model = \
InferenceOptimizer.trace(model=model,
accelerator=accelerator,
use_ipex=use_ipex,
# channels_last is only for jit
channels_last=use_channels_last,
input_sample=input_sample)
else:
acce_model = \
InferenceOptimizer.trace(model=model,
accelerator=accelerator,
input_sample=input_sample,
thread_num=thread_num,
# remove output of openvino
logging=logging)
except Exception as e:
print(e)
result_map[method]["status"] = "fail to convert"
print(f"----------Failed to convert to {method}----------")
continue
# if precision is int8 or bf16, then we will use quantize method
elif precision in ("int8", "bf16"):
ort_method: str = option.method
try:
acce_model = \
InferenceOptimizer.quantize(model=deepcopy(model),
precision=precision,
accelerator=accelerator,
use_ipex=use_ipex,
calib_dataloader=training_data,
method=ort_method,
thread_num=thread_num,
# remove output of openvino
logging=logging)
except Exception as e:
print(e)
result_map[method]["status"] = "fail to convert"
print(f"----------Failed to convert to {method}----------")
continue
result_map[method]["status"] = "successful"
def func_test(model, input_sample):
with torch.no_grad():
model(*input_sample)
torch.set_num_threads(thread_num)
try:
result_map[method]["latency"], status =\
_throughput_calculate_helper(latency_sample_num, baseline_time,
func_test, acce_model, input_sample)
if status is False:
result_map[method]["status"] = "early stopped"
torch.set_num_threads(default_threads)
continue
except Exception as e:
result_map[method]["status"] = "fail to forward"
torch.set_num_threads(default_threads)
continue
torch.set_num_threads(default_threads)
if self._calculate_accuracy:
                    # here we assume trace does not change accuracy,
                    # so we skip recomputation to reduce the time cost of optimize
if precision == "fp32" and method != "original":
result_map[method]["accuracy"] = "not recomputed"
else:
result_map[method]["accuracy"] =\
_accuracy_calculate_helper(acce_model,
metric, validation_data)
else:
result_map[method]["accuracy"] = None
result_map[method]["model"] = acce_model
print(f"----------Finish test {method} model "
f"({idx+1}/{len(ALL_INFERENCE_ACCELERATION_METHOD)})----------")
self.optimized_model_dict: Dict = result_map
print("\n\n==========================Optimization Results==========================")
self._optimize_result = _format_optimize_result(self.optimized_model_dict,
self._calculate_accuracy)
print(self._optimize_result)
print("Optimization cost {:.3}s at all.".format(time.perf_counter() - start_time))
print("===========================Stop Optimization===========================")
    def summary(self):
'''
        Print a formatted string representation of the optimization result.
'''
invalidOperationError(len(self.optimized_model_dict) > 0,
"There is no optimization result. You should call .optimize() "
"before summary()")
print(self._optimize_result)
    def get_best_model(self,
accelerator: str = None,
precision: str = None,
use_ipex: bool = None,
accuracy_criterion: float = None) -> Tuple[nn.Module, str]:
        '''
        :param accelerator: (optional) Use accelerator None, 'onnxruntime',
               'openvino' or 'jit', defaults to None. If not None, then only the
               models with this specific accelerator will be considered.
        :param precision: (optional) Supported types: 'int8', 'bf16',
               defaults to None which represents 'fp32'. If not None, then only
               the models with this specific precision will be considered.
        :param use_ipex: (optional) If not None, then only the models with this
               specific ipex setting will be considered.
        :param accuracy_criterion: (optional) A float representing the tolerable
               accuracy drop percentage, defaults to None meaning no accuracy control.
        :return: best model, corresponding acceleration option
        '''
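        # Hedged usage sketch (assumes .optimize() has already been called; the
        # 5% tolerance is illustrative):
        #     best_model, option = opt.get_best_model(accuracy_criterion=0.05)
        #     print(option)  # e.g. "ipex + bf16"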
invalidOperationError(len(self.optimized_model_dict) > 0,
"There is no optimized model. You should call .optimize() "
"before get_best_model()")
invalidInputError(accelerator in [None, 'onnxruntime', 'openvino', 'jit'],
"Only support accelerator 'onnxruntime', 'openvino' and 'jit'.")
# TODO: include fp16?
invalidInputError(precision in [None, 'int8', 'bf16'],
"Only support precision 'int8', 'bf16'.")
if accuracy_criterion is not None and not self._calculate_accuracy:
invalidInputError(False, "If you want to specify accuracy_criterion, you need "
"to set metric and validation_data when call 'optimize'.")
best_model = self.optimized_model_dict["original"]["model"]
best_metric = CompareMetric("original",
self.optimized_model_dict["original"]["latency"],
self.optimized_model_dict["original"]["accuracy"])
for method in self.optimized_model_dict.keys():
if method == "original" or self.optimized_model_dict[method]["status"] != "successful":
continue
option: AccelerationOption = ALL_INFERENCE_ACCELERATION_METHOD[method]
result: Dict = self.optimized_model_dict[method]
if accelerator is not None:
if not getattr(option, accelerator):
continue
if precision is not None:
if precision == 'bf16' and not option.bf16:
continue
if precision == 'int8' and not (option.inc or option.pot):
continue
if use_ipex:
if not option.ipex:
continue
if accuracy_criterion is not None:
accuracy = result["accuracy"]
compare_acc: float = best_metric.accuracy
if accuracy == "not recomputed":
pass
elif self._direction == "min":
if (accuracy - compare_acc) / compare_acc > accuracy_criterion:
continue
else:
if (compare_acc - accuracy) / compare_acc > accuracy_criterion:
continue
# After the above conditions are met, the latency comparison is performed
if result["latency"] < best_metric.latency:
best_model = result["model"]
if result["accuracy"] != "not recomputed":
accuracy = result["accuracy"]
else:
accuracy = self.optimized_model_dict["original"]["accuracy"]
best_metric = CompareMetric(method, result["latency"], accuracy)
return best_model, _format_acceleration_option(best_metric.method_name)
    @staticmethod
def quantize(model: nn.Module,
precision: str = 'int8',
accelerator: str = None,
use_ipex: bool = False,
calib_dataloader: DataLoader = None,
metric: Metric = None,
accuracy_criterion: dict = None,
approach: str = 'static',
method: str = None,
conf: str = None,
tuning_strategy: str = None,
timeout: int = None,
max_trials: int = None,
input_sample=None,
thread_num: int = None,
onnxruntime_session_options=None,
logging: bool = True,
**export_kwargs):
"""
Calibrate a Pytorch-Lightning model for post-training quantization.
:param model: A model to be quantized. Model type should be an instance of
nn.Module.
        :param precision: Global precision of the quantized model,
               supported types: 'int8', 'bf16', 'fp16', defaults to 'int8'.
        :param accelerator: Use accelerator None, 'onnxruntime' or 'openvino',
               defaults to None. None means staying in pytorch.
:param calib_dataloader: A torch.utils.data.dataloader.DataLoader object for calibration.
Required for static quantization.
It's also used as validation dataloader.
:param metric: A torchmetrics.metric.Metric object for evaluation.
        :param accuracy_criterion: Tolerable accuracy drop, defaults to None meaning no
               accuracy control.
               accuracy_criterion = {'relative': 0.01, 'higher_is_better': True}
               allows a relative accuracy loss of 1%. accuracy_criterion =
               {'absolute': 0.99, 'higher_is_better': False} means the accuracy
               must be smaller than 0.99.
:param approach: 'static' or 'dynamic'.
'static': post_training_static_quant,
'dynamic': post_training_dynamic_quant.
Default: 'static'. OpenVINO supports static mode only.
:param method: Method to do quantization. When accelerator=None, supported
methods: 'fx', 'eager', 'ipex', defaults to 'fx'. If you don't use ipex, suggest using
'fx' which executes automatic optimizations like fusion. For more information, please
refer to https://pytorch.org/docs/stable/quantization.html#eager-mode-quantization.
When accelerator='onnxruntime', supported methods: 'qlinear', 'integer', defaults
to 'qlinear'. Suggest 'qlinear' for lower accuracy drop if using static quantization.
More details in https://onnxruntime.ai/docs/performance/quantization.html.
This argument doesn't take effect for OpenVINO, don't change it for OpenVINO.
:param conf: A path to conf yaml file for quantization.
Default: None, using default config.
:param tuning_strategy: 'bayesian', 'basic', 'mse', 'sigopt'. Default: 'bayesian'.
:param timeout: Tuning timeout (seconds). Default: None, which means early stop.
Combine with max_trials field to decide when to exit.
        :param max_trials: Max number of tuning attempts. Default: None, which means
               no tuning. Combine with the timeout field to decide when to exit.
               "timeout=0, max_trials=1" means it will try quantization only once and
               return the best model that satisfies the criterion.
:param input_sample: An input example to convert pytorch model into ONNX/OpenVINO.
        :param thread_num: (optional) An int representing how many threads (cores) to
               use for inference, only valid for accelerator='onnxruntime'
               or accelerator='openvino'.
:param onnxruntime_session_options: The session option for onnxruntime, only valid when
accelerator='onnxruntime', otherwise will be ignored.
:param logging: whether to log detailed information of model conversion, only valid when
accelerator='openvino', otherwise will be ignored. default: True.
:param **export_kwargs: will be passed to torch.onnx.export function.
        :return: An accelerated Pytorch-Lightning model if quantization is successful.
"""
if precision == 'bf16':
if accelerator is None:
if use_ipex:
invalidInputError(not TORCH_VERSION_LESS_1_10,
"torch version should >=1.10 to use ipex")
use_jit = (accelerator == "jit")
channels_last = export_kwargs["channels_last"] \
if "channels_last" in export_kwargs else None
return PytorchIPEXJITBF16Model(model, input_sample=input_sample,
use_ipex=use_ipex, use_jit=use_jit,
channels_last=channels_last)
bf16_model = BF16Model(model)
return bf16_model
else:
invalidInputError(False,
"Accelerator {} is invalid for BF16.".format(accelerator))
if precision == 'int8':
if not accelerator or accelerator == 'onnxruntime':
method_map = {
None: {
'fx': 'pytorch_fx',
'eager': 'pytorch',
'ipex': 'pytorch_ipex',
None: 'pytorch_fx' # default
},
'onnxruntime': {
'qlinear': 'onnxrt_qlinearops',
'integer': 'onnxrt_integerops',
None: 'onnxrt_qlinearops' # default
}
}
framework = method_map[accelerator].get(method, None)
if accelerator == "onnxruntime":
                if type(model).__name__ != 'PytorchONNXRuntimeModel':
# try to establish onnx model
if input_sample is None:
# input_sample can be a dataloader
input_sample = calib_dataloader
if onnxruntime_session_options is None:
import onnxruntime
onnxruntime_session_options = onnxruntime.SessionOptions()
if thread_num is not None:
onnxruntime_session_options.intra_op_num_threads = thread_num
onnxruntime_session_options.inter_op_num_threads = thread_num
model = InferenceOptimizer.trace(
model,
input_sample=input_sample,
accelerator='onnxruntime',
onnxruntime_session_options=onnxruntime_session_options,
**export_kwargs)
"""
If accelerator==None, quantized model returned should be an object of PytorchModel
which is defined by neural-compressor containing a `GraphModule` for inference.
Otherwise accelerator=='onnxruntime', it returns an ONNXModel object. A supported
model which is able to run on Pytorch or ONNXRuntime can be fetched by
`quantized_model.model`.
"""
return inc_quantize(model, calib_dataloader, metric,
framework=framework,
conf=conf,
approach=approach,
tuning_strategy=tuning_strategy,
accuracy_criterion=accuracy_criterion,
timeout=timeout,
max_trials=max_trials,
onnxruntime_session_options=onnxruntime_session_options)
elif accelerator == 'openvino':
model_type = type(model).__name__
                if model_type != 'PytorchOpenVINOModel':
if input_sample is None:
# input_sample can be a dataloader
input_sample = calib_dataloader
model = InferenceOptimizer.trace(model,
input_sample=input_sample,
accelerator='openvino',
thread_num=thread_num,
logging=logging,
**export_kwargs)
                invalidInputError(type(model).__name__ == 'PytorchOpenVINOModel',
                                  "Invalid model to quantize. Please use an nn.Module or a "
                                  "model from InferenceOptimizer.trace(accelerator='openvino')")
drop_type = None
higher_is_better = None
maximal_drop = None
if metric:
if not isinstance(accuracy_criterion, dict):
accuracy_criterion = {'relative': 0.99, 'higher_is_better': True}
drop_type = 'relative' if 'relative' in accuracy_criterion else 'absolute'
higher_is_better = accuracy_criterion.get('higher_is_better', None)
maximal_drop = accuracy_criterion.get(drop_type, None)
kwargs = {
"metric": metric,
"higher_better": higher_is_better,
"drop_type": drop_type,
"maximal_drop": maximal_drop,
"max_iter_num": max_trials,
# TODO following two keys are optional, if there is need, we can add them
# "n_requests": None,
# "sample_size": 300
}
return model.pot(calib_dataloader, **kwargs)
else:
invalidInputError(False,
"Accelerator {} is invalid.".format(accelerator))
invalidInputError(False,
"Precision {} is invalid.".format(precision))
    @staticmethod
def trace(model: nn.Module,
input_sample=None,
accelerator: str = None,
use_ipex: bool = False,
thread_num: int = None,
onnxruntime_session_options=None,
logging: bool = True,
**export_kwargs):
"""
Trace a pytorch model and convert it into an accelerated module for inference.
For example, this function returns a PytorchOpenVINOModel when accelerator=='openvino'.
        :param model: A torch.nn.Module model, including pl.LightningModule.
        :param input_sample: A set of inputs for trace, defaults to None. It can be
               omitted if the model has been traced before or is a LightningModule
               with a dataloader attached.
:param accelerator: The accelerator to use, defaults to None meaning staying in Pytorch
backend. 'openvino', 'onnxruntime' and 'jit' are supported for now.
        :param use_ipex: Whether to use ipex as the accelerator for inference.
               default: False.
        :param thread_num: (optional) An int representing how many threads (cores) to
               use for inference, only valid for accelerator='onnxruntime'
               or accelerator='openvino'.
:param onnxruntime_session_options: The session option for onnxruntime, only valid when
accelerator='onnxruntime', otherwise will be ignored.
:param logging: whether to log detailed information of model conversion, only valid when
accelerator='openvino', otherwise will be ignored. default: True.
        :param **export_kwargs: Other extra advanced settings, including:
               1. those passed to the torch.onnx.export function, only valid when
               accelerator='onnxruntime'/'openvino', otherwise they will be ignored;
               2. if channels_last is set and use_ipex=True, the data will be
               transformed to the channels-last layout according to the setting. By
               default, channels_last will be set to True when use_ipex=True.
:return: Model with different acceleration.
"""
invalidInputError(
isinstance(model, nn.Module) and not isinstance(model, AcceleratedLightningModule),
"Expect a nn.Module instance that is not traced or quantized"
"but got type {}".format(type(model))
)
if accelerator == 'openvino': # openvino backend will not care about ipex usage
return PytorchOpenVINOModel(model, input_sample, thread_num, logging, **export_kwargs)
if accelerator == 'onnxruntime': # onnxruntime backend will not care about ipex usage
if onnxruntime_session_options is None:
import onnxruntime
onnxruntime_session_options = onnxruntime.SessionOptions()
if thread_num is not None:
onnxruntime_session_options.intra_op_num_threads = thread_num
onnxruntime_session_options.inter_op_num_threads = thread_num
return PytorchONNXRuntimeModel(model, input_sample, onnxruntime_session_options,
**export_kwargs)
if accelerator == 'jit' or use_ipex:
if use_ipex:
invalidInputError(not TORCH_VERSION_LESS_1_10,
"torch version should >=1.10 to use ipex")
use_jit = (accelerator == "jit")
channels_last = export_kwargs["channels_last"]\
if "channels_last" in export_kwargs else None
return PytorchIPEXJITModel(model, input_sample=input_sample, use_ipex=use_ipex,
use_jit=use_jit, channels_last=channels_last)
invalidInputError(False, "Accelerator {} is invalid.".format(accelerator))
def _inc_checker():
'''
check if intel neural compressor is installed
'''
return not find_spec("neural_compressor") is None
def _ipex_checker():
'''
check if intel pytorch extension is installed
'''
return not find_spec("intel_extension_for_pytorch") is None
def _onnxruntime_checker():
'''
check if onnxruntime and onnx is installed
'''
    onnxruntime_installed = find_spec("onnxruntime") is not None
    onnx_installed = find_spec("onnx") is not None
return onnxruntime_installed and onnx_installed
def _openvino_checker():
    '''
    check if openvino-dev is installed (its importable module is `openvino`)
    '''
    return find_spec("openvino") is not None
def _bf16_checker():
    '''
    check whether the CPU supports bf16 (avx512_bf16 / amx_bf16); actual bf16
    availability is still decided dynamically during the optimization
    '''
msg = subprocess.check_output(["lscpu"]).decode("utf-8")
return "avx512_bf16" in msg or "amx_bf16" in msg
def _available_acceleration_combination():
    '''
    :return: a dictionary stating the availability (i.e. whether the
             dependencies are met) of each acceleration method
    '''
dependency_checker = {"inc": _inc_checker,
"ipex": _ipex_checker,
"onnxruntime": _onnxruntime_checker,
"openvino": _openvino_checker,
"pot": _openvino_checker,
"bf16": _bf16_checker}
available_dict = {}
for method, option in ALL_INFERENCE_ACCELERATION_METHOD.items():
available_iter = True
for name, value in option.__dict__.items():
if value is True:
if name in dependency_checker and not dependency_checker[name]():
available_iter = False
available_dict[method] = available_iter
return available_dict
def _throughput_calculate_helper(iterrun, baseline_time, func, *args):
'''
A simple helper to calculate average latency
'''
start_time = time.perf_counter()
time_list = []
for i in range(iterrun):
st = time.perf_counter()
with torch.no_grad():
func(*args)
end = time.perf_counter()
time_list.append(end - st)
        # if the first three samples cost more than 4x the baseline time each
        # (i.e. 12x in total), prune this method
        if i == 2 and end - start_time > 12 * baseline_time:
return np.mean(time_list) * 1000, False
        # run at least min(iterrun, 10) iterations, and stop early once the
        # total measurement time exceeds 2 seconds
        if i + 1 >= min(iterrun, 10) and (end - start_time) > 2:
iterrun = i + 1
break
time_list.sort()
    # trim the fastest and slowest 10% of the measurements
time_list = time_list[int(0.1 * iterrun): int(0.9 * iterrun)]
return np.mean(time_list) * 1000, True
def _accuracy_calculate_helper(model, metric, data):
'''
A quick helper to calculate accuracy
'''
metric_list = []
sample_num = 0
with torch.no_grad():
for i, (data_input, target) in enumerate(data):
metric_list.append(metric(model(data_input), target).numpy() * data_input.shape[0])
sample_num += data_input.shape[0]
return np.sum(metric_list) / sample_num
def _format_acceleration_option(method_name: str) -> str:
'''
    Get a string representation of the acceleration option for the given method
'''
option = ALL_INFERENCE_ACCELERATION_METHOD[method_name]
repr_str = ""
for key, value in option.__dict__.items():
if value is True:
if key == "pot":
repr_str = repr_str + "int8" + " + "
else:
repr_str = repr_str + key + " + "
elif isinstance(value, str):
repr_str = repr_str + value + " + "
if len(repr_str) > 0:
repr_str = repr_str[:-2]
return repr_str
def _format_optimize_result(optimize_result_dict: dict,
calculate_accuracy: bool) -> str:
'''
    Get a formatted string representation of the optimization result
'''
if calculate_accuracy is True:
horizontal_line = " {0} {1} {2} {3}\n" \
.format("-" * 32, "-" * 22, "-" * 14, "-" * 22)
repr_str = horizontal_line
repr_str += "| {0:^30} | {1:^20} | {2:^12} | {3:^20} |\n" \
.format("method", "status", "latency(ms)", "accuracy")
repr_str += horizontal_line
for method, result in optimize_result_dict.items():
status = result["status"]
latency = result.get("latency", "None")
if latency != "None":
latency = round(latency, 3)
accuracy = result.get("accuracy", "None")
if accuracy != "None" and isinstance(accuracy, float):
accuracy = round(accuracy, 3)
method_str = f"| {method:^30} | {status:^20} | " \
f"{latency:^12} | {accuracy:^20} |\n"
repr_str += method_str
repr_str += horizontal_line
else:
horizontal_line = " {0} {1} {2}\n" \
.format("-" * 32, "-" * 22, "-" * 14)
repr_str = horizontal_line
repr_str += "| {0:^30} | {1:^20} | {2:^12} |\n" \
.format("method", "status", "latency(ms)")
repr_str += horizontal_line
for method, result in optimize_result_dict.items():
status = result["status"]
latency = result.get("latency", "None")
if latency != "None":
latency = round(latency, 3)
method_str = f"| {method:^30} | {status:^20} | {latency:^12} |\n"
repr_str += method_str
repr_str += horizontal_line
return repr_str