Source code for bigdl.nano.tf.keras.inference.optimizer

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import subprocess
import tempfile
import cloudpickle
import copy
import time
import operator
from pathlib import Path
import numpy as np
import traceback
import inspect
import sigfig
import tensorflow as tf
import keras
from typing import Dict, Optional, List, Union, Callable
from bigdl.nano.utils.common import BaseInferenceOptimizer, available_acceleration_combination,\
    AccelerationOption, latency_calculate_helper, format_optimize_result
from bigdl.nano.utils.common import invalidInputError
from bigdl.nano.utils.tf import _ModuleWrapper
from bigdl.nano.utils.tf import patch_compiled_and_attrs, patch_attrs
from tensorflow.keras import Model as Model
from tensorflow.data import Dataset
from tensorflow.keras.metrics import Metric
from bigdl.nano.deps.neural_compressor.inc_api import quantize as inc_quantize
from bigdl.nano.deps.openvino.openvino_api import KerasOpenVINOModel
from bigdl.nano.deps.onnxruntime.onnxruntime_api import KerasONNXRuntimeModel
from bigdl.nano.deps.openvino.openvino_api import load_openvino_model
from bigdl.nano.deps.onnxruntime.onnxruntime_api import load_onnxruntime_model
from bigdl.nano.deps.neural_compressor.inc_api import load_inc_model
from bigdl.nano.tf.keras.amp import BF16Model, load_bf16_model
from bigdl.nano.utils.common import compare_version
from bigdl.nano.utils.tf import try_fake_inference


class TFAccelerationOption(AccelerationOption):
    def optimize(self, model, x=None, y=None, input_spec=None,
                 thread_num=None, logging=False, sample_size_for_pot=100):
        accelerator = self.get_accelerator()
        if self.get_precision() == "fp32":
            # trace
            if accelerator is None:
                return model
            else:
                acce_model = InferenceOptimizer.trace(model=model,
                                                      accelerator=accelerator,
                                                      input_spec=input_spec,
                                                      thread_num=thread_num,
                                                      # remove output of openvino
                                                      logging=logging)
        else:
            # quantize
            ort_method: str = self.method
            acce_model = InferenceOptimizer.quantize(model=model,
                                                     precision=self.get_precision(),
                                                     accelerator=accelerator,
                                                     input_spec=input_spec,
                                                     x=x,
                                                     y=y,
                                                     method=ort_method,
                                                     thread_num=thread_num,
                                                     sample_size=sample_size_for_pot,
                                                     # remove output of openvino
                                                     logging=logging)
        return acce_model


class InferenceOptimizer(BaseInferenceOptimizer):

    # acceleration method combinations, developers may want to register some new
    # combinations here
    ALL_INFERENCE_ACCELERATION_METHOD: Dict = \
        {  # type: ignore
            "original": TFAccelerationOption(),
            "static_int8": TFAccelerationOption(inc=True),
            "bf16": TFAccelerationOption(bf16=True),
            "openvino_fp32": TFAccelerationOption(openvino=True),
            "openvino_bf16": TFAccelerationOption(openvino=True, bf16=True),
            "openvino_fp16": TFAccelerationOption(openvino=True, fp16=True),
            "openvino_int8": TFAccelerationOption(openvino=True, pot=True),
            "onnxruntime_fp32": TFAccelerationOption(onnxruntime=True),
            "onnxruntime_int8_qlinear": TFAccelerationOption(onnxruntime=True, inc=True,
                                                             method="qlinear"),
            "onnxruntime_int8_integer": TFAccelerationOption(onnxruntime=True, inc=True,
                                                             method="integer"),
        }  # type: ignore
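    # --- Illustrative sketch (not part of the original module) ---
    # The comment above notes that developers may register new acceleration
    # combinations. A hypothetical registration could look like the line below;
    # the key "my_openvino_int8" is a made-up example name:
    #
    #   InferenceOptimizer.ALL_INFERENCE_ACCELERATION_METHOD["my_openvino_int8"] = \
    #       TFAccelerationOption(openvino=True, pot=True)
    #
    # Anything registered here is picked up by optimize() through
    # available_acceleration_combination().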
    def optimize(self, model: Model,
                 x: Union[tf.Tensor, np.ndarray, tf.data.Dataset],
                 y: Union[tf.Tensor, np.ndarray] = None,
                 validation_data: Optional[Dataset] = None,
                 input_spec=None,
                 batch_size: int = 1,
                 metric: Optional[Metric] = None,
                 direction: str = "max",
                 thread_num: Optional[int] = None,
                 logging: bool = False,
                 latency_sample_num: int = 100,
                 includes: Optional[List[str]] = None,
                 excludes: Optional[List[str]] = None,
                 output_filename: Optional[str] = None) -> None:
        '''
        This function will try all available inference acceleration methods and
        record the latency, accuracy and model instance inside the optimizer for
        future usage.

        The available methods are the ones registered in
        ``ALL_INFERENCE_ACCELERATION_METHOD``, e.g. "original", "static_int8", "bf16",
        "openvino_fp32", "openvino_int8", "onnxruntime_fp32" and
        "onnxruntime_int8_qlinear".

        :param model: A keras.Model to be optimized.
        :param x: Input data which is used for training. It could be:

               | 1. a Numpy array (or array-like), or a list of arrays (in case the model
               | has multiple inputs).
               |
               | 2. a TensorFlow tensor, or a list of tensors (in case the model has
               | multiple inputs).
               |
               | 3. an unbatched tf.data.Dataset. Should return a tuple of (inputs, targets).

               x will be used as the calibration dataset for Post-Training Static
               Quantization (PTQ), as well as for generating the input sample used to
               calculate latency. To avoid data leakage during calibration, please use the
               training dataset.
        :param y: Target data. Like the input data x, it could be either Numpy array(s) or
               TensorFlow tensor(s). Its length should be consistent with x. If x is a
               dataset, y will be ignored (since targets will be obtained from x).
        :param validation_data: (optional) An unbatched tf.data.Dataset object for accuracy
               evaluation. This is only needed when users care about the possible accuracy
               drop.
        :param input_spec: (optional) A (tuple or list of) ``tf.TensorSpec`` defining the
               shape/dtype of the input. This is only required when you have a custom Keras
               model (no input/output layer is explicitly defined).
        :param batch_size: (optional) Batch size used to batch x (and validation_data)
               before inference. Defaults to 1.
        :param metric: (optional) A tensorflow.keras.metrics.Metric object which is used for
               calculating accuracy.
        :param direction: (optional) A string that indicates whether higher or lower is
               better for the metric: "min" for the lower the better and "max" for the
               higher the better. Default value is "max".
        :param thread_num: (optional) An int representing how many threads (cores) are
               needed for inference. This parameter only controls the number of threads used
               during latency calculation and during later inference with the obtained
               accelerated model. In other words, model conversion and the optional accuracy
               calculation won't be restricted by this parameter. Defaults to None, which
               means all cores will be used.
        :param logging: Whether to log detailed information of model conversion.
               Default: False.
        :param latency_sample_num: (optional) An int representing the number of repetitions
               used to calculate the average latency. The default value is 100.
        :param includes: (optional) A list of acceleration methods that will be included in
               the search. Defaults to None, meaning all available methods are included.
               The "original" method will be automatically added to includes.
        :param excludes: (optional) A list of acceleration methods that will be excluded
               from the search. "original" will be ignored in excludes.
        :param output_filename: (optional) A filename specifying the file to which the
               optimization table will be written. The default is None, which means the
               table is not written to a file.
        '''
        # check if model is a Keras Model
        invalidInputError(isinstance(model, Model),
                          "model should be a Keras Model.")
        invalidInputError(direction in ['min', 'max'],
                          "Only support direction 'min', 'max'.")

        # get the available methods whose dep is met
        available_dict: Dict =\
            available_acceleration_combination(excludes=excludes,
                                               includes=includes,
                                               full_methods=self.ALL_INFERENCE_ACCELERATION_METHOD)

        self._direction: str = direction  # save direction as attr
        # record whether calculate accuracy in optimize by this attr
        if validation_data is None or metric is None:
            self._calculate_accuracy = False
        else:
            # test whether accuracy calculation works later
            # the validation dataset is expected to be unbatched, so batch it here
            batched_validation_data = validation_data.batch(batch_size)
            self._calculate_accuracy = True

        if os.getenv('OMP_NUM_THREADS') is not None:
            default_threads: int = int(os.getenv('OMP_NUM_THREADS'))  # type: ignore
        else:
            default_threads = None  # type: ignore
        thread_num = default_threads if thread_num is None else int(thread_num)  # type: ignore

        result_map: Dict[str, Dict] = {}

        if isinstance(x, Dataset):
            batched_training_dataset = x.batch(batch_size)
            input_sample = next(iter(batched_training_dataset))
            # todo: for now, if len(batch_data) == 2 we assume it is (x, y),
            # otherwise, we assume it is x or (x1, x2, x3, ...)
            if isinstance(input_sample, (list, tuple)) and len(input_sample) == 2:
                input_sample = input_sample[:-1]
        else:
            input_sample = tf.convert_to_tensor(x[:batch_size])

        if isinstance(input_sample, (list, tuple)) and len(input_sample) == 1:
            input_sample = input_sample[0]

        st = time.perf_counter()
        try:
            if isinstance(input_sample, tf.Tensor):
                model(input_sample)
            else:
                model(*input_sample)
        except Exception:
            invalidInputError(False,
                              "x is incompatible with your model input.")
        baseline_time = time.perf_counter() - st
        if baseline_time > 0.1:  # 100ms
            sample_size_for_pot = 15
        else:
            sample_size_for_pot = 100

        print("==========================Start Optimization==========================")
        start_time = time.perf_counter()
        for idx, (method, available) in enumerate(available_dict.items()):
            result_map[method] = {}
            if available is False:
                result_map[method]["status"] = "lack dependency"
            else:
                print(f"----------Start test {method} model "
                      f"({idx+1}/{len(available_dict)})----------")
                option: AccelerationOption = self.ALL_INFERENCE_ACCELERATION_METHOD[method]
                precision: str = option.get_precision()
                try:
                    acce_model = option.optimize(model=model,
                                                 x=x,
                                                 y=y,
                                                 input_spec=input_spec,
                                                 thread_num=thread_num,
                                                 logging=logging,
                                                 sample_size_for_pot=sample_size_for_pot)
                except Exception:
                    traceback.print_exc()
                    result_map[method]["status"] = "fail to convert"
                    print(f"----------Failed to convert to {method}----------")
                    continue

                result_map[method]["status"] = "successful"

                def func_test(model, *args):
                    model(*args)

                try:
                    if method in ("original", "static_int8") and thread_num is not None:
                        _flag = True  # represent whether subprocess works
                        # for original keras model, as tf.config.threading can't set thread
                        # during running, so here we use subprocess to calculate throughput
                        params = {"iterrun": latency_sample_num,
                                  "func": func_test,
                                  "model": model,  # save original model
                                  "input_sample": input_sample,
                                  "method": method}
                        with tempfile.TemporaryDirectory() as temp_dir:
                            if method != "original":
                                # save accelerated model
                                InferenceOptimizer.save(acce_model, temp_dir)
                            _filename = os.path.join(temp_dir, "params")
                            cloudpickle.dump(params, open(_filename, "wb"))

                            my_env = os.environ.copy()
                            my_env["OMP_NUM_THREADS"] = str(thread_num)
                            worker_file = os.path.join(
                                os.path.split(os.path.realpath(__file__))[0], "_worker.py")

                            try:
                                result = subprocess.run(["python", worker_file, _filename,
                                                         str(thread_num)],
                                                        capture_output=True,
                                                        universal_newlines=True,
                                                        env=my_env)
                                latency = float(result.stdout.strip())
                                result_map[method]["latency"] = latency
                            except Exception:
                                _flag = False

                    if method != "original" or thread_num is None or _flag is False:
                        if isinstance(input_sample, tf.Tensor):
                            result_map[method]["latency"], status =\
                                latency_calculate_helper(latency_sample_num, baseline_time,
                                                         func_test, acce_model, input_sample)
                        else:
                            result_map[method]["latency"], status =\
                                latency_calculate_helper(latency_sample_num, baseline_time,
                                                         func_test, acce_model, *input_sample)
                        if status is False and method != "original":
                            result_map[method]["status"] = "early stopped"
                            continue
                except Exception:
                    traceback.print_exc()
                    result_map[method]["status"] = "fail to forward"
                    print(f"----------{method} failed to forward----------")
                    continue

                if self._calculate_accuracy:
                    # here we suppose trace don't change accuracy,
                    # so we jump it to reduce time cost of optimize
                    if precision == "fp32" and method != "original":
                        _accuracy = result_map["original"]["accuracy"]
                        _accuracy = sigfig.round(_accuracy, sigfigs=5)
                        result_map[method]["accuracy"] = str(_accuracy) + '*'
                    else:
                        if method == "original":
                            # test whether metric works
                            try:
                                result_map[method]["accuracy"] =\
                                    _accuracy_calculate_helper(acce_model, metric,
                                                               batched_validation_data)
                            except Exception:
                                traceback.print_exc()
                                self._calculate_accuracy = False
                        else:
                            result_map[method]["accuracy"] =\
                                _accuracy_calculate_helper(acce_model, metric,
                                                           batched_validation_data)
                else:
                    result_map[method]["accuracy"] = None

                result_map[method]["model"] = acce_model
                print(f"----------Finish test {method} model "
                      f"({idx+1}/{len(available_dict)})----------")

        self.optimized_model_dict: Dict = result_map
        print("\n\n==========================Optimization Results==========================")

        self._optimize_result = format_optimize_result(self.optimized_model_dict,
                                                       self._calculate_accuracy)
        if self._calculate_accuracy:
            # only show this line when there is accuracy data
            self._optimize_result += "* means we assume the metric value of the traced "\
                "model does not change, so we don't recompute metric value to save time.\n"
        # save time cost to self._optimize_result
        time_cost = time.perf_counter() - start_time
        time_cost_str = f"Optimization cost {time_cost:.1f}s in total."
        self._optimize_result += time_cost_str
        if output_filename is not None:
            with open(output_filename, "w") as f:
                f.write(self._optimize_result)

        print(self._optimize_result)
        print("===========================Stop Optimization===========================")
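    # --- Hypothetical usage sketch (not part of the original module) ---
    # A minimal way optimize() might be called; `model`, `train_ds` and `val_ds`
    # are assumed to be a keras.Model and unbatched tf.data.Datasets of
    # (inputs, targets) pairs created elsewhere, and the import path follows the
    # package this module lives in:
    #
    #   from bigdl.nano.tf.keras import InferenceOptimizer
    #
    #   opt = InferenceOptimizer()
    #   opt.optimize(model=model,
    #                x=train_ds,               # also used as PTQ calibration data
    #                validation_data=val_ds,   # optional, enables the accuracy column
    #                metric=tf.keras.metrics.CategoricalAccuracy(),
    #                direction="max",
    #                thread_num=4,
    #                latency_sample_num=30)
    #   acce_model, option = opt.get_best_model()  # provided by BaseInferenceOptimizer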
    @staticmethod
    def trace(model: Model,
              accelerator: Optional[str] = None,
              input_spec=None,
              thread_num: Optional[int] = None,
              device: Optional[str] = 'CPU',
              onnxruntime_session_options=None,
              openvino_config=None,
              logging=True,
              **kwargs):
        """
        Trace a Keras model and convert it into an accelerated module for inference.

        :param model: The Keras model to trace.
        :param accelerator: The accelerator to use, defaults to None meaning staying in the
                            Keras backend. 'openvino' and 'onnxruntime' are supported for
                            now.
        :param input_spec: (optional) A (tuple or list of) ``tf.TensorSpec`` defining the
                           shape/dtype of the input. This is only required when you have a
                           custom Keras model (no input/output layer is explicitly defined).
        :param thread_num: (optional) An int representing how many threads (cores) are
                           needed for inference, only valid for accelerator='onnxruntime'
                           or accelerator='openvino'.
        :param device: (optional) A string representing the device used for inference.
                       Defaults to 'CPU', only valid when accelerator='openvino', otherwise
                       will be ignored. 'CPU' and 'GPU' are supported for now.
        :param onnxruntime_session_options: The session options for onnxruntime, only valid
                                            when accelerator='onnxruntime', otherwise will
                                            be ignored.
        :param openvino_config: The config to be passed to core.compile_model. Only valid
                                when accelerator='openvino', otherwise will be ignored.
        :param logging: Whether to log detailed information of model conversion, only valid
                        when accelerator='openvino', otherwise will be ignored.
                        Default: ``True``.
        :param **kwargs: Other extra advanced settings, including those passed to the model
                         optimizer function of OpenVINO, only valid when
                         accelerator='openvino', otherwise will be ignored. Possible
                         arguments are: mean_values, layout, input, output, et al. For more
                         details about the model optimizer, you can see ``mo --help``.
        :return: Model with different acceleration (OpenVINO/ONNX Runtime).
        """
        # device name might be: CPU, GPU, GPU.0, VPUX ...
        invalidInputError(device == 'CPU' or 'GPU' in device,
                          "Now we only support fp32 for CPU and GPU, not {}".format(device))
        if device != 'CPU' and accelerator != 'openvino':
            invalidInputError(False,
                              "Device {} is only supported when accelerator "
                              "is 'openvino'.".format(device))
        if accelerator == 'openvino':
            final_openvino_option = {"INFERENCE_PRECISION_HINT": "f32"} if device == 'CPU' else {}
            if openvino_config is not None:
                final_openvino_option.update(openvino_config)
            result = KerasOpenVINOModel(model,
                                        input_spec=input_spec,
                                        precision='fp32',
                                        thread_num=thread_num,
                                        device=device,
                                        config=final_openvino_option,
                                        logging=logging,
                                        **kwargs)
        elif accelerator == 'onnxruntime':
            if onnxruntime_session_options is None:
                import onnxruntime
                onnxruntime_session_options = onnxruntime.SessionOptions()
                if thread_num is not None:
                    onnxruntime_session_options.intra_op_num_threads = thread_num
                    onnxruntime_session_options.inter_op_num_threads = thread_num
            result = KerasONNXRuntimeModel(model, input_spec, onnxruntime_session_options)
        else:
            invalidInputError(False, "Accelerator {} is invalid.".format(accelerator))
        return patch_compiled_and_attrs(result, model)
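    # --- Hypothetical usage sketch (not part of the original module) ---
    # Tracing a Keras model into an ONNX Runtime or OpenVINO backend might look
    # like the lines below; `model` and `x_sample` are assumed inputs, and
    # input_spec is only needed for custom models without explicit input layers:
    #
    #   ort_model = InferenceOptimizer.trace(model, accelerator="onnxruntime",
    #                                        thread_num=4)
    #   ov_model = InferenceOptimizer.trace(model, accelerator="openvino",
    #                                       device="CPU")
    #   preds = ort_model(x_sample)  # accelerated models keep a callable interface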
    @staticmethod
    def quantize(model: Model,
                 x: Union[tf.Tensor, np.ndarray, tf.data.Dataset] = None,
                 y: Union[tf.Tensor, np.ndarray] = None,
                 precision: str = 'int8',
                 accelerator: Optional[str] = None,
                 input_spec=None,
                 eval_func: Optional[Callable] = None,
                 metric: Optional[Metric] = None,
                 accuracy_criterion: Optional[dict] = None,
                 approach: str = 'static',
                 method: Optional[str] = None,
                 conf: Optional[str] = None,
                 tuning_strategy: Optional[str] = None,
                 timeout: Optional[int] = None,
                 max_trials: Optional[int] = None,
                 batch: Optional[int] = None,
                 thread_num: Optional[int] = None,
                 device: Optional[str] = 'CPU',
                 custom_objects=None,
                 inputs: List[str] = None,
                 outputs: List[str] = None,
                 sample_size: int = 100,
                 onnxruntime_session_options=None,
                 openvino_config=None,
                 logging: bool = True,
                 **kwargs):
        """
        Post-training quantization on a keras model.

        :param model: The Keras model to quantize.
        :param x: Input data which is used for training. It could be:

               | 1. a Numpy array (or array-like), or a list of arrays (in case the model
               | has multiple inputs).
               |
               | 2. a TensorFlow tensor, or a list of tensors (in case the model has
               | multiple inputs).
               |
               | 3. an unbatched tf.data.Dataset. Should return a tuple of (inputs, targets).

               x will be used as the calibration dataset for Post-Training Static
               Quantization (PTQ). To avoid data leakage during calibration, please use the
               training dataset. Only valid when precision='int8', otherwise will be
               ignored.
        :param y: Target data. Like the input data x, it could be either Numpy array(s) or
               TensorFlow tensor(s). Its length should be consistent with x. If x is a
               dataset, y will be ignored (since targets will be obtained from x).
        :param precision: Global precision of the quantized model, supported types: 'int8',
               'bf16', 'fp16', defaults to 'int8'. Note that mixed bf16 precision only works
               for ``keras.Model`` with explicit input and output definition (e.g.,
               model = keras.Model(inputs=inputs, outputs=outputs)).
        :param accelerator: Use accelerator None, 'onnxruntime' or 'openvino', defaults to
               None. None means staying in TensorFlow.
        :param input_spec: (optional) A (tuple or list of) ``tf.TensorSpec`` defining the
               shape/dtype of the input. This is only required when you have a custom Keras
               model (no input/output layer is explicitly defined).
        :param eval_func: An evaluation function which only accepts the model as input and
               returns an evaluation value. This parameter provides a higher degree of
               freedom than using eval_loader and metric. Defaults to None, meaning no
               performance tuning; it is however better to provide an evaluation function to
               get better quantization performance.
        :param metric: A tensorflow.keras.metrics.Metric object for evaluation.
        :param accuracy_criterion: Tolerable accuracy drop, defaults to None meaning no
               accuracy control. accuracy_criterion = {'absolute': 0.99,
               'higher_is_better': False} means the accuracy loss must be smaller than 0.99.
               For example, if higher_is_better is True, then this requires that the
               original metric value minus the current metric value be smaller than 0.99.
               For INC 1.x this value must be in [0, 1); for INC 2.x there is no limit.
               accuracy_criterion = {'relative': 0.1, 'higher_is_better': True} allows a
               relative accuracy loss of 10%.
        :param approach: 'static' or 'dynamic'. 'static': post_training_static_quant,
               'dynamic': post_training_dynamic_quant. Default: 'static'. Only the 'static'
               approach is supported now.
        :param method: Method to do quantization. When accelerator=None, supported methods:
               None. When accelerator='onnxruntime', supported methods: 'qlinear',
               'integer', defaults to 'qlinear'. Suggest 'qlinear' for a lower accuracy drop
               when using static quantization. More details in
               https://onnxruntime.ai/docs/performance/quantization.html.
               This argument doesn't take effect for OpenVINO, don't change it for OpenVINO.
        :param conf: A path to a conf yaml file for quantization. Default: None, using the
               default config.
        :param tuning_strategy: 'bayesian', 'basic', 'mse' or 'sigopt'. Default: 'bayesian'.
        :param timeout: Tuning timeout (seconds). Default: None, which means early stop.
               Combine with the max_trials field to decide when to exit.
        :param max_trials: Max tune times. Default: None, which means no tuning. Combine
               with the timeout field to decide when to exit. "timeout=0, max_trials=1"
               means it will try quantization only once and return the satisfying best
               model.
        :param batch: Batch size of the dataloader for the calibration dataset. Defaults to
               None; if the dataset is not a BatchDataset, batch size equals 1. Otherwise,
               batch size complies with the dataset._batch_size.
        :param thread_num: (optional) An int representing how many threads (cores) are
               needed for inference, only valid for accelerator='onnxruntime' or
               accelerator='openvino'.
        :param device: (optional) A string representing the device used for inference.
               Defaults to 'CPU', only valid when accelerator='openvino', otherwise will be
               ignored. 'CPU', 'GPU' and 'VPUX' are supported for now.
        :param custom_objects: Optional dictionary mapping names (strings) to custom classes
               or functions to be considered during deserialization. May only be required
               when quantizing a bf16 model and `accelerator` is None.
        :param inputs: A list of input names. Default: None, automatically get names from
               the graph.
        :param outputs: A list of output names. Default: None, automatically get names from
               the graph.
        :param sample_size: (optional) An int representing how many samples will be used by
               the Post-training Optimization Tools (POT) from the OpenVINO toolkit, only
               valid for accelerator='openvino'. Defaults to 100. The larger the value, the
               more accurate the conversion and the lower the performance degradation, but
               the longer the time.
        :param onnxruntime_session_options: The session options for onnxruntime, only valid
               when accelerator='onnxruntime', otherwise will be ignored.
        :param openvino_config: The config to be passed to core.compile_model. Only valid
               when accelerator='openvino', otherwise will be ignored.
        :param logging: Whether to log detailed information of model conversion, only valid
               when accelerator='openvino', otherwise will be ignored. Default: ``True``.
        :param **kwargs: Other extra advanced settings include:

               1. those passed to the ``torch.onnx.export`` function, only valid when
               accelerator='onnxruntime'/'openvino', otherwise will be ignored. Possible
               arguments are: input_names, output_names, opset_version, et al. For more
               details, please refer to
               https://pytorch.org/docs/stable/onnx.html#torch.onnx.export.

               2. those passed to the ``model optimizer`` function of OpenVINO, only valid
               when accelerator='openvino', otherwise will be ignored. Possible arguments
               are: mean_values, layout, input, output, et al. For more details about the
               model optimizer, you can see ``mo --help``.
               If you want to quantize with OpenVINO on a VPUX device, you must specify
               ``mean_value`` for the model optimizer function. Here ``mean_value``
               represents mean values to be used for the input image per channel. Values are
               provided in the (R,G,B) or [R,G,B] format and can be defined for the desired
               input of the model, for example:
               "--mean_values data[255,255,255],info[255,255,255]". The exact meaning and
               order of channels depend on how the original model was trained.
        :return: A TensorflowBaseModel. If there is no model found, return None.
        """
        invalidInputError(precision in ['int8', 'fp16', 'bf16'],
                          "Only support 'int8', 'bf16', 'fp16' now, "
                          "no support for {}.".format(precision))
        # device name might be: CPU, GPU, GPU.0, VPUX ...
        invalidInputError(device == 'CPU' or 'GPU' in device or device == 'VPUX',
                          "Now we only support CPU, GPU and VPUX, not {}".format(device))
        if device != 'CPU' and accelerator != 'openvino':
            invalidInputError(False,
                              "Device {} is only supported when accelerator "
                              "is 'openvino'.".format(device))
        if isinstance(model, _ModuleWrapper):
            original_model = model.source_obj
            model = model.target_obj
        else:
            original_model = model
        if precision == 'fp16':
            invalidInputError(accelerator == 'openvino',
                              "fp16 is not supported on {} accelerator.".format(accelerator))
            if device == 'VPUX':
                # for fp16 on VPUX, must specify mean_value.
                invalidInputError('mean_value' in kwargs,
                                  "If you want to quantize with openvino float16 precision on "
                                  "VPUX device, you must specify mean_value for model optimizer "
                                  "function. For more details about model optimizer, you can "
                                  "see mo --help .")
            from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel  # type: ignore
            result = KerasOpenVINOModel(model,
                                        input_spec=input_spec,
                                        precision=precision,
                                        thread_num=thread_num,
                                        device=device,
                                        config=openvino_config,
                                        logging=logging,
                                        **kwargs)
            return patch_compiled_and_attrs(result, original_model)
        elif precision == 'bf16':
            invalidInputError(accelerator == 'openvino' or accelerator is None,
                              "Accelerator {} is invalid for BF16.".format(accelerator))
            invalidInputError(device == 'CPU',
                              "Device {} doesn't support bfloat16.".format(device))
            if accelerator == 'openvino':
                final_openvino_option = {"INFERENCE_PRECISION_HINT": "bf16"}
                if openvino_config is not None:
                    final_openvino_option.update(openvino_config)
                from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel  # type: ignore
                result = KerasOpenVINOModel(model,
                                            input_spec=input_spec,
                                            precision=precision,
                                            thread_num=thread_num,
                                            device=device,
                                            config=final_openvino_option,
                                            logging=logging,
                                            **kwargs)
            elif accelerator is None:
                return BF16Model(model, custom_objects=custom_objects)
            return patch_compiled_and_attrs(result, original_model)

        invalidInputError(approach == 'static', "Only 'static' approach is supported now.")

        if not isinstance(x, tf.data.Dataset) and y is None:
            # fake label to make quantization work
            y = range(len(x))  # type: ignore
        if isinstance(x, tf.data.Dataset):
            batch_data = next(iter(x))
            # todo: for now, if len(batch_data) == 2 we assume it is (x, y),
            # otherwise, we assume it is x or (x1, x2, x3, ...)
            if isinstance(batch_data, tf.Tensor) or \
                    isinstance(batch_data, tuple) and len(batch_data) != 2:
                # fake label to make quantization work
                y = range(len(x))  # type: ignore
                y = tf.data.Dataset.from_tensor_slices(y)
                x = tf.data.Dataset.zip((x, y))

        if accelerator is None:
            if isinstance(x, tf.data.Dataset):
                calib_dataset = x
            else:
                calib_dataset = tf.data.Dataset.from_tensor_slices((x, y))
            if batch:
                calib_dataset = calib_dataset.batch(batch)

            try_fake_inference(model, input_spec)

            if model.inputs is None or model.outputs is None:
                INC_LESS_14 = compare_version("neural_compressor", operator.lt, "1.14")
                # only works for inc version >= 1.14
                if not INC_LESS_14:
                    # try to fake input and output for model
                    signature = inspect.signature(model.call)
                    input_names = []
                    for param in signature.parameters.values():
                        input_names.append(param.name)
                    if inputs is None:
                        inputs = input_names
                    if outputs is None:
                        outputs = "outputs"  # type: ignore

            result = inc_quantize(model,
                                  dataloader=calib_dataset,
                                  eval_func=eval_func,
                                  metric=metric,
                                  framework='tensorflow',
                                  conf=conf,
                                  approach=approach,
                                  tuning_strategy=tuning_strategy,
                                  accuracy_criterion=accuracy_criterion,
                                  timeout=timeout,
                                  max_trials=max_trials,
                                  inputs=inputs,
                                  outputs=outputs)
        elif accelerator == 'openvino':
            from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel  # type: ignore
            if isinstance(model, KerasOpenVINOModel):  # type: ignore
                openvino_model = model
            else:
                # For CPU: fp32 -> int8, for GPU: fp16 -> int8
                _precision = 'fp16' if device != 'CPU' else 'fp32'
                if device == 'VPUX':
                    # for fp16 on VPUX, must specify mean_value.
                    invalidInputError('mean_value' in kwargs,
                                      "If you want to quantize with openvino on VPUX device, "
                                      "you must specify mean_value for model optimizer "
                                      "function. For more details about model optimizer, you "
                                      "can see mo --help .")
                openvino_model = KerasOpenVINOModel(model,
                                                    input_spec=input_spec,
                                                    precision=_precision,
                                                    thread_num=thread_num,
                                                    device=device,
                                                    config=openvino_config,
                                                    logging=logging,
                                                    **kwargs)
            if metric:
                if not isinstance(accuracy_criterion, dict):
                    accuracy_criterion = {'relative': 0.99, 'higher_is_better': True}
                drop_type = 'relative' if 'relative' in accuracy_criterion else 'absolute'
                higher_is_better = accuracy_criterion.get('higher_is_better', None)
                maximal_drop = accuracy_criterion.get(drop_type, None)
            else:
                drop_type, higher_is_better, maximal_drop = None, None, None
            result = openvino_model.pot(x=x,  # type: ignore
                                        y=y,
                                        metric=metric,
                                        higher_better=higher_is_better,
                                        drop_type=drop_type,
                                        maximal_drop=maximal_drop,
                                        max_iter_num=max_trials,
                                        sample_size=sample_size,
                                        config=openvino_config,
                                        thread_num=thread_num)
        elif accelerator == 'onnxruntime':
            # convert tensorflow model to onnx model
            from bigdl.nano.deps.onnxruntime.tensorflow.model import KerasONNXRuntimeModel
            if isinstance(model, KerasONNXRuntimeModel):  # type: ignore
                onnx_model = model
            else:
                onnx_model = InferenceOptimizer.trace(model=model, accelerator='onnxruntime',
                                                      input_spec=input_spec,
                                                      thread_num=thread_num)
            # trace onnx model
            method_map = {
                'qlinear': 'onnxrt_qlinearops',
                'integer': 'onnxrt_integerops',
                None: 'onnxrt_qlinearops'  # default
            }
            framework = method_map.get(method, None)
            result = inc_quantize(onnx_model,
                                  dataloader=(x, y),
                                  eval_func=eval_func,
                                  metric=metric,
                                  framework=framework,
                                  thread_num=thread_num,
                                  conf=conf,
                                  approach=approach,
                                  tuning_strategy=tuning_strategy,
                                  accuracy_criterion=accuracy_criterion,
                                  timeout=timeout,
                                  max_trials=max_trials,
                                  inputs=inputs,
                                  outputs=outputs,
                                  onnx_option='tensorflow',
                                  onnxruntime_session_options=onnxruntime_session_options)
            result._inputs_dtypes = onnx_model._inputs_dtypes
            result._mode = "arg"  # todo
        else:
            invalidInputError(False, "Accelerator {} is invalid.".format(accelerator))
        return patch_compiled_and_attrs(result, original_model)
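    # --- Hypothetical usage sketch (not part of the original module) ---
    # Post-training static INT8 quantization with the default (TensorFlow/INC)
    # backend might look like this; `model`, `calib_ds` (an unbatched dataset of
    # (inputs, targets)) and `acc_metric` are assumed to exist:
    #
    #   q_model = InferenceOptimizer.quantize(model,
    #                                         x=calib_ds,        # calibration data
    #                                         precision="int8",
    #                                         accelerator=None,  # stay in TensorFlow
    #                                         metric=acc_metric,
    #                                         accuracy_criterion={"relative": 0.01,
    #                                                             "higher_is_better": True})
    #
    # With accelerator="onnxruntime", method="qlinear" selects the
    # onnxrt_qlinearops framework shown in method_map above.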
    @staticmethod
    def save(model: Model, path):
        """
        Save the model to a local file.

        :param model: Any model of keras.Model, including all models accelerated by
               InferenceOptimizer.trace/InferenceOptimizer.quantize.
        :param path: Path to the saved model. Path should be a directory.
        """
        import yaml
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        if hasattr(model, '_save'):
            model._save(path)
        else:
            # typically for keras Model
            meta_path = Path(path) / "nano_model_meta.yml"
            with open(meta_path, 'w+') as f:
                metadata = {
                    'ModelType': 'KerasModel',
                    'checkpoint': 'saved_weight.ckpt'
                }
                yaml.safe_dump(metadata, f)
            checkpoint_path = path / metadata['checkpoint']
            model.save(checkpoint_path)
    @staticmethod
    def load(path, model: Optional[Model] = None, device=None, custom_objects=None):
        """
        Load a model from local disk.

        :param path: Path to the model to be loaded. Path should be a directory.
        :param model: Required FP32 model to load a Keras model. It is needed if:

               1. you accelerate the model with accelerator=None via
               InferenceOptimizer.trace()/InferenceOptimizer.quantize().

               2. you accelerate the model with InferenceOptimizer.optimize() and
               get_model()/get_best_model(), and the best method or the method you specify
               doesn't contain the accelerator 'onnxruntime'/'openvino'/'jit'.
               If you are not sure what optimization method is used, we recommend that you
               always pass in the original model for this case.

               3. you want the loaded model to contain the attributes of the original model.
        :param device: A string representing the device used for inference. Defaults to
               None. Only valid for OpenVINO models, otherwise will be ignored.
        :param custom_objects: Same as the `custom_objects` parameter of
               `tf.keras.models.load_model`, may only be required when loading a bf16 model.
        :return: Model with different acceleration (None/OpenVINO/ONNX Runtime) or
                 precision (FP32/FP16/BF16/INT8).
        """
        import yaml
        path = Path(path)
        invalidInputError(path.exists(), "{} doesn't exist.".format(path))
        meta_path = path / "nano_model_meta.yml"
        invalidInputError(meta_path.exists(),
                          "File {} is required to load model.".format(str(meta_path)))
        with open(meta_path, 'r') as f:
            metadata = yaml.safe_load(f)
        model_type = metadata.get('ModelType', None)
        if model_type == 'KerasOpenVINOModel':
            result = load_openvino_model(path, framework='tensorflow', device=device)
            return patch_attrs(result, model)
        if model_type == 'KerasONNXRuntimeModel':
            result = load_onnxruntime_model(path, framework='tensorflow')
            return patch_attrs(result, model)
        if model_type == 'KerasQuantizedModel':
            result = load_inc_model(path, model, framework='tensorflow')
            return patch_attrs(result, model)
        # Arriving here means we are loading a bf16 model or a normal keras model
        checkpoint_path = metadata.get('checkpoint', None)
        invalidInputError(checkpoint_path is not None, "Key 'checkpoint' must be specified.")
        checkpoint_path = path / metadata['checkpoint']
        model = keras.models.load_model(checkpoint_path, custom_objects=custom_objects)
        return model
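    # --- Hypothetical usage sketch (not part of the original module) ---
    # Saving and reloading an accelerated model might look like this; the path
    # "./nano_optimized" is an arbitrary example directory:
    #
    #   InferenceOptimizer.save(acce_model, "./nano_optimized")
    #   loaded = InferenceOptimizer.load("./nano_optimized", model=model)
    #
    # Passing the original `model` is only required for the cases listed in the
    # `load` docstring (e.g. accelerator=None, or to recover original attributes).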
def _accuracy_calculate_helper(model, metric, data):
    '''
    A quick helper to calculate accuracy
    '''
    if isinstance(metric, tf.keras.metrics.Metric):
        metric.reset_states()
        for data_input, target in data:
            metric.update_state(y_true=target, y_pred=model(data_input))
        return metric.result().numpy()
    elif isinstance(metric, Callable):
        results = []
        for data_input, target in data:
            result = metric(y_true=target, y_pred=model(data_input))
            results.append(result)
        return np.average(results)
    else:
        invalidInputError(False,
                          "metric should be a tf.keras.metrics.Metric or a Callable")
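# --- Hypothetical usage sketch (not part of the original module) ---
# _accuracy_calculate_helper accepts either a tf.keras.metrics.Metric or a plain
# callable taking y_true/y_pred keyword arguments; a callable variant might look
# like this (top1 and batched_val_ds are illustrative names):
#
#   def top1(y_true, y_pred):
#       return tf.keras.metrics.categorical_accuracy(y_true, y_pred)
#
#   acc = _accuracy_calculate_helper(model, top1, batched_val_ds)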