#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import copy
import time
from pathlib import Path
import numpy as np
import traceback
import tensorflow as tf
from typing import Dict, Optional, List, Union
from bigdl.nano.utils.inference.common.base_optimizer import BaseInferenceOptimizer
from bigdl.nano.utils.inference.common.checker import available_acceleration_combination
from bigdl.nano.utils.inference.common.utils import AccelerationOption,\
throughput_calculate_helper, format_optimize_result
from bigdl.nano.tf.utils import patch_compiled_and_attrs, patch_attrs
from bigdl.nano.utils.log4Error import invalidInputError
from tensorflow.keras import Model as Model
from tensorflow.data import Dataset
from tensorflow.keras.metrics import Metric
from bigdl.nano.deps.neural_compressor.inc_api import quantize as inc_quantzie
from bigdl.nano.deps.openvino.openvino_api import KerasOpenVINOModel
from bigdl.nano.deps.onnxruntime.onnxruntime_api import KerasONNXRuntimeModel
from bigdl.nano.deps.openvino.openvino_api import load_openvino_model
from bigdl.nano.deps.onnxruntime.onnxruntime_api import load_onnxruntime_model
from bigdl.nano.deps.neural_compressor.inc_api import load_inc_model
from bigdl.nano.tf.keras.amp import BF16Model, load_bf16_model
class TFAccelerationOption(AccelerationOption):
def optimize(self, model, x=None, y=None, input_spec=None,
thread_num=None, logging=False, sample_size_for_pot=100):
accelerator = self.get_accelerator()
if self.get_precision() == "fp32":
# trace
if accelerator is None:
return model
else:
acce_model = InferenceOptimizer.trace(model=model,
accelerator=accelerator,
input_spec=input_spec,
thread_num=thread_num,
# logging controls whether OpenVINO conversion output is printed
logging=logging)
else:
# quantize
ort_method: str = self.method
acce_model = InferenceOptimizer.quantize(model=model,
precision=self.get_precision(),
accelerator=accelerator,
input_spec=input_spec,
x=x,
y=y,
method=ort_method,
thread_num=thread_num,
sample_size=sample_size_for_pot,
# logging controls whether OpenVINO conversion output is printed
logging=logging)
return acce_model
class InferenceOptimizer(BaseInferenceOptimizer):
# acceleration method combinations, developers may want to register some new
# combinations here
ALL_INFERENCE_ACCELERATION_METHOD: Dict = \
{ # type: ignore
"original": TFAccelerationOption(),
"int8": TFAccelerationOption(inc=True),
"openvino_fp32": TFAccelerationOption(openvino=True),
"openvino_int8": TFAccelerationOption(openvino=True, pot=True),
"onnxruntime_fp32": TFAccelerationOption(onnxruntime=True),
"onnxruntime_int8_qlinear": TFAccelerationOption(onnxruntime=True, inc=True,
method="qlinear"),
"onnxruntime_int8_integer": TFAccelerationOption(onnxruntime=True, inc=True,
method="integer"),
} # type: ignore
def optimize(self, model: Model,
x: Union[tf.Tensor, np.ndarray, tf.data.Dataset],
y: Union[tf.Tensor, np.ndarray] = None,
validation_data: Optional[Dataset] = None,
input_spec=None,
batch_size: int = 1,
metric: Optional[Metric] = None,
direction: str = "max",
thread_num: Optional[int] = None,
logging: bool = False,
latency_sample_num: int = 100,
includes: Optional[List[str]] = None,
excludes: Optional[List[str]] = None,
output_filename: Optional[str] = None) -> None:
'''
This function will give all available inference acceleration methods a try
and record the latency, accuracy and model instance inside the Optimizer for
future usage. All model instances are set to evaluation mode.
The available methods are "original", "int8", "openvino_fp32", "openvino_int8",
"onnxruntime_fp32", "onnxruntime_int8_qlinear" and "onnxruntime_int8_integer".
:param model: A keras.Model to be optimized
:param x: Input data which is used for training. It could be:
| 1. a Numpy array (or array-like), or a list of arrays (in case the model
| has multiple inputs).
|
| 2. a TensorFlow tensor, or a list of tensors (in case the model has
| multiple inputs).
|
| 3. an unbatched tf.data.Dataset. Should return a tuple of (inputs, targets).
x will be used as the calibration dataset for Post-Training Static Quantization (PTQ),
as well as for generating an input_sample to calculate latency.
To avoid data leakage during calibration, please use the training dataset.
:param y: Target data. Like the input data x, it could be either Numpy array(s) or
TensorFlow tensor(s). Its length should be consistent with x.
If x is a dataset, y will be ignored (since targets will be obtained from x).
:param validation_data: (optional) An unbatched tf.data.Dataset object for accuracy
evaluation. This is only needed when users care about the possible accuracy drop.
:param input_spec: A (tuple or list of) tf.TensorSpec or numpy array defining the
shape/dtype of the input when using 'onnxruntime' accelerator.
It will be ignored if accelerator is 'openvino'.
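:param batch_size: (optional) The batch size used to batch x (and validation_data)
when measuring latency and accuracy. Default: 1.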
:param metric: (optional) A tensorflow.keras.metrics.Metric object which is used for
calculating accuracy.
:param direction: (optional) A string that indicates the higher/lower
better for the metric, "min" for the lower the better and "max" for the
higher the better. Default value is "max".
:param thread_num: (optional) An int representing how many threads (cores) to use for
inference.
:param logging: whether to log detailed information of model conversion.
Default: False.
:param latency_sample_num: (optional) An int representing the number of repetitions
used to calculate the average latency. The default value is 100.
:param includes: (optional) A list of acceleration methods that will be included in the
search. Defaults to None, meaning all available methods are included. The "original"
method will automatically be added to includes.
:param excludes: (optional) A list of acceleration methods that will be excluded from the
search. "original" will be ignored if it appears in excludes.
:param output_filename: (optional) A filename used to specify the file to which the
optimization result table will be written. The default is None, which means the table
is not written to a file.
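A minimal usage sketch (``model``, ``train_ds`` and ``val_ds`` below are placeholders
for a user-defined keras.Model and unbatched tf.data.Dataset objects)::

    opt = InferenceOptimizer()
    opt.optimize(model, x=train_ds, validation_data=val_ds,
                 metric=tf.keras.metrics.CategoricalAccuracy(),
                 direction="max", thread_num=4)
    # latency/accuracy/model for every tried method are recorded in
    # opt.optimized_model_dict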
'''
# check if model is a keras.Model or inherited from keras.Model
invalidInputError(isinstance(model, Model), "model should be a Keras Model.")
invalidInputError(direction in ['min', 'max'],
"Only support direction 'min', 'max'.")
# get the available methods whose dependencies are met
available_dict: Dict =\
available_acceleration_combination(excludes=excludes,
includes=includes,
full_methods=self.ALL_INFERENCE_ACCELERATION_METHOD)
self._direction: str = direction # save direction as attr
# record whether calculate accuracy in optimize by this attr
if validation_data is None or metric is None:
self._calculate_accuracy = False
else:
# whether accuracy calculation works is tested later;
# validation_data is expected to be unbatched, so batch it here
batched_validation_data = validation_data.batch(batch_size)
self._calculate_accuracy = True
if os.getenv('OMP_NUM_THREADS') is not None:
default_threads: int = int(os.getenv('OMP_NUM_THREADS')) # type: ignore
else:
# TODO: how to get and control thread num in tf?
default_threads = None # type: ignore
thread_num = default_threads if thread_num is None else int(thread_num) # type: ignore
result_map: Dict[str, Dict] = {}
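# derive one batched input sample from x; it is used for the sanity forward pass
# below and for measuring the latency of each accelerated model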
if isinstance(x, Dataset):
batched_training_dataset = x.batch(batch_size)
input_sample = next(iter(batched_training_dataset))
if isinstance(input_sample, (list, tuple)) and len(input_sample) > 1:
input_sample = input_sample[:-1]
else:
input_sample = tf.convert_to_tensor(x[:batch_size])
if isinstance(input_sample, (list, tuple)) and len(input_sample) == 1:
input_sample = input_sample[0]
st = time.perf_counter()
try:
if isinstance(input_sample, tf.Tensor):
model(input_sample)
else:
model(*input_sample)
except Exception:
invalidInputError(False,
"x is incompatible with your model input.")
baseline_time = time.perf_counter() - st
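# heuristic: if a single forward pass is already slow, use fewer calibration samples
# for OpenVINO POT so that the whole optimization does not take too long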
if baseline_time > 0.1: # 100ms
sample_size_for_pot = 15
else:
sample_size_for_pot = 100
print("==========================Start Optimization==========================")
start_time = time.perf_counter()
for idx, (method, available) in enumerate(available_dict.items()):
result_map[method] = {}
if available is False:
result_map[method]["status"] = "lack dependency"
else:
print(f"----------Start test {method} model "
f"({idx+1}/{len(available_dict)})----------")
option: AccelerationOption = self.ALL_INFERENCE_ACCELERATION_METHOD[method]
precision: str = option.get_precision()
try:
acce_model = option.optimize(model=model,
x=x,
y=y,
input_spec=input_spec,
thread_num=thread_num,
logging=logging,
sample_size_for_pot=sample_size_for_pot)
except Exception:
traceback.print_exc()
result_map[method]["status"] = "fail to convert"
print(f"----------Failed to convert to {method}----------")
continue
result_map[method]["status"] = "successful"
def func_test(model, sample):
model(sample)
try:
result_map[method]["latency"], status =\
throughput_calculate_helper(latency_sample_num, baseline_time,
func_test, acce_model, input_sample)
if status is False and method != "original":
result_map[method]["status"] = "early stopped"
continue
except Exception:
traceback.print_exc()
result_map[method]["status"] = "fail to forward"
print(f"----------{method} failed to forward----------")
continue
if self._calculate_accuracy:
# we assume tracing does not change accuracy,
# so we skip re-evaluation to reduce the time cost of optimize
if precision == "fp32" and method != "original":
_accuracy = result_map["original"]["accuracy"]
_accuracy = round(_accuracy, 3)
result_map[method]["accuracy"] = str(_accuracy) + '*'
else:
if method == "original":
# test whether metric works
try:
result_map[method]["accuracy"] =\
_accuracy_calculate_helper(acce_model, metric,
batched_validation_data)
except Exception:
traceback.print_exc()
self._calculate_accuracy = False
else:
result_map[method]["accuracy"] =\
_accuracy_calculate_helper(acce_model, metric,
batched_validation_data)
else:
result_map[method]["accuracy"] = None
result_map[method]["model"] = acce_model
print(f"----------Finish test {method} model "
f"({idx+1}/{len(available_dict)})----------")
self.optimized_model_dict: Dict = result_map
print("\n\n==========================Optimization Results==========================")
self._optimize_result = format_optimize_result(self.optimized_model_dict,
self._calculate_accuracy)
if self._calculate_accuracy:
# only show this line when there is accuracy data
self._optimize_result += "* means we assume the metric value of the traced "\
"model does not change, so we don't recompute metric value to save time.\n"
# save time cost to self._optimize_result
time_cost = time.perf_counter() - start_time
time_cost_str = f"Optimization cost {time_cost:.1f}s in total."
self._optimize_result += time_cost_str
if output_filename is not None:
with open(output_filename, "w") as f:
f.write(self._optimize_result)
print(self._optimize_result)
print("===========================Stop Optimization===========================")
@staticmethod
def trace(model: Model,
accelerator: Optional[str] = None,
input_spec=None,
thread_num: Optional[int] = None,
device: Optional[str] = 'CPU',
onnxruntime_session_options=None,
openvino_config=None,
logging=True,
**kwargs):
"""
Trace a Keras model and convert it into an accelerated module for inference.
:param model: The Keras model to trace.
:param accelerator: The accelerator to use, defaults to None meaning staying in Keras
backend. 'openvino' and 'onnxruntime' are supported for now.
:param input_spec: A (tuple or list of) tf.TensorSpec or numpy array defining the
shape/dtype of the input when using 'onnxruntime' accelerator.
:param thread_num: (optional) An int representing how many threads (cores) to use for
inference, only valid for accelerator='onnxruntime'
or accelerator='openvino'.
:param device: (optional) A string representing the device for inference. Defaults to 'CPU',
only valid when accelerator='openvino', otherwise will be ignored.
'CPU' and 'GPU' are supported for now.
:param onnxruntime_session_options: The session option for onnxruntime, only valid when
accelerator='onnxruntime', otherwise will be ignored.
:param openvino_config: The config to be inputted in core.compile_model. Only valid when
accelerator='openvino', otherwise will be ignored.
:param logging: whether to log detailed information of model conversion, only valid when
accelerator='openvino', otherwise will be ignored. Default: ``True``.
:param **kwargs: Other extra advanced settings, i.e. those passed to the model optimizer
function of OpenVINO, only valid when accelerator='openvino',
otherwise will be ignored.
Possible arguments are: mean_values, layout, input, output, et al.
For more details about the model optimizer, you can see ``mo --help``.
:return: Model with different acceleration(OpenVINO/ONNX Runtime).
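A minimal usage sketch (``model`` is assumed to be a user-defined keras.Model and the
input shape below is illustrative)::

    ov_model = InferenceOptimizer.trace(model, accelerator='openvino', thread_num=4)
    ort_model = InferenceOptimizer.trace(
        model, accelerator='onnxruntime',
        input_spec=tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32))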
"""
# device name might be: CPU, GPU, GPU.0, VPUX ...
invalidInputError(device == 'CPU' or 'GPU' in device,
"Now we only support fp32 for CPU and GPU, not {}".format(device))
if device != 'CPU' and accelerator != 'openvino':
invalidInputError(False,
"Now we only support {} device when accelerator "
"is openvino.".format(device))
if accelerator == 'openvino':
final_openvino_option = {"INFERENCE_PRECISION_HINT": "f32"} if device == 'CPU' else {}
if openvino_config is not None:
final_openvino_option.update(openvino_config)
result = KerasOpenVINOModel(model,
input_spec=input_spec,
precision='fp32',
thread_num=thread_num,
device=device,
config=final_openvino_option,
logging=logging,
**kwargs)
elif accelerator == 'onnxruntime':
if onnxruntime_session_options is None:
import onnxruntime
onnxruntime_session_options = onnxruntime.SessionOptions()
if thread_num is not None:
onnxruntime_session_options.intra_op_num_threads = thread_num
onnxruntime_session_options.inter_op_num_threads = thread_num
result = KerasONNXRuntimeModel(model, input_spec, onnxruntime_session_options)
else:
invalidInputError(False, "Accelerator {} is invalid.".format(accelerator))
return patch_compiled_and_attrs(result, model)
@staticmethod
def quantize(model: Model,
x: Union[tf.Tensor, np.ndarray, tf.data.Dataset] = None,
y: Union[tf.Tensor, np.ndarray] = None,
precision: str = 'int8',
accelerator: Optional[str] = None,
input_spec=None,
metric: Optional[Metric] = None,
accuracy_criterion: Optional[dict] = None,
approach: str = 'static',
method: Optional[str] = None,
conf: Optional[str] = None,
tuning_strategy: Optional[str] = None,
timeout: Optional[int] = None,
max_trials: Optional[int] = None,
batch: Optional[int] = None,
thread_num: Optional[int] = None,
device: Optional[str] = 'CPU',
inputs: List[str] = None,
outputs: List[str] = None,
sample_size: int = 100,
onnxruntime_session_options=None,
openvino_config=None,
logging: bool = True,
**kwargs):
"""
Post-training quantization on a keras model.
:param model: The Keras model to quantize.
:param x: Input data which is used for training. It could be:
| 1. a Numpy array (or array-like), or a list of arrays (in case the model
| has multiple inputs).
|
| 2. a TensorFlow tensor, or a list of tensors (in case the model has
| multiple inputs).
|
| 3. an unbatched tf.data.Dataset. Should return a tuple of (inputs, targets).
x will be used as the calibration dataset for Post-Training Static Quantization (PTQ).
To avoid data leakage during calibration, please use the training dataset.
Only valid when precision='int8', otherwise will be ignored.
:param y: Target data. Like the input data x, it could be either Numpy array(s) or
TensorFlow tensor(s). Its length should be consistent with x.
If x is a dataset, y will be ignored (since targets will be obtained from x).
:param precision: Global precision of quantized model,
supported type: 'int8', 'bf16', 'fp16', defaults to 'int8'.
:param accelerator: Use accelerator 'None', 'onnxruntime', 'openvino', defaults to None.
None means staying in tensorflow.
:param input_spec: A (tuple or list of) tf.TensorSpec or numpy array defining the
shape/dtype of the input when using 'onnxruntime' accelerator.
:param metric: A tensorflow.keras.metrics.Metric object for evaluation.
:param accuracy_criterion: Tolerable accuracy drop.
accuracy_criterion = {'relative': 0.1, 'higher_is_better': True}
allows a relative accuracy loss of 10%. accuracy_criterion =
{'absolute': 0.99, 'higher_is_better': False} means the metric value
must be smaller than 0.99.
:param approach: 'static' or 'dynamic'.
'static': post_training_static_quant,
'dynamic': post_training_dynamic_quant.
Default: 'static'. Only 'static' approach is supported now.
:param method: Method to do quantization. When accelerator=None, supported methods:
None. When accelerator='onnxruntime', supported methods: 'qlinear', 'integer',
defaults to 'qlinear'. Suggest 'qlinear' for lower accuracy drop if using
static quantization.
More details in https://onnxruntime.ai/docs/performance/quantization.html.
This argument has no effect when accelerator='openvino'.
:param conf: A path to conf yaml file for quantization.
Default: None, using default config.
:param tuning_strategy: 'bayesian', 'basic', 'mse', 'sigopt'. Default: 'bayesian'.
:param timeout: Tuning timeout (seconds). Default: None, which means early stop.
Combine with max_trials field to decide when to exit.
:param max_trials: Max tune times. Default: None, which means no tuning.
Combine with timeout field to decide when to exit.
"timeout=0, max_trials=1" means it will try quantization only once and
return satisfying best model.
:param batch: Batch size of the dataloader for the calibration dataset. Defaults to None.
If the dataset is not a BatchDataset, the batch size equals 1; otherwise the
batch size follows dataset._batch_size.
:param thread_num: (optional) An int representing how many threads (cores) to use for
inference, only valid for accelerator='onnxruntime'
or accelerator='openvino'.
:param device: (optional) A string representing the device for inference. Defaults to 'CPU',
only valid when accelerator='openvino', otherwise will be ignored.
'CPU', 'GPU' and 'VPUX' are supported for now.
:param inputs: A list of input names.
Default: None, automatically get names from graph.
:param outputs: A list of output names.
Default: None, automatically get names from graph.
:param sample_size: (optional) An int representing how many samples will be used by the
Post-training Optimization Tools (POT) from the OpenVINO toolkit,
only valid for accelerator='openvino'. Defaults to 100.
The larger the value, the more accurate the conversion and the lower
the performance degradation, but the longer the conversion takes.
:param onnxruntime_session_options: The session option for onnxruntime, only valid when
accelerator='onnxruntime', otherwise will be ignored.
:param openvino_config: The config to be inputted in core.compile_model. Only valid when
accelerator='openvino', otherwise will be ignored.
:param logging: whether to log detailed information of model conversion, only valid when
accelerator='openvino', otherwise will be ignored. Default: ``True``.
:param **kwargs: Other extra advanced settings include:
1. those passed to the ONNX export function,
only valid when accelerator='onnxruntime'/'openvino',
otherwise will be ignored.
Possible arguments are: input_names, output_names, opset_version,
et al.
2. those passed to the ``model optimizer`` function of openvino,
only valid when accelerator='openvino',
otherwise will be ignored.
Possible arguments are: mean_values, layout, input, output, et al.
For more details about the model optimizer, you can see ``mo --help``.
If you want to quantize with openvino on VPUX device,
you must specify ``mean_value`` for model optimizer function.
Here ``mean_value`` represents mean values to be used for the input image
per channel. Values to be provided in the (R,G,B) or [R,G,B] format.
Can be defined for desired input of the model, for example:
"--mean_values data[255,255,255],info[255,255,255]". The exact meaning
and order of channels depend on how the original model was trained.
:return: A TensorflowBaseModel. If there is no model found, return None.
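A minimal usage sketch (``model``, ``calib_x`` and ``calib_y`` are placeholders for a
user-defined keras.Model and its calibration inputs/targets)::

    int8_model = InferenceOptimizer.quantize(model, x=calib_x, y=calib_y,
                                             precision='int8', accelerator=None)
    ov_int8_model = InferenceOptimizer.quantize(model, x=calib_x, y=calib_y,
                                                precision='int8', accelerator='openvino')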
"""
invalidInputError(precision in ['int8', 'fp16', 'bf16'],
"Only support 'int8', 'bf16', 'fp16' now, "
"no support for {}.".format(precision))
# device name might be: CPU, GPU, GPU.0, VPUX ...
invalidInputError(device == 'CPU' or 'GPU' in device or device == 'VPUX',
"Now we only support CPU, GPU and VPUX, not {}".format(device))
if device != 'CPU' and accelerator != 'openvino':
invalidInputError(False,
"Now we only support {} device when accelerator "
"is openvino.".format(device))
if precision == 'fp16':
invalidInputError('GPU' in device or device == 'VPUX',
"fp16 is not supported on {} device.".format(device))
invalidInputError(accelerator == 'openvino',
"fp16 is not supported on {} accelerator.".format(accelerator))
if device == 'VPUX':
# for fp16 on VPUX, must specify mean_value.
invalidInputError('mean_value' in kwargs,
"If you want to quantize with openvino float16 precision on "
"VPUX device, you must specify mean_value for model optimizer "
"function. For more details about model optimizer, you can "
"see mo --help .")
from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel # type: ignore
result = KerasOpenVINOModel(model,
input_spec=input_spec,
precision=precision,
thread_num=thread_num,
device=device,
config=openvino_config,
logging=logging,
**kwargs)
return patch_compiled_and_attrs(result, model)
elif precision == 'bf16':
invalidInputError(accelerator == 'openvino' or accelerator is None,
"Accelerator {} is invalid for BF16.".format(accelerator))
invalidInputError(device == 'CPU',
"Device {} don't support bfloat16.".format(device))
if accelerator == 'openvino':
final_openvino_option = {"INFERENCE_PRECISION_HINT": "bf16"}
if openvino_config is not None:
final_openvino_option.update(openvino_config)
from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel # type: ignore
result = KerasOpenVINOModel(model,
input_spec=input_spec,
precision=precision,
thread_num=thread_num,
device=device,
config=final_openvino_option,
logging=logging,
**kwargs)
return patch_compiled_and_attrs(result, model)
elif accelerator is None:
result = BF16Model(model)
return patch_compiled_and_attrs(result, model)
invalidInputError(approach == 'static', "Only 'static' approach is supported now.")
if not isinstance(x, tf.data.Dataset) and y is None:
# fake label to make quantization work
y = range(len(x)) # type: ignore
if isinstance(x, tf.data.Dataset):
batch_data = next(iter(x))
if isinstance(batch_data, tf.Tensor) or \
isinstance(batch_data, tuple) and len(batch_data) == 1:
# fake label to make quantization work
y = range(len(x)) # type: ignore
y = tf.data.Dataset.from_tensor_slices(y)
x = tf.data.Dataset.zip((x, y))
if accelerator is None:
if isinstance(x, tf.data.Dataset):
calib_dataset = x
else:
calib_dataset = tf.data.Dataset.from_tensor_slices((x, y))
if batch:
calib_dataset = calib_dataset.batch(batch)
result = inc_quantzie(model, dataloader=calib_dataset,
metric=metric,
framework='tensorflow',
conf=conf,
approach=approach,
tuning_strategy=tuning_strategy,
accuracy_criterion=accuracy_criterion,
timeout=timeout,
max_trials=max_trials,
inputs=inputs,
outputs=outputs)
elif accelerator == 'openvino':
from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel # type: ignore
if isinstance(model, KerasOpenVINOModel): # type: ignore
openvino_model = model
openvino_model = openvino_model.target_obj
else:
# For CPU: fp32 -> int8, for GPU: fp16 -> int8
_precision = 'fp16' if device != 'CPU' else 'fp32'
if device == 'VPUX':
# for fp16 on VPUX, must specify mean_value.
invalidInputError('mean_value' in kwargs,
"If you want to quantize with openvino on VPUX device, "
"you must specify mean_value for model optimizer "
"function. For more details about model optimizer, you "
"can see mo --help .")
openvino_model = KerasOpenVINOModel(model,
input_spec=input_spec,
precision=_precision,
thread_num=thread_num,
device=device,
config=openvino_config,
logging=logging,
**kwargs)
if metric:
if not isinstance(accuracy_criterion, dict):
accuracy_criterion = {'relative': 0.99, 'higher_is_better': True}
drop_type = 'relative' if 'relative' in accuracy_criterion else 'absolute'
higher_is_better = accuracy_criterion.get('higher_is_better', None)
maximal_drop = accuracy_criterion.get(drop_type, None)
else:
drop_type, higher_is_better, maximal_drop = None, None, None
result = openvino_model.pot(x=x, # type: ignore
y=y,
metric=metric,
higher_better=higher_is_better,
drop_type=drop_type,
maximal_drop=maximal_drop,
max_iter_num=max_trials,
sample_size=sample_size,
config=openvino_config,
thread_num=thread_num)
elif accelerator == 'onnxruntime':
# convert tensorflow model to onnx model
from bigdl.nano.deps.onnxruntime.tensorflow.tensorflow_onnxruntime_model \
import KerasONNXRuntimeModel
if isinstance(model, KerasONNXRuntimeModel): # type: ignore
onnx_model = model
else:
onnx_model = InferenceOptimizer.trace(model=model, accelerator='onnxruntime',
input_spec=input_spec, thread_num=thread_num)
onnx_model = onnx_model.target_obj
# quantize the ONNX model with INC; map method names to INC framework identifiers
method_map = {
'qlinear': 'onnxrt_qlinearops',
'integer': 'onnxrt_integerops',
None: 'onnxrt_qlinearops' # default
}
framework = method_map.get(method, None)
result = inc_quantzie(onnx_model, dataloader=(x, y),
metric=metric,
framework=framework,
thread_num=thread_num,
conf=conf,
approach=approach,
tuning_strategy=tuning_strategy,
accuracy_criterion=accuracy_criterion,
timeout=timeout,
max_trials=max_trials,
inputs=inputs,
outputs=outputs,
onnx_option='tensorflow',
onnxruntime_session_options=onnxruntime_session_options)
result._nesting_level = onnx_model._nesting_level
result._inputs_dtypes = onnx_model._inputs_dtypes
result._default_kwargs = onnx_model._default_kwargs
result._call_fn_args_backup = onnx_model._call_fn_args_backup
else:
invalidInputError(False, "Accelerator {} is invalid.".format(accelerator))
return patch_compiled_and_attrs(result, model)
@staticmethod
def save(model: Model, path):
"""
Save the model to local file.
:param model: Any model of keras.Model, including models accelerated by
InferenceOptimizer.trace/InferenceOptimizer.quantize.
:param path: Path to saved model. Path should be a directory.
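A minimal usage sketch (``acce_model`` is a model returned by trace/quantize and the
directory name is illustrative)::

    InferenceOptimizer.save(acce_model, "./optimized_model")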
"""
import yaml
path = Path(path)
path.mkdir(parents=True, exist_ok=True)
if hasattr(model, '_save'):
model._save(path)
else:
# typically for keras Model
meta_path = Path(path) / "nano_model_meta.yml"
with open(meta_path, 'w+') as f:
metadata = {
'ModelType': 'KerasModel',
'checkpoint': 'saved_weight.ckpt'
}
yaml.safe_dump(metadata, f)
checkpoint_path = path / metadata['checkpoint']
model.save_weights(checkpoint_path)
@staticmethod
def load(path, model: Model, device=None):
"""
Load a model from local.
:param path: Path to model to be loaded. Path should be a directory.
:param model: The original FP32 model, required for loading an optimized tensorflow model.
:param device: A string represents the device of the inference. Default to None.
Only valid for openvino model, otherwise will be ignored.
:return: Model with different acceleration(None/OpenVINO/ONNX Runtime) or
precision(FP32/FP16/BF16/INT8).
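A minimal usage sketch (``model`` is the original FP32 keras.Model and the directory
name is illustrative)::

    loaded_model = InferenceOptimizer.load("./optimized_model", model=model)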
"""
import yaml
path = Path(path)
invalidInputError(path.exists(), "{} doesn't exist.".format(path))
meta_path = path / "nano_model_meta.yml"
invalidInputError(meta_path.exists(),
"File {} is required to load model.".format(str(meta_path)))
with open(meta_path, 'r') as f:
metadata = yaml.safe_load(f)
model_type = metadata.get('ModelType', None)
if model_type == 'KerasOpenVINOModel':
result = load_openvino_model(path, framework='tensorflow', device=device)
return patch_attrs(result, model)
if model_type == 'KerasONNXRuntimeModel':
result = load_onnxruntime_model(path, framework='tensorflow')
return patch_attrs(result, model)
if model_type == 'KerasQuantizedModel':
result = load_inc_model(path, model, framework='tensorflow')
return patch_attrs(result, model)
if model_type == 'BF16Model':
result = load_bf16_model(path)
return patch_attrs(result, model)
if isinstance(model, Model):
# typically for keras Model
model = copy.deepcopy(model)
checkpoint_path = metadata.get('checkpoint', None)
if checkpoint_path:
checkpoint_path = path / metadata['checkpoint']
model.load_weights(checkpoint_path)
return model
else:
invalidInputError(False, "Key 'checkpoint' must be specified.")
else:
invalidInputError(False,
"ModelType {} or argument 'model={}' is not acceptable for tensorflow"
" loading.".format(model_type, type(model)))
def _accuracy_calculate_helper(model, metric, data):
'''
A quick helper to calculate accuracy
'''
for data_input, target in data:
metric.update_state(y_true=target, y_pred=model(data_input))
return metric.result().numpy()