#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import copy
import time
from pathlib import Path
import numpy as np
import traceback
import tensorflow as tf
from typing import Dict, Optional, List, Union
from bigdl.nano.utils.inference.common.base_optimizer import BaseInferenceOptimizer
from bigdl.nano.utils.inference.common.checker import available_acceleration_combination
from bigdl.nano.utils.inference.common.utils import AccelerationOption,\
throughput_calculate_helper, format_optimize_result
from bigdl.nano.tf.utils import patch_compiled_and_attrs, patch_attrs
from bigdl.nano.utils.log4Error import invalidInputError
from tensorflow.keras import Model as Model
from tensorflow.data import Dataset
from tensorflow.keras.metrics import Metric
from bigdl.nano.deps.neural_compressor.inc_api import quantize as inc_quantzie
from bigdl.nano.deps.openvino.openvino_api import KerasOpenVINOModel
from bigdl.nano.deps.onnxruntime.onnxruntime_api import KerasONNXRuntimeModel
from bigdl.nano.deps.openvino.openvino_api import load_openvino_model
from bigdl.nano.deps.onnxruntime.onnxruntime_api import load_onnxruntime_model
from bigdl.nano.deps.neural_compressor.inc_api import load_inc_model
from bigdl.nano.tf.keras.amp import BF16Model, load_bf16_model
class TFAccelerationOption(AccelerationOption):
def optimize(self, model, x=None, y=None, input_spec=None,
thread_num=None, logging=False, sample_size_for_pot=100):
accelerator = self.get_accelerator()
if self.get_precision() == "fp32":
# trace
if accelerator is None:
return model
else:
acce_model = InferenceOptimizer.trace(model=model,
accelerator=accelerator,
input_spec=input_spec,
thread_num=thread_num,
# logging controls whether OpenVINO conversion output is printed
logging=logging)
else:
# quantize
ort_method: str = self.method
acce_model = InferenceOptimizer.quantize(model=model,
precision=self.get_precision(),
accelerator=accelerator,
input_spec=input_spec,
x=x,
y=y,
method=ort_method,
thread_num=thread_num,
sample_size=sample_size_for_pot,
# logging controls whether OpenVINO conversion output is printed
logging=logging)
return acce_model
class InferenceOptimizer(BaseInferenceOptimizer):
# acceleration method combinations, developers may want to register some new
# combinations here
ALL_INFERENCE_ACCELERATION_METHOD: Dict = \
{ # type: ignore
"original": TFAccelerationOption(),
"int8": TFAccelerationOption(inc=True),
"openvino_fp32": TFAccelerationOption(openvino=True),
"openvino_int8": TFAccelerationOption(openvino=True, pot=True),
"onnxruntime_fp32": TFAccelerationOption(onnxruntime=True),
"onnxruntime_int8_qlinear": TFAccelerationOption(onnxruntime=True, inc=True,
method="qlinear"),
"onnxruntime_int8_integer": TFAccelerationOption(onnxruntime=True, inc=True,
method="integer"),
} # type: ignore
def optimize(self, model: Model,
x: Union[tf.Tensor, np.ndarray, tf.data.Dataset],
y: Union[tf.Tensor, np.ndarray] = None,
validation_data: Optional[Dataset] = None,
input_spec=None,
batch_size: int = 1,
metric: Optional[Metric] = None,
direction: str = "max",
thread_num: Optional[int] = None,
logging: bool = False,
latency_sample_num: int = 100,
includes: Optional[List[str]] = None,
excludes: Optional[List[str]] = None,
output_filename: Optional[str] = None) -> None:
'''
This function will give all available inference acceleration methods a try
and record the latency, accuracy and model instance inside the Optimizer for
future usage. All model instances are set to evaluation mode.
The available methods are "original", "int8", "openvino_fp32", "openvino_int8",
"onnxruntime_fp32", "onnxruntime_int8_qlinear" and "onnxruntime_int8_integer".
:param model: A keras.Model to be optimized
:param x: Input data which is used for training. It could be:
| 1. a Numpy array (or array-like), or a list of arrays (in case the model
| has multiple inputs).
|
| 2. a TensorFlow tensor, or a list of tensors (in case the model has
| multiple inputs).
|
| 3. an unbatched tf.data.Dataset. Should return a tuple of (inputs, targets).
x will be used as the calibration dataset for Post-Training Static Quantization (PTQ),
as well as for generating an input_sample to calculate latency.
To avoid data leakage during calibration, please use the training dataset.
:param y: Target data. Like the input data x, it could be either Numpy array(s) or
TensorFlow tensor(s). Its length should be consistent with x.
If x is a dataset, y will be ignored (since targets will be obtained from x).
:param validation_data: (optional) An unbatched tf.data.Dataset object for accuracy
evaluation. This is only needed when users care about the possible accuracy drop.
:param input_spec: A (tuple or list of) tf.TensorSpec or numpy array defining the
shape/dtype of the input when using 'onnxruntime' accelerator.
It will be ignored if accelerator is 'openvino'.
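:param batch_size: (optional) The batch size used to batch x (and validation_data)
when measuring latency and accuracy. Default: 1.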
:param metric: (optional) A tensorflow.keras.metrics.Metric object which is used for
calculating accuracy.
:param direction: (optional) A string that indicates the higher/lower
better for the metric, "min" for the lower the better and "max" for the
higher the better. Default value is "max".
:param thread_num: (optional) An int representing how many threads (cores) to use for
inference.
:param logging: whether to log detailed information of model conversion.
Default: False.
:param latency_sample_num: (optional) An int representing the number of repetitions
used to calculate the average latency. The default value is 100.
:param includes: (optional) A list of acceleration methods that will be included in the
search. Defaults to None, meaning all available methods are included. The "original"
method will automatically be added to includes.
:param excludes: (optional) A list of acceleration methods that will be excluded from the
search. "original" will be ignored if it appears in excludes.
:param output_filename: (optional) A filename used to specify the file to which the
optimization result table will be written. The default is None, which means the table
is not written to a file.
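A minimal usage sketch (``model``, ``train_ds`` and ``val_ds`` below are placeholders
for a user-defined keras.Model and unbatched tf.data.Dataset objects)::

    opt = InferenceOptimizer()
    opt.optimize(model, x=train_ds, validation_data=val_ds,
                 metric=tf.keras.metrics.CategoricalAccuracy(),
                 direction="max", thread_num=4)
    # latency/accuracy/model for every tried method are recorded in
    # opt.optimized_model_dict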
'''
# check if model is a keras.Model or inherited from keras.Model
invalidInputError(isinstance(model, Model), "model should be a Keras Model.")
invalidInputError(direction in ['min', 'max'],
"Only support direction 'min', 'max'.")
# get the available methods whose dependencies are met
available_dict: Dict =\
available_acceleration_combination(excludes=excludes,
includes=includes,
full_methods=self.ALL_INFERENCE_ACCELERATION_METHOD)
self._direction: str = direction # save direction as attr
# record whether calculate accuracy in optimize by this attr
if validation_data is None or metric is None:
self._calculate_accuracy = False
else:
# whether accuracy calculation works is tested later;
# validation_data is expected to be unbatched, so batch it here
batched_validation_data = validation_data.batch(batch_size)
self._calculate_accuracy = True
if os.getenv('OMP_NUM_THREADS') is not None:
default_threads: int = int(os.getenv('OMP_NUM_THREADS')) # type: ignore
else:
# TODO: how to get and control thread num in tf?
default_threads = None # type: ignore
thread_num = default_threads if thread_num is None else int(thread_num) # type: ignore
result_map: Dict[str, Dict] = {}
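# derive one batched input sample from x; it is used for the sanity forward pass
# below and for measuring the latency of each accelerated model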
if isinstance(x, Dataset):
batched_training_dataset = x.batch(batch_size)
input_sample = next(iter(batched_training_dataset))
if isinstance(input_sample, (list, tuple)) and len(input_sample) > 1:
input_sample = input_sample[:-1]
else:
input_sample = tf.convert_to_tensor(x[:batch_size])
if isinstance(input_sample, (list, tuple)) and len(input_sample) == 1:
input_sample = input_sample[0]
st = time.perf_counter()
try:
if isinstance(input_sample, tf.Tensor):
model(input_sample)
else:
model(*input_sample)
except Exception:
invalidInputError(False,
"x is incompatible with your model input.")
baseline_time = time.perf_counter() - st
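# heuristic: if a single forward pass is already slow, use fewer calibration samples
# for OpenVINO POT so that the whole optimization does not take too long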
if baseline_time > 0.1: # 100ms
sample_size_for_pot = 15
else:
sample_size_for_pot = 100
print("==========================Start Optimization==========================")
start_time = time.perf_counter()
for idx, (method, available) in enumerate(available_dict.items()):
result_map[method] = {}
if available is False:
result_map[method]["status"] = "lack dependency"
else:
print(f"----------Start test {method} model "
f"({idx+1}/{len(available_dict)})----------")
option: AccelerationOption = self.ALL_INFERENCE_ACCELERATION_METHOD[method]
precision: str = option.get_precision()
try:
acce_model = option.optimize(model=model,
x=x,
y=y,
input_spec=input_spec,
thread_num=thread_num,
logging=logging,
sample_size_for_pot=sample_size_for_pot)
except Exception:
traceback.print_exc()
result_map[method]["status"] = "fail to convert"
print(f"----------Failed to convert to {method}----------")
continue
result_map[method]["status"] = "successful"
def func_test(model, sample):
model(sample)
try:
result_map[method]["latency"], status =\
throughput_calculate_helper(latency_sample_num, baseline_time,
func_test, acce_model, input_sample)
if status is False and method != "original":
result_map[method]["status"] = "early stopped"
continue
except Exception:
traceback.print_exc()
result_map[method]["status"] = "fail to forward"
print(f"----------{method} failed to forward----------")
continue
if self._calculate_accuracy:
# we assume tracing does not change accuracy,
# so we skip re-evaluation to reduce the time cost of optimize
if precision == "fp32" and method != "original":
_accuracy = result_map["original"]["accuracy"]
_accuracy = round(_accuracy, 3)
result_map[method]["accuracy"] = str(_accuracy) + '*'
else:
if method == "original":
# test whether metric works
try:
result_map[method]["accuracy"] =\
_accuracy_calculate_helper(acce_model, metric,
batched_validation_data)
except Exception:
traceback.print_exc()
self._calculate_accuracy = False
else:
result_map[method]["accuracy"] =\
_accuracy_calculate_helper(acce_model, metric,
batched_validation_data)
else:
result_map[method]["accuracy"] = None
result_map[method]["model"] = acce_model
print(f"----------Finish test {method} model "
f"({idx+1}/{len(available_dict)})----------")
self.optimized_model_dict: Dict = result_map
print("\n\n==========================Optimization Results==========================")
self._optimize_result = format_optimize_result(self.optimized_model_dict,
self._calculate_accuracy)
if self._calculate_accuracy:
# only show this line when there is accuracy data
self._optimize_result += "* means we assume the metric value of the traced "\
"model does not change, so we don't recompute metric value to save time.\n"
# save time cost to self._optimize_result
time_cost = time.perf_counter() - start_time
time_cost_str = f"Optimization cost {time_cost:.1f}s in total."
self._optimize_result += time_cost_str
if output_filename is not None:
with open(output_filename, "w") as f:
f.write(self._optimize_result)
print(self._optimize_result)
print("===========================Stop Optimization===========================")
@staticmethod
def trace(model: Model,
accelerator: Optional[str] = None,
input_spec=None,
thread_num: Optional[int] = None,
device: Optional[str] = 'CPU',
onnxruntime_session_options=None,
openvino_config=None,
logging=True,
**kwargs):
"""
Trace a Keras model and convert it into an accelerated module for inference.
:param model: The Keras model to trace.
:param accelerator: The accelerator to use, defaults to None meaning staying in Keras
backend. 'openvino' and 'onnxruntime' are supported for now.
:param input_spec: A (tuple or list of) tf.TensorSpec or numpy array defining the
shape/dtype of the input when using 'onnxruntime' accelerator.
:param thread_num: (optional) An int representing how many threads (cores) to use for
inference, only valid for accelerator='onnxruntime'
or accelerator='openvino'.
:param device: (optional) A string representing the device for inference. Defaults to 'CPU',
only valid when accelerator='openvino', otherwise will be ignored.
'CPU' and 'GPU' are supported for now.
:param onnxruntime_session_options: The session option for onnxruntime, only valid when
accelerator='onnxruntime', otherwise will be ignored.
:param openvino_config: The config to be inputted in core.compile_model. Only valid when
accelerator='openvino', otherwise will be ignored.
:param logging: whether to log detailed information of model conversion, only valid when
accelerator='openvino', otherwise will be ignored. Default: ``True``.
:param **kwargs: Other extra advanced settings, i.e. those passed to the model optimizer
function of OpenVINO, only valid when accelerator='openvino',
otherwise will be ignored.
Possible arguments are: mean_values, layout, input, output, et al.
For more details about the model optimizer, you can see ``mo --help``.
:return: Model with different acceleration(OpenVINO/ONNX Runtime).
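A minimal usage sketch (``model`` is assumed to be a user-defined keras.Model and the
input shape below is illustrative)::

    ov_model = InferenceOptimizer.trace(model, accelerator='openvino', thread_num=4)
    ort_model = InferenceOptimizer.trace(
        model, accelerator='onnxruntime',
        input_spec=tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32))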
"""
# device name might be: CPU, GPU, GPU.0, VPUX ...
invalidInputError(device == 'CPU' or 'GPU' in device,
"Now we only support fp32 for CPU and GPU, not {}".format(device))
if device != 'CPU' and accelerator != 'openvino':
invalidInputError(False,
"Now we only support {} device when accelerator "
"is openvino.".format(device))
if accelerator == 'openvino':
final_openvino_option = {"INFERENCE_PRECISION_HINT": "f32"} if device == 'CPU' else {}
if openvino_config is not None:
final_openvino_option.update(openvino_config)
result = KerasOpenVINOModel(model,
input_spec=input_spec,
precision='fp32',
thread_num=thread_num,
device=device,
config=final_openvino_option,
logging=logging,
**kwargs)
elif accelerator == 'onnxruntime':
if onnxruntime_session_options is None:
import onnxruntime
onnxruntime_session_options = onnxruntime.SessionOptions()
if thread_num is not None:
onnxruntime_session_options.intra_op_num_threads = thread_num
onnxruntime_session_options.inter_op_num_threads = thread_num
result = KerasONNXRuntimeModel(model, input_spec, onnxruntime_session_options)
else:
invalidInputError(False, "Accelerator {} is invalid.".format(accelerator))
return patch_compiled_and_attrs(result, model)
@staticmethod
def quantize(model: Model,
x: Union[tf.Tensor, np.ndarray, tf.data.Dataset] = None,
y: Union[tf.Tensor, np.ndarray] = None,
precision: str = 'int8',
accelerator: Optional[str] = None,
input_spec=None,
metric: Optional[Metric] = None,
accuracy_criterion: Optional[dict] = None,
approach: str = 'static',
method: Optional[str] = None,
conf: Optional[str] = None,
tuning_strategy: Optional[str] = None,
timeout: Optional[int] = None,
max_trials: Optional[int] = None,
batch: Optional[int] = None,
thread_num: Optional[int] = None,
device: Optional[str] = 'CPU',
inputs: List[str] = None,
outputs: List[str] = None,
sample_size: int = 100,
onnxruntime_session_options=None,
openvino_config=None,
logging: bool = True,
**kwargs):
"""
Post-training quantization on a keras model.
:param model: The Keras model to quantize.
:param x: Input data which is used for training. It could be:
| 1. a Numpy array (or array-like), or a list of arrays (in case the model
| has multiple inputs).
|
| 2. a TensorFlow tensor, or a list of tensors (in case the model has
| multiple inputs).
|
| 3. an unbatched tf.data.Dataset. Should return a tuple of (inputs, targets).
x will be used as the calibration dataset for Post-Training Static Quantization (PTQ).
To avoid data leakage during calibration, please use the training dataset.
Only valid when precision='int8', otherwise will be ignored.
:param y: Target data. Like the input data x, it could be either Numpy array(s) or
TensorFlow tensor(s). Its length should be consistent with x.
If x is a dataset, y will be ignored (since targets will be obtained from x).
:param precision: Global precision of quantized model,
supported type: 'int8', 'bf16', 'fp16', defaults to 'int8'.
:param accelerator: Use accelerator 'None', 'onnxruntime', 'openvino', defaults to None.
None means staying in tensorflow.
:param input_spec: A (tuple or list of) tf.TensorSpec or numpy array defining the
shape/dtype of the input when using 'onnxruntime' accelerator.
:param metric: A tensorflow.keras.metrics.Metric object for evaluation.
:param accuracy_criterion: Tolerable accuracy drop.
accuracy_criterion = {'relative': 0.1, 'higher_is_better': True}
allows a relative accuracy loss of 10%. accuracy_criterion =
{'absolute': 0.99, 'higher_is_better': False} means the metric value
must be smaller than 0.99.
:param approach: 'static' or 'dynamic'.
'static': post_training_static_quant,
'dynamic': post_training_dynamic_quant.
Default: 'static'. Only 'static' approach is supported now.
:param method: Method to do quantization. When accelerator=None, supported methods:
None. When accelerator='onnxruntime', supported methods: 'qlinear', 'integer',
defaults to 'qlinear'. Suggest 'qlinear' for lower accuracy drop if using
static quantization.
More details in https://onnxruntime.ai/docs/performance/quantization.html.
This argument has no effect when accelerator='openvino'.
:param conf: A path to conf yaml file for quantization.
Default: None, using default config.
:param tuning_strategy: 'bayesian', 'basic', 'mse', 'sigopt'. Default: 'bayesian'.
:param timeout: Tuning timeout (seconds). Default: None, which means early stop.
Combine with max_trials field to decide when to exit.
:param max_trials: Max tune times. Default: None, which means no tuning.
Combine with timeout field to decide when to exit.
"timeout=0, max_trials=1" means it will try quantization only once and
return satisfying best model.
:param batch: Batch size of the dataloader for the calibration dataset. Defaults to None.
If the dataset is not a BatchDataset, the batch size equals 1; otherwise the
batch size follows dataset._batch_size.
:param thread_num: (optional) An int representing how many threads (cores) to use for
inference, only valid for accelerator='onnxruntime'
or accelerator='openvino'.
:param device: (optional) A string representing the device for inference. Defaults to 'CPU',
only valid when accelerator='openvino', otherwise will be ignored.
'CPU', 'GPU' and 'VPUX' are supported for now.
:param inputs: A list of input names.
Default: None, automatically get names from graph.
:param outputs: A list of output names.
Default: None, automatically get names from graph.
:param sample_size: (optional) An int representing how many samples will be used by the
Post-training Optimization Tools (POT) from the OpenVINO toolkit,
only valid for accelerator='openvino'. Defaults to 100.
The larger the value, the more accurate the conversion and the lower
the performance degradation, but the longer the conversion takes.
:param onnxruntime_session_options: The session option for onnxruntime, only valid when
accelerator='onnxruntime', otherwise will be ignored.
:param openvino_config: The config to be inputted in core.compile_model. Only valid when
accelerator='openvino', otherwise will be ignored.
:param logging: whether to log detailed information of model conversion, only valid when
accelerator='openvino', otherwise will be ignored. Default: ``True``.
:param **kwargs: Other extra advanced settings include:
1. those passed to the ONNX export function,
only valid when accelerator='onnxruntime'/'openvino',
otherwise will be ignored.
Possible arguments are: input_names, output_names, opset_version,
et al.
2. those passed to the ``model optimizer`` function of openvino,
only valid when accelerator='openvino',
otherwise will be ignored.
Possible arguments are: mean_values, layout, input, output, et al.
For more details about the model optimizer, you can see ``mo --help``.
If you want to quantize with openvino on VPUX device,
you must specify ``mean_value`` for model optimizer function.
Here ``mean_value`` represents mean values to be used for the input image
per channel. Values to be provided in the (R,G,B) or [R,G,B] format.
Can be defined for desired input of the model, for example:
"--mean_values data[255,255,255],info[255,255,255]". The exact meaning
and order of channels depend on how the original model was trained.
:return: A TensorflowBaseModel. If there is no model found, return None.
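A minimal usage sketch (``model``, ``calib_x`` and ``calib_y`` are placeholders for a
user-defined keras.Model and its calibration inputs/targets)::

    int8_model = InferenceOptimizer.quantize(model, x=calib_x, y=calib_y,
                                             precision='int8', accelerator=None)
    ov_int8_model = InferenceOptimizer.quantize(model, x=calib_x, y=calib_y,
                                                precision='int8', accelerator='openvino')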
"""
invalidInputError(precision in ['int8', 'fp16', 'bf16'],
"Only support 'int8', 'bf16', 'fp16' now, "
"no support for {}.".format(precision))
# device name might be: CPU, GPU, GPU.0, VPUX ...
invalidInputError(device == 'CPU' or 'GPU' in device or device == 'VPUX',
"Now we only support CPU, GPU and VPUX, not {}".format(device))
if device != 'CPU' and accelerator != 'openvino':
invalidInputError(False,
"Now we only support {} device when accelerator "
"is openvino.".format(device))
if precision == 'fp16':
invalidInputError('GPU' in device or device == 'VPUX',
"fp16 is not supported on {} device.".format(device))
invalidInputError(accelerator == 'openvino',
"fp16 is not supported on {} accelerator.".format(accelerator))
if device == 'VPUX':
# for fp16 on VPUX, must specify mean_value.
invalidInputError('mean_value' in kwargs,
"If you want to quantize with openvino float16 precision on "
"VPUX device, you must specify mean_value for model optimizer "
"function. For more details about model optimizer, you can "
"see mo --help .")
from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel # type: ignore
result = KerasOpenVINOModel(model,
input_spec=input_spec,
precision=precision,
thread_num=thread_num,
device=device,
config=openvino_config,
logging=logging,
**kwargs)
return patch_compiled_and_attrs(result, model)
elif precision == 'bf16':
invalidInputError(accelerator == 'openvino' or accelerator is None,
"Accelerator {} is invalid for BF16.".format(accelerator))
invalidInputError(device == 'CPU',
"Device {} don't support bfloat16.".format(device))
if accelerator == 'openvino':
final_openvino_option = {"INFERENCE_PRECISION_HINT": "bf16"}
if openvino_config is not None:
final_openvino_option.update(openvino_config)
from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel # type: ignore
result = KerasOpenVINOModel(model,
input_spec=input_spec,
precision=precision,
thread_num=thread_num,
device=device,
config=final_openvino_option,
logging=logging,
**kwargs)
return patch_compiled_and_attrs(result, model)
elif accelerator is None:
result = BF16Model(model)
return patch_compiled_and_attrs(result, model)
invalidInputError(approach == 'static', "Only 'static' approach is supported now.")
if not isinstance(x, tf.data.Dataset) and y is None:
# fake label to make quantization work
y = range(len(x)) # type: ignore
if isinstance(x, tf.data.Dataset):
batch_data = next(iter(x))
if isinstance(batch_data, tf.Tensor) or \
isinstance(batch_data, tuple) and len(batch_data) == 1:
# fake label to make quantization work
y = range(len(x)) # type: ignore
y = tf.data.Dataset.from_tensor_slices(y)
x = tf.data.Dataset.zip((x, y))
if accelerator is None:
if isinstance(x, tf.data.Dataset):
calib_dataset = x
else:
calib_dataset = tf.data.Dataset.from_tensor_slices((x, y))
if batch:
calib_dataset = calib_dataset.batch(batch)
result = inc_quantzie(model, dataloader=calib_dataset,
metric=metric,
framework='tensorflow',
conf=conf,
approach=approach,
tuning_strategy=tuning_strategy,
accuracy_criterion=accuracy_criterion,
timeout=timeout,
max_trials=max_trials,
inputs=inputs,
outputs=outputs)
elif accelerator == 'openvino':
from bigdl.nano.deps.openvino.tf.model import KerasOpenVINOModel # type: ignore
if isinstance(model, KerasOpenVINOModel): # type: ignore
openvino_model = model
openvino_model = openvino_model.target_obj
else:
# For CPU: fp32 -> int8, for GPU: fp16 -> int8
_precision = 'fp16' if device != 'CPU' else 'fp32'
if device == 'VPUX':
# for fp16 on VPUX, must specify mean_value.
invalidInputError('mean_value' in kwargs,
"If you want to quantize with openvino on VPUX device, "
"you must specify mean_value for model optimizer "
"function. For more details about model optimizer, you "
"can see mo --help .")
openvino_model = KerasOpenVINOModel(model,
input_spec=input_spec,
precision=_precision,
thread_num=thread_num,
device=device,
config=openvino_config,
logging=logging,
**kwargs)
if metric:
if not isinstance(accuracy_criterion, dict):
accuracy_criterion = {'relative': 0.99, 'higher_is_better': True}
drop_type = 'relative' if 'relative' in accuracy_criterion else 'absolute'
higher_is_better = accuracy_criterion.get('higher_is_better', None)
maximal_drop = accuracy_criterion.get(drop_type, None)
else:
drop_type, higher_is_better, maximal_drop = None, None, None
result = openvino_model.pot(x=x, # type: ignore
y=y,
metric=metric,
higher_better=higher_is_better,
drop_type=drop_type,
maximal_drop=maximal_drop,
max_iter_num=max_trials,
sample_size=sample_size,
config=openvino_config,
thread_num=thread_num)
elif accelerator == 'onnxruntime':
# convert tensorflow model to onnx model
from bigdl.nano.deps.onnxruntime.tensorflow.tensorflow_onnxruntime_model \
import KerasONNXRuntimeModel
if isinstance(model, KerasONNXRuntimeModel): # type: ignore
onnx_model = model
else:
onnx_model = InferenceOptimizer.trace(model=model, accelerator='onnxruntime',
input_spec=input_spec, thread_num=thread_num)
onnx_model = onnx_model.target_obj
# quantize the ONNX model with INC; map method names to INC framework identifiers
method_map = {
'qlinear': 'onnxrt_qlinearops',
'integer': 'onnxrt_integerops',
None: 'onnxrt_qlinearops' # default
}
framework = method_map.get(method, None)
result = inc_quantzie(onnx_model, dataloader=(x, y),
metric=metric,
framework=framework,
thread_num=thread_num,
conf=conf,
approach=approach,
tuning_strategy=tuning_strategy,
accuracy_criterion=accuracy_criterion,
timeout=timeout,
max_trials=max_trials,
inputs=inputs,
outputs=outputs,
onnx_option='tensorflow',
onnxruntime_session_options=onnxruntime_session_options)
result._nesting_level = onnx_model._nesting_level
result._inputs_dtypes = onnx_model._inputs_dtypes
result._default_kwargs = onnx_model._default_kwargs
result._call_fn_args_backup = onnx_model._call_fn_args_backup
else:
invalidInputError(False, "Accelerator {} is invalid.".format(accelerator))
return patch_compiled_and_attrs(result, model)
@staticmethod
def save(model: Model, path):
"""
Save the model to local file.
:param model: Any model of keras.Model, including models accelerated by
InferenceOptimizer.trace/InferenceOptimizer.quantize.
:param path: Path to saved model. Path should be a directory.
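A minimal usage sketch (``acce_model`` is a model returned by trace/quantize and the
directory name is illustrative)::

    InferenceOptimizer.save(acce_model, "./optimized_model")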
"""
import yaml
path = Path(path)
path.mkdir(parents=True, exist_ok=True)
if hasattr(model, '_save'):
model._save(path)
else:
# typically for keras Model
meta_path = Path(path) / "nano_model_meta.yml"
with open(meta_path, 'w+') as f:
metadata = {
'ModelType': 'KerasModel',
'checkpoint': 'saved_weight.ckpt'
}
yaml.safe_dump(metadata, f)
checkpoint_path = path / metadata['checkpoint']
model.save_weights(checkpoint_path)
@staticmethod
def load(path, model: Model, device=None):
"""
Load a model from local.
:param path: Path to model to be loaded. Path should be a directory.
:param model: The original FP32 model, required for loading an optimized tensorflow model.
:param device: A string represents the device of the inference. Default to None.
Only valid for openvino model, otherwise will be ignored.
:return: Model with different acceleration(None/OpenVINO/ONNX Runtime) or
precision(FP32/FP16/BF16/INT8).
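A minimal usage sketch (``model`` is the original FP32 keras.Model and the directory
name is illustrative)::

    loaded_model = InferenceOptimizer.load("./optimized_model", model=model)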
"""
import yaml
path = Path(path)
invalidInputError(path.exists(), "{} doesn't exist.".format(path))
meta_path = path / "nano_model_meta.yml"
invalidInputError(meta_path.exists(),
"File {} is required to load model.".format(str(meta_path)))
with open(meta_path, 'r') as f:
metadata = yaml.safe_load(f)
model_type = metadata.get('ModelType', None)
if model_type == 'KerasOpenVINOModel':
result = load_openvino_model(path, framework='tensorflow', device=device)
return patch_attrs(result, model)
if model_type == 'KerasONNXRuntimeModel':
result = load_onnxruntime_model(path, framework='tensorflow')
return patch_attrs(result, model)
if model_type == 'KerasQuantizedModel':
result = load_inc_model(path, model, framework='tensorflow')
return patch_attrs(result, model)
if model_type == 'BF16Model':
result = load_bf16_model(path)
return patch_attrs(result, model)
if isinstance(model, Model):
# typically for keras Model
model = copy.deepcopy(model)
checkpoint_path = metadata.get('checkpoint', None)
if checkpoint_path:
checkpoint_path = path / metadata['checkpoint']
model.load_weights(checkpoint_path)
return model
else:
invalidInputError(False, "Key 'checkpoint' must be specified.")
else:
invalidInputError(False,
"ModelType {} or argument 'model={}' is not acceptable for tensorflow"
" loading.".format(model_type, type(model)))
def _accuracy_calculate_helper(model, metric, data):
'''
A quick helper to calculate accuracy
'''
for data_input, target in data:
metric.update_state(y_true=target, y_pred=model(data_input))
return metric.result().numpy()