Source code for bigdl.nano.pytorch.trainer.Trainer

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from logging import warning
from pathlib import Path
from typing import Any, List, Optional, Union
import pytorch_lightning as pl
import torch
from torch import nn
from torch.nn.modules.loss import _Loss
from torch.utils.data import DataLoader
from torchmetrics.metric import Metric
from torch.optim.lr_scheduler import _LRScheduler
import yaml
from bigdl.nano.pytorch import InferenceOptimizer
from bigdl.nano.pytorch.utils import TORCH_VERSION_LESS_1_10, TORCH_VERSION_LESS_1_11
from bigdl.nano.pytorch.utils import ChannelsLastCallback
from bigdl.nano.pytorch.algorithms import SelectiveBackprop
from bigdl.nano.pytorch.lightning import LightningModule
from bigdl.nano.pytorch.plugins.ddp_spawn import DDPSpawnPlugin
from bigdl.nano.pytorch.plugins.ddp_subprocess import DDPSubprocessPlugin
from bigdl.nano.deps.automl.hpo_api import create_hpo_searcher, check_hpo_status
from bigdl.nano.deps.ray.ray_api import distributed_ray
from bigdl.nano.utils.log4Error import invalidInputError
from bigdl.nano.deps.openvino.openvino_api import load_openvino_model
from bigdl.nano.deps.ipex.ipex_api import load_ipexjit_model
from bigdl.nano.deps.onnxruntime.onnxruntime_api import load_onnxruntime_model
from bigdl.nano.deps.neural_compressor.inc_api import load_inc_model
from bigdl.nano.common import check_avx512
from bigdl.nano.utils import deprecated

distributed_backends = ["spawn", "ray", "subprocess"]


class Trainer(pl.Trainer):
    """
    Trainer for BigDL-Nano pytorch.

    This Trainer extends PyTorch Lightning Trainer by adding
    various options to accelerate pytorch training.
    """

    def __init__(self, num_processes: int = 1,
                 use_ipex: bool = False,
                 distributed_backend="subprocess",
                 cpu_for_each_process: Optional[List[List[int]]] = None,
                 use_hpo=False,
                 channels_last: bool = False,
                 auto_lr: Union[int, bool] = True,
                 precision: Union[str, int] = 32,
                 *args: Any, **kwargs: Any) -> None:
        """
        A pytorch lightning trainer that uses bigdl-nano optimization.

        :param num_processes: number of processes in distributed training. default: 1.
        :param use_ipex: whether we use ipex as accelerator for trainer. default: False.
        :param cpu_for_each_process: A list of length `num_processes`, each containing a list of
            indices of cpus each process will be using. default: None, and the cpu will be
            automatically and evenly distributed among processes.
        :param precision: Double precision (64), full precision (32), half precision (16)
            or bfloat16 precision (bf16), defaults to 32.
            Enable ipex bfloat16 weight prepack when `use_ipex=True` and `precision='bf16'`.
        """
        # Check keyword arguments
        if "accelerator" in kwargs:
            warning(f"""Accelerator will be specified by bigdl-nano,
                    accelerator entered {kwargs['accelerator']} will be ignored.""")
            kwargs.pop('accelerator')
        if "plugins" in kwargs:
            warning(f"""Plugins will be specified by bigdl-nano,
                    plugins entered {kwargs['plugins']} will be ignored.""")
            kwargs.pop('plugins')
        if cpu_for_each_process is not None:
            if len(cpu_for_each_process) != num_processes:
                invalidInputError(False,
                                  f"The length of `cpu_for_each_process` ("
                                  f"{len(cpu_for_each_process)}) is not equal to the number of"
                                  f" processes {num_processes}.")

        accelerator = None

        if "algorithms" in kwargs:
            kwargs = self._add_algorithms(kwargs)

        if channels_last:
            callbacks = kwargs.get("callbacks")
            if callbacks:
                callbacks.append(ChannelsLastCallback())
            else:
                kwargs["callbacks"] = [ChannelsLastCallback()]

        self.use_ipex = use_ipex
        dtype = None
        if self.use_ipex and precision == 'bf16':
            # Enable ipex bfloat16 weight prepack and disable pytorch-lightning native AMP
            dtype = torch.bfloat16
            precision = 32

        # Confirm if cpu supports avx512
        if self.use_ipex and not check_avx512():
            if TORCH_VERSION_LESS_1_11:
                warning("Enabling ipex<=1.11 on a cpu instruction set"
                        " without avx512 will crash. "
                        "Falling back to regular pytorch.")
                self.use_ipex = False
            elif dtype == torch.bfloat16:
                warning("Enabling IPEX bfloat16 on a cpu instruction set"
                        " without avx512 will crash. "
                        "Using 32-bit precision.")
                dtype = None

        kwargs['precision'] = precision

        if num_processes == 1:
            from bigdl.nano.pytorch.strategies import create_IPEXStrategy
            strategy = create_IPEXStrategy(dtype=dtype) if self.use_ipex else None
            kwargs["strategy"] = strategy
            super().__init__(*args, **kwargs)
        else:
            plugin = None
            invalidInputError(distributed_backend in distributed_backends,
                              f"Distributed backends supported now are {distributed_backends},"
                              f" but got {distributed_backend}.")
            if "checkpoint_callback" in kwargs:
                if not kwargs["checkpoint_callback"]:
                    invalidInputError(False,
                                      "`checkpoint_callback` set to False. "
                                      "Currently, disabling the checkpoint callback makes "
                                      "the distributed training backend work incorrectly.")
            if distributed_backend == "spawn":
                from bigdl.nano.pytorch.strategies import DDPSpawnStrategy
                strategy = DDPSpawnStrategy(num_processes=num_processes,
                                            cpu_for_each_process=cpu_for_each_process,
                                            use_ipex=self.use_ipex,
                                            dtype=dtype,
                                            auto_lr=auto_lr)
            elif distributed_backend == "subprocess":
                from bigdl.nano.pytorch.strategies import DDPSubprocessStrategy
                strategy = DDPSubprocessStrategy(num_processes=num_processes,
                                                 cpu_for_each_process=cpu_for_each_process,
                                                 use_ipex=self.use_ipex,
                                                 dtype=dtype,
                                                 auto_lr=auto_lr)
            elif distributed_backend == "ray":
                from bigdl.nano.pytorch.strategies import create_RayStrategy
                strategy = create_RayStrategy(num_workers=num_processes,
                                              use_ipex=self.use_ipex,
                                              dtype=dtype,
                                              auto_lr=auto_lr)
            kwargs["strategy"] = strategy
            super().__init__(*args, **kwargs)

        if use_hpo:
            self.hposearcher = create_hpo_searcher(trainer=self, num_processes=num_processes)
        else:
            self.hposearcher = None

    def _add_algorithms(self, kwargs):
        callbacks = kwargs.get("callbacks")
        for algorithm in kwargs['algorithms']:
            if isinstance(algorithm, SelectiveBackprop):
                if callbacks:
                    callbacks.append(algorithm)
                else:
                    kwargs["callbacks"] = [algorithm]
        del kwargs['algorithms']
        return kwargs

    @staticmethod
    def compile(model: nn.Module,
                loss: _Loss = None,
                optimizer: torch.optim.Optimizer = None,
                scheduler: _LRScheduler = None,
                metrics: List[Metric] = None):
        """
        Construct a pytorch-lightning model.

        If model is already a pytorch-lightning model, return model.
        If model is a pytorch model, construct a new pytorch-lightning module
        with model, loss and optimizer.

        :param model: A model instance.
        :param loss: Loss to construct pytorch-lightning model.
            Should be None if model is an instance of pl.LightningModule.
        :param optimizer: Optimizer to construct pytorch-lightning model.
            Should be None if model is an instance of pl.LightningModule.
        :param metrics: A list of torchmetrics to validate/test performance.
        :return: A LightningModule object.
        """
        invalidInputError(isinstance(model, nn.Module),
                          "Model must be instance of nn.Module but got {}".format(model.__class__))

        pl_model = None
        if isinstance(model, pl.LightningModule):
            invalidInputError(not (loss or optimizer),
                              "Loss and optimizer should be None if model"
                              " is a pytorch-lightning model.")
            pl_model = model
        else:
            pl_model = LightningModule(model, loss, optimizer, scheduler, metrics)

        return pl_model

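    # Usage sketch (illustrative, not part of the original source): wrapping a plain
    # torch model into a LightningModule via Trainer.compile. `net` is a hypothetical
    # nn.Module; loss and optimizer must be omitted if `net` is already a LightningModule.
    #
    #     lit_model = Trainer.compile(net,
    #                                 loss=nn.CrossEntropyLoss(),
    #                                 optimizer=torch.optim.Adam(net.parameters(), lr=1e-3))
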
    def search(self,
               model,
               resume: bool = False,
               target_metric=None,
               n_parallels=1,
               acceleration=False,
               input_sample=None,
               **kwargs):
        """
        Run HPO search. It will be called in Trainer.search().

        :param model: The model to be searched. It should be an auto model.
        :param resume: whether to resume the previous search or start a new one,
            defaults to False.
        :param target_metric: the objective metric to optimize, defaults to None.
        :param n_parallels: the number of parallel processes for running trials.
        :param acceleration: Whether to automatically consider the model after
            inference acceleration in the search process. It will only take
            effect if target_metric contains "latency". Default value is False.
        :param input_sample: A set of inputs for trace, defaults to None if you have
            traced before or the model is a LightningModule with a dataloader attached.
        :return: the model with study meta info attached.
        """
        if not check_hpo_status(self.hposearcher):
            return None
        Trainer._log_api_event("search")

        return self.hposearcher.search(model,
                                       resume=resume,
                                       target_metric=target_metric,
                                       n_parallels=n_parallels,
                                       acceleration=acceleration,
                                       input_sample=input_sample,
                                       **kwargs)

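    # Usage sketch (illustrative, not part of the original source): running HPO search.
    # `auto_model` is a hypothetical model built with bigdl-nano's HPO tooling and
    # 'val_loss' is an assumed metric name; the trainer must be created with
    # `use_hpo=True` so that `hposearcher` is available.
    #
    #     trainer = Trainer(use_hpo=True, max_epochs=2)
    #     searched_model = trainer.search(auto_model, target_metric='val_loss', n_parallels=1)
    #     summary = trainer.search_summary()
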
    def search_summary(self):
        """
        Retrieve a summary of trials.

        :return: A summary of all the trials. Currently the entire study is returned
            to allow more flexibility for further analysis and visualization.
        """
        if not check_hpo_status(self.hposearcher):
            return None
        return self.hposearcher.search_summary()

    @staticmethod
    @deprecated(func_name="bigdl.nano.pytorch.Trainer.trace",
                message="Please use `bigdl.nano.pytorch.InferenceOptimizer.trace` instead.")
    def trace(model: nn.Module,
              input_sample=None,
              accelerator: str = None,
              use_ipex: bool = False,
              thread_num: int = None,
              onnxruntime_session_options=None,
              logging: bool = True,
              **export_kwargs):
        """
        Trace a pytorch model and convert it into an accelerated module for inference.

        For example, this function returns a PytorchOpenVINOModel when accelerator=='openvino'.

        :param model: A torch.nn.Module model, including pl.LightningModule.
        :param input_sample: A set of inputs for trace, defaults to None if you have traced
            before or the model is a LightningModule with a dataloader attached.
        :param accelerator: The accelerator to use, defaults to None meaning staying in
            Pytorch backend. 'openvino', 'onnxruntime' and 'jit' are supported for now.
        :param use_ipex: whether we use ipex as accelerator for inference. default: False.
        :param thread_num: (optional) an int representing how many threads (cores) are needed
            for inference, only valid for accelerator='onnxruntime' or accelerator='openvino'.
        :param onnxruntime_session_options: The session option for onnxruntime, only valid
            when accelerator='onnxruntime', otherwise will be ignored.
        :param logging: whether to log detailed information of model conversion, only valid
            when accelerator='openvino', otherwise will be ignored. default: True.
        :param **export_kwargs: other extra advanced settings include
            1. those to be passed to the torch.onnx.export function, only valid when
            accelerator='onnxruntime'/'openvino', otherwise will be ignored.
            2. if channels_last is set and use_ipex=True, we will transform the data to be
            channels last according to the setting. By default, channels_last will be set
            to True if use_ipex=True.
        :return: Model with different acceleration.

        .. warning::
            ``bigdl.nano.pytorch.Trainer.trace`` will be deprecated in future release.

            Please use ``bigdl.nano.pytorch.InferenceOptimizer.trace`` instead.
        """
        return InferenceOptimizer.trace(model=model,
                                        input_sample=input_sample,
                                        accelerator=accelerator,
                                        use_ipex=use_ipex,
                                        thread_num=thread_num,
                                        onnxruntime_session_options=onnxruntime_session_options,
                                        logging=logging,
                                        **export_kwargs)

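    # Usage sketch (illustrative, not part of the original source): tracing a model into an
    # OpenVINO-accelerated module; the deprecated Trainer.trace simply forwards to
    # InferenceOptimizer.trace. `lit_model` and the input shape are hypothetical placeholders.
    #
    #     ov_model = Trainer.trace(lit_model,
    #                              accelerator='openvino',
    #                              input_sample=torch.rand(1, 3, 224, 224))
    #     # preferred, non-deprecated form:
    #     # ov_model = InferenceOptimizer.trace(lit_model, accelerator='openvino',
    #     #                                     input_sample=torch.rand(1, 3, 224, 224))
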
    @staticmethod
    @deprecated(func_name="bigdl.nano.pytorch.Trainer.quantize",
                message="Please use `bigdl.nano.pytorch.InferenceOptimizer.quantize` instead.")
    def quantize(model: nn.Module,
                 precision: str = 'int8',
                 accelerator: str = None,
                 use_ipex: bool = False,
                 calib_dataloader: DataLoader = None,
                 metric: Metric = None,
                 accuracy_criterion: dict = None,
                 approach: str = 'static',
                 method: str = None,
                 conf: str = None,
                 tuning_strategy: str = None,
                 timeout: int = None,
                 max_trials: int = None,
                 input_sample=None,
                 thread_num: int = None,
                 onnxruntime_session_options=None,
                 logging: bool = True,
                 **export_kwargs):
        """
        Calibrate a Pytorch-Lightning model for post-training quantization.

        :param model: A model to be quantized. Model type should be an instance of nn.Module.
        :param precision: Global precision of quantized model,
            supported type: 'int8', 'bf16', 'fp16', defaults to 'int8'.
        :param accelerator: Use accelerator 'None', 'onnxruntime', 'openvino', defaults to None.
            None means staying in pytorch.
        :param calib_dataloader: A torch.utils.data.dataloader.DataLoader object for calibration.
            Required for static quantization. It is also used as the validation dataloader.
        :param metric: A torchmetrics.metric.Metric object for evaluation.
        :param accuracy_criterion: Tolerable accuracy drop, defaults to None meaning no
            accuracy control.
            accuracy_criterion = {'relative': 0.1, 'higher_is_better': True} allows a
            relative accuracy loss of 10%. accuracy_criterion = {'absolute': 0.99,
            'higher_is_better': False} means accuracy must be smaller than 0.99.
        :param approach: 'static' or 'dynamic'.
            'static': post_training_static_quant,
            'dynamic': post_training_dynamic_quant.
            Default: 'static'. OpenVINO supports static mode only.
        :param method: Method to do quantization. When accelerator=None, supported methods:
            'fx', 'eager', 'ipex', defaults to 'fx'. If you don't use ipex, suggest using
            'fx' which executes automatic optimizations like fusion. For more information,
            please refer to
            https://pytorch.org/docs/stable/quantization.html#eager-mode-quantization.
            When accelerator='onnxruntime', supported methods: 'qlinear', 'integer',
            defaults to 'qlinear'. Suggest 'qlinear' for lower accuracy drop if using
            static quantization.
            More details in https://onnxruntime.ai/docs/performance/quantization.html.
            This argument doesn't take effect for OpenVINO, don't change it for OpenVINO.
        :param conf: A path to conf yaml file for quantization. Default: None,
            using default config.
        :param tuning_strategy: 'bayesian', 'basic', 'mse', 'sigopt'. Default: 'bayesian'.
        :param timeout: Tuning timeout (seconds). Default: None, which means early stop.
            Combine with max_trials field to decide when to exit.
        :param max_trials: Max tune times. Default: None, which means no tuning.
            Combine with timeout field to decide when to exit.
            "timeout=0, max_trials=1" means it will try quantization only once and
            return the satisfying best model.
        :param input_sample: An input example to convert pytorch model into ONNX/OpenVINO.
        :param thread_num: (optional) an int representing how many threads (cores) are needed
            for inference, only valid for accelerator='onnxruntime' or accelerator='openvino'.
        :param onnxruntime_session_options: The session option for onnxruntime, only valid
            when accelerator='onnxruntime', otherwise will be ignored.
        :param logging: whether to log detailed information of model conversion, only valid
            when accelerator='openvino', otherwise will be ignored. default: True.
        :param **export_kwargs: will be passed to the torch.onnx.export function.
        :return: An accelerated Pytorch-Lightning Model if quantization is successful.

        .. warning::
            ``bigdl.nano.pytorch.Trainer.quantize`` will be deprecated in future release.

            Please use ``bigdl.nano.pytorch.InferenceOptimizer.quantize`` instead.
        """
        return InferenceOptimizer.quantize(model=model,
                                           precision=precision,
                                           accelerator=accelerator,
                                           use_ipex=use_ipex,
                                           calib_dataloader=calib_dataloader,
                                           metric=metric,
                                           accuracy_criterion=accuracy_criterion,
                                           approach=approach,
                                           method=method,
                                           conf=conf,
                                           tuning_strategy=tuning_strategy,
                                           timeout=timeout,
                                           max_trials=max_trials,
                                           input_sample=input_sample,
                                           thread_num=thread_num,
                                           onnxruntime_session_options=onnxruntime_session_options,
                                           logging=logging,
                                           **export_kwargs)

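    # Usage sketch (illustrative, not part of the original source): post-training static
    # int8 quantization with a calibration dataloader. `lit_model` and `calib_loader` are
    # hypothetical placeholders; InferenceOptimizer.quantize is the non-deprecated entry point.
    #
    #     q_model = Trainer.quantize(lit_model,
    #                                precision='int8',
    #                                approach='static',
    #                                calib_dataloader=calib_loader)
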
    @staticmethod
    def save(model: pl.LightningModule, path):
        """
        Save the model to a local file.

        :param model: Any model of torch.nn.Module, including all models accelerated by
            Trainer.trace/Trainer.quantize.
        :param path: Path to saved model. Path should be a directory.
        """
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        if hasattr(model, '_save'):
            model._save(path)
        else:
            # typically for models of nn.Module, pl.LightningModule type
            meta_path = Path(path) / "nano_model_meta.yml"
            with open(meta_path, 'w+') as f:
                metadata = {
                    'ModelType': 'PytorchModel',
                    'checkpoint': 'saved_weight.pt'
                }
                yaml.safe_dump(metadata, f)
            checkpoint_path = path / metadata['checkpoint']
            torch.save(model.state_dict(), checkpoint_path)

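    # Usage sketch (illustrative, not part of the original source): saving a model to a
    # directory. Works for plain nn.Module/LightningModule objects as well as models returned
    # by Trainer.trace/Trainer.quantize. "./nano_saved_model" is a hypothetical path.
    #
    #     Trainer.save(lit_model, "./nano_saved_model")
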
    @staticmethod
    def load(path, model: pl.LightningModule = None):
        """
        Load a model from local storage.

        :param path: Path to the model to be loaded. Path should be a directory.
        :param model: Required FP32 model to load pytorch model. It is needed if you
            accelerated the model with accelerator=None by Trainer.trace/Trainer.quantize.
            model should be set to None if you choose accelerator="onnxruntime"/"openvino"/"jit".
        :return: Model with different acceleration (None/OpenVINO/ONNX Runtime/JIT) or
            precision (FP32/FP16/BF16/INT8).
        """
        path = Path(path)
        if not path.exists():
            invalidInputError(False, "{} doesn't exist.".format(path))
        meta_path = path / "nano_model_meta.yml"
        if not meta_path.exists():
            invalidInputError(False, "File {} is required to load model.".format(str(meta_path)))
        with open(meta_path, 'r') as f:
            metadata = yaml.safe_load(f)
        model_type = metadata.get('ModelType', None)
        if model_type == 'PytorchOpenVINOModel':
            invalidInputError(model is None,
                              "Argument 'model' must be None for OpenVINO loading.")
            return load_openvino_model(path)
        if model_type == 'PytorchONNXRuntimeModel':
            invalidInputError(model is None,
                              "Argument 'model' must be None for ONNX Runtime loading.")
            return load_onnxruntime_model(path)
        if model_type == 'PytorchQuantizedModel':
            return load_inc_model(path, model, 'pytorch')
        if model_type == 'PytorchIPEXJITModel':
            return load_ipexjit_model(path, model)
        if isinstance(model, nn.Module):
            # typically for models of nn.Module, pl.LightningModule type
            model = copy.deepcopy(model)
            checkpoint_path = metadata.get('checkpoint', None)
            if checkpoint_path:
                checkpoint_path = path / metadata['checkpoint']
                state_dict = torch.load(checkpoint_path, map_location='cpu')
                model.load_state_dict(state_dict)
                return model
            else:
                invalidInputError(False, "Key 'checkpoint' must be specified.")
        else:
            invalidInputError(False,
                              "ModelType {} or argument 'model={}' is not acceptable for pytorch"
                              " loading.".format(model_type, type(model)))

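    # Usage sketch (illustrative, not part of the original source): loading a saved model back.
    # For a plain pytorch/FP32 checkpoint (or a model accelerated with accelerator=None) the
    # original model object must be passed; for OpenVINO/ONNX Runtime/JIT artifacts `model`
    # stays None. The paths below are hypothetical.
    #
    #     loaded = Trainer.load("./nano_saved_model", model=lit_model)   # pytorch weights
    #     ov_loaded = Trainer.load("./ov_saved_model")                   # e.g. OpenVINO artifact
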
    def save_checkpoint(    # type: ignore[override]
        self, filepath, weights_only: bool = False, storage_options: Optional[Any] = None
    ) -> None:
        """Save checkpoint after one train epoch."""
        # When using ipex==1.9 and custom lr_schedulers for training, if `weights_only` is set
        # to False, the `save_checkpoint` method will report an error of 'Unsupport storage type'
        # because the model is in 'xpu', so we temporarily move it to 'cpu',
        # then move it back after `save_checkpoint`.
        if self.use_ipex and TORCH_VERSION_LESS_1_10 and not weights_only:
            self.model.to('cpu')

        super().save_checkpoint(filepath, weights_only, storage_options)    # type: ignore

        if self.use_ipex and TORCH_VERSION_LESS_1_10 and not weights_only:
            self.model.to(self.strategy.root_device)    # type: ignore
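
# Usage sketch (illustrative, not part of the original source): an end-to-end example of the
# nano Trainer constructor documented above, combining compile and fit. `net`, `train_loader`
# and the hyperparameter values are hypothetical placeholders.
#
#     lit_model = Trainer.compile(net,
#                                 loss=nn.CrossEntropyLoss(),
#                                 optimizer=torch.optim.SGD(net.parameters(), lr=0.01))
#     trainer = Trainer(num_processes=2,
#                       distributed_backend="subprocess",
#                       use_ipex=True,
#                       precision='bf16',
#                       max_epochs=1)
#     trainer.fit(lit_model, train_loader)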