Source code for bigdl.orca.learn.pytorch.estimator

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging

from typing import TYPE_CHECKING, Union, Optional, Callable, Dict, List
if TYPE_CHECKING:
    from torch.nn import Module
    from torch.optim import Optimizer
    from torch.nn.modules.loss import _Loss as Loss
    from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
    from bigdl.orca.learn.metrics import Metric
    from bigdl.orca.learn.pytorch.pytorch_ray_estimator import PyTorchRayEstimator
    from bigdl.orca.learn.pytorch.pytorch_pyspark_estimator import PyTorchPySparkEstimator
    from bigdl.orca.learn.pytorch.experimential.mmcv.mmcv_ray_estimator import MMCVRayEstimator


class Estimator(object):
    @staticmethod
    def from_torch(*,
                   model: Union['Module', Callable[[Dict], 'Module'], None]=None,
                   optimizer: Optional[Union['Optimizer',
                                             Callable[['Module', Dict], 'Optimizer'], None]]=None,
                   loss: Union['Loss', Callable[[Dict], 'Loss'], None]=None,
                   metrics: Union['Metric', List['Metric'], None]=None,
                   backend: str="spark",
                   config: Optional[Dict]=None,
                   workers_per_node: int=1,
                   scheduler_creator: Optional[Callable[[Dict], 'LRScheduler']]=None,
                   use_tqdm: bool=False,
                   model_dir: Optional[str]=None,
                   sync_stats: bool=False,
                   log_level: int=logging.INFO,
                   log_to_driver: bool=True,
                   ) -> Union['PyTorchRayEstimator', 'PyTorchPySparkEstimator', None]:
        """
        Create an Estimator for PyTorch.

        :param model: A model creator function that takes the parameter "config" and returns a
               PyTorch model.
        :param optimizer: An optimizer creator function that has two parameters "model" and
               "config" and returns a PyTorch optimizer.
               Default: None if training is not performed.
        :param loss: An instance of PyTorch loss.
               Default: None if loss computation is not needed.
        :param metrics: One or a list of Orca validation metrics. Function(s) that compute the
               metrics between the output and target tensors are also supported.
               Default: None if no validation is involved.
        :param backend: The distributed backend for the Estimator. One of "spark", "ray"
               or "horovod". Default: "spark".
        :param config: A parameter config dict, CfgNode or any class instance that plays the role
               of configuration to create model, loss, optimizer, scheduler and data.
               Default: None if no config is needed.
        :param workers_per_node: The number of PyTorch workers on each node. Default: 1.
        :param scheduler_creator: A scheduler creator function that has two parameters "optimizer"
               and "config" and returns a PyTorch learning rate scheduler wrapping the optimizer.
               By default the scheduler will take effect automatically every epoch.
               Default: None if no scheduler is needed.
        :param use_tqdm: Whether to use tqdm to monitor the training progress. Default: False.
        :param model_dir: The path to save the PyTorch model during the training if
               checkpoint_trigger is defined and triggered. Default: None.
        :param sync_stats: Whether to sync metrics across all distributed workers after each
               epoch. If set to False, only the metrics of the worker with rank 0 are printed.
               Default: False.
        :param log_level: The log_level of each distributed worker. Default: logging.INFO.
        :param log_to_driver: Whether to display executor logs on the driver in cluster mode for
               the spark backend. Default: True.

        :return: An Estimator object for PyTorch.
        """
        if backend in {"horovod", "ray"}:
            from bigdl.orca.learn.pytorch.pytorch_ray_estimator import PyTorchRayEstimator
            return PyTorchRayEstimator(model_creator=model,
                                       optimizer_creator=optimizer,  # type:ignore
                                       loss_creator=loss,
                                       metrics=metrics,
                                       scheduler_creator=scheduler_creator,
                                       config=config,
                                       use_tqdm=use_tqdm,
                                       workers_per_node=workers_per_node,
                                       backend=backend,
                                       sync_stats=sync_stats,
                                       log_level=log_level)
        elif backend == "spark":
            from bigdl.orca.learn.pytorch.pytorch_pyspark_estimator import PyTorchPySparkEstimator
            return PyTorchPySparkEstimator(model_creator=model,
                                           optimizer_creator=optimizer,  # type:ignore
                                           loss_creator=loss,
                                           metrics=metrics,
                                           scheduler_creator=scheduler_creator,
                                           config=config,
                                           use_tqdm=use_tqdm,
                                           workers_per_node=workers_per_node,
                                           sync_stats=sync_stats,
                                           log_level=log_level,
                                           model_dir=model_dir,
                                           log_to_driver=log_to_driver,
                                           )
        else:
            from bigdl.dllib.utils.log4Error import invalidInputError
            invalidInputError(False,
                              "Only horovod, ray and spark backends are "
                              f"supported for now, got backend: {backend}")
            return None
    @staticmethod
    def from_mmcv(*,
                  mmcv_runner_creator: Callable,
                  backend: str = "ray",
                  workers_per_node: int = 1,
                  config: Optional[Dict] = None) -> Union['MMCVRayEstimator', None]:
        """
        Create an Estimator for MMCV.

        :param mmcv_runner_creator: A runner creator function that takes the parameter "config"
               and returns an MMCV runner.
        :param backend: The distributed backend for the Estimator. Default: "ray".
        :param config: A parameter config dict, CfgNode or any class instance that plays the role
               of configuration to create the runner and data.
               Default: None if no config is needed.
        :param workers_per_node: The number of MMCV workers on each node. Default: 1.

        :return: An Estimator object for MMCV.
        """
        if backend == "ray":
            from bigdl.orca.learn.pytorch.experimential.mmcv.mmcv_ray_estimator \
                import MMCVRayEstimator
            return MMCVRayEstimator(mmcv_runner_creator=mmcv_runner_creator,
                                    backend=backend,
                                    workers_per_node=workers_per_node,
                                    config=config)
        else:
            from bigdl.dllib.utils.log4Error import invalidInputError
            invalidInputError(False,
                              "Only the ray backend is supported for now, "
                              f"got backend: {backend}")
            return None
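
A similar sketch for from_mmcv on the "ray" backend follows. The runner creator body is a placeholder, since constructing an MMCV runner (model, optimizer, work_dir, logger, ...) is application-specific, and the config values are illustrative.

# Illustrative sketch: build a from_mmcv Estimator on the "ray" backend.
from bigdl.orca import init_orca_context
from bigdl.orca.learn.pytorch.estimator import Estimator


def runner_creator(config):
    # Placeholder: should build and return an MMCV runner from the "config" dict.
    raise NotImplementedError("construct your MMCV runner here")


init_orca_context(cluster_mode="local", cores=2)
mmcv_est = Estimator.from_mmcv(mmcv_runner_creator=runner_creator,
                               backend="ray",
                               workers_per_node=2,
                               config={"lr": 0.01})
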
    @staticmethod
    def latest_checkpoint(checkpoint_dir: str) -> str:
        """
        Return the path of the latest checkpoint saved under checkpoint_dir.
        """
        from .callbacks.model_checkpoint import ModelCheckpoint
        checkpoint_path = ModelCheckpoint.get_latest_checkpoint(checkpoint_dir)
        return checkpoint_path
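
Finally, a small sketch of latest_checkpoint; the directory name is hypothetical and is expected to contain checkpoints written by the ModelCheckpoint callback during a previous fit run.

# Resolve the most recent checkpoint under a (hypothetical) checkpoint directory.
from bigdl.orca.learn.pytorch.estimator import Estimator

ckpt_path = Estimator.latest_checkpoint("/tmp/orca_ckpts")
print(ckpt_path)
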