Source code for bigdl.orca.data.shard

#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from py4j.protocol import Py4JError

from bigdl.orca.data.utils import *
from bigdl.orca import OrcaContext
from bigdl.dllib.utils.common import (get_node_and_core_number,
                                      get_spark_sql_context,
                                      get_spark_context)
from bigdl.dllib.utils import nest
from bigdl.dllib.utils.log4Error import invalidInputError

import numpy as np

from typing import TYPE_CHECKING, Any
from typing import (
    Callable,
    Dict,
    List,
    Optional,
    Tuple,
    Union
)

if TYPE_CHECKING:
    from numpy import ndarray
    from pandas.core.frame import DataFrame as PandasDataFrame
    from pyspark.rdd import PipelinedRDD, RDD
    from pyspark.sql.dataframe import DataFrame as SparkDataFrame
    from pyspark.sql.column import Column
    from ray.data.dataset import Dataset


class XShards(object):
    """
    A collection of data which can be pre-processed in parallel.
    """

    def transform_shard(self, func: Callable, *args):
        """
        Transform each shard in the XShards using the specified function.

        :param func: pre-processing function
        :param args: arguments for the pre-processing function
        :return: DataShard
        """
        pass

    def collect(self):
        """
        Returns a list that contains all of the elements in this XShards.

        :return: list of elements
        """
        pass

    def num_partitions(self):
        """
        Return the number of partitions in this XShards.

        :return: an int
        """
        pass

    @classmethod
    def load_pickle(cls, path: str, minPartitions: Optional[int] = None) -> "SparkXShards":
        """
        Load XShards from pickle files.

        :param path: The pickle file path/directory
        :param minPartitions: The minimum partitions for the XShards
        :return: SparkXShards object
        """
        sc = OrcaContext.get_spark_context()
        return SparkXShards(sc.pickleFile(path, minPartitions))

    @staticmethod
    def partition(data: Union["ndarray",
                              List["ndarray"],
                              Tuple["ndarray", "ndarray"],
                              Dict[str, Union["ndarray", Tuple["ndarray"], List["ndarray"]]]],
                  num_shards: Optional[int] = None) -> "SparkXShards":
        """
        Partition local in memory data and form a SparkXShards.

        :param data: np.ndarray, a tuple, list, dict of np.ndarray, or a nested structure
               made of tuple, list, dict with ndarray as the leaf value
        :param num_shards: the number of shards that the data will be partitioned into
        :return: a SparkXShards
        """
        sc = OrcaContext.get_spark_context()
        node_num, core_num = get_node_and_core_number()
        shard_num = node_num * core_num if num_shards is None else num_shards
        import numpy as np
        type_err_msg = """
The types supported in bigdl.orca.data.XShards.partition are
1. np.ndarray
2. a tuple, list, dict of np.ndarray
3. nested structure made of tuple, list, dict with ndarray as the leaf value

But got data of type {}
        """.format(type(data))
        supported_types = {list, tuple, dict}
        if isinstance(data, np.ndarray):
            if data.shape[0] < shard_num:
                invalidInputError(False,
                                  "The length of data {} is smaller than the total number "
                                  "of shards {}. Please adjust the num_shards option to be "
                                  "at most {}.".format(data.shape[0], shard_num, data.shape[0]))
            arrays = np.array_split(data, shard_num)
            rdd = sc.parallelize(arrays)
        else:
            invalidInputError(type(data) in supported_types, type_err_msg)
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_shard = []  # type:ignore
            if data_length < shard_num:
                invalidInputError(False,
                                  "The length of data {} is smaller than the total number "
                                  "of shards {}. Please adjust the num_shards option to be "
                                  "at most {}.".format(data_length, shard_num, data_length))
            for i in range(shard_num):
                data_to_be_shard.append([])
            for x in flattened:
                invalidInputError(len(x) == data_length,
                                  "the ndarrays in data must all have the same size in first"
                                  " dimension, got first ndarray of size {} and"
                                  " another {}".format(data_length, len(x)))
                x_parts = np.array_split(x, shard_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_shard[idx].append(x_part)
            data_to_be_shard = [nest.pack_sequence_as(data, shard) for shard in data_to_be_shard]
            rdd = sc.parallelize(data_to_be_shard)
        data_shards = SparkXShards(rdd)
        return data_shards
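
# Illustrative usage sketch (not part of the original module), assuming an Orca context
# has already been created via bigdl.orca.init_orca_context() and that XShards is exposed
# from bigdl.orca.data as in the public Orca API:
#
#     import numpy as np
#     from bigdl.orca.data import XShards
#
#     data = {"x": np.random.rand(100, 8), "y": np.random.randint(0, 2, size=(100, 1))}
#     shards = XShards.partition(data, num_shards=4)   # SparkXShards with 4 partitions
#     print(shards.num_partitions())                   # -> 4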


class SparkXShards(XShards):
    """
    A collection of data which can be pre-processed in parallel on Spark.
    """
    def __init__(self,
                 rdd: Union["PipelinedRDD", "RDD"],
                 transient: bool = False,
                 class_name: Optional[str] = None) -> None:
        self.rdd = rdd
        self.user_caching = self.rdd.is_cached
        if transient:
            self.eager = False
        else:
            self.eager = OrcaContext._eager_mode
            self.rdd.cache()
        if self.eager:
            self.compute()
            self.is_lazy = False
        else:
            self.is_lazy = True
        self.type = {}
        if class_name:
            self.type['class_name'] = class_name

    @classmethod
    def lazy(cls,
             rdd: Union["PipelinedRDD", "RDD"],
             class_name: str = None) -> "SparkXShards":
        """
        Create an instance of SparkXShards that computes lazily.
        """
        return SparkXShards(rdd, transient=True, class_name=class_name).to_lazy()

    def _create(self,
                rdd: Union["PipelinedRDD", "RDD"],
                class_name: str = None) -> "SparkXShards":
        """
        Create an instance of SparkXShards after transformation.
        """
        if self.is_lazy:
            return SparkXShards(rdd, transient=True, class_name=class_name).to_lazy()
        else:
            return SparkXShards(rdd, class_name=class_name)

    def to_lazy(self) -> "SparkXShards":
        """
        Mark the current SparkXShards as lazy. This won't change the behavior of the
        current SparkXShards: cached data won't be uncached unless uncache is explicitly
        invoked. After being marked as lazy, future operations would be performed lazily.

        :return: An instance of SparkXShards that computes lazily.
        """
        self.eager = False
        self.is_lazy = True
        self.user_caching = True
        return self

    def transform_shard(self, func: Callable, *args) -> "SparkXShards":
        """
        Return a new SparkXShards by applying a function to each shard of this SparkXShards.

        :param func: python function to process data. The first argument is the data shard.
        :param args: other arguments in this function.
        :return: a new SparkXShards.
        """
        def transform(iter, func, *args):
            for x in iter:
                yield func(x, *args)

        transformed_shard = self._create(self.rdd.mapPartitions(
            lambda iter: transform(iter, func, *args)))
        self._uncache()
        return transformed_shard

    def collect(self) -> List[Any]:
        """
        Returns a list that contains all of the elements in this SparkXShards.

        :return: a list of data elements.
        """
        return self.rdd.collect()

    def first(self):
        """
        Returns the first element in the rdd of this SparkXShards.

        :return: a record of data.
        """
        return self.rdd.first()

    def take(self, n) -> List[Any]:
        """
        Returns the first n elements in the rdd of this SparkXShards.

        :return: n records of data.
        """
        return self.rdd.take(n)

    def cache(self) -> "SparkXShards":
        """
        Persist this SparkXShards in memory.

        :return:
        """
        self.user_caching = True
        self.rdd.cache()
        return self

    def uncache(self) -> "SparkXShards":
        """
        Mark this SparkXShards as non-persistent, and remove all blocks for it from memory.

        :return:
        """
        self.user_caching = False
        try:
            if self.is_cached():
                self.rdd.unpersist()
        except (Py4JError, TypeError):
            pass
        return self

    def _uncache(self) -> None:
        if not self.user_caching:
            self.uncache()

    def is_cached(self) -> bool:
        return self.rdd.is_cached

    def compute(self) -> "SparkXShards":
        self.rdd.count()
        return self

    def num_partitions(self) -> int:
        """
        Get the number of partitions of this SparkXShards.

        :return: number of partitions.
        """
        return self.rdd.getNumPartitions()

    def repartition(self, num_partitions: int) -> "SparkXShards":
        """
        Return a new SparkXShards that has exactly num_partitions partitions.

        :param num_partitions: target number of partitions
        :return: a new SparkXShards object.
""" class_name = self._get_class_name() if class_name == 'pandas.core.frame.DataFrame': import pandas as pd rdd = self.rdd \ .flatMap(lambda df: df.apply(lambda row: (row[0], row.values.tolist()), axis=1) .values.tolist()) \ .partitionBy(num_partitions) schema = self.get_schema() def merge_rows(iter): data = [value[1] for value in list(iter)] if data: df = pd.DataFrame(data=data, columns=schema['columns']) \ .astype(schema['dtypes']) return [df] else: # no data in this partition return iter repartitioned_shard = self._create(rdd.mapPartitions(merge_rows), class_name=class_name) elif class_name == 'builtins.list': if num_partitions > self.rdd.getNumPartitions(): rdd = self.rdd \ .flatMap(lambda data: data) \ .repartition(num_partitions) repartitioned_shard = self._create(rdd.mapPartitions( lambda iter: [list(iter)]), class_name=class_name) else: rdd = self.rdd.coalesce(num_partitions) from functools import reduce repartitioned_shard = self._create(rdd.mapPartitions( lambda iter: [reduce(lambda l1, l2: l1 + l2, iter)]), # type:ignore class_name=class_name) # type:ignore elif class_name == 'numpy.ndarray': elem = self.rdd.first() shape = elem.shape dtype = elem.dtype if len(shape) > 0: if num_partitions > self.rdd.getNumPartitions(): rdd = self.rdd \ .flatMap(lambda data: list(data)) \ .repartition(num_partitions) repartitioned_shard = self._create(rdd.mapPartitions( lambda iter: np.stack([list(iter)], axis=0).astype(dtype)), class_name=class_name) else: rdd = self.rdd.coalesce(num_partitions) from functools import reduce repartitioned_shard = self._create(rdd.mapPartitions( lambda iter: [np.concatenate(list(iter), axis=0)]), class_name=class_name) else: repartitioned_shard = self._create(self.rdd.repartition(num_partitions), class_name=class_name) elif class_name == "builtins.dict": elem = self.rdd.first() keys = list(elem.keys()) dtypes = [] dict_of_batched_ndarray = True # Check if all values are ndarray and shape > 1 for v in elem.values(): if v.__class__.__name__ != "ndarray" or len(v.shape) == 0: dict_of_batched_ndarray = False break else: dtypes.append(v.dtype) if dict_of_batched_ndarray: def dict_to_unbatched_list(d): values = [list(d[k]) for k in keys] return list(zip(*values)) def to_batched_dict(iter): batch_values = list(zip(*iter)) if not batch_values: return [] batch_ndarrays = [np.stack(v, axis=0).astype(dtype) for v, dtype in zip(batch_values, dtypes)] return [dict(zip(keys, batch_ndarrays))] # If number of records in a partition <= 10, may produce empty partition rdd = self.rdd.flatMap(lambda data: dict_to_unbatched_list(data)) \ .repartition(num_partitions) repartitioned_shard = self._create(rdd.mapPartitions( lambda iter: to_batched_dict(iter)), class_name=class_name) else: repartitioned_shard = self._create(self.rdd.repartition(num_partitions), class_name=class_name) else: repartitioned_shard = self._create(self.rdd.repartition(num_partitions), class_name=class_name) self._uncache() return repartitioned_shard def partition_by(self, cols: str, num_partitions: Optional[int] = None) -> Optional["SparkXShards"]: """ Return a new SparkXShards partitioned using the specified columns. This is only applicable for SparkXShards of Pandas DataFrame. :param cols: specified columns to partition by. :param num_partitions: target number of partitions. If not specified, the new SparkXShards would keep the current partition number. :return: a new SparkXShards. 
""" if self._get_class_name() == 'pandas.core.frame.DataFrame': import pandas as pd schema = self.get_schema() # if partition by a column if isinstance(cols, str): if not isinstance(schema, Dict) or cols not in schema['columns']: invalidInputError(False, "The partition column is not in the DataFrame") # change data to key value pairs rdd = self.rdd.flatMap( lambda df: df.apply( lambda row: (row[cols], row.values.tolist()), axis=1).values.tolist()) partition_num = self.rdd.getNumPartitions() if not num_partitions \ else num_partitions # partition with key partitioned_rdd = rdd.partitionBy(partition_num) else: invalidInputError(False, "Only support partition by a column name") def merge(iterator): data = [value[1] for value in list(iterator)] if data: df = pd.DataFrame(data=data, columns=schema['columns']).astype(schema['dtypes']) return [df] else: # no data in this partition return [] # merge records to df in each partition partitioned_shard = SparkXShards(partitioned_rdd.mapPartitions(merge)) self._uncache() return partitioned_shard else: invalidInputError(False, "Currently only support partition by for XShards" " of Pandas DataFrame") return None def unique(self) -> Optional["ndarray"]: """ Return a unique list of elements of this SparkXShards. This is only applicable for SparkXShards of Pandas Series. :return: a unique list of elements of this SparkXShards. """ if self._get_class_name() == 'pandas.core.series.Series': import pandas as pd rdd = self.rdd.map(lambda s: s.unique()) import numpy as np result = rdd.reduce(lambda list1, list2: pd.unique(np.concatenate((list1, list2), axis=0))) return result else: invalidInputError(False, "Currently only support unique() on XShards of Pandas Series") return None def deduplicates(self) -> Optional["SparkXShards"]: if self._get_class_name() == 'pandas.core.frame.DataFrame': import pandas as pd df = self.to_spark_df() distinctDF = df.distinct() data_shards = spark_df_to_pd_sparkxshards(distinctDF) return data_shards else: invalidInputError(False, "Currently only support dedup() on XShards of Pandas DataFrame") return None def sort_values(self, col_names: Optional[Union[str, List[str]]]=None, ascending: bool = True) -> Optional["SparkXShards"]: """ Sort the value of shards. This is only applicable for SparkXShards of Pandas Series. :param col_names list of column or column names to sort by :param ascending bool, default True. Specify sort orders :return: a new SparkXShards sorted by the specified columns. """ if self._get_class_name() == 'pandas.core.frame.DataFrame': import pandas as pd df = self.to_spark_df() sqlContext = get_spark_sql_context(get_spark_context()) defaultPartitionNum = sqlContext.getConf("spark.sql.shuffle.partitions") partitionNum = df.rdd.getNumPartitions() sqlContext.setConf("spark.sql.shuffle.partitions", str(partitionNum)) sort_df = df.sort(col_names, ascending=ascending) # type:ignore data_shards = spark_df_to_pd_sparkxshards(sort_df) sqlContext.setConf("spark.sql.shuffle.partitions", defaultPartitionNum) return data_shards else: invalidInputError(False, "Currently only support sort() on XShards of Pandas DataFrame") return None def max_values(self, col_name: str) -> Union[int, float, None]: """ Get the max values of the column name. This is only applicable for SparkXShards of Pandas Series. :param col_name column name that need return the max value :return: maximum value for the specified columns. 
""" if self._get_class_name() == 'pandas.core.frame.DataFrame': import pandas as pd rdd = self.rdd.map(lambda s: s[col_name].max()) max_value = rdd.reduce(lambda value1, value2: max(value1, value2)) return max_value else: invalidInputError(False, "Currently only support max() on XShards of Pandas DataFrame") return None def get_null_sum(self) -> "PandasDataFrame": """ With SparkXShards of pandas data frame, the api will get null numbers for each column. For other type of SparkXShards, it will throw exception :return: pandas data frame with 2 columns, `col` represents column name, `total` represents null numbers """ if self._get_class_name() != 'pandas.core.frame.DataFrame': invalidInputError(False, "Currently only support get_null_sum() on" " XShards of Pandas DataFrame") def get_na_sum(iter): for df in iter: import pandas as pd series = df.isnull().sum() df = pd.DataFrame({'col': series.index, 'total': series.values}) return [df] na_cnt_rdd = self.rdd.mapPartitions(get_na_sum) na_cnt = na_cnt_rdd.reduce(lambda l1, l2: l1.add(l2)) return na_cnt def drop_missing_value(self) -> "SparkXShards": """ With SparkXShards of pandas data frame, the api will drop null values in shards. For other type of SparkXShards, it will throw exception :return: a new SparkXShards without null values """ if self._get_class_name() != 'pandas.core.frame.DataFrame': invalidInputError(False, "Currently only support assembleFeatureLabelCols() on" " XShards of Pandas DataFrame") null_cnt = self.get_null_sum() def drop_missing_data(pdf): df2 = pdf.drop((null_cnt[null_cnt['total'] > 0]['col']), 1) return df2 # dealing with missing data return self.transform_shard(drop_missing_data) def assembleFeatureLabelCols(self, featureCols: List[Union[str, "Column"]], labelCols: List[Union[str, "Column"]]) -> "SparkXShards": """ The api is used to merge/convert one or multiple feature columns into a numpy array, merge/convert one or multiple label columns into a numpy array. :param featureCols: a list of feature columns. :param labelCols: a list of label columns. :return: SparkXShards of dictionary, key is assembled feature numpy array, value is assembled label numpy array eg: shards: SparkXShards of pandas data frame with 9 cols ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'lable'] f1 f2 f3 f4 f5 f6 f7 f8 label 6 148 72 35 0 33.6 0.627 50 1 1 85 66 29 0 26.6 0.351 31 0 8 183 64 0 0 23.3 0.672 32 1 1 89 66 23 94 28.1 0.167 21 0 0 137 40 35 168 43.1 2.288 33 1 transform_shards = shards.assembleFeatureLabelCols(featureCols=['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8'], labelCols=['label']) transform_shards will be SparkXShards of dictionary. key will be a stacked numpy array (stack feature columns), value will be a numpy array {'x': array([[ 6. , 148. , 72. , ..., 33.6 , 0.627, 50. ], [ 1. , 85. , 66. , ..., 26.6 , 0.351, 31. ], [ 8. , 183. , 64. , ..., 23.3 , 0.672, 32. ], [ 1. , 89. , 66. , ..., 28.1 , 0.167, 21. ], [ 0. , 137. , 40. , ..., 43.1 , 2.288, 33. 
        ]]),
         'y': array([[1],
                     [0],
                     [1],
                     [0],
                     [1]
        """
        if self._get_class_name() != 'pandas.core.frame.DataFrame':
            invalidInputError(False,
                              "Currently only support assembleFeatureLabelCols() on"
                              " XShards of Pandas DataFrame")

        def to_shard_dict(df):
            featureLists = [df[feature_col].to_numpy() for feature_col in featureCols]
            labelLists = [df[label_col].to_numpy() for label_col in labelCols]
            result = {
                "x": np.stack(featureLists, axis=1),
                "y": np.stack(labelLists, axis=1)}
            return result

        invalidInputError(type(featureCols) == list, "expect featureCols is a list")
        invalidInputError(type(labelCols) == list, "expect labelCols is a list")
        transformed_shard = self.transform_shard(to_shard_dict)
        return transformed_shard

    def split(self) -> Optional[List["SparkXShards"]]:
        """
        Split SparkXShards into multiple SparkXShards.
        Each element in the SparkXShards needs to be a list or tuple with the same length.

        :return: Splits of SparkXShards. If the elements in the input SparkXShards are not
                 lists or tuples, return a list containing the input SparkXShards.
        """
        # get number of splits
        list_split_length = self.rdd.map(
            lambda data: len(data) if isinstance(data, list) or isinstance(data, tuple) else 1)\
            .collect()
        # check if each element has same splits
        if list_split_length.count(list_split_length[0]) != len(list_split_length):
            invalidInputError(False,
                              "Cannot split this XShards because its partitions "
                              "have different split length")
        else:
            if list_split_length[0] > 1:
                def get_data(order):
                    def transform(data):
                        return data[order]
                    return transform

                split_shard_list = [SparkXShards(self.rdd.map(get_data(i)))
                                    for i in range(list_split_length[0])]
                self._uncache()
                return split_shard_list
            else:
                return [self]
        return None

    def zip(self, other: "SparkXShards") -> Optional["SparkXShards"]:
        """
        Zips this SparkXShards with another one, returning key-value pairs with the first
        element in each SparkXShards, second element in each SparkXShards, etc.
        Assumes that the two SparkXShards have the *same number of partitions* and the
        *same number of elements in each partition* (e.g. one was made through
        a transform_shard on the other).

        :param other: another SparkXShards
        :return: zipped SparkXShards
        """
        invalidInputError(isinstance(other, SparkXShards), "other should be a SparkXShards")
        invalidInputError(self.num_partitions() == other.num_partitions(),
                          "The two SparkXShards should have the same number of partitions")
        try:
            rdd = self.rdd.zip(other.rdd)
            zipped_shard = SparkXShards(rdd)
            other._uncache()
            self._uncache()
            return zipped_shard
        except Exception:
            invalidInputError(False,
                              "The two SparkXShards should have the same number of elements "
                              "in each partition")
        return None

    def group_by(self,
                 columns: Union[str, List[str]] = [],
                 agg: Union[Dict[str, List[str]], List[str], Dict[str, str], str] = "count",
                 join: bool = False) -> "SparkXShards":
        """
        Group the Shards with specified columns and then run aggregation. Optionally join
        the result with the original Shards.

        :param columns: str or a list of str. Columns to group the SparkXShards. If it is
               an empty list, aggregation is run directly without grouping. Default is [].
        :param agg: str, list or dict. Aggregate functions to be applied to the grouped
               SparkXShards. Default is "count".
               Supported aggregate functions are: "max", "min", "count", "sum", "avg",
               "mean", "sumDistinct", "stddev", "stddev_pop", "variance", "var_pop",
               "skewness", "kurtosis", "collect_list", "collect_set",
               "approx_count_distinct", "first", "last".
               If agg is a str, then agg is the aggregate function and the aggregation is
               performed on all columns that are not in `columns`.
               If agg is a list of str, then agg is a list of aggregate functions and the
               aggregation is performed on all columns that are not in `columns`.
               If agg is a single dict mapping from str to str, then the key is the column
               to perform aggregation on, and the value is the aggregate function.
               If agg is a single dict mapping from str to list, then the key is the column
               to perform aggregation on, and the value is a list of aggregate functions.

               Examples:
               agg="sum"
               agg=["last", "stddev"]
               agg={"*": "count"}
               agg={"col_1": "sum", "col_2": ["count", "mean"]}
        :param join: boolean. If True, join the aggregation result with the original
               SparkXShards.
        :return: A new SparkXShards with aggregated column fields.
        """
        if self._get_class_name() != 'pandas.core.frame.DataFrame':
            invalidInputError(False,
                              "Currently only support group_by() on"
                              " XShards of Pandas DataFrame")
        df = self.to_spark_df()
        sqlContext = get_spark_sql_context(get_spark_context())
        defaultPartitionNum = sqlContext.getConf("spark.sql.shuffle.partitions")
        partitionNum = df.rdd.getNumPartitions()
        sqlContext.setConf("spark.sql.shuffle.partitions", str(partitionNum))
        result_df = group_by_spark_df(df, columns, agg, join)
        agg_shards = spark_df_to_pd_sparkxshards(result_df)
        sqlContext.setConf("spark.sql.shuffle.partitions", defaultPartitionNum)
        return agg_shards

    def _to_spark_df_without_arrow(self):
        def f(iter):
            from bigdl.dllib.utils.log4Error import invalidInputError
            pdf_list = list(iter)
            invalidInputError(len(pdf_list) == 1,
                              f"For XShards of pandas dataframe, expects there is only 1"
                              f" pandas dataframe for each partition, but got {len(pdf_list)}")
            for pdf in pdf_list:
                np_records = pdf.to_records(index=False)
                return [r.tolist() for r in np_records]

        rdd = self.rdd.mapPartitions(f)
        column = self.get_schema()['columns']
        df = rdd.toDF(list(column))
        return df

    # to_spark_df adapted from pyspark
    # https://github.com/apache/spark/blob/master/python/pyspark/sql/pandas/conversion.py
    def to_spark_df(self) -> "SparkDataFrame":
        if self._get_class_name() != 'pandas.core.frame.DataFrame':
            invalidInputError(False,
                              "Currently only support to_spark_df on XShards"
                              " of Pandas DataFrame")
        try:
            import pyarrow as pa
            sdf_schema = self._get_spark_df_schema()

            sqlContext = get_spark_sql_context(get_spark_context())
            timezone = sqlContext.getConf("spark.sql.session.timeZone")

            def f(iter):
                from bigdl.dllib.utils.log4Error import invalidInputError
                pdf_list = list(iter)
                invalidInputError(len(pdf_list) == 1,
                                  f"For XShards of pandas dataframe, expects there is only 1"
                                  f" pandas dataframe for each partition,"
                                  f" but got {len(pdf_list)}")
                for pdf in pdf_list:
                    import os
                    import uuid
                    from pyspark.sql.pandas.types import to_arrow_type
                    from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
                    from tempfile import NamedTemporaryFile

                    tmpFile = "/tmp/" + str(uuid.uuid1())
                    os.mkdir(tmpFile)

                    arrow_types = [to_arrow_type(f.dataType) for f in sdf_schema.fields]
                    arrow_data = [[(c, t) for (_, c), t in zip(pdf.items(), arrow_types)]]
                    col_by_name = True
                    safecheck = False
                    ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)
                    tempFile = NamedTemporaryFile(delete=False, dir=tmpFile)
                    try:
                        ser.dump_stream(arrow_data, tempFile)
                    finally:
                        tempFile.close()
                    return [tempFile.name]

            jiter = self.rdd.mapPartitions(f)
            from bigdl.dllib.utils.file_utils import callZooFunc
            df = callZooFunc("float", "orcaToDataFrame", jiter, sdf_schema.json(), sqlContext)
            return df
        except Exception as e:
print(f"createDataFrame from shards attempted Arrow optimization failed as: {str(e)}," f"Will try without Arrow optimization") return self._to_spark_df_without_arrow() def __len__(self) -> int: return self.rdd.map(lambda data: len(data) if hasattr(data, '__len__') else 1) \ .reduce(lambda l1, l2: l1 + l2) def save_pickle(self, path: str, batchSize: int = 10) -> "SparkXShards": """ Save this SparkXShards as a SequenceFile of serialized objects. The serializer used is pyspark.serializers.PickleSerializer, default batch size is 10. :param path: target path. :param batchSize: batch size for each sequence file chunk. """ self.rdd.saveAsPickleFile(path, batchSize) return self def __del__(self): self._uncache() def __getitem__(self, key: str) -> "SparkXShards": def get_data(data): invalidInputError(hasattr(data, '__getitem__'), "No selection operation available for this XShards") try: value = data[key] except: invalidInputError(False, "Invalid key for this XShards") return value return SparkXShards(self.rdd.map(get_data), transient=True) def _for_each(self, func: Callable, *args, **kwargs) -> "RDD[Any]": def utility_func(x, func, *args, **kwargs): try: result = func(x, *args, **kwargs) except Exception as e: return e return result result_rdd = self.rdd.map(lambda x: utility_func(x, func, *args, **kwargs)) return result_rdd def get_schema(self) -> Optional[str]: if 'schema' in self.type: return self.type['schema'] if 'class_name' not in self.type \ or self.type['class_name'] == 'pandas.core.frame.DataFrame': class_name, pdf_schema, sdf_schema = self._get_schema_class_name() self.type['class_name'] = class_name self.type['schema'] = pdf_schema self.type['spark_df_schema'] = sdf_schema return self.type['schema'] return None def _get_spark_df_schema(self): if 'spark_df_schema' in self.type: return self.type['spark_df_schema'] if 'class_name' not in self.type \ or self.type['class_name'] == 'pandas.core.frame.DataFrame': class_name, pdf_schema, sdf_schema = self._get_schema_class_name() self.type['class_name'] = class_name self.type['schema'] = pdf_schema self.type['spark_df_schema'] = sdf_schema return self.type['spark_df_schema'] return None def _get_class_name(self): if 'class_name' in self.type: return self.type['class_name'] else: class_name, schema, sdf_schema = self._get_schema_class_name() self.type['class_name'] = class_name self.type['schema'] = schema self.type['spark_df_schema'] = sdf_schema return self.type['class_name'] def _set_class_name(self, class_name): if class_name and isinstance(class_name, str): self.type['class_name'] = class_name def _get_schema_class_name(self): class_name = self.type['class_name'] if 'class_name' in self.type else None import pyspark spark_version = pyspark.version.__version__ major_version = spark_version.split(".")[0] def func(pdf): pdf_schema = None spark_df_schema = None _class_name = class_name if not _class_name: _class_name = pdf.__class__.__module__ + '.' 
                    + pdf.__class__.__name__

            if _class_name == 'pandas.core.frame.DataFrame':
                schema = [str(x) if not isinstance(x, str) else x for x in pdf.columns]
                pdf_schema = {'columns': schema, 'dtypes': pdf.dtypes}

                if major_version >= '3':
                    from pyspark.sql.pandas.types import from_arrow_type
                    from pyspark.sql.types import StructType

                    if isinstance(schema, (list, tuple)):
                        import pyarrow as pa
                        arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
                        struct = StructType()
                        for name, field in zip(schema, arrow_schema):
                            struct.add(
                                name, from_arrow_type(field.type), nullable=field.nullable
                            )
                        spark_df_schema = struct

            return (_class_name, pdf_schema, spark_df_schema)

        return self.rdd.map(lambda x: func(x)).first()

    def merge(self,
              right: "SparkXShards",
              how: str = "inner",
              on: Optional[str] = None) -> "SparkXShards":
        """
        Merge two SparkXShards into a single SparkXShards with a database-style join.

        :param right: The other SparkXShards to be merged.
        :param how: Type of merge. 'left', 'right', 'outer' or 'inner'. Default is 'inner'.
        :param on: Column name(s) to join on.
        :return: A new merged SparkXShards.
        """
        from bigdl.orca.data.utils import spark_df_to_pd_sparkxshards
        invalidInputError(isinstance(right, SparkXShards), "right should be a SparkXShards")
        left_df = self.to_spark_df()
        right_df = right.to_spark_df()
        merged = left_df.join(right_df, on=on, how=how)
        # count non-empty partitions
        nonEmptyPart = get_spark_context().accumulator(0)

        def f(iterator):
            isEmpty = 1
            for x in iterator:
                isEmpty = 0
                break
            nonEmptyPart.add(isEmpty == 0)

        merged.rdd.foreachPartition(f)
        # repartition evenly according to the index
        if nonEmptyPart.value != merged.rdd.getNumPartitions():
            merged_withIndex_rdd = merged.rdd.zipWithIndex().map(lambda p: (p[1], p[0]))
            merged = merged_withIndex_rdd.partitionBy(nonEmptyPart.value) \
                .map(lambda p: p[1]).toDF(merged.schema)
        mergedXShards = spark_df_to_pd_sparkxshards(merged)
        return mergedXShards

    def sample(self,
               frac: float,
               replace: bool = False,
               weights=None,
               random_state=None) -> "SparkXShards":
        """
        Sample from each pandas dataframe in this SparkXShards and return a new SparkXShards.

        :param frac: float, fraction of items to return.
        :param replace: bool, default False. Allow or disallow sampling of the same row
               more than once.
        :param weights: str or ndarray-like, optional. Default 'None' results in equal
               probability weighting.
        :param random_state: int, array-like, BitGenerator, np.random.RandomState, optional.
               If int, array-like, or BitGenerator (NumPy>=1.17), seed for the random number
               generator. If np.random.RandomState, use it as the numpy RandomState object.
        :return: a new SparkXShards.
        """
        if self._get_class_name() != 'pandas.core.frame.DataFrame':
            invalidInputError(False,
                              "Currently only support sample() on"
                              " SparkXShards of Pandas DataFrame")

        def inner_sample(iter, frac, replace=False, weights=None, random_state=None):
            for df in iter:
                yield df.sample(
                    frac=frac, replace=replace, weights=weights, random_state=random_state)

        rdd1 = self.rdd.mapPartitions(
            lambda iter: inner_sample(iter, frac, replace, weights, random_state))
        return SparkXShards(rdd1)

    def select(self, cols: Union[str, List[str]]) -> "SparkXShards":
        """
        Select specific columns of each pandas dataframe in this SparkXShards and return
        a new SparkXShards.

        :param cols: a column name (str) or a list of column names.
        :return: a new SparkXShards.
""" if self._get_class_name() != 'pandas.core.frame.DataFrame': invalidInputError(False, "Currently only support select() on" " SparkXShards of Pandas DataFrame") if isinstance(cols, str): cols = [cols] invalidInputError(isinstance(cols, list), "cols should be str or list") columns = [c for c in self.rdd.first().columns] for c in cols: check_cols_exists(columns, c, "cols") return SparkXShards(self.rdd.map(lambda df: df[cols])) def describe(self, cols: Optional[Union[str, List[str]]]=None) -> "PandasDataFrame": """ Computes basic statistics for numeric and string columns. This include count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical or string columns. :param cols: string or list string. :return: a panda dataframe of description. """ if self._get_class_name() != 'pandas.core.frame.DataFrame': invalidInputError(False, "Currently only support select() on" " SparkXShards of Pandas DataFrame") columns = [c for c in self.rdd.first().columns] cols = cols if cols else columns if isinstance(cols, str): cols = [cols] invalidInputError(isinstance(cols, list), "cols should be str or list") for c in cols: check_cols_exists(columns, c, "cols") spark_df = self.to_spark_df() description = spark_df.describe(*cols).toPandas() return description def head(self, n: int=5) -> 'PandasDataFrame': """ Retrun first rows of the first element of a SparkXShards. :param n: int, default 5 :return: same type as self.type['class_name'] The first `n` rows of the first element of this SparkXShards. """ return self.rdd.first().head(n) def concat_to_pdf(self, axis: int=0) -> "PandasDataFrame": """ Concatenate all pandas dataframes in SparsXShards into one single pandas dataframe :param axis, integer, default 0 """ if self._get_class_name() != 'pandas.core.frame.DataFrame': invalidInputError(False, "Currently only support concat_to_pdf() on" " XShards of Pandas DataFrame") dfs = self.rdd.collect() import pandas as pd return pd.concat(dfs, axis=axis) def sample_to_pdf(self, frac: float, replace: bool=False, weights=None, random_state=None, axis: int=0) -> "PandasDataFrame": """ Samples from each pandas dataframe in old SparkXShards, then concatenate into one single pandas dataframe and return it :param frac: float, Fraction of items to return. :param replace: bool, default False, Allow or disallow sampling of the same row more than once. :param weights : str or ndarray-like, optional Default 'None' results in equal probability weighting. :param random_state : int, array-like, BitGenerator, np.random.RandomState, optional If int, array-like, or BitGenerator (NumPy>=1.17), seed for random number generator If np.random.RandomState, use as numpy RandomState object. :param axis, integer, default 0 :return: a pandas dataframe. """ if self._get_class_name() != 'pandas.core.frame.DataFrame': invalidInputError(False, "Currently only support sample_to_pdf() on" " XShards of Pandas DataFrame") sampled = self.sample( frac=frac, replace=replace, weights=weights, random_state=random_state) pdf = sampled.concat_to_pdf(axis=axis) return pdf def stack_feature_labels(self) -> "SparkXShards": """ Stack tuple of features and labels in each partition into an ndarray for Orca Estimator traning :return: SparkXShards. 
""" if self._get_class_name() != "builtins.tuple": invalidInputError(False, "Currently only support stack_feature_labels() on" " XShards of tuple of features and labels") def per_partition(iterator): features = [] labels = [] for it in iterator: feature, label = it[0], it[1] features.append(feature) labels.append(label) out = {'x': np.array(features).astype(np.float32), 'y': np.array(labels).astype(np.float32)} return [out] rdd = self.rdd.mapPartitions(lambda x: per_partition(x)) return SparkXShards(rdd) class SharedValue(object): def __init__(self, data) -> None: sc = OrcaContext.get_spark_context() self.broadcast_data = sc.broadcast(data) self._value = None @property def value(self): self._value = self.broadcast_data.value return self._value def unpersist(self): self.broadcast_data.unpersist() def spark_df_to_ray_dataset(df: "SparkDataFrame") -> "Dataset": """ Convert a Spark DataFrame to Ray Dataset. The block number of ray datasets equals to the partition number of the input DataFrame. :param df: A Spark dataframe. :return: A Ray Dataset holding Arrow records read from the dataframe. """ spark_xshards = spark_df_to_pd_sparkxshards(df) ray_dataset = spark_xshards_to_ray_dataset(spark_xshards) return ray_dataset