Source code for ai4water.preprocessing.transformations._wrapper


from typing import Union, List, Dict

from ai4water.backend import np, pd
from ai4water.preprocessing.transformations import Transformation
from ai4water.utils.utils import jsonize, deepcopy_dict_without_clone


class Transformations(object):
    """
    While the [Transformation][ai4water.preprocessing.transformations.Transformation]
    class is useful to apply a single transformation to a single data source, this
    class is helpful to apply multiple transformations to a single data source or
    multiple transformations to multiple data sources.

    This class is especially designed to be applied as part of `model` inside the
    `fit`, `predict` or `evaluate` methods. The `fit_transform` method should be
    applied before feeding the data to the algorithm and the `inverse_transform`
    method should be called after the algorithm has worked with the data.

    Examples
    --------
    >>> import numpy as np
    >>> from ai4water.preprocessing.transformations import Transformations
    >>> x = np.arange(50).reshape(25, 2)
    >>> transformer = Transformations(['a', 'b'], config=['minmax', 'zscore'])
    >>> x_ = transformer.fit_transform(x)
    >>> _x = transformer.inverse_transform(x_)
    ...
    ... # Apply multiple transformations on multiple arrays which are passed as list
    >>> transformer = Transformations([['a', 'b'], ['a', 'b']],
    ...                               config=['minmax', 'zscore'])
    >>> x1 = np.arange(50).reshape(25, 2)
    >>> x2 = np.arange(50, 100).reshape(25, 2)
    >>> x1_transformed = transformer.fit_transform([x1, x2])
    >>> _x1 = transformer.inverse_transform(x1_transformed)

    We can also do more complicated stuff as following

    >>> transformer = Transformations({'x1': ['a', 'b'], 'x2': ['a', 'b']},
    ...                               config={'x1': ['minmax', 'zscore'],
    ...                                       'x2': [{'method': 'log', 'features': ['a', 'b']},
    ...                                              {'method': 'robust', 'features': ['a', 'b']}]
    ...                                       })
    >>> x1 = np.arange(20).reshape(10, 2)
    >>> x2 = np.arange(100, 120).reshape(10, 2)
    >>> x = {'x1': x1, 'x2': x2}
    >>> x_transformed = transformer.fit_transform(x)
    >>> _x = transformer.inverse_transform(x_transformed)

    In the above example we apply `minmax` and `zscore` transformations on the x1
    array and `log` and `robust` transformations on the x2 array.
    """

    def __init__(
            self,
            feature_names: Union[list, dict],
            config: Union[str, list, dict] = None,
    ):
        """
        Arguments:
            feature_names:
                names of features in data
            config:
                Determines the type of transformation to be applied on data.
                It can be one of the following types

                - `string` when you want to apply a single transformation

                >>> config = 'minmax'

                - `dict` to pass additional arguments to the
                  :py:class:`ai4water.preprocessing.Transformation` class

                >>> config = {"method": 'log', 'treat_negatives': True, 'features': ['features']}

                - `list` when we want to apply multiple transformations

                >>> config = ['minmax', 'zscore']

                or

                >>> config = [{"method": 'log', 'treat_negatives': True, 'features': ['features']},
                ...           {'method': 'sqrt', 'treat_negatives': True}]
        """
        self.names = feature_names
        self.t_config = config
        self.without_fit = False

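    # Illustrative sketch of the `config` forms described above; the feature
    # names ('a', 'b') used here are placeholders, not part of this module:
    #
    # >>> # a single transformation for all features
    # >>> tr = Transformations(['a', 'b'], config='minmax')
    # >>> # a dict to pass extra keyword arguments to Transformation
    # >>> tr = Transformations(['a', 'b'],
    # ...                      config={'method': 'log', 'treat_negatives': True})
    # >>> # several transformations applied one after another
    # >>> tr = Transformations(['a', 'b'], config=['minmax', 'zscore'])
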
    def _fetch_transformation(self, data):
        config = self.t_config

        if isinstance(data, list):
            if isinstance(config, str):
                config = [config for _ in range(len(data))]
        elif isinstance(data, dict):
            if isinstance(config, str):
                config = {k: config for k in data.keys()}

        return config

    def _check_features(self):

        if self.is_numpy_:
            assert isinstance(self.names, list), f"""
            feature_names are of type {type(self.names)}"""

        elif self.is_list_:
            for idx, n in enumerate(self.names):
                assert isinstance(n, list), f"""
                feature_names for source {idx} are of type {type(n)}. They should be a list"""

        elif self.is_dict_:
            assert isinstance(self.names, dict), f"""
            feature_names are of type {type(self.names)}"""
            for src_name, n in self.names.items():
                assert n.__class__.__name__ in ["ListWrapper", 'list']

        return

    def transform(self, data: Union[np.ndarray, List, Dict]):
        """Transforms the data according to the `config`.

        Arguments:
            data:
                The data on which to apply transformations. It can be one of the following

                - a (2d or 3d) numpy array
                - a list of numpy arrays
                - a dictionary of numpy arrays

        Returns:
            The transformed data which has the same type and dimensions as the input data
        """
        if self.t_config is None:  # if no transformation then just return the data as it is
            return data

        original_data_type = data.__class__.__name__

        assert hasattr(self, 'transformers_'), "transformer has not been fitted yet"
        assert len(self.transformers_) > 0

        # first check that data matches config
        self._check_features()

        # then apply transformation
        data = self._transform(data)

        # now pack it in original form
        assert data.__class__.__name__ == original_data_type, f"""
        type changed from {original_data_type} to {data.__class__.__name__}"""

        # self._assert_same_dim(self, orignal_data, data)

        return data

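    # `transform` reuses the scalers fitted by `fit_transform`, which is what one
    # would typically do for validation or test data. A minimal sketch; `x_train`
    # and `x_val` are placeholder arrays, not names from this module:
    #
    # >>> tr = Transformations(['a', 'b'], config='zscore')
    # >>> x_train = np.arange(50).reshape(25, 2)
    # >>> x_val = np.arange(50, 70).reshape(10, 2)
    # >>> _ = tr.fit_transform(x_train)   # fit the scaler on training data
    # >>> x_val_t = tr.transform(x_val)   # apply the same scaling to new data
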
    def fit_transform(self, data: Union[np.ndarray, List, Dict]):
        """Fits the transformers on the data and transforms it according to the `config`.

        Arguments:
            data:
                The data on which to apply transformations. It can be one of the following

                - a (2d or 3d) numpy array
                - a list of numpy arrays
                - a dictionary of numpy arrays

        Returns:
            The transformed data which has the same type and dimensions as the input data
        """
        self.is_numpy_ = False
        self.is_list_ = False
        self.is_dict_ = False

        self.transformers_ = {}

        if self.t_config is None:  # if no transformation then just return the data as it is
            return data

        original_data_type = data.__class__.__name__

        if isinstance(data, np.ndarray):
            self.is_numpy_ = True
        elif isinstance(data, list):
            self.is_list_ = True
        elif isinstance(data, dict):
            self.is_dict_ = True
        else:
            raise ValueError(f"invalid data of type {data.__class__.__name__}")

        # first check that data matches config
        self._check_features()

        # then apply transformation
        data = self._fit_transform(data)

        # now pack it in original form
        assert data.__class__.__name__ == original_data_type, f"""
        type changed from {original_data_type} to {data.__class__.__name__}"""

        return data

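    # Sketch of the 3d case handled by the private helpers below: each time-step
    # slice of a (num_examples, time_steps, num_features) array is scaled
    # separately and the original shape and type are preserved. The array here is
    # a placeholder:
    #
    # >>> tr = Transformations(['a', 'b'], config='minmax')
    # >>> x3d = np.arange(60).reshape(10, 3, 2)   # (examples, time_steps, features)
    # >>> x3d_t = tr.fit_transform(x3d)
    # >>> x3d_t.shape == x3d.shape
    # True
    # >>> np.allclose(tr.inverse_transform(x3d_t), x3d)
    # True
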
    def _transform_2d(self, data, columns, transformation=None, key="5"):
        data = data.copy()

        if transformation:

            if isinstance(transformation, dict):
                config = self.transformers_[key]
                transformer = Transformation.from_config(config)
                data = transformer.transform(pd.DataFrame(data, columns=columns))

            # we want to apply multiple transformations
            elif isinstance(transformation, list):
                for idx, trans in enumerate(transformation):

                    if isinstance(trans, str):
                        config = self.transformers_[f'{key}_{trans}_{idx}']
                        transformer = Transformation.from_config(config)
                        data = transformer.transform(pd.DataFrame(data, columns=columns))

                    elif trans['method'] is not None:
                        config = self.transformers_[f'{key}_{trans["method"]}_{idx}']
                        transformer = Transformation.from_config(config)
                        data = transformer.transform(pd.DataFrame(data, columns=columns))
            else:
                assert isinstance(transformation, str)
                transformer = Transformation.from_config(self.transformers_[key])
                data = transformer.transform(pd.DataFrame(data, columns=columns))

            data = data.values

        return data

    def _fit_transform_2d(self, data, columns, transformation=None, key="5"):
        """performs transformation on a single 2d data source"""
        # it is better to make a copy here because all the operations
        # on data happen after this.
        data = data.copy()
        transformers = {}

        if transformation:

            if isinstance(transformation, dict):
                transformer = Transformation(**transformation)
                data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
                transformers[key] = transformer.config()

            # we want to apply multiple transformations
            elif isinstance(transformation, list):
                for idx, trans in enumerate(transformation):

                    if isinstance(trans, str):
                        transformer = Transformation(method=trans)
                        data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
                        transformers[f'{key}_{trans}_{idx}'] = transformer.config()

                    elif trans['method'] is not None:
                        transformer = Transformation(**trans)
                        data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
                        transformers[f'{key}_{trans["method"]}_{idx}'] = transformer.config()
                    else:
                        raise ValueError(f"{trans['method']} is invalid transformation")
            else:
                assert isinstance(transformation, str)
                transformer = Transformation(method=transformation)
                data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
                transformers[key] = transformer.config()

            data = data.values

        self.transformers_.update(transformers)

        return data

    def __transform(self, data, feature_names, transformation=None, key="5"):
        """performs transformation on a single data source without fitting on it first.

        In case of a 3d array, the shape is supposed to be
        (num_examples, time_steps, num_features).
        Therefore, each time_step is extracted and transformed individually.
        For example with time_steps of 2, two 2d arrays will be extracted and
        transformed individually:
        (num_examples, 0, num_features), (num_examples, 1, num_features)
        """
        if data.ndim == 3:

            _data = np.full(data.shape, np.nan)

            for time_step in range(data.shape[1]):
                _data[:, time_step] = self._transform_2d(data[:, time_step],
                                                         feature_names,
                                                         transformation,
                                                         key=f"{key}_{time_step}")
        else:
            _data = self._transform_2d(data, feature_names, transformation, key=key)

        return _data

    def __fit_transform(self, data, feature_names, transformation=None, key="5"):
        """performs transformation on a single data source

        In case of a 3d array, the shape is supposed to be
        (num_examples, time_steps, num_features).
        Therefore, each time_step is extracted and transformed individually.
        For example with time_steps of 2, two 2d arrays will be extracted and
        transformed individually:
        (num_examples, 0, num_features), (num_examples, 1, num_features)
        """
        if data.ndim == 3:

            _data = np.full(data.shape, np.nan)

            for time_step in range(data.shape[1]):
                _data[:, time_step] = self._fit_transform_2d(data[:, time_step],
                                                             feature_names,
                                                             transformation,
                                                             key=f"{key}_{time_step}")
        else:
            _data = self._fit_transform_2d(data, feature_names, transformation, key=key)

        return _data

    def _transform(self, data, key="5"):
        """performs transformation on every data source in data"""
        transformation = self._fetch_transformation(data)

        if self.is_numpy_:
            _data = self.__transform(data, self.names, transformation, key)

        elif self.is_list_:
            _data = []
            for idx, array in enumerate(data):
                _data.append(self.__transform(array,
                                              self.names[idx],
                                              transformation[idx],
                                              key=f"{key}_{idx}"))
        else:
            _data = {}
            for src_name, array in data.items():
                _data[src_name] = self.__transform(array,
                                                   self.names[src_name],
                                                   transformation[src_name],
                                                   f"{key}_{src_name}")

        return _data

    def _fit_transform(self, data, key="5"):
        """performs fit and transformation on every data source in data"""
        transformation = self._fetch_transformation(data)

        if self.is_numpy_:
            _data = self.__fit_transform(data, self.names, transformation, key)

        elif self.is_list_:
            _data = []
            for idx, array in enumerate(data):
                _data.append(self.__fit_transform(array,
                                                  self.names[idx],
                                                  transformation[idx],
                                                  key=f"{key}_{idx}"))
        else:
            _data = {}
            for src_name, array in data.items():
                _data[src_name] = self.__fit_transform(array,
                                                       self.names[src_name],
                                                       transformation[src_name],
                                                       f"{key}_{src_name}")

        return _data

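    # The helpers above key each fitted transformer by data source and, for 3d
    # inputs, by time step. A rough illustration of the key naming produced for a
    # 2d numpy source with the default key "5" (an internal detail, not a public
    # contract):
    #
    # >>> tr = Transformations(['a', 'b'], config=['minmax', 'zscore'])
    # >>> _ = tr.fit_transform(np.arange(50).reshape(25, 2))
    # >>> sorted(tr.transformers_)
    # ['5_minmax_0', '5_zscore_1']
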
    def inverse_transform(self, data, postprocess=True):
        """inverse transforms data where data can be a dictionary, list or numpy array.

        Arguments:
            data:
                the data which is to be inverse transformed. The output of the
                `fit_transform` method.
            postprocess : bool

        Returns:
            The original data which was given to the `fit_transform` method.
        """
        if not hasattr(self, 'transformers_'):
            raise ValueError("Transformations class has not been fitted yet")
        return self._inverse_transform(data, postprocess=postprocess)

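    # Round-trip sketch for list data: each array gets its own transformation from
    # the config, and `inverse_transform` recovers the original values (up to
    # floating point error). The arrays are placeholders:
    #
    # >>> tr = Transformations([['a', 'b'], ['a', 'b']], config=['minmax', 'zscore'])
    # >>> x1 = np.arange(50).reshape(25, 2)
    # >>> x2 = np.arange(50, 100).reshape(25, 2)
    # >>> x1_t, x2_t = tr.fit_transform([x1, x2])
    # >>> x1_r, x2_r = tr.inverse_transform([x1_t, x2_t])
    # >>> np.allclose(x1_r, x1) and np.allclose(x2_r, x2)
    # True
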
    def inverse_transform_without_fit(self, data, postprocess=True) -> np.ndarray:

        data = np.array(data)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        assert isinstance(self.names, list)
        assert data.shape[-1] == len(self.names)

        data = pd.DataFrame(data, columns=self.names)

        kwargs = {}
        if isinstance(self.t_config, str):
            kwargs['method'] = self.t_config
        elif isinstance(self.t_config, dict):
            kwargs = self.t_config
        elif isinstance(self.t_config, list):
            assert len(self.t_config) == 1
            t_config = self.t_config[0]
            if isinstance(t_config, str):
                kwargs['method'] = t_config
            elif isinstance(t_config, dict):
                kwargs = t_config
            else:
                raise ValueError(f"invalid type of t_config {t_config.__class__.__name__}")
        else:
            raise ValueError(f"invalid type of t_config {self.t_config.__class__.__name__}")

        transformer = Transformation(**kwargs)

        transformed_data = transformer.inverse_transform(data=data, postprocess=postprocess)

        return transformed_data.values

    def _inverse_transform(self, data, key="5", postprocess=True):

        transformation = self._fetch_transformation(data)

        if self.is_numpy_:
            data = self.__inverse_transform(data,
                                            self.names,
                                            transformation,
                                            key,
                                            postprocess=postprocess)

        elif self.is_list_:
            assert isinstance(data, list)
            _data = []
            for idx, src in enumerate(data):
                __data = self.__inverse_transform(src,
                                                  self.names[idx],
                                                  transformation[idx],
                                                  f'{key}_{idx}',
                                                  postprocess=postprocess)
                _data.append(__data)
            data = _data

        elif self.is_dict_:
            assert isinstance(data, dict)
            _data = {}
            for src_name, src in data.items():
                _data[src_name] = self.__inverse_transform(src,
                                                           self.names[src_name],
                                                           transformation[src_name],
                                                           f'{key}_{src_name}',
                                                           postprocess=postprocess)
            data = _data

        return data

    def __inverse_transform(self,
                            data,
                            feature_names,
                            transformation,
                            key="5",
                            postprocess=True):
        """inverse transforms one data source which may be a 2d or 3d array"""
        if data.ndim == 3:
            _data = np.full(data.shape, np.nan)
            for time_step in range(data.shape[1]):
                _data[:, time_step] = self._inverse_transform_2d(
                    data[:, time_step],
                    columns=feature_names,
                    transformation=transformation,
                    key=f"{key}_{time_step}",
                    postprocess=postprocess)
        else:
            _data = self._inverse_transform_2d(data,
                                               feature_names,
                                               key,
                                               transformation,
                                               postprocess=postprocess)

        return _data

    def _inverse_transform_2d(self,
                              data,
                              columns,
                              key,
                              transformation,
                              postprocess=True) -> np.ndarray:
        """inverse transforms one 2d array"""
        data = pd.DataFrame(data.copy(), columns=columns)

        if transformation is not None:
            if isinstance(transformation, str):

                if key not in self.transformers_:
                    raise ValueError(f"""
                    key `{key}` for inverse transformation not found. Available keys are
                    {list(self.transformers_.keys())}""")

                transformer = self.transformers_[key]
                transformer, shape = transformer, transformer['shape']
                original_shape = data.shape

                transformer = Transformation.from_config(transformer)
                transformed_data = transformer.inverse_transform(data, postprocess=postprocess)
                data = transformed_data

            elif isinstance(transformation, list):
                # idx and trans both in reverse form
                for idx, trans in reversed(list(enumerate(transformation))):
                    if isinstance(trans, str):
                        transformer = self.transformers_[f'{key}_{trans}_{idx}']
                        transformer, shape = transformer, transformer['shape']
                        transformer = Transformation.from_config(transformer)
                        data = transformer.inverse_transform(data=data, postprocess=postprocess)

                    elif trans['method'] is not None:
                        features = trans.get('features', columns)
                        # if any of the features in data were transformed
                        if any([True if f in data else False for f in features]):
                            orig_cols = data.columns  # copy the columns of the original df

                            transformer = self.transformers_[f'{key}_{trans["method"]}_{idx}']
                            transformer, shape = transformer, transformer['shape']
                            data, dummy_features = conform_shape(data, shape, features)  # get data to transform

                            transformer = Transformation.from_config(transformer)
                            transformed_data = transformer.inverse_transform(data=data, postprocess=postprocess)
                            data = transformed_data[orig_cols]  # remove the dummy data

            elif isinstance(transformation, dict):
                features = transformation.get('features', columns)
                if any([True if f in data else False for f in features]):
                    orig_cols = data.columns
                    transformer = self.transformers_[key]
                    transformer, shape = transformer, transformer['shape']
                    data, dummy_features = conform_shape(data, shape, features=features)

                    transformer = Transformation.from_config(transformer)
                    transformed_data = transformer.inverse_transform(data=data, postprocess=postprocess)
                    data = transformed_data[orig_cols]  # remove the dummy data

        if data.__class__.__name__ == "DataFrame":
            data = data.values  # there is no need to return DataFrame

        return data

    def config(self) -> dict:
        """returns a python dictionary which can be used to construct this class
        in fitted form, i.e. as if the `fit_transform` method has already been applied.

        Returns:
            a dictionary from which the `Transformations` class can be constructed
        """
        return {
            'transformers_': jsonize(self.transformers_),
            "feature_names": self.names,
            "config": self.t_config,
            "is_numpy_": self.is_numpy_,
            "is_dict_": self.is_dict_,
            "is_list_": self.is_list_,
        }

    @classmethod
    def from_config(cls, config: dict) -> "Transformations":
        """constructs the Transformations class which may have already been fitted.
        """
        config = deepcopy_dict_without_clone(config)

        transformer = cls(config.pop('feature_names'), config.pop('config'))

        # restore the fitted state on the new instance
        for attr_name, attr_val in config.items():
            setattr(transformer, attr_name, attr_val)

        return transformer

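    # `config()` and `from_config()` are meant for persisting a fitted instance,
    # e.g. alongside a saved model. A minimal round-trip sketch (array and feature
    # names are placeholders):
    #
    # >>> tr = Transformations(['a', 'b'], config='minmax')
    # >>> x_t = tr.fit_transform(np.arange(50).reshape(25, 2))
    # >>> state = tr.config()                        # JSON-friendly fitted state
    # >>> tr2 = Transformations.from_config(state)   # behaves as if already fitted
    # >>> x_back = tr2.inverse_transform(x_t)
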
def conform_shape(data, shape, features=None):
    # if the difference is of only 1 dim, we resolve it
    if data.ndim > len(shape):
        data = np.squeeze(data, axis=-1)
    elif data.ndim < len(shape):
        data = np.expand_dims(data, axis=-1)

    assert data.ndim == len(shape), f"""original data had {len(shape)} dimensions
    while the new data has {data.ndim} dimensions"""

    # how many dummy features we have to add to match the shape
    dummy_features = shape[-1] - data.shape[-1]

    if data.__class__.__name__ in ['DataFrame', 'Series']:
        # we know what features must be in data, so put them in data one by one
        # if they do not exist in data already
        if features:
            for f in features:
                if f not in data:
                    data[f] = np.random.random(len(data))
        # identify how many features are to be added from the shape information
        elif dummy_features > 0:
            dummy_data = pd.DataFrame(np.random.random((len(data), dummy_features)))
            data = pd.concat([dummy_data, data], axis=1)
    else:
        dummy_data = np.random.random((len(data), dummy_features))
        data = np.concatenate([dummy_data, data], axis=1)

    return data, dummy_features
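
# Rough illustration of `conform_shape`: when only a subset of features is being
# inverse transformed, the missing columns are padded with random placeholder
# values so the data matches the shape the transformer was fitted on. Column
# names here are placeholders:
#
# >>> df = pd.DataFrame(np.arange(10).reshape(5, 2), columns=['a', 'b'])
# >>> padded, n_dummy = conform_shape(df, shape=(5, 4), features=['a', 'b', 'c', 'd'])
# >>> padded.shape, n_dummy
# ((5, 4), 2)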