# Source code for ai4water.preprocessing.transformations._wrapper
from typing import Union, List, Dict
from ai4water.backend import np, pd
from ai4water.preprocessing.transformations import Transformation
from ai4water.utils.utils import jsonize, deepcopy_dict_without_clone
class Transformations(object):
    """
    While the [Transformation][ai4water.preprocessing.transformations.Transformation]
    class is useful to apply a single transformation to a single data source, this
    class is helpful to apply multiple transformations to a single data or multiple
    transformations to multiple data. This class is especially designed to be applied
    as part of `model` inside the `fit`, `predict` or `evaluate` methods. The
    `fit_transform` method should be applied before feeding the data to the
    algorithm and `inverse_transform` method should be called after algorithm has
    worked with data.

    Examples
    --------
    >>> import numpy as np
    >>> from ai4water.preprocessing.transformations import Transformations
    >>> x = np.arange(50).reshape(25, 2)
    >>> transformer = Transformations(['a', 'b'], config=['minmax', 'zscore'])
    >>> x_ = transformer.fit_transform(x)
    >>> _x = transformer.inverse_transform(x_)
    ...
    ... # Apply multiple transformations on multiple arrays which are passed as list
    >>> transformer = Transformations([['a', 'b'], ['a', 'b']],
    ...                               config=['minmax', 'zscore'])
    >>> x1 = np.arange(50).reshape(25, 2)
    >>> x2 = np.arange(50, 100).reshape(25, 2)
    >>> x1_transformed = transformer.fit_transform([x1, x2])
    >>> _x1 = transformer.inverse_transform(x1_transformed)

    We can also do more complicated stuff as following

    >>> transformer = Transformations({'x1': ['a', 'b'], 'x2': ['a', 'b']},
    ...                               config={'x1': ['minmax', 'zscore'],
    ...                                       'x2': [{'method': 'log', 'features': ['a', 'b']},
    ...                                              {'method': 'robust', 'features': ['a', 'b']}]
    ...                                       })
    >>> x1 = np.arange(20).reshape(10, 2)
    >>> x2 = np.arange(100, 120).reshape(10, 2)
    >>> x = {'x1': x1, 'x2': x2}
    >>> x_transformed = transformer.fit_transform(x)
    >>> _x = transformer.inverse_transform(x_transformed)

    In above example we apply `minmax` and `zscore` transformations on x1
    and `log` and `robust` transformations on x2 array
    """
[docs] def __init__(
self,
feature_names: Union[list, dict],
config: Union[str, list, dict] = None,
):
"""
Arguments:
feature_names:
names of features in data
config:
Determines the type of transformation to be applied on data.
It can be one of the following types
- `string` when you want to apply single transformation
>>> config='minmax'
- `dict`: to pass additional arguments to the :py:class:`ai4water.preprocessing.Transformation`
class
>>> config = {"method": 'log', 'treat_negatives': True, 'features': ['features']}
- `list` when we want to apply multiple transformations
>>> ['minmax', 'zscore']
or
>>> [{"method": 'log', 'treat_negatives': True, 'features': ['features']},
>>> {'method': 'sqrt', 'treat_negatives': True}]
"""
self.names = feature_names
self.t_config = config
self.without_fit = False
def _fetch_transformation(self, data):
config = self.t_config
if isinstance(data, list):
if isinstance(config, str):
config = [config for _ in range(len(data))]
elif isinstance(data, dict):
if isinstance(config, str):
config = {k:config for k in data.keys()}
return config
def _check_features(self):
if self.is_numpy_:
assert isinstance(self.names, list), f"""
feature_names are of type {type(self.names)}"""
elif self.is_list_:
for idx, n in enumerate(self.names):
assert isinstance(n, list), f"""
feature_names for {idx} source is {type(n)}. It should be list"""
elif self.is_dict_:
assert isinstance(self.names, dict), f"""
feature_names are of type {type(self.names)}"""
for src_name, n in self.names.items():
assert n.__class__.__name__ in ["ListWrapper", 'list']
return
[docs] def transform(self, data:Union[np.ndarray, List, Dict]):
"""Transforms the data according the the `config`.
Arguments:
data:
The data on which to apply transformations. It can be one of following
- a (2d or 3d) numpy array
- a list of numpy arrays
- a dictionary of numpy arrays
Returns:
The transformed data which has same type and dimensions as the input data
"""
if self.t_config is None: # if no transformation then just return the data as it is
return data
orignal_data_type = data.__class__.__name__
assert hasattr(self, 'transformers_'), f"transformer has not been fitted yet"
assert len(self.transformers_) > 0
# first check that data matches config
self._check_features()
# then apply transformation
data = self._transform(data)
# now pack it in original form
assert data.__class__.__name__ == orignal_data_type, f"""
type changed from {orignal_data_type} to {data.__class__.__name__}
"""
#self._assert_same_dim(self, orignal_data, data)
return data
[docs] def fit_transform(self, data:Union[np.ndarray, List, Dict]):
"""Transforms the data according the the `config`.
Arguments:
data:
The data on which to apply transformations. It can be one of following
- a (2d or 3d) numpy array
- a list of numpy arrays
- a dictionary of numpy arrays
Returns:
The transformed data which has same type and dimensions as the input data
"""
self.is_numpy_ = False
self.is_list_ = False
self.is_dict_ =False
self.transformers_ = {}
if self.t_config is None: # if no transformation then just return the data as it is
return data
orignal_data_type = data.__class__.__name__
if isinstance(data, np.ndarray):
self.is_numpy_ = True
elif isinstance(data, list):
self.is_list_ = True
elif isinstance(data, dict):
self.is_dict_ = True
else:
raise ValueError(f"invalid data of type {data.__class__.__name__}")
# first check that data matches config
self._check_features()
# then apply transformation
data = self._fit_transform(data)
# now pack it in original form
assert data.__class__.__name__ == orignal_data_type, f"""
type changed from {orignal_data_type} to {data.__class__.__name__}
"""
return data
def _transform_2d(self, data, columns, transformation=None, key="5"):
data = data.copy()
if transformation:
if isinstance(transformation, dict):
config = self.transformers_[key]
transformer = Transformation.from_config(config)
data = transformer.transform(pd.DataFrame(data, columns=columns))
# we want to apply multiple transformations
elif isinstance(transformation, list):
for idx, trans in enumerate(transformation):
if isinstance(trans, str):
config = self.transformers_[f'{key}_{trans}_{idx}']
transformer = Transformation.from_config(config)
data = transformer.transform(pd.DataFrame(data, columns=columns))
elif trans['method'] is not None:
config = self.transformers_[f'{key}_{trans["method"]}_{idx}']
transformer = Transformation.from_config(config)
data = transformer.transform(pd.DataFrame(data, columns=columns))
else:
assert isinstance(transformation, str)
transformer = Transformation.from_config(self.transformers_[key])
data = transformer.transform(pd.DataFrame(data, columns=columns))
data = data.values
return data
def _fit_transform_2d(self, data, columns, transformation=None, key="5"):
"""performs transformation on single data 2D source"""
# it is better to make a copy here because all the operations
# on data happen after this.
data = data.copy()
transformers = {}
if transformation:
if isinstance(transformation, dict):
transformer = Transformation(**transformation)
data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
transformers[key] = transformer.config()
# we want to apply multiple transformations
elif isinstance(transformation, list):
for idx, trans in enumerate(transformation):
if isinstance(trans, str):
transformer = Transformation(method=trans)
data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
transformers[f'{key}_{trans}_{idx}'] = transformer.config()
elif trans['method'] is not None:
transformer = Transformation(**trans)
data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
transformers[f'{key}_{trans["method"]}_{idx}'] = transformer.config()
else:
raise ValueError(f"{trans['method']} is invalid transformation")
else:
assert isinstance(transformation, str)
transformer = Transformation(method=transformation)
data = transformer.fit_transform(pd.DataFrame(data, columns=columns))
transformers[key] = transformer.config()
data = data.values
self.transformers_.update(transformers)
return data
def __transform(self, data, feature_names, transformation=None, key="5"):
"""performs transformation on single data source without fiting on it first.
In case of 3d array, the shape is supposed to be following
(num_examples, time_steps, num_features)
Therefore, each time_step is extracted and transfomred individually
for example with time_steps of 2, two 2d arrays will be extracted and
transformed individually
(num_examples, 0,num_features), (num_examples, 1, num_features)
"""
if data.ndim == 3:
_data = np.full(data.shape, np.nan)
for time_step in range(data.shape[1]):
_data[:, time_step] = self._transform_2d(data[:, time_step],
feature_names,
transformation,
key=f"{key}_{time_step}")
else:
_data = self._transform_2d(data, feature_names, transformation, key=key)
return _data
def __fit_transform(self, data, feature_names, transformation=None, key="5"):
"""performs transformation on single data source
In case of 3d array, the shape is supposed to be following
(num_examples, time_steps, num_features)
Therefore, each time_step is extracted and transfomred individually
for example with time_steps of 2, two 2d arrays will be extracted and
transformed individually
(num_examples, 0,num_features), (num_examples, 1, num_features)
"""
if data.ndim == 3:
_data = np.full(data.shape, np.nan)
for time_step in range(data.shape[1]):
_data[:, time_step] = self._fit_transform_2d(data[:, time_step],
feature_names,
transformation,
key=f"{key}_{time_step}")
else:
_data = self._fit_transform_2d(data, feature_names, transformation, key=key)
return _data
def _transform(self, data, key="5"):
"""performs transformation on every data source in data"""
transformation = self._fetch_transformation(data)
if self.is_numpy_:
_data = self.__transform(data, self.names, transformation, key)
elif self.is_list_:
_data = []
for idx, array in enumerate(data):
_data.append(self.__transform(array,
self.names[idx],
transformation[idx],
key=f"{key}_{idx}")
)
else:
_data = {}
for src_name, array in data.items():
_data[src_name] = self.__transform(array,
self.names[src_name],
transformation[src_name],
f"{key}_{src_name}")
return _data
def _fit_transform(self, data, key="5"):
"""performs transformation on every data source in data"""
transformation = self._fetch_transformation(data)
if self.is_numpy_:
_data = self.__fit_transform(data, self.names, transformation, key)
elif self.is_list_:
_data = []
for idx, array in enumerate(data):
_data.append(self.__fit_transform(array,
self.names[idx],
transformation[idx],
key=f"{key}_{idx}")
)
else:
_data = {}
for src_name, array in data.items():
_data[src_name] = self.__fit_transform(array,
self.names[src_name],
transformation[src_name],
f"{key}_{src_name}")
return _data
[docs] def inverse_transform(self, data, postprocess=True):
"""inverse transforms data where data can be dictionary, list or numpy
array.
Arguments:
data:
the data which is to be inverse transformed. The output of
`fit_transform` method.
postprocess : bool
Returns:
The original data which was given to `fit_transform` method.
"""
if not hasattr(self, 'transformers_'):
raise ValueError(f"Transformations class has not been fitted yet")
return self._inverse_transform(data, postprocess=postprocess)
[docs] def inverse_transform_without_fit(self, data, postprocess=True)->np.ndarray:
data = np.array(data)
if data.ndim == 1:
data = data.reshape(-1, 1)
assert isinstance(self.names, list)
assert data.shape[-1] == len(self.names)
data = pd.DataFrame(data, columns=self.names)
kwargs = {}
if isinstance(self.t_config, str):
kwargs['method'] = self.t_config
elif isinstance(self.t_config, dict):
kwargs = self.t_config
elif isinstance(self.t_config, list):
assert len(self.t_config) == 1
t_config = self.t_config[0]
if isinstance(t_config, str):
kwargs['method'] = t_config
elif isinstance(t_config, dict):
kwargs = t_config
else:
raise ValueError(f"invalid type of t_config {t_config.__class__.__name__}")
else:
raise ValueError(f"invalid type of t_config {self.t_config.__class__.__name__}")
transformer = Transformation(**kwargs)
transformed_data = transformer.inverse_transform(data=data, postprocess=postprocess)
return transformed_data.values
def _inverse_transform(self, data, key="5", postprocess=True):
transformation = self._fetch_transformation(data)
if self.is_numpy_:
data = self.__inverse_transform(data,
self.names,
transformation,
key,
postprocess=postprocess)
elif self.is_list_:
assert isinstance(data, list)
_data = []
for idx, src in enumerate(data):
__data = self.__inverse_transform(src,
self.names[idx],
transformation[idx],
f'{key}_{idx}',
postprocess=postprocess)
_data.append(__data)
data = _data
elif self.is_dict_:
assert isinstance(data, dict)
_data = {}
for src_name, src in data.items():
_data[src_name] = self.__inverse_transform(src,
self.names[src_name],
transformation[src_name],
f'{key}_{src_name}',
postprocess=postprocess)
data = _data
return data
def __inverse_transform(self,
data,
feature_names,
transformation, key="5",
postprocess=True):
"""inverse transforms one data source which may 2d or 3d nd array"""
if data.ndim == 3:
_data = np.full(data.shape, np.nan)
for time_step in range(data.shape[1]):
_data[:, time_step] = self._inverse_transform_2d(
data[:, time_step],
columns=feature_names,
transformation=transformation,
key=f"{key}_{time_step}",
postprocess=postprocess)
else:
_data = self._inverse_transform_2d(data,
feature_names,
key,
transformation,
postprocess=postprocess)
return _data
def _inverse_transform_2d(self,
data,
columns,
key,
transformation,
postprocess=True)->np.ndarray:
"""inverse transforms one 2d array"""
data = pd.DataFrame(data.copy(), columns=columns)
if transformation is not None:
if isinstance(transformation, str):
if key not in self.transformers_:
raise ValueError(f"""
key `{key}` for inverse transformation not found. Available keys are {list(self.transformers_.keys())}""")
transformer = self.transformers_[key]
transformer, shape = transformer, transformer['shape']
original_shape = data.shape
transformer = Transformation.from_config(transformer)
transformed_data = transformer.inverse_transform(data, postprocess=postprocess)
data = transformed_data
elif isinstance(transformation, list):
# idx and trans both in reverse form
for idx, trans in reversed(list(enumerate(transformation))):
if isinstance(trans, str):
transformer = self.transformers_[f'{key}_{trans}_{idx}']
transformer, shape = transformer, transformer['shape']
transformer = Transformation.from_config(transformer)
data = transformer.inverse_transform(data=data, postprocess=postprocess)
elif trans['method'] is not None:
features = trans.get('features', columns)
# if any of the feature in data was transformed
if any([True if f in data else False for f in features]):
orig_cols = data.columns # copy teh columns in the original df
transformer = self.transformers_[f'{key}_{trans["method"]}_{idx}']
transformer, shape = transformer, transformer['shape']
data, dummy_features = conform_shape(data, shape, features) # get data to transform
transformer = Transformation.from_config(transformer)
transformed_data = transformer.inverse_transform(data=data,
postprocess=postprocess)
data = transformed_data[orig_cols] # remove the dummy data
elif isinstance(transformation, dict):
features = transformation.get('features', columns)
if any([True if f in data else False for f in features]):
orig_cols = data.columns
transformer = self.transformers_[key]
transformer, shape = transformer, transformer['shape']
data, dummy_features = conform_shape(data, shape, features=features)
transformer = Transformation.from_config(transformer)
transformed_data = transformer.inverse_transform(data=data, postprocess=postprocess)
data = transformed_data[orig_cols] # remove the dummy data
if data.__class__.__name__ == "DataFrame":
data = data.values # there is no need to return DataFrame
return data
[docs] def config(self)->dict:
"""returns a python dictionary which can be used to construct this class
in fitted form i.e as if the fit_transform method has already been applied.
Returns:
a dictionary from which `Transformations` class can be constructed
"""
return {
'transformers_': jsonize(self.transformers_),
"feature_names": self.names,
"config": self.t_config,
"is_numpy_": self.is_numpy_,
"is_dict_": self.is_dict_,
"is_list_": self.is_list_,
}
[docs] @classmethod
def from_config(cls, config:dict)->"Transformations":
"""constructs the Transformations class which may has already been fitted.
"""
config = deepcopy_dict_without_clone(config)
transformer = cls(config.pop('feature_names'), config.pop('config'))
for attr_name, attr_val in config.items():
setattr(cls, attr_name, attr_val)
return transformer
def conform_shape(data, shape, features=None):
    """Pads `data` so that it matches the shape recorded at fit time.

    A one-dimension mismatch is resolved by squeezing/expanding the last
    axis. Columns that were present at fit time but are missing from `data`
    are filled with random dummy values; the caller is expected to drop
    them again after the inverse transformation.

    Arguments:
        data: 2d array/DataFrame (or an off-by-one-dimension variant)
        shape: the shape the fitted transformer saw
        features: names of columns that must be present in a DataFrame

    Returns:
        tuple of (conformed data, number of dummy features implied by `shape`)
    """
    # if the difference is of only 1 dim, we resolve it
    if data.ndim > len(shape):
        data = np.squeeze(data, axis=-1)
    elif data.ndim < len(shape):
        data = np.expand_dims(data, axis=-1)

    assert data.ndim == len(shape), (
        f"original data had {len(shape)} while the new data has "
        f"{data.ndim} dimensions")

    # how many dummy features we have to add to match the shape
    dummy_features = shape[-1] - data.shape[-1]

    if data.__class__.__name__ in ['DataFrame', 'Series']:
        # we know what features must be in data, so put them in data one by one
        # if they do not exist in data already
        if features:
            for f in features:
                if f not in data:
                    data[f] = np.random.random(len(data))
        # identify how many features to be added by shape information
        elif dummy_features > 0:
            dummy_data = pd.DataFrame(np.random.random((len(data), dummy_features)))
            data = pd.concat([dummy_data, data], axis=1)
    else:
        dummy_data = np.random.random((len(data), dummy_features))
        data = np.concatenate([dummy_data, data], axis=1)

    return data, dummy_features