Open In Colab

View Source on GitHub

Data Splitting

This notebook describes how to split data into training, validation, and test sets.

[24]:

# Imports: numpy for building index arrays, ai4water for the demo
# dataset, the DataSet splitter, and version reporting.
import numpy as np

from ai4water.datasets import busan_beach
from ai4water.preprocessing import DataSet
from ai4water.utils.utils import get_version_info
[2]:
# Print the version of every detected library, for reproducibility.
versions = get_version_info()
for library, version in versions.items():
    print(library, version)
python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os nt
ai4water 1.07
lightgbm 3.3.1
tcn 3.4.0
catboost 0.26
xgboost 1.5.0
easy_mpl 0.21.3
SeqMetrics 1.3.3
tensorflow 2.7.0
keras.api._v2.keras 2.7.0
numpy 1.21.0
pandas 1.3.4
matplotlib 3.4.3
h5py 3.5.0
sklearn 1.0.1
shapefile 2.3.0
fiona 1.8.22
xarray 0.20.1
netCDF4 1.5.7
optuna 2.10.1
skopt 0.9.0
hyperopt 0.2.7
plotly 5.3.1
lime NotDefined
seaborn 0.11.2
[3]:
# Load the Busan beach dataset as a DataFrame and show its
# dimensions (rows, columns).
data = busan_beach()
data.shape
[3]:
(1446, 14)
[4]:
# Shape after dropping rows with any NaN — these 218 complete examples
# are what the splitter's "Removing Examples with nan in labels" step
# distributes across the three sets below.
data.dropna().shape
[4]:
(218, 14)
[6]:
# Default split: a 0.7 training fraction (see `ds.train_fraction` below),
# with the training portion further divided into training and validation.
ds = DataSet(data=data)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
[6]:
218

train fraction

[7]:
ds.train_fraction
[7]:
0.7
[9]:
# With train_fraction=1.0 nothing is reserved for the test set; all
# examples go into training + validation.
ds = DataSet(data=data, train_fraction=1.0)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (174, 13)
target shape:  (174, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (44, 13)
target shape:  (44, 1)
***** Test *****
input_x shape:  (0,)
target shape:  (0,)
[9]:
218

val_fraction

[10]:
# val_fraction=0.5 divides the training portion evenly between the
# training and validation sets (here the whole data, since
# train_fraction=1.0 leaves no test set).
ds = DataSet(data=data, train_fraction=1.0, val_fraction=0.5)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (109, 13)
target shape:  (109, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (109, 13)
target shape:  (109, 1)
***** Test *****
input_x shape:  (0,)
target shape:  (0,)
[10]:
218
[11]:
# Combining both fractions: 70% of the data is split evenly into
# training and validation; the remaining 30% becomes the test set.
ds = DataSet(data=data, train_fraction=0.7, val_fraction=0.5)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (76, 13)
target shape:  (76, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (76, 13)
target shape:  (76, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
[11]:
218

random splitting

[12]:
# First training input example (13 float32 features).
train_x[0]
[12]:
array([ -22.245026 ,   19.457182 ,   34.00429  ,   24.28     ,
          0.       ,    0.       ,    0.       ,    6.       ,
        205.00667  ,    1.6533333,  998.61334  , 1002.9133   ,
         75.1      ], dtype=float32)
[13]:
# Target value of the first training example.
train_y[0]
[13]:
array([444866.9004])
[21]:
# split_random=True shuffles examples before splitting; the first
# training target now differs from the sequential split shown earlier.
ds = DataSet(data=data, split_random=True, verbosity=0)

(train_x, train_y), (val_x, val_y), (test_x, test_y) = (
    ds.training_data(),
    ds.validation_data(),
    ds.test_data(),
)

train_y[0]
[21]:
array([1363760.645])
[22]:
# Second training target under the random split.
train_y[1]
[22]:
array([2356366.032])

reproducibility

[18]:
# Random splitting is reproducible by default: building the DataSet
# repeatedly yields the same first training target every time.
for _ in range(10):
    ds = DataSet(data=data, split_random=True, verbosity=0)
    train_x, train_y = ds.training_data()
    val_x, val_y = ds.validation_data()
    test_x, test_y = ds.test_data()
    print(train_y[0])
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[19]:
# seed=None removes the fixed seed, so every run produces a different
# random split (note the varying first training target below).
for _ in range(10):
    ds = DataSet(data=data, split_random=True, seed=None, verbosity=0)
    train_x, train_y = ds.training_data()
    val_x, val_y = ds.validation_data()
    test_x, test_y = ds.test_data()
    print(train_y[0])
[332194.7001]
[275100.6769]
[4881349.328]
[278787.0508]
[844769.5803]
[216767.3381]
[2391848.163]
[141357.2781]
[761979.6799]
[20083984.46]
[20]:
# An explicit seed gives a different — but repeatable — split per
# seed value.
for seed in range(10):
    ds = DataSet(data=data, split_random=True, seed=seed, verbosity=0)
    train_x, train_y = ds.training_data()
    val_x, val_y = ds.validation_data()
    test_x, test_y = ds.test_data()
    print(train_y[0])
[774076.752]
[2060160.801]
[202449.1352]
[21289.67181]
[36045668.64]
[14976057.52]
[3291674.776]
[2356366.032]
[836261.1064]
[3256878.75]

splitting using indices

[25]:
# Assign the first 50 examples to training explicitly via the
# `indices` argument; the splitter then carves validation out of those
# 50 and sends the rest of the usable data to the test set
# (see the lengths printed in the next cell).
training_indices = np.arange(50)

ds = DataSet(
    data=data,
    indices={'training': training_indices},
    verbosity=0,
)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

[28]:
# The 50 requested indices end up split 40/10 between training and
# validation; the remaining usable examples form the test set.
len(train_x), len(val_x), len(test_x)
[28]:
(40, 10, 168)
[29]:
# With in-order indices, the first training target matches the one from
# the sequential (non-random) split shown earlier.
train_y[0]
[29]:
array([444866.9004])

splitting using intervals

[ ]: