Open In Colab

View Source on GitHub

Data Splitting

This notebook describes how to split data into training, validation, and test sets.

[24]:

# Imports: numpy for building index arrays, ai4water for the demo
# dataset, the DataSet splitter, and version reporting.
import numpy as np

from ai4water.datasets import busan_beach
from ai4water.preprocessing import DataSet
from ai4water.utils.utils import get_version_info
[2]:
# Print the version of every detected library, for reproducibility.
versions = get_version_info()
for library, version in versions.items():
    print(library, version)
python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os nt
ai4water 1.07
lightgbm 3.3.1
tcn 3.4.0
catboost 0.26
xgboost 1.5.0
easy_mpl 0.21.3
SeqMetrics 1.3.3
tensorflow 2.7.0
keras.api._v2.keras 2.7.0
numpy 1.21.0
pandas 1.3.4
matplotlib 3.4.3
h5py 3.5.0
sklearn 1.0.1
shapefile 2.3.0
fiona 1.8.22
xarray 0.20.1
netCDF4 1.5.7
optuna 2.10.1
skopt 0.9.0
hyperopt 0.2.7
plotly 5.3.1
lime NotDefined
seaborn 0.11.2
[3]:
# Load the Busan beach dataset as a DataFrame and show its
# dimensions (rows, columns).
data = busan_beach()
data.shape
[3]:
(1446, 14)
[4]:
# Shape after dropping rows with any NaN — these 218 complete examples
# are what the splitter's "Removing Examples with nan in labels" step
# distributes across the three sets below.
data.dropna().shape
[4]:
(218, 14)
[6]:
# Default split: a 0.7 training fraction (see `ds.train_fraction` below),
# with the training portion further divided into training and validation.
ds = DataSet(data=data)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (121, 13)
target shape:  (121, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (31, 13)
target shape:  (31, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
[6]:
218

train fraction

[7]:
ds.train_fraction
[7]:
0.7
[9]:
# With train_fraction=1.0 nothing is reserved for the test set; all
# examples go into training + validation.
ds = DataSet(data=data, train_fraction=1.0)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (174, 13)
target shape:  (174, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (44, 13)
target shape:  (44, 1)
***** Test *****
input_x shape:  (0,)
target shape:  (0,)
[9]:
218

val_fraction

[10]:
# val_fraction=0.5 divides the training portion evenly between the
# training and validation sets (here the whole data, since
# train_fraction=1.0 leaves no test set).
ds = DataSet(data=data, train_fraction=1.0, val_fraction=0.5)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (109, 13)
target shape:  (109, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (109, 13)
target shape:  (109, 1)
***** Test *****
input_x shape:  (0,)
target shape:  (0,)
[10]:
218
[11]:
# Combining both fractions: 70% of the data is split evenly into
# training and validation; the remaining 30% becomes the test set.
ds = DataSet(data=data, train_fraction=0.7, val_fraction=0.5)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

# Total examples across the three splits.
sum(len(split) for split in (train_x, val_x, test_x))

********** Removing Examples with nan in labels  **********

***** Training *****
input_x shape:  (76, 13)
target shape:  (76, 1)

********** Removing Examples with nan in labels  **********

***** Validation *****
input_x shape:  (76, 13)
target shape:  (76, 1)

********** Removing Examples with nan in labels  **********

***** Test *****
input_x shape:  (66, 13)
target shape:  (66, 1)
[11]:
218

random splitting

[12]:
# First training input example (13 float32 features).
train_x[0]
[12]:
array([ -22.245026 ,   19.457182 ,   34.00429  ,   24.28     ,
          0.       ,    0.       ,    0.       ,    6.       ,
        205.00667  ,    1.6533333,  998.61334  , 1002.9133   ,
         75.1      ], dtype=float32)
[13]:
# Target value of the first training example.
train_y[0]
[13]:
array([444866.9004])
[21]:
# split_random=True shuffles examples before splitting; the first
# training target now differs from the sequential split shown earlier.
ds = DataSet(data=data, split_random=True, verbosity=0)

(train_x, train_y), (val_x, val_y), (test_x, test_y) = (
    ds.training_data(),
    ds.validation_data(),
    ds.test_data(),
)

train_y[0]
[21]:
array([1363760.645])
[22]:
# Second training target under the random split.
train_y[1]
[22]:
array([2356366.032])

reproducibility

[18]:
# Random splitting is reproducible by default: building the DataSet
# repeatedly yields the same first training target every time.
for _ in range(10):
    ds = DataSet(data=data, split_random=True, verbosity=0)
    train_x, train_y = ds.training_data()
    val_x, val_y = ds.validation_data()
    test_x, test_y = ds.test_data()
    print(train_y[0])
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[19]:
# seed=None removes the fixed seed, so every run produces a different
# random split (note the varying first training target below).
for _ in range(10):
    ds = DataSet(data=data, split_random=True, seed=None, verbosity=0)
    train_x, train_y = ds.training_data()
    val_x, val_y = ds.validation_data()
    test_x, test_y = ds.test_data()
    print(train_y[0])
[332194.7001]
[275100.6769]
[4881349.328]
[278787.0508]
[844769.5803]
[216767.3381]
[2391848.163]
[141357.2781]
[761979.6799]
[20083984.46]
[20]:
# An explicit seed gives a different — but repeatable — split per
# seed value.
for seed in range(10):
    ds = DataSet(data=data, split_random=True, seed=seed, verbosity=0)
    train_x, train_y = ds.training_data()
    val_x, val_y = ds.validation_data()
    test_x, test_y = ds.test_data()
    print(train_y[0])
[774076.752]
[2060160.801]
[202449.1352]
[21289.67181]
[36045668.64]
[14976057.52]
[3291674.776]
[2356366.032]
[836261.1064]
[3256878.75]

splitting using indices

[25]:
# Assign the first 50 examples to training explicitly via the
# `indices` argument; the splitter then carves validation out of those
# 50 and sends the rest of the usable data to the test set
# (see the lengths printed in the next cell).
training_indices = np.arange(50)

ds = DataSet(
    data=data,
    indices={'training': training_indices},
    verbosity=0,
)

train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()

[28]:
# The 50 requested indices end up split 40/10 between training and
# validation; the remaining usable examples form the test set.
len(train_x), len(val_x), len(test_x)
[28]:
(40, 10, 168)
[29]:
# With in-order indices, the first training target matches the one from
# the sequential (non-random) split shown earlier.
train_y[0]
[29]:
array([444866.9004])

splitting using intervals

[ ]: