Data Splitting
This notebook describes how to split data into training, validation and test sets.
[24]:
import numpy as np
from ai4water.datasets import busan_beach
from ai4water.preprocessing import DataSet
from ai4water.utils.utils import get_version_info
[2]:
# Print the version of every library reported by ai4water's
# get_version_info(), so the outputs below can be tied to a known environment.
for lib, ver in get_version_info().items():
    print(lib, ver)
python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os nt
ai4water 1.07
lightgbm 3.3.1
tcn 3.4.0
catboost 0.26
xgboost 1.5.0
easy_mpl 0.21.3
SeqMetrics 1.3.3
tensorflow 2.7.0
keras.api._v2.keras 2.7.0
numpy 1.21.0
pandas 1.3.4
matplotlib 3.4.3
h5py 3.5.0
sklearn 1.0.1
shapefile 2.3.0
fiona 1.8.22
xarray 0.20.1
netCDF4 1.5.7
optuna 2.10.1
skopt 0.9.0
hyperopt 0.2.7
plotly 5.3.1
lime NotDefined
seaborn 0.11.2
[3]:
# Load the example dataset and display its (rows, columns) shape.
data = busan_beach()
data.shape
[3]:
(1446, 14)
[4]:
# Shape that remains once rows containing missing values are dropped.
# The result is not assigned, so `data` itself is not rebound here.
data.dropna().shape
[4]:
(218, 14)
[6]:
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (121, 13)
target shape: (121, 1)
********** Removing Examples with nan in labels **********
***** Validation *****
input_x shape: (31, 13)
target shape: (31, 1)
********** Removing Examples with nan in labels **********
***** Test *****
input_x shape: (66, 13)
target shape: (66, 1)
[6]:
218
train fraction
[7]:
# Inspect the train_fraction attribute of the current DataSet object `ds`.
# NOTE(review): in the split printed above, training + validation (121 + 31)
# make up roughly 0.7 of the 218 examples, so this fraction appears to cover
# both of them together — confirm against the ai4water documentation.
ds.train_fraction
[7]:
0.7
[9]:
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (174, 13)
target shape: (174, 1)
********** Removing Examples with nan in labels **********
***** Validation *****
input_x shape: (44, 13)
target shape: (44, 1)
***** Test *****
input_x shape: (0,)
target shape: (0,)
[9]:
218
val_fraction
[10]:
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (109, 13)
target shape: (109, 1)
********** Removing Examples with nan in labels **********
***** Validation *****
input_x shape: (109, 13)
target shape: (109, 1)
***** Test *****
input_x shape: (0,)
target shape: (0,)
[10]:
218
[11]:
********** Removing Examples with nan in labels **********
***** Training *****
input_x shape: (76, 13)
target shape: (76, 1)
********** Removing Examples with nan in labels **********
***** Validation *****
input_x shape: (76, 13)
target shape: (76, 1)
********** Removing Examples with nan in labels **********
***** Test *****
input_x shape: (66, 13)
target shape: (66, 1)
[11]:
218
random splitting
[12]:
# Display the first example of the training inputs.
train_x[0]
[12]:
array([ -22.245026 , 19.457182 , 34.00429 , 24.28 ,
0. , 0. , 0. , 6. ,
205.00667 , 1.6533333, 998.61334 , 1002.9133 ,
75.1 ], dtype=float32)
[13]:
# Display the target value that belongs to the first training example.
train_y[0]
[13]:
array([444866.9004])
[21]:
# Rebind `ds` to a DataSet built with split_random=True.
# NOTE(review): verbosity=0 presumably silences the per-split shape report
# that other cells print — confirm against the ai4water documentation.
ds = DataSet(data=data, split_random=True, verbosity=0)
# Fetch the (inputs, targets) pair for the training, validation and test splits.
train_x, train_y = ds.training_data()
val_x, val_y = ds.validation_data()
test_x, test_y = ds.test_data()
# Show the first training target so it can be compared with the value
# displayed before random splitting was enabled.
train_y[0]
[21]:
array([1363760.645])
[22]:
# Display the target value of the second training example.
train_y[1]
[22]:
array([2356366.032])
reproducibility
[18]:
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[1363760.645]
[19]:
[332194.7001]
[275100.6769]
[4881349.328]
[278787.0508]
[844769.5803]
[216767.3381]
[2391848.163]
[141357.2781]
[761979.6799]
[20083984.46]
[20]:
[774076.752]
[2060160.801]
[202449.1352]
[21289.67181]
[36045668.64]
[14976057.52]
[3291674.776]
[2356366.032]
[836261.1064]
[3256878.75]
splitting using indices
[25]:
[28]:
(40, 10, 168)
[29]:
# Display the first training target.
# NOTE(review): train_y is presumably reassigned by the preceding cells of
# this section, whose source is not visible here — confirm.
train_y[0]
[29]:
array([444866.9004])
splitting using intervals
[ ]: