Tab Transformer

This notebook demonstrates the use of the TabTransformer neural network architecture for a tabular regression problem. TabTransformer embeds the categorical features, passes the embeddings through a stack of transformer (multi-head self-attention) blocks, and concatenates the result with the layer-normalized numeric features before a final MLP produces the prediction.

[2]:
try:
    import ai4water
except (ImportError, ModuleNotFoundError):
    !pip install ai4water[tf2]
[3]:
import numpy as np

from ai4water import Model
from ai4water.utils.utils import TrainTestSplit
from ai4water.models.utils import gen_cat_vocab
from ai4water.models import TabTransformer
from ai4water.utils.utils import get_version_info
from ai4water.datasets import mg_photodegradation

from easy_mpl import bar_chart
[4]:
for lib, ver in get_version_info().items():
    print(lib, ver)
python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:20:16) [MSC v.1916 64 bit (AMD64)]
os nt
ai4water 1.06
lightgbm 3.3.1
tcn 3.4.0
catboost 0.26
xgboost 1.5.0
easy_mpl 0.21.3
SeqMetrics 1.3.3
tensorflow 2.7.0
keras.api._v2.keras 2.7.0
numpy 1.21.0
pandas 1.3.4
matplotlib 3.4.3
h5py 3.5.0
sklearn 1.0.1
shapefile 2.3.0
fiona 1.8.22
xarray 0.20.1
netCDF4 1.5.7
optuna 2.10.1
skopt 0.9.0
hyperopt 0.2.7
plotly 5.3.1
lime NotDefined
seaborn 0.11.2
[5]:
data, *_ = mg_photodegradation()
[6]:
data.shape
[6]:
(1200, 12)
[7]:
print(data.head())
   Surface area  Pore Volume  Catalyst_loading (g/L)  Light_intensity (W)  \
0           0.0          0.0                     0.0                  105
1           0.0          0.0                     0.0                  105
2           0.0          0.0                     0.0                  105
3           0.0          0.0                     0.0                  105
4           0.0          0.0                     0.0                  105

   time (min)  solution_pH  HA (mg/L)  Ci (mg/L)  Cf (mg/L) Catalyst_type  \
0           0         5.45          0         10      10.00   no catalyst
1          30         5.45          0         10       9.98   no catalyst
2          60         5.45          0         10       9.96   no catalyst
3          90         5.45          0         10       9.94   no catalyst
4         120         5.45          0         10       9.87   no catalyst

          Anions  Efficiency (%)
0  without Anion             0.0
1  without Anion             0.2
2  without Anion             0.4
3  without Anion             0.6
4  without Anion             1.3
[8]:
NUMERIC_FEATURES = data.columns.to_list()[0:8]
CAT_FEATURES = ["Catalyst_type", "Anions"]
LABEL = "Efficiency (%)"


splitter = TrainTestSplit(seed=313)

data[NUMERIC_FEATURES] = data[NUMERIC_FEATURES].astype(float)
data[CAT_FEATURES] = data[CAT_FEATURES].astype(str)
data[LABEL] = data[LABEL].astype(float)

train_data, test_data, _, _ = splitter.split_by_random(data)
train_data.shape, test_data.shape

[8]:
((840, 12), (360, 12))
[9]:
# create vocabulary of unique values of categorical features
cat_vocabulary = gen_cat_vocab(data)

cat_vocabulary
[9]:
{'Catalyst_type': ['0.25 wt% Pt-BFO',
  '0.5 wt% Pd-BFO',
  '0.5 wt% Pt-BFO',
  '1 wt% Ag-BFO',
  '1 wt% Pd-BFO',
  '1 wt% Pt-BFO',
  '2 wt% Ag-BFO',
  '2 wt% Pd-BFO',
  '2 wt% Pt-BFO',
  '3 wt% Ag-BFO',
  '3 wt% Pd-BFO',
  '4 wt% Ag-BFO',
  'commercial TiO2',
  'no catalyst',
  'pure BFO'],
 'Anions': ['Na2HPO4', 'Na2SO4', 'NaCO3', 'NaCl', 'NaHCO3', 'without Anion']}
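
For reference, an equivalent vocabulary can be built by hand from the unique values of each categorical column. The snippet below is only a rough sketch of what gen_cat_vocab appears to produce for this dataset, not its actual implementation.

[ ]:
# hand-rolled equivalent for this dataset
# (assumption: the vocabulary is simply the sorted unique values of each string column)
manual_vocab = {col: sorted(data[col].unique().tolist()) for col in CAT_FEATURES}
print(manual_vocab)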
[10]:
train_x = [train_data[NUMERIC_FEATURES].values, train_data[CAT_FEATURES].values]
test_x = [test_data[NUMERIC_FEATURES].values, test_data[CAT_FEATURES].values]
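
The model expects its input as a list of two arrays, the numeric features first and the categorical features second. An optional sanity check on the shapes, based on the 840/360 split above:

[ ]:
# expected: [(840, 8), (840, 2)] for train_x and [(360, 8), (360, 2)] for test_x
print([a.shape for a in train_x])
print([a.shape for a in test_x])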
[11]:
depth = 3
num_heads = 4
hidden_units = 16
final_mlp_units = [84, 42]
num_numeric_features = len(NUMERIC_FEATURES)
[18]:
model = Model(model=TabTransformer(
    cat_vocabulary=cat_vocabulary,
    num_numeric_features=num_numeric_features,
    hidden_units=hidden_units,
    final_mlp_units=final_mlp_units,
    depth=depth,
    num_heads=num_heads,
))

            building DL model for
            regression problem using Model
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to
==================================================================================================
 Input_cat (InputLayer)         [(None, 2)]          0           []

 cat_embeddings (CatEmbeddings)  (None, 2, 16)       336         ['Input_cat[0][0]']

 layer_normalization_1 (LayerNo  (None, 2, 16)       32          ['cat_embeddings[0][0]']
 rmalization)

 multi_head_attention (MultiHea  ((None, 2, 16),     4304        ['layer_normalization_1[0][0]',
 dAttention)                     (None, 4, 2, 2))                 'layer_normalization_1[0][0]']

 add (Add)                      (None, 2, 16)        0           ['layer_normalization_1[0][0]',
                                                                  'multi_head_attention[0][0]']

 sequential (Sequential)        (None, 2, 16)        304         ['add[0][0]']

 add_1 (Add)                    (None, 2, 16)        0           ['sequential[0][0]',
                                                                  'add[0][0]']

 layer_normalization_3 (LayerNo  (None, 2, 16)       32          ['add_1[0][0]']
 rmalization)

 layer_normalization_4 (LayerNo  (None, 2, 16)       32          ['layer_normalization_3[0][0]']
 rmalization)

 multi_head_attention_1 (MultiH  ((None, 2, 16),     4304        ['layer_normalization_4[0][0]',
 eadAttention)                   (None, 4, 2, 2))                 'layer_normalization_4[0][0]']

 add_2 (Add)                    (None, 2, 16)        0           ['layer_normalization_4[0][0]',
                                                                  'multi_head_attention_1[0][0]']

 sequential_1 (Sequential)      (None, 2, 16)        304         ['add_2[0][0]']

 add_3 (Add)                    (None, 2, 16)        0           ['sequential_1[0][0]',
                                                                  'add_2[0][0]']

 layer_normalization_6 (LayerNo  (None, 2, 16)       32          ['add_3[0][0]']
 rmalization)

 layer_normalization_7 (LayerNo  (None, 2, 16)       32          ['layer_normalization_6[0][0]']
 rmalization)

 multi_head_attention_2 (MultiH  ((None, 2, 16),     4304        ['layer_normalization_7[0][0]',
 eadAttention)                   (None, 4, 2, 2))                 'layer_normalization_7[0][0]']

 add_4 (Add)                    (None, 2, 16)        0           ['layer_normalization_7[0][0]',
                                                                  'multi_head_attention_2[0][0]']

 sequential_2 (Sequential)      (None, 2, 16)        304         ['add_4[0][0]']

 add_5 (Add)                    (None, 2, 16)        0           ['sequential_2[0][0]',
                                                                  'add_4[0][0]']

 Input_num (InputLayer)         [(None, 8)]          0           []

 layer_normalization_9 (LayerNo  (None, 2, 16)       32          ['add_5[0][0]']
 rmalization)

 layer_normalization (LayerNorm  (None, 8)           16          ['Input_num[0][0]']
 alization)

 flatten (Flatten)              (None, 32)           0           ['layer_normalization_9[0][0]']

 concatenate (Concatenate)      (None, 40)           0           ['layer_normalization[0][0]',
                                                                  'flatten[0][0]']

 MLP (Sequential)               (None, 16)           816         ['concatenate[0][0]']

 Dense_out (Dense)              (None, 1)            17          ['MLP[0][0]']

==================================================================================================
Total params: 15,201
Trainable params: 15,121
Non-trainable params: 80
__________________________________________________________________________________________________
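
As a quick check on the summary above, the parameter count of the cat_embeddings layer follows directly from the vocabulary sizes and the embedding dimension (hidden_units). The sketch below assumes one embedding vector per category value; it is only an illustrative check, not part of the original notebook.

[ ]:
# 15 catalyst types + 6 anion values = 21 embedding vectors of size 16 -> 336 parameters,
# matching the cat_embeddings entry in the summary (assumption: one vector per category value)
vocab_size = sum(len(v) for v in cat_vocabulary.values())
print(vocab_size, vocab_size * hidden_units)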
[13]:
_ = model.fit(x=train_x, y=train_data[LABEL].values,
              validation_data=(test_x, test_data[LABEL].values),
              epochs=500, verbose=0)
assigning name Input_num to IteratorGetNext:0 with shape (None, 8)
assigning name Input_cat to IteratorGetNext:1 with shape (None, 2)
assigning name Input_num to IteratorGetNext:0 with shape (None, 8)
assigning name Input_cat to IteratorGetNext:1 with shape (None, 2)
assigning name Input_num to IteratorGetNext:0 with shape (None, 8)
assigning name Input_cat to IteratorGetNext:1 with shape (None, 2)
[Figure: training and validation loss curves]
********** Successfully loaded weights from weights_456_49.76471.hdf5 file **********
[14]:
model.evaluate(x=train_x, y=train_data[LABEL].values, metrics=["r2", "r2_score", "rmse"])
assigning name Input_num to IteratorGetNext:0 with shape (None, 8)
assigning name Input_cat to IteratorGetNext:1 with shape (None, 2)
27/27 [==============================] - 0s 2ms/step
[14]:
{'r2': 0.9548887536801427,
 'r2_score': 0.9534963094014572,
 'rmse': 6.893919813820322}
[15]:
model.evaluate(x=test_x, y=test_data[LABEL].values, metrics=["r2", "r2_score", "rmse"])
12/12 [==============================] - 0s 2ms/step
[15]:
{'r2': 0.952070663191044,
 'r2_score': 0.9510105449344268,
 'rmse': 7.054410196882702}
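The same metrics can also be reproduced outside of model.evaluate, for example with the SeqMetrics package listed in the version info above. This is a minimal sketch, assuming model.predict returns the raw predictions for the given inputs:

[ ]:
# independent check of the test-set metrics (illustrative sketch)
from SeqMetrics import RegressionMetrics

test_pred = model.predict(x=test_x)
errors = RegressionMetrics(test_data[LABEL].values, test_pred)
print(errors.r2(), errors.rmse())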
[34]:

weights = model.get_intermediate_output('multi_head_attention', train_x)

print(f"found {len(weights)} layers with multi-head-attention weights named {weights.keys()}")

importances = []
for k, v in weights.items():
    # each value is a (layer output, attention scores) tuple
    out, importance = v
    # attention scores have shape (batch, heads, queries, keys); keep the scores for the first query
    importance = importance[:, :, 0, :]
    # sum over the heads -> (batch, num_cat_features)
    importances.append(np.sum(importance, axis=1))

# average over the transformer blocks and attention heads
atten_weights = np.sum(np.stack(importances), axis=0) / (depth * num_heads)
atten_weights.shape
found 3 layers with multi-head-attention weights named dict_keys(['multi_head_attention', 'multi_head_attention_1', 'multi_head_attention_2'])
[34]:
(840, 2)
[35]:

# total attention received by each categorical feature across all training samples
importance = np.sum(atten_weights, axis=0)
importance
[35]:
array([423.76852, 416.23145], dtype=float32)
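
Since the attention weights of each sample are averaged over the three transformer blocks and four heads, each row of atten_weights sums to roughly one, so the two column totals above add up to approximately the number of training samples (423.77 + 416.23 ≈ 840). A quick, optional check:

[ ]:
# the per-sample averaged attention weights sum to ~1, so the column totals sum to ~len(train_data)
print(importance.sum(), len(train_data))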
[46]:
_ = bar_chart(importance/sum(importance),
              labels=CAT_FEATURES,
              orient="v",
              ax_kws=dict(ylabel="Relative Importance", ylabel_kws={"fontsize": 12, "weight": "bold"},
                          xlabel="Cat Features", xlabel_kws={"fontsize": 12, "weight": "bold"}),
              color="salmon"
              )
[Figure: bar chart of relative importance of the categorical features derived from the attention weights]