1. Exploratory Data Analysis

import matplotlib.pyplot as plt

from ai4water.eda import EDA
from ai4water.utils import TrainTestSplit

from easy_mpl import hist
from easy_mpl import plot
from easy_mpl import scatter
from easy_mpl import boxplot
from easy_mpl.utils import create_subplots
from easy_mpl.utils import map_array_to_cmap, process_cbar

from utils import read_data
from utils import COLUMN_MAPS
from utils import set_rcParams, version_info

/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/sklearn/experimental/enable_hist_gradient_boosting.py:15: UserWarning: Since version 1.0, it is not needed to import enable_hist_gradient_boosting anymore. HistGradientBoostingClassifier and HistGradientBoostingRegressor are now stable and can be normally imported from sklearn.ensemble.
  warnings.warn(
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_clustering.py:35: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_clustering.py:54: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_clustering.py:63: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _reverse_window(order, start, length):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_clustering.py:69: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _reverse_window_score_gain(masks, order, start, length):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_clustering.py:77: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _mask_delta_score(m1, m2):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/links.py:5: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def identity(x):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/links.py:10: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _identity_inverse(x):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/links.py:15: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def logit(x):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/links.py:20: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _logit_inverse(x):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_masked_model.py:363: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_masked_model.py:385: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_masked_model.py:428: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/utils/_masked_model.py:439: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/maskers/_tabular.py:186: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/maskers/_tabular.py:197: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/maskers/_image.py:175: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def _jit_build_partition_tree(xmin, xmax, ymin, ymax, zmin, zmax, total_ywidth, total_zwidth, M, clustering, q):
/home/docs/checkouts/readthedocs.org/user_builds/xyzxyzxyz/envs/latest/lib/python3.9/site-packages/shap/explainers/_partition.py:676: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
  def lower_credit(i, value, M, values, clustering):
The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.
IPython could not be loaded!
datasets module is deprecated. Please install water-datasets and import
              corresponding dataset from there.

# Print the name and version string of each entry reported by ``version_info``.
versions = version_info()
for lib_name in versions:
    print(lib_name, versions[lib_name])

python 3.9.20 (main, Nov  5 2024, 16:07:55)
[GCC 11.4.0]
os posix
ai4water 1.07
easy_mpl 0.21.4
SeqMetrics 2.0.0
tensorflow 2.10.1
keras.api._v2.keras 2.10.0
numpy 1.21.6
pandas 1.5.3
matplotlib 3.7.1
h5py 3.13.0
sklearn 1.3.1
seaborn 0.13.2
ngboost 0.4.1
shap 0.41.0

# Apply the plotting defaults provided by the local ``utils`` module.
set_rcParams()

# Reverse lookup of COLUMN_MAPS (value -> key), extended with display names.
# NOTE(review): the three keys added below do not match the column names used
# elsewhere in this script (e.g. 'h20 Conc.') -- confirm they are intended.
COLUMN_MAPS_ = dict((value, key) for key, value in COLUMN_MAPS.items())
COLUMN_MAPS_.update({
    'ww_conc': "Wastewater Conc.",
    'sonic_pd': "Sonicator Power",
    'h20_conc.': 'H2O2 Conc.',
})

# Load the dataset three times: with the default target, with the area
# target only, and with both targets side by side.
data = read_data()
data_area = read_data(target='Area (ABD) Mean')
data_both = read_data(target=['Area (ABD) Mean', 'Efficiency'])

# Report (rows, columns) of the default-target frame.
shape = data.shape
print(shape)

(314, 7)

# Summary statistics for every column of the frame that holds both targets.
# Left as a bare expression so that its value is displayed in the rendered page.
data_both.describe()

	Time (min)	Ini. CC	Sonic. PD	h20 Conc.	Volume (mL)	Solution pH	Area (ABD) Mean	Efficiency
count	314.000000	314.000000	314.000000	314.000000	314.000000	314.000000	314.000000	314.000000
mean	26.687898	218408.840764	22.006369	1.248408	128.343949	7.289172	38.776688	52.272698
std	19.864189	47614.530611	7.798580	0.936693	80.686726	1.375810	18.543966	20.718310
min	0.000000	108768.000000	10.000000	0.000000	50.000000	3.000000	22.860000	0.000000
25%	10.000000	200000.000000	20.000000	0.000000	100.000000	7.400000	30.502500	49.355463
50%	30.000000	200000.000000	20.000000	2.000000	100.000000	7.400000	32.895000	60.258500
75%	40.000000	232000.000000	20.000000	2.000000	100.000000	7.400000	35.527500	64.878500
max	60.000000	415000.000000	50.000000	2.500000	400.000000	11.000000	163.660000	89.268000

# Correlation heat-map for the frame loaded with the default target.
eda = EDA(data=data, save=False, show=False)

heatmap_kws = dict(
    figsize=(8, 8),
    square=True,
    cbar_kws={"shrink": .72},
    cmap="Spectral",
)
ax = eda.correlation(**heatmap_kws)

# Bold 12pt tick labels on both axes; x labels are tilted by 70 degrees.
tick_font = dict(fontsize=12, weight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, **tick_font)
ax.set_yticklabels(ax.get_yticklabels(), **tick_font)
plt.tight_layout()
plt.show()

Correlation

# Correlation heat-map for the frame whose target is 'Area (ABD) Mean'.
eda = EDA(data=data_area, save=False, show=False)

heatmap_kws = dict(
    figsize=(8, 8),
    square=True,
    cbar_kws={"shrink": .72},
    cmap="Spectral",
)
ax = eda.correlation(**heatmap_kws)

# Bold 12pt tick labels on both axes; x labels are tilted by 70 degrees.
tick_font = dict(fontsize=12, weight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, **tick_font)
ax.set_yticklabels(ax.get_yticklabels(), **tick_font)
plt.tight_layout()
plt.show()

# Correlation heat-map for the frame that contains both targets.
eda = EDA(data=data_both, save=False, show=False)

heatmap_kws = dict(
    figsize=(8, 8),
    square=True,
    cbar_kws={"shrink": .72},
    cmap="Spectral",
)
ax = eda.correlation(**heatmap_kws)

# Bold 12pt tick labels on both axes; x labels are tilted by 70 degrees.
tick_font = dict(fontsize=12, weight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=70, **tick_font)
ax.set_yticklabels(ax.get_yticklabels(), **tick_font)
plt.tight_layout()
plt.show()

Distribution

# One box plot per column of ``data_both``, each on its own axes.
# The x-axis is intentionally NOT shared between the panels: every box is
# drawn at the same x position, and with a shared axis the tick locations of
# all panels pile up on one locator, so the number of ticks no longer matches
# the single label passed for each box.
f, axes = create_subplots(data_both.shape[1])

for col, ax in zip(data_both.columns, axes.flatten()):

    boxplot(
        data_both[col], labels=col, ax=ax, show=False,
        fill_color="lightpink",
        patch_artist=True,
        widths=0.7,
        flierprops=dict(ms=2.0),
        medianprops={"color": "black"},
    )

    ax.set_xlabel(col)

plt.tight_layout()
plt.show()

xticks (2) and xticklabels (1) dont match

xticks (3) and xticklabels (1) dont match

xticks (4) and xticklabels (1) dont match

xticks (5) and xticklabels (1) dont match

xticks (6) and xticklabels (1) dont match

xticks (7) and xticklabels (1) dont match

xticks (8) and xticklabels (1) dont match

# Random train/test split of the 'Efficiency' column with a fixed seed.
splitter = TrainTestSplit(seed=313)
train_count, test_count, _, _ = splitter.split_by_random(data['Efficiency'])

# Styling of the two boxes: filled, thick lines, mean shown as a black dot.
box_style = dict(
    flierprops=dict(ms=2.0),
    widths=0.6,
    labels=["Train", "Test"],
    showmeans=True,
    patch_artist=True,
    fill_color=["darkorange", "peachpuff"],
    medianprops={"color": "black", 'linewidth': 2},
    capprops={"linewidth": 2},
    whiskerprops=dict(linewidth=2),
    line_width=2.,
    meanprops={
        "markerfacecolor": "black",
        "markeredgecolor": 'black',
        "marker": "o",
    },
    show=False,
)
ax, _ = boxplot([train_count, test_count], **box_style)

ax.tick_params(labelsize=12)
ax.set_xticklabels(["Train", "Test"], fontsize=12)
ax.grid(visible=True, ls='--', color='lightgrey')
plt.show()

# Overlaid histograms of the train and test parts of 'Efficiency'.
split_values = [part.values for part in (train_count, test_count)]
_ = hist(split_values, labels=["Train", "Test"], alpha=0.7)

# Random train/test split of the 'Area (ABD) Mean' column with a fixed seed.
splitter = TrainTestSplit(seed=313)
train_area, test_area, _, _ = splitter.split_by_random(
    data_area['Area (ABD) Mean']
)

# Styling of the two boxes: filled, thick lines, mean shown as a black dot.
box_style = dict(
    flierprops=dict(ms=2.0),
    widths=0.6,
    labels=["Train", "Test"],
    showmeans=True,
    patch_artist=True,
    fill_color=["darkorange", "peachpuff"],
    medianprops={"color": "black", 'linewidth': 2},
    capprops={"linewidth": 2},
    whiskerprops=dict(linewidth=2),
    line_width=2.,
    meanprops={
        "markerfacecolor": "black",
        "markeredgecolor": 'black',
        "marker": "o",
    },
    show=False,
)
ax, _ = boxplot([train_area, test_area], **box_style)

ax.tick_params(labelsize=12)
ax.set_xticklabels(["Train", "Test"], fontsize=12)
ax.grid(visible=True, ls='--', color='lightgrey')
plt.show()

# Overlaid histograms of the train and test parts of 'Area (ABD) Mean'.
split_values = [part.values for part in (train_area, test_area)]
_ = hist(split_values, labels=["Train", "Test"], alpha=0.7)

Line Plot

# One line plot per column of ``data_both``, values drawn in row order.
fig, axes = create_subplots(data_both.shape[1])

# Iterating the column labels once is enough; the y-label is looked up in
# COLUMN_MAPS_ and falls back to the raw column name when there is no entry.
for ax, col in zip(axes.flat, data_both.columns):

    plot(data_both[col].values, ax=ax,
         ax_kws=dict(ylabel=COLUMN_MAPS_.get(col, col),
                     ylabel_kws={"fontsize": 12, 'weight': 'bold'},),
         lw=0.9,
         color='darkcyan', show=False)
plt.tight_layout()
plt.show()

Feature Interaction

def draw_scatter(target, ax, label="Efficiency"):
    """Scatter ``label`` against time on ``ax``, colouring markers by ``target``.

    Parameters
    ----------
    target : str
        Column of ``data_both`` whose values are mapped onto the "inferno"
        colormap to colour the markers. It also selects which axis labels
        are drawn and provides the colorbar title.
    ax :
        Axes on which the grid, the scatter and the colorbar are drawn.
    label : str
        Column of ``data_both`` plotted on the log-scaled y-axis
        (default "Efficiency").

    Returns
    -------
    None
    """
    # `process_cbar` is reported as a deprecated name for `add_cbar`; prefer
    # the new name and fall back to the old one if it cannot be imported.
    try:
        from easy_mpl.utils import add_cbar
    except ImportError:
        add_cbar = process_cbar

    ax.grid(visible=True, ls='--', color='lightgrey')

    # Colours are taken from the same frame that supplies x and y, so the
    # colour of every marker belongs to the row it is drawn for.
    c, mapper = map_array_to_cmap(data_both[target].values, "inferno")

    # Axis labels are drawn only on selected panels.
    # NOTE(review): presumably this keeps labels on the outer panels of the
    # grid built by the caller -- confirm against the subplot layout.
    ylabel = None if target in ("Sonic. PD", "Volume (mL)") else label
    xlabel = "Time (min)" if target in ('Solution pH', 'Volume (mL)') else None

    label_font = {"fontsize": 12, 'weight': 'bold'}
    ax_, _ = scatter(data_both['Time (min)'], data_both[label],
                     color=c, alpha=0.5, s=40, ec="grey", zorder=10,
                     ax_kws=dict(logy=True, ylabel=ylabel,
                                 ylabel_kws=dict(label_font),
                                 top_spine=False, right_spine=False,
                                 xlabel=xlabel,
                                 xlabel_kws=dict(label_font)),
                     ax=ax, show=False)
    add_cbar(ax_, mappable=mapper, orientation="vertical", pad=0.1,
             border=False,
             title=COLUMN_MAPS_.get(target, target),
             title_kws=dict(fontsize=12))


# Five panels of Efficiency against time, each coloured by one input column.
targets = ['Ini. CC', 'Sonic. PD', 'h20 Conc.', 'Volume (mL)', 'Solution pH']

f, all_axes = create_subplots(5, sharex="all", facecolor="#EFE9E6", figsize=(9, 6))

for panel, colour_col in zip(all_axes.flatten(), targets):
    draw_scatter(colour_col, panel)

plt.tight_layout()
plt.show()

`process_cbar` is deprecated as a function name; use `add_cbar` instead.

# Five panels of 'Area (ABD) Mean' against time, each coloured by one input column.
targets = ['Ini. CC', 'Sonic. PD', 'h20 Conc.', 'Volume (mL)', 'Solution pH']

f, all_axes = create_subplots(5, sharex="all", facecolor="#EFE9E6", figsize=(9, 6))

for panel, colour_col in zip(all_axes.flatten(), targets):
    draw_scatter(colour_col, panel, label="Area (ABD) Mean")

plt.tight_layout()
plt.show()

`process_cbar` is deprecated as a function name; use `add_cbar` instead.

Total running time of the script: (0 minutes 8.514 seconds)

Gallery generated by Sphinx-Gallery