{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## General Information" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook is used to train a simple neural network model to predict the chemistry in the barite benchmark (50x50 grid). The training data is stored in the repository using **git large file storage** and can be downloaded after the installation of git lfs using the `git lfs pull` command.\n", "\n", "It is then recommended to create a Python environment using miniconda. The necessary dependencies are contained in `environment.yml` and can be installed using `conda env create -f environment.yml`.\n", "\n", "The data set is divided into a design and result part and consists of the iterations of a reference simulation. The design part of the data set contains the chemical concentrations at time $t$ and the result part at time $t+1$, which are to be learned by the model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup Libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025-01-15 16:24:49.275664: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", "2025-01-15 16:24:49.404820: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Running Keras in version 3.6.0\n" ] } ], "source": [ "import keras\n", "print(\"Running Keras in version {}\".format(keras.__version__))\n", "\n", "import h5py\n", "import numpy as np\n", "import pandas as pd\n", "import time\n", "import sklearn.model_selection as sk\n", "import matplotlib.pyplot as plt\n", "from sklearn.cluster import KMeans\n", "from imblearn.over_sampling import SMOTE\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from imblearn.over_sampling import RandomOverSampler\n", "from collections import Counter\n", "import os" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define parameters" ] }, { "cell_type": "code", "execution_count": 141, "metadata": {}, "outputs": [], "source": [ "dtype = \"float32\"\n", "activation = \"relu\"\n", "\n", "lr = 0.001\n", "batch_size = 512\n", "epochs = 50 # default 400 epochs\n", "\n", "lr_schedule = keras.optimizers.schedules.ExponentialDecay(\n", " initial_learning_rate=lr,\n", " decay_steps=2000,\n", " decay_rate=0.9,\n", " staircase=True\n", ")\n", "\n", "optimizer_simple = keras.optimizers.Adam(learning_rate=lr_schedule)\n", "optimizer_large = keras.optimizers.Adam(learning_rate=lr_schedule)\n", "\n", "loss = keras.losses.Huber()\n", "\n", "sample_fraction = 0.8" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Setup the model" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Model: \"sequential_5\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_5\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ dense_17 (Dense) │ (None, 128) │ 1,664 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_18 (Dense) │ (None, 128) │ 16,512 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_19 (Dense) │ (None, 12) │ 1,548 │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ dense_17 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m1,664\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_18 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m16,512\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_19 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m12\u001b[0m) │ \u001b[38;5;34m1,548\u001b[0m │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total params: 19,724 (77.05 KB)\n", "\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m19,724\u001b[0m (77.05 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Trainable params: 19,724 (77.05 KB)\n", "\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m19,724\u001b[0m (77.05 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Non-trainable params: 0 (0.00 B)\n", "\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "model_simple = keras.Sequential(\n", " [\n", " keras.Input(shape = (12,), dtype = \"float32\"),\n", " keras.layers.Dense(units = 128, activation = \"relu\", dtype = \"float32\"),\n", " keras.layers.Dense(units = 128, activation = \"relu\", dtype = \"float32\"),\n", " keras.layers.Dense(units = 12, dtype = \"float32\")\n", " ]\n", ")\n", "\n", "model_simple.compile(optimizer=optimizer_simple, loss = loss)\n", "model_simple.summary()" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Model: \"sequential_6\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_6\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ dense_20 (Dense) │ (None, 512) │ 6,656 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_21 (Dense) │ (None, 1024) │ 525,312 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_22 (Dense) │ (None, 512) │ 524,800 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_23 (Dense) │ (None, 12) │ 6,156 │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ dense_20 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m) │ \u001b[38;5;34m6,656\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_21 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1024\u001b[0m) │ \u001b[38;5;34m525,312\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_22 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m) │ \u001b[38;5;34m524,800\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_23 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m12\u001b[0m) │ \u001b[38;5;34m6,156\u001b[0m │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total params: 1,062,924 (4.05 MB)\n", "\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m1,062,924\u001b[0m (4.05 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Trainable params: 1,062,924 (4.05 MB)\n", "\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m1,062,924\u001b[0m (4.05 MB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Non-trainable params: 0 (0.00 B)\n", "\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "model_large = keras.Sequential(\n", " [keras.layers.Input(shape=(12,), dtype=dtype),\n", " keras.layers.Dense(512, activation='relu', dtype=dtype),\n", " keras.layers.Dense(1024, activation='relu', dtype=dtype),\n", " keras.layers.Dense(512, activation='relu', dtype=dtype),\n", " keras.layers.Dense(12, dtype=dtype)\n", " ])\n", "\n", "model_large.compile(optimizer=optimizer_large, loss = loss)\n", "model_large.summary()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define some functions and helper classes" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def Safelog(val):\n", " # get range of vector\n", " if val > 0:\n", " return np.log10(val)\n", " elif val < 0:\n", " return -np.log10(-val)\n", " else:\n", " return 0\n", "\n", "def Safeexp(val):\n", " if val > 0:\n", " return -10 ** -val\n", " elif val < 0:\n", " return 10 ** val\n", " else:\n", " return 0\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# ? Why does the charge is using another logarithm than the other species\n", "\n", "func_dict_in = {\n", " \"H\" : np.log1p,\n", " \"O\" : np.log1p,\n", " \"Charge\" : Safelog,\n", " \"H_0_\" : np.log1p,\n", " \"O_0_\" : np.log1p,\n", " \"Ba\" : np.log1p,\n", " \"Cl\" : np.log1p,\n", " \"S_2_\" : np.log1p,\n", " \"S_6_\" : np.log1p,\n", " \"Sr\" : np.log1p,\n", " \"Barite\" : np.log1p,\n", " \"Celestite\" : np.log1p,\n", "}\n", "\n", "func_dict_out = {\n", " \"H\" : np.expm1,\n", " \"O\" : np.expm1,\n", " \"Charge\" : Safeexp,\n", " \"H_0_\" : np.expm1,\n", " \"O_0_\" : np.expm1,\n", " \"Ba\" : np.expm1,\n", " \"Cl\" : np.expm1,\n", " \"S_2_\" : np.expm1,\n", " \"S_6_\" : np.expm1,\n", " \"Sr\" : np.expm1,\n", " \"Barite\" : np.expm1,\n", " \"Celestite\" : np.expm1,\n", "}\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read data from `.h5` file and convert it to a `pandas.DataFrame`" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# os.chdir('/mnt/beegfs/home/signer/projects/model-training')\n", "data_file = h5py.File(\"Barite_50_Data_training.h5\")\n", "\n", "design = data_file[\"design\"]\n", "results = data_file[\"result\"]\n", "\n", "df_design = pd.DataFrame(np.array(design[\"data\"]).transpose(), columns = design[\"names\"].asstr())\n", "df_results = pd.DataFrame(np.array(results[\"data\"]).transpose(), columns = results[\"names\"].asstr())\n", "\n", "data_file.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classify each cell with kmeans" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/signer/bin/miniconda3/envs/training/lib/python3.11/site-packages/sklearn/base.py:1473: ConvergenceWarning: Number of distinct clusters (1) found smaller than n_clusters (2). Possibly due to duplicate points in X.\n", " return fit_method(estimator, *args, **kwargs)\n" ] } ], "source": [ "# widget with slider for the index\n", "\n", "class_label_design = np.array([])\n", "class_label_result = np.array([])\n", "\n", "\n", "i = 1000\n", "for i in range(0,1001):\n", " field_design = np.array(df_design['Barite'][(i*2500):(i*2500+2500)]).reshape(50,50)\n", " field_result = np.array(df_results['Barite'][(i*2500):(i*2500+2500)]).reshape(50,50)\n", " \n", " kmeans_design = KMeans(n_clusters=2, random_state=0).fit(field_design.reshape(-1,1))\n", " kmeans_result = KMeans(n_clusters=2, random_state=0).fit(field_result.reshape(-1,1))\n", " \n", " class_label_design = np.append(class_label_design.astype(int), kmeans_design.labels_)\n", " class_label_result = np.append(class_label_result.astype(int), kmeans_result.labels_)\n", " \n", "\n", "\n", "class_label_design = pd.DataFrame(class_label_design, columns = [\"Class\"])\n", "class_label_result = pd.DataFrame(class_label_result, columns = [\"Class\"])\n" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "if(\"Class\" in df_design.columns and \"Class\" in df_results.columns):\n", " print(\"Class column already exists\")\n", "else:\n", " df_design = pd.concat([df_design, class_label_design], axis=1)\n", " df_results = pd.concat([df_results, class_label_design], axis=1)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Amount class 0: 0.9879380619380619\n", "Amount class 1: 0.012061938061938062\n" ] } ], "source": [ "counter = Counter(df_design.iloc[:,-1])\n", "print(\"Amount class 0:\", counter[0] / (counter[0] + counter[1]) )\n", "print(\"Amount class 1:\", counter[1] / (counter[0] + counter[1]) )\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "