From 05bbb875624be702d67b6ddd1517d7894ddc5a1c Mon Sep 17 00:00:00 2001 From: Hannes Signer Date: Thu, 27 Feb 2025 12:48:58 +0100 Subject: [PATCH] update readme and environment --- README.md | 22 ++++- environment.yaml | 248 +++++++++++++++++++++++++++++++++++++++++++++++ environment.yml | 162 ------------------------------- 3 files changed, 268 insertions(+), 164 deletions(-) create mode 100644 environment.yaml delete mode 100644 environment.yml diff --git a/README.md b/README.md index a7cb5f4..9fba248 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,23 @@ # Training of AI Surrogate Models -- run `git lfs pull` to get the data from the large file storage -- create conda environment with `conda env create -f environment.yml` +This repository contains the current experiments for training AI models that attempt to predict the chemistry component of POET. + +The repository is structured as follows: +``` +└── dataset + └── Barite_50_Data_training.h5 + └── barite_50_4_corner.h5 +└── doc +└── results +└── src + └── POET_Training.ipynb + └── convert_data.jl + └── optuna_runs.py + └── preprocessing.py +``` + +The datasets in `dataset` must first be pulled from the large file storage via `git lfs pull`. +A conda environment with the packages listed in `environment.yaml` can then be created with `conda env create -f environment.yaml`. + +The `preprocessing.py` file defines all the necessary steps for preprocessing as well as the Keras models used. The actual training and additional explanations then take place in `POET_Training.ipynb`. 
diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..18c15aa --- /dev/null +++ b/environment.yaml @@ -0,0 +1,248 @@ +name: training +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - absl-py=2.1.0=py311h06a4308_0 + - alembic=1.13.3=py311h06a4308_0 + - anyio=4.6.2=py311h06a4308_0 + - argon2-cffi=21.3.0=pyhd3eb1b0_0 + - argon2-cffi-bindings=21.2.0=py311h5eee18b_1 + - asttokens=2.0.5=pyhd3eb1b0_0 + - astunparse=1.6.3=py_0 + - async-lru=2.0.4=py311h06a4308_0 + - attrs=24.3.0=py311h06a4308_0 + - babel=2.11.0=py311h06a4308_0 + - beautifulsoup4=4.12.3=py311h06a4308_0 + - blas=1.0=mkl + - bleach=6.2.0=py311h06a4308_0 + - bottleneck=1.4.2=py311hf4808d0_0 + - brotli=1.0.9=h5eee18b_9 + - brotli-bin=1.0.9=h5eee18b_9 + - brotli-python=1.0.9=py311h6a678d5_9 + - bzip2=1.0.8=h5eee18b_6 + - c-ares=1.19.1=h5eee18b_0 + - ca-certificates=2024.12.31=h06a4308_0 + - certifi=2025.1.31=py311h06a4308_0 + - cffi=1.17.1=py311h1fdaa30_1 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - colorlog=5.0.1=py311h06a4308_1 + - comm=0.2.1=py311h06a4308_0 + - contourpy=1.3.1=py311hdb19cb5_0 + - cycler=0.11.0=pyhd3eb1b0_0 + - cyrus-sasl=2.1.28=h52b45da_1 + - dbus=1.13.18=hb2f20db_0 + - debugpy=1.8.11=py311h6a678d5_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - defusedxml=0.7.1=pyhd3eb1b0_0 + - executing=0.8.3=pyhd3eb1b0_0 + - expat=2.6.4=h6a678d5_0 + - flatbuffers=24.3.25=h6a678d5_0 + - fontconfig=2.14.1=h55d465d_3 + - fonttools=4.51.0=py311h5eee18b_0 + - freetype=2.12.1=h4a9f257_0 + - gast=0.5.3=pyhd3eb1b0_0 + - giflib=5.2.2=h5eee18b_0 + - glib=2.78.4=h6a678d5_0 + - glib-tools=2.78.4=h6a678d5_0 + - google-pasta=0.2.0=pyhd3eb1b0_0 + - greenlet=3.1.1=py311h6a678d5_0 + - grpcio=1.62.2=py311h6a678d5_0 + - gst-plugins-base=1.14.1=h6a678d5_1 + - gstreamer=1.14.1=h5eee18b_1 + - h11=0.14.0=py311h06a4308_0 + - h5py=3.12.1=py311hc0802c4_0 + - hdf5=1.12.1=h2b7332f_3 + - httpcore=1.0.2=py311h06a4308_0 + - 
httpx=0.27.0=py311h06a4308_0 + - icu=73.1=h6a678d5_0 + - idna=3.7=py311h06a4308_0 + - imbalanced-learn=0.12.3=py311h06a4308_1 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - ipykernel=6.29.5=py311h06a4308_0 + - ipython=8.30.0=py311h06a4308_0 + - ipywidgets=8.1.5=py311h06a4308_0 + - jedi=0.19.2=py311h06a4308_0 + - jinja2=3.1.4=py311h06a4308_1 + - joblib=1.4.2=py311h06a4308_0 + - jpeg=9e=h5eee18b_3 + - json5=0.9.25=py311h06a4308_0 + - jsonschema=4.23.0=py311h06a4308_0 + - jsonschema-specifications=2023.7.1=py311h06a4308_0 + - jupyter=1.0.0=py311h06a4308_9 + - jupyter-lsp=2.2.0=py311h06a4308_0 + - jupyter_client=8.6.0=py311h06a4308_0 + - jupyter_console=6.6.3=py311h06a4308_0 + - jupyter_core=5.7.2=py311h06a4308_0 + - jupyter_events=0.10.0=py311h06a4308_0 + - jupyter_server=2.14.1=py311h06a4308_0 + - jupyter_server_terminals=0.4.4=py311h06a4308_1 + - jupyterlab=4.2.5=py311h06a4308_0 + - jupyterlab_pygments=0.1.2=py_0 + - jupyterlab_server=2.27.3=py311h06a4308_0 + - jupyterlab_widgets=3.0.13=py311h06a4308_0 + - keras=3.6.0=py311h06a4308_0 + - kiwisolver=1.4.4=py311h6a678d5_0 + - krb5=1.20.1=h143b758_1 + - lcms2=2.16=hb9589c4_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - lerc=4.0.0=h6a678d5_0 + - libabseil=20240116.2=cxx17_h6a678d5_0 + - libbrotlicommon=1.0.9=h5eee18b_9 + - libbrotlidec=1.0.9=h5eee18b_9 + - libbrotlienc=1.0.9=h5eee18b_9 + - libclang=14.0.6=default_hc6dbbc7_2 + - libclang13=14.0.6=default_he11475f_2 + - libcups=2.4.2=h2d74bed_1 + - libcurl=8.11.1=hc9e6f67_0 + - libdeflate=1.22=h5eee18b_0 + - libedit=3.1.20230828=h5eee18b_0 + - libev=4.33=h7f8727e_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgfortran-ng=11.2.0=h00389a5_1 + - libgfortran5=11.2.0=h1234567_1 + - libglib=2.78.4=hdc74915_0 + - libgomp=11.2.0=h1234567_1 + - libgrpc=1.62.2=h2d74bed_0 + - libiconv=1.16=h5eee18b_3 + - libllvm14=14.0.6=hecde1de_4 + - libnghttp2=1.57.0=h2d74bed_0 + - libpng=1.6.39=h5eee18b_0 + - libpq=17.2=hdbd6064_0 + - libprotobuf=4.25.3=he621ea3_0 + - 
libsodium=1.0.18=h7b6447c_0 + - libssh2=1.11.1=h251f7ec_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtiff=4.5.1=hffd6297_1 + - libuuid=1.41.5=h5eee18b_0 + - libwebp-base=1.3.2=h5eee18b_1 + - libxcb=1.15=h7f8727e_0 + - libxkbcommon=1.0.1=h097e994_2 + - libxml2=2.13.5=hfdd30dd_0 + - lz4-c=1.9.4=h6a678d5_1 + - mako=1.2.3=py311h06a4308_0 + - markdown=3.4.1=py311h06a4308_0 + - markdown-it-py=2.2.0=py311h06a4308_1 + - markupsafe=2.1.3=py311h5eee18b_1 + - matplotlib=3.10.0=py311h06a4308_0 + - matplotlib-base=3.10.0=py311hbfdbfaf_0 + - matplotlib-inline=0.1.6=py311h06a4308_0 + - mdurl=0.1.0=py311h06a4308_0 + - mistune=2.0.4=py311h06a4308_0 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py311h5eee18b_2 + - mkl_fft=1.3.11=py311h5eee18b_0 + - mkl_random=1.2.8=py311ha02d727_0 + - ml_dtypes=0.4.0=py311ha02d727_0 + - mysql=8.4.0=h29a9f33_1 + - namex=0.0.7=py311h06a4308_0 + - nbclient=0.8.0=py311h06a4308_0 + - nbconvert=7.16.4=py311h06a4308_0 + - nbformat=5.10.4=py311h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nest-asyncio=1.6.0=py311h06a4308_0 + - notebook=7.2.2=py311h06a4308_1 + - notebook-shim=0.2.3=py311h06a4308_0 + - numexpr=2.10.1=py311h3c60e43_0 + - numpy=1.26.4=py311h08b1b3b_0 + - numpy-base=1.26.4=py311hf175353_0 + - openjpeg=2.5.2=he7f1fd0_0 + - openldap=2.6.4=h42fbc30_0 + - openssl=3.0.15=h5eee18b_0 + - opt_einsum=3.3.0=pyhd3eb1b0_1 + - optree=0.12.1=py311hdb19cb5_0 + - optuna=4.2.1=pyhd8ed1ab_0 + - overrides=7.4.0=py311h06a4308_0 + - packaging=24.2=py311h06a4308_0 + - pandas=2.2.3=py311h6a678d5_0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - parso=0.8.4=py311h06a4308_0 + - pcre2=10.42=hebb0a14_1 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pillow=11.0.0=py311hcea889d_1 + - pip=24.2=py311h06a4308_0 + - platformdirs=3.10.0=py311h06a4308_0 + - ply=3.11=py311h06a4308_0 + - prometheus_client=0.21.0=py311h06a4308_0 + - prompt-toolkit=3.0.43=py311h06a4308_0 + - prompt_toolkit=3.0.43=hd3eb1b0_0 + - protobuf=4.25.3=py311he36ed58_1 + - psutil=5.9.0=py311h5eee18b_1 + - 
ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pygments=2.15.1=py311h06a4308_1 + - pyparsing=3.2.0=py311h06a4308_0 + - pyqt=5.15.10=py311h6a678d5_0 + - pyqt5-sip=12.13.0=py311h5eee18b_0 + - pysocks=1.7.1=py311h06a4308_0 + - python=3.11.11=he870216_0 + - python-dateutil=2.9.0post0=py311h06a4308_2 + - python-fastjsonschema=2.20.0=py311h06a4308_0 + - python-flatbuffers=24.3.25=py311h06a4308_0 + - python-json-logger=3.2.1=py311h06a4308_0 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - pytz=2024.1=py311h06a4308_0 + - pyyaml=6.0.2=py311h5eee18b_0 + - pyzmq=26.2.0=py311h6a678d5_0 + - qt-main=5.15.2=hb6262e9_11 + - qtconsole=5.6.0=py311h06a4308_0 + - qtpy=2.4.1=py311h06a4308_0 + - re2=2022.04.01=h295c915_0 + - readline=8.2=h5eee18b_0 + - referencing=0.30.2=py311h06a4308_0 + - requests=2.32.3=py311h06a4308_1 + - rfc3339-validator=0.1.4=py311h06a4308_0 + - rfc3986-validator=0.1.1=py311h06a4308_0 + - rich=13.9.4=py311h06a4308_0 + - rpds-py=0.22.3=py311h4aa5aa6_0 + - scikit-learn=1.5.2=py311h6a678d5_0 + - scipy=1.14.1=py311h08b1b3b_0 + - seaborn=0.13.2=py311h06a4308_1 + - send2trash=1.8.2=py311h06a4308_1 + - setuptools=75.1.0=py311h06a4308_0 + - sip=6.7.12=py311h6a678d5_1 + - six=1.16.0=pyhd3eb1b0_1 + - snappy=1.2.1=h6a678d5_0 + - sniffio=1.3.0=py311h06a4308_0 + - soupsieve=2.5=py311h06a4308_0 + - sqlalchemy=2.0.37=py311h00e1ef3_0 + - sqlite=3.45.3=h5eee18b_0 + - stack_data=0.2.0=pyhd3eb1b0_0 + - tbb=2021.8.0=hdb19cb5_0 + - tensorboard=2.17.0=py311h06a4308_0 + - tensorboard-data-server=0.7.0=py311h52d8a92_1 + - tensorflow=2.17.0=cpu_py311hbca4264_0 + - tensorflow-base=2.17.0=cpu_py311hb07566e_0 + - tensorflow-estimator=2.17.0=cpu_py311hfedf350_0 + - termcolor=2.1.0=py311h06a4308_0 + - terminado=0.17.1=py311h06a4308_0 + - threadpoolctl=3.5.0=py311h92b7b1e_0 + - tinycss2=1.2.1=py311h06a4308_0 + - tk=8.6.14=h39e8969_0 + - tornado=6.4.2=py311h5eee18b_0 + - tqdm=4.67.1=py311h92b7b1e_0 + - traitlets=5.14.3=py311h06a4308_0 + - 
typing-extensions=4.12.2=py311h06a4308_0 + - typing_extensions=4.12.2=py311h06a4308_0 + - tzdata=2024b=h04d1e81_0 + - unicodedata2=15.1.0=py311h5eee18b_1 + - urllib3=2.2.3=py311h06a4308_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - webencodings=0.5.1=py311h06a4308_1 + - websocket-client=1.8.0=py311h06a4308_0 + - werkzeug=3.0.6=py311h06a4308_0 + - wheel=0.44.0=py311h06a4308_0 + - widgetsnbextension=4.0.13=py311h06a4308_0 + - wrapt=1.17.0=py311h5eee18b_0 + - xz=5.4.6=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - zeromq=4.3.5=h6a678d5_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.6=hc292b87_0 + - pip: + - smogn==0.1.2 +prefix: /home/signer/bin/miniconda3/envs/training diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 9cea602..0000000 --- a/environment.yml +++ /dev/null @@ -1,162 +0,0 @@ -name: ai -channels: - - conda-forge - - defaults - - https://repo.anaconda.com/pkgs/main - - https://repo.anaconda.com/pkgs/r -dependencies: - - absl-py=2.1.0=py311hca03da5_0 - - appnope=0.1.4=pyhd8ed1ab_1 - - asttokens=3.0.0=pyhd8ed1ab_1 - - astunparse=1.6.3=py_0 - - blas=1.0=openblas - - bottleneck=1.4.2=py311hb9f6ed7_0 - - brotli=1.0.9=h80987f9_9 - - brotli-bin=1.0.9=h80987f9_9 - - brotli-python=1.0.9=py311h313beb8_9 - - bzip2=1.0.8=h80987f9_6 - - c-ares=1.34.4=h5505292_0 - - ca-certificates=2024.12.31=hca03da5_0 - - cached-property=1.5.2=py_0 - - certifi=2024.12.14=py311hca03da5_0 - - charset-normalizer=3.3.2=pyhd3eb1b0_0 - - comm=0.2.2=pyhd8ed1ab_1 - - contourpy=1.3.1=py311h48ca7d4_0 - - cycler=0.11.0=pyhd3eb1b0_0 - - debugpy=1.8.11=py311h155a34a_0 - - decorator=5.1.1=pyhd8ed1ab_1 - - exceptiongroup=1.2.2=pyhd8ed1ab_1 - - executing=2.1.0=pyhd8ed1ab_1 - - flatbuffers=24.3.25=h313beb8_0 - - fonttools=4.51.0=py311h80987f9_0 - - freetype=2.12.1=hadb7bae_2 - - gast=0.5.3=pyhd3eb1b0_0 - - giflib=5.2.2=h80987f9_0 - - google-pasta=0.2.0=pyhd3eb1b0_0 - - grpcio=1.65.5=py311hc367efa_0 - - h5py=3.12.1=nompi_py311h5dd25b7_103 - - hdf5=1.14.4=nompi_ha698983_105 - - 
icu=75.1=hfee45f7_0 - - idna=3.7=py311hca03da5_0 - - importlib-metadata=8.5.0=pyha770c72_1 - - ipykernel=6.29.5=pyh57ce528_0 - - ipython=8.31.0=pyh707e725_0 - - jedi=0.19.2=pyhd8ed1ab_1 - - joblib=1.4.2=py311hca03da5_0 - - jupyter_client=8.6.3=pyhd8ed1ab_1 - - jupyter_core=5.7.2=pyh31011fe_1 - - keras=3.8.0=pyh753f3f9_0 - - kiwisolver=1.4.4=py311h313beb8_0 - - krb5=1.21.3=hf3e1bf2_0 - - lcms2=2.16=ha0e7c42_0 - - lerc=4.0.0=h313beb8_0 - - libabseil=20240722.0=cxx17_h07bc746_4 - - libaec=1.1.3=h313beb8_0 - - libbrotlicommon=1.0.9=h80987f9_9 - - libbrotlidec=1.0.9=h80987f9_9 - - libbrotlienc=1.0.9=h80987f9_9 - - libcurl=8.11.1=h73640d1_0 - - libcxx=19.1.6=ha82da77_1 - - libdeflate=1.23=hec38601_0 - - libedit=3.1.20230828=h80987f9_0 - - libev=4.33=h1a28f6b_1 - - libexpat=2.6.4=h286801f_0 - - libffi=3.4.4=hca03da5_1 - - libgfortran=5.0.0=13_2_0_hd922786_3 - - libgfortran5=13.2.0=hf226fd6_3 - - libgrpc=1.65.5=h3d9cf25_0 - - libjpeg-turbo=3.0.3=h80987f9_0 - - liblzma=5.6.3=h39f12f2_1 - - libnghttp2=1.64.0=h6d7220d_0 - - libopenblas=0.3.21=h269037a_0 - - libpng=1.6.45=h3783ad8_0 - - libprotobuf=5.27.5=h53f8970_2 - - libre2-11=2024.07.02=h07bc746_2 - - libsodium=1.0.20=h99b78c6_0 - - libsqlite=3.47.2=h3f77e49_0 - - libssh2=1.11.1=h9cc3647_0 - - libtiff=4.7.0=h551f018_3 - - libwebp-base=1.5.0=h2471fea_0 - - libxcb=1.17.0=hdb1d25a_0 - - libzlib=1.3.1=h8359307_2 - - llvm-openmp=14.0.6=hc6e5704_0 - - lz4-c=1.9.4=h313beb8_1 - - markdown=3.4.1=py311hca03da5_0 - - markdown-it-py=2.2.0=py311hca03da5_1 - - markupsafe=2.1.3=py311h80987f9_1 - - matplotlib=3.10.0=py311hca03da5_0 - - matplotlib-base=3.10.0=py311h7ef442a_0 - - matplotlib-inline=0.1.7=pyhd8ed1ab_1 - - mdurl=0.1.0=py311hca03da5_0 - - ml_dtypes=0.4.0=py311h7aedaa7_0 - - namex=0.0.7=py311hca03da5_0 - - ncurses=6.5=h5e97a16_2 - - nest-asyncio=1.6.0=pyhd8ed1ab_1 - - numexpr=2.10.1=py311h5d9532f_0 - - numpy=1.26.4=py311he598dae_0 - - numpy-base=1.26.4=py311hfbfe69c_0 - - openjpeg=2.5.3=h8a3d83b_0 - - openssl=3.4.0=h81ee809_1 - 
- opt_einsum=3.3.0=pyhd3eb1b0_1 - - optree=0.12.1=py311h48ca7d4_0 - - packaging=24.2=py311hca03da5_0 - - pandas=2.2.3=py311hcf29cfe_0 - - parso=0.8.4=pyhd8ed1ab_1 - - pexpect=4.9.0=pyhd8ed1ab_1 - - pickleshare=0.7.5=pyhd8ed1ab_1004 - - pillow=11.1.0=py311hb9ba9e9_0 - - pip=24.2=py311hca03da5_0 - - platformdirs=4.3.6=pyhd8ed1ab_1 - - prompt-toolkit=3.0.48=pyha770c72_1 - - protobuf=5.27.5=py311h3f08180_0 - - psutil=6.1.1=py311h917b07b_0 - - pthread-stubs=0.3=h1a28f6b_1 - - ptyprocess=0.7.0=pyhd8ed1ab_1 - - pure_eval=0.2.3=pyhd8ed1ab_1 - - pygments=2.15.1=py311hca03da5_1 - - pyparsing=3.2.0=py311hca03da5_0 - - pysocks=1.7.1=py311hca03da5_0 - - python=3.11.11=hc22306f_1_cpython - - python-dateutil=2.9.0.post0=pyhff2d567_1 - - python-flatbuffers=24.3.25=py311hca03da5_0 - - python-tzdata=2023.3=pyhd3eb1b0_0 - - python_abi=3.11=5_cp311 - - pytz=2024.1=py311hca03da5_0 - - pyzmq=26.2.0=py311h730b646_3 - - re2=2024.07.02=h6589ca4_2 - - readline=8.2=h1a28f6b_0 - - requests=2.32.3=py311hca03da5_1 - - rich=13.9.4=py311hca03da5_0 - - scikit-learn=1.5.2=py311h313beb8_0 - - scipy=1.14.1=py311hac8794a_0 - - setuptools=75.1.0=py311hca03da5_0 - - six=1.16.0=pyhd3eb1b0_1 - - snappy=1.2.1=h313beb8_0 - - sqlite=3.47.2=hd7222ec_0 - - stack_data=0.6.3=pyhd8ed1ab_1 - - tensorboard=2.17.1=pyhd8ed1ab_0 - - tensorboard-data-server=0.7.0=py311ha6e5c4f_1 - - tensorflow=2.17.0=cpu_py311h9d3d1e9_3 - - tensorflow-base=2.17.0=cpu_py311ha270cad_3 - - tensorflow-estimator=2.17.0=cpu_py311h935fadc_3 - - termcolor=2.1.0=py311hca03da5_0 - - threadpoolctl=3.5.0=py311hb6e6a13_0 - - tk=8.6.13=h5083fa2_1 - - tornado=6.4.2=py311h917b07b_0 - - traitlets=5.14.3=pyhd8ed1ab_1 - - typing-extensions=4.12.2=py311hca03da5_0 - - typing_extensions=4.12.2=py311hca03da5_0 - - tzdata=2024b=h04d1e81_0 - - unicodedata2=15.1.0=py311h80987f9_1 - - urllib3=2.2.3=py311hca03da5_0 - - wcwidth=0.2.13=pyhd8ed1ab_1 - - werkzeug=3.0.6=py311hca03da5_0 - - wheel=0.44.0=py311hca03da5_0 - - wrapt=1.17.0=py311h80987f9_0 - - 
xorg-libxau=1.0.12=h5505292_0 - - xorg-libxdmcp=1.1.5=hd74edd7_0 - - xz=5.4.6=h80987f9_1 - - zeromq=4.3.5=hc1bb282_7 - - zipp=3.21.0=pyhd8ed1ab_1 - - zlib=1.3.1=h8359307_2 - - zstd=1.5.6=hb46c0d2_0