From 78cd2f20834ee8ebb284645abe723c1ab8b5e79a Mon Sep 17 00:00:00 2001
From: Hannes Signer <signer@uni-potsdam.de>
Date: Wed, 15 Jan 2025 12:02:54 +0100
Subject: [PATCH] add data conversion script

---
 convert_data.jl | 60 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 convert_data.jl

diff --git a/convert_data.jl b/convert_data.jl
new file mode 100644
index 0000000..3cd3cc9
--- /dev/null
+++ b/convert_data.jl
@@ -0,0 +1,60 @@
+using HDF5
+using RData
+
+using DataFrames
+
+# Load Training Data 
+# train_data = load("Barite_50_Data.rds")
+
+# training_h5_name = "Barite_50_Data.h5"
+# h5open(training_h5_name, "w") do fid
+#     for key in keys(train_data)
+#         group = create_group(fid, key)
+#         group["names"] = names(train_data[key])
+#         group["data", compress=3] = Matrix(train_data[key])
+#         # group = create_group(fid, key)
+#         # grou["names"] = coln
+#     end
+# end
+
+# List the .rds files starting with "iter" in barite_out/ (readdir returns the names sorted)
+rds_files = filter(f -> startswith(f, "iter") && endswith(f, ".rds"), readdir("barite_out/"))
+
+# Remove iteration 0 ("iter_0.rds", zero padding allowed) by name rather than by list position
+rds_files = filter(f -> !occursin(r"^iter_?0+\.rds$", f), rds_files)
+
+# Accumulators: "T" tables are collected in big_df_in, "C" tables in big_df_out
+big_df_in, big_df_out = DataFrame(), DataFrame()
+
+for fname in rds_files
+    # Read one file from barite_out/
+    contents = load("barite_out/$fname")
+    # Turn its "T" and "C" entries into DataFrames
+    tbl_in, tbl_out = DataFrame(contents["T"]), DataFrame(contents["C"])
+
+    # Stack the new rows below those gathered so far
+    append!(big_df_in, tbl_in)
+    append!(big_df_out, tbl_out)
+end
+
+# Keep every column except ID, Barite_p1 and Celestite_p1 (select returns a new DataFrame)
+big_df_in = select(big_df_in, Not([:ID, :Barite_p1, :Celestite_p1]))
+big_df_out = select(big_df_out, Not([:ID, :Barite_p1, :Celestite_p1]))
+
+# Write a table into `parent`: column names as "names", values as compressed matrix "data"
+function write_names_and_data(parent, df)
+    parent["names"] = names(df)
+    parent["data", compress=9] = Matrix(df)
+end
+
+# Inference file: the input table at the file root
+inference_h5_name = "Barite_50_Data_inference.h5"
+h5open(root -> write_names_and_data(root, big_df_in), inference_h5_name, "w")
+
+# Training file: inputs in group "design", outputs in group "result"
+training_h5_name = "Barite_50_Data_training.h5"
+h5open(training_h5_name, "w") do fid
+    design, result = create_group(fid, "design"), create_group(fid, "result")
+    write_names_and_data(design, big_df_in)
+    write_names_and_data(result, big_df_out)
+end
\ No newline at end of file