From 78cd2f20834ee8ebb284645abe723c1ab8b5e79a Mon Sep 17 00:00:00 2001
From: Hannes Signer
Date: Wed, 15 Jan 2025 12:02:54 +0100
Subject: [PATCH] add data conversion script

---
 convert_data.jl | 76 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 convert_data.jl

diff --git a/convert_data.jl b/convert_data.jl
new file mode 100644
index 0000000..3cd3cc9
--- /dev/null
+++ b/convert_data.jl
@@ -0,0 +1,76 @@
+using HDF5
+using RData
+
+using DataFrames
+
+# Load Training Data (disabled, kept for reference)
+# train_data = load("Barite_50_Data.rds")
+
+# training_h5_name = "Barite_50_Data.h5"
+# h5open(training_h5_name, "w") do fid
+#     for key in keys(train_data)
+#         group = create_group(fid, key)
+#         group["names"] = names(train_data[key])
+#         group["data", compress=3] = Matrix(train_data[key])
+#     end
+# end
+
+# Directory holding the per-iteration outputs, named "iter_<n>.rds"
+const INPUT_DIR = "barite_out"
+
+# Columns that are dropped from both tables before export
+const DROPPED_COLUMNS = [:ID, :Barite_p1, :Celestite_p1]
+
+# Return the iteration number encoded in `filename`, or `nothing` when the
+# name is not of the form "iter_<n>.rds".
+function iteration_number(filename)
+    m = match(r"^iter_?(\d+)\.rds$", filename)
+    return m === nothing ? nothing : parse(Int, m.captures[1])
+end
+
+# Write the column names and the values of `df` into `parent`, which is an
+# open HDF5 file or group.
+function write_table(parent, df)
+    parent["names"] = names(df)
+    parent["data", compress=9] = Matrix(df)
+    return nothing
+end
+
+# List all .rds files starting with "iter" in the input directory
+rds_files = filter(f -> iteration_number(f) !== nothing, readdir(INPUT_DIR))
+
+# Remove iteration 0 by its number. Dropping the first directory entry instead
+# would silently discard the wrong file whenever iteration 0 is absent.
+filter!(f -> iteration_number(f) != 0, rds_files)
+
+# readdir sorts lexicographically ("iter_10.rds" before "iter_2.rds"); sort by
+# iteration number so the rows are concatenated in iteration order.
+sort!(rds_files; by=iteration_number)
+
+isempty(rds_files) && error("no iter_<n>.rds files with n > 0 found in $INPUT_DIR")
+
+big_df_in = DataFrame()
+big_df_out = DataFrame()
+
+for rds_file in rds_files
+    # Load the RDS file
+    data = load(joinpath(INPUT_DIR, rds_file))
+    # Append the "T" and "C" tables of this iteration to the combined tables
+    append!(big_df_in, DataFrame(data["T"]))
+    append!(big_df_out, DataFrame(data["C"]))
+end
+
+# remove ID, Barite_p1, Celestite_p1 columns
+big_df_in = big_df_in[:, Not(DROPPED_COLUMNS)]
+big_df_out = big_df_out[:, Not(DROPPED_COLUMNS)]
+
+inference_h5_name = "Barite_50_Data_inference.h5"
+h5open(inference_h5_name, "w") do fid
+    write_table(fid, big_df_in)
+end
+
+training_h5_name = "Barite_50_Data_training.h5"
+h5open(training_h5_name, "w") do fid
+    write_table(create_group(fid, "design"), big_df_in)
+    write_table(create_group(fid, "result"), big_df_out)
+end
\ No newline at end of file