From 1d9d193668345695f07d07991e5dfc8cfd88058f Mon Sep 17 00:00:00 2001
From: Hannes Martin Signer
Date: Thu, 27 Feb 2025 11:44:51 +0100
Subject: [PATCH] update julia script

---
 src/convert_data.jl | 112 +++++++++++++++++++++++++++-----------------
 1 file changed, 70 insertions(+), 42 deletions(-)

diff --git a/src/convert_data.jl b/src/convert_data.jl
index 3cd3cc9..d57df53 100644
--- a/src/convert_data.jl
+++ b/src/convert_data.jl
@@ -1,60 +1,88 @@
-using HDF5
-using RData
+#!/usr/bin/env julia
 
-using DataFrames
+# qs_read = []
 
-# Load Training Data
-# train_data = load("Barite_50_Data.rds")
-# training_h5_name = "Barite_50_Data.h5"
-# h5open(training_h5_name, "w") do fid
-#     for key in keys(train_data)
-#         group = create_group(fid, key)
-#         group["names"] = names(train_data[key])
-#         group["data", compress=3] = Matrix(train_data[key])
-#         # group = create_group(fid, key)
-#         # grou["names"] = coln
-#     end
-# end
+# # find all the files in 'barite_out'
+# files = readdir("barite_out"; join=true)
+
+# # remove files which do not have the extension '.qs2' and contains 'iter'
+# files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), files)
+
+# # remove first entry as it is iteration 0
+# files = files[2:end]
+
+# test1 = qs_read(files[1])
+
+# @rput test1
+
+# R"test1 <- test1$C"
+
+# @rget test1
+
+# check if ARGS contains 2 elements
+if length(ARGS) != 2
+    println("Usage: julia convert.jl .h5")
+    exit(1)
+end
+
+to_read_dir = ARGS[1]
+output_file_name = ARGS[2] * ".h5"
+
+# check if the directory exists
+if !isdir(to_read_dir)
+    println("The directory \"$to_read_dir\" does not exist")
+    exit(1)
+end
+
+using HDF5, RCall, DataFrames
+@rlibrary qs2
 
 
 # List all .rds files starting with "iter" in a given directory
-rds_files = filter(x -> startswith(x, "iter"), readdir("barite_out/"))
+qs_files = filter(x -> occursin(r".*\.qs2", x) && occursin(r"iter", x), readdir(to_read_dir; join=true))[2:end]
 
-# remove "iter_0.rds" from the list
-rds_files = rds_files[2:end]
+df_design = DataFrame()
+df_result = DataFrame()
 
-big_df_in = DataFrame()
-big_df_out = DataFrame()
-
-for rds_file in rds_files
+for file in qs_files
     # Load the RDS file
-    data = load("barite_out/$rds_file")
-    # Convert the data to a DataFrame
-    df_T = DataFrame(data["T"])
-    df_C = DataFrame(data["C"])
+    data = qs_read(file)
+
+    # get basename of the file
+    basename = split(file, "/")[end]
+
+    # get the iteration number by splitting the basename and parse the second element
+    iteration = parse(Int, split(split(basename, "_")[2], ".")[1])
+
+    @rput data
+
+    R"transport <- data$T"
+    R"chemistry <- data$C"
+
+    @rget transport
+    @rget chemistry
+
+    # Add iteration number to the DataFrame
+    transport.iteration = fill(iteration, size(transport, 1))
+    chemistry.iteration = fill(iteration, size(chemistry, 1))
+
     # Append the DataFrame to the big DataFrame
-    append!(big_df_in, df_T)
-    append!(big_df_out, df_C)
+    append!(df_design, transport)
+    append!(df_result, chemistry)
 end
 
 # remove ID, Barite_p1, Celestite_p1 columns
-big_df_in = big_df_in[:, Not([:ID, :Barite_p1, :Celestite_p1])]
-big_df_out = big_df_out[:, Not([:ID, :Barite_p1, :Celestite_p1])]
+df_design = df_design[:, Not([:ID, :Barite_p1, :Celestite_p1])]
+df_result = df_result[:, Not([:ID, :Barite_p1, :Celestite_p1])]
 
-inference_h5_name = "Barite_50_Data_inference.h5"
-h5open(inference_h5_name, "w") do fid
-    fid["names"] = names(big_df_in)
-    fid["data", compress=9] = Matrix(big_df_in)
-end
 
-training_h5_name = "Barite_50_Data_training.h5"
-h5open(training_h5_name, "w") do fid
+h5open(output_file_name, "w") do fid
     group_in = create_group(fid, "design")
     group_out = create_group(fid, "result")
 
-    group_in["names"] = names(big_df_in)
-    group_in["data", compress=9] = Matrix(big_df_in)
+    group_in["names"] = names(df_design)
+    group_in["data", compress=9] = Matrix(df_design)
 
-    group_out["names"] = names(big_df_out)
-    group_out["data", compress=9] = Matrix(big_df_out)
-end
\ No newline at end of file
+    group_out["names"] = names(df_result)
+    group_out["data", compress=9] = Matrix(df_result)
+end