using DataFrames, RCall, Statistics, Plots, Base.Threads, CSV, CategoricalArrays prefix = "barite_fgcs" input_dir = "./" output_dir = "error_eval/" if !isdir(output_dir) mkdir(output_dir) end relevant_header = ["H", "O", "Ba", "Cl", "S", "Sr", "Barite", "Celestite"] @rput relevant_header function alphas(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real}) # Workhorse function computing the "alphas" (cfr workflows/metrics) # for relative error measures. We first compute the alphas, then take # care of the +Inf, -Inf, NaN cases scanning the input vectors and # substituting 0 and 1. NAs in the predictions are preserved # create vector of length obs alphas_result = zeros(length(obs)) for i in eachindex(obs, pred, alphas_result) if isnan(obs[i]) && isnan(pred[i]) # Preserve NaN in pred alphas_result[i] = NaN elseif obs[i] == 0 && pred[i] == 0 alphas_result[i] = 0 elseif obs[i] == 0 && pred[i] > 0 alphas_result[i] = 1 elseif obs[i] == 0 && pred[i] < 0 alphas_result[i] = -1 else alphas_result[i] = 1 - pred[i] / obs[i] end end return alphas_result end function MAPE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real}) # Compute the Mean Absolute Percentage Error (MAPE) between obs and pred # MAPE is defined as the mean of the absolute values of the alphas # (relative errors) times 100 alphas_result = alphas(obs, pred) return 100 * mean(abs.(alphas_result[.!isnan.(alphas_result)])) end function RRMSE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real}) # Compute the Relative Root Mean Squared Error (RRMSE) between obs and pred # RRMSE is defined as the RMSE divided by the maximum of the observations alphas_result = alphas(obs, pred) return sqrt(mean(alphas_result[.!isnan.(alphas_result)] .^ 2)) end function apply_functions_colwise(obs::DataFrame, interp::DataFrame)::DataFrame res = DataFrame() # Loop over all columns for spec in relevant_header real = obs[:, spec] pred = interp[:, spec] # Compute RRMSE and MAPE rrmse = RRMSE(real, pred) mape = MAPE(real, pred) # Append the results to the new DataFrame colname = string(spec, "_RRMSE") res[:, colname] = [rrmse] colname = string(spec, "_MAPE") res[:, colname] = [mape] end return res end function get_all_qs_files(dir::String)::Vector{String} files = readdir(dir, join=true) files = filter(x -> occursin(r"iter_\d+.qs2", x), files) return files[2:end] end function get_chemistry_data(files::Vector{String})::Dict{Int,DataFrame} res = nothing basenames = basename.(files) basenames = replace.(basenames, ".qs2" => "") its = split.(basenames, "_") its = last.(its) iterations = parse.(Int, its) @rput files R"res <- lapply(files, function(x) { dat <- qs2::qs_read(x)$C ; dat <- dat[,relevant_header] ; return(dat) })" @rget res data = Dict(zip(iterations, res)) return data end reference_dir = "$input_dir/barite_fgcs_ref_128_1/" reference_files = get_all_qs_files(reference_dir) reference_data = get_chemistry_data(reference_files) result_dirs = readdir(input_dir) df_global_data = DataFrame() for dir in result_dirs # continue if dir contains "$prefix_ref" if occursin("$(prefix)_ref", dir) || !occursin(prefix, dir) continue end metainfo = split(dir, "_") has_important = length(metainfo) == 7 aqueous = parse(Int, metainfo[3]) minerals = parse(Int, metainfo[4]) ntpn = parse(Int, metainfo[5]) files = get_all_qs_files(joinpath(input_dir, dir)) data = get_chemistry_data(files) for (iter, val) in data realval = reference_data[iter] df_local = apply_functions_colwise(realval, val) df_local[:, "Aqueous"] .= aqueous df_local[:, "Minerals"] .= minerals df_local[:, "iter"] .= iter global df_global_data = vcat(df_global_data, df_local) end end function plot_by_iteration_and_rounding(df_in::DataFrame, species::String, errtype::String) my_symbol = Symbol(species * "_" * errtype) df = select(df_in, [:AM, :iter, my_symbol]) sort!(df, :iter) max_iter = maximum(df.iter) only_last_iter = filter(row -> row.iter == max_iter, df) sort!(only_last_iter, my_symbol, rev=true) df[!, :AM] = CategoricalArray(df[!, :AM]) levels!(df[!, :AM], only_last_iter[!, :AM]) AM_CA = CategoricalArray(only_last_iter.AM) levels!(AM_CA, only_last_iter.AM) # plot p = plot(df.iter, df[!, my_symbol], group=df.AM, xlabel="Iteration", ylabel=errtype, title="$(errtype) $(species) vs. Iteration", minorgrid=true, legend=:outerright, # yticks=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3], # ylim=(1e-7, 1e-3), legendtitle="(Aqu, Min)", legendcolumns=1, legendfontsize=7, legendtitlefontsize=7,) return p end grouped_df = groupby(df_global_data, [:Aqueous, :Minerals, :iter]) # Combine all columns by maximum combine_df = combine(grouped_df, names(df_global_data) .=> maximum .=> names(df_global_data)) combine_df[!, :AM] = map(x -> "(" * string(x[1]) * ", " * string(x[2]) * ")", zip(combine_df.Aqueous, combine_df.Minerals)) CSV.write("$output_dir/eval_barite_aq_min_MAPE_RRMSE.csv", combine_df) p = plot_by_iteration_and_rounding(combine_df, "Barite", "MAPE") savefig(p, "$output_dir/MAPE_Barite.pdf") p = plot_by_iteration_and_rounding(combine_df, "Celestite", "MAPE") savefig(p, "$output_dir/MAPE_Celestite.pdf") p = plot_by_iteration_and_rounding(combine_df, "Ba", "MAPE") savefig(p, "$output_dir/MAPE_Ba.pdf")