FGCS/eval_dolomite.jl
2025-03-16 23:49:48 +01:00

210 lines
6.6 KiB
Julia

using DataFrames, RCall, Statistics, Plots, Base.Threads, CSV, CategoricalArrays
prefix = "dolo_inter"
input_dir = "./"
output_dir = "error_eval/"
if !isdir(output_dir)
mkdir(output_dir)
end
relevant_header = ["H", "O", "C", "Ca", "Cl", "Mg", "Calcite", "Dolomite"]
@rput relevant_header
function alphas(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Workhorse function computing the "alphas" (cfr workflows/metrics)
# for relative error measures. We first compute the alphas, then take
# care of the +Inf, -Inf, NaN cases scanning the input vectors and
# substituting 0 and 1. NAs in the predictions are preserved
# create vector of length obs
alphas_result = zeros(length(obs))
for i in eachindex(obs, pred, alphas_result)
if isnan(obs[i]) && isnan(pred[i])
# Preserve NaN in pred
alphas_result[i] = NaN
elseif obs[i] == 0 && pred[i] == 0
alphas_result[i] = 0
elseif obs[i] == 0 && pred[i] > 0
alphas_result[i] = 1
elseif obs[i] == 0 && pred[i] < 0
alphas_result[i] = -1
else
alphas_result[i] = 1 - pred[i] / obs[i]
end
end
return alphas_result
end
function MAPE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Compute the Mean Absolute Percentage Error (MAPE) between obs and pred
# MAPE is defined as the mean of the absolute values of the alphas
# (relative errors) times 100
alphas_result = alphas(obs, pred)
return 100 * mean(abs.(alphas_result[.!isnan.(alphas_result)]))
end
function RRMSE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Compute the Relative Root Mean Squared Error (RRMSE) between obs and pred
# RRMSE is defined as the RMSE divided by the maximum of the observations
alphas_result = alphas(obs, pred)
return sqrt(mean(alphas_result[.!isnan.(alphas_result)] .^ 2))
end
function apply_functions_colwise(obs::DataFrame, interp::DataFrame)::DataFrame
res = DataFrame()
# Loop over all columns
for spec in relevant_header
real = obs[:, spec]
pred = interp[:, spec]
# Compute RRMSE and MAPE
rrmse = RRMSE(real, pred)
mape = MAPE(real, pred)
# Append the results to the new DataFrame
colname = string(spec, "_RRMSE")
res[:, colname] = [rrmse]
colname = string(spec, "_MAPE")
res[:, colname] = [mape]
end
return res
end
function get_all_qs_files(dir::String)::Vector{String}
files = readdir(dir, join=true)
files = filter(x -> occursin(r"iter_\d+.qs2", x), files)
return files[2:end]
end
function get_chemistry_data(files::Vector{String})::Dict{Int,DataFrame}
res = nothing
basenames = basename.(files)
basenames = replace.(basenames, ".qs2" => "")
its = split.(basenames, "_")
its = last.(its)
iterations = parse.(Int, its)
@rput files
R"res <- lapply(files, function(x) { dat <- qs2::qs_read(x)$C ; dat <- dat[,relevant_header] ; return(dat) })"
@rget res
data = Dict(zip(iterations, res))
return data
end
reference_dir = "$input_dir/dolo_inter_ref_128_1/"
reference_files = get_all_qs_files(reference_dir)
reference_data = get_chemistry_data(reference_files)
result_dirs = readdir(input_dir)
# testdatadir = result_dirs[1]
#
# metainfo = split(testdatadir, "_")
#
# aqueous = parse(Int, metainfo[3])
# minerals = parse(Int, metainfo[4])
# ntpn = parse(Int, metainfo[5])
#
# files = get_all_qs_files(joinpath(input_dir, testdatadir))
# data = get_chemistry_data(files)
#
# apply_functions_colwise(reference_data[1], data[1])
df_global_data = DataFrame()
for dir in result_dirs
# continue if dir contains "$prefix_ref"
if occursin("$(prefix)_ref", dir) || !occursin(prefix, dir)
continue
end
metainfo = split(dir, "_")
has_important = length(metainfo) == 7
aqueous = parse(Int, metainfo[3])
minerals = parse(Int, metainfo[4])
ntpn = parse(Int, metainfo[5])
files = get_all_qs_files(joinpath(input_dir, dir))
data = get_chemistry_data(files)
for (iter, val) in data
realval = reference_data[iter]
df_local = apply_functions_colwise(realval, val)
df_local[:, "Aqueous"] .= aqueous
df_local[:, "Minerals"] .= minerals
df_local[:, "iter"] .= iter
global df_global_data = vcat(df_global_data, df_local)
end
end
function plot_by_iteration_and_rounding(df_in::DataFrame, species::String, errtype::String)
my_symbol = Symbol(species * "_" * errtype)
df = select(df_in, [:AM, :iter, my_symbol])
sort!(df, :iter)
max_iter = maximum(df.iter)
only_last_iter = filter(row -> row.iter == max_iter, df)
sort!(only_last_iter, my_symbol, rev=true)
df[!, :AM] = CategoricalArray(df[!, :AM])
levels!(df[!, :AM], only_last_iter[!, :AM])
AM_CA = CategoricalArray(only_last_iter.AM)
levels!(AM_CA, only_last_iter.AM)
# plot
p = plot(df.iter, df[!, my_symbol], group=df.AM,
xlabel="Iteration", ylabel=errtype, title="$(errtype) $(species) vs. Iteration",
minorgrid=true,
legend=:outerright,
# yticks=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
# ylim=(1e-7, 1e-3),
legendtitle="(Aqu, Min)",
legendcolumns=1,
legendfontsize=7,
legendtitlefontsize=7,)
return p
end
grouped_df = groupby(df_global_data, [:Aqueous, :Minerals, :iter])
# Combine all columns by maximum
combine_df = combine(grouped_df, names(df_global_data) .=> maximum .=> names(df_global_data))
combine_df[!, :AM] = map(x -> "(" * string(x[1]) * ", " * string(x[2]) * ")",
zip(combine_df.Aqueous, combine_df.Minerals))
CSV.write("$output_dir/eval_dolomite_MAPE_RRMSE.csv", combine_df)
p = plot_by_iteration_and_rounding(combine_df, "Dolomite", "MAPE")
savefig(p, "$output_dir/MAPE_Dolomite.pdf")
p = plot_by_iteration_and_rounding(combine_df, "Calcite", "MAPE")
savefig(p, "$output_dir/MAPE_Calcite.pdf")
p = plot_by_iteration_and_rounding(combine_df, "Mg", "MAPE")
savefig(p, "$output_dir/MAPE_Mg.pdf")