Add new dataset and missing scripts

This commit is contained in:
Max Luebke 2025-03-16 23:49:48 +01:00
parent 5718ace0c1
commit 24a3ef2d4b
771 changed files with 1286 additions and 20 deletions

194
eval_barite.jl Normal file
View File

@ -0,0 +1,194 @@
using DataFrames, RCall, Statistics, Plots, Base.Threads, CSV, CategoricalArrays
prefix = "barite_fgcs"
input_dir = "./"
output_dir = "error_eval/"
if !isdir(output_dir)
mkdir(output_dir)
end
relevant_header = ["H", "O", "Ba", "Cl", "S", "Sr", "Barite", "Celestite"]
@rput relevant_header
function alphas(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Workhorse function computing the "alphas" (cfr workflows/metrics)
# for relative error measures. We first compute the alphas, then take
# care of the +Inf, -Inf, NaN cases scanning the input vectors and
# substituting 0 and 1. NAs in the predictions are preserved
# create vector of length obs
alphas_result = zeros(length(obs))
for i in eachindex(obs, pred, alphas_result)
if isnan(obs[i]) && isnan(pred[i])
# Preserve NaN in pred
alphas_result[i] = NaN
elseif obs[i] == 0 && pred[i] == 0
alphas_result[i] = 0
elseif obs[i] == 0 && pred[i] > 0
alphas_result[i] = 1
elseif obs[i] == 0 && pred[i] < 0
alphas_result[i] = -1
else
alphas_result[i] = 1 - pred[i] / obs[i]
end
end
return alphas_result
end
function MAPE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Compute the Mean Absolute Percentage Error (MAPE) between obs and pred
# MAPE is defined as the mean of the absolute values of the alphas
# (relative errors) times 100
alphas_result = alphas(obs, pred)
return 100 * mean(abs.(alphas_result[.!isnan.(alphas_result)]))
end
function RRMSE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Compute the Relative Root Mean Squared Error (RRMSE) between obs and pred
# RRMSE is defined as the RMSE divided by the maximum of the observations
alphas_result = alphas(obs, pred)
return sqrt(mean(alphas_result[.!isnan.(alphas_result)] .^ 2))
end
function apply_functions_colwise(obs::DataFrame, interp::DataFrame)::DataFrame
res = DataFrame()
# Loop over all columns
for spec in relevant_header
real = obs[:, spec]
pred = interp[:, spec]
# Compute RRMSE and MAPE
rrmse = RRMSE(real, pred)
mape = MAPE(real, pred)
# Append the results to the new DataFrame
colname = string(spec, "_RRMSE")
res[:, colname] = [rrmse]
colname = string(spec, "_MAPE")
res[:, colname] = [mape]
end
return res
end
function get_all_qs_files(dir::String)::Vector{String}
files = readdir(dir, join=true)
files = filter(x -> occursin(r"iter_\d+.qs2", x), files)
return files[2:end]
end
function get_chemistry_data(files::Vector{String})::Dict{Int,DataFrame}
res = nothing
basenames = basename.(files)
basenames = replace.(basenames, ".qs2" => "")
its = split.(basenames, "_")
its = last.(its)
iterations = parse.(Int, its)
@rput files
R"res <- lapply(files, function(x) { dat <- qs2::qs_read(x)$C ; dat <- dat[,relevant_header] ; return(dat) })"
@rget res
data = Dict(zip(iterations, res))
return data
end
reference_dir = "$input_dir/barite_fgcs_ref_128_1/"
reference_files = get_all_qs_files(reference_dir)
reference_data = get_chemistry_data(reference_files)
result_dirs = readdir(input_dir)
df_global_data = DataFrame()
for dir in result_dirs
# continue if dir contains "$prefix_ref"
if occursin("$(prefix)_ref", dir) || !occursin(prefix, dir)
continue
end
metainfo = split(dir, "_")
has_important = length(metainfo) == 7
aqueous = parse(Int, metainfo[3])
minerals = parse(Int, metainfo[4])
ntpn = parse(Int, metainfo[5])
files = get_all_qs_files(joinpath(input_dir, dir))
data = get_chemistry_data(files)
for (iter, val) in data
realval = reference_data[iter]
df_local = apply_functions_colwise(realval, val)
df_local[:, "Aqueous"] .= aqueous
df_local[:, "Minerals"] .= minerals
df_local[:, "iter"] .= iter
global df_global_data = vcat(df_global_data, df_local)
end
end
function plot_by_iteration_and_rounding(df_in::DataFrame, species::String, errtype::String)
my_symbol = Symbol(species * "_" * errtype)
df = select(df_in, [:AM, :iter, my_symbol])
sort!(df, :iter)
max_iter = maximum(df.iter)
only_last_iter = filter(row -> row.iter == max_iter, df)
sort!(only_last_iter, my_symbol, rev=true)
df[!, :AM] = CategoricalArray(df[!, :AM])
levels!(df[!, :AM], only_last_iter[!, :AM])
AM_CA = CategoricalArray(only_last_iter.AM)
levels!(AM_CA, only_last_iter.AM)
# plot
p = plot(df.iter, df[!, my_symbol], group=df.AM,
xlabel="Iteration", ylabel=errtype, title="$(errtype) $(species) vs. Iteration",
minorgrid=true,
legend=:outerright,
# yticks=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
# ylim=(1e-7, 1e-3),
legendtitle="(Aqu, Min)",
legendcolumns=1,
legendfontsize=7,
legendtitlefontsize=7,)
return p
end
grouped_df = groupby(df_global_data, [:Aqueous, :Minerals, :iter])
# Combine all columns by maximum
combine_df = combine(grouped_df, names(df_global_data) .=> maximum .=> names(df_global_data))
combine_df[!, :AM] = map(x -> "(" * string(x[1]) * ", " * string(x[2]) * ")",
zip(combine_df.Aqueous, combine_df.Minerals))
CSV.write("$output_dir/eval_barite_aq_min_MAPE_RRMSE.csv", combine_df)
p = plot_by_iteration_and_rounding(combine_df, "Barite", "MAPE")
savefig(p, "$output_dir/MAPE_Barite.pdf")
p = plot_by_iteration_and_rounding(combine_df, "Celestite", "MAPE")
savefig(p, "$output_dir/MAPE_Celestite.pdf")
p = plot_by_iteration_and_rounding(combine_df, "Ba", "MAPE")
savefig(p, "$output_dir/MAPE_Ba.pdf")

209
eval_dolomite.jl Normal file
View File

@ -0,0 +1,209 @@
using DataFrames, RCall, Statistics, Plots, Base.Threads, CSV, CategoricalArrays
prefix = "dolo_inter"
input_dir = "./"
output_dir = "error_eval/"
if !isdir(output_dir)
mkdir(output_dir)
end
relevant_header = ["H", "O", "C", "Ca", "Cl", "Mg", "Calcite", "Dolomite"]
@rput relevant_header
function alphas(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Workhorse function computing the "alphas" (cfr workflows/metrics)
# for relative error measures. We first compute the alphas, then take
# care of the +Inf, -Inf, NaN cases scanning the input vectors and
# substituting 0 and 1. NAs in the predictions are preserved
# create vector of length obs
alphas_result = zeros(length(obs))
for i in eachindex(obs, pred, alphas_result)
if isnan(obs[i]) && isnan(pred[i])
# Preserve NaN in pred
alphas_result[i] = NaN
elseif obs[i] == 0 && pred[i] == 0
alphas_result[i] = 0
elseif obs[i] == 0 && pred[i] > 0
alphas_result[i] = 1
elseif obs[i] == 0 && pred[i] < 0
alphas_result[i] = -1
else
alphas_result[i] = 1 - pred[i] / obs[i]
end
end
return alphas_result
end
function MAPE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Compute the Mean Absolute Percentage Error (MAPE) between obs and pred
# MAPE is defined as the mean of the absolute values of the alphas
# (relative errors) times 100
alphas_result = alphas(obs, pred)
return 100 * mean(abs.(alphas_result[.!isnan.(alphas_result)]))
end
function RRMSE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
# Compute the Relative Root Mean Squared Error (RRMSE) between obs and pred
# RRMSE is defined as the RMSE divided by the maximum of the observations
alphas_result = alphas(obs, pred)
return sqrt(mean(alphas_result[.!isnan.(alphas_result)] .^ 2))
end
function apply_functions_colwise(obs::DataFrame, interp::DataFrame)::DataFrame
res = DataFrame()
# Loop over all columns
for spec in relevant_header
real = obs[:, spec]
pred = interp[:, spec]
# Compute RRMSE and MAPE
rrmse = RRMSE(real, pred)
mape = MAPE(real, pred)
# Append the results to the new DataFrame
colname = string(spec, "_RRMSE")
res[:, colname] = [rrmse]
colname = string(spec, "_MAPE")
res[:, colname] = [mape]
end
return res
end
function get_all_qs_files(dir::String)::Vector{String}
files = readdir(dir, join=true)
files = filter(x -> occursin(r"iter_\d+.qs2", x), files)
return files[2:end]
end
function get_chemistry_data(files::Vector{String})::Dict{Int,DataFrame}
res = nothing
basenames = basename.(files)
basenames = replace.(basenames, ".qs2" => "")
its = split.(basenames, "_")
its = last.(its)
iterations = parse.(Int, its)
@rput files
R"res <- lapply(files, function(x) { dat <- qs2::qs_read(x)$C ; dat <- dat[,relevant_header] ; return(dat) })"
@rget res
data = Dict(zip(iterations, res))
return data
end
reference_dir = "$input_dir/dolo_inter_ref_128_1/"
reference_files = get_all_qs_files(reference_dir)
reference_data = get_chemistry_data(reference_files)
result_dirs = readdir(input_dir)
# testdatadir = result_dirs[1]
#
# metainfo = split(testdatadir, "_")
#
# aqueous = parse(Int, metainfo[3])
# minerals = parse(Int, metainfo[4])
# ntpn = parse(Int, metainfo[5])
#
# files = get_all_qs_files(joinpath(input_dir, testdatadir))
# data = get_chemistry_data(files)
#
# apply_functions_colwise(reference_data[1], data[1])
df_global_data = DataFrame()
for dir in result_dirs
# continue if dir contains "$prefix_ref"
if occursin("$(prefix)_ref", dir) || !occursin(prefix, dir)
continue
end
metainfo = split(dir, "_")
has_important = length(metainfo) == 7
aqueous = parse(Int, metainfo[3])
minerals = parse(Int, metainfo[4])
ntpn = parse(Int, metainfo[5])
files = get_all_qs_files(joinpath(input_dir, dir))
data = get_chemistry_data(files)
for (iter, val) in data
realval = reference_data[iter]
df_local = apply_functions_colwise(realval, val)
df_local[:, "Aqueous"] .= aqueous
df_local[:, "Minerals"] .= minerals
df_local[:, "iter"] .= iter
global df_global_data = vcat(df_global_data, df_local)
end
end
function plot_by_iteration_and_rounding(df_in::DataFrame, species::String, errtype::String)
my_symbol = Symbol(species * "_" * errtype)
df = select(df_in, [:AM, :iter, my_symbol])
sort!(df, :iter)
max_iter = maximum(df.iter)
only_last_iter = filter(row -> row.iter == max_iter, df)
sort!(only_last_iter, my_symbol, rev=true)
df[!, :AM] = CategoricalArray(df[!, :AM])
levels!(df[!, :AM], only_last_iter[!, :AM])
AM_CA = CategoricalArray(only_last_iter.AM)
levels!(AM_CA, only_last_iter.AM)
# plot
p = plot(df.iter, df[!, my_symbol], group=df.AM,
xlabel="Iteration", ylabel=errtype, title="$(errtype) $(species) vs. Iteration",
minorgrid=true,
legend=:outerright,
# yticks=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
# ylim=(1e-7, 1e-3),
legendtitle="(Aqu, Min)",
legendcolumns=1,
legendfontsize=7,
legendtitlefontsize=7,)
return p
end
grouped_df = groupby(df_global_data, [:Aqueous, :Minerals, :iter])
# Combine all columns by maximum
combine_df = combine(grouped_df, names(df_global_data) .=> maximum .=> names(df_global_data))
combine_df[!, :AM] = map(x -> "(" * string(x[1]) * ", " * string(x[2]) * ")",
zip(combine_df.Aqueous, combine_df.Minerals))
CSV.write("$output_dir/eval_dolomite_MAPE_RRMSE.csv", combine_df)
p = plot_by_iteration_and_rounding(combine_df, "Dolomite", "MAPE")
savefig(p, "$output_dir/MAPE_Dolomite.pdf")
p = plot_by_iteration_and_rounding(combine_df, "Calcite", "MAPE")
savefig(p, "$output_dir/MAPE_Calcite.pdf")
p = plot_by_iteration_and_rounding(combine_df, "Mg", "MAPE")
savefig(p, "$output_dir/MAPE_Mg.pdf")

View File

@ -1,4 +1,4 @@
using CSV, DataFrames, Statistics, Base.cumsum
using CSV, DataFrames, Statistics
function read_csvs(csv_files)
full_vec = []
@ -43,13 +43,50 @@ p = plot(mean_vec .* 1e3,
linecolor=:black,
)
for i in 1:length(mean_intervalls)-1
plot!(p, [mean_intervalls[i]; mean_intervalls[i+1]], [mean_cummulative[i]; mean_cummulative[i]],
linecolor=:red,
linewidth=1,
)
end
display(p)
savefig(p, "phreeeqc_per_iteration_barite.pdf")
mean(mean_vec) * 1e3
input_dir = "phreeqc_dolo/"
csv_files = readdir(input_dir, join=true)
csv_files = filter(x -> occursin(r"worker_timings_\d+.csv", x), csv_files)
mat = read_csvs(csv_files)
# calculate the mean of each row
mean_vec = vec(mean(mat; dims=2))
mean_intervalls = collect(0:1000:10000)
# calculate the mean of each intervall
mean_cummulative = [mean(@view mean_vec[mean_intervalls[i]+1:mean_intervalls[i+1]])
for i in 1:length(mean_intervalls)-1] .* 1e3
using Plots
p = plot(mean_vec,
xlabel="Iteration",
ylabel="Runimte [ms]",
xguidefontsize=8,
yguidefontsize=8,
xtickfontsize=6,
ytickfontsize=6,
legend=false,
formatter=:plain,
yminorgrid=true,
# ylimits=(0, 16),
# yticks=collect(0:2:16),
yminorticks=4,
linecolor=:black,
)
display(p)
savefig(p, "phreeeqc_per_iteration_barite.pdf")
mean(mean_vec) * 1e6
maximum(mean_vec) * 1e6
minimum(mean_vec) * 1e6

View File

@ -221,26 +221,40 @@ max_count = get_count_of_id2("barite/barite_fgcs_ref_128_3/iter_0000.qs2")
interp_3_1 = df_interp[(3, 1)]
plot(interp_3_1[!, :Calls],
color=:black,
interp_3_1[:, :RelCalls] = interp_3_1[!, :Calls] ./ max_count
interp_3_1[:, :RelCached] = interp_3_1[!, :Cached] ./ interp_3_1[!, :Calls]
interp_3_1[:, :iter] = 1:nrow(interp_3_1)
plot_df = stack(interp_3_1, [:RelCalls, :RelCached], variable_name=:variable, value_name=:value)
rename_dict = Dict("RelCalls" => "Interpolation Calls", "RelCached" => "O/W Cached Sets")
plot_df[!, :variable] = map(x -> rename_dict[x], plot_df[!, :variable])
plot_df[!, :variable] = CategoricalArray(plot_df[!, :variable])
levels!(plot_df[!, :variable], ["Interpolation Calls", "O/W Cached Sets"])
my_colors = palette(:viridis, 3)[2:3]
color_dict = Dict("Interpolation Calls" => my_colors[1], "O/W Cached Sets" => my_colors[2])
plot_df[!, :Colors] = map(x -> color_dict[x], plot_df[!, :variable])
plot(plot_df[!, :iter],
plot_df[!, :value] * 100,
group=plot_df[!, :variable],
color=plot_df[!, :Colors],
xguidefontsize=8,
yguidefontsize=8,
xtickfontsize=6,
ytickfontsize=6,
legend=false,
ylimits=(68000, 82000),
ylimits=(0, 100),
yminorgrid=true,
yminorticks=4,
formatter=:plain,
ylabel="Interpolation Calls",
linewidth=2,
ylabel="Relative Count [%]",
xlabel="Iteration")
hline!([max_count],
color=:red,
linestyle=:dash,
label="Mean")
savefig("$output_dir/interp_calls_3_1_clipped.pdf")
savefig("$output_dir/interp_calls_barite_3_1.pdf")
interp_3_1 = df_timings[df_timings[!, :Type].=="(3,1)", :]
interp_ref = df_timings[df_timings[!, :Type].=="Reference", :]
@ -305,9 +319,54 @@ bar(df_timings[!, :Type], df_timings[!, :Energy],
savefig("$output_dir/energy_dolomite.pdf")
interp_3_3 = df_interp[(3, 3)]
max_count = 400 * 400
interp_3_3[:, :RelCalls] = interp_3_3[!, :Calls] ./ max_count
interp_3_3[:, :RelCached] = interp_3_3[!, :Cached] ./ interp_3_3[!, :Calls]
interp_3_3[:, :iter] = 1:nrow(interp_3_3)
plot_df = stack(interp_3_3, [:RelCalls, :RelCached], variable_name=:variable, value_name=:value)
rename_dict = Dict("RelCalls" => "Interpolation Calls", "RelCached" => "O/W Cached Sets")
plot_df[!, :variable] = map(x -> rename_dict[x], plot_df[!, :variable])
plot_df[!, :variable] = CategoricalArray(plot_df[!, :variable])
levels!(plot_df[!, :variable], ["Interpolation Calls", "O/W Cached Sets"])
my_colors = palette(:viridis, 3)[2:3]
color_dict = Dict("Interpolation Calls" => my_colors[1], "O/W Cached Sets" => my_colors[2])
plot_df[!, :Colors] = map(x -> color_dict[x], plot_df[!, :variable])
plot(plot_df[!, :iter],
plot_df[!, :value] * 100,
group=plot_df[!, :variable],
color=plot_df[!, :Colors],
xguidefontsize=8,
yguidefontsize=8,
xtickfontsize=6,
ytickfontsize=6,
ylimits=(0, 100),
yminorgrid=true,
yminorticks=4,
linewidth=2,
formatter=:plain,
ylabel="Relative Count [%]",
xlabel="Iteration")
savefig("$output_dir/interp_calls_dolomite_3_3.pdf")
# get timings for (4,3) and Reference
interp_4_3 = df_timings[df_timings[!, :Type].=="(4,3)", :]
interp_ref = df_timings[df_timings[!, :Type].=="Reference", :]
interp_4_3.Chemistry / interp_ref.Chemistry
interp_4_3.Energy / interp_ref.Energy
1 - (interp_4_3.Chemistry/interp_ref.Chemistry)[1]
1 - (interp_4_3.Energy/interp_ref.Energy)[1]
interp_3_3 = df_timings[df_timings[!, :Type].=="(3,3)", :]
1 - (interp_3_3.Chemistry/interp_ref.Chemistry)[1]
1 - (interp_3_3.Energy/interp_ref.Energy)[1]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More