FGCS/eval_dolomite.jl

using DataFrames, RCall, Statistics, Plots, Base.Threads, CSV, CategoricalArrays

prefix = "dolo_inter"
input_dir = "./"

output_dir = "error_eval/"

if !isdir(output_dir)
        mkdir(output_dir)
end

relevant_header = ["H", "O", "C", "Ca", "Cl", "Mg", "Calcite", "Dolomite"]
@rput relevant_header

function alphas(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
        # Workhorse function computing the "alphas" (cfr workflows/metrics)
        # for relative error measures. We first compute the alphas, then take
        # care of the +Inf, -Inf, NaN cases scanning the input vectors and
        # substituting 0 and 1. NAs in the predictions are preserved

        # create vector of length obs
        alphas_result = zeros(length(obs))

        for i in eachindex(obs, pred, alphas_result)
                if isnan(obs[i]) && isnan(pred[i])
                        # Preserve NaN in pred
                        alphas_result[i] = NaN
                elseif obs[i] == 0 && pred[i] == 0
                        alphas_result[i] = 0
                elseif obs[i] == 0 && pred[i] > 0
                        alphas_result[i] = 1
                elseif obs[i] == 0 && pred[i] < 0
                        alphas_result[i] = -1
                else
                        alphas_result[i] = 1 - pred[i] / obs[i]
                end
        end

        return alphas_result
end

function MAPE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
        # Compute the Mean Absolute Percentage Error (MAPE) between obs and pred
        # MAPE is defined as the mean of the absolute values of the alphas
        # (relative errors) times 100

        alphas_result = alphas(obs, pred)

        return 100 * mean(abs.(alphas_result[.!isnan.(alphas_result)]))
end

function RRMSE(obs::AbstractVector{<:Real}, pred::AbstractVector{<:Real})
        # Compute the Relative Root Mean Squared Error (RRMSE) between obs and pred
        # RRMSE is defined as the RMSE divided by the maximum of the observations

        alphas_result = alphas(obs, pred)

        return sqrt(mean(alphas_result[.!isnan.(alphas_result)] .^ 2))
end


function apply_functions_colwise(obs::DataFrame, interp::DataFrame)::DataFrame
        res = DataFrame()

        # Loop over all columns
        for spec in relevant_header
                real = obs[:, spec]
                pred = interp[:, spec]

                # Compute RRMSE and MAPE
                rrmse = RRMSE(real, pred)
                mape = MAPE(real, pred)

                # Append the results to the new DataFrame
                colname = string(spec, "_RRMSE")
                res[:, colname] = [rrmse]

                colname = string(spec, "_MAPE")
                res[:, colname] = [mape]
        end

        return res
end

function get_all_qs_files(dir::String)::Vector{String}
        files = readdir(dir, join=true)
        files = filter(x -> occursin(r"iter_\d+.qs2", x), files)
        return files[2:end]
end

function get_chemistry_data(files::Vector{String})::Dict{Int,DataFrame}
        res = nothing

        basenames = basename.(files)
        basenames = replace.(basenames, ".qs2" => "")
        its = split.(basenames, "_")
        its = last.(its)

        iterations = parse.(Int, its)

        @rput files
        R"res <- lapply(files, function(x) { dat <- qs2::qs_read(x)$C ; dat <- dat[,relevant_header] ; return(dat) })"
        @rget res

        data = Dict(zip(iterations, res))

        return data
end

reference_dir = "$input_dir/dolo_inter_ref_128_1/"
reference_files = get_all_qs_files(reference_dir)
reference_data = get_chemistry_data(reference_files)

result_dirs = readdir(input_dir)

# testdatadir = result_dirs[1]
#
# metainfo = split(testdatadir, "_")
#
# aqueous = parse(Int, metainfo[3])
# minerals = parse(Int, metainfo[4])
# ntpn = parse(Int, metainfo[5])
#
# files = get_all_qs_files(joinpath(input_dir, testdatadir))
# data = get_chemistry_data(files)
#
# apply_functions_colwise(reference_data[1], data[1])

df_global_data = DataFrame()

for dir in result_dirs
        # continue if dir contains "$prefix_ref"
        if occursin("$(prefix)_ref", dir) || !occursin(prefix, dir)
                continue
        end

        metainfo = split(dir, "_")

        has_important = length(metainfo) == 7

        aqueous = parse(Int, metainfo[3])
        minerals = parse(Int, metainfo[4])
        ntpn = parse(Int, metainfo[5])

        files = get_all_qs_files(joinpath(input_dir, dir))
        data = get_chemistry_data(files)

        for (iter, val) in data
                realval = reference_data[iter]

                df_local = apply_functions_colwise(realval, val)
                df_local[:, "Aqueous"] .= aqueous
                df_local[:, "Minerals"] .= minerals
                df_local[:, "iter"] .= iter

                global df_global_data = vcat(df_global_data, df_local)
        end
end

function plot_by_iteration_and_rounding(df_in::DataFrame, species::String, errtype::String)
        my_symbol = Symbol(species * "_" * errtype)

        df = select(df_in, [:AM, :iter, my_symbol])

        sort!(df, :iter)

        max_iter = maximum(df.iter)
        only_last_iter = filter(row -> row.iter == max_iter, df)
        sort!(only_last_iter, my_symbol, rev=true)
        df[!, :AM] = CategoricalArray(df[!, :AM])
        levels!(df[!, :AM], only_last_iter[!, :AM])

        AM_CA = CategoricalArray(only_last_iter.AM)
        levels!(AM_CA, only_last_iter.AM)

        # plot
        p = plot(df.iter, df[!, my_symbol], group=df.AM,
                xlabel="Iteration", ylabel=errtype, title="$(errtype) $(species) vs. Iteration",
                minorgrid=true,
                legend=:outerright,
                #  yticks=[1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
                # ylim=(1e-7, 1e-3),
                legendtitle="(Aqu, Min)",
                legendcolumns=1,
                legendfontsize=7,
                legendtitlefontsize=7,)

        return p
end

grouped_df = groupby(df_global_data, [:Aqueous, :Minerals, :iter])
# Combine all columns by maximum
combine_df = combine(grouped_df, names(df_global_data) .=> maximum .=> names(df_global_data))

combine_df[!, :AM] = map(x -> "(" * string(x[1]) * ", " * string(x[2]) * ")",
        zip(combine_df.Aqueous, combine_df.Minerals))

CSV.write("$output_dir/eval_dolomite_MAPE_RRMSE.csv", combine_df)

p = plot_by_iteration_and_rounding(combine_df, "Dolomite", "MAPE")
savefig(p, "$output_dir/MAPE_Dolomite.pdf")

p = plot_by_iteration_and_rounding(combine_df, "Calcite", "MAPE")
savefig(p, "$output_dir/MAPE_Calcite.pdf")

p = plot_by_iteration_and_rounding(combine_df, "Mg", "MAPE")
savefig(p, "$output_dir/MAPE_Mg.pdf")