## R code to compute the different metrics for GeoML Benchmarks, GFZ ## implementation ## ## M. De Lucia, delucia@gfz-potsdam.de ## ## Time-stamp: "Last modified 2024-04-10 23:40:08 delucia" ## Hopefully self-evident naming of the functions ## Convenience function for output of messages to stdout prepending ## name of function which called it msg_ <- function(...) { ## check if we are called from another function or interactively fname <- sys.call(-1) if (is.null(fname)) { cat(paste(..., "\n")) } else { prefix <- paste0("::", as.list(fname)[[1]], "::") cat(paste(prefix, ..., "\n")) } invisible() } MAElog <- function(actual, pred) { pred[is.na(pred)] <- 0 ind <- which(actual == 0 | pred <= 0) if (length(ind) > 0) { msg_("Removed ", length(ind)) mean(abs(log(pred[-ind]/actual[-ind])), na.rm=TRUE) } else { mean(abs(log(pred/actual)), na.rm=TRUE) } } RMSElog <- function(actual, pred, eps=FALSE) { pred[is.na(pred)] <- 0 if (length(ind <- which(actual==0 | pred<=0)) > 0) { msg_("Removed ", length(ind)) sqrt(mean(log(pred[-ind]/actual[-ind])^2, na.rm=TRUE)) } else { sqrt(mean(log(pred/actual)^2, na.rm=TRUE)) } } GMAQabs <- function(actual, pred) { pred[is.na(pred)] <- 0 ind <- which(actual==0 | pred<=0) if (length(ind) > 0) { msg_("Removed ", length(ind)) abs(1-exp(mean(log(abs(pred[-ind]/actual[-ind])), na.rm=TRUE))) } else { abs(1-exp(mean(log(abs(pred/actual)), na.rm=TRUE))) } } ## Workhorse function computing the "alphas" (cfr workflows/metrics) ## for relative error measures. We first compute the alphas, then take ## care of the +Inf, -Inf, NaN cases scanning the input vectors and ## substituting 0 and 1. NAs in the predictions are preserved Alphas <- function(actual, pred) { alphas <- 1-pred/actual alphas[actual==0 & pred==0] <- 0 alphas[actual==0 & pred > 0] <- 1 alphas[actual==0 & pred < 0] <- -1 alphas } MAPE <- function(actual, pred) { 100*mean(abs(Alphas(actual, pred)), na.rm=TRUE) } RRMSE <- function(actual, pred) { sqrt(mean(Alphas(actual, pred)^2, na.rm=TRUE)) } RMSLE <- function(actual, pred) { sqrt(mean((log((1+pred)/(1+actual))^2), na.rm=TRUE)) } MAE <- function(actual, pred) { mean(abs(actual-pred), na.rm=TRUE) } RMSE <- function(actual, pred) { sqrt(mean((actual-pred)^2, na.rm=TRUE)) } MSE <- function(actual, pred) { mean((actual-pred)^2, na.rm=TRUE) } ## Function is called "R2" but it actually computes "1-R2" (note that ## "1-rss/tss" is missing) R2 <- function(actual, pred) { rss <- sum((actual - pred)^2, na.rm=TRUE) tss <- sum((pred - mean(pred, na.rm=TRUE))^2, na.rm=TRUE) rss/tss } Range <- function(x) { diff(range(x, na.rm=TRUE)) } ## 2024-03-08: fixed Range of actual instead of pred, thanks @Mary Normsupnorm<- function(actual, pred) { max(abs(actual - pred), na.rm=TRUE)/Range(actual) } ## This function computes all the measures at once in the (hopefully) ## expected right order. You need to loop this function over all ## variables. Metrics <- function(actual, pred) { ## MAE MSE RMSE 1-R2 NMAE NRMSE Normsupnorm MAElog RMSElog RMSLE GMAQabs MAPE RRMSE NNEG Range alphas <- Alphas(actual, pred) ran_pred <- Range(pred) ran_actu <- Range(actual) ## 2024-03-08: same fix as in Normsupnorm ret <- c( MAE = MAE(actual, pred), MSE = MSE(actual, pred), RMSE = RMSE(actual, pred), '1-R2' = R2(actual, pred), NMAE = MAE(actual, pred)/ran_actu, NRMSE = RMSE(actual, pred)/ran_actu, Normsupnorm = Normsupnorm(actual, pred), MAElog = MAElog(actual,pred), RMSElog = RMSElog(actual,pred), RMSLE = RMSLE(actual, pred), GMAQabs = GMAQabs(actual, pred), MAPE = MAPE(actual, pred), RRMSE = RRMSE(actual, pred), NNEG = NNEG(pred), Range = ran_pred ) ret } ## Function applying Metrics() on all columns of two data.frames ## having the same name DoAllMetrics <- function(vals, preds) { ## Match column names (=variables) in both data.frames vars <- intersect(colnames(vals), colnames(preds)) msg_("Going to compute metrics for: ", paste(vars, collapse=', ')) ## Some lambda calculus combined with R ugliness as.data.frame(t(sapply(vars, function(x) Metrics(actual = vals[[x]], pred = preds[[x]])))) } ### Store the computed metrics into a .csv file. First format all ### columns with "formatC" specifying 7 digits, and then use function ### "data.table::fwrite" with option "scipen=1" to try and use the ### scientific/exponential notation as much as possible. The empty ### spaces resulting from padding are removed too. WriteMetrics <- function(tab, fileout) { require(data.table) tocsv <- data.frame(Output=rownames(tab), gsub(" ", "", sapply(tab, formatC, digits = 7)), check.names = FALSE) data.table::fwrite(tocsv, file=fileout, scipen=1) } ################################################################## ### ### ### Usage Example ### ### ### ################################################################## ### We start by sourcing these functions ## source("./Metrics.R") ### NOTE: I use the extension package data.table to read and write the ### .csv files. ## require(data.table) ### NOTE2: we expect named columns in the .csv - with header! ### Example to compute all metrics at once for from a file - check colnames! ## Trues <- data.table::fread("./Trues.csv", header=TRUE, data.table = FALSE) ### Load the simulation results dataset ## Preds <- data.table::fread("./MyPreds.csv", header=TRUE, data.table = FALSE) ## res <- DoAllMetrics(Val, Preds) ### Write out the metrics ## WriteMetrcs(res, file="MyMetrics.csv")