Source code for dea_tools.validation
## validation.py
"""
Tools for validating outputs and producing accuracy assessment metrics.
License: The code in this module is licensed under the Apache License,
Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0). Digital Earth
Australia data is licensed under the Creative Commons by Attribution 4.0
license (https://creativecommons.org/licenses/by/4.0/).
Contact: If you need assistance, please post a question on the Open Data
Cube Discord chat (https://discord.com/invite/4hhBQVas5U) or on the GIS Stack
Exchange (https://gis.stackexchange.com/questions/ask?tags=open-data-cube)
using the `open-data-cube` tag (you can view previously asked questions
here: https://gis.stackexchange.com/questions/tagged/open-data-cube).
If you would like to report an issue with this script, you can file one
on GitHub (https://github.com/GeoscienceAustralia/dea-notebooks/issues/new).
Last modified: April 2023
"""
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt
from scipy import stats
[docs]
def eval_metrics(x, y, round=3, all_regress=False):
    """
    Summarise the agreement between "actual" and "predicted" values.

    The two inputs are paired element-wise, and any pair in which
    either value is missing is discarded before the following
    metrics are computed:

    - Correlation (Pearson)
    - RMSE (root mean squared error)
    - MAE (mean absolute error)
    - R-squared (square of the linear regression r-value)
    - Bias (mean of predicted minus actual)
    - Regression slope (and optionally p-value, intercept
      and standard error)

    Parameters
    ----------
    x : numpy.array
        The "actual" (reference) values.
    y : numpy.array
        The "predicted" values to compare against `x`.
    round : int
        Number of decimal places each metric is rounded to.
        Defaults to 3.
    all_regress : bool
        If True, also report the linear regression p-value,
        intercept and standard error alongside the slope.
        Defaults to False.

    Returns
    -------
    A pandas.Series of the calculated metrics, indexed by
    metric name.
    """

    # Pair the inputs in a dataframe so that a missing value in
    # either column removes the whole observation
    paired = pd.DataFrame({"x": x, "y": y}).dropna()
    actual = paired["x"]
    predicted = paired["y"]

    # Least-squares fit of predicted against actual
    fit = stats.linregress(x=actual, y=predicted)

    # Core metrics, always reported
    metrics = {}
    metrics["Correlation"] = paired.corr().iloc[0, 1]
    metrics["RMSE"] = sqrt(mean_squared_error(actual, predicted))
    metrics["MAE"] = mean_absolute_error(actual, predicted)
    metrics["R-squared"] = fit.rvalue**2
    metrics["Bias"] = (predicted - actual).mean()
    metrics["Regression slope"] = fit.slope

    # Remaining regression parameters, reported on request only
    if all_regress:
        metrics["Regression p-value"] = fit.pvalue
        metrics["Regression intercept"] = fit.intercept
        metrics["Regression standard error"] = fit.stderr

    # Return as a rounded pandas.Series
    return pd.Series(metrics).round(round)