Scope
This article explains how to normalise biomarker outputs computed by
HealthMarkers functions using hm_normalize() and the
underlying normalize_vec().
Normalisation is useful before:
- Genome-wide association studies (GWAS) — inverse-normal transform is standard
- Principal component analysis (PCA) — z-score or min-max
- Polygenic score construction — z-score or robust scaling
- Visualisation — when variables are on very different scales
When to use hm_normalize() vs the built-in
normalize argument
The normalize argument is only implemented in the
insulin sensitivity functions (fasting_is(),
ogtt_is(), adipo_is(),
all_insulin_indices()). For all other domain functions —
glycemic_markers(), lipid_markers(),
renal_markers(), etc. — the normalize argument
has no effect.
hm_normalize() works post-computation on
any data frame and covers all cases.
Setup
if (requireNamespace("pkgload", quietly = TRUE)) {
pkgload::load_all()
} else {
library(HealthMarkers)
}
sim_path <- system.file("extdata", "simulated_hm_data.rds", package = "HealthMarkers")
sim <- readRDS(sim_path)[1:50, ]

Step 2 — normalise with hm_normalize()
Z-score (mean 0, sd 1)
out_z <- hm_normalize(out, cols = new_cols, method = "z")
round(head(out_z[new_cols[1:5]]), 3)
#> whr MAP BUN_Cr_ratio VLDL non_HDL
#> 1 -0.206 -1.502 -1.410 1.396 0.883
#> 2 0.631 -0.948 0.710 -1.296 0.332
#> 3 -0.762 -1.827 0.189 -0.909 1.435
#> 4 0.137 -1.567 -0.322 -1.075 -1.253
#> 5 -1.063 1.559 -0.102 -0.277 -1.105
#> 6 -0.438 -1.469 1.397 -0.676 -0.239

Rank-based inverse-normal transform (Rankit)
This is the most common choice for GWAS. The transformed values are approximately normally distributed regardless of the original distribution.
out_int <- hm_normalize(out, cols = new_cols, method = "inverse")
round(head(out_int[new_cols[1:5]]), 3)
#> whr MAP BUN_Cr_ratio VLDL non_HDL
#> 1 -0.025 -1.341 -1.645 1.341 0.739
#> 2 0.739 -0.878 0.739 -1.476 0.279
#> 3 -0.739 -1.881 0.385 -1.126 1.476
#> 4 0.385 -1.476 -0.279 -1.227 -0.954
#> 5 -1.341 1.476 0.025 -0.151 -0.806
#> 6 -0.279 -1.227 1.126 -0.739 -0.126

Use the Blom denominator instead of Rankit:
out_blom <- hm_normalize(out, cols = new_cols, method = "inverse",
invnorm_denominator = "blom")

Min-max scaling to [0, 1]
out_range <- hm_normalize(out, cols = new_cols, method = "range")
round(head(out_range[new_cols[1:5]]), 3)
#> whr MAP BUN_Cr_ratio VLDL non_HDL
#> 1 0.366 0.152 0.021 0.631 0.746
#> 2 0.559 0.288 0.553 0.046 0.586
#> 3 0.237 0.072 0.422 0.130 0.906
#> 4 0.445 0.136 0.294 0.094 0.126
#> 5 0.167 0.904 0.349 0.267 0.169
#> 6 0.312 0.160 0.725 0.181 0.420

Scale to a custom interval (e.g. [-1, 1]):
out_range2 <- hm_normalize(out, cols = new_cols, method = "range",
feature_range = c(-1, 1))

Robust median/MAD scaling
Less sensitive to outliers than z-score.
out_rob <- hm_normalize(out, cols = new_cols, method = "robust")
round(head(out_rob[new_cols[1:5]]), 3)
#> whr MAP BUN_Cr_ratio VLDL non_HDL
#> 1 -0.055 -1.878 -1.216 1.982 0.753
#> 2 0.878 -1.258 0.786 -1.363 0.290
#> 3 -0.675 -2.242 0.294 -0.881 1.217
#> 4 0.327 -1.951 -0.189 -1.087 -1.043
#> 5 -1.010 1.550 0.020 -0.096 -0.919
#> 6 -0.313 -1.841 1.434 -0.592 -0.190

Protecting covariate columns
Use skip_cols to exclude variables you want to keep on
their original scale. When cols = NULL (the default),
hm_normalize() targets all numeric columns;
skip_cols removes covariates from that set.
# Normalise all numeric columns except age, BMI, sex-encoded variables
out_protected <- hm_normalize(out, method = "inverse",
skip_cols = c("age", "BMI", "sex"))

Using normalize_vec() on a single vector
hm_normalize() calls normalize_vec()
internally. You can also use it directly:
x <- out[[new_cols[1]]]
normalize_vec(x, method = "z")[1:10]
#> [1] -0.2056805 0.6308316 -0.7621945 0.1368883 -1.0631482 -0.4375478
#> [7] -0.9907357 2.5049090 0.5142453 -0.6308403
normalize_vec(x, method = "inverse")[1:10]
#> [1] -0.02506891 0.73884685 -0.73884685 0.38532047 -1.34075503 -0.27931903
#> [7] -1.12639113 1.88079361 0.67448975 -0.43991317
normalize_vec(x, method = "range")[1:10]
#> [1] 0.3656107 0.5590158 0.2369424 0.4448141 0.1673606 0.3120021 0.1841027
#> [8] 0.9923103 0.5320606 0.2673120
normalize_vec(x, method = "robust")[1:10]
#> [1] -0.05450437 0.87759325 -0.67460932 0.32720869 -1.00995204 -0.31286630
#> [7] -0.92926521 2.96581519 0.74768494 -0.52824568

Comparison of methods
compare <- data.frame(
raw = x,
z = normalize_vec(x, "z"),
INT = normalize_vec(x, "inverse"),
range01 = normalize_vec(x, "range"),
robust = normalize_vec(x, "robust")
)
round(head(compare, 10), 3)
#> raw z INT range01 robust
#> 1 0.884 -0.206 -0.025 0.366 -0.055
#> 2 1.002 0.631 0.739 0.559 0.878
#> 3 0.806 -0.762 -0.739 0.237 -0.675
#> 4 0.932 0.137 0.385 0.445 0.327
#> 5 0.763 -1.063 -1.341 0.167 -1.010
#> 6 0.852 -0.438 -0.279 0.312 -0.313
#> 7 0.774 -0.991 -1.126 0.184 -0.929
#> 8 1.266 2.505 1.881 0.992 2.966
#> 9 0.986 0.514 0.674 0.532 0.748
#> 10 0.824 -0.631 -0.440 0.267 -0.528

Summary statistics after normalisation:
round(sapply(compare, function(v) c(mean = mean(v, na.rm = TRUE),
sd = sd(v, na.rm = TRUE),
min = min(v, na.rm = TRUE),
max = max(v, na.rm = TRUE))), 3)
#> raw z INT range01 robust
#> mean 0.913 0.000 0.000 0.413 0.175
#> sd 0.141 1.000 0.997 0.231 1.114
#> min 0.661 -1.787 -2.326 0.000 -1.817
#> max 1.271 2.538 2.326 1.000 3.003

See also
- ?hm_normalize — full argument reference
- ?normalize_vec — single-vector normalisation details
- Fasting insulin sensitivity — normalize argument used at compute time