library(tidyverse)
library(lubridate)
# Step 1: Start with SDTM DM
adsl <- sdtm_dm |>
mutate(
# Age group
AGEGR1 = case_when(
AGE < 40 ~ "<40",
AGE >= 40 & AGE < 55 ~ "40-54",
AGE >= 55 & AGE < 65 ~ "55-64",
AGE >= 65 ~ ">=65"
),
# Treatment variables
TRT01P = ARM,
TRT01PN = case_when(
ARMCD == "DRUG1" ~ 1,
ARMCD == "DRUG2" ~ 2,
ARMCD == "PLACEBO" ~ 3
),
TRT01A = ARM,
TRT01AN = TRT01PN,
# Dates
RANDDT = RFSTDTC,
TRTSDT = RFSTDTC,
# Population flags
SAFFL = "Y",
ITTFL = "Y",
PPROTFL = "Y"
)
# Step 2: Derive baseline HbA1c from SDTM LB
# Baseline = last non-missing value on or before randomisation date
hba1c_baseline <- sdtm_lb |>
filter(LBTESTCD == "HBA1C") |>
left_join(sdtm_dm |> select(USUBJID, RFSTDTC), by = "USUBJID") |>
filter(LBDTC <= RFSTDTC) |> # On or before randomisation
group_by(USUBJID) |>
slice_max(LBDTC, n = 1) |> # Last value before/at randomisation
ungroup() |>
select(USUBJID, HBA1CBL = LBSTRESN)
# Step 3: Derive BMI from raw demographics
bmi_baseline <- raw_demographics |>
mutate(
USUBJID = paste("GLPX-001", site, pt_id, sep = "-"),
BMIBL = round(weight_kg / (height_cm / 100)^2, 1)
) |>
select(USUBJID, BMIBL, WTBL = weight_kg, HTBL = height_cm)
# Step 4: Merge everything onto ADSL
adsl <- adsl |>
left_join(hba1c_baseline, by = "USUBJID") |>
left_join(bmi_baseline, by = "USUBJID") |>
select(STUDYID, USUBJID, SUBJID, SITEID,
AGE, AGEGR1, AGEU, SEX, RACE, ETHNIC,
TRT01P, TRT01PN, TRT01A, TRT01AN,
ARMCD, ARM, RANDDT, TRTSDT,
SAFFL, ITTFL, PPROTFL,
HBA1CBL, BMIBL, WTBL, HTBL)
# Preview
adsl |>
select(USUBJID, AGE, AGEGR1, SEX, TRT01P, HBA1CBL, BMIBL, SAFFL) |>
print()3 Module 3: ADaM Deep Dive
Clinical Data Science for Pharma β CDISC From Scratch
3.1 What We Do in This Module
In Module 2 we built SDTM β the standardised collected data. Now we build ADaM β the analysis-ready data derived from SDTM.
By the end of this module you will:
- Understand the ADaM principles: traceability, one record per analysis need
- Build ADSL β the subject-level dataset that every other ADaM merges with
- Build ADLB β the lab dataset with baseline, change from baseline, and analysis flags
- Understand analysis flags and population flags
- Be ready to produce TLFs from ADaM in Module 4
We continue with our running trial: GLPX-001.
3.2 ADaM Principles: Read These First
Before writing a single line of code, understand these three rules. They govern every ADaM dataset you will ever build.
3.2.1 Rule 1: Traceability
Every derived variable must be traceable back to SDTM. If an FDA reviewer asks βwhere did this number come from?β you must be able to answer with a specific SDTM variable and dataset.
This means: never invent values. Every derivation must follow a rule documented in the Statistical Analysis Plan (SAP).
3.2.2 Rule 2: One record per analysis need
SDTM has one row per observation. ADaM has one row per analysis-relevant observation. These are not always the same.
For example: SDTM LB has a row for every lab result ever collected. ADLB may have additional derived rows (e.g., a βbaselineβ record that combines information from multiple SDTM records).
3.2.3 Rule 3: ADSL is built first β always
ADSL (Subject-Level Analysis Dataset) contains one row per subject and all key baseline and treatment variables. Every other ADaM dataset merges with ADSL to pick up subject-level information.
Build ADSL first. Always.
3.3 Part 1: Building ADSL
3.3.1 What ADSL contains
ADSL has exactly one row per subject. It is the master subject-level dataset. Key variable groups:
| Group | Examples | Purpose |
|---|---|---|
| Identifiers | USUBJID, SUBJID, SITEID |
Link to SDTM |
| Demographics | AGE, AGEGR1, SEX, RACE |
Subgroup analyses |
| Treatment | TRT01P, TRT01A, RANDDT |
What drug, when |
| Timing | TRTSDT, TRTEDTM, EOSDT |
Trial period dates |
| Population flags | SAFFL, ITTFL, PPROTFL |
Who is in which analysis |
| Baseline values | HBA1CBL, BMIBL, WTBL |
Baseline characteristics |
3.3.2 Population flags β a critical concept
Every clinical trial defines analysis populations. The most common:
| Flag | Population | Definition |
|---|---|---|
SAFFL |
Safety | Randomised + received at least one dose |
ITTFL |
Intent-to-Treat | All randomised subjects |
PPROTFL |
Per-Protocol | Completed trial without major protocol deviations |
These flags drive which subjects are included in each analysis. A subject might be in the ITT population but excluded from the per-protocol population due to a protocol deviation.
In our GLPX-001 trial we assume all subjects received treatment, so SAFFL = ITTFL = βYβ for all.
3.3.3 Build ADSL in SAS
/*=============================================================
GLPX-001 | Module 3 | Build ADaM ADSL
=============================================================*/
/* Step 1: Start with SDTM DM as the base */
data adam.adsl_base;
set sdtm.dm;
/* Carry forward identifiers and demographics */
/* These come directly from DM β full traceability */
/* Age group derivation */
if AGE < 40 then AGEGR1 = "<40";
else if 40 <= AGE < 55 then AGEGR1 = "40-54";
else if 55 <= AGE < 65 then AGEGR1 = "55-64";
else if AGE >= 65 then AGEGR1 = ">=65";
/* Treatment variables */
/* TRT01P = Planned Treatment Period 1 */
TRT01P = ARM;
TRT01PN = .; /* Numeric code β useful for sorting in TLFs */
if ARMCD = "DRUG1" then TRT01PN = 1;
else if ARMCD = "DRUG2" then TRT01PN = 2;
else if ARMCD = "PLACEBO" then TRT01PN = 3;
/* Actual treatment (same as planned β no switches) */
TRT01A = ARM;
TRT01AN = TRT01PN;
/* Randomisation date from RFSTDTC */
RANDDT = RFSTDTC;
/* Treatment start date (same as randomisation in this trial) */
TRTSDT = RFSTDTC;
/* Population flags β all subjects received treatment */
SAFFL = "Y";
ITTFL = "Y";
PPROTFL = "Y"; /* Simplified: assume no protocol deviations */
keep STUDYID USUBJID SUBJID SITEID
AGE AGEGR1 AGEU SEX RACE ETHNIC
TRT01P TRT01PN TRT01A TRT01AN
ARMCD ARM RANDDT TRTSDT
SAFFL ITTFL PPROTFL RFSTDTC;
run;
/* Step 2: Derive baseline lab values from SDTM LB
Baseline = last non-missing value on or before randomisation date */
proc sql;
create table adam.adsl_labs as
select
dm.USUBJID,
/* HbA1c at baseline */
max(case when lb.LBTESTCD = "HBA1C"
and lb.LBDTC <= dm.RFSTDTC
then lb.LBSTRESN else . end) as HBA1CBL,
/* BMI at baseline β derived from weight and height in DM */
round(dm_w.weight_kg / ((dm_w.height_cm/100)**2), 0.1) as BMIBL
from sdtm.dm as dm
left join sdtm.lb as lb on dm.USUBJID = lb.USUBJID
left join raw.demographics as dm_w on dm.SUBJID = dm_w.pt_id
group by dm.USUBJID, dm_w.weight_kg, dm_w.height_cm;
quit;
/* Step 3: Merge baseline labs onto ADSL */
proc sort data=adam.adsl_base; by USUBJID; run;
proc sort data=adam.adsl_labs; by USUBJID; run;
data adam.adsl;
merge adam.adsl_base (in=a)
adam.adsl_labs (in=b);
by USUBJID;
if a; /* Keep all subjects from base, whether or not labs merge */
run;
proc print data=adam.adsl (obs=5);
var USUBJID AGE AGEGR1 SEX TRT01P HBA1CBL BMIBL SAFFL ITTFL;
title "ADSL β First 5 Subjects";
run;
3.3.4 Build ADSL in R
3.4 What ADSL looks like
| USUBJID | AGE | AGEGR1 | SEX | TRT01P | HBA1CBL | BMIBL | SAFFL | ITTFL |
|---|---|---|---|---|---|---|---|---|
| GLPX-001-001-0001 | 54 | 40-54 | M | Drug 1mg | 8.2 | 31.2 | Y | Y |
| GLPX-001-001-0002 | 62 | 55-64 | F | Drug 2mg | 9.0 | 28.7 | Y | Y |
| GLPX-001-001-0003 | 48 | 40-54 | F | Placebo | 7.8 | 33.7 | Y | Y |
4 π§ͺ Exercise 3.1 β Build ADSL
5 Part 2: Building ADLB
ADLB is derived from SDTM LB and is designed to support all lab-related analyses. The most important additions compared to SDTM LB are:
| New variable | Meaning |
|---|---|
PARAM |
Full parameter label |
PARAMCD |
Parameter code |
AVAL |
Analysis value (numeric) |
BASE |
Baseline value |
CHG |
Change from baseline (AVAL β BASE) |
PCHG |
Percent change from baseline |
ANL01FL |
Analysis flag β Y = include in primary analysis |
BASEFL |
Flag for the baseline record itself |
ABLFL |
Actual baseline flag |
5.1 Defining baseline in ADLB
Baseline is defined in the SAP. For GLPX-001:
Baseline is defined as the last non-missing value collected on or before the date of first dose.
This is critical: baseline is an analytical decision, not just the screening visit. If a subject has a lab result on day -3 and another on day -1, the baseline is the day -1 value (last before dose).
5.2 Build ADLB in SAS
/*=============================================================
GLPX-001 | Module 3 | Build ADaM ADLB
=============================================================*/
/* Step 1: Start from SDTM LB, merge ADSL for subject-level vars */
proc sort data=sdtm.lb; by USUBJID; run;
proc sort data=adam.adsl; by USUBJID; run;
data adlb_step1;
merge sdtm.lb (in=a)
adam.adsl (in=b
keep=USUBJID TRT01P TRT01PN TRT01A TRT01AN
TRTSDT SAFFL ITTFL);
by USUBJID;
if a;
/* Parameter code and label */
PARAMCD = LBTESTCD;
select (LBTESTCD);
when ("HBA1C") PARAM = "HbA1c (%)";
when ("GLUC") PARAM = "Glucose (mmol/L)";
otherwise PARAM = LBTEST;
end;
/* Analysis value = standardised numeric result */
AVAL = LBSTRESN;
ADT = LBDTC; /* Analysis date */
/* Baseline flag: last record on or before treatment start */
if LBDTC <= TRTSDT then ABLFL_CAND = 1;
else ABLFL_CAND = 0;
run;
/* Step 2: Identify the actual baseline record per subject per param */
proc sort data=adlb_step1;
by USUBJID PARAMCD descending LBDTC;
run;
data adlb_step2;
set adlb_step1;
by USUBJID PARAMCD;
/* First record per subject+param in descending date order
that is on/before treatment start = the baseline */
if first.PARAMCD and ABLFL_CAND = 1 then ABLFL = "Y";
else ABLFL = "";
run;
/* Step 3: Merge baseline value onto all records */
data baseline_vals;
set adlb_step2;
where ABLFL = "Y";
keep USUBJID PARAMCD AVAL;
rename AVAL = BASE;
run;
proc sort data=adlb_step2; by USUBJID PARAMCD; run;
proc sort data=baseline_vals; by USUBJID PARAMCD; run;
data adlb_step3;
merge adlb_step2 (in=a)
baseline_vals (in=b);
by USUBJID PARAMCD;
if a;
/* Change from baseline */
if BASE ne . and AVAL ne . then do;
CHG = AVAL - BASE;
PCHG = (CHG / BASE) * 100;
end;
/* Analysis flag: include post-baseline records in primary analysis */
if ABLFL ne "Y" and AVAL ne . then ANL01FL = "Y";
else ANL01FL = "";
run;
/* Step 4: Add analysis visit labels */
data adam.adlb;
set adlb_step3;
/* Map visit number to analysis visit */
select (VISITNUM);
when (1) AVISIT = "Baseline";
when (2) AVISIT = "Week 13";
when (3) AVISIT = "Week 26";
when (4) AVISIT = "Week 52";
otherwise AVISIT = VISIT;
end;
AVISITN = VISITNUM;
keep STUDYID USUBJID PARAMCD PARAM AVISIT AVISITN
ADT AVAL BASE CHG PCHG
ABLFL ANL01FL
TRT01P TRT01PN SAFFL ITTFL;
run;
proc print data=adam.adlb (obs=10);
var USUBJID PARAMCD AVISIT AVAL BASE CHG ANL01FL;
title "ADLB β First 10 Records";
run;
5.3 Build ADLB in R
# Step 1: Join SDTM LB with ADSL subject-level variables
adlb_step1 <- sdtm_lb |>
left_join(
adsl |> select(USUBJID, TRT01P, TRT01PN, TRTSDT, SAFFL, ITTFL),
by = "USUBJID"
) |>
mutate(
PARAMCD = LBTESTCD,
PARAM = case_when(
LBTESTCD == "HBA1C" ~ "HbA1c (%)",
LBTESTCD == "GLUC" ~ "Glucose (mmol/L)",
TRUE ~ LBTEST
),
AVAL = LBSTRESN,
ADT = LBDTC
)
# Step 2: Flag baseline records
# Baseline = last non-missing value on or before treatment start date
adlb_step2 <- adlb_step1 |>
group_by(USUBJID, PARAMCD) |>
mutate(
# Candidate baseline: on or before treatment start
is_bl_candidate = ADT <= TRTSDT & !is.na(AVAL),
# Among candidates, flag the last one (latest date)
ABLFL = if_else(
is_bl_candidate & ADT == max(ADT[is_bl_candidate], na.rm = TRUE),
"Y", ""
)
) |>
ungroup()
# Step 3: Derive baseline value and merge back
baseline_vals <- adlb_step2 |>
filter(ABLFL == "Y") |>
select(USUBJID, PARAMCD, BASE = AVAL)
adlb_step3 <- adlb_step2 |>
left_join(baseline_vals, by = c("USUBJID", "PARAMCD")) |>
mutate(
CHG = if_else(!is.na(AVAL) & !is.na(BASE), AVAL - BASE, NA_real_),
PCHG = if_else(!is.na(CHG) & BASE != 0, (CHG / BASE) * 100, NA_real_),
ANL01FL = if_else(ABLFL != "Y" & !is.na(AVAL), "Y", "")
)
# Step 4: Add analysis visit labels
adlb <- adlb_step3 |>
mutate(
AVISIT = case_when(
VISITNUM == 1 ~ "Baseline",
VISITNUM == 2 ~ "Week 13",
VISITNUM == 3 ~ "Week 26",
VISITNUM == 4 ~ "Week 52",
TRUE ~ VISIT
),
AVISITN = VISITNUM
) |>
select(STUDYID, USUBJID, PARAMCD, PARAM,
AVISIT, AVISITN, ADT,
AVAL, BASE, CHG, PCHG,
ABLFL, ANL01FL,
TRT01P, TRT01PN, SAFFL, ITTFL)
# Preview
adlb |>
filter(PARAMCD == "HBA1C") |>
select(USUBJID, AVISIT, AVAL, BASE, CHG, ANL01FL) |>
print(n = 10)5.4 What ADLB looks like
| USUBJID | PARAMCD | AVISIT | AVAL | BASE | CHG | ANL01FL |
|---|---|---|---|---|---|---|
| GLPX-001-001-0001 | HBA1C | Baseline | 8.2 | 8.2 | 0.0 | |
| GLPX-001-001-0001 | HBA1C | Week 13 | 7.6 | 8.2 | -0.6 | Y |
| GLPX-001-001-0001 | HBA1C | Week 26 | 7.1 | 8.2 | -1.1 | Y |
| GLPX-001-001-0001 | HBA1C | Week 52 | 6.8 | 8.2 | -1.4 | Y |
| GLPX-001-001-0002 | HBA1C | Baseline | 9.0 | 9.0 | 0.0 | |
| GLPX-001-001-0002 | HBA1C | Week 13 | 8.3 | 9.0 | -0.7 | Y |
Notice: - BASE is the same value repeated on every row for that subject+parameter - CHG is 0 at baseline (AVAL = BASE) - ANL01FL is empty at baseline β the baseline record itself is not included in the change-from-baseline analysis
6 π§ͺ Exercise 3.2 β Understanding CHG and BASE
7 π§ͺ Exercise 3.3 β Build a Glucose ADLB Subset
8 π§ͺ Exercise 3.4 β Challenge: Plot HbA1c Over Time
9 Validate Your ADaM
Always check your ADaM datasets before moving to TLFs.
# 1. ADSL: one row per subject
stopifnot(nrow(adsl) == n_distinct(adsl$USUBJID))
cat("β
ADSL: one row per subject\n")
# 2. BASE is consistent within subject+parameter
base_check <- adlb |>
group_by(USUBJID, PARAMCD) |>
summarise(n_base_vals = n_distinct(BASE, na.rm = TRUE), .groups = "drop") |>
filter(n_base_vals > 1)
if (nrow(base_check) == 0) {
cat("β
ADLB: BASE is consistent within subject+parameter\n")
} else {
cat("β ADLB: inconsistent BASE values found\n")
print(base_check)
}
# 3. CHG = AVAL - BASE
chg_check <- adlb |>
filter(!is.na(AVAL), !is.na(BASE), !is.na(CHG)) |>
mutate(chg_correct = abs((AVAL - BASE) - CHG) < 0.001) |>
filter(!chg_correct)
if (nrow(chg_check) == 0) {
cat("β
ADLB: CHG = AVAL - BASE for all records\n")
} else {
cat("β ADLB: CHG derivation errors found\n")
print(chg_check)
}
# 4. All ADLB subjects in ADSL
orphans <- adlb |>
anti_join(adsl, by = "USUBJID") |>
distinct(USUBJID)
if (nrow(orphans) == 0) {
cat("β
ADLB: all subjects found in ADSL\n")
} else {
cat("β ADLB: subjects in ADLB not in ADSL\n")
print(orphans)
}10 Save Your ADaM Datasets
library(haven)
write_xpt(adsl, "data/adam/adsl.xpt", version = 5, name = "ADSL")
write_xpt(adlb, "data/adam/adlb.xpt", version = 5, name = "ADLB")
cat("ADaM datasets saved to data/adam/\n")/* SAS */
libname xptout xport "/path/to/adam/adsl.xpt";
proc copy in=adam out=xptout; select adsl; run;
libname xptout xport "/path/to/adam/adlb.xpt";
proc copy in=adam out=xptout; select adlb; run;
11 Module 3 Summary
12 Your Tasks Before Module 4
Answer: If BASE is missing (subject had no pre-treatment lab result), then CHG = missing for all records. How to handle this is defined in the SAP β common options include using the screening value as a fallback baseline, or excluding the subject from the change-from-baseline analysis.
13 Whatβs Next
In Module 4 we take our ADSL and ADLB and produce the actual Tables, Listings, and Figures β the outputs that go into the Clinical Study Report. We will build:
- Table 14.1.1: Demographics and Baseline Characteristics
- Table 14.2.1: Primary Efficacy β Change from Baseline in HbA1c
- Figure 14.2.1: Mean HbA1c Over Time (the plot from Exercise 3.4)
This course is open source and free forever. Found an error or want to contribute? Open an issue or pull request on GitHub.