Example of plots and tables to create for data checking.
For a dataset that is much larger that the example one used here, the
default figure size may be too small for all plots to be readable. In
that case, add or edit the fig.height
and
fig.width
in the top of each chunk. For example:
{r, fig.height = 6, fig.width = 8}
Data specifications can be accessed on Datasets and Rmarkdown template to generate this page can be found on Rmarkdown-Template. You may also download the Multiple Ascending Dose PK/PD dataset for your reference (download dataset).
library(tidyverse)
library(DT)
library(xgxr)
#set chunk default options
xgx_theme_set()
knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.height = 4, fig.width = 4)
options(DT.options = list(pageLength = 20)) #pageLength = number of entries per page
#specify the time units and your covariates
covariates = c("WEIGHT0","AGE0","SEX","WEIGHT0_CAT")
time_units_dataset = "days"
filename = "../Data/Data_Checking.csv"
data_in = read.csv(filename, stringsAsFactors = FALSE)
data = data_in %>%
# in this mutate command are all columns that are required for this check
# if your dataset has different naming convention,
# then rename the right hand side of the equality
mutate(USUBJID = USUBJID, # Unique subject identifier
TIME = TIME, # Actual Time
NOMTIME = NOMTIME, # Nominal Time
AMT = AMT, # Dose Amount
LIDV = LIDV, # Dependent variable
YTYPE = YTYPE, # Number for type of dependent variable
NAME = NAME, # Name for the type of dependent variable (e.g. PK)
MDV = MDV, # Missing Dependent Varibale (0 = no, 1 = yes)
CENS = CENS, # Censored Data (0 = no, 1 = yes)
EVID = EVID, # Event ID. 0 = Dependent Variabale, 1 = dose
TRT = TRT, # Treatment Arm character description
TRTN = TRTN) %>%# Treatment Arm numeric description (used for sorting)
arrange(TRTN) %>%
mutate(TRT_low2high = factor(TRT, levels = unique(TRT)),
TRT_high2low = factor(TRT, levels = rev(unique(TRT))))
data1 = data %>%
filter(!duplicated(USUBJID))
data = data %>%
mutate(USUBJID = paste0("PAT",USUBJID),
WEIGHT0_CAT = case_when(WEIGHT0 < 60 ~ "WT<60", #creating 2nd categorical var
WEIGHT0 > 80 ~ "WT>80",
TRUE ~ "WT:60-80"))
# a patient with no PK or PD data is added to the dataset to illustrate how the
# data checking will report when such data is present. This code block should be
# removed if you'll be using it with your dataset.
empty_row = data[1,] %>%
mutate(ID = 23,
USUBJID = paste0("PAT",ID),
LIDV = 0,
YTYPE = 0,
ADM = 1,
CMT = 0,
NAME = "Dose",
EVENTU = "mg",
UNIT = "mg",
MDV = 1,
CENS = 0,
EVID = 1)
data = bind_rows(data, empty_row)
data1 = data %>%
filter(!duplicated(USUBJID))
Check to see the number of patients in the dataset is what you expect.
total_patients = data %>%
filter(!duplicated(USUBJID)) %>%
tally() %>%
rename(n_patients = n) %>%
mutate(TRT_low2high = "TOTAL NUMBER")
summary_by_trt = data %>%
filter(!duplicated(USUBJID)) %>%
group_by(TRT_low2high) %>%
tally() %>%
rename(n_patients = n) %>%
bind_rows(total_patients)
datatable(summary_by_trt)
This is another quick way to see if there is way more or way less data than you expected.
summary_by_trt_ytype = data %>%
group_by(YTYPE, NAME, TRT_low2high) %>%
tally() %>%
rename(n_data_points = n) %>%
mutate(YTYPE_NAME = paste0(YTYPE, ":", NAME)) %>%
ungroup() %>%
select(-YTYPE, -NAME) %>%
spread(YTYPE_NAME, n_data_points)
datatable(summary_by_trt_ytype)
Summary of the dosing information. It can be useful to know if there are patients in the database that never received a dose of the drug. Also, usually AMT usually should not equal 0, and so
dose = data %>%
filter(EVID == 1)
dose_summ = data %>%
group_by(USUBJID) %>%
summarize(total_dose = sum(AMT))
dose_summary = data.frame(
patients_that_never_received_drug = sum(dose_summ$total_dose == 0),
entries_where_AMT_equals_0 = sum(dose$AMT == 0),
entries_where_AMT_greater_than_0 = sum(dose$AMT > 0)) %>%
t() %>%
as.data.frame() %>%
rename(N = V1)
datatable(dose_summary)
# compute number of DV by YTYPE and by patient
# with this method, NAs also mean 0
# here we do not remove EVID == 0 records,
# so that patients with a dose but no DV values will be included in the count
dv_number_by_patient = data %>%
group_by(YTYPE, USUBJID) %>%
summarize(n_obs = sum(MDV == 0 & !is.na(LIDV))) %>%
ungroup() %>%
spread(YTYPE, n_obs)
# count total number of patients missing any DV of each type
dv_number_summary = dv_number_by_patient %>%
select(-USUBJID) %>%
summarize_all(c(Nnone = function(x) sum(x == 0 | is.na(x)),
Nmin = function(x) min(x, na.rm = TRUE),
Nmedian = function(x) round(median(x, na.rm = TRUE)),
Nmax = function(x) max(x, na.rm = TRUE))) %>%
t() %>%
as.data.frame() %>%
rename(N = V1)
dv_number_summary$YTYPE_SUMM = row.names(dv_number_summary)
dv_number_summary = dv_number_summary %>%
mutate(YTYPE = as.numeric(str_extract(YTYPE_SUMM, "^\\d+")),
SUMM = str_extract(YTYPE_SUMM, "[A-Za-z]+$")) %>%
select(-YTYPE_SUMM) %>%
spread(SUMM, N)
# create an overall summary table
dv_summary_overall = data %>%
filter(EVID == 0) %>%
group_by(YTYPE, NAME) %>%
summarize(n_total = n(),
n_missing_or_NA = sum(is.na(LIDV) | MDV == 1),
n_zeroes = sum(LIDV == 0),
n_censored = sum(CENS == 1),
n_negative = sum(LIDV < 0),
n_duplicate_times = sum(duplicated(paste(USUBJID,TIME))),
min = min (LIDV, na.rm = TRUE),
Q1 = quantile(LIDV, 0.25, na.rm = TRUE),
median = median (LIDV, na.rm = TRUE),
Q3 = quantile(LIDV, 0.75, na.rm = TRUE),
max = max (LIDV, na.rm = TRUE)) %>%
left_join(dv_number_summary, by = "YTYPE") %>%
mutate(YTYPE_NAME = paste0(YTYPE,":",NAME)) %>%
ungroup() %>%
select(-YTYPE, -NAME) %>%
t() %>%
as.data.frame()
names(dv_summary_overall) = as.matrix(dv_summary_overall["YTYPE_NAME",])
dv_summary_overall$Value = row.names(dv_summary_overall)
row.names(dv_summary_overall) = c()
dv_summary_overall = dv_summary_overall %>%
filter(Value != "YTYPE_NAME") %>%
mutate(Type = case_when(str_detect(Value, "^n_") ~ "Number of data points",
str_detect(Value, "^N") ~ "Number of data points per patient",
TRUE ~ "Value of data points")) %>%
mutate(Value = str_replace(Value, "^n_", ""),
Value = str_replace(Value, "^N", "")) %>%
select(Type, Value, everything()) %>%
arrange(Type)
datatable(dv_summary_overall)
Provide overview of the covariates. Check to see in particular that the distributions (along the diagonals) make sense.
num_unique_vals = data1[,covariates] %>%
summarise_all(function(x){length(unique(x))}) %>%
as.numeric()
cts_cov = data1[,covariates[num_unique_vals >= 8]]
cat_cov = data1[,covariates[num_unique_vals < 8]]
cts_cov_summary = cts_cov %>%
gather() %>%
group_by(key) %>%
summarise(n_missing_or_NA = sum(is.na(value)),
min = min (value, na.rm = TRUE),
Q1 = quantile(value, 0.25, na.rm = TRUE),
median = median (value, na.rm = TRUE),
Q3 = quantile(value, 0.75, na.rm = TRUE),
max = max (value, na.rm = TRUE))
datatable(cts_cov_summary)
cat_summary = function(x) {
unique_x = sort(unique(x))
str = 1:length(unique_x)
for(i in 1:length(unique_x))
{
str[i] = paste0(unique_x[i], "=", sum(x == unique_x[i], na.rm = TRUE))
}
output = paste(str, collapse = ", ")
}
cat_cov_summary = cat_cov %>%
gather() %>%
group_by(key) %>%
summarise(n_missing_or_NA = sum(is.na(value)),
n_distinct = n_distinct(value),
count_summary = cat_summary(value))
datatable(cat_cov_summary)
GGally::ggpairs(data1,
columns = covariates,
diag = list(continuous = "barDiag"))
Often, a dataset is not supposed to contain any NAs. This section highlights where the NAs occur. If the table below is empty, then there are no NAs in the dataset.
na_summary = data %>%
dplyr::summarise_all(function(x) {sum(is.na(x))}) %>%
t() %>%
as.data.frame() %>%
rename(N_NA = V1)
na_summary$Column = names(data)
na_summary = na_summary %>%
select(Column, N_NA) %>%
filter(N_NA > 0)
datatable(na_summary)
Get an overview of the timing of the dose, PK, and PD data collection
for all patients and make sure it’s in line with expectations. Each “x”
denotes a record, i.e. a dose time or a PK or PD assessment time. Each
red circle denotes what may be a dose interruption, when the time
between doses is greater than DT_flag
and
NAME == "Dose"
You might need to adjust the figure height and width in the line below, depending on how many different YTYPEs and how many patients your dataset has.
DT_flag = 10 # >10 days between doses is flagged
NAME_DOSE = "Dose" # NAME == NAME_DOSE is flagged
data_dose_interruption = data %>%
group_by(USUBJID, NAME) %>%
mutate(DT = lead(TIME, default = max(TIME)) - TIME,
TMID = TIME + DT/2) %>%
ungroup() %>%
filter(DT > DT_flag, NAME == NAME_DOSE)
g = ggplot(data, aes(x = TIME, y = USUBJID))
g = g + geom_point(shape = 4)
g = g + geom_point(data = data_dose_interruption,
aes(x = TMID, y = USUBJID),
color = "red", alpha = 0.2, size = 5)
g = g + xgx_scale_x_time_units(units_dataset = time_units_dataset)
g = g + facet_wrap(~YTYPE+NAME, labeller = label_both)
print(g)
A common error is that either the nominal or actual time were derived incorrectly. Thus a quick sanity check is to plot nominal vs actual time. Any points that don’t lie near the identity line require further investigation.
xymin = min(c(data$NOMTIME, data$TIME), na.rm = TRUE)
xymax = max(c(data$NOMTIME, data$TIME), na.rm = TRUE)
g = ggplot(data = data, aes(x = TIME, y = NOMTIME))
g = g + geom_point()
g = g + annotate("segment", x = xymin, xend = xymax, y = xymin, yend = xymax, color = "blue")
g = g + xlim(c(xymin, xymax))
g = g + ylim(c(xymin, xymax))
print(g)
difftime = data %>%
mutate(DIFF_TIME = abs(NOMTIME - TIME)) %>%
arrange(-DIFF_TIME) %>%
slice(1:10)
difftime %>%
select(USUBJID, TRT, DIFF_TIME, TIME, NOMTIME, YTYPE, NAME, LIDV) %>%
DT::datatable(difftime)
Look at the nominal times in the dataset, just to make sure they’re correct. In this case, there is a mistake because nominal time has units of days and the 5m, 1h, 2h, 4h, and 8h nominal time values were all set to “1” rather than having unique values (i.e. 1 hour should be 1/24 = 0.041 rather than 1).
cat(sort(unique(data$NOMTIME)))
## 1 2 4 8 15 22 29 30 32 36 43 50 57 64 71 78 85 92 99 106 113 120 127 134 141 148 155 162 169 176 183 190 197 204 568
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.9 (Maipo)
##
## Matrix products: default
## BLAS/LAPACK: /CHBS/apps/EB/software/imkl/2019.1.144-gompi-2019a/compilers_and_libraries_2019.1.144/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
## [6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.26 forcats_0.5.2 stringr_1.4.1 purrr_0.3.5 readr_2.1.3 tibble_3.1.8 tidyverse_1.3.2 xgxr_1.1.1 zoo_1.8-11
## [10] gridExtra_2.3 tidyr_1.2.1 dplyr_1.0.10 ggplot2_3.3.6
##
## loaded via a namespace (and not attached):
## [1] googledrive_2.0.0 colorspace_2.0-3 deldir_1.0-6 ellipsis_0.3.2 class_7.3-19 htmlTable_2.2.1 base64enc_0.1-3
## [8] fs_1.5.2 gld_2.6.2 rstudioapi_0.14 proxy_0.4-26 farver_2.1.1 Deriv_4.1.3 fansi_1.0.3
## [15] mvtnorm_1.1-3 lubridate_1.8.0 xml2_1.3.3 codetools_0.2-18 splines_4.1.0 cachem_1.0.6 rootSolve_1.8.2.2
## [22] knitr_1.40 Formula_1.2-4 jsonlite_1.8.3 broom_1.0.1 binom_1.1-1 cluster_2.1.3 dbplyr_2.2.1
## [29] png_0.1-7 compiler_4.1.0 httr_1.4.4 backports_1.4.1 assertthat_0.2.1 Matrix_1.5-1 fastmap_1.1.0
## [36] gargle_1.2.1 cli_3.4.1 prettyunits_1.1.1 htmltools_0.5.3 tools_4.1.0 gtable_0.3.1 glue_1.6.2
## [43] lmom_2.8 Rcpp_1.0.9 cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.5.0 nlme_3.1-160 crosstalk_1.2.0
## [50] xfun_0.34 rvest_1.0.3 lifecycle_1.0.3 googlesheets4_1.0.1 MASS_7.3-58.1 scales_1.2.1 hms_1.1.2
## [57] expm_0.999-6 RColorBrewer_1.1-3 yaml_2.3.6 Exact_2.1 pander_0.6.4 sass_0.4.2 rpart_4.1.16
## [64] reshape_0.8.8 latticeExtra_0.6-30 stringi_1.7.8 highr_0.9 e1071_1.7-8 checkmate_2.1.0 boot_1.3-28
## [71] rlang_1.0.6 pkgconfig_2.0.3 bitops_1.0-7 evaluate_0.17 lattice_0.20-45 htmlwidgets_1.5.4 labeling_0.4.2
## [78] tidyselect_1.2.0 GGally_2.1.2 plyr_1.8.7 magrittr_2.0.3 R6_2.5.1 DescTools_0.99.42 generics_0.1.3
## [85] Hmisc_4.7-0 DBI_1.1.3 pillar_1.8.1 haven_2.5.1 foreign_0.8-82 withr_2.5.0 mgcv_1.8-41
## [92] survival_3.4-0 RCurl_1.98-1.4 nnet_7.3-17 crayon_1.5.2 modelr_0.1.9 interp_1.1-2 utf8_1.2.2
## [99] tzdb_0.3.0 rmarkdown_2.17 progress_1.2.2 jpeg_0.1-9 grid_4.1.0 readxl_1.4.1 minpack.lm_1.2-1
## [106] data.table_1.14.2 reprex_2.0.2 digest_0.6.30 munsell_0.5.0 bslib_0.4.0