Read Preprocessed File
We read output data from the previous section.
Check for integer vector
We have a function that checks if the numeric vector has integers.
Code
non_integer_data <- data.frame(
non_integer_col = c(-1, 0, NA, 2.0000,
3.010, pi, exp(1)
)
)
non_integer_data |>
pointblank::col_vals_expr(
expr = ~ harmonisation::is_integer_vector(
input_vector = non_integer_data[["non_integer_col"]],
allow_na = TRUE)
)
Error: The `col_vals_expr()` validation failed beyond the absolute threshold level (1).
* failure level (3) >= failure threshold (1)
Demographics and Behavioral parameters
Age and Sex
age_years
will be mapped from the column age
. age
value of 0 is set as missing.
sex
is grouped as follows:
Code
age_gender_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id",
"age",
"sex")) |>
pointblank::col_vals_expr(
expr = ~ harmonisation::is_integer_vector(
cohort_A_data[["age"]],
allow_na = TRUE)
) |>
dplyr::mutate(
# Convert age to type integer
age_years = as.integer(.data[["age"]]),
# Convert age of 0 to NA
age_years = dplyr::case_when(
.data[["age_years"]] == 0 ~ NA_integer_,
.default = .data[["age_years"]]
),
sex_before = .data[["sex"]],
# Convert categorical columns to factors
sex = dplyr::case_when(
.data[["sex_before"]] == "F" ~ "0",
.data[["sex_before"]] == "M" ~ "1",
.default = as.character(.data[["sex_before"]])
),
`sex_before` = forcats::fct_relevel(
.data[["sex_before"]],
c("F", "M")
),
sex = forcats::fct_relevel(
.data[["sex"]],
c("0", "1")),
) |>
dplyr::relocate(
"sex",
.after = "sex_before"
) |>
pointblank::col_vals_in_set(
columns = "sex",
set = c("0", "1")
) |>
pointblank::col_vals_between(
columns = "age_years",
left = 0,
right = 100,
inclusive = c(FALSE, TRUE),
na_pass = TRUE
)
Code
if (params$show_table && knitr::is_html_output()) {
age_gender_data |>
harmonisation::reactable_with_download_csv_button()
}
Remove unnecessary columns so that we can merge with the other fields.
Code
age_gender_data <- age_gender_data |>
dplyr::select(-c("age", "sex_before"))
Height, Weight, BMI and BSA
height_cm
will be mapped from the column height
. weight_kg
will be mapped from the column weight
.
bsa_m2
in m^2 will be calculated as sqrt([Height(cm) x Weight(kg)]/3600) bmi
will be calculated as Weight(kg)/((Height(m))^2)
All values are then converted to two decimal places.
To date, only patient A010
has a bmi greater than 50.
Code
body_measurement_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id",
"weight", "height")) |>
dplyr::mutate(
height_cm = .data[["height"]],
weight_kg = .data[["weight"]],
bsa_m2 = sqrt((.data[["height_cm"]] * .data[["weight_kg"]]) / 3600),
bsa_m2 = harmonisation::round_to_nearest_digit(.data[["bsa_m2"]], digits = 2),
bmi = .data[["weight_kg"]] / ((.data[["height_cm"]] / 100)^2),
bmi = harmonisation::round_to_nearest_digit(.data[["bmi"]], digits = 2),
height_cm = harmonisation::round_to_nearest_digit(.data[["height_cm"]], digits = 2),
weight_kg = harmonisation::round_to_nearest_digit(.data[["weight_kg"]], digits = 2)
) |>
pointblank::col_vals_gt(
columns = "bmi",
preconditions = ~ . %>%
dplyr::filter(
.data[["cohort_unique_id"]] %in% c("A010")
),
value = 50,
na_pass = TRUE
) |>
pointblank::col_vals_between(
columns = "bmi",
preconditions = ~ . %>%
dplyr::filter(
!.data[["cohort_unique_id"]] %in% c("A010")
),
left = 10,
right = 50,
inclusive = c(TRUE, TRUE),
na_pass = TRUE
)
Code
if (params$show_table && knitr::is_html_output()) {
body_measurement_data |>
harmonisation::reactable_with_download_csv_button()
}
Remove unnecessary columns so that we can merge with the other fields.
Code
body_measurement_data <- body_measurement_data |>
dplyr::select(-c("height", "weight"))
Smoking History
smoke_current
will be mapped from the column smoke_current_good
. smoke_past
will be mapped from the column smoke_past_good
.
We do a check to ensure that we can only have these scenarios
smoke_current
as 1 and smoke_past
as 0 for current smokers
smoke_current
as 0 and smoke_past
as 1 for past smokers
smoke_current
as 0 and smoke_past
as 0 for non-smokers
smoke_current
as -1 and smoke_past
as -1 for unknown
Code
smoking_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id",
"smoke_current_good", "smoke_past_good")) |>
dplyr::mutate(
smoke_current = as.character(.data[["smoke_current_good"]]),
smoke_current_good = forcats::fct_relevel(
as.character(.data[["smoke_current_good"]]),
c("0", "1")),
smoke_current = forcats::fct_relevel(
.data[["smoke_current"]],
c("0", "1")),
smoke_past = as.character(.data[["smoke_past_good"]]),
smoke_past_good = forcats::fct_relevel(
as.character(.data[["smoke_past_good"]]),
c("0", "1")),
smoke_past = forcats::fct_relevel(
.data[["smoke_past"]],
c("0", "1")),
) |>
pointblank::col_vals_in_set(
columns = c("smoke_current", "smoke_past"),
set = c("0", "1", "-1")
) |>
pointblank::col_vals_expr(
expr = pointblank::expr(
(.data[["smoke_current"]] == "1" & .data[["smoke_past"]] == "0") |
(.data[["smoke_current"]] == "-1" & .data[["smoke_past"]] == -"1") |
(.data[["smoke_current"]] == "0" & .data[["smoke_past"]] %in% c("0", "1"))
)
)
Code
if (params$show_table && knitr::is_html_output()) {
smoking_data |>
harmonisation::reactable_with_download_csv_button()
}
Remove unnecessary columns so that we can merge with the other fields.
Code
smoking_data <- smoking_data |>
dplyr::select(-c("smoke_current_good", "smoke_past_good"))
Here is a case when the validation has failed.
Code
smoking_data_bad <- cohort_A_data |>
dplyr::select(c("cohort_unique_id",
"smoke_current_bad", "smoke_past_bad")) |>
dplyr::filter(
.data[["cohort_unique_id"]] %in% c("A010", "A016")
) |>
dplyr::mutate(
smoke_current = as.character(.data[["smoke_current_bad"]]),
smoke_past = as.character(.data[["smoke_past_bad"]]),
)
smoking_data_bad |>
pointblank::col_vals_in_set(
columns = c("smoke_current", "smoke_past"),
set = c("0", "1")
) |>
pointblank::col_vals_expr(
expr = pointblank::expr(
(.data[["smoke_current"]] == "1" & .data[["smoke_past"]] == "0") |
(.data[["smoke_current"]] == "-1" & .data[["smoke_past"]] == "-1") |
(.data[["smoke_current"]] == "0" & .data[["smoke_past"]] %in% c("0", "1"))
)
)
Error: The `col_vals_expr()` validation failed beyond the absolute threshold level (1).
* failure level (2) >= failure threshold (1)
Code
if (params$show_table && knitr::is_html_output()) {
smoking_data_bad |>
harmonisation::reactable_with_download_csv_button()
}
Chest Pain
Shortness of Breath
have_sob
values remained unchanged.
Code
shortness_of_breath_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id", "have_sob")) |>
dplyr::mutate(
have_sob = forcats::fct_relevel(
as.character(.data[["have_sob"]]),
c("0", "1"))
) |>
pointblank::col_vals_in_set(
columns = c("have_sob"),
set = c("0", "1", "-1")
)
Code
if (params$show_table && knitr::is_html_output()) {
shortness_of_breath_data |>
harmonisation::reactable_with_download_csv_button()
}
Have chest pain or not
have_chest_pain
is grouped as follows:
Code
have_chest_pain_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id", "chest_pain_type")) |>
dplyr::mutate(
have_chest_pain = dplyr::case_when(
.data[["chest_pain_type"]] %in% c(0) ~ "0",
.data[["chest_pain_type"]] %in% c(1, 2, 3) ~ "1",
.default = NA_character_
),
have_chest_pain = forcats::fct_relevel(
.data[["have_chest_pain"]],
c("0", "1")
),
chest_pain_type = forcats::fct_relevel(
as.character(.data[["chest_pain_type"]]),
c("0", "1", "2", "3", "4", "-1")
)
) |>
pointblank::col_vals_in_set(
columns = c("have_chest_pain"),
set = c("0", "1")
)
Code
if (params$show_table && knitr::is_html_output()) {
have_chest_pain_data |>
harmonisation::reactable_with_download_csv_button()
}
Remove unnecessary columns so that we can merge with the other fields.
Code
have_chest_pain_data <- have_chest_pain_data |>
dplyr::select(-c("chest_pain_type"))
Symptomatic or Asymptomatic
symptoms
is grouped as follows:
-1 |
-1 |
-1 |
0 |
0 |
0 |
0 or 1 |
1 |
1 |
1 |
0 |
2 |
Code
symptoms_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id")) |>
dplyr::inner_join(shortness_of_breath_data,
by = dplyr::join_by("cohort_unique_id"),
unmatched = "error",
relationship = "one-to-one") |>
dplyr::inner_join(have_chest_pain_data,
by = dplyr::join_by("cohort_unique_id"),
unmatched = "error",
relationship = "one-to-one") |>
dplyr::mutate(
symptoms = dplyr::case_when(
(.data[["have_chest_pain"]] == "-1" &
.data[["have_sob"]] == "-1"
) ~ "-1",
(.data[["have_chest_pain"]] == "0" &
.data[["have_sob"]] == "0"
) ~ "0",
(.data[["have_chest_pain"]] == "1" &
.data[["have_sob"]] %in% c("0", "1")
) ~ "1",
(.data[["have_chest_pain"]] == "0" &
.data[["have_sob"]] == "1"
) ~ "2",
.default = NA_character_
),
symptoms = forcats::fct_relevel(
.data[["symptoms"]],
c("0", "1", "2"))
) |>
pointblank::col_vals_in_set(
columns = c("symptoms"),
set = c("0", "1", "2")
)
Code
if (params$show_table && knitr::is_html_output()) {
symptoms_data |>
harmonisation::reactable_with_download_csv_button()
}
Remove unnecessary columns so that we can merge with the other fields.
Code
symptoms_data <- symptoms_data |>
dplyr::select(-c("have_chest_pain", "have_sob"))
Chest Pain Type
chest_pain_type
is grouped as follows:
-1 |
-1 |
-1 |
0 |
0 |
0 |
0 or 1 |
1 |
1 |
0 or 1 |
2 |
2 |
0 or 1 |
3 |
2 |
1 |
0 |
4 |
Code
chest_pain_type_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id", "chest_pain_type")) |>
dplyr::inner_join(shortness_of_breath_data,
by = dplyr::join_by("cohort_unique_id"),
unmatched = "error",
relationship = "one-to-one") |>
dplyr::mutate(
chest_pain_type_before = .data[["chest_pain_type"]],
chest_pain_type = dplyr::case_when(
(.data[["chest_pain_type_before"]] == "-1" &
.data[["have_sob"]] == "-1"
) ~ "-1",
(.data[["chest_pain_type_before"]] == "0" &
.data[["have_sob"]] == "0"
) ~ "0",
(.data[["chest_pain_type_before"]] == "1" &
.data[["have_sob"]] %in% c("0", "1")
) ~ "1",
(.data[["chest_pain_type_before"]] == "2" &
.data[["have_sob"]] %in% c("0", "1")
) ~ "2",
(.data[["chest_pain_type_before"]] == "3" &
.data[["have_sob"]] %in% c("0", "1")
) ~ "3",
(.data[["chest_pain_type_before"]] == "0" &
.data[["have_sob"]] == "1"
) ~ "4",
.default = NA_character_
),
chest_pain_type_before = forcats::fct_relevel(
as.character(.data[["chest_pain_type_before"]]),
c("0", "1", "2", "3")),
chest_pain_type = forcats::fct_relevel(
.data[["chest_pain_type"]],
c("0", "1", "2", "3"))
) |>
dplyr::relocate(
"chest_pain_type_before",
.before = "chest_pain_type"
) |>
pointblank::col_vals_in_set(
columns = c("chest_pain_type"),
set = c("0", "1", "2", "3", "4")
)
Code
if (params$show_table && knitr::is_html_output()) {
chest_pain_type_data |>
harmonisation::reactable_with_download_csv_button()
}
Remove unnecessary columns so that we can merge with the other fields.
Code
chest_pain_type_data <- chest_pain_type_data |>
dplyr::select(-c("have_sob", "chest_pain_type_before"))
Combine Demographics
We combine all the data to give the demo_behave_data
.
Code
join_specification <- dplyr::join_by("cohort_unique_id")
demo_behave_data <- cohort_A_data |>
dplyr::select(c("cohort_unique_id")) |>
dplyr::inner_join(age_gender_data,
by = join_specification,
unmatched = "error",
relationship = "one-to-one") |>
dplyr::inner_join(body_measurement_data,
by = join_specification,
unmatched = "error",
relationship = "one-to-one") |>
dplyr::inner_join(smoking_data,
by = join_specification,
unmatched = "error",
relationship = "one-to-one") |>
dplyr::inner_join(chest_pain_data,
by = join_specification,
unmatched = "error",
relationship = "one-to-one") |>
dplyr::relocate(c("bsa_m2", "bmi"),
.after = "sex")
testthat::expect_true(
pointblank::has_columns(
demo_behave_data,
columns = c(
"age_years", "sex",
"height_cm", "weight_kg", "bsa_m2", "bmi",
"smoke_current", "smoke_past",
"have_sob", "have_chest_pain",
"symptoms", "chest_pain_type"
)
)
)
testthat::expect_equal(
ncol(demo_behave_data), 13
)
Write Preprocessed File
We output data to be used for the next session.
Code
demo_behave_data |>
fst::write_fst(
path = here::here(
params$analysis_folder,
params$harmonisation_folder,
params$preprocessing_folder,
"02_demographic_data.fst"),
)