3 Extract Demographic

1 Read Preprocessed File

We read output data from the previous section.

2 Check for integer vector

We have a function that checks if the numeric vector has integers.

Code

non_integer_data <- data.frame(
  non_integer_col = c(-1, 0, NA, 2.0000,
                      3.010, pi, exp(1)
                     )
)

non_integer_data |>
  pointblank::col_vals_expr(
    expr = ~ harmonisation::is_integer_vector(
      input_vector = non_integer_data[["non_integer_col"]],
      allow_na = TRUE)
  )

Error: The `col_vals_expr()` validation failed beyond the absolute threshold level (1).
* failure level (3) >= failure threshold (1)

3 Demographics and Behavioral parameters

3.1 Age and Sex

age_years will be mapped from the column age. age value of 0 is set as missing.

sex is grouped as follows:

sex before	sex
F	0
M	1

Code

age_gender_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", 
                  "age", 
                  "sex")) |>
  pointblank::col_vals_expr(
    expr = ~ harmonisation::is_integer_vector(
      cohort_A_data[["age"]],
      allow_na = TRUE)
  ) |>
  dplyr::mutate(
    # Convert age to type integer
    age_years = as.integer(.data[["age"]]),
    # Convert age of 0 to NA
    age_years = dplyr::case_when(
      .data[["age_years"]] == 0 ~ NA_integer_,
      .default = .data[["age_years"]]
    ),
    sex_before = .data[["sex"]],
    # Convert categorical columns to factors
    sex = dplyr::case_when(
      .data[["sex_before"]] == "F" ~ "0",
      .data[["sex_before"]] == "M" ~ "1",
      .default = as.character(.data[["sex_before"]])
    ),
    `sex_before` = forcats::fct_relevel(
      .data[["sex_before"]],
      c("F", "M")
    ),
    sex = forcats::fct_relevel(
      .data[["sex"]],
      c("0", "1")),
  ) |>
  dplyr::relocate(
    "sex",
    .after = "sex_before"
  ) |> 
  pointblank::col_vals_in_set(
    columns = "sex",
    set = c("0", "1")
  ) |> 
  pointblank::col_vals_between(
    columns = "age_years",
    left = 0,
    right = 100,
    inclusive = c(FALSE, TRUE),
    na_pass = TRUE
  )

Code

if (params$show_table && knitr::is_html_output()) {
  age_gender_data |> 
    harmonisation::reactable_with_download_csv_button()
}

Remove unnecessary columns so that we can merge with the other fields.

Code

age_gender_data <- age_gender_data |>
  dplyr::select(-c("age", "sex_before"))

3.2 Height, Weight, BMI and BSA

height_cm will be mapped from the column height. weight_kg will be mapped from the column weight.

bsa_m2 in m^2 will be calculated as sqrt([Height(cm) x Weight(kg)]/3600) bmi will be calculated as Weight(kg)/((Height(m))^2)

All values are then converted to two decimal places.

To date, only patient A010 has a bmi greater than 50.

Code

body_measurement_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", 
                  "weight", "height")) |>
  dplyr::mutate(
    height_cm = .data[["height"]],
    weight_kg = .data[["weight"]],
    bsa_m2 = sqrt((.data[["height_cm"]] * .data[["weight_kg"]]) / 3600),
    bsa_m2 = harmonisation::round_to_nearest_digit(.data[["bsa_m2"]], digits = 2),
    bmi = .data[["weight_kg"]] / ((.data[["height_cm"]] / 100)^2),
    bmi = harmonisation::round_to_nearest_digit(.data[["bmi"]], digits = 2),
    height_cm = harmonisation::round_to_nearest_digit(.data[["height_cm"]], digits = 2),
    weight_kg = harmonisation::round_to_nearest_digit(.data[["weight_kg"]], digits = 2)
  ) |> 
  pointblank::col_vals_gt(
    columns = "bmi",
    preconditions = ~ . %>%
      dplyr::filter(
        .data[["cohort_unique_id"]] %in% c("A010")
      ),
    value = 50,
    na_pass = TRUE    
  ) |> 
  pointblank::col_vals_between(
    columns = "bmi",
    preconditions = ~ . %>%
      dplyr::filter(
        !.data[["cohort_unique_id"]] %in% c("A010")
      ),
    left = 10,
    right = 50,
    inclusive = c(TRUE, TRUE),
    na_pass = TRUE
  )

Code

if (params$show_table && knitr::is_html_output()) {
  body_measurement_data |>
    harmonisation::reactable_with_download_csv_button()
}

Remove unnecessary columns so that we can merge with the other fields.

Code

body_measurement_data <- body_measurement_data |>
  dplyr::select(-c("height", "weight"))

3.3 Smoking History

smoke_current will be mapped from the column smoke_current_good. smoke_past will be mapped from the column smoke_past_good.

We do a check to ensure that we can only have these scenarios

smoke_current as 1 and smoke_past as 0 for current smokers
smoke_current as 0 and smoke_past as 1 for past smokers
smoke_current as 0 and smoke_past as 0 for non-smokers
smoke_current as -1 and smoke_past as -1 for unknown

Code

smoking_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", 
                  "smoke_current_good", "smoke_past_good")) |>
  dplyr::mutate(
    smoke_current = as.character(.data[["smoke_current_good"]]),
    smoke_current_good = forcats::fct_relevel(
      as.character(.data[["smoke_current_good"]]),
      c("0", "1")),    
    smoke_current = forcats::fct_relevel(
      .data[["smoke_current"]],
      c("0", "1")),
    smoke_past = as.character(.data[["smoke_past_good"]]),
    smoke_past_good = forcats::fct_relevel(
      as.character(.data[["smoke_past_good"]]),
      c("0", "1")),     
    smoke_past = forcats::fct_relevel(
      .data[["smoke_past"]],
      c("0", "1")),
  ) |>
  pointblank::col_vals_in_set(
    columns = c("smoke_current", "smoke_past"),
    set = c("0", "1", "-1")
  ) |> 
  pointblank::col_vals_expr(
    expr = pointblank::expr(
      (.data[["smoke_current"]] == "1" & .data[["smoke_past"]] == "0") |
      (.data[["smoke_current"]] == "-1" & .data[["smoke_past"]] == -"1") |
      (.data[["smoke_current"]] == "0" & .data[["smoke_past"]] %in% c("0", "1"))
    )
  )

Code

if (params$show_table && knitr::is_html_output()) {
  smoking_data |>
    harmonisation::reactable_with_download_csv_button()
}

Remove unnecessary columns so that we can merge with the other fields.

Code

smoking_data <- smoking_data |>
  dplyr::select(-c("smoke_current_good", "smoke_past_good"))

Here is a case when the validation has failed.

Code

smoking_data_bad <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", 
                  "smoke_current_bad", "smoke_past_bad")) |>
  dplyr::filter(
      .data[["cohort_unique_id"]] %in% c("A010", "A016")
  ) |> 
  dplyr::mutate(
    smoke_current = as.character(.data[["smoke_current_bad"]]),
    smoke_past = as.character(.data[["smoke_past_bad"]]),
  )

smoking_data_bad |> 
  pointblank::col_vals_in_set(
    columns = c("smoke_current", "smoke_past"),
    set = c("0", "1")
  ) |> 
  pointblank::col_vals_expr(
    expr = pointblank::expr(
      (.data[["smoke_current"]] == "1" & .data[["smoke_past"]] == "0") |
      (.data[["smoke_current"]] == "-1" & .data[["smoke_past"]] == "-1") |
      (.data[["smoke_current"]] == "0" & .data[["smoke_past"]] %in% c("0", "1"))
    )
  )

Error: The `col_vals_expr()` validation failed beyond the absolute threshold level (1).
* failure level (2) >= failure threshold (1)

Code

if (params$show_table && knitr::is_html_output()) {
  smoking_data_bad |> 
    harmonisation::reactable_with_download_csv_button()
}

3.4 Chest Pain

3.4.1 Shortness of Breath

have_sob values remained unchanged.

Code

shortness_of_breath_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", "have_sob")) |>
  dplyr::mutate(
    have_sob = forcats::fct_relevel(
      as.character(.data[["have_sob"]]),
      c("0", "1"))
  ) |>
  pointblank::col_vals_in_set(
    columns = c("have_sob"),
    set = c("0", "1", "-1")
  )

Code

if (params$show_table && knitr::is_html_output()) {
  shortness_of_breath_data |> 
    harmonisation::reactable_with_download_csv_button()
}

3.4.2 Have chest pain or not

have_chest_pain is grouped as follows:

chest_pain_type	have_chest_pain
0	0
1, 2 or 3	1

Code

have_chest_pain_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", "chest_pain_type")) |>
  dplyr::mutate(
    have_chest_pain = dplyr::case_when(
      .data[["chest_pain_type"]] %in% c(0) ~ "0",
      .data[["chest_pain_type"]] %in% c(1, 2, 3) ~ "1",
      .default = NA_character_
    ),
    have_chest_pain = forcats::fct_relevel(
      .data[["have_chest_pain"]],
      c("0", "1")
    ),
    chest_pain_type = forcats::fct_relevel(
      as.character(.data[["chest_pain_type"]]),
      c("0", "1", "2", "3", "4", "-1")
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("have_chest_pain"),
    set = c("0", "1")
  )

Code

if (params$show_table && knitr::is_html_output()) {
  have_chest_pain_data |> 
    harmonisation::reactable_with_download_csv_button()
}

Remove unnecessary columns so that we can merge with the other fields.

Code

have_chest_pain_data <- have_chest_pain_data |>
  dplyr::select(-c("chest_pain_type"))

3.4.3 Symptomatic or Asymptomatic

symptoms is grouped as follows:

have_sob	have_chest_pain	symptoms
-1	-1	-1
0	0	0
0 or 1	1	1
1	0	2

Code

symptoms_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id")) |>
  dplyr::inner_join(shortness_of_breath_data, 
                   by = dplyr::join_by("cohort_unique_id"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::inner_join(have_chest_pain_data, 
                   by = dplyr::join_by("cohort_unique_id"),
                   unmatched = "error",
                   relationship = "one-to-one") |>                   
  dplyr::mutate(
    symptoms = dplyr::case_when(
      (.data[["have_chest_pain"]] == "-1" &
       .data[["have_sob"]] == "-1"
      ) ~ "-1",
      (.data[["have_chest_pain"]] == "0" &
       .data[["have_sob"]] == "0" 
      ) ~ "0",        
      (.data[["have_chest_pain"]] == "1" &
       .data[["have_sob"]] %in% c("0", "1")
      ) ~ "1", 
      (.data[["have_chest_pain"]] == "0" &
       .data[["have_sob"]] == "1" 
      ) ~ "2",  
      .default = NA_character_
    ),
    symptoms = forcats::fct_relevel(
      .data[["symptoms"]],
      c("0", "1", "2"))
  ) |> 
  pointblank::col_vals_in_set(
    columns = c("symptoms"),
    set = c("0", "1", "2")
  )

Code

if (params$show_table && knitr::is_html_output()) {
  symptoms_data |> 
    harmonisation::reactable_with_download_csv_button()
}

Remove unnecessary columns so that we can merge with the other fields.

Code

symptoms_data <- symptoms_data |>
  dplyr::select(-c("have_chest_pain", "have_sob"))

3.4.4 Chest Pain Type

chest_pain_type is grouped as follows:

have_sob	chest_pain_type before	chest_pain_type
-1	-1	-1
0	0	0
0 or 1	1	1
0 or 1	2	2
0 or 1	3	2
1	0	4

Code

chest_pain_type_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id", "chest_pain_type")) |>
  dplyr::inner_join(shortness_of_breath_data, 
                   by = dplyr::join_by("cohort_unique_id"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::mutate(
    chest_pain_type_before = .data[["chest_pain_type"]],
    chest_pain_type = dplyr::case_when(
      (.data[["chest_pain_type_before"]] == "-1" &
       .data[["have_sob"]] == "-1"
      ) ~ "-1",
      (.data[["chest_pain_type_before"]] == "0" &
       .data[["have_sob"]] == "0" 
      ) ~ "0",        
      (.data[["chest_pain_type_before"]] == "1" &
       .data[["have_sob"]] %in% c("0", "1")
      ) ~ "1",
      (.data[["chest_pain_type_before"]] == "2" &
       .data[["have_sob"]] %in% c("0", "1")
      ) ~ "2", 
      (.data[["chest_pain_type_before"]] == "3" &
       .data[["have_sob"]] %in% c("0", "1")
      ) ~ "3", 
      (.data[["chest_pain_type_before"]] == "0" &
       .data[["have_sob"]] == "1" 
      ) ~ "4",  
      .default = NA_character_
    ),
    chest_pain_type_before = forcats::fct_relevel(
      as.character(.data[["chest_pain_type_before"]]),
      c("0", "1", "2", "3")),
    chest_pain_type = forcats::fct_relevel(
      .data[["chest_pain_type"]],
      c("0", "1", "2", "3"))
  ) |> 
  dplyr::relocate(
    "chest_pain_type_before",
    .before = "chest_pain_type"
  ) |> 
  pointblank::col_vals_in_set(
    columns = c("chest_pain_type"),
    set = c("0", "1", "2", "3", "4")
  )

Code

if (params$show_table && knitr::is_html_output()) {
  chest_pain_type_data |> 
    harmonisation::reactable_with_download_csv_button()
}

Remove unnecessary columns so that we can merge with the other fields.

Code

chest_pain_type_data <- chest_pain_type_data |>
  dplyr::select(-c("have_sob", "chest_pain_type_before"))

3.4.5 Combined chest pain related tables

We combine all chest related tables together

Code

join_specification <- dplyr::join_by("cohort_unique_id")

chest_pain_data <- cohort_A_data |> 
  dplyr::select(c("cohort_unique_id")) |> 
  dplyr::inner_join(have_chest_pain_data, 
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::inner_join(chest_pain_type_data, 
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::inner_join(shortness_of_breath_data, 
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::inner_join(symptoms_data,
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one")

testthat::expect_true(
  pointblank::has_columns(
    chest_pain_data,
    columns = c("have_sob", "have_chest_pain", "symptoms", "chest_pain_type")
  )
) 

testthat::expect_equal(
  ncol(chest_pain_data), 5
)

3.5 Combine Demographics

We combine all the data to give the demo_behave_data.

Code

join_specification <- dplyr::join_by("cohort_unique_id")

demo_behave_data <- cohort_A_data |>
  dplyr::select(c("cohort_unique_id")) |>
  dplyr::inner_join(age_gender_data,
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |>
  dplyr::inner_join(body_measurement_data,
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |>
  dplyr::inner_join(smoking_data,
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |>
  dplyr::inner_join(chest_pain_data,
                   by = join_specification,
                   unmatched = "error",
                   relationship = "one-to-one") |>
  dplyr::relocate(c("bsa_m2", "bmi"),
                  .after = "sex")


testthat::expect_true(
  pointblank::has_columns(
    demo_behave_data,
    columns = c(
      "age_years", "sex",
      "height_cm", "weight_kg", "bsa_m2", "bmi",
      "smoke_current", "smoke_past",
      "have_sob", "have_chest_pain",
      "symptoms", "chest_pain_type"
    )
  )
) 

testthat::expect_equal(
  ncol(demo_behave_data), 13
)

4 Write Preprocessed File

We output data to be used for the next session.

Code

demo_behave_data |>
  nanoparquet::write_parquet(
    file = here::here(
      params$analysis_folder,
      params$harmonisation_folder,
      params$preprocessing_folder,
      "02_demographic_data.parquet"
  )
)