Appendix B — File Processing Routines

B.1 Overview

This report contains a reproducible pipeline for processing and storing data from the study.

B.2 Set the Environment

B.2.1 Load Packages

Code
library(actverse) # github.com/danielvartan/actverse
library(checkmate)
library(clipr)
library(fs)
library(googlesheets4)
library(here)
library(osfr)
library(prettycheck) # github.com/danielvartan/prettycheck
library(readr)
library(rutils) # github.com/danielvartan/rutils
library(sodium)
library(stringr)
library(zip)

B.2.2 Load Custom Functions

Code
source(here::here("R", "anonymize_id.R"))
source(here::here("R", "test_cpf.R"))

B.2.3 Set Data Directories

Code
# Project data directory, resolved relative to the project root by `here`.
data_dir <- here::here("data")
Code
# Create the data directory (including missing parents) when it is absent.
if (!fs::dir_exists(data_dir)) {
  fs::dir_create(path = data_dir, recurse = TRUE)
}

B.2.4 Set Keys

Code
# Authenticate osfr with the token stored in the `OSF_PAT` environment
# variable.
osfr::osf_auth(token = Sys.getenv("OSF_PAT")) # askpass()
Code
# Authorize googlesheets4, using `.secrets` as the token cache location.
googlesheets4::gs4_auth(cache = ".secrets")
Code
# Path to the private key file (`_ssh/id_rsa` under the project root). It is
# passed to `lockr::unlock_dir()` when downloaded files are unlocked.
private_key <- here::here("_ssh", "id_rsa")
# Also exposed as `private_key_path`, so either name resolves downstream.
private_key_path <- private_key
Code
# Path to the public key file (`_ssh/id_rsa.pub` under the project root). It
# is passed to `lockr::lock_dir()` before files are uploaded.
public_key <- here::here("_ssh", "id_rsa.pub")
# Also exposed as `public_key_path`, so either name resolves downstream.
public_key_path <- public_key
Code
# Password read from the `PREGNANCY_PASSWORD` environment variable; it is an
# empty string when the variable is unset.
password <- Sys.getenv(x = "PREGNANCY_PASSWORD", unset = "") # askpass()
Code
# Salt read from the `PREGNANCY_SALT` environment variable; it is an empty
# string when the variable is unset.
# NOTE(review): presumably consumed by `anonymize_id()` -- confirm.
salt <- Sys.getenv(x = "PREGNANCY_SALT", unset = "") # askpass()

B.2.5 Set Initial Variables

Code
# Anonymized participant IDs handled by this run (13 in total).
ids <- c(
  "36566db2", "5024b37e", "2dec1b47", "b5b1d89e", "0324c6f8",
  "aeab920e", "225f2a39", "ca601ee7", "3c2a132a", "a4899cae",
  "3196960f", "f207fd6b", "6f89b17f"
)
Code
# Regular expression matching any string that starts with one of the IDs:
# each ID is anchored with `^` and the alternatives are joined with `|`.
ids_pattern <- paste(paste0("^", ids), collapse = "|")

B.2.6 Set Temp Directories

Code
# Unique path inside the session temp directory. Nothing is created here;
# the directory itself is created later, together with its subfolders.
dir_temp <- tempfile("pregnancy-data-", tmpdir = tempdir(), fileext = "")
Code
# Top-level working folders inside the temporary directory.
dir_raw_data       <- fs::path(dir_temp, "raw-data")
dir_processed_data <- fs::path(dir_temp, "processed-data")
dir_bundles        <- fs::path(dir_temp, "bundle")
Code
# One raw-data subfolder per data source. The folder names use hyphens and
# the variable names use underscores.
dir_raw_data_actigraphy        <- fs::path(dir_raw_data, "actigraphy")
dir_raw_data_consent           <- fs::path(dir_raw_data, "consent")
dir_raw_data_control_form      <- fs::path(dir_raw_data, "control-form")
dir_raw_data_delivery_receipt  <- fs::path(dir_raw_data, "delivery-receipt")
dir_raw_data_field_form        <- fs::path(dir_raw_data, "field-form")
dir_raw_data_medical_record    <- fs::path(dir_raw_data, "medical-record")
dir_raw_data_pilot_form        <- fs::path(dir_raw_data, "pilot-form")
dir_raw_data_pregnancy_booklet <- fs::path(dir_raw_data, "pregnancy-booklet")
dir_raw_data_return_receipt    <- fs::path(dir_raw_data, "return-receipt")
dir_raw_data_sleep_diary       <- fs::path(dir_raw_data, "sleep-diary")
Code
# Processed-data subfolders.
dir_processed_data_actigraphy  <- fs::path(dir_processed_data, "actigraphy")
dir_processed_data_sleep_diary <- fs::path(dir_processed_data, "sleep-diary")
Code
# Destination of the full export (`raw.csv`) of each Google Sheets form.
file_pilot_form   <- fs::path(dir_raw_data_pilot_form, "raw.csv")
file_field_form   <- fs::path(dir_raw_data_field_form, "raw.csv")
file_sleep_diary  <- fs::path(dir_raw_data_sleep_diary, "raw.csv")
file_control_form <- fs::path(dir_raw_data_control_form, "raw.csv")
Code
# Create the directory stored in every variable whose name starts with
# `dir_`, skipping the ones that already exist.
for (i in stringr::str_subset(ls(), "^dir_")) {
  if (dir.exists(get(i))) next

  dir.create(get(i), recursive = TRUE, showWarnings = FALSE)
}

B.3 Getting Data from Google Sheets

Code
# Read the "Dataset" sheet of the pilot form spreadsheet. The single "c" in
# `col_types` is recycled, so every column is read as character.
raw_data_pilot_form <-
  "19VvBMLL0EVk5345pLDdusEfI1WCZD1YfDjqmQnRgsuk" |>
  googlesheets4::read_sheet(
    sheet = "Dataset",
    range = NULL,
    col_names = TRUE,
    col_types = "c",
    na = c("", "NA"),
    trim_ws = TRUE,
    skip = 0,
    n_max = Inf,
    .name_repair = "unique"
  )

raw_data_pilot_form
Code
# Save the full pilot form export as CSV.
readr::write_csv(raw_data_pilot_form, file_pilot_form)
Code
# Read the "Dataset" sheet of the field form spreadsheet. The single "c" in
# `col_types` is recycled, so every column is read as character.
raw_data_field_form <-
  "1tY_TT0nPXFsErYQS2VED0-hqTzgHudA9BfhRhYG19fw" |>
  googlesheets4::read_sheet(
    sheet = "Dataset",
    range = NULL,
    col_names = TRUE,
    col_types = "c",
    na = c("", "NA"),
    trim_ws = TRUE,
    skip = 0,
    n_max = Inf,
    .name_repair = "unique"
  )

raw_data_field_form
Code
# Save the full field form export as CSV.
readr::write_csv(raw_data_field_form, file_field_form)
Code
# Read the "Dataset" sheet of the sleep diary spreadsheet. The single "c" in
# `col_types` is recycled, so every column is read as character.
raw_data_sleep_diary <-
  "1VKJgS8ZrUO9E-yZ6WwfZgKNoEQWH7afk3K5jpAjgKxU" |>
  googlesheets4::read_sheet(
    sheet = "Dataset",
    range = NULL,
    col_names = TRUE,
    col_types = "c",
    na = c("", "NA"),
    trim_ws = TRUE,
    skip = 0,
    n_max = Inf,
    .name_repair = "unique"
  )

raw_data_sleep_diary
Code
# Save the full sleep diary export as CSV.
readr::write_csv(raw_data_sleep_diary, file_sleep_diary)
Code
# Read the "Dataset" sheet of the control form spreadsheet. The single "c" in
# `col_types` is recycled, so every column is read as character.
raw_data_control_form <-
  "13wtDr4fRD1wJSM-qdLOdGSchF8oXU1bG0Jtccu24GhY" |>
  googlesheets4::read_sheet(
    sheet = "Dataset",
    range = NULL,
    col_names = TRUE,
    col_types = "c",
    na = c("", "NA"),
    trim_ws = TRUE,
    skip = 0,
    n_max = Inf,
    .name_repair = "unique"
  )

raw_data_control_form
Code
# Save the full control form export as CSV.
readr::write_csv(raw_data_control_form, file_control_form)

B.4 Getting Data from OSF

B.4.1 Assigning IDs

Code
# OSF node identifiers, one per project component. The names follow the
# `osf_<component>_id` scheme that the listing loop relies on.
osf_bundles_id        <- "fvh4u"
osf_pilot_data_id     <- "tj5u2"
osf_processed_data_id <- "a2dsw"
osf_raw_data_id       <- "7kg34"
osf_tidy_data_id      <- "npkjw"

B.4.2 Getting File Lists

Code
# Components whose top-level file listing is fetched from OSF.
patterns <- c(
  "pilot_data", "raw_data", "processed_data", "tidy_data", "bundles"
)

for (i in patterns) {
  # Look up `osf_<i>_id`, retrieve that node, list all of its files, and
  # store the listing in the global variable `osf_<i>_files`.
  assign(
    x = paste0("osf_", i, "_files"),
    value = osfr::osf_ls_files(
      osfr::osf_retrieve_node(get(paste0("osf_", i, "_id"))),
      n_max = Inf
    ),
    envir = .GlobalEnv
  )
}
Code
# Raw-data folders on OSF whose contents are listed.
patterns <- c(
  "actigraphy", "consent", "control-form", "delivery-receipt", "field-form",
  "medical-record", "pilot-form", "pregnancy-booklet", "return-receipt",
  "sleep-diary"
)

for (i in patterns) {
  # Variable names use underscores where the folder names use hyphens.
  normalized_i <- gsub("-", "_", i, fixed = TRUE)

  # Find the folder named `i` in the raw-data listing, list its files, and
  # store the result in the global `osf_raw_data_<normalized_i>_files`.
  assign(
    x = paste0("osf_raw_data_", normalized_i, "_files"),
    value = osfr::osf_ls_files(
      osfr::osf_retrieve_file(
        dplyr::pull(dplyr::filter(osf_raw_data_files, name == i), id)
      ),
      n_max = Inf
    ),
    envir = .GlobalEnv
  )
}
Code
# Processed-data folders on OSF whose contents are listed.
patterns <- c("actigraphy", "sleep-diary")

for (i in patterns) {
  # Variable names use underscores where the folder names use hyphens.
  normalized_i <- gsub("-", "_", i, fixed = TRUE)

  # Find the folder named `i` in the processed-data listing, list its files,
  # and store the result in the global
  # `osf_processed_data_<normalized_i>_files`.
  assign(
    x = paste0("osf_processed_data_", normalized_i, "_files"),
    value = osfr::osf_ls_files(
      osfr::osf_retrieve_file(
        dplyr::pull(dplyr::filter(osf_processed_data_files, name == i), id)
      ),
      n_max = Inf
    ),
    envir = .GlobalEnv
  )
}

B.4.3 Downloading Files

Code
# Raw-data folders whose files are downloaded from OSF and then unlocked.
patterns <- c(
  "actigraphy",
  "consent",
  "delivery-receipt",
  "medical-record",
  "pregnancy-booklet",
  "return-receipt"
)

for (i in patterns) {
  normalized_i <- stringr::str_replace_all(i, "-", "_")
  dir_i <- get(paste0("dir_raw_data_", normalized_i))

  # Download only the files whose names start with one of the study IDs,
  # overwriting any local copy.
  paste0("osf_raw_data_", normalized_i, "_files") |>
    get() |>
    dplyr::filter(stringr::str_detect(name, ids_pattern)) |>
    osfr::osf_download(
      path = dir_i,
      conflicts = "overwrite",
      progress = TRUE
    )

  # Unlock the downloaded files. The key path is held in `private_key`, the
  # variable assigned in the "Set Keys" section.
  dir_i |>
    lockr::unlock_dir(
      private_key = private_key,
      password = password
    )
}
Code
# Processed-data folders whose files are downloaded from OSF and then
# unlocked (only actigraphy at the moment).
patterns <- c("actigraphy")

for (i in patterns) {
  normalized_i <- stringr::str_replace_all(i, "-", "_")
  dir_i <- get(paste0("dir_processed_data_", normalized_i))

  # Download only the files whose names start with one of the study IDs,
  # overwriting any local copy.
  paste0("osf_processed_data_", normalized_i, "_files") |>
    get() |>
    dplyr::filter(stringr::str_detect(name, ids_pattern)) |>
    osfr::osf_download(
      path = dir_i,
      conflicts = "overwrite",
      progress = TRUE
    )

  # Unlock the downloaded files. The key path is held in `private_key`, the
  # variable assigned in the "Set Keys" section.
  dir_i |>
    lockr::unlock_dir(
      private_key = private_key,
      password = password
    )
}

B.5 Getting CPF Data

Code
# Map each study ID to the CPF recorded in the control form: keep only the
# CPF and anonymized-ID columns (renamed to `cpf` and `id`), join them onto
# the ID list, drop duplicated rows, and sort by CPF. IDs without a match in
# the control form keep `cpf = NA` (left join).
id_data <-
  dplyr::left_join(
    dplyr::tibble(id = ids),
    raw_data_control_form |>
      janitor::clean_names() |>
      dplyr::select(
        cpf = qual_cpf_da_gestante, # col_index == 5
        id = qual_e_o_id_anonimizado_da_gestante # col_index == 6
      ),
    by = "id"
  ) |>
  dplyr::distinct() |>
  dplyr::arrange(cpf)

id_data
Code
# Run the project's `test_cpf()` check on the CPF column.
test_cpf(dplyr::pull(id_data, cpf))

B.6 Load, Filter & Write Data

Code
# For each participant (one row of `id_data`, with columns `id` and `cpf`),
# filter every form down to that participant and write the non-empty subsets
# to the temporary raw/processed folders as `<id>_<source>.<ext>`.
#
# `seq_len()` replaces `seq()` so that a zero-row `id_data` yields no
# iterations instead of the bogus groups `c(1, 0)`.
for (i in split(id_data, seq_len(nrow(id_data)))) {
  # Control form: matched on the anonymized ID (column 6).
  raw_data_control_form_i <-
    raw_data_control_form |>
    scaler:::filter_data(col_index = 6, value = i$id) |>
    rutils:::shush()

  if (nrow(raw_data_control_form_i) > 0) {
    raw_data_control_form_i |>
      readr::write_csv(
        path(
          dir_raw_data_control_form,
          paste0(i$id, "_control-form", ".csv")
        )
      )
  }

  # Pilot and field forms: matched on the CPF (column 5).
  raw_data_pilot_form_i <-
    raw_data_pilot_form |>
    scaler:::filter_data(col_index = 5, value = i$cpf) |>
    rutils:::shush()

  raw_data_field_form_i <-
    raw_data_field_form |>
    scaler:::filter_data(col_index = 5, value = i$cpf) |>
    rutils:::shush()

  # The pilot form is written only when the participant has no field form
  # data.
  if (nrow(raw_data_field_form_i) == 0 &&
      nrow(raw_data_pilot_form_i) > 0) {
    raw_data_pilot_form_i |>
      readr::write_csv(
        path(
          dir_raw_data_pilot_form,
          paste0(i$id, "_pilot-form", ".csv")
        )
      )
  }

  if (nrow(raw_data_field_form_i) > 0) {
    raw_data_field_form_i |>
      readr::write_csv(
        path(
          dir_raw_data_field_form,
          paste0(i$id, "_field-form", ".csv")
        )
      )
  }

  # Sleep diary: matched on the CPF (column 3).
  # NOTE(review): unlike the filters above, this call is not wrapped in
  # `rutils:::shush()` -- confirm that is intended.
  raw_data_sleep_diary_i <-
    raw_data_sleep_diary |>
    scaler:::filter_data(col_index = 3, value = i$cpf)

  if (nrow(raw_data_sleep_diary_i) > 0) {
    # Derived tables computed by the `scaler` helpers from selected columns.
    sleep_diary_type_of_day_i <-
      raw_data_sleep_diary_i |>
      scaler:::get_sleep_diary_type_of_day(col_indexes = c(1, 4, 8, 10))

    tidy_data_sleep_diary_i <-
      raw_data_sleep_diary_i |>
      scaler:::tidy_sleep_diary(col_indexes = c(1, 8, 10, 19:28))

    raw_data_sleep_diary_i |>
      readr::write_csv(
        path(
          dir_raw_data_sleep_diary,
          paste0(i$id, "_sleep-diary", ".csv")
        )
      )

    if (nrow(sleep_diary_type_of_day_i) > 0) {
      sleep_diary_type_of_day_i |>
        readr::write_csv(
          path(
            dir_processed_data_sleep_diary,
            paste0(i$id, "_sleep-diary-type-of-day", ".csv")
          )
        )
    }

    # The tidy diary is handed to `scaler:::actstudio_sleep_diary()`, which
    # receives a `.txt` target inside the processed actigraphy folder.
    if (nrow(tidy_data_sleep_diary_i) > 0) {
      tidy_data_sleep_diary_i |>
        scaler:::actstudio_sleep_diary(
          file = path(
            dir_processed_data_actigraphy,
            paste0(i$id, "_actigraphy-sleep-diary", ".txt")
          )
        )
    }
  }
}

B.7 Creating Bundles

Code
# Build one password-protected zip bundle per participant from whichever of
# the participant's files exist in the temporary folders.

# An empty password would leave `--password` without its argument, so the
# `zip` command would consume the archive path as the password. Stop early.
if (!isTRUE(nzchar(password))) {
  stop(
    "`password` is empty; set PREGNANCY_PASSWORD before creating bundles.",
    call. = FALSE
  )
}

# `seq_len()` replaces `seq()` so that a zero-row `id_data` yields no
# iterations.
for (i in split(id_data, seq_len(nrow(id_data)))) {
  # Candidate files for this participant. The names are labels only.
  bundle_files <- c(
    "actigraphy-raw-data" = path(
      dir_raw_data_actigraphy,
      paste0(i$id, "_actigraphy-raw-data", ".txt")
    ),
    "actigraphy-raw-data-report" = path(
      dir_raw_data_actigraphy,
      paste0(i$id, "_actigraphy-raw-data-report", ".txt")
    ),
    "actigraphy-processed-data" = path(
      dir_processed_data_actigraphy,
      paste0(i$id, "_actigraphy-processed-data", ".txt")
    ),
    "actigraphy-sleep-diary" = path(
      dir_processed_data_actigraphy,
      paste0(i$id, "_actigraphy-sleep-diary", ".txt")
    ),
    "consent" = path(
      dir_raw_data_consent,
      paste0(i$id, "_consent", ".pdf")
    ),
    "delivery-receipt" = path(
      dir_raw_data_delivery_receipt,
      paste0(i$id, "_delivery-receipt", ".pdf")
    ),
    # This entry was labelled "field-form", duplicating the next one.
    "control-form" = path(
      dir_raw_data_control_form,
      paste0(i$id, "_control-form", ".csv")
    ),
    "field-form" = path(
      dir_raw_data_field_form,
      paste0(i$id, "_field-form", ".csv")
    ),
    "medical-record" = path(
      dir_raw_data_medical_record,
      paste0(i$id, "_medical-record", ".pdf")
    ),
    "pregnancy-booklet" = path(
      dir_raw_data_pregnancy_booklet,
      paste0(i$id, "_pregnancy-booklet", ".pdf")
    ),
    "return-receipt" = path(
      dir_raw_data_return_receipt,
      paste0(i$id, "_return-receipt", ".pdf")
    ),
    "sleep-diary" = path(
      dir_raw_data_sleep_diary,
      paste0(i$id, "_sleep-diary", ".csv")
    ),
    "sleep-diary-type-of-day" = path(
      dir_processed_data_sleep_diary,
      paste0(i$id, "_sleep-diary-type-of-day", ".csv")
    )
  )

  # Keep only the files that exist (vectorized; no mutation while looping).
  bundle_files <- bundle_files[file.exists(bundle_files)]

  if (length(bundle_files) > 0) {
    # `flags` is passed to the shell unquoted by `utils::zip()`, so the
    # password is quoted here. `-j` stores the files without their paths.
    utils::zip(
      zipfile = file.path(dir_bundles, i$id),
      files = bundle_files,
      flags = paste("--password", shQuote(password)),
      extras = "-j"
    )

    # zip::zip(
    #   zipfile = path(dir_bundles, paste0(i$id, ".zip")),
    #   files = bundle_files |> path_rel(start = dir_temp),
    #   root = dir_temp
    # )
  }
}

B.8 Writing Data to OSF

Code
# Raw-data folders whose locally generated files are locked and uploaded.
patterns <- c(
  "control-form",
  "field-form",
  "pilot-form",
  "sleep-diary"
)

for (i in patterns) {
  normalized_i <- stringr::str_replace_all(i, "-", "_")

  # OSF folder named `i` inside the raw-data component.
  osf_dir_id <-
    osf_raw_data_files |>
    dplyr::filter(name == i) |>
    dplyr::pull(id) |>
    osfr::osf_retrieve_file()

  dir_raw_data_i <-
    paste0("dir_raw_data_", normalized_i) |>
    get()

  # Lock the folder's files. The key path is held in `public_key`, the
  # variable assigned in the "Set Keys" section.
  dir_raw_data_i |>
    lockr::lock_dir(
      public_key = public_key,
      remove_file = FALSE
    )

  # Only the locked copies (`*.lockr`) are uploaded.
  files_i <-
    dir_raw_data_i |>
    dir_ls(type = "file") |>
    stringr::str_subset("\\.lockr$")

  # osf_files <-
  #   paste0("osf_raw_data_", normalized_i, "_files") |>
  #   get() |>
  #   dplyr::pull(name)

  # setdiff(
  #   files_i |> basename() |> paste0(".lockr"),
  #   osf_files
  # )

  osf_dir_id |>
    osfr::osf_upload(
      path = files_i,
      conflicts = "overwrite",
      progress = TRUE
    )
}
Code
# Processed-data folders whose locally generated files are locked and
# uploaded.
patterns <- c(
  "actigraphy",
  "sleep-diary"
)

for (i in patterns) {
  normalized_i <- stringr::str_replace_all(i, "-", "_")

  # OSF folder named `i` inside the processed-data component.
  osf_dir_id <-
    osf_processed_data_files |>
    dplyr::filter(name == i) |>
    dplyr::pull(id) |>
    osfr::osf_retrieve_file()

  dir_processed_data_i <-
    paste0("dir_processed_data_", normalized_i) |>
    get()

  # Lock the folder's files. The key path is held in `public_key`, the
  # variable assigned in the "Set Keys" section.
  dir_processed_data_i |>
    lockr::lock_dir(
      public_key = public_key,
      remove_file = FALSE
    )

  # Upload only locked copies (`*.lockr`) whose path contains "sleep-diary".
  files_i <-
    dir_processed_data_i |>
    dir_ls(type = "file") |>
    stringr::str_subset("sleep-diary") |>
    stringr::str_subset("\\.lockr$")

  osf_dir_id |>
    osfr::osf_upload(
      path = files_i,
      conflicts = "overwrite",
      progress = TRUE
    )
}
Code
# Lock the bundle folder's files. The key path is held in `public_key`, the
# variable assigned in the "Set Keys" section.
dir_bundles |>
  lockr::lock_dir(
    public_key = public_key,
    remove_file = FALSE
  )

# Only the locked copies (`*.lockr`) are uploaded.
files_i <-
  dir_bundles |>
  dir_ls(type = "file") |>
  stringr::str_subset("\\.lockr$")

# Upload to the bundles component, overwriting files with the same name.
osf_bundles_id |>
  osfr::osf_retrieve_node() |>
  osfr::osf_upload(
    path = files_i,
    conflicts = "overwrite",
    progress = TRUE
  )

B.9 Delete Files

Only delete files after uploading them to OSF and the Cryptomator vault.

Code
# Remove the temporary working directory and everything inside it.
fs::dir_delete(path = dir_temp)