diff --git a/DESCRIPTION b/DESCRIPTION index 817d1e0..e7eae34 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,3 +22,4 @@ Imports: Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.2 + diff --git a/NAMESPACE b/NAMESPACE index b0d39a7..9057002 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,15 +1,15 @@ -# Generated by roxygen2: do not edit by hand - -export(createDictionary) -export(dictionaryGood) -export(extractDataset) -export(extractVars) -export(extractWebOutput) -export(filterVars) -export(findVars) -export(getDefaultDataDir) -export(readExclusions) -export(removeExclusions) -export(setDataDir) -export(updateDictionaries) -importFrom(magrittr,"%>%") +# Generated by roxygen2: do not edit by hand + +export(createDictionary) +export(dictionaryGood) +export(extractDataset) +export(extractVars) +export(extractWebOutput) +export(filterVars) +export(findVars) +export(getDefaultDataDir) +export(readExclusions) +export(removeExclusions) +export(setDataDir) +export(updateDictionaries) +importFrom(magrittr,"%>%") diff --git a/R/.Rhistory b/R/.Rhistory new file mode 100644 index 0000000..ef5eadb --- /dev/null +++ b/R/.Rhistory @@ -0,0 +1 @@ +remove.packages("alspac") diff --git a/R/dictionary.r b/R/dictionary.r index b33ecc8..61512cd 100644 --- a/R/dictionary.r +++ b/R/dictionary.r @@ -1,202 +1,228 @@ -loadDictionaries <- function() { - path <- file.path(system.file(package = "alspac"), "data") - assign("globals", new.env(), envir=parent.env(environment())) - for (file in list.files(path, "rdata$", full.names=TRUE)) { - load(file, globals) - } - #combineDictionaries() -} - -combineDictionaries <- function() { - both <- retrieveDictionary("current") - #if (exists("useful", envir=globals)) - # both <- rbind.fill(both, retrieveDictionary("useful")) - assign("both", both, globals) -} - -retrieveDictionary <- function(name) { - if (name %in% ls(envir=globals)) { - get(name, envir=globals) - } else { - stop("dictionary '", name, "' does not exist") - } -} - -saveDictionary <- function(name, dictionary) { - assign(name, dictionary, globals) - #if (name == "current" || name == "useful") - # combineDictionaries() - - path <- file.path(system.file(package="alspac"), "data") - if (!file.exists(path)) { - dir.create(path) - } - save(list=name, - file=file.path(path, paste(name, "rdata", sep=".")), - envir=globals) -} - -#' Checks a dictionary -#' -#' Checks if all the files referred to in the dictionary -#' are accessible given the ALSPAC data directory. -#' -#' @param dictionary The name of an existing dictionary or the dictionary itself. -#' @param max.print The maximum number of missing files to list if any are missing -#' (Default: 10). -#' @export -#' @return \code{TRUE} if all files exist, otherwise \code{FALSE} and a warning listing at most -#' \code{max.print} missing files. -#' -dictionaryGood <- function(dictionary, max.print=10) { - if (is.character(dictionary)) { - dictionary <- retrieveDictionary(dictionary) - } - - alspacdir <- options()$alspac_data_dir - - filenames <- unique(with(dictionary, file.path(alspacdir, path, obj))) - missing.idx <- which(!sapply(filenames, file.exists)) - num.missing <- length(missing.idx) - if (num.missing == 0) { - TRUE - } else { - missing.idx <- missing.idx[1:min(max.print,num.missing)] - warning("Please run 'updateDictionaries()' and try again. ", - "If you are using input from 'findVars()', ", - "then you will need to rerun that as well. ", - "Dictionary refers to missing files, e.g. ", - paste(filenames[missing.idx], collapse=", ")) - FALSE - } -} - - -#' Update dictionaries -#' -#' Update the variable dictionaries for the ALSPAC dataset. -#' -#' @export -updateDictionaries <- function() { - createDictionary("Current", name="current", quick=FALSE) - #createDictionary("Useful_data", name="useful", quick=FALSE) - return(TRUE) -} - - -#' Create a dictionary from ALSPAC Stata files -#' -#' @param datadir ALSPAC data subdirectory from which to create the index -#' (Default: "Current"). . -#' @param name If not \code{NULL}, then the resulting dictionary -#' will be saved to a file in the R package for use next time the package -#' is loaded. The dictionary will be available with the given name (Default: \code{NULL}). -#' @param quick Logical. Default \code{FALSE}. -#' -#' The function uses multiple processors using \code{\link[parallel]{mclapply}()}. -#' Use multiple processors by setting \code{mc.cores} option using -#' \code{options()}. -#' -#' @export -#' @return Data frame dictionary listing available variables. -createDictionary <- function(datadir="Current", name=NULL, quick=FALSE) { - stopifnot(datadir == "Current") - - alspacdir <- options()$alspac_data_dir - datadir <- file.path(alspacdir, datadir) - files <- list.files(datadir, - pattern="dta$", - full.names=TRUE, - recursive=TRUE, - ignore.case=TRUE) - - dictionary <- parallel::mclapply(files, function(file) { - cat(date(), "loading", file, "\n") - tryCatch({ - merge( - processDTA(file, quick), - createFileTable(file, alspacdir), by = "obj") - }, error=function(e) { - warning("Error loading", file, "\n") - print(e) - NULL - }) - }) %>% dplyr::bind_rows() - - dictionary <- dictionary[which(dictionary$counts > 0),] - - ## add data sources information so that withdrawn consent can be - ## handled correctly for each variable - dictionary <- addSourcesToDictionary(dictionary) - - if (!is.null(name)) { - saveDictionary(name, dictionary) - } - - invisible(dictionary) -} - -countCharOccurrences <- function(char, s) { - s2 <- gsub(char,"",s) - return(nchar(s) - nchar(s2)) -} - - -trimWhitespace <- function(x) { - if (is.numeric(x)) { - return(x) - } - flag <- is.factor(x) - x <- gsub("^\\s+|\\s+$", "", x) - if (flag) { - return(as.factor(x)) - } else { - return(x) - } -} - - -createFileTable <- function(fls, alspacdir) { - #fls_dn <- dirname(fls) ## does some weird things with windows network paths - fls_bn <- basename(fls) - fls_dn <- sub(fls_bn, "", fls) - fls_n <- gsub(".dta", "", fls_bn, ignore.case=TRUE) - fls_d <- gsub(alspacdir, "", fls_dn) - fls_d <- gsub("^/", "", fls_d) - nfield <- max(countCharOccurrences("/", fls_d)) + 1 - stopifnot(! any(duplicated(fls_n))) - - sp <- strsplit(fls_d, split="/") - sp <- lapply(sp, function(x) { - y <- rep(NA, nfield) - y[1:length(x)] <- x - return(y) - }) - dat <- data.frame(do.call("rbind", sp), stringsAsFactors=FALSE) - names(dat) <- paste("cat", 1:nfield, sep="") - dat$obj <- fls_bn - dat$path <- fls_d - return(dat) -} - -processDTA <- function(fn, quick=FALSE) { - if (quick) { - temp <- suppressWarnings(readstata13::read.dta13(fn, select.rows=5)) - } else { - temp <- suppressWarnings(readstata13::read.dta13(fn)) - # temp <- haven::read_dta(fn) - } - dat <- dplyr::tibble( - name = colnames(temp), - lab = attributes(temp)$var.labels, - # lab = sapply(temp, function(x) attr(x, "label")), - type = sapply(temp, function(x) class(x)[1]), - obj = basename(fn) - ) - if (quick) { - dat$counts <- NA - } else { - dat$counts = sapply(temp, function(x) sum(!is.na(x) & x != -10 & x != -11)) - } - return(dat) -} + loadDictionaries <- function() { + path <- file.path(system.file(package = "alspac"), "data") + + # Initialize globals in the global environment + if (!exists("globals")) { + assign("globals", new.env(), envir = .GlobalEnv) + } + + for (file in list.files(path, "rdata$", full.names = TRUE)) + load(file, globals) + combineDictionaries() + } + + +combineDictionaries <- function() { + both <- NULL + + # Check if "current" exists + if (exists("current", envir=globals)) { + both <- retrieveDictionary("current") + } else { + # Handle the case when "current" doesn't exist + warning("Dictionary 'current' does not exist. Please run 'updateDictionaries()' to create it.") + return(NULL) + } + + # Check if "custom" exists + if (exists("custom", envir=globals)) { + custom <- retrieveDictionary("custom") + library(plyr) + both <- rbind.fill(both, custom) + assign("both", both, globals) + } else { + warning("Dictionary 'custom' does not exist.") + } +} + + +retrieveDictionary <- function(name) { + if (name %in% ls(envir=globals)) { + get(name, envir=globals) + } else { + stop("dictionary '", name, "' does not exist") + } +} + +saveDictionary <- function(name, dictionary) { + assign(name, dictionary, globals) + #if (name == "current" || name == "useful") + # combineDictionaries() + + path <- file.path(system.file(package="alspac"), "data") + if (!file.exists(path)) { + dir.create(path) + } + save(list=name, + file=file.path(path, paste(name, "rdata", sep=".")), + envir=globals) +} + +#' Checks a dictionary +#' +#' Checks if all the files referred to in the dictionary +#' are accessible given the ALSPAC data directory. +#' +#' @param dictionary The name of an existing dictionary or the dictionary itself. +#' @param max.print The maximum number of missing files to list if any are missing +#' (Default: 10). +#' @export +#' @return \code{TRUE} if all files exist, otherwise \code{FALSE} and a warning listing at most +#' \code{max.print} missing files. +#' +dictionaryGood <- function(dictionary, max.print=10) { + if (is.character(dictionary)) { + dictionary <- retrieveDictionary(dictionary) + } + + alspacdir <- options()$alspac_data_dir + + filenames <- unique(with(dictionary, file.path(alspacdir, path, obj))) + missing.idx <- which(!sapply(filenames, file.exists)) + num.missing <- length(missing.idx) + if (num.missing == 0) { + TRUE + } else { + missing.idx <- missing.idx[1:min(max.print,num.missing)] + warning("Please run 'updateDictionaries()' and try again. ", + "If you are using input from 'findVars()', ", + "then you will need to rerun that as well. ", + "Dictionary refers to missing files, e.g. ", + paste(filenames[missing.idx], collapse=", ")) + FALSE + } +} + + +#' Update dictionaries +#' +#' Update the variable dictionaries for the ALSPAC dataset. +#' +#' @export +updateDictionaries <- function() { + createDictionary("Current", name="current", quick=FALSE) + #createDictionary("Useful_data", name="useful", quick=FALSE) + return(TRUE) +} + + +#' Create a dictionary from ALSPAC Stata files +#' +#' @param datadir ALSPAC data subdirectory from which to create the index +#' (Default: "Current"). . +#' @param name If not \code{NULL}, then the resulting dictionary +#' will be saved to a file in the R package for use next time the package +#' is loaded. The dictionary will be available with the given name (Default: \code{NULL}). +#' @param quick Logical. Default \code{FALSE}. +#' +#' The function uses multiple processors using \code{\link[parallel]{mclapply}()}. +#' Use multiple processors by setting \code{mc.cores} option using +#' \code{options()}. +#' +#' @export +#' @return Data frame dictionary listing available variables. +createDictionary <- function(datadir="Current", name=NULL, quick=FALSE, sourcesFile = NULL) { + stopifnot(datadir %in% c("Current", "../DataBuddy/DataRequests/Waiting Room")) + if(is.null(sourcesFile)) + sourcesFile <- system.file("data", "sources.csv", package = "alspac") + + + alspacdir <- options()$alspac_data_dir + datadir <- file.path(alspacdir, datadir) + files <- list.files(datadir, + pattern="dta$", + full.names=TRUE, + recursive=TRUE, + ignore.case=TRUE) + + dictionary <- parallel::mclapply(files, function(file) { + cat(date(), "loading", file, "\n") + tryCatch({ + merge( + processDTA(file, quick), + createFileTable(file, alspacdir), by = "obj") + }, error=function(e) { + warning("Error loading", file, "\n") + print(e) + NULL + }) + }) %>% dplyr::bind_rows() + + dictionary <- dictionary[which(dictionary$counts > 0),] + + ## add data sources information so that withdrawn consent can be + ## handled correctly for each variable + dictionary <- addSourcesToDictionary(dictionary) + + if (!is.null(name)) { + saveDictionary(name, dictionary) + } + + invisible(dictionary) +} + +countCharOccurrences <- function(char, s) { + s2 <- gsub(char,"",s) + return(nchar(s) - nchar(s2)) +} + + +trimWhitespace <- function(x) { + if (is.numeric(x)) { + return(x) + } + flag <- is.factor(x) + x <- gsub("^\\s+|\\s+$", "", x) + if (flag) { + return(as.factor(x)) + } else { + return(x) + } +} + + +createFileTable <- function(fls, alspacdir) { + #fls_dn <- dirname(fls) ## does some weird things with windows network paths + fls_bn <- basename(fls) + fls_dn <- sub(fls_bn, "", fls) + fls_n <- gsub(".dta", "", fls_bn, ignore.case=TRUE) + fls_d <- gsub(alspacdir, "", fls_dn) + fls_d <- gsub("^/", "", fls_d) + nfield <- max(countCharOccurrences("/", fls_d)) + 1 + stopifnot(! any(duplicated(fls_n))) + + sp <- strsplit(fls_d, split="/") + sp <- lapply(sp, function(x) { + y <- rep(NA, nfield) + y[1:length(x)] <- x + return(y) + }) + dat <- data.frame(do.call("rbind", sp), stringsAsFactors=FALSE) + names(dat) <- paste("cat", 1:nfield, sep="") + dat$obj <- fls_bn + dat$path <- fls_d + return(dat) +} + +processDTA <- function(fn, quick=FALSE) { + if (quick) { + temp <- suppressWarnings(readstata13::read.dta13(fn, select.rows=5)) + } else { + temp <- suppressWarnings(readstata13::read.dta13(fn)) + # temp <- haven::read_dta(fn) + } + dat <- dplyr::tibble( + name = colnames(temp), + lab = attributes(temp)$var.labels, + # lab = sapply(temp, function(x) attr(x, "label")), + type = sapply(temp, function(x) class(x)[1]), + obj = basename(fn) + ) + if (quick) { + dat$counts <- NA + } else { + dat$counts = sapply(temp, function(x) sum(!is.na(x) & x != -10 & x != -11)) + } + return(dat) +} + diff --git a/R/exclusions.r b/R/exclusions.r index b169734..edb9489 100644 --- a/R/exclusions.r +++ b/R/exclusions.r @@ -12,36 +12,50 @@ #' @return The input data frame but with appropriate values set to missing #' with additional variables ("woc_*") identifying participants #' who have withdrawn consent. -removeExclusions <- function(x) { - stopifnot("aln" %in% names(x)) +removeExclusions <- function(x, dictionary) { + stopifnot("aln" %in% names(x)) + + ## obtain alns for individuals that have withdrawn consent + withdrawals <- readExclusions() - ## obtain alns for individuals that have withdrawn consent - withdrawals <- readExclusions() - - ## obtain dictionary corresponding the requested dataset - dictionary <- retrieveDictionary("current") - dictionary <- dictionary[match(colnames(x), dictionary$name),] + ## add variables for identifying core ALSPAC participants + current <- retrieveDictionary("current") + current <- current[which(!current$name %in% dictionary$name),] + for (col in setdiff(colnames(dictionary),colnames(current))) + current[[col]] <- NA + dictionary <- rbind(dictionary,current) - ## check that exclusions information in the dictionary is up-to-date - if(!all(names(withdrawals) %in% colnames(dictionary))) { - stop( - "New exclusion file(s) have been created but are not being handled here: ", - paste(setdiff(names(withdrawals), colnames(dictionary)), collapse=", ")) - } + ## these variables are computed, ignore them + exceptions <- c("alnqlet",colnames(x)[grep("^in_obj_", colnames(x))]) - for (group in names(withdrawals)) { - sample.idx <- which(x$aln %in% withdrawals[[group]]) - if (length(sample.idx) == 0) next + ## check all variables are in the dictionary or are computed variables + if (!all(colnames(x) %in% c(dictionary$name,exceptions))) + stop("Column names do not match the allowed names in the dictionary.") + + ## check that exclusions information in the dictionary is up-to-date + if(!all(names(withdrawals) %in% colnames(dictionary))) { + stop( + "New exclusion file(s) have been created but are not being handled here: ", + paste(setdiff(names(withdrawals), colnames(dictionary)), collapse=", ")) + } - var.idx <- which(dictionary[[group]]) - if (length(var.idx) == 0) next - - x[sample.idx,var.idx] <- NA - withdrawal.name <- paste("woc",group,sep="_") - x[[withdrawal.name]] <- 1:nrow(x) %in% sample.idx - } - - x + ## make sure that the rows of the dictionary match + ##the columns of x (needed below to know which variable values to exclude) + dictionary <- dictionary[match(colnames(x),dictionary$name),] + + for (group in names(withdrawals)) { + sample.idx <- which(x$aln %in% withdrawals[[group]]) + if (length(sample.idx) == 0) next + + var.idx <- which(dictionary[[group]]) + if (length(var.idx) == 0) next + + x[sample.idx,var.idx] <- NA + withdrawal.name <- paste("woc",group,sep="_") + x[[withdrawal.name]] <- 1:nrow(x) %in% sample.idx + } + + x } #' Get list of ALNs to exclude @@ -78,8 +92,9 @@ readExclusions <- function() { #' See generateSourcesSpreadsheet() for details about creating this file. #' This information is used when decide which data values #' to remove for participants who have withdrawn consent. + #' @param dictionary The name of an existing dictionary or the dictionary itself. -addSourcesToDictionary <- function(dictionary) { +addSourcesToDictionary <- function(dictionary, sourcesFile = "sources.csv") { ## obtain alns for individuals that have withdrawn consent withdrawals <- readExclusions() paths <- getPaths() @@ -99,7 +114,7 @@ addSourcesToDictionary <- function(dictionary) { paste(setdiff(names(withdrawals), names(paths)), collapse=", ")) } - sources <- utils::read.csv(system.file("data", "sources.csv", package = "alspac"), stringsAsFactors=FALSE) + sources <- utils::read.csv(sourcesFile, stringsAsFactors=FALSE) stopifnot(all(names(keep) %in% colnames(sources))) ## match 'sources' to 'dictionary' using the 'obj' column diff --git a/R/extractdataset.r b/R/extractdataset.r index c4c95be..eab167f 100644 --- a/R/extractdataset.r +++ b/R/extractdataset.r @@ -1,132 +1,133 @@ -##' Extract a dataset for external ALSPAC users -#' -#' @param variable_file CSV file with column "Name" containing -#' ALSPAC variable names. -#' @param cid_file CSV file with two columns named "ALN" and the last letter -#' of the filename (e.g. for "ACEHDBFG.txt" the column would be named "G"). -#' @param b_number B number of the project. -#' @param author Last name of the project author. -#' @param output_format "sav","csv" or "dta" (Default: "sav"). -#' @param output_path File path of output file, default is the current directory (Default: "."). -#' @param output_file Dataset file (should not already exist). Default is -#' derived from function arguments as follows: -#' /__.. -#' @param dictionary ALSPAC dictionary to use "current" -#' (Default: "current"). -#' @return Saves the output dataset to `output_file` and returns it. -#' -#' @examples\dontrun{ -#' library(alspac) -#' setDataDir("R:/Data") -#' dat <- extractDataset( -#' variable_file="ACEHDBFG.txt", -#' cid_file="Vars_from_Explore.csv", -#' output_format="sav", -#' b_number="B0001", -#' author="Smith") -#' ## creates a data file with a name like "Smith_B0001_12Jul21.sav" -#' ## in the current directory -#' } -#' @export -extractDataset <- function(variable_file, cid_file, - b_number="BXXXX", author="Author", - output_format="sav", - output_path=".", - output_file=file.path( - output_path, - paste0( - author, "_", - b_number, "_", - format(Sys.time(), "%d%b%y"), - ".", output_format)), - dictionary="current") { - if (!dir.exists(output_path)) { - stop("Path in 'output_path' does not exist: ", output_path) - } - - stopifnot(output_format %in% c("sav","csv","dta")) - if (file.exists(output_file)) { - stop("Output file already exists: ", output_file) - } - - cid_map <- utils::read.csv(cid_file,stringsAsFactors=FALSE) - cid_column <- tolower(sub(".*(.{1})\\.[^.]*","\\1",cid_file)) - colnames(cid_map) <- tolower(colnames(cid_map)) - if (!"aln" %in% colnames(cid_map)) { - stop("ALN column is missing from ", cid_file) - } - if (!cid_column %in% colnames(cid_map)) { - stop("CID column ", cid_column, " is missing from ", cid_file) - } - - variables <- utils::read.csv(variable_file,stringsAsFactors=FALSE) - colnames(variables) <- tolower(colnames(variables)) - if (!"name" %in% colnames(variables)) { - stop("Variable name column 'name' is missing from ", variable_file) - } - - dictionary <- retrieveDictionary(dictionary) - - idx <- which(tolower(dictionary$name) %in% tolower(variables$name)) - freq <- table(dictionary$name[idx]) - if (any(freq > 1)) { - duplicates <- names(freq)[freq > 1] - idx <- which(dictionary$name %in% duplicates) - print( - with(dictionary[idx,], - data.frame(name=name,file=paste0(path,obj)))) - msg <- paste( - "Some variables have multiple sources:", - paste(duplicates,collapse=", ")) - warning(msg) - } - - dictionary <- dictionary[order(dictionary$counts,decreasing=TRUE),] - idx <- match(tolower(variables$name), tolower(dictionary$name)) - if (any(is.na(idx))) { - if (all(is.na(idx))) { - stop("None of the requested variables could be found.") - } else { - msg <- paste( - "Several requested variables could not be found:", - paste(variables$name[is.na(idx)],collapse=", ")) - warning(msg) - } - idx <- stats::na.omit(idx) - } - dictionary <- dictionary[idx,] - - dat <- extractVars(dictionary, spss=TRUE) - - idx <- match( - as.character(dat$aln), - as.character(cid_map$aln)) - - dat$aln <- cid_map[[cid_column]][idx] - new_column <- paste0("cid",b_number) - colnames(dat)[colnames(dat)=="aln"] <- new_column - - if ("alnqlet" %in% colnames(dat)) { - dat[["alnqlet"]] <- NULL - } - - dat <- dat[order(dat[[new_column]]),] - - attributes(dat[[new_column]])$label <- paste0( - "Unique pregnancy identifier for ", - author, - " (", sub("\\.[^.]+$", "", basename(cid_file)), ")") - if ("qlet" %in% colnames(dat)) { - attributes(dat$qlet)$label <- "Birth order (within pregnancy)" - } - - message("Saving output to ", output_file, "\n") - if (output_format=="dta") { - haven::write_dta(dat, path=output_file) - } else if (output_format=="csv") { - utils::write.csv(dat, file=output_file, row.names=FALSE) - } else if (output_format=="sav") { - haven::write_sav(dat, path=output_file, compress=TRUE) - } - invisible(dat) -} +##' Extract a dataset for external ALSPAC users +#' +#' @param variable_file CSV file with column "Name" containing +#' ALSPAC variable names. +#' @param cid_file CSV file with two columns named "ALN" and the last letter +#' of the filename (e.g. for "ACEHDBFG.txt" the column would be named "G"). +#' @param b_number B number of the project. +#' @param author Last name of the project author. +#' @param output_format "sav","csv" or "dta" (Default: "sav"). +#' @param output_path File path of output file, default is the current directory (Default: "."). +#' @param output_file Dataset file (should not already exist). Default is +#' derived from function arguments as follows: +#' /__.. +#' @param dictionary ALSPAC dictionary to use "current" +#' (Default: "current"). +#' @return Saves the output dataset to `output_file` and returns it. +#' +#' @examples\dontrun{ +#' library(alspac) +#' setDataDir("R:/Data") +#' dat <- extractDataset( +#' variable_file="ACEHDBFG.txt", +#' cid_file="Vars_from_Explore.csv", +#' output_format="sav", +#' b_number="B0001", +#' author="Smith") +#' ## creates a data file with a name like "Smith_B0001_12Jul21.sav" +#' ## in the current directory +#' } +#' @export +extractDataset <- function(variable_file, cid_file, + b_number="BXXXX", author="Author", + output_format="sav", + output_path=".", + output_file=file.path( + output_path, + paste0( + author, "_", + b_number, "_", + format(Sys.time(), "%d%b%y"), + ".", output_format)), + dictionary="current") { + if (!dir.exists(output_path)) { + stop("Path in 'output_path' does not exist: ", output_path) + } + + stopifnot(output_format %in% c("sav","csv","dta")) + if (file.exists(output_file)) { + stop("Output file already exists: ", output_file) + } + + cid_map <- utils::read.csv(cid_file,stringsAsFactors=FALSE) + cid_column <- tolower(sub(".*(.{1})\\.[^.]*","\\1",cid_file)) + colnames(cid_map) <- tolower(colnames(cid_map)) + if (!"aln" %in% colnames(cid_map)) { + stop("ALN column is missing from ", cid_file) + } + if (!cid_column %in% colnames(cid_map)) { + stop("CID column ", cid_column, " is missing from ", cid_file) + } + + variables <- utils::read.csv(variable_file,stringsAsFactors=FALSE) + colnames(variables) <- tolower(colnames(variables)) + if (!"name" %in% colnames(variables)) { + stop("Variable name column 'name' is missing from ", variable_file) + } + + dictionary <- retrieveDictionary(dictionary) + + idx <- which(tolower(dictionary$name) %in% tolower(variables$name)) + freq <- table(dictionary$name[idx]) + if (any(freq > 1)) { + duplicates <- names(freq)[freq > 1] + idx <- which(dictionary$name %in% duplicates) + print( + with(dictionary[idx,], + data.frame(name=name,file=paste0(path,obj)))) + msg <- paste( + "Some variables have multiple sources:", + paste(duplicates,collapse=", ")) + warning(msg) + } + + dictionary <- dictionary[order(dictionary$counts,decreasing=TRUE),] + idx <- match(tolower(variables$name), tolower(dictionary$name)) + if (any(is.na(idx))) { + if (all(is.na(idx))) { + stop("None of the requested variables could be found.") + } else { + msg <- paste( + "Several requested variables could not be found:", + paste(variables$name[is.na(idx)],collapse=", ")) + warning(msg) + } + idx <- stats::na.omit(idx) + } + dictionary <- dictionary[idx,] + + dat <- extractVars(dictionary, spss=TRUE) + + idx <- match( + as.character(dat$aln), + as.character(cid_map$aln)) + + dat$aln <- cid_map[[cid_column]][idx] + new_column <- paste0("cid",b_number) + colnames(dat)[colnames(dat)=="aln"] <- new_column + + if ("alnqlet" %in% colnames(dat)) { + dat[["alnqlet"]] <- NULL + } + + dat <- dat[order(dat[[new_column]]),] + + attributes(dat[[new_column]])$label <- paste0( + "Unique pregnancy identifier for ", + author, + " (", sub("\\.[^.]+$", "", basename(cid_file)), ")") + if ("qlet" %in% colnames(dat)) { + attributes(dat$qlet)$label <- "Birth order (within pregnancy)" + } + + message("Saving output to ", output_file, "\n") + if (output_format=="dta") { + haven::write_dta(dat, path=output_file) + } else if (output_format=="csv") { + utils::write.csv(dat, file=output_file, row.names=FALSE) + } else if (output_format=="sav") { + haven::write_sav(dat, path=output_file, compress=TRUE) + } + invisible(dat) +} + diff --git a/R/extractvars.r b/R/extractvars.r index 7ad7a05..599a03e 100644 --- a/R/extractvars.r +++ b/R/extractvars.r @@ -1,381 +1,382 @@ -#' Extract variables from data -#' -#' Take the output from `findVars` as a list of variables to extract from ALSPAC data -#' -#' @details There are about 130 ALSPAC data files. Given output from `findVars`, this function will -#' retrieve all the variables from these files and collapse them into a single data frame. -#' It will return columns for all the variables, plus columns for `aln`, `qlet` and `mult_mum` -#' or `mult_dad` if they were present in any of the files. -#' -#' Suppose we extract a four variables, one for each of mothers, children, fathers and partners. This will return the variables requested, along with some other columns - -#' -#' - `aln` - This is the pregnancy identifier. NOTE - this is **not** an individual identifier. For example, notice that row 4 has entries for the father variable `ff1a005a`, the mother variable `fm1a010a`, and the partner variable `pc013`. -#' -#' - `qlet` - This is the child ID for the specific pregnancy. It will take values from A-D. **All** children will have a qlet, and **only** children will have a qlet. Therefore **if qlet is not NA, that row represents an individual child**. -#' -#' - `alnqlet` - this is the ALN + QLET. If the individual is a child (e.g. row 8) then they will have a different `alnqlet` compared to the `aln`. Otherwise, the `aln` is the same as the `alnqlet` -#' -#' - `mult_mum` and `mult_dad` - Sometimes the same mother (or father) had more than one pregnancy in the 18 month recruitment period. Those individuals have two ALNs. If either of these columns is "Yes" then that means you can drop them from the results if you want to avoid individuals being duplicated. This is the guidance from the FOM2 documentation: -#' -#' 1.7 Important Note for all data users: -#' Please be aware that some women may appear in the release file more than once. This is due to the way in which women were originally enrolled into the study and were assigned IDs. ALSPAC started by enrolling pregnant women and the main study ID is a pregnancy based ID. Therefore if a women enrolled with two different pregnancies (both having an expected delivery date within the recruitment period [April 1991-December 1992]), she will have two separate IDs to uniquely identify these women and their pregnancies. An indicator variable has been included in the file, called mult_mum to identify these women. If you are carrying out mother based research that does not require you to consider repeat pregnancies for which we have data then please select mult_mum == 'No' to remove the duplicate entries. This will keep one pregnancy and randomly drop the other pregnancy. If you are matching the data included in this file to child based data or have been provided with a dataset that includes the children of the ALSPAC pregnancies, as well as the mother-based data, you need not do anything as each pregnancy (and hence each child from a separate pregnancy) has a unique identifier and a mothers data has been included/repeated here for each of her pregnancies where appropriate. -#' -#' The speed at which this function runs is dependent upon how fast your connection is to the R drive -#' and how many variables you are extracting at once. -#' -#' @param x Output from `findVars` -#' @param exclude_withdrawn Whether to automatically exclude withdrawn consent IDs. Default is TRUE. -#' This is conservative, removing all withdrawn consant ALNs from all datasets. Only use FALSE here -#' if you have a more specific list of withdrawn consent IDs for your specific variables. -#' @param core_only Whether to automatically exclude data from participants -#' not in the core ALSPAC dataset (Default: TRUE). -#' This should give the same samples as the Stata/SPSS scripts in the R:/Data/Syntax folder. -#' @param adult_only No longer supported. Parent-specific restrictions are applied -#' automatically when child-based or child-completed variables are not requested. -#' @param spss Logical. Default \code{FALSE}. -#' @param haven Logical. Default \code{FALSE}. -#' @export -#' @return A data frame with all the variable specified in `x`. If \code{exclude_withdrawn} was \code{TRUE}, then columns -#' named \code{woc_*} indicate which samples were excluded. -#' @examples \dontrun{ -#' # Find all variables with BMI in the description -#' bmi_variables <- findVars("bmi", ignore.case=TRUE) -#' # Extract all the variables into a data.frame: -#' bmi <- extractVars(bmi_variables) -#' # Alternatively just extract the variables for adults -#' bmi <- extractVars(subset(bmi_variables, cat3 %in% c("Mother", "Adult"))) -#' } -#' -extractVars <- function(x, exclude_withdrawn = TRUE, core_only=TRUE, adult_only=FALSE, spss=FALSE, haven=FALSE) { - dictionaryGood(x) - - if (adult_only) { - warning("'adult_only' is no longer supported. Parent-specific restrictions are applied automatically when child-based or child-completed variables are not requested.") - } - - x <- unique(x) - if (core_only) { - x <- extractVarsCore(x, spss=spss, haven=haven) - } else { - x <- extractVarsFull(x, spss=spss, haven=haven) - } - - if(exclude_withdrawn) { - message("Automatically removing data for individuals who have withdrawn consent.") - x <- removeExclusions(x) - } else { - warning("Withdrawn consent individuals have NOT been removed. ", - "Re-run with the default option or remove the relevant ", - "IDs manually before proceeding with analysis.") - } - x -} - -## restrict data extracted as in the SPSS/Stata -## scripts in R:\Data\Syntax\ -extractVarsCore <- function(x, spss=FALSE, haven=haven) { - dat <- extractVarsFull(x,spss=spss, haven=haven) - - ## return TRUE for each row in x iff that row contains at least one TRUE - any.row <- function(x) { - rowSums(as.matrix(x),na.rm=TRUE) > 0 - } - var.has.mother.data <- any.row(x[,grepl("^mother",colnames(x))]) - var.has.partner.data <- any.row(x[,grepl("^partner",colnames(x))]) - var.has.child.data <- any.row(x[,grepl("^child",colnames(x))]) - - ##based on R:\Data\Syntax\syntax_template_04Nov22.do - mz.obj.pat <- c(obj="mz_[0-9]+[a-z]+",path="Current/Other/Cohort Profile/") - core.filters <- list( - mz010a=mz.obj.pat, - preg_in_alsp=mz.obj.pat, - preg_in_core=mz.obj.pat, - preg_enrol_status=mz.obj.pat, - mum_enrol_status=mz.obj.pat, - mum_and_preg_enrolled=mz.obj.pat, - mz005l=mz.obj.pat, - mz005m=mz.obj.pat, - mz013=mz.obj.pat, - mz014=mz.obj.pat, - mz028b=mz.obj.pat, - ## the following no longer included by default - #a006=c(obj="a_[0-9]+[a-z]+"), - #a525=c(obj="a_[0-9]+[a-z]+"), - #b032=c(obj="b_[0-9]+[a-z]+"), - #b650=c(obj="b_[0-9]+[a-z]+"), - #b663=c(obj="b_[0-9]+[a-z]+"), - #b665=c(obj="b_[0-9]+[a-z]+"), - #b667=c(obj="b_[0-9]+[a-z]+"), - #c645a=c(obj="c_[0-9]+[a-z]+"), - #c755=c(obj="c_[0-9]+[a-z]+"), - #c765=c(obj="c_[0-9]+[a-z]+"), - bestgest=mz.obj.pat) - - mother.filters <- list( - mum_in_alsp=mz.obj.pat, - mum_in_core=mz.obj.pat) - if (any(var.has.mother.data)) { - core.filters <- c(core.filters, mother.filters) - } - - pz.obj.pat <- c(obj="pz_[0-9]+[a-z]+",path="Current/Other/Cohort Profile/") - partner.filters <- list( - partner_in_alspac=pz.obj.pat, - partner_data=pz.obj.pat, - partner_enrolled=pz.obj.pat, - partner_in_core=pz.obj.pat, - pz_mult=pz.obj.pat, - pz_multid=pz.obj.pat, - partner_changed=pz.obj.pat, - partner_changed_when=pz.obj.pat, - partner_age=pz.obj.pat, - second_partner_age=pz.obj.pat) - if (any(var.has.partner.data)) { - core.filters <- c(core.filters, partner.filters) - } - - cp.obj.pat <- c(obj="cp_[0-9]+[a-z]+",path="Current/Other/Cohort Profile/") - child.filters <- list( - kz011b=cp.obj.pat, - kz021=cp.obj.pat, - kz030=cp.obj.pat, - in_core=cp.obj.pat, - in_alsp=cp.obj.pat, - in_phase2=cp.obj.pat, - in_phase3=cp.obj.pat, - in_phase4=cp.obj.pat, - tripquad=cp.obj.pat) - if (any(var.has.child.data)) { - core.filters <- c(core.filters, child.filters) - } - - suppressWarnings(core.vars <- findVars(names(core.filters), dictionary="current")) - core.vars <- core.vars[which(core.vars$name %in% names(core.filters)),] - core.vars <- do.call(filterVars, c(list(x=core.vars), core.filters)) - - missing.vars <- setdiff(names(core.filters), core.vars$name) - if (length(missing.vars) > 0) { - stop("Variables required to identify core ALSPAC participants not available. Please contact maintainers. ", - "Missing variables: ", - paste(missing.vars, collapse=", ")) - } - core.dat <- extractVarsFull(core.vars, spss=spss, haven=haven) - - if (any(var.has.child.data)) { - in_alsp <- as.numeric(as.character(core.dat$in_alsp)) - tripquad <- as.numeric(as.character(core.dat$tripquad)) - core.dat <- core.dat[which(in_alsp == 1 & tripquad == 2),] - } else { - mum_enrol_status <- as.numeric(as.character(core.dat$mum_enrol_status)) - mum_and_preg_enrolled <- as.numeric(as.character(core.dat$mum_and_preg_enrolled)) - core.dat <- core.dat[which(mum_enrol_status %in% 1:2 & mum_and_preg_enrolled == 1),] - } - - if ("qlet" %in% colnames(core.dat) && "qlet" %in% colnames(dat)) { - dat <- dat[match(core.dat$alnqlet, dat$alnqlet),] - } else { - dat <- dat[match(core.dat$aln, dat$aln),] - } - if (any(var.has.partner.data)) { - partner_in_alspac <- as.numeric(as.character(core.dat$partner_in_alspac)) - remove.idx <- which(partner_in_alspac==0) - for (varname in x$name[var.has.partner.data]) { - dat[remove.idx,varname] <- NA - } - } - - id.vars <- intersect(c("aln","qlet","alnqlet"),colnames(core.dat)) - remove.vars <- c("tripquad","in_alsp") - data.vars <- setdiff(colnames(dat),c(colnames(core.dat),remove.vars)) - admin.vars <- setdiff(colnames(core.dat),c(id.vars, remove.vars)) - - dplyr::bind_cols( - core.dat[,id.vars], - dat[,data.vars], - core.dat[,admin.vars]) -} - - - - -extractVarsFull <- function(x, spss=FALSE, haven=FALSE) { - # require(plyr) - # require(readstata13) - message("Starting extraction from ", length(unique(x$obj)), " files in the ALSPAC data directory") - dat <- plyr::dlply(x, c("obj"), function(x) { - x <- plyr::mutate(x) - # Read in data - fn <- paste0(options()$alspac_data_dir, "/", x$path[1], "/", x$obj[1]) - message("Extracting from: ", fn) - if (!file.exists(fn)) { - stop( - fn, " does not exist. ", - "Please run 'updateDictionaries()' and try again. ", - "If you are using input from 'findVars()', ", - "then you will need rerun that as well. ", - "If the problem persists, ", - "please send your data query and the error message ", - "to the maintainer.") - } - if (spss) { - fn.sav <- sub("dta$", "sav", fn) - obj <- suppressWarnings(haven::read_sav(fn.sav, user_na=TRUE)) - } else { - if (haven) { - obj <- suppressWarnings(haven::read_dta(fn)) - } else { - obj <- suppressWarnings(readstata13::read.dta13(fn)) - } - } - - # Make sure aln and qlet variables are lower case - alnc <- grep("^ALN$", names(obj), ignore.case=TRUE) - if(length(alnc) == 1) { - names(obj)[alnc] <- "aln" - } else { - message("ALN codes missing or not as expected in ", - x$obj[1]) - message(names(obj)[alnc]) - stop("Please contact maintainers.") - } - qletc <- grep("^QLET$", names(obj), ignore.case=TRUE) - if (length(qletc) != 0) { - names(obj)[qletc] <- "qlet" - } - # Get aln, mult and qlet variables - ivars <- grep("^(aln|mult|qlet)$", names(obj), ignore.case=FALSE, value=TRUE) - index <- x$name %in% names(obj) - if (!all(index == TRUE)) { - print(x$name) - message("Missing vars from ", x$obj, ":", x$name[!index], "\n") - } - # extract requested variables - cvars <- names(obj)[names(obj) %in% x$name] - vars <- unique(c(ivars, cvars)) - obj <- subset(obj, select=vars) - ## Create in_obj_XX variable - objname <- sub("(.*)_.*", "\\1", basename(fn)) - objname <- sub("\\..*", "", objname) - objname <- sub(" ", "_", objname) - obj[[paste("in_obj", objname, sep="_")]] <- 1 - # Create aln and aln2 variables - obj$aln2 <- obj$aln - if ("qlet" %in% vars) { - obj$qlet <- convertQlet(obj$qlet) - obj$aln <- paste(obj$aln, obj$qlet, sep="") - } - return(obj) - }) - message("Collapsing data") - dat <- Filter(Negate(is.null), dat) - if (length(dat) == 0) { - message("No data found") - return(NULL) - } - - ## create a complete id set ids=aln/aln2/[optional]qlet - aln2 <- unique(unlist(lapply(dat, function(dat) dat$aln2))) - aln <- unique(unlist(lapply(dat, function(dat) dat$aln))) - if (all(aln %in% aln2)) { - ## includes only mothers and/or partners - ids <- data.frame(aln2=aln2, aln=aln2, stringsAsFactors=FALSE) - } else { - ## includes young people - aln <- setdiff(aln, aln2) - ids <- data.frame(aln2=as.integer(sub("[A-Z]+","",aln)), - aln=as.character(aln), - qlet=sub("[0-9]+","",aln), - stringsAsFactors=FALSE) - aln2 <- setdiff(aln2,ids$aln2) - if (length(aln2) > 0) { - ## includes some mothers with no young people - ids <- rbind(ids, - data.frame(aln2=aln2, - aln=aln2, - qlet=NA, - stringsAsFactors=FALSE)) - } - } - if (spss) { - ids <- tibble::as_tibble(ids) - } - - ## merge ids and dat into a single data frame - dat <- lapply(dat, function(dat) { - if ("qlet" %in% colnames(dat)) { - row.idx <- match(ids$aln, dat$aln) - } else { - row.idx <- match(ids$aln2, dat$aln2) - } - col.idx <- which(!(colnames(dat) %in% c("aln","qlet","aln2"))) - dat[row.idx,col.idx,drop=FALSE] - }) - dat <- c(list(ids), dat) - names(dat) <- NULL - x <- do.call(dplyr::bind_cols, dat) - - ## convert 1/NA to 1/0 for all "in_obj_XX" columns and rename them "in_XX" - is_in_obj_column <- grepl("^in_obj_", colnames(x)) - if (any(is_in_obj_column)) { - for (i in which(is_in_obj_column)) { - x[[i]] <- ifelse(is.na(x[[i]]), 0, 1) - } - colnames(x)[is_in_obj_column] <- sub("_obj", "", colnames(x)[is_in_obj_column]) - } - - names(x)[names(x) == "aln"] <- "alnqlet" - names(x)[names(x) == "aln2"] <- "aln" - rownames(x) <- NULL - return(as.data.frame(x)) -} - - - -convertQlet <- function(qlet) { - if (!is.factor(qlet)) { - qlet <- as.factor(qlet) - } - if (!all(levels(qlet) %in% c("A", "B", "C", "D"))) { - levels(qlet) <- c("A", "B", "C", "D") - } - return(qlet) -} - - -#' Extract variables exported from the ALSPAC variable lookup web app -#' -#' The variable lookup webapp allows you to browse the available -#' variables and export a list of selected variables. -#' This function will read that exported list and extract the individual -#' level data for each of the selected variables. -#' -#' More generally, this function requires a file that has at least one -#' column with the header 'Variable' followed by a list of variable names. -#' -#' The R: drive must be mounted and its path set with the \code{setDataDir} function. -#' -#' @param filename Name of file exported from ALSPAC variable lookup web app -#' -#' @export -#' @return Data frame -#' -extractWebOutput <- function(filename) { - input <- utils::read.csv(filename) - if (names(input)[1] != "Variable") { - stop("The first column in ", filename, " should be names 'Variable'. Make sure this file has been exported from the ALSPAC variable lookup webapp.") - } - if (nrow(input) == 0) { - stop("No variables present in ", filename) - } - - l <- retrieveDictionary("current") - l <- subset(l, name %in% input$Variable) - - if (nrow(l) != 0) { - out <- extractVars(l) - return(out) - } else { - stop("None of the variables in ", filename, " were in the 'current' dictionary") - } - return(l) -} +#' Extract variables from data +#' +#' Take the output from `findVars` as a list of variables to extract from ALSPAC data +#' +#' @details There are about 130 ALSPAC data files. Given output from `findVars`, this function will +#' retrieve all the variables from these files and collapse them into a single data frame. +#' It will return columns for all the variables, plus columns for `aln`, `qlet` and `mult_mum` +#' or `mult_dad` if they were present in any of the files. +#' +#' Suppose we extract a four variables, one for each of mothers, children, fathers and partners. This will return the variables requested, along with some other columns - +#' +#' - `aln` - This is the pregnancy identifier. NOTE - this is **not** an individual identifier. For example, notice that row 4 has entries for the father variable `ff1a005a`, the mother variable `fm1a010a`, and the partner variable `pc013`. +#' +#' - `qlet` - This is the child ID for the specific pregnancy. It will take values from A-D. **All** children will have a qlet, and **only** children will have a qlet. Therefore **if qlet is not NA, that row represents an individual child**. +#' +#' - `alnqlet` - this is the ALN + QLET. If the individual is a child (e.g. row 8) then they will have a different `alnqlet` compared to the `aln`. Otherwise, the `aln` is the same as the `alnqlet` +#' +#' - `mult_mum` and `mult_dad` - Sometimes the same mother (or father) had more than one pregnancy in the 18 month recruitment period. Those individuals have two ALNs. If either of these columns is "Yes" then that means you can drop them from the results if you want to avoid individuals being duplicated. This is the guidance from the FOM2 documentation: +#' +#' 1.7 Important Note for all data users: +#' Please be aware that some women may appear in the release file more than once. This is due to the way in which women were originally enrolled into the study and were assigned IDs. ALSPAC started by enrolling pregnant women and the main study ID is a pregnancy based ID. Therefore if a women enrolled with two different pregnancies (both having an expected delivery date within the recruitment period [April 1991-December 1992]), she will have two separate IDs to uniquely identify these women and their pregnancies. An indicator variable has been included in the file, called mult_mum to identify these women. If you are carrying out mother based research that does not require you to consider repeat pregnancies for which we have data then please select mult_mum == 'No' to remove the duplicate entries. This will keep one pregnancy and randomly drop the other pregnancy. If you are matching the data included in this file to child based data or have been provided with a dataset that includes the children of the ALSPAC pregnancies, as well as the mother-based data, you need not do anything as each pregnancy (and hence each child from a separate pregnancy) has a unique identifier and a mothers data has been included/repeated here for each of her pregnancies where appropriate. +#' +#' The speed at which this function runs is dependent upon how fast your connection is to the R drive +#' and how many variables you are extracting at once. +#' +#' @param x Output from `findVars` +#' @param exclude_withdrawn Whether to automatically exclude withdrawn consent IDs. Default is TRUE. +#' This is conservative, removing all withdrawn consant ALNs from all datasets. Only use FALSE here +#' if you have a more specific list of withdrawn consent IDs for your specific variables. +#' @param core_only Whether to automatically exclude data from participants +#' not in the core ALSPAC dataset (Default: TRUE). +#' This should give the same samples as the Stata/SPSS scripts in the R:/Data/Syntax folder. +#' @param adult_only No longer supported. Parent-specific restrictions are applied +#' automatically when child-based or child-completed variables are not requested. +#' @param spss Logical. Default \code{FALSE}. +#' @param haven Logical. Default \code{FALSE}. +#' @export +#' @return A data frame with all the variable specified in `x`. If \code{exclude_withdrawn} was \code{TRUE}, then columns +#' named \code{woc_*} indicate which samples were excluded. +#' @examples \dontrun{ +#' # Find all variables with BMI in the description +#' bmi_variables <- findVars("bmi", ignore.case=TRUE) +#' # Extract all the variables into a data.frame: +#' bmi <- extractVars(bmi_variables) +#' # Alternatively just extract the variables for adults +#' bmi <- extractVars(subset(bmi_variables, cat3 %in% c("Mother", "Adult"))) +#' } +#' +extractVars <- function(x, exclude_withdrawn = TRUE, core_only=TRUE, adult_only=FALSE, spss=FALSE, haven=FALSE) { + vars <- x + dictionaryGood(vars) + + if (adult_only) { + warning("'adult_only' is no longer supported. Parent-specific restrictions are applied automatically when child-based or child-completed variables are not requested.") + } + + vars <- unique(vars) + if (core_only) { + x <- extractVarsCore(vars, spss=spss, haven=haven) + } else { + x <- extractVarsFull(vars, spss=spss, haven=haven) + } + + if(exclude_withdrawn) { + message("Automatically removing data for individuals who have withdrawn consent.") + x <- removeExclusions(x, vars) + } else { + warning("Withdrawn consent individuals have NOT been removed. ", + "Re-run with the default option or remove the relevant ", + "IDs manually before proceeding with analysis.") + } + x +} + +## restrict data extracted as in the SPSS/Stata +## scripts in R:\Data\Syntax\ +extractVarsCore <- function(x, spss=FALSE, haven=haven) { + dat <- extractVarsFull(x,spss=spss, haven=haven) + + ## return TRUE for each row in x iff that row contains at least one TRUE + any.row <- function(x) { + rowSums(as.matrix(x),na.rm=TRUE) > 0 + } + var.has.mother.data <- any.row(x[,grepl("^mother",colnames(x))]) + var.has.partner.data <- any.row(x[,grepl("^partner",colnames(x))]) + var.has.child.data <- any.row(x[,grepl("^child",colnames(x))]) + + ##based on R:\Data\Syntax\syntax_template_04Nov22.do + mz.obj.pat <- c(obj="mz_[0-9]+[a-z]+",path="Current/Other/Cohort Profile/") + core.filters <- list( + mz010a=mz.obj.pat, + preg_in_alsp=mz.obj.pat, + preg_in_core=mz.obj.pat, + preg_enrol_status=mz.obj.pat, + mum_enrol_status=mz.obj.pat, + mum_and_preg_enrolled=mz.obj.pat, + mz005l=mz.obj.pat, + mz005m=mz.obj.pat, + mz013=mz.obj.pat, + mz014=mz.obj.pat, + mz028b=mz.obj.pat, + ## the following no longer included by default + #a006=c(obj="a_[0-9]+[a-z]+"), + #a525=c(obj="a_[0-9]+[a-z]+"), + #b032=c(obj="b_[0-9]+[a-z]+"), + #b650=c(obj="b_[0-9]+[a-z]+"), + #b663=c(obj="b_[0-9]+[a-z]+"), + #b665=c(obj="b_[0-9]+[a-z]+"), + #b667=c(obj="b_[0-9]+[a-z]+"), + #c645a=c(obj="c_[0-9]+[a-z]+"), + #c755=c(obj="c_[0-9]+[a-z]+"), + #c765=c(obj="c_[0-9]+[a-z]+"), + bestgest=mz.obj.pat) + + mother.filters <- list( + mum_in_alsp=mz.obj.pat, + mum_in_core=mz.obj.pat) + if (any(var.has.mother.data)) { + core.filters <- c(core.filters, mother.filters) + } + + pz.obj.pat <- c(obj="pz_[0-9]+[a-z]+",path="Current/Other/Cohort Profile/") + partner.filters <- list( + partner_in_alspac=pz.obj.pat, + partner_data=pz.obj.pat, + partner_enrolled=pz.obj.pat, + partner_in_core=pz.obj.pat, + pz_mult=pz.obj.pat, + pz_multid=pz.obj.pat, + partner_changed=pz.obj.pat, + partner_changed_when=pz.obj.pat, + partner_age=pz.obj.pat, + second_partner_age=pz.obj.pat) + if (any(var.has.partner.data)) { + core.filters <- c(core.filters, partner.filters) + } + + cp.obj.pat <- c(obj="cp_[0-9]+[a-z]+",path="Current/Other/Cohort Profile/") + child.filters <- list( + kz011b=cp.obj.pat, + kz021=cp.obj.pat, + kz030=cp.obj.pat, + in_core=cp.obj.pat, + in_alsp=cp.obj.pat, + in_phase2=cp.obj.pat, + in_phase3=cp.obj.pat, + in_phase4=cp.obj.pat, + tripquad=cp.obj.pat) + if (any(var.has.child.data)) { + core.filters <- c(core.filters, child.filters) + } + + suppressWarnings(core.vars <- findVars(names(core.filters), dictionary="current")) + core.vars <- core.vars[which(core.vars$name %in% names(core.filters)),] + core.vars <- do.call(filterVars, c(list(x=core.vars), core.filters)) + + missing.vars <- setdiff(names(core.filters), core.vars$name) + if (length(missing.vars) > 0) { + stop("Variables required to identify core ALSPAC participants not available. Please contact maintainers. ", + "Missing variables: ", + paste(missing.vars, collapse=", ")) + } + core.dat <- extractVarsFull(core.vars, spss=spss, haven=haven) + + if (any(var.has.child.data)) { + in_alsp <- as.numeric(as.character(core.dat$in_alsp)) + tripquad <- as.numeric(as.character(core.dat$tripquad)) + core.dat <- core.dat[which(in_alsp == 1 & tripquad == 2),] + } else { + mum_enrol_status <- as.numeric(as.character(core.dat$mum_enrol_status)) + mum_and_preg_enrolled <- as.numeric(as.character(core.dat$mum_and_preg_enrolled)) + core.dat <- core.dat[which(mum_enrol_status %in% 1:2 & mum_and_preg_enrolled == 1),] + } + + if ("qlet" %in% colnames(core.dat) && "qlet" %in% colnames(dat)) { + dat <- dat[match(core.dat$alnqlet, dat$alnqlet),] + } else { + dat <- dat[match(core.dat$aln, dat$aln),] + } + if (any(var.has.partner.data)) { + partner_in_alspac <- as.numeric(as.character(core.dat$partner_in_alspac)) + remove.idx <- which(partner_in_alspac==0) + for (varname in x$name[var.has.partner.data]) { + dat[remove.idx,varname] <- NA + } + } + + id.vars <- intersect(c("aln","qlet","alnqlet"),colnames(core.dat)) + remove.vars <- c("tripquad","in_alsp") + data.vars <- setdiff(colnames(dat),c(colnames(core.dat),remove.vars)) + admin.vars <- setdiff(colnames(core.dat),c(id.vars, remove.vars)) + + dplyr::bind_cols( + core.dat[,id.vars], + dat[,data.vars], + core.dat[,admin.vars]) +} + + + + +extractVarsFull <- function(x, spss=FALSE, haven=FALSE) { + # require(plyr) + # require(readstata13) + message("Starting extraction from ", length(unique(x$obj)), " files in the ALSPAC data directory") + dat <- plyr::dlply(x, c("obj"), function(x) { + x <- plyr::mutate(x) + # Read in data + fn <- paste0(options()$alspac_data_dir, "/", x$path[1], "/", x$obj[1]) + message("Extracting from: ", fn) + if (!file.exists(fn)) { + stop( + fn, " does not exist. ", + "Please run 'updateDictionaries()' and try again. ", + "If you are using input from 'findVars()', ", + "then you will need rerun that as well. ", + "If the problem persists, ", + "please send your data query and the error message ", + "to the maintainer.") + } + if (spss) { + fn.sav <- sub("dta$", "sav", fn) + obj <- suppressWarnings(haven::read_sav(fn.sav, user_na=TRUE)) + } else { + if (haven) { + obj <- suppressWarnings(haven::read_dta(fn)) + } else { + obj <- suppressWarnings(readstata13::read.dta13(fn)) + } + } + + # Make sure aln and qlet variables are lower case + alnc <- grep("^ALN$", names(obj), ignore.case=TRUE) + if(length(alnc) == 1) { + names(obj)[alnc] <- "aln" + } else { + message("ALN codes missing or not as expected in ", + x$obj[1]) + message(names(obj)[alnc]) + stop("Please contact maintainers.") + } + qletc <- grep("^QLET$", names(obj), ignore.case=TRUE) + if (length(qletc) != 0) { + names(obj)[qletc] <- "qlet" + } + # Get aln, mult and qlet variables + ivars <- grep("^(aln|mult|qlet)$", names(obj), ignore.case=FALSE, value=TRUE) + index <- x$name %in% names(obj) + if (!all(index == TRUE)) { + print(x$name) + message("Missing vars from ", x$obj, ":", x$name[!index], "\n") + } + # extract requested variables + cvars <- names(obj)[names(obj) %in% x$name] + vars <- unique(c(ivars, cvars)) + obj <- subset(obj, select=vars) + ## Create in_obj_XX variable + objname <- sub("(.*)_.*", "\\1", basename(fn)) + objname <- sub("\\..*", "", objname) + objname <- sub(" ", "_", objname) + obj[[paste("in_obj", objname, sep="_")]] <- 1 + # Create aln and aln2 variables + obj$aln2 <- obj$aln + if ("qlet" %in% vars) { + obj$qlet <- convertQlet(obj$qlet) + obj$aln <- paste(obj$aln, obj$qlet, sep="") + } + return(obj) + }) + message("Collapsing data") + dat <- Filter(Negate(is.null), dat) + if (length(dat) == 0) { + message("No data found") + return(NULL) + } + + ## create a complete id set ids=aln/aln2/[optional]qlet + aln2 <- unique(unlist(lapply(dat, function(dat) dat$aln2))) + aln <- unique(unlist(lapply(dat, function(dat) dat$aln))) + if (all(aln %in% aln2)) { + ## includes only mothers and/or partners + ids <- data.frame(aln2=aln2, aln=aln2, stringsAsFactors=FALSE) + } else { + ## includes young people + aln <- setdiff(aln, aln2) + ids <- data.frame(aln2=as.integer(sub("[A-Z]+","",aln)), + aln=as.character(aln), + qlet=sub("[0-9]+","",aln), + stringsAsFactors=FALSE) + aln2 <- setdiff(aln2,ids$aln2) + if (length(aln2) > 0) { + ## includes some mothers with no young people + ids <- rbind(ids, + data.frame(aln2=aln2, + aln=aln2, + qlet=NA, + stringsAsFactors=FALSE)) + } + } + if (spss) { + ids <- tibble::as_tibble(ids) + } + + ## merge ids and dat into a single data frame + dat <- lapply(dat, function(dat) { + if ("qlet" %in% colnames(dat)) { + row.idx <- match(ids$aln, dat$aln) + } else { + row.idx <- match(ids$aln2, dat$aln2) + } + col.idx <- which(!(colnames(dat) %in% c("aln","qlet","aln2"))) + dat[row.idx,col.idx,drop=FALSE] + }) + dat <- c(list(ids), dat) + names(dat) <- NULL + x <- do.call(dplyr::bind_cols, dat) + + ## convert 1/NA to 1/0 for all "in_obj_XX" columns and rename them "in_XX" + is_in_obj_column <- grepl("^in_obj_", colnames(x)) + if (any(is_in_obj_column)) { + for (i in which(is_in_obj_column)) { + x[[i]] <- ifelse(is.na(x[[i]]), 0, 1) + } + colnames(x)[is_in_obj_column] <- sub("_obj", "", colnames(x)[is_in_obj_column]) + } + + names(x)[names(x) == "aln"] <- "alnqlet" + names(x)[names(x) == "aln2"] <- "aln" + rownames(x) <- NULL + return(as.data.frame(x)) +} + + + +convertQlet <- function(qlet) { + if (!is.factor(qlet)) { + qlet <- as.factor(qlet) + } + if (!all(levels(qlet) %in% c("A", "B", "C", "D"))) { + levels(qlet) <- c("A", "B", "C", "D") + } + return(qlet) +} + + +#' Extract variables exported from the ALSPAC variable lookup web app +#' +#' The variable lookup webapp allows you to browse the available +#' variables and export a list of selected variables. +#' This function will read that exported list and extract the individual +#' level data for each of the selected variables. +#' +#' More generally, this function requires a file that has at least one +#' column with the header 'Variable' followed by a list of variable names. +#' +#' The R: drive must be mounted and its path set with the \code{setDataDir} function. +#' +#' @param filename Name of file exported from ALSPAC variable lookup web app +#' +#' @export +#' @return Data frame +#' +extractWebOutput <- function(filename) { + input <- utils::read.csv(filename) + if (names(input)[1] != "Variable") { + stop("The first column in ", filename, " should be names 'Variable'. Make sure this file has been exported from the ALSPAC variable lookup webapp.") + } + if (nrow(input) == 0) { + stop("No variables present in ", filename) + } + + l <- retrieveDictionary("current") + l <- subset(l, name %in% input$Variable) + + if (nrow(l) != 0) { + out <- extractVars(l) + return(out) + } else { + stop("None of the variables in ", filename, " were in the 'current' dictionary") + } + return(l) +} diff --git a/data/sources.csv b/data/sources.csv index 976cf53..59a1e61 100644 --- a/data/sources.csv +++ b/data/sources.csv @@ -193,4 +193,3 @@ sd_,Current/Quest/Schools/,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE sefg_,Current/Quest/Schools/,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE sh_,Current/Quest/Schools/,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE sk_,Current/Quest/Schools/,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE - diff --git a/inst/data/current.rdata b/inst/data/current.rdata new file mode 100644 index 0000000..b8e1d63 Binary files /dev/null and b/inst/data/current.rdata differ diff --git a/inst/data/custom.rdata b/inst/data/custom.rdata new file mode 100644 index 0000000..8683645 Binary files /dev/null and b/inst/data/custom.rdata differ diff --git a/man/addSourcesToDictionary.Rd b/man/addSourcesToDictionary.Rd index fe2ecfa..d6d6201 100644 --- a/man/addSourcesToDictionary.Rd +++ b/man/addSourcesToDictionary.Rd @@ -8,7 +8,7 @@ See generateSourcesSpreadsheet() for details about creating this file. This information is used when decide which data values to remove for participants who have withdrawn consent.} \usage{ -addSourcesToDictionary(dictionary) +addSourcesToDictionary(dictionary, sourcesFile = "sources.csv") } \arguments{ \item{dictionary}{The name of an existing dictionary or the dictionary itself.} diff --git a/man/createDictionary.Rd b/man/createDictionary.Rd index 6416852..60d6aed 100644 --- a/man/createDictionary.Rd +++ b/man/createDictionary.Rd @@ -4,7 +4,12 @@ \alias{createDictionary} \title{Create a dictionary from ALSPAC Stata files} \usage{ -createDictionary(datadir = "Current", name = NULL, quick = FALSE) +createDictionary( + datadir = "Current", + name = NULL, + quick = FALSE, + sourcesFile = NULL +) } \arguments{ \item{datadir}{ALSPAC data subdirectory from which to create the index diff --git a/man/removeExclusions.Rd b/man/removeExclusions.Rd index 3bb75af..f69b97b 100644 --- a/man/removeExclusions.Rd +++ b/man/removeExclusions.Rd @@ -4,7 +4,7 @@ \alias{removeExclusions} \title{Remove data for participants who have withdrawn consent.} \usage{ -removeExclusions(x) +removeExclusions(x, dictionary) } \arguments{ \item{x}{Data frame output from \code{\link{extractVars}()}.}