Skip to content

Commit

Permalink
fixed deduplicate issues ref #97
Browse files Browse the repository at this point in the history
  • Loading branch information
schochastics committed Dec 11, 2023
1 parent 724fe5c commit 5246b80
Showing 1 changed file with 4 additions and 19 deletions.
23 changes: 4 additions & 19 deletions R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ deduplicate <- function(wt, method = "aggregate", within = 1, duration_var = "du
if (!is.null(add_grpvars)) grp_vars <- c(grp_vars, add_grpvars)


wt <- aggregate(cbind(visits = 1, duration = as.numeric(wt$duration), timestamp = wt$timestamp),
wt <- aggregate(data.frame(visits = 1, duration = as.numeric(wt$duration), timestamp = wt$timestamp),
by = wt[grp_vars], FUN = function(x) if (is.numeric(x)) sum(x, na.rm = TRUE) else min(x)
)
wt$day <- NULL
Expand Down Expand Up @@ -201,7 +201,7 @@ deduplicate <- function(wt, method = "aggregate", within = 1, duration_var = "du
wt$tmp_url_prev <- NULL
wt$tmp_timestamp_prev <- NULL
}
class(wt) <- c("wt_dt",class(wt))
class(wt) <- c("wt_dt", class(wt))
return(wt)
}

Expand Down Expand Up @@ -279,6 +279,8 @@ extract_host <- function(wt, varname = "url") {
extract_domain <- function(wt, varname = "url") {
abort_if_not_wtdt(wt)
vars_exist(wt, varname)
protocol <- adaR::ada_get_protocol(wt[[varname]])
wt[[varname]][is.na(protocol)] <- paste0("https://", wt[[varname]][is.na(protocol)])
domain <- adaR::ada_get_domain(wt[[varname]])
if (varname == "url") {
wt[["domain"]] <- domain
Expand Down Expand Up @@ -679,23 +681,6 @@ add_panelist_data <- function(wt, data, cols = NULL, join_on = "panelist_id") {
return(merged_data)
}

#' Clean URLs
#' make URls WHATWG-compliant for faster parsing
#' @param varname character. name of the column from which to extract the host.
#' Defaults to `"url"`.
#' @export
clean_urls <- function(wt, varname = "url") {
abort_if_not_wtdt(wt)
vars_exist(wt, varname)
protocol <- adaR::ada_get_protocol(wt[[varname]])
wt[[varname]][is.na(protocol)] <- paste0("https://", wt[[varname]][is.na(protocol)])
wt[[varname]] <- gsub("/#/", "/", wt[[varname]])
wt
}




# helpers
lead <- function(x, n = 1, default = NA) {
if (length(x) <= n) {
Expand Down

0 comments on commit 5246b80

Please sign in to comment.