Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplemented with data.table #102

Merged
merged 14 commits into from
Mar 11, 2024
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: webtrackR
Title: Preprocessing and Analyzing Web Tracking Data
Version: 0.2.0
Version: 0.2.0.9000
Authors@R: c(
person("David", "Schoch", email = "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-2952-4812")),
Expand All @@ -19,13 +19,14 @@ Depends:
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.0
Imports:
utils,
stats,
fastmatch,
adaR,
httr
httr,
data.table
LazyData: true
Suggests:
knitr,
Expand Down
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ export(parse_path)
export(sum_activity)
export(sum_durations)
export(sum_visits)
importFrom(data.table,":=")
importFrom(data.table,.N)
importFrom(data.table,.SD)
importFrom(fastmatch,"%fin%")
importFrom(fastmatch,`%fin%`)
importFrom(stats,aggregate)
importFrom(stats,ave)
importFrom(stats,reshape)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# webtrackR 0.2.0.9000

* reverted to data.table

# webtrackR 0.2.0

* reimplemented in base R removing several dependencies
Expand Down
22 changes: 14 additions & 8 deletions R/classify.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,31 +59,37 @@ classify_visits <- function(wt, classes, match_by = "domain",
return_rows_val = NULL) {
abort_if_not_wtdt(wt)
match_by <- match.arg(match_by, c("domain", "host", "regex"))
if (!data.table::is.data.table(classes)) {
stop("classes needs to be a data.table")
}
if (match_by == "domain") {
vars_exist(wt, "domain")
vars_exist(classes, "domain")
wt <- merge(wt, classes, by = "domain", all.x = TRUE)
wt <- classes[wt, on = "domain"]
} else if (match_by == "host") {
vars_exist(wt, "host")
vars_exist(classes, "host")
wt <- merge(wt, classes, by = "host", all.x = TRUE)
wt <- classes[wt, on = "host"]
} else if (match_by == "regex") {
stopifnot("You have to specify regex_on if match_by is set to 'regex'" = !is.null(regex_on))
vars_exist(wt, "url")
vars_exist(classes, regex_on)
wt <- drop_query(wt)
wt <- wt[, tmp_index := seq_len(.N)]
tmp_wt <- wt[, list(tmp_index, url_noquery)]
pattern <- paste(classes[[regex_on]], collapse = "|")
idx <- grepl(pattern, wt$url_noquery)
wt$match <- NA
wt$match[idx] <- regmatches(wt$url_noquery, regexpr(pattern, wt$url_noquery))
names(wt)[names(wt) == "match"] <- names(classes)[names(classes) != regex_on]
tmp_wt_matched <- tmp_wt[grepl(pattern, url_noquery)]
tmp_wt_matched <- tmp_wt_matched[, match := regmatches(url_noquery, regexpr(pattern, url_noquery))]
wt_matched <- tmp_wt_matched[, url_noquery := NULL][wt, on = "tmp_index"]
data.table::setnames(wt_matched, "match", regex_on)
wt <- classes[wt_matched, on = regex_on][, c("url_noquery", "tmp_index") := NULL]
}

if (!is.null(return_rows_by)) {
vars_exist(classes, return_rows_by)
stopifnot("You have to specify return_rows_val if return_rows_by is not NULL" = !is.null(return_rows_val))
wt <- wt[!is.na(wt[[return_rows_by]]) & wt[[return_rows_by]] == return_rows_val, ]
wt <- wt[get(return_rows_by) == return_rows_val]
}
class(wt) <- c("wt_dt", class(wt))
wt
wt[]
}
8 changes: 8 additions & 0 deletions R/globals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Register names that the package refers to as bare symbols so that
# `R CMD check` does not report "no visible binding for global variable".
# NOTE(review): these appear to be column names used unquoted inside
# data.table expressions (e.g. `tmp_index`, `url_noquery`) -- confirm against
# the rest of the package before removing any entry.
# Each name is listed exactly once; keep new additions free of duplicates.
utils::globalVariables(c(
  "duration", "timestamp", "panelist_id", "domain", "visit", "day", "type",
  "prev_type", "tmp", "session", "path",
  "host", "suffix", "domain_name",
  "url_next", "host_next", "domain_next",
  "url_previous", "host_previous", "domain_previous",
  "date", "week", "month", "year", "wave", "duplicate", "i.type", "title",
  "visits", "url_noquery", "referral",
  "tmp_timestamp_next", "tmp_url_next", "tmp_host", "tmp_suffix", "tmp_path",
  "tmp_scheme", "tmp_domain_name", "tmp_index",
  "tmp_class", "tmp_duration", "tmp_timeframe", "tmp_visits", "tmp_last",
  "device", "device_next",
  "tmp_timestamp_prev", "tmp_url_prev"
))
Loading
Loading