I need some guidance on how to properly use data.table
binary search using setkey
(or another data.table method that's better suited for this problem).
My data consists of two data frames, the first is master_ref_df
that contains dates, employee names, and values (several thousand value
columns). Each date, contains one observation per employee (i.e., no duplicates).
The second is wide_data
in wide format with date, row ids, and employee names and can contain up to 10 employee names (i.e., column names of Name1
, Name2
....Name10
) and 30+ million rows.
I essentially need to left_join
wide_data[,c("Date","Name1....NameN")
with master_ref_df
on c("Date","Name1...NameN")
to get each corresponding employee value. For example, if there are 10 employee names, 10 value1
are returned, and so on, and then sum(value1)
...sum(valueN
) for each date/row id. These are the values I need to get. However, joining isn't an option since it quickly consumes too much RAM.
master_ref_df
:
Key: <Date, Name>
Date Name value1 value2
<Date> <char> <num> <num>
1: 2020-01-01 Aylward, Annastaisa 21.74 18.59
2: 2020-01-01 Babcock, Grace 28.23 15.86
3: 2020-01-01 Becker, Leticya 16.31 12.97
4: 2020-01-01 Bentley, Richard 27.75 17.46
5: 2020-01-01 Bigbey, Ashley 18.98 18.90
wide_data
Date rowidx Name1 Name2 Name3 Name4 Name5 Name6
<Date> <int> <char> <char> <char> <char> <char> <char>
1: 2020-01-01 1 Cordova, Davina Cayou, Hunter Querubin, Khulan el-Kamel, Aamir Colorado, Destiny Ortega, Priyanka
2: 2020-01-01 2 el-Bashir, Faaid Castro, Joachim Cox, Joseph Westfield, Kayla Pickford, Michael Arenivas Jr, Fanny
3: 2020-01-01 3 Marty, Jordan el-Ghazal, Rafeeq el-Khawaja, Lubaaba Feder, Ronny al-Barakat, Mubeena el-Ghazal, Rafeeq
4: 2020-01-01 4 Venegas De Leon, Larissa Ortiz Payan, Maria Puentes, Lisa al-Badie, Muntasir Hughes, Claire el-Mahmood, Haamid
5: 2020-01-01 5 al-Barakat, Nassaar Mims, Eric Hughes, Claire Meraz-Escobar, Issabella el-Islam, Naseeba Currington, Sherry
My current approach consists of a function where I'm splitting my wide_data
by Date
, and iterating through each date element to generate a list of employee names and their corresponding index in the wide_data
and pulling the corresponding value from the master_ref_df
.
library(tidyverse)
library(randomNames)
library(collapse)
library(furrr)
library(tictoc)
l <- 100
list_of_values <- vector("list", length = l)
list_of_values <- lapply(list_of_values, function(x) round(rnorm(n = 2500, mean = 20, sd = 5), 2))
list_of_values <- as.data.frame(list_of_values)
names(list_of_values) <- paste0("value",1:max(l))
master_ref_df <- data.frame(
Date = rep(seq(as.Date("2020-01-01"), as.Date("2020-02-11"), by = "day"), each = 43, length.out = 2500),
Name = rep(randomNames(n = 120, gender = c("Male","Female")), length.out = 2500)
)
master_ref_df <- cbind(master_ref_df, list_of_values)
# no duplicate names on any given date
master_ref_df <- master_ref_df %>%
dplyr::group_by(Date,Name) %>%
dplyr::arrange(desc(value1)) %>%
dplyr::slice_head(., n = 1) %>%
dplyr::ungroup(.)
# double check that for each date, there are no duplicated employee names
master_ref_df %>%
dplyr::group_by(Date,Name) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(n > 1)
employee_names <- unique(master_ref_df$Name)
unique_dates <- unique(master_ref_df$Date)
size <- 100000
wide_data <- data.frame(
Date = sample(unique_dates, size = size, replace = T),
Name1 = sample(employee_names, size = size, replace = T),
Name2 = sample(employee_names, size = size, replace = T),
Name3 = sample(employee_names, size = size, replace = T),
Name4 = sample(employee_names, size = size, replace = T),
Name5 = sample(employee_names, size = size, replace = T),
Name6 = sample(employee_names, size = size, replace = T)
)
wide_data <- wide_data %>%
group_by(Date) %>%
dplyr::mutate(rowidx = 1:n()) %>%
dplyr::ungroup(.) %>%
dplyr::select(Date,rowidx,everything(.)) %>%
dplyr::arrange(Date, rowidx)
wide_data <- collapse::rsplit(wide_data, wide_data$Date)
all_values <- names(master_ref_df)[-c(1:2)]
run_in_par <- function(value, wide_data)
{
final_row_sums <- vector("list", length = length(wide_data))
all_dates <- names(wide_data)
z <- 1
for (z in seq_along(wide_data))
{
date_of_interest <- unique(wide_data[[z]]$Date)
look_up_table <- master_ref_df[master_ref_df$Date==date_of_interest,c("Date","Name",value)]
look_up_indices <- lapply(collapse::rsplit(look_up_table, look_up_table$Name), function(x) x[[value]])
indices <- collapse::rsplit(seq_along(as.matrix(wide_data[[z]][,-c(1:2)])), as.matrix(wide_data[[z]][,-c(1:2)]))
final_mat <- matrix(data = NA, nrow = nrow(wide_data[[z]]), ncol = ncol(wide_data[[z]][,-c(1:2)]))
for(i in seq_along(indices))
{
if ( is.null( look_up_indices[[ names(indices)[[i]] ]] ) )
{
final_mat[ indices[[i]] ] <- 0
} else
{
final_mat[ indices[[i]] ] <- look_up_indices[[ names(indices)[[i]] ]]
}
}
final_row_sums[[z]] <- rowSums(final_mat)
}
names(final_row_sums) <- all_dates
return(final_row_sums)
}
## purrr::map [~7 sec with length(all_values)==100]
tic()
sum_values_date <- purrr::map(
.x = all_values,
.f = ~run_in_par(value = .x, wide_data = wide_data)
)
names(sum_values_date) <- all_values
toc()
## future_map [~4-6 sec with length(all_values)==100]
plan(multisession)
tic()
sum_values_date <- future_map(
.x = all_values,
.f = ~run_in_par(value = .x, wide_data = wide_data)
)
names(sum_values_date) <- all_values
toc()
base::closeAllConnections()
I'm wondering if data.table
setkey
or DT[DT2, on = ...]
(or some other data.table approach) is appropriate here given the large(ish) amount of data. However, I'm having a hard time getting the syntax correct. For example, I know I can do following below by throwing it into a function but setting a new key for each Name1...NameN
in wide_data_DT
is too expensive on my real data when it comes to gaining any speed over my current method.
## data.table with setkey
wide_data_DT <- rbindlist(wide_data)
setDT(wide_data_DT, key = c("Date","Name1"))
setDT(master_ref_df, key = c("Date","Name"))
wide_data_DT[master_ref_df, c("value1") := .(value1)]
wide_data_DT[master_ref_df, c("value2") := .(value2)]
wide_data_DT[master_ref_df, c("value3") := .(value3)]