r - data.table binary search between wide format and long format data (non joins or casting)

I need some guidance on how to properly use data.table binary search using setkey (or another data.table method that's better suited for this problem).

My data consists of two data frames, the first is master_ref_df that contains dates, employee names, and values (several thousand value columns). Each date, contains one observation per employee (i.e., no duplicates).

The second is wide_data in wide format with date, row ids, and employee names and can contain up to 10 employee names (i.e., column names of Name1, Name2....Name10) and 30+ million rows.

I essentially need to left_join wide_data[,c("Date","Name1....NameN") with master_ref_df on c("Date","Name1...NameN") to get each corresponding employee value. For example, if there are 10 employee names, 10 value1 are returned, and so on, and then sum(value1)...sum(valueN) for each date/row id. These are the values I need to get. However, joining isn't an option since it quickly consumes too much RAM.

master_ref_df:

Key: <Date, Name>
         Date                Name value1 value2
       <Date>              <char>  <num>  <num>
1: 2020-01-01 Aylward, Annastaisa  21.74  18.59
2: 2020-01-01      Babcock, Grace  28.23  15.86
3: 2020-01-01     Becker, Leticya  16.31  12.97
4: 2020-01-01    Bentley, Richard  27.75  17.46
5: 2020-01-01      Bigbey, Ashley  18.98  18.90

wide_data

Date rowidx                    Name1              Name2               Name3                    Name4               Name5              Name6
       <Date>  <int>                   <char>             <char>              <char>                   <char>              <char>             <char>
1: 2020-01-01      1          Cordova, Davina      Cayou, Hunter    Querubin, Khulan          el-Kamel, Aamir   Colorado, Destiny   Ortega, Priyanka
2: 2020-01-01      2         el-Bashir, Faaid    Castro, Joachim         Cox, Joseph         Westfield, Kayla   Pickford, Michael Arenivas Jr, Fanny
3: 2020-01-01      3            Marty, Jordan  el-Ghazal, Rafeeq el-Khawaja, Lubaaba             Feder, Ronny al-Barakat, Mubeena  el-Ghazal, Rafeeq
4: 2020-01-01      4 Venegas De Leon, Larissa Ortiz Payan, Maria       Puentes, Lisa       al-Badie, Muntasir      Hughes, Claire el-Mahmood, Haamid
5: 2020-01-01      5      al-Barakat, Nassaar         Mims, Eric      Hughes, Claire Meraz-Escobar, Issabella   el-Islam, Naseeba Currington, Sherry

My current approach consists of a function where I'm splitting my wide_data by Date, and iterating through each date element to generate a list of employee names and their corresponding index in the wide_data and pulling the corresponding value from the master_ref_df.

library(tidyverse)
library(randomNames)
library(collapse)
library(furrr)
library(tictoc)

l <- 100
list_of_values <- vector("list", length = l)
list_of_values <- lapply(list_of_values, function(x) round(rnorm(n = 2500, mean = 20, sd = 5), 2))
list_of_values <- as.data.frame(list_of_values)
names(list_of_values) <- paste0("value",1:max(l))

master_ref_df <- data.frame(
  Date = rep(seq(as.Date("2020-01-01"), as.Date("2020-02-11"), by = "day"), each = 43, length.out = 2500),
  Name = rep(randomNames(n = 120, gender = c("Male","Female")), length.out = 2500)
)
master_ref_df <- cbind(master_ref_df, list_of_values)

# no duplicate names on any given date
master_ref_df <- master_ref_df %>%
  dplyr::group_by(Date,Name) %>%
  dplyr::arrange(desc(value1)) %>%
  dplyr::slice_head(., n = 1) %>%
  dplyr::ungroup(.)

# double check that for each date, there are no duplicated employee names
master_ref_df %>%
  dplyr::group_by(Date,Name) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n > 1)

employee_names <- unique(master_ref_df$Name)
unique_dates <- unique(master_ref_df$Date)
size <- 100000

wide_data <- data.frame(
  Date = sample(unique_dates, size = size, replace = T),
  Name1 = sample(employee_names, size = size, replace = T),
  Name2 = sample(employee_names, size = size, replace = T),
  Name3 = sample(employee_names, size = size, replace = T),
  Name4 = sample(employee_names, size = size, replace = T),
  Name5 = sample(employee_names, size = size, replace = T),
  Name6 = sample(employee_names, size = size, replace = T)
)

wide_data <- wide_data %>%
  group_by(Date) %>%
  dplyr::mutate(rowidx = 1:n()) %>%
  dplyr::ungroup(.) %>%
  dplyr::select(Date,rowidx,everything(.)) %>%
  dplyr::arrange(Date, rowidx)

wide_data <- collapse::rsplit(wide_data, wide_data$Date)

all_values <- names(master_ref_df)[-c(1:2)]

run_in_par <- function(value, wide_data)
{
  final_row_sums <- vector("list", length = length(wide_data))
  all_dates <- names(wide_data)
  
  z <- 1
  for (z in seq_along(wide_data))
  {
    date_of_interest <- unique(wide_data[[z]]$Date)
    look_up_table <- master_ref_df[master_ref_df$Date==date_of_interest,c("Date","Name",value)]
    look_up_indices <- lapply(collapse::rsplit(look_up_table, look_up_table$Name), function(x) x[[value]])
    
    indices <- collapse::rsplit(seq_along(as.matrix(wide_data[[z]][,-c(1:2)])), as.matrix(wide_data[[z]][,-c(1:2)]))
    
    final_mat <- matrix(data = NA, nrow = nrow(wide_data[[z]]), ncol = ncol(wide_data[[z]][,-c(1:2)]))
    
    for(i in seq_along(indices))
    {
      if ( is.null( look_up_indices[[ names(indices)[[i]] ]] ) )
      {
        final_mat[ indices[[i]] ] <- 0
      } else
      {
        final_mat[ indices[[i]] ] <- look_up_indices[[ names(indices)[[i]] ]]
      }
    }
    
    final_row_sums[[z]] <- rowSums(final_mat)
  }
  
  names(final_row_sums) <- all_dates
  
  return(final_row_sums)
}

## purrr::map [~7 sec with length(all_values)==100]
tic()
sum_values_date <- purrr::map(
  .x = all_values,
  .f = ~run_in_par(value = .x, wide_data = wide_data)
)
names(sum_values_date) <- all_values
toc()

## future_map [~4-6 sec with length(all_values)==100]
plan(multisession)
tic()
sum_values_date <- future_map(
  .x = all_values,
  .f = ~run_in_par(value = .x, wide_data = wide_data)
)
names(sum_values_date) <- all_values
toc()
base::closeAllConnections()

I'm wondering if data.table setkey or DT[DT2, on = ...] (or some other data.table approach) is appropriate here given the large(ish) amount of data. However, I'm having a hard time getting the syntax correct. For example, I know I can do following below by throwing it into a function but setting a new key for each Name1...NameN in wide_data_DT is too expensive on my real data when it comes to gaining any speed over my current method.

## data.table with setkey
wide_data_DT <- rbindlist(wide_data)

setDT(wide_data_DT, key = c("Date","Name1"))
setDT(master_ref_df, key = c("Date","Name"))

wide_data_DT[master_ref_df, c("value1") := .(value1)]
wide_data_DT[master_ref_df, c("value2") := .(value2)]
wide_data_DT[master_ref_df, c("value3") := .(value3)]

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

r - data.table binary search between wide format and long format data (non joins or casting) - Stack Overflow

与本文相关的文章

评论列表(0)