I have this data:
library(sqldf)
mytable <- data.frame(
name = c("Alice", "Alice", "Alice", # Alice appears in all years
"Bob", "Bob", # Bob appears in 2009-2010
"Charlie", "Charlie", # Charlie appears in 2010-2011
"David", "David", # David appears in 2009,2011
"Eve"), # Eve appears in just 2009
year = c(2009, 2010, 2011, # Years for Alice
2009, 2010, # Years for Bob
2010, 2011, # Years for Charlie
2009, 2011, # Years for David
2009) # Year for Eve
)
I am trying to find out the following:
how many names in uniquely: (2009), (2010), (2011), (2009, 2010), (2009, 2011), (2010, 2011), (2009, 2010, 2011)?
I first made a list of the min-max year combinations:
year_range_query <- sqldf("
SELECT MIN(year) as min_year, MAX(year) as max_year
FROM mytable")
min_year <- year_range_query$min_year
max_year <- year_range_query$max_year
years <- min_year:max_year
results <- data.frame()
Now, I am trying to write an R function which communicates with the SQL server and the local R environment:
for(r in 1:length(years)) {
combinations <- combn(years, r)
for(i in 1:ncol(combinations)) {
years_in_combo <- combinations[,i]
query <- sprintf("
SELECT '%s' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (%s)
GROUP BY name
HAVING COUNT(DISTINCT year) = %d
)
",
paste(years_in_combo, collapse="-"),
paste(years_in_combo, collapse=","),
length(years_in_combo)
)
result <- sqldf(query)
results <- rbind(results, result)
}
}
The code ran, but when looking at the results, I can see that there duplicates (e.g. 2009-2010 appears multiple times):
years matching_names
1 2009-2010 2
2 2009-2011 2
3 2010-2011 2
4 2009-2010-2011 1
5 2009-2010 2
6 2009-2011 2
7 2010-2011 2
8 2009-2010-2011 1
9 2009 4
10 2010 3
11 2011 3
12 2009-2010 2
13 2009-2011 2
14 2010-2011 2
15 2009-2010-2011 1
16 2009-2010 2
17 2009-2011 2
18 2010-2011 2
19 2009-2010-2011 1
20 2009-2010 2
21 2009-2011 2
22 2010-2011 2
23 2009-2010-2011 1
24 2009 4
25 2010 3
26 2011 3
27 2009-2010 2
28 2009-2011 2
29 2010-2011 2
30 2009-2010-2011 1
Can someone help me understand what I am doing wrong? For the final answer, if I sum the counts column, it should match the count(*) for the original table. Note that I would like to do this without GROUP_CONCAT since I might need to use this code later on a different server which does not support GROUP_CONCAT.
Idea 1: I wanted to avoid this, but I suppose I could pre-craft all queries and then launch them:
# manually determine the years
years <- c(2009, 2010, 2011)
generate_year_indicators <- function(years) {
indicators <- sapply(years, function(y) {
sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
})
paste(indicators, collapse = ",\n ")
}
generate_combination_condition <- function(combo_years, all_years) {
conditions <- sapply(all_years, function(y) {
if(y %in% combo_years) {
sprintf("has_%d = 1", y)
} else {
sprintf("has_%d = 0", y)
}
})
paste(conditions, collapse = " AND ")
}
all_combinations <- list()
for(r in 1:length(years)) {
combos <- combn(years, r)
for(i in 1:ncol(combos)) {
all_combinations[[length(all_combinations) + 1]] <- combos[,i]
}
}
base_query <- sprintf("
SELECT
name,
%s
FROM mytable
GROUP BY name
", generate_year_indicators(years))
year_indicators <- sqldf(base_query)
results <- data.frame(year_combination = character(0), count = numeric(0))
for(combo in all_combinations) {
combo_name <- paste(combo, collapse="_")
count_query <- sprintf("
SELECT '%s' as year_combination,
SUM(CASE WHEN %s THEN 1 ELSE 0 END) as count
FROM year_indicators
", combo_name, generate_combination_condition(combo, years))
combo_result <- sqldf(count_query)
results <- rbind(results, combo_result)
}
results <- results[order(nchar(results$year_combination),
results$year_combination), ]
rownames(results) <- NULL
print(results)
year_combination count
1 2009 1
2 2010 0
3 2011 0
4 2009_2010 1
5 2009_2011 1
6 2010_2011 1
7 2009_2010_2011 1
Idea 2: Perhaps I could just get R to generate the text for the full SQL query and then manually copy/paste this into the SQL IDE?
generate_full_sql_query <- function(years) {
# First, let's create the year indicator columns
year_indicators <- sapply(years, function(y) {
sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
})
year_indicators_text <- paste(year_indicators, collapse = ",\n ")
# Generate all possible year combinations
all_combinations <- list()
for(r in 1:length(years)) {
combos <- combn(years, r)
for(i in 1:ncol(combos)) {
all_combinations[[length(all_combinations) + 1]] <- combos[,i]
}
}
# Create the WITH clause for the base query
base_cte <- sprintf("WITH year_indicators AS (\n SELECT \n name,\n %s\n FROM mytable\n GROUP BY name\n)",
year_indicators_text)
# Create UNION ALL queries for each combination
combination_queries <- sapply(all_combinations, function(combo) {
combo_name <- paste(combo, collapse="_")
conditions <- sapply(years, function(y) {
if(y %in% combo) {
sprintf("has_%d = 1", y)
} else {
sprintf("has_%d = 0", y)
}
})
condition_text <- paste(conditions, collapse = " AND ")
sprintf(" SELECT\n '%s' as year_combination,\n COUNT(*) as count\n FROM year_indicators\n WHERE %s",
combo_name, condition_text)
})
# Combine everything into the final query
final_query <- paste(
base_cte,
"SELECT * FROM (",
paste(combination_queries, collapse = "\n UNION ALL\n"),
") results",
"ORDER BY LENGTH(year_combination), year_combination;",
sep = "\n"
)
return(final_query)
}
years <- c(2009, 2010, 2011)
cat(generate_full_sql_query(years))
output
WITH year_indicators AS (
SELECT
name,
MAX(CASE WHEN year = 2009 THEN 1 ELSE 0 END) as has_2009,
MAX(CASE WHEN year = 2010 THEN 1 ELSE 0 END) as has_2010,
MAX(CASE WHEN year = 2011 THEN 1 ELSE 0 END) as has_2011
FROM mytable
GROUP BY name
)
SELECT * FROM (
SELECT
'2009' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 0
UNION ALL
SELECT
'2010' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 0
UNION ALL
SELECT
'2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 0 AND has_2011 = 1
UNION ALL
SELECT
'2009_2010' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 0
UNION ALL
SELECT
'2009_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 1
UNION ALL
SELECT
'2010_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 1
UNION ALL
SELECT
'2009_2010_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 1
) results
ORDER BY LENGTH(year_combination), year_combination;
Third Option
years <- sort(unique(mytable$year))
year_combinations <- expand.grid(lapply(1:length(years), function(i) c(0, 1)))
colnames(year_combinations) <- paste0("has_", years)
year_combinations$combo_name <- character(nrow(year_combinations))
for (i in 1:nrow(year_combinations)) {
current_row <- year_combinations[i, 1:length(years)]
present_indices <- which(current_row == 1)
present_years <- years[present_indices]
if (length(present_years) == 0) {
year_combinations$combo_name[i] <- "none"
} else {
year_combinations$combo_name[i] <- paste(present_years, collapse = "_")
}
}
year_combinations$sql_condition <- character(nrow(year_combinations))
for (i in 1:nrow(year_combinations)) {
conditions <- character()
for (j in 1:length(years)) {
year_val <- years[j]
has_val <- year_combinations[i, j]
condition <- paste0("has_", year_val, " = ", as.character(has_val))
conditions <- c(conditions, condition)
}
year_combinations$sql_condition[i] <- paste(conditions, collapse = " AND ")
}
year_combinations$sql_query_part <- character(nrow(year_combinations))
for (i in 1:nrow(year_combinations)) {
combo_name <- year_combinations$combo_name[i]
condition <- year_combinations$sql_condition[i]
query_part <- paste0(
" SELECT\n",
" '", combo_name, "' as year_combination,\n",
" COUNT(*) as count\n",
" FROM year_indicators\n",
" WHERE ", condition
)
year_combinations$sql_query_part[i] <- query_part
}
generate_final_sql <- function(years, combinations_df) {
year_indicators <- character(length(years))
for (i in 1:length(years)) {
year_val <- years[i]
year_indicators[i] <- paste0("MAX(CASE WHEN year = ", year_val,
" THEN 1 ELSE 0 END) as has_", year_val)
}
year_indicators_text <- paste(year_indicators, collapse = ",\n ")
base_cte <- paste0("WITH year_indicators AS (\n",
" SELECT \n",
" name,\n",
" ", year_indicators_text, "\n",
" FROM mytable\n",
" GROUP BY name\n",
")")
query_parts <- combinations_df$sql_query_part
combined_parts <- paste(query_parts, collapse = "\n UNION ALL\n")
final_query <- paste(
base_cte,
"SELECT * FROM (",
combined_parts,
") results",
"ORDER BY LENGTH(year_combination), year_combination;",
sep = "\n"
)
return(final_query)
}
print(year_combinations)
sql_query <- generate_final_sql(years, year_combinations)
cat(sql_query)
I have this data:
library(sqldf)
mytable <- data.frame(
name = c("Alice", "Alice", "Alice", # Alice appears in all years
"Bob", "Bob", # Bob appears in 2009-2010
"Charlie", "Charlie", # Charlie appears in 2010-2011
"David", "David", # David appears in 2009,2011
"Eve"), # Eve appears in just 2009
year = c(2009, 2010, 2011, # Years for Alice
2009, 2010, # Years for Bob
2010, 2011, # Years for Charlie
2009, 2011, # Years for David
2009) # Year for Eve
)
I am trying to find out the following:
how many names in uniquely: (2009), (2010), (2011), (2009, 2010), (2009, 2011), (2010, 2011), (2009, 2010, 2011)?
I first made a list of the min-max year combinations:
year_range_query <- sqldf("
SELECT MIN(year) as min_year, MAX(year) as max_year
FROM mytable")
min_year <- year_range_query$min_year
max_year <- year_range_query$max_year
years <- min_year:max_year
results <- data.frame()
Now, I am trying to write an R function which communicates with the SQL server and the local R environment:
for(r in 1:length(years)) {
combinations <- combn(years, r)
for(i in 1:ncol(combinations)) {
years_in_combo <- combinations[,i]
query <- sprintf("
SELECT '%s' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (%s)
GROUP BY name
HAVING COUNT(DISTINCT year) = %d
)
",
paste(years_in_combo, collapse="-"),
paste(years_in_combo, collapse=","),
length(years_in_combo)
)
result <- sqldf(query)
results <- rbind(results, result)
}
}
The code ran, but when looking at the results, I can see that there duplicates (e.g. 2009-2010 appears multiple times):
years matching_names
1 2009-2010 2
2 2009-2011 2
3 2010-2011 2
4 2009-2010-2011 1
5 2009-2010 2
6 2009-2011 2
7 2010-2011 2
8 2009-2010-2011 1
9 2009 4
10 2010 3
11 2011 3
12 2009-2010 2
13 2009-2011 2
14 2010-2011 2
15 2009-2010-2011 1
16 2009-2010 2
17 2009-2011 2
18 2010-2011 2
19 2009-2010-2011 1
20 2009-2010 2
21 2009-2011 2
22 2010-2011 2
23 2009-2010-2011 1
24 2009 4
25 2010 3
26 2011 3
27 2009-2010 2
28 2009-2011 2
29 2010-2011 2
30 2009-2010-2011 1
Can someone help me understand what I am doing wrong? For the final answer, if I sum the counts column, it should match the count(*) for the original table. Note that I would like to do this without GROUP_CONCAT since I might need to use this code later on a different server which does not support GROUP_CONCAT.
Idea 1: I wanted to avoid this, but I suppose I could pre-craft all queries and then launch them:
# manually determine the years
years <- c(2009, 2010, 2011)
generate_year_indicators <- function(years) {
indicators <- sapply(years, function(y) {
sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
})
paste(indicators, collapse = ",\n ")
}
generate_combination_condition <- function(combo_years, all_years) {
conditions <- sapply(all_years, function(y) {
if(y %in% combo_years) {
sprintf("has_%d = 1", y)
} else {
sprintf("has_%d = 0", y)
}
})
paste(conditions, collapse = " AND ")
}
all_combinations <- list()
for(r in 1:length(years)) {
combos <- combn(years, r)
for(i in 1:ncol(combos)) {
all_combinations[[length(all_combinations) + 1]] <- combos[,i]
}
}
base_query <- sprintf("
SELECT
name,
%s
FROM mytable
GROUP BY name
", generate_year_indicators(years))
year_indicators <- sqldf(base_query)
results <- data.frame(year_combination = character(0), count = numeric(0))
for(combo in all_combinations) {
combo_name <- paste(combo, collapse="_")
count_query <- sprintf("
SELECT '%s' as year_combination,
SUM(CASE WHEN %s THEN 1 ELSE 0 END) as count
FROM year_indicators
", combo_name, generate_combination_condition(combo, years))
combo_result <- sqldf(count_query)
results <- rbind(results, combo_result)
}
results <- results[order(nchar(results$year_combination),
results$year_combination), ]
rownames(results) <- NULL
print(results)
year_combination count
1 2009 1
2 2010 0
3 2011 0
4 2009_2010 1
5 2009_2011 1
6 2010_2011 1
7 2009_2010_2011 1
Idea 2: Perhaps I could just get R to generate the text for the full SQL query and then manually copy/paste this into the SQL IDE?
generate_full_sql_query <- function(years) {
# First, let's create the year indicator columns
year_indicators <- sapply(years, function(y) {
sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
})
year_indicators_text <- paste(year_indicators, collapse = ",\n ")
# Generate all possible year combinations
all_combinations <- list()
for(r in 1:length(years)) {
combos <- combn(years, r)
for(i in 1:ncol(combos)) {
all_combinations[[length(all_combinations) + 1]] <- combos[,i]
}
}
# Create the WITH clause for the base query
base_cte <- sprintf("WITH year_indicators AS (\n SELECT \n name,\n %s\n FROM mytable\n GROUP BY name\n)",
year_indicators_text)
# Create UNION ALL queries for each combination
combination_queries <- sapply(all_combinations, function(combo) {
combo_name <- paste(combo, collapse="_")
conditions <- sapply(years, function(y) {
if(y %in% combo) {
sprintf("has_%d = 1", y)
} else {
sprintf("has_%d = 0", y)
}
})
condition_text <- paste(conditions, collapse = " AND ")
sprintf(" SELECT\n '%s' as year_combination,\n COUNT(*) as count\n FROM year_indicators\n WHERE %s",
combo_name, condition_text)
})
# Combine everything into the final query
final_query <- paste(
base_cte,
"SELECT * FROM (",
paste(combination_queries, collapse = "\n UNION ALL\n"),
") results",
"ORDER BY LENGTH(year_combination), year_combination;",
sep = "\n"
)
return(final_query)
}
years <- c(2009, 2010, 2011)
cat(generate_full_sql_query(years))
output
WITH year_indicators AS (
SELECT
name,
MAX(CASE WHEN year = 2009 THEN 1 ELSE 0 END) as has_2009,
MAX(CASE WHEN year = 2010 THEN 1 ELSE 0 END) as has_2010,
MAX(CASE WHEN year = 2011 THEN 1 ELSE 0 END) as has_2011
FROM mytable
GROUP BY name
)
SELECT * FROM (
SELECT
'2009' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 0
UNION ALL
SELECT
'2010' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 0
UNION ALL
SELECT
'2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 0 AND has_2011 = 1
UNION ALL
SELECT
'2009_2010' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 0
UNION ALL
SELECT
'2009_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 1
UNION ALL
SELECT
'2010_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 1
UNION ALL
SELECT
'2009_2010_2011' as year_combination,
COUNT(*) as count
FROM year_indicators
WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 1
) results
ORDER BY LENGTH(year_combination), year_combination;
Third Option
years <- sort(unique(mytable$year))
year_combinations <- expand.grid(lapply(1:length(years), function(i) c(0, 1)))
colnames(year_combinations) <- paste0("has_", years)
year_combinations$combo_name <- character(nrow(year_combinations))
for (i in 1:nrow(year_combinations)) {
current_row <- year_combinations[i, 1:length(years)]
present_indices <- which(current_row == 1)
present_years <- years[present_indices]
if (length(present_years) == 0) {
year_combinations$combo_name[i] <- "none"
} else {
year_combinations$combo_name[i] <- paste(present_years, collapse = "_")
}
}
year_combinations$sql_condition <- character(nrow(year_combinations))
for (i in 1:nrow(year_combinations)) {
conditions <- character()
for (j in 1:length(years)) {
year_val <- years[j]
has_val <- year_combinations[i, j]
condition <- paste0("has_", year_val, " = ", as.character(has_val))
conditions <- c(conditions, condition)
}
year_combinations$sql_condition[i] <- paste(conditions, collapse = " AND ")
}
year_combinations$sql_query_part <- character(nrow(year_combinations))
for (i in 1:nrow(year_combinations)) {
combo_name <- year_combinations$combo_name[i]
condition <- year_combinations$sql_condition[i]
query_part <- paste0(
" SELECT\n",
" '", combo_name, "' as year_combination,\n",
" COUNT(*) as count\n",
" FROM year_indicators\n",
" WHERE ", condition
)
year_combinations$sql_query_part[i] <- query_part
}
generate_final_sql <- function(years, combinations_df) {
year_indicators <- character(length(years))
for (i in 1:length(years)) {
year_val <- years[i]
year_indicators[i] <- paste0("MAX(CASE WHEN year = ", year_val,
" THEN 1 ELSE 0 END) as has_", year_val)
}
year_indicators_text <- paste(year_indicators, collapse = ",\n ")
base_cte <- paste0("WITH year_indicators AS (\n",
" SELECT \n",
" name,\n",
" ", year_indicators_text, "\n",
" FROM mytable\n",
" GROUP BY name\n",
")")
query_parts <- combinations_df$sql_query_part
combined_parts <- paste(query_parts, collapse = "\n UNION ALL\n")
final_query <- paste(
base_cte,
"SELECT * FROM (",
combined_parts,
") results",
"ORDER BY LENGTH(year_combination), year_combination;",
sep = "\n"
)
return(final_query)
}
print(year_combinations)
sql_query <- generate_final_sql(years, year_combinations)
cat(sql_query)
Share
Improve this question
edited Mar 26 at 13:27
user_436830
asked Dec 11, 2024 at 19:06
user_436830user_436830
1474 bronze badges
2
- I am under the impression that sqldf was written to run locally, not to connect to external servers. You may want to look at RODBC. # WARNING: incomplete code below! library( RODBC ) dbhandle <- odbcDriverConnect( sConnectionString, rows_at_time = 10000, readOnlyOptimize = TRUE ) dfResults <- sqlQuery( dbhandle, sSqlQuery ) – Michael Commented Dec 11, 2024 at 20:20
- 1 thanks for the feedback michael ... see idea2... I think this might be a good idea? – user_436830 Commented Dec 11, 2024 at 20:26
2 Answers
Reset to default 1Simply, move the nested sqldf
and rbind()
to the inner loop. Even better, don't run rbind
inside a loop to avoid the quadratic copy. See Patrick Burn's R Inferno - Circle 2: Growing Objects (PDF).
Since combn
supports a function argument, incorporate your query run directly with simplify=FALSE
:
query_run <- function(y) {
query <- sprintf(
"
SELECT '%s' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (%s)
GROUP BY name
HAVING COUNT(DISTINCT year) = %d
)
",
paste(y, collapse="-"),
paste(y, collapse=","),
length(y)
)
cat(query)
result <- sqldf(query)
results <- rbind(results, result)
}
combinations <- lapply(
seq_along(years), \(r) combn(years, r, query_run, simplify = FALSE)
)
cat
Output
SELECT '2009' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009)
GROUP BY name
HAVING COUNT(DISTINCT year) = 1
)
SELECT '2010' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2010)
GROUP BY name
HAVING COUNT(DISTINCT year) = 1
)
SELECT '2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 1
)
SELECT '2009-2010' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009,2010)
GROUP BY name
HAVING COUNT(DISTINCT year) = 2
)
SELECT '2009-2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009,2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 2
)
SELECT '2010-2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2010,2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 2
)
SELECT '2009-2010-2011' as years, COUNT(*) as matching_names
FROM (
SELECT name
FROM mytable
WHERE year IN (2009,2010,2011)
GROUP BY name
HAVING COUNT(DISTINCT year) = 3
)
1) To avoid using group_concat
we encode each year as a bit in an integer so that 1 represents the first element in years
(where years
is defined in question), 2 represents the second element, 4 represents the third and in general the ith element is represented by 2^(i-1). Subsets of years
can be represented by adding the years together. For example to represent years[1]
and years[3]
use 2^(1-1) + 2^(3-1) = 1 + 4 = 5.
Going the other way bin2year
takes an integer vector x
of such encoded subsets and the years
vector and returns a character vector of the years in each element of x
separated by dashes so for example
bin2year(5, 2009:2011) # 5 means years[1] and years[3]
## [1] "2009-2011"
combos
contains the possible combinations (i.e. subsets) of years in the binary form just discussed. We then perform the computation in SQL and using the result from SQL in R we decode the result back using bin2year
and sort. With this setup group_concat
corresponds to using plain sum
on the encoded values.
Below we used power(2, x)
to take powers of 2. This is equivalent to bit shifting of 1 so alternately power(2, x)
could be replaced with (1 << x)
.
Note that the fn$
in fn$sqldf
causes any expression within backquotes in the SQL string to be evaluated in R and then substituted into the SQL statement at that point. fn$
comes from the gsubfn package which the sqldf package loads. It works with just about any function. It is not specific to sqldf
. An alternative would be to use sprintf
.
# this code uses mytable and years from the question
library(magrittr)
library(sqldf)
bin2year <- function(x, years) {
m <- sapply(x, \(z) as.integer(intToBits(z)))
m <- diag(years) %*% m[seq_along(years), ]
ch <- apply(m, 2, function(x) paste(x[x > 0], collapse = "-"))
ch[ch != ""]
}
combos <- data.frame(year = seq_len(2^length(years)-1))
res <- fn$sqldf("with tmp as (
select name, sum(power(2, year - `min(years)`)) year
from mytable
group by name
)
select year, count(b.year) n
from combos a left join tmp b using (year)
group by year")
res %>%
within(year <- bin2year(year, years)) %>%
.[order(lengths(strsplit(.$year, "-"))), ]
giving
year n
1 2009 1
2 2010 0
4 2011 0
3 2009-2010 1
5 2009-2011 1
6 2010-2011 1
7 2009-2010-2011 1
2) Firstly, using the R vector years
defined in the question create the combotab
data.frame and using mytable
create the ag
data.frame. The latter could be created in SQL using group_concat
(as commented out below) but we have done it in R instead since the question asked not to use it. (Note that your database may have string_agg
which is the similar to group_concat
in which case you could use the commented out code instead after modifying it to use string_agg
.)
Then we can do the rest in SQL using combotab
and mytable
.
combotab <- c(years, NA, NA) %>%
combn(3) %>%
t %>%
unique %>%
apply(1, \(x) as.numeric(na.omit(x))) %>%
.[order(lengths(.))] %>%
sapply(paste, collapse = "-") %>%
data.frame(years = .)
# ag <- sqldf(
# "select name, group_concat(cast(year as int), '-') years
# from mytable
# group by name")
ag <- aggregate(list(years = mytable$year), mytable["name"],
paste, collapse = "-")
sqldf("select y.years, count(a.name) n from combotab y
left join ag a using (years)
group by y.years")
giving
years n
1 2009 1
2 2010 0
3 2011 0
4 2009-2010 1
5 2009-2011 1
6 2010-2011 1
7 2009-2010-2011 1