sql - Generating different combinations of occurences

I have this data:

library(sqldf)

mytable <- data.frame(
    name = c("Alice", "Alice", "Alice",      # Alice appears in all years
             "Bob", "Bob",                    # Bob appears in 2009-2010
             "Charlie", "Charlie",            # Charlie appears in 2010-2011
             "David", "David",                # David appears in 2009,2011
             "Eve"),                          # Eve appears in just 2009
    year = c(2009, 2010, 2011,               # Years for Alice
             2009, 2010,                      # Years for Bob
             2010, 2011,                      # Years for Charlie
             2009, 2011,                      # Years for David
             2009)                           # Year for Eve
)

I am trying to find out the following:

how many names in uniquely: (2009), (2010), (2011), (2009, 2010), (2009, 2011), (2010, 2011), (2009, 2010, 2011)?

I first made a list of the min-max year combinations:

year_range_query <- sqldf("
    SELECT MIN(year) as min_year, MAX(year) as max_year 
    FROM mytable")

min_year <- year_range_query$min_year
max_year <- year_range_query$max_year
years <- min_year:max_year
results <- data.frame()

Now, I am trying to write an R function which communicates with the SQL server and the local R environment:

for(r in 1:length(years)) {
    combinations <- combn(years, r)
    
    for(i in 1:ncol(combinations)) {
        years_in_combo <- combinations[,i]
        
        query <- sprintf("
            SELECT '%s' as years, COUNT(*) as matching_names
            FROM (
                SELECT name
                FROM mytable
                WHERE year IN (%s)
                GROUP BY name
                HAVING COUNT(DISTINCT year) = %d
            )
        ",
        paste(years_in_combo, collapse="-"),
        paste(years_in_combo, collapse=","),
        length(years_in_combo)
        )
        
        result <- sqldf(query)
        results <- rbind(results, result)
    }
}

The code ran, but when looking at the results, I can see that there duplicates (e.g. 2009-2010 appears multiple times):

          years matching_names
1       2009-2010              2
2       2009-2011              2
3       2010-2011              2
4  2009-2010-2011              1
5       2009-2010              2
6       2009-2011              2
7       2010-2011              2
8  2009-2010-2011              1
9            2009              4
10           2010              3
11           2011              3
12      2009-2010              2
13      2009-2011              2
14      2010-2011              2
15 2009-2010-2011              1
16      2009-2010              2
17      2009-2011              2
18      2010-2011              2
19 2009-2010-2011              1
20      2009-2010              2
21      2009-2011              2
22      2010-2011              2
23 2009-2010-2011              1
24           2009              4
25           2010              3
26           2011              3
27      2009-2010              2
28      2009-2011              2
29      2010-2011              2
30 2009-2010-2011              1

Can someone help me understand what I am doing wrong? For the final answer, if I sum the counts column, it should match the count(*) for the original table. Note that I would like to do this without GROUP_CONCAT since I might need to use this code later on a different server which does not support GROUP_CONCAT.

Idea 1: I wanted to avoid this, but I suppose I could pre-craft all queries and then launch them:

    # manually determine the years
    years <- c(2009, 2010, 2011)
    
    generate_year_indicators <- function(years) {
        indicators <- sapply(years, function(y) {
            sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
        })
        paste(indicators, collapse = ",\n        ")
    }
    
    generate_combination_condition <- function(combo_years, all_years) {
        conditions <- sapply(all_years, function(y) {
            if(y %in% combo_years) {
                sprintf("has_%d = 1", y)
            } else {
                sprintf("has_%d = 0", y)
            }
        })
        paste(conditions, collapse = " AND ")
    }
    
    all_combinations <- list()
    for(r in 1:length(years)) {
        combos <- combn(years, r)
        for(i in 1:ncol(combos)) {
            all_combinations[[length(all_combinations) + 1]] <- combos[,i]
        }
    }
    
    base_query <- sprintf("
        SELECT 
            name,
            %s
        FROM mytable
        GROUP BY name
    ", generate_year_indicators(years))
    
    year_indicators <- sqldf(base_query)
    
    results <- data.frame(year_combination = character(0), count = numeric(0))
    
    for(combo in all_combinations) {
        combo_name <- paste(combo, collapse="_")
        count_query <- sprintf("
            SELECT '%s' as year_combination,
                   SUM(CASE WHEN %s THEN 1 ELSE 0 END) as count
            FROM year_indicators
        ", combo_name, generate_combination_condition(combo, years))
        combo_result <- sqldf(count_query)
        results <- rbind(results, combo_result)
    }
    
    results <- results[order(nchar(results$year_combination), 
                             results$year_combination), ]
    rownames(results) <- NULL
    
    print(results)

  year_combination count
1             2009     1
2             2010     0
3             2011     0
4        2009_2010     1
5        2009_2011     1
6        2010_2011     1
7   2009_2010_2011     1

Idea 2: Perhaps I could just get R to generate the text for the full SQL query and then manually copy/paste this into the SQL IDE?

    generate_full_sql_query <- function(years) {
        # First, let's create the year indicator columns
        year_indicators <- sapply(years, function(y) {
            sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
        })
        year_indicators_text <- paste(year_indicators, collapse = ",\n    ")
        
        # Generate all possible year combinations
        all_combinations <- list()
        for(r in 1:length(years)) {
            combos <- combn(years, r)
            for(i in 1:ncol(combos)) {
                all_combinations[[length(all_combinations) + 1]] <- combos[,i]
            }
        }
        
        # Create the WITH clause for the base query
        base_cte <- sprintf("WITH year_indicators AS (\n  SELECT \n    name,\n    %s\n  FROM mytable\n  GROUP BY name\n)", 
                            year_indicators_text)
        
        # Create UNION ALL queries for each combination
        combination_queries <- sapply(all_combinations, function(combo) {
            combo_name <- paste(combo, collapse="_")
            conditions <- sapply(years, function(y) {
                if(y %in% combo) {
                    sprintf("has_%d = 1", y)
                } else {
                    sprintf("has_%d = 0", y)
                }
            })
            condition_text <- paste(conditions, collapse = " AND ")
            
            sprintf("  SELECT\n    '%s' as year_combination,\n    COUNT(*) as count\n  FROM year_indicators\n  WHERE %s",
                    combo_name, condition_text)
        })
        
        # Combine everything into the final query
        final_query <- paste(
            base_cte,
            "SELECT * FROM (",
            paste(combination_queries, collapse = "\n  UNION ALL\n"),
            ") results",
            "ORDER BY LENGTH(year_combination), year_combination;",
            sep = "\n"
        )
        
        return(final_query)
    }
    
    years <- c(2009, 2010, 2011)
    cat(generate_full_sql_query(years))

output

   WITH year_indicators AS (
     SELECT 
       name,
       MAX(CASE WHEN year = 2009 THEN 1 ELSE 0 END) as has_2009,
       MAX(CASE WHEN year = 2010 THEN 1 ELSE 0 END) as has_2010,
       MAX(CASE WHEN year = 2011 THEN 1 ELSE 0 END) as has_2011
     FROM mytable
     GROUP BY name
   )
   SELECT * FROM (
     SELECT
       '2009' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 0
     UNION ALL
     SELECT
       '2010' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 0
     UNION ALL
     SELECT
       '2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 0 AND has_2010 = 0 AND has_2011 = 1
     UNION ALL
     SELECT
       '2009_2010' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 0
     UNION ALL
     SELECT
       '2009_2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 1
     UNION ALL
     SELECT
       '2010_2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 1
     UNION ALL
     SELECT
       '2009_2010_2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 1
   ) results
   ORDER BY LENGTH(year_combination), year_combination;

Third Option

years <- sort(unique(mytable$year))

year_combinations <- expand.grid(lapply(1:length(years), function(i) c(0, 1)))

colnames(year_combinations) <- paste0("has_", years)

year_combinations$combo_name <- character(nrow(year_combinations))

for (i in 1:nrow(year_combinations)) {
    current_row <- year_combinations[i, 1:length(years)]
    
    present_indices <- which(current_row == 1)
    present_years <- years[present_indices]
    
    if (length(present_years) == 0) {
        year_combinations$combo_name[i] <- "none"
    } else {
        year_combinations$combo_name[i] <- paste(present_years, collapse = "_")
    }
}

year_combinations$sql_condition <- character(nrow(year_combinations))

for (i in 1:nrow(year_combinations)) {
    conditions <- character()
    
    for (j in 1:length(years)) {
        year_val <- years[j]
        has_val <- year_combinations[i, j]
        
        condition <- paste0("has_", year_val, " = ", as.character(has_val))
        conditions <- c(conditions, condition)
    }
    
    year_combinations$sql_condition[i] <- paste(conditions, collapse = " AND ")
}

year_combinations$sql_query_part <- character(nrow(year_combinations))

for (i in 1:nrow(year_combinations)) {
    combo_name <- year_combinations$combo_name[i]
    condition <- year_combinations$sql_condition[i]
    
    query_part <- paste0(
        "  SELECT\n",
        "    '", combo_name, "' as year_combination,\n",
        "    COUNT(*) as count\n",
        "  FROM year_indicators\n",
        "  WHERE ", condition
    )
    
    year_combinations$sql_query_part[i] <- query_part
}

generate_final_sql <- function(years, combinations_df) {
    year_indicators <- character(length(years))
    
    for (i in 1:length(years)) {
        year_val <- years[i]
        year_indicators[i] <- paste0("MAX(CASE WHEN year = ", year_val, 
                                     " THEN 1 ELSE 0 END) as has_", year_val)
    }
    
    year_indicators_text <- paste(year_indicators, collapse = ",\n    ")
    
    base_cte <- paste0("WITH year_indicators AS (\n",
                       "  SELECT \n",
                       "    name,\n",
                       "    ", year_indicators_text, "\n",
                       "  FROM mytable\n",
                       "  GROUP BY name\n",
                       ")")
    
    query_parts <- combinations_df$sql_query_part
    combined_parts <- paste(query_parts, collapse = "\n  UNION ALL\n")
    
    final_query <- paste(
        base_cte,
        "SELECT * FROM (",
        combined_parts,
        ") results",
        "ORDER BY LENGTH(year_combination), year_combination;",
        sep = "\n"
    )
    
    return(final_query)
}

print(year_combinations)

sql_query <- generate_final_sql(years, year_combinations)
cat(sql_query)

I have this data:

library(sqldf)

mytable <- data.frame(
    name = c("Alice", "Alice", "Alice",      # Alice appears in all years
             "Bob", "Bob",                    # Bob appears in 2009-2010
             "Charlie", "Charlie",            # Charlie appears in 2010-2011
             "David", "David",                # David appears in 2009,2011
             "Eve"),                          # Eve appears in just 2009
    year = c(2009, 2010, 2011,               # Years for Alice
             2009, 2010,                      # Years for Bob
             2010, 2011,                      # Years for Charlie
             2009, 2011,                      # Years for David
             2009)                           # Year for Eve
)

I am trying to find out the following:

how many names in uniquely: (2009), (2010), (2011), (2009, 2010), (2009, 2011), (2010, 2011), (2009, 2010, 2011)?

I first made a list of the min-max year combinations:

year_range_query <- sqldf("
    SELECT MIN(year) as min_year, MAX(year) as max_year 
    FROM mytable")

min_year <- year_range_query$min_year
max_year <- year_range_query$max_year
years <- min_year:max_year
results <- data.frame()

Now, I am trying to write an R function which communicates with the SQL server and the local R environment:

for(r in 1:length(years)) {
    combinations <- combn(years, r)
    
    for(i in 1:ncol(combinations)) {
        years_in_combo <- combinations[,i]
        
        query <- sprintf("
            SELECT '%s' as years, COUNT(*) as matching_names
            FROM (
                SELECT name
                FROM mytable
                WHERE year IN (%s)
                GROUP BY name
                HAVING COUNT(DISTINCT year) = %d
            )
        ",
        paste(years_in_combo, collapse="-"),
        paste(years_in_combo, collapse=","),
        length(years_in_combo)
        )
        
        result <- sqldf(query)
        results <- rbind(results, result)
    }
}

The code ran, but when looking at the results, I can see that there duplicates (e.g. 2009-2010 appears multiple times):

          years matching_names
1       2009-2010              2
2       2009-2011              2
3       2010-2011              2
4  2009-2010-2011              1
5       2009-2010              2
6       2009-2011              2
7       2010-2011              2
8  2009-2010-2011              1
9            2009              4
10           2010              3
11           2011              3
12      2009-2010              2
13      2009-2011              2
14      2010-2011              2
15 2009-2010-2011              1
16      2009-2010              2
17      2009-2011              2
18      2010-2011              2
19 2009-2010-2011              1
20      2009-2010              2
21      2009-2011              2
22      2010-2011              2
23 2009-2010-2011              1
24           2009              4
25           2010              3
26           2011              3
27      2009-2010              2
28      2009-2011              2
29      2010-2011              2
30 2009-2010-2011              1

Idea 1: I wanted to avoid this, but I suppose I could pre-craft all queries and then launch them:

    # manually determine the years
    years <- c(2009, 2010, 2011)
    
    generate_year_indicators <- function(years) {
        indicators <- sapply(years, function(y) {
            sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
        })
        paste(indicators, collapse = ",\n        ")
    }
    
    generate_combination_condition <- function(combo_years, all_years) {
        conditions <- sapply(all_years, function(y) {
            if(y %in% combo_years) {
                sprintf("has_%d = 1", y)
            } else {
                sprintf("has_%d = 0", y)
            }
        })
        paste(conditions, collapse = " AND ")
    }
    
    all_combinations <- list()
    for(r in 1:length(years)) {
        combos <- combn(years, r)
        for(i in 1:ncol(combos)) {
            all_combinations[[length(all_combinations) + 1]] <- combos[,i]
        }
    }
    
    base_query <- sprintf("
        SELECT 
            name,
            %s
        FROM mytable
        GROUP BY name
    ", generate_year_indicators(years))
    
    year_indicators <- sqldf(base_query)
    
    results <- data.frame(year_combination = character(0), count = numeric(0))
    
    for(combo in all_combinations) {
        combo_name <- paste(combo, collapse="_")
        count_query <- sprintf("
            SELECT '%s' as year_combination,
                   SUM(CASE WHEN %s THEN 1 ELSE 0 END) as count
            FROM year_indicators
        ", combo_name, generate_combination_condition(combo, years))
        combo_result <- sqldf(count_query)
        results <- rbind(results, combo_result)
    }
    
    results <- results[order(nchar(results$year_combination), 
                             results$year_combination), ]
    rownames(results) <- NULL
    
    print(results)

  year_combination count
1             2009     1
2             2010     0
3             2011     0
4        2009_2010     1
5        2009_2011     1
6        2010_2011     1
7   2009_2010_2011     1

Idea 2: Perhaps I could just get R to generate the text for the full SQL query and then manually copy/paste this into the SQL IDE?

    generate_full_sql_query <- function(years) {
        # First, let's create the year indicator columns
        year_indicators <- sapply(years, function(y) {
            sprintf("MAX(CASE WHEN year = %d THEN 1 ELSE 0 END) as has_%d", y, y)
        })
        year_indicators_text <- paste(year_indicators, collapse = ",\n    ")
        
        # Generate all possible year combinations
        all_combinations <- list()
        for(r in 1:length(years)) {
            combos <- combn(years, r)
            for(i in 1:ncol(combos)) {
                all_combinations[[length(all_combinations) + 1]] <- combos[,i]
            }
        }
        
        # Create the WITH clause for the base query
        base_cte <- sprintf("WITH year_indicators AS (\n  SELECT \n    name,\n    %s\n  FROM mytable\n  GROUP BY name\n)", 
                            year_indicators_text)
        
        # Create UNION ALL queries for each combination
        combination_queries <- sapply(all_combinations, function(combo) {
            combo_name <- paste(combo, collapse="_")
            conditions <- sapply(years, function(y) {
                if(y %in% combo) {
                    sprintf("has_%d = 1", y)
                } else {
                    sprintf("has_%d = 0", y)
                }
            })
            condition_text <- paste(conditions, collapse = " AND ")
            
            sprintf("  SELECT\n    '%s' as year_combination,\n    COUNT(*) as count\n  FROM year_indicators\n  WHERE %s",
                    combo_name, condition_text)
        })
        
        # Combine everything into the final query
        final_query <- paste(
            base_cte,
            "SELECT * FROM (",
            paste(combination_queries, collapse = "\n  UNION ALL\n"),
            ") results",
            "ORDER BY LENGTH(year_combination), year_combination;",
            sep = "\n"
        )
        
        return(final_query)
    }
    
    years <- c(2009, 2010, 2011)
    cat(generate_full_sql_query(years))

output

   WITH year_indicators AS (
     SELECT 
       name,
       MAX(CASE WHEN year = 2009 THEN 1 ELSE 0 END) as has_2009,
       MAX(CASE WHEN year = 2010 THEN 1 ELSE 0 END) as has_2010,
       MAX(CASE WHEN year = 2011 THEN 1 ELSE 0 END) as has_2011
     FROM mytable
     GROUP BY name
   )
   SELECT * FROM (
     SELECT
       '2009' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 0
     UNION ALL
     SELECT
       '2010' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 0
     UNION ALL
     SELECT
       '2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 0 AND has_2010 = 0 AND has_2011 = 1
     UNION ALL
     SELECT
       '2009_2010' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 0
     UNION ALL
     SELECT
       '2009_2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 0 AND has_2011 = 1
     UNION ALL
     SELECT
       '2010_2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 0 AND has_2010 = 1 AND has_2011 = 1
     UNION ALL
     SELECT
       '2009_2010_2011' as year_combination,
       COUNT(*) as count
     FROM year_indicators
     WHERE has_2009 = 1 AND has_2010 = 1 AND has_2011 = 1
   ) results
   ORDER BY LENGTH(year_combination), year_combination;

Third Option

years <- sort(unique(mytable$year))

year_combinations <- expand.grid(lapply(1:length(years), function(i) c(0, 1)))

colnames(year_combinations) <- paste0("has_", years)

year_combinations$combo_name <- character(nrow(year_combinations))

for (i in 1:nrow(year_combinations)) {
    current_row <- year_combinations[i, 1:length(years)]
    
    present_indices <- which(current_row == 1)
    present_years <- years[present_indices]
    
    if (length(present_years) == 0) {
        year_combinations$combo_name[i] <- "none"
    } else {
        year_combinations$combo_name[i] <- paste(present_years, collapse = "_")
    }
}

year_combinations$sql_condition <- character(nrow(year_combinations))

for (i in 1:nrow(year_combinations)) {
    conditions <- character()
    
    for (j in 1:length(years)) {
        year_val <- years[j]
        has_val <- year_combinations[i, j]
        
        condition <- paste0("has_", year_val, " = ", as.character(has_val))
        conditions <- c(conditions, condition)
    }
    
    year_combinations$sql_condition[i] <- paste(conditions, collapse = " AND ")
}

year_combinations$sql_query_part <- character(nrow(year_combinations))

for (i in 1:nrow(year_combinations)) {
    combo_name <- year_combinations$combo_name[i]
    condition <- year_combinations$sql_condition[i]
    
    query_part <- paste0(
        "  SELECT\n",
        "    '", combo_name, "' as year_combination,\n",
        "    COUNT(*) as count\n",
        "  FROM year_indicators\n",
        "  WHERE ", condition
    )
    
    year_combinations$sql_query_part[i] <- query_part
}

generate_final_sql <- function(years, combinations_df) {
    year_indicators <- character(length(years))
    
    for (i in 1:length(years)) {
        year_val <- years[i]
        year_indicators[i] <- paste0("MAX(CASE WHEN year = ", year_val, 
                                     " THEN 1 ELSE 0 END) as has_", year_val)
    }
    
    year_indicators_text <- paste(year_indicators, collapse = ",\n    ")
    
    base_cte <- paste0("WITH year_indicators AS (\n",
                       "  SELECT \n",
                       "    name,\n",
                       "    ", year_indicators_text, "\n",
                       "  FROM mytable\n",
                       "  GROUP BY name\n",
                       ")")
    
    query_parts <- combinations_df$sql_query_part
    combined_parts <- paste(query_parts, collapse = "\n  UNION ALL\n")
    
    final_query <- paste(
        base_cte,
        "SELECT * FROM (",
        combined_parts,
        ") results",
        "ORDER BY LENGTH(year_combination), year_combination;",
        sep = "\n"
    )
    
    return(final_query)
}

print(year_combinations)

sql_query <- generate_final_sql(years, year_combinations)
cat(sql_query)

Share Improve this question edited Mar 26 at 13:27 asked Dec 11, 2024 at 19:06 user_436830 1474 bronze badges

I am under the impression that sqldf was written to run locally, not to connect to external servers. You may want to look at RODBC. # WARNING: incomplete code below! library( RODBC ) dbhandle <- odbcDriverConnect( sConnectionString, rows_at_time = 10000, readOnlyOptimize = TRUE ) dfResults <- sqlQuery( dbhandle, sSqlQuery ) – Michael Commented Dec 11, 2024 at 20:20
1 thanks for the feedback michael ... see idea2... I think this might be a good idea? – user_436830 Commented Dec 11, 2024 at 20:26

Add a comment |

2 Answers 2

Sorted by: Reset to default 1

Simply, move the nested sqldf and rbind() to the inner loop. Even better, don't run rbind inside a loop to avoid the quadratic copy. See Patrick Burn's R Inferno - Circle 2: Growing Objects (PDF).

Since combn supports a function argument, incorporate your query run directly with simplify=FALSE:

query_run <- function(y) {        
  query <- sprintf(
    "
      SELECT '%s' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (%s)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = %d
      )
    ",
    paste(y, collapse="-"),
    paste(y, collapse=","),
    length(y)
  )
       
  cat(query)
  result <- sqldf(query)
  results <- rbind(results, result)
}

combinations <- lapply(
  seq_along(years), \(r) combn(years, r, query_run, simplify = FALSE)
)

cat Output

      SELECT '2009' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 1
      )
    
      SELECT '2010' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2010)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 1
      )
    
      SELECT '2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 1
      )
    
      SELECT '2009-2010' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009,2010)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 2
      )
    
      SELECT '2009-2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009,2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 2
      )
    
      SELECT '2010-2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2010,2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 2
      )
    
      SELECT '2009-2010-2011' as years, COUNT(*) as matching_names
      FROM (
        SELECT name
        FROM mytable
        WHERE year IN (2009,2010,2011)
        GROUP BY name
        HAVING COUNT(DISTINCT year) = 3
      )

1) To avoid using group_concat we encode each year as a bit in an integer so that 1 represents the first element in years (where years is defined in question), 2 represents the second element, 4 represents the third and in general the ith element is represented by 2^(i-1). Subsets of years can be represented by adding the years together. For example to represent years[1] and years[3] use 2^(1-1) + 2^(3-1) = 1 + 4 = 5.

Going the other way bin2year takes an integer vector x of such encoded subsets and the years vector and returns a character vector of the years in each element of x separated by dashes so for example

bin2year(5, 2009:2011)  # 5 means years[1] and years[3]
## [1] "2009-2011"

combos contains the possible combinations (i.e. subsets) of years in the binary form just discussed. We then perform the computation in SQL and using the result from SQL in R we decode the result back using bin2year and sort. With this setup group_concat corresponds to using plain sum on the encoded values.

Below we used power(2, x) to take powers of 2. This is equivalent to bit shifting of 1 so alternately power(2, x) could be replaced with (1 << x) .

Note that the fn$ in fn$sqldf causes any expression within backquotes in the SQL string to be evaluated in R and then substituted into the SQL statement at that point. fn$ comes from the gsubfn package which the sqldf package loads. It works with just about any function. It is not specific to sqldf. An alternative would be to use sprintf .

# this code uses mytable and years from the question

library(magrittr)
library(sqldf)
   
bin2year <- function(x, years) {
  m <- sapply(x, \(z) as.integer(intToBits(z)))
  m <- diag(years) %*% m[seq_along(years), ]
  ch <- apply(m, 2, function(x) paste(x[x > 0], collapse = "-"))
  ch[ch != ""]
}

combos <- data.frame(year = seq_len(2^length(years)-1))

res <- fn$sqldf("with tmp as (
    select name, sum(power(2, year - `min(years)`)) year
    from mytable 
    group by name
  )
  select year, count(b.year) n
  from combos a left join tmp b using (year)
  group by year")

res %>%
  within(year <- bin2year(year, years)) %>%
  .[order(lengths(strsplit(.$year, "-"))), ]

giving

            year n
1           2009 1
2           2010 0
4           2011 0
3      2009-2010 1
5      2009-2011 1
6      2010-2011 1
7 2009-2010-2011 1

2) Firstly, using the R vector years defined in the question create the combotab data.frame and using mytable create the ag data.frame. The latter could be created in SQL using group_concat (as commented out below) but we have done it in R instead since the question asked not to use it. (Note that your database may have string_agg which is the similar to group_concat in which case you could use the commented out code instead after modifying it to use string_agg.)

Then we can do the rest in SQL using combotab and mytable.

combotab <- c(years, NA, NA) %>%
  combn(3) %>%
  t %>%
  unique %>%
  apply(1, \(x) as.numeric(na.omit(x))) %>%
  .[order(lengths(.))] %>%   
  sapply(paste, collapse = "-") %>%
  data.frame(years = .)

# ag <- sqldf(
#  "select name, group_concat(cast(year as int), '-') years
#   from mytable
#   group by name")

ag <- aggregate(list(years = mytable$year), mytable["name"],
  paste, collapse = "-")

sqldf("select y.years, count(a.name) n from combotab y
  left join ag a using (years)
  group by y.years")

giving

           years n
1           2009 1
2           2010 0
3           2011 0
4      2009-2010 1
5      2009-2011 1
6      2010-2011 1
7 2009-2010-2011 1

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

sql - Generating different combinations of occurences - Stack Overflow

2 Answers 2

与本文相关的文章

评论列表(0)