r - Dataframe reshaping - Stack Overflow

I have the following data frame, with column containing block identifiers, well identifiers, and concentration values of test divided per column.

df <- structure(list(block = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), Well = c("A01", 
"A02", "A03", "A04", "A05", "A01", "A02", "A03", "A04", "A05"
), Conc.test1 = c(2, NA, 2, NA, NA, 2, NA, 2, NA, NA), Conc.test2 = c(NA, 
2, NA, 2, NA, NA, 2, NA, 2, NA), Conc.Test3 = c(NA, NA, NA, 2, 
NA, NA, NA, NA, 2, NA), Conc.test4 = c(NA, NA, 2, 2, NA, 2, NA, 
2, 2, NA), Conc.test5 = c(NA, 2, NA, NA, NA, NA, 2, NA, NA, NA
), Conc.test6 = c(2, NA, NA, NA, NA, 2, NA, NA, NA, NA)), row.names = c(NA, 
10L), class = "data.frame")

#    block Well Conc.test1 Conc.test2 Conc.Test3 Conc.test4 Conc.test5 Conc.test6
# 1      1  A01          2         NA         NA         NA         NA          2
# 2      1  A02         NA          2         NA         NA          2         NA
# 3      1  A03          2         NA         NA          2         NA         NA
# 4      1  A04         NA          2          2          2         NA         NA
# 5      1  A05         NA         NA         NA         NA         NA         NA
# 6      2  A01          2         NA         NA          2         NA          2
# 7      2  A02         NA          2         NA         NA          2         NA
# 8      2  A03          2         NA         NA          2         NA         NA
# 9      2  A04         NA          2          2          2         NA         NA
# 10     2  A05         NA         NA         NA         NA         NA         NA

I would like to find per Block, per well the different tests used (and the amount of tests) and the concentration used, in the following format:

structure(list(block = c(1, 1, 1, 1, 2, 2, 2, 2), tests = c(2, 
2, 2, 3, 3, 2, 2, 3), well = c("A01", "A02", "A03", "A04", "A01", 
"A02", "A03", "A04"), C1 = c("test1", "test2", "test1", "test2", 
"test1", "test2", "test1", "test2"), conc1 = c(2, 2, 2, 2, 2, 
2, 2, 2), C2 = c("test6", "test5", "test4", "test3", "test6", 
"test5", "test4", "test3"), con2 = c(2, 2, 2, 2, 2, 2, 2, 2), 
    C3 = c(NA, NA, NA, "test4", "test4", NA, NA, "test4"), conc3 = c(NA, 
    NA, NA, 2, 2, NA, NA, 2)), row.names = c(NA, 8L), class = "data.frame")

#   block tests well    C1 conc1    C2 con2    C3 conc3
# 1     1     2  A01 test1     2 test6    2  <NA>    NA
# 2     1     2  A02 test2     2 test5    2  <NA>    NA
# 3     1     2  A03 test1     2 test4    2  <NA>    NA
# 4     1     3  A04 test2     2 test3    2 test4     2
# 5     2     3  A01 test1     2 test6    2 test4     2
# 6     2     2  A02 test2     2 test5    2  <NA>    NA
# 7     2     2  A03 test1     2 test4    2  <NA>    NA
# 8     2     3  A04 test2     2 test3    2 test4     2

I got this far, which kind of works, but I don't see how scalable this is.

output_df <- df %>%
  pivot_longer(cols = starts_with("Conc"), names_to = "Test", values_to = "Concentration") %>%
  filter(!is.na(Concentration)) %>%
  group_by(block, Well) %>%
  mutate(test_num = row_number()) %>%
  pivot_wider(names_from = test_num, values_from = c(Test, Concentration), 
              names_sep = "") %>%
  ungroup()

output_df <- output_df %>%
  mutate(tests = rowSums(!is.na(select(., starts_with("Test"))))) %>%
  select(block, Well, tests, everything()) %>%
  rename_with(~gsub("Test", "C", .x), starts_with("Test")) %>%
  rename_with(~gsub("Concentration", "conc", .x), starts_with("Concentration"))

I have the following data frame, with column containing block identifiers, well identifiers, and concentration values of test divided per column.

df <- structure(list(block = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), Well = c("A01", 
"A02", "A03", "A04", "A05", "A01", "A02", "A03", "A04", "A05"
), Conc.test1 = c(2, NA, 2, NA, NA, 2, NA, 2, NA, NA), Conc.test2 = c(NA, 
2, NA, 2, NA, NA, 2, NA, 2, NA), Conc.Test3 = c(NA, NA, NA, 2, 
NA, NA, NA, NA, 2, NA), Conc.test4 = c(NA, NA, 2, 2, NA, 2, NA, 
2, 2, NA), Conc.test5 = c(NA, 2, NA, NA, NA, NA, 2, NA, NA, NA
), Conc.test6 = c(2, NA, NA, NA, NA, 2, NA, NA, NA, NA)), row.names = c(NA, 
10L), class = "data.frame")

#    block Well Conc.test1 Conc.test2 Conc.Test3 Conc.test4 Conc.test5 Conc.test6
# 1      1  A01          2         NA         NA         NA         NA          2
# 2      1  A02         NA          2         NA         NA          2         NA
# 3      1  A03          2         NA         NA          2         NA         NA
# 4      1  A04         NA          2          2          2         NA         NA
# 5      1  A05         NA         NA         NA         NA         NA         NA
# 6      2  A01          2         NA         NA          2         NA          2
# 7      2  A02         NA          2         NA         NA          2         NA
# 8      2  A03          2         NA         NA          2         NA         NA
# 9      2  A04         NA          2          2          2         NA         NA
# 10     2  A05         NA         NA         NA         NA         NA         NA

I would like to find per Block, per well the different tests used (and the amount of tests) and the concentration used, in the following format:

structure(list(block = c(1, 1, 1, 1, 2, 2, 2, 2), tests = c(2, 
2, 2, 3, 3, 2, 2, 3), well = c("A01", "A02", "A03", "A04", "A01", 
"A02", "A03", "A04"), C1 = c("test1", "test2", "test1", "test2", 
"test1", "test2", "test1", "test2"), conc1 = c(2, 2, 2, 2, 2, 
2, 2, 2), C2 = c("test6", "test5", "test4", "test3", "test6", 
"test5", "test4", "test3"), con2 = c(2, 2, 2, 2, 2, 2, 2, 2), 
    C3 = c(NA, NA, NA, "test4", "test4", NA, NA, "test4"), conc3 = c(NA, 
    NA, NA, 2, 2, NA, NA, 2)), row.names = c(NA, 8L), class = "data.frame")

#   block tests well    C1 conc1    C2 con2    C3 conc3
# 1     1     2  A01 test1     2 test6    2  <NA>    NA
# 2     1     2  A02 test2     2 test5    2  <NA>    NA
# 3     1     2  A03 test1     2 test4    2  <NA>    NA
# 4     1     3  A04 test2     2 test3    2 test4     2
# 5     2     3  A01 test1     2 test6    2 test4     2
# 6     2     2  A02 test2     2 test5    2  <NA>    NA
# 7     2     2  A03 test1     2 test4    2  <NA>    NA
# 8     2     3  A04 test2     2 test3    2 test4     2

I got this far, which kind of works, but I don't see how scalable this is.

output_df <- df %>%
  pivot_longer(cols = starts_with("Conc"), names_to = "Test", values_to = "Concentration") %>%
  filter(!is.na(Concentration)) %>%
  group_by(block, Well) %>%
  mutate(test_num = row_number()) %>%
  pivot_wider(names_from = test_num, values_from = c(Test, Concentration), 
              names_sep = "") %>%
  ungroup()

output_df <- output_df %>%
  mutate(tests = rowSums(!is.na(select(., starts_with("Test"))))) %>%
  select(block, Well, tests, everything()) %>%
  rename_with(~gsub("Test", "C", .x), starts_with("Test")) %>%
  rename_with(~gsub("Concentration", "conc", .x), starts_with("Concentration"))

Share Improve this question edited Mar 27 at 9:09 Darren Tsai 36.3k5 gold badges25 silver badges57 bronze badges asked Mar 27 at 7:46 szmple 4772 silver badges9 bronze badges

Stack Overflow is not the place to ask others to write your code for you, though the community is very willing to help you solve specific problems you encounter whilst writing it for yourself. What have you tried so far and why didn't it work? – Limey Commented Mar 27 at 7:50
@Limey yes ofcourse!! I understand that, I will update my question with my solutions – szmple Commented Mar 27 at 7:54
As I see it, block 1 well A04 has 3 tests not 2? Same for block 2 well A04? Why does block 2 well A01 have 3 tests for you? Also, this is a very bad output format for any analysis. – user2974951 Commented Mar 27 at 7:54
@user2974951 You are correct, I made a mistake with my expected output format. I will adjust, sorry about that – szmple Commented Mar 27 at 7:56

Add a comment |

3 Answers 3

Sorted by: Reset to default 3

In base R we can do

reshape(df, direction='l', v.names='Conc', timevar='C', 
        varying=grep('Conc\\.', names(df)), 
        times=tolower(sub('Conc\\.', '', grep('Conc\\.', names(df))))) |> 
  na.omit() |>
  transform(tests=ave(Conc, block, Well, FUN=length), 
            id=ave(Conc, block, Well, FUN=seq)) |>
  reshape(direction='w', timevar='id', idvar=c('block','Well'), v.names=c('C', 'Conc')) |>
  `row.names<-`(NULL) |> sort_by(~block+Well)

where the last line is cosmetics. I recommend you to use either upper case or lower case to start a variable name, not both. If performance matters, we can optimise the transform-step.

  block Well tests   C.1 Conc.1   C.2 Conc.2   C.3 Conc.3
1     1  A01     2 test1      2 test6      2  <NA>     NA
5     1  A02     2 test2      2 test5      2  <NA>     NA
2     1  A03     2 test1      2 test4      2  <NA>     NA
6     1  A04     3 test2      2 test3      2 test4      2
3     2  A01     3 test1      2 test4      2 test6      2
7     2  A02     2 test2      2 test5      2  <NA>     NA
4     2  A03     2 test1      2 test4      2  <NA>     NA
8     2  A04     3 test2      2 test3      2 test4      2

You need pivot_longer at first, creating index column by groups, and then pivot_wider.

library(dplyr)
library(tidyr)

df %>%
  pivot_longer(Conc.test1:Conc.test6, names_prefix = "Conc.", names_to = "C",
               values_to = "conc", values_drop_na = TRUE) %>%
  mutate(id = row_number(), tests = n(), .by = c(block, Well)) %>%
  pivot_wider(names_from = id, values_from = c(C, conc),
              names_vary = "slowest", names_sep = "")

# # A tibble: 8 × 9
#   block Well  tests C1    conc1 C2    conc2 C3    conc3
#   <dbl> <chr> <int> <chr> <dbl> <chr> <dbl> <chr> <dbl>
# 1     1 A01       2 test1     2 test6     2 NA       NA
# 2     1 A02       2 test2     2 test5     2 NA       NA
# 3     1 A03       2 test1     2 test4     2 NA       NA
# 4     1 A04       3 test2     2 Test3     2 test4     2
# 5     2 A01       3 test1     2 test4     2 test6     2
# 6     2 A02       2 test2     2 test5     2 NA       NA
# 7     2 A03       2 test1     2 test4     2 NA       NA
# 8     2 A04       3 test2     2 Test3     2 test4     2

library(dplyr)
library(tidyr)

pivot_longer(df, -c(block, Well), names_to="C", values_to="conc") %>%
  filter(!is.na(conc)) %>%
  mutate(tests=n(), C=sub("Conc.", "", C, fixed=TRUE), seq=row_number(), .by=c(block, Well)) %>%
  data.frame() %>%
  reshape(direction="wide", timevar="seq", idvar=c("block","Well"), v.names=c("C", "conc"))
___
   block Well tests   C.1 conc.1   C.2 conc.2   C.3 conc.3
1      1  A01     2 test1      2 test6      2  <NA>     NA
3      1  A02     2 test2      2 test5      2  <NA>     NA
5      1  A03     2 test1      2 test4      2  <NA>     NA
7      1  A04     3 test2      2 Test3      2 test4      2
10     2  A01     3 test1      2 test4      2 test6      2
13     2  A02     2 test2      2 test5      2  <NA>     NA
15     2  A03     2 test1      2 test4      2  <NA>     NA
17     2  A04     3 test2      2 Test3      2 test4      2

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

r - Dataframe reshaping - Stack Overflow

3 Answers 3

与本文相关的文章

评论列表(0)