I have the following data frame, with column containing block identifiers, well identifiers, and concentration values of test divided per column.
df <- structure(list(block = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), Well = c("A01",
"A02", "A03", "A04", "A05", "A01", "A02", "A03", "A04", "A05"
), Conc.test1 = c(2, NA, 2, NA, NA, 2, NA, 2, NA, NA), Conc.test2 = c(NA,
2, NA, 2, NA, NA, 2, NA, 2, NA), Conc.Test3 = c(NA, NA, NA, 2,
NA, NA, NA, NA, 2, NA), Conc.test4 = c(NA, NA, 2, 2, NA, 2, NA,
2, 2, NA), Conc.test5 = c(NA, 2, NA, NA, NA, NA, 2, NA, NA, NA
), Conc.test6 = c(2, NA, NA, NA, NA, 2, NA, NA, NA, NA)), row.names = c(NA,
10L), class = "data.frame")
# block Well Conc.test1 Conc.test2 Conc.Test3 Conc.test4 Conc.test5 Conc.test6
# 1 1 A01 2 NA NA NA NA 2
# 2 1 A02 NA 2 NA NA 2 NA
# 3 1 A03 2 NA NA 2 NA NA
# 4 1 A04 NA 2 2 2 NA NA
# 5 1 A05 NA NA NA NA NA NA
# 6 2 A01 2 NA NA 2 NA 2
# 7 2 A02 NA 2 NA NA 2 NA
# 8 2 A03 2 NA NA 2 NA NA
# 9 2 A04 NA 2 2 2 NA NA
# 10 2 A05 NA NA NA NA NA NA
I would like to find per Block, per well the different tests used (and the amount of tests) and the concentration used, in the following format:
structure(list(block = c(1, 1, 1, 1, 2, 2, 2, 2), tests = c(2,
2, 2, 3, 3, 2, 2, 3), well = c("A01", "A02", "A03", "A04", "A01",
"A02", "A03", "A04"), C1 = c("test1", "test2", "test1", "test2",
"test1", "test2", "test1", "test2"), conc1 = c(2, 2, 2, 2, 2,
2, 2, 2), C2 = c("test6", "test5", "test4", "test3", "test6",
"test5", "test4", "test3"), con2 = c(2, 2, 2, 2, 2, 2, 2, 2),
C3 = c(NA, NA, NA, "test4", "test4", NA, NA, "test4"), conc3 = c(NA,
NA, NA, 2, 2, NA, NA, 2)), row.names = c(NA, 8L), class = "data.frame")
# block tests well C1 conc1 C2 con2 C3 conc3
# 1 1 2 A01 test1 2 test6 2 <NA> NA
# 2 1 2 A02 test2 2 test5 2 <NA> NA
# 3 1 2 A03 test1 2 test4 2 <NA> NA
# 4 1 3 A04 test2 2 test3 2 test4 2
# 5 2 3 A01 test1 2 test6 2 test4 2
# 6 2 2 A02 test2 2 test5 2 <NA> NA
# 7 2 2 A03 test1 2 test4 2 <NA> NA
# 8 2 3 A04 test2 2 test3 2 test4 2
I got this far, which kind of works, but I don't see how scalable this is.
output_df <- df %>%
pivot_longer(cols = starts_with("Conc"), names_to = "Test", values_to = "Concentration") %>%
filter(!is.na(Concentration)) %>%
group_by(block, Well) %>%
mutate(test_num = row_number()) %>%
pivot_wider(names_from = test_num, values_from = c(Test, Concentration),
names_sep = "") %>%
ungroup()
output_df <- output_df %>%
mutate(tests = rowSums(!is.na(select(., starts_with("Test"))))) %>%
select(block, Well, tests, everything()) %>%
rename_with(~gsub("Test", "C", .x), starts_with("Test")) %>%
rename_with(~gsub("Concentration", "conc", .x), starts_with("Concentration"))
I have the following data frame, with column containing block identifiers, well identifiers, and concentration values of test divided per column.
df <- structure(list(block = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), Well = c("A01",
"A02", "A03", "A04", "A05", "A01", "A02", "A03", "A04", "A05"
), Conc.test1 = c(2, NA, 2, NA, NA, 2, NA, 2, NA, NA), Conc.test2 = c(NA,
2, NA, 2, NA, NA, 2, NA, 2, NA), Conc.Test3 = c(NA, NA, NA, 2,
NA, NA, NA, NA, 2, NA), Conc.test4 = c(NA, NA, 2, 2, NA, 2, NA,
2, 2, NA), Conc.test5 = c(NA, 2, NA, NA, NA, NA, 2, NA, NA, NA
), Conc.test6 = c(2, NA, NA, NA, NA, 2, NA, NA, NA, NA)), row.names = c(NA,
10L), class = "data.frame")
# block Well Conc.test1 Conc.test2 Conc.Test3 Conc.test4 Conc.test5 Conc.test6
# 1 1 A01 2 NA NA NA NA 2
# 2 1 A02 NA 2 NA NA 2 NA
# 3 1 A03 2 NA NA 2 NA NA
# 4 1 A04 NA 2 2 2 NA NA
# 5 1 A05 NA NA NA NA NA NA
# 6 2 A01 2 NA NA 2 NA 2
# 7 2 A02 NA 2 NA NA 2 NA
# 8 2 A03 2 NA NA 2 NA NA
# 9 2 A04 NA 2 2 2 NA NA
# 10 2 A05 NA NA NA NA NA NA
I would like to find per Block, per well the different tests used (and the amount of tests) and the concentration used, in the following format:
structure(list(block = c(1, 1, 1, 1, 2, 2, 2, 2), tests = c(2,
2, 2, 3, 3, 2, 2, 3), well = c("A01", "A02", "A03", "A04", "A01",
"A02", "A03", "A04"), C1 = c("test1", "test2", "test1", "test2",
"test1", "test2", "test1", "test2"), conc1 = c(2, 2, 2, 2, 2,
2, 2, 2), C2 = c("test6", "test5", "test4", "test3", "test6",
"test5", "test4", "test3"), con2 = c(2, 2, 2, 2, 2, 2, 2, 2),
C3 = c(NA, NA, NA, "test4", "test4", NA, NA, "test4"), conc3 = c(NA,
NA, NA, 2, 2, NA, NA, 2)), row.names = c(NA, 8L), class = "data.frame")
# block tests well C1 conc1 C2 con2 C3 conc3
# 1 1 2 A01 test1 2 test6 2 <NA> NA
# 2 1 2 A02 test2 2 test5 2 <NA> NA
# 3 1 2 A03 test1 2 test4 2 <NA> NA
# 4 1 3 A04 test2 2 test3 2 test4 2
# 5 2 3 A01 test1 2 test6 2 test4 2
# 6 2 2 A02 test2 2 test5 2 <NA> NA
# 7 2 2 A03 test1 2 test4 2 <NA> NA
# 8 2 3 A04 test2 2 test3 2 test4 2
I got this far, which kind of works, but I don't see how scalable this is.
output_df <- df %>%
pivot_longer(cols = starts_with("Conc"), names_to = "Test", values_to = "Concentration") %>%
filter(!is.na(Concentration)) %>%
group_by(block, Well) %>%
mutate(test_num = row_number()) %>%
pivot_wider(names_from = test_num, values_from = c(Test, Concentration),
names_sep = "") %>%
ungroup()
output_df <- output_df %>%
mutate(tests = rowSums(!is.na(select(., starts_with("Test"))))) %>%
select(block, Well, tests, everything()) %>%
rename_with(~gsub("Test", "C", .x), starts_with("Test")) %>%
rename_with(~gsub("Concentration", "conc", .x), starts_with("Concentration"))
Share
Improve this question
edited Mar 27 at 9:09
Darren Tsai
36.3k5 gold badges25 silver badges57 bronze badges
asked Mar 27 at 7:46
szmpleszmple
4772 silver badges9 bronze badges
4
- Stack Overflow is not the place to ask others to write your code for you, though the community is very willing to help you solve specific problems you encounter whilst writing it for yourself. What have you tried so far and why didn't it work? – Limey Commented Mar 27 at 7:50
- @Limey yes ofcourse!! I understand that, I will update my question with my solutions – szmple Commented Mar 27 at 7:54
- As I see it, block 1 well A04 has 3 tests not 2? Same for block 2 well A04? Why does block 2 well A01 have 3 tests for you? Also, this is a very bad output format for any analysis. – user2974951 Commented Mar 27 at 7:54
- @user2974951 You are correct, I made a mistake with my expected output format. I will adjust, sorry about that – szmple Commented Mar 27 at 7:56
3 Answers
Reset to default 3In base R we can do
reshape(df, direction='l', v.names='Conc', timevar='C',
varying=grep('Conc\\.', names(df)),
times=tolower(sub('Conc\\.', '', grep('Conc\\.', names(df))))) |>
na.omit() |>
transform(tests=ave(Conc, block, Well, FUN=length),
id=ave(Conc, block, Well, FUN=seq)) |>
reshape(direction='w', timevar='id', idvar=c('block','Well'), v.names=c('C', 'Conc')) |>
`row.names<-`(NULL) |> sort_by(~block+Well)
where the last line is cosmetics. I recommend you to use either upper case or lower case to start a variable name, not both. If performance matters, we can optimise the transform
-step.
block Well tests C.1 Conc.1 C.2 Conc.2 C.3 Conc.3
1 1 A01 2 test1 2 test6 2 <NA> NA
5 1 A02 2 test2 2 test5 2 <NA> NA
2 1 A03 2 test1 2 test4 2 <NA> NA
6 1 A04 3 test2 2 test3 2 test4 2
3 2 A01 3 test1 2 test4 2 test6 2
7 2 A02 2 test2 2 test5 2 <NA> NA
4 2 A03 2 test1 2 test4 2 <NA> NA
8 2 A04 3 test2 2 test3 2 test4 2
You need pivot_longer
at first, creating index column by groups, and then pivot_wider
.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(Conc.test1:Conc.test6, names_prefix = "Conc.", names_to = "C",
values_to = "conc", values_drop_na = TRUE) %>%
mutate(id = row_number(), tests = n(), .by = c(block, Well)) %>%
pivot_wider(names_from = id, values_from = c(C, conc),
names_vary = "slowest", names_sep = "")
# # A tibble: 8 × 9
# block Well tests C1 conc1 C2 conc2 C3 conc3
# <dbl> <chr> <int> <chr> <dbl> <chr> <dbl> <chr> <dbl>
# 1 1 A01 2 test1 2 test6 2 NA NA
# 2 1 A02 2 test2 2 test5 2 NA NA
# 3 1 A03 2 test1 2 test4 2 NA NA
# 4 1 A04 3 test2 2 Test3 2 test4 2
# 5 2 A01 3 test1 2 test4 2 test6 2
# 6 2 A02 2 test2 2 test5 2 NA NA
# 7 2 A03 2 test1 2 test4 2 NA NA
# 8 2 A04 3 test2 2 Test3 2 test4 2
library(dplyr)
library(tidyr)
pivot_longer(df, -c(block, Well), names_to="C", values_to="conc") %>%
filter(!is.na(conc)) %>%
mutate(tests=n(), C=sub("Conc.", "", C, fixed=TRUE), seq=row_number(), .by=c(block, Well)) %>%
data.frame() %>%
reshape(direction="wide", timevar="seq", idvar=c("block","Well"), v.names=c("C", "conc"))
___
block Well tests C.1 conc.1 C.2 conc.2 C.3 conc.3
1 1 A01 2 test1 2 test6 2 <NA> NA
3 1 A02 2 test2 2 test5 2 <NA> NA
5 1 A03 2 test1 2 test4 2 <NA> NA
7 1 A04 3 test2 2 Test3 2 test4 2
10 2 A01 3 test1 2 test4 2 test6 2
13 2 A02 2 test2 2 test5 2 <NA> NA
15 2 A03 2 test1 2 test4 2 <NA> NA
17 2 A04 3 test2 2 Test3 2 test4 2