It's in the title, however here is an example
# df is the sample data
df <- data.frame(a = c(1:10), b= c(10:20), c = c(20:30), d= c(30:40))
rq_cols <- c(a,b,c)
#the desired output is:
fn <- as.formula(a + b + c + a:b + a:c + b:c + a:a + b:b + c:c)
#so the output can be inputted into a linear model
rq_lm <- lm(d ~ fn, data = df)
I've already made a function for simple multiple regression
fn <- as.formula(
paste("d ~", paste(names(df %>% select(cols)), collapse = " + ")))
and that worked a charm now I just need a version with the interaction variables
It's in the title, however here is an example
# df is the sample data
df <- data.frame(a = c(1:10), b= c(10:20), c = c(20:30), d= c(30:40))
rq_cols <- c(a,b,c)
#the desired output is:
fn <- as.formula(a + b + c + a:b + a:c + b:c + a:a + b:b + c:c)
#so the output can be inputted into a linear model
rq_lm <- lm(d ~ fn, data = df)
I've already made a function for simple multiple regression
fn <- as.formula(
paste("d ~", paste(names(df %>% select(cols)), collapse = " + ")))
and that worked a charm now I just need a version with the interaction variables
Share Improve this question asked Mar 5 at 19:51 ben_bjrben_bjr 314 bronze badges 3 |2 Answers
Reset to default 2We could expand @greg-snow's solution by adding polynomials to the interactions, wrapped in a function.
poly_fun <- \(data, lhs, degree=2) {
sprintf('%s ~ .^%s + %s', lhs, degree,
paste(
unlist(lapply(2:degree, \(i) {
sprintf('I(%s^%s)', setdiff(names(data), lhs), i)
})),
collapse=' + ')) |>
as.formula()
}
Usage
> poly_fun(df, 'd')
d ~ .^2 + I(a^2) + I(b^2) + I(c^2)
<environment: 0x56133b964f88>
> poly_fun(df, 'b', 3)
b ~ .^3 + I(a^2) + I(c^2) + I(d^2) + I(a^3) + I(c^3) + I(d^3)
<environment: 0x56133b95d7a0>
> df |> poly_fun('d', 4)
d ~ .^4 + I(a^2) + I(b^2) + I(c^2) + I(a^3) + I(b^3) + I(c^3) +
I(a^4) + I(b^4) + I(c^4)
<environment: 0x56133b954140>
>
> lm(poly_fun(df, 'd'), df) |> summary()
Call:
lm(formula = poly_fun(df, "d"), data = df)
Residuals:
Min 1Q Median 3Q Max
-8.1820 -0.7334 -0.0013 0.7376 11.0838
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.002851 0.005706 701.559 < 2e-16 ***
a 3.445082 0.003598 957.425 < 2e-16 ***
b 2.391961 0.003602 664.062 < 2e-16 ***
c 1.334112 0.003609 369.631 < 2e-16 ***
I(a^2) 1.496468 0.002551 586.598 < 2e-16 ***
I(b^2) 1.297684 0.002538 511.348 < 2e-16 ***
I(c^2) 1.100695 0.002548 432.021 < 2e-16 ***
a:b 0.039713 0.003588 11.069 < 2e-16 ***
a:c 0.041571 0.003580 11.611 < 2e-16 ***
b:c 0.022472 0.003606 6.233 4.61e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.141 on 99990 degrees of freedom
Multiple R-squared: 0.9579, Adjusted R-squared: 0.9579
F-statistic: 2.525e+05 on 9 and 99990 DF, p-value: < 2.2e-16
> df |> poly_fun('d', 3) |> lm(data=df) |> summary()
Call:
lm(formula = poly_fun(df, "d", 3), data = df)
Residuals:
Min 1Q Median 3Q Max
-4.3661 -0.6719 0.0016 0.6763 4.4760
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.000443 0.004985 802.511 < 2e-16 ***
a 2.994718 0.005030 595.323 < 2e-16 ***
b 2.001774 0.004991 401.113 < 2e-16 ***
c 0.999756 0.004960 201.555 < 2e-16 ***
I(a^2) 1.498140 0.002229 672.150 < 2e-16 ***
I(b^2) 1.302253 0.002218 587.159 < 2e-16 ***
I(c^2) 1.099046 0.002226 493.730 < 2e-16 ***
I(a^3) 0.150429 0.001311 114.749 < 2e-16 ***
I(b^3) 0.129734 0.001283 101.084 < 2e-16 ***
I(c^3) 0.110257 0.001273 86.603 < 2e-16 ***
a:b 0.030196 0.003136 9.630 < 2e-16 ***
a:c 0.036817 0.003129 11.767 < 2e-16 ***
b:c 0.021395 0.003150 6.792 1.12e-11 ***
a:b:c 0.007114 0.003123 2.278 0.0228 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9972 on 99986 degrees of freedom
Multiple R-squared: 0.9678, Adjusted R-squared: 0.9678
F-statistic: 2.314e+05 on 13 and 99986 DF, p-value: < 2.2e-16
Data:
set.seed(42)
n <- 1e5
df <- data.frame(matrix(rnorm(n*3), n, 3)) |> setNames(letters[1:3]) |>
transform(d=4 +
3*a + 2*b + c +
.9*a^2 + .8*b^2 + .7*c^2 + .6*a^2 + .5*b^2 + .4*c^2 +
.09*a^3 + .08*b^3 + .07*c^3 + .06*a^3 + .05*b^3 + .04*c^3 +
.03*a*b + .03*a*c + .02*b*c + .01*a*b*c +
rnorm(n))
You can use model.matrix
to get the main effects and two-way interactions. The two-degree polynomials require pasting separately. Then use reformulate
to generate the required formula:
library(dplyr)
library(purrr) # for plucking the model.matrix
fn <- reformulate(
paste(c(
df |>
select(all_of(rq_cols)) |>
model.matrix( ~ -1 + .^2, data=_) |>
dimnames() |>
pluck(2),
paste(rq_cols, rq_cols, sep=":")), collapse="+"), # See note
response='d')
which gives
d ~ a + b + c + a:b + a:c + b:c + a:a + b:b + c:c
Which is your expected output.
However, when you run your model:
rq_lm <- lm(fn, data = df)
You won't get the polynomial terms because that is not the way to specify them in lm
.
If you want the polynomial terms, you can replace the noted line above with
sapply(rq_cols, \(x) paste0("I(", x, "^2)"))
which gives
d ~ a + b + c + a:b + a:c + b:c + I(a^2) + I(b^2) + I(c^2)
lm(fn, data = df)
Call:
lm(formula = fn, data = df)
Coefficients:
(Intercept) a b c I(a^2)
0.14155 -0.12365 -0.11968 0.02001 -0.14168
I(b^2) I(c^2) a:b a:c b:c
0.12148 -0.06831 -0.38764 -0.14881 0.07675
Note that specifying lm(d ~ fn, data = df)
won't work because d ~ fn
must be a formula. You can't just add fn
to d~
and expect the modeling to succeed.
If you want orthonormal polynomials, consider using poly(x)
instead of I(x^2)
. Then we don't need to add the main effects.
fn <- reformulate(
paste(c(
df |>
select(all_of(rq_cols)) |>
model.matrix(~ -1 + .^2, data=_) |>
dimnames() |>
pluck(2) |>
grep(pattern=":", value=TRUE), # get only interaction terms
sapply(rq_cols, \(x) paste0("poly(", x, ",2)"))
),
collapse="+"),
response='d')
Data:
set.seed(1)
df <- data.frame(a=rnorm(100), b=rnorm(100),
c=rnorm(100), d=rnorm(100), e=rnorm(100))
rq_cols <- c('a', 'b', 'c')
df0 = data.frame(a=0:10, b=10:20, c=20:30, d=30:40); rq_cols = c('a', 'b', 'c'); df0 |> subset(select = c(rq_cols, 'd')) |> lm(d~.^2, data=_)
?df
corrected, renamed.rq_cols
assignment made valid R code. dropping irrelevant summands. – Friede Commented Mar 5 at 21:01