最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

r - Live logging with doParallel and cmdstanr - Stack Overflow

programmeradmin2浏览0评论

I have defined a Bayesian A/B test power analysis (sample size calculator) using cmdstanr. Since it is a computationally expensive process, I tried to save output as each loop ended in foreach.

library(cmdstanr)
library(doParallel)
library(data.table)

#-------------------------- Parameters
mde <- .007                                # Minimum detectable effect (0.07%)
mean_prior <- .01411629                    # Past experiment mean (1.4%)
sd_prior <- .103165                        # Past experiment SD (10%)
sample_size <- seq(1e6, 40e6, 1e6) |> rep(each = 1e2) # Range of sample sizes to test


#-------------------------- Stan Model
stan_model_code <-
"
data {
        int<lower=0> N;  // Sample size
        vector[N] y;  // Click-through rates
        real mu_prior;
        real sigma_prior;
      }
      
parameters {
              real mu;  // Mean click-through rate
            }

model {
        mu ~ normal(mu_prior, sigma_prior);
        y ~ normal(mu, 0.1); // Assume 10% SD for individual user CTR
      }
"

# Compile the model
stan_file <- write_stan_file(stan_model_code)
stan_model <- cmdstan_model(stan_file)

#-------------------------- Power Analysis Simulation Function
sim <- \(n)
{
    # Generate synthetic data for A and B groups
    y_A <- rnorm(n, mean = mean_prior, sd = 0.1)
    y_B <- rnorm(n, mean = mean_prior * (1 + mde), sd = 0.1)
    
    # Run Bayesian model for each group
    fit_A <- stan_model$sample(data = list(N = n, y = y_A, mu_prior = mean_prior, sigma_prior = sd_prior)
                               , iter_sampling = 2000
                               , chains = 4
                               , parallel_chains = 1
                               , refresh = 0
                               )
    
    fit_B <- stan_model$sample(data = list(N = n, y = y_B, mu_prior = mean_prior, sigma_prior = sd_prior)
                               , iter_sampling = 2000
                               , chains = 4
                               , parallel_chains = 1
                               , refresh = 0
                               )
    
    # Compute probability B > A
    return( list(n = n
                 , power = mean(fit_B$draws("mu") > fit_A$draws("mu"))
                 )
            )
}


#-------------------- set up cluster
n_cores <- parallel::detectCores() - 1
cl <- makeCluster(n_cores)
registerDoParallel(cl, cores = n_cores)


#-------------------- simulation - parallel running
h <- foreach(i = sample_size
             # , bine = c
             , .packages = c("data.table", "cmdstanr")
             , .inorder = FALSE
             ) %dopar%
  {
    x <- sim(i)
    fwrite(x = data.table(time = Sys.time()
                           , sample_size = i
                           , power = x[[2]]
                           )
           , file = 'log - Bayesian cmdstanr.txt'
           , row.names = F
           , append = T
           )
    return(x)
  }

However, fwrite only writes to log file when ALL loops are finished. This is risky since whole task takes a number of hours (PC crashing).

The same method of logging worked with another package so suspect it is to do with cmdstanr. Grateful for ideas.

发布评论

评论列表(0)

  1. 暂无评论