最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

r - Removing white spaces in a histogram? - Stack Overflow

programmeradmin1浏览0评论

I came across this question here in which someone was interested in simulating a game where two players randomly give each other money.

Using R, I tried to simulate multiple trajectories of this game to look at the distributions of different metrics:

    library(ggplot2)
    library(tidyverse)
    library(gridExtra)
    library(future)
    library(furrr)
    library(parallel)
    
    n_cores <- detectCores() - 1  
    plan(multisession, workers = n_cores)


run_money_simulation <- function(n_sims, n_exchanges, player_a_start, player_b_start) {
    set.seed(123) 
    
    sims_per_core <- ceiling(n_sims / n_cores)
    
    simulate_exchange <- function(n_exchanges, player_a_start, player_b_start) {
        person_a <- numeric(n_exchanges + 1)
        person_b <- numeric(n_exchanges + 1)
        
        person_a[1] <- player_a_start
        person_b[1] <- player_b_start
        
        for(i in 2:(n_exchanges + 1)) {
            change <- sample(c(-1, 1), 1)
            person_a[i] <- person_a[i-1] + change
            person_b[i] <- person_b[i-1] - change
        }
        
        return(list(
            final_diff = person_a[n_exchanges + 1] - person_b[n_exchanges + 1],
            max_diff = max(abs(person_a - person_b)),
            max_amount = max(c(max(person_a), max(person_b))),
            min_amount = min(c(min(person_a), min(person_b)))
        ))
    }
    
    start_time <- Sys.time()
    
    results <- future_map(1:n_sims, function(x) {
        simulate_exchange(n_exchanges, player_a_start, player_b_start)
    }, .options = furrr_options(seed = TRUE))
    
    end_time <- Sys.time()
    time_taken <- difftime(end_time, start_time, units = "secs")
 
    
    final_diffs <- sapply(results, `[[`, "final_diff")
    max_diffs <- sapply(results, `[[`, "max_diff")
    max_amounts <- sapply(results, `[[`, "max_amount")
    min_amounts <- sapply(results, `[[`, "min_amount")
    
    plot_data <- tibble(
        final_diff = final_diffs,
        max_diff = max_diffs,
        max_amount = max_amounts,
        min_amount = min_amounts
    ) %>%
        pivot_longer(everything(), 
                     names_to = "metric", 
                     values_to = "value")
    
    main_plot <- ggplot(plot_data, aes(x = value)) +
        geom_histogram(bins = 50, aes(fill = metric), color = "white", alpha = 0.7) +
        facet_wrap(~metric, scales = "free", ncol = 2) +
        scale_fill_manual(values = c("black", "red", "green4", "purple")) +
        labs(title = paste("Money Exchange Simulation Results\n",
                           "Starting amounts: A =", player_a_start, ", B =", player_b_start),
             subtitle = paste("Number of simulations:", n_sims, 
                              "| Exchanges per simulation:", n_exchanges,
                              "\nProcessed using", n_cores, "CPU cores in", 
                              round(time_taken, 2), "seconds"),
             x = "Value",
             y = "Count") +
        theme_bw() +
        theme(legend.position = "none")
    
    
    print(main_plot)
    
    
    invisible(list(
        final_diffs = final_diffs,
        max_diffs = max_diffs,
        max_amounts = max_amounts,
        min_amounts = min_amounts,
        parameters = list(
            n_sims = n_sims,
            n_exchanges = n_exchanges,
            player_a_start = player_a_start,
            player_b_start = player_b_start,
            n_cores = n_cores,
            processing_time = time_taken
        )
    ))
}

When I call the function:

run_money_simulation(100000, 100, 100, 100)

I get the following results:

I am just wondering - is there something I can do in ggplot that detects a scale break format such that the white spaces in these charts will be removed?

I came across this question here https://math.stackexchange/questions/2648895/why-does-fair-random-process-lead-to-unfair-result/5001301#5001301 in which someone was interested in simulating a game where two players randomly give each other money.

Using R, I tried to simulate multiple trajectories of this game to look at the distributions of different metrics:

    library(ggplot2)
    library(tidyverse)
    library(gridExtra)
    library(future)
    library(furrr)
    library(parallel)
    
    n_cores <- detectCores() - 1  
    plan(multisession, workers = n_cores)


run_money_simulation <- function(n_sims, n_exchanges, player_a_start, player_b_start) {
    set.seed(123) 
    
    sims_per_core <- ceiling(n_sims / n_cores)
    
    simulate_exchange <- function(n_exchanges, player_a_start, player_b_start) {
        person_a <- numeric(n_exchanges + 1)
        person_b <- numeric(n_exchanges + 1)
        
        person_a[1] <- player_a_start
        person_b[1] <- player_b_start
        
        for(i in 2:(n_exchanges + 1)) {
            change <- sample(c(-1, 1), 1)
            person_a[i] <- person_a[i-1] + change
            person_b[i] <- person_b[i-1] - change
        }
        
        return(list(
            final_diff = person_a[n_exchanges + 1] - person_b[n_exchanges + 1],
            max_diff = max(abs(person_a - person_b)),
            max_amount = max(c(max(person_a), max(person_b))),
            min_amount = min(c(min(person_a), min(person_b)))
        ))
    }
    
    start_time <- Sys.time()
    
    results <- future_map(1:n_sims, function(x) {
        simulate_exchange(n_exchanges, player_a_start, player_b_start)
    }, .options = furrr_options(seed = TRUE))
    
    end_time <- Sys.time()
    time_taken <- difftime(end_time, start_time, units = "secs")
 
    
    final_diffs <- sapply(results, `[[`, "final_diff")
    max_diffs <- sapply(results, `[[`, "max_diff")
    max_amounts <- sapply(results, `[[`, "max_amount")
    min_amounts <- sapply(results, `[[`, "min_amount")
    
    plot_data <- tibble(
        final_diff = final_diffs,
        max_diff = max_diffs,
        max_amount = max_amounts,
        min_amount = min_amounts
    ) %>%
        pivot_longer(everything(), 
                     names_to = "metric", 
                     values_to = "value")
    
    main_plot <- ggplot(plot_data, aes(x = value)) +
        geom_histogram(bins = 50, aes(fill = metric), color = "white", alpha = 0.7) +
        facet_wrap(~metric, scales = "free", ncol = 2) +
        scale_fill_manual(values = c("black", "red", "green4", "purple")) +
        labs(title = paste("Money Exchange Simulation Results\n",
                           "Starting amounts: A =", player_a_start, ", B =", player_b_start),
             subtitle = paste("Number of simulations:", n_sims, 
                              "| Exchanges per simulation:", n_exchanges,
                              "\nProcessed using", n_cores, "CPU cores in", 
                              round(time_taken, 2), "seconds"),
             x = "Value",
             y = "Count") +
        theme_bw() +
        theme(legend.position = "none")
    
    
    print(main_plot)
    
    
    invisible(list(
        final_diffs = final_diffs,
        max_diffs = max_diffs,
        max_amounts = max_amounts,
        min_amounts = min_amounts,
        parameters = list(
            n_sims = n_sims,
            n_exchanges = n_exchanges,
            player_a_start = player_a_start,
            player_b_start = player_b_start,
            n_cores = n_cores,
            processing_time = time_taken
        )
    ))
}

When I call the function:

run_money_simulation(100000, 100, 100, 100)

I get the following results:

I am just wondering - is there something I can do in ggplot that detects a scale break format such that the white spaces in these charts will be removed?

Share Improve this question asked Nov 20, 2024 at 21:33 farrow90farrow90 84510 bronze badges 6
  • 4 I'm not clear what you want. You just want to skip sections of the x-axis? What exactly are you trying to communicate via the plot? If you don't have a consistent scale, it's much harder to interpret the data. If this question is about plotting, it's a bit overkill to include a multithreaded simulation that needs to be run just to test the plotting. I think you just have discrete outcomes rather than continuous which a histogram assumes. Try bins=40. – MrFlick Commented Nov 20, 2024 at 21:46
  • alternatively, switch to geom_density() – GuedesBF Commented Nov 20, 2024 at 21:52
  • 2 Since your data is discrete, maybe use binwidth = 1 instead of specifying the number of bins, which can create bin spacings that don't align to integer values. – Jon Spring Commented Nov 20, 2024 at 22:32
  • 1 The problem is that you are using the wrong kind of graph. Histograms are for continuous data, and your data appear to be discrete. – user2554330 Commented Nov 20, 2024 at 22:32
  • 1 @user2554330 There is no reason that histograms cannot be made from discrete integer data it is just that the bin size must be an exact integer wide. The OP appears to have 14 bins over a range of 50 ~ 3.571429 channels per bin. I am at a loss to explain the empty gaps in his histograms (worst that should happen is some get 4 channels added in and most get 3 channels). Perhaps the OP could provide his raw data after setting bin width to 4 (ie range of 200 with say 50 bins). ie Width exactly 4 channels. – Martin Brown Commented Nov 20, 2024 at 23:02
 |  Show 1 more comment

2 Answers 2

Reset to default 1

The problem is that you have discrete data. Histograms are a kind of density estimate, designed for continuous data that actually has a density.

Looking at the top left figure final_diff, the data being shown is always a multiple of 4. Since your bin width is slightly smaller than 4, a couple of bins completely miss the data. You could avoid the gaps by setting the bin width to 4, but that's not the best solution. The best solution is to use a display that is designed for discrete data.

The usual display for this kind of discrete data is a bar plot showing counts of each observed value. You can get that in ggplot2 using geom_bar() instead of geom_histogram. For example,


#  ... unchanged code deleted ...

  main_plot <- ggplot(plot_data, aes(x = value)) +
    geom_bar(aes(fill = metric)) +
    facet_wrap(~metric, scales = "free", ncol = 2) +
    scale_fill_manual(values = c("black", "red", "green4", "purple")) +
    labs(title = paste("Money Exchange Simulation Results\n",
                       "Starting amounts: A =", player_a_start, ", B =", player_b_start),
         subtitle = paste("Number of simulations:", n_sims, 
                          "| Exchanges per simulation:", n_exchanges,
                          "\nProcessed using", n_cores, "CPU cores in", 
                          round(time_taken, 2), "seconds"),
         x = "Value",
         y = "Count") +
    theme_bw() +
    theme(legend.position = "none")
  
##  ... more unchanged code ...

Created on 2024-11-21 with reprex v2.1.1

This looks uglier than a histogram, but it is more faithful to the data. And it's less ugly in a higher resolution display, where you can see that all the bars and spaces between them are the same width.

Try adding xlim(c(<lower_limit>, <upper_limit>)). From there, you can create another function to find where the 90th percentile (or another threshold) of your data lies - input that into your upper or lower limits.

发布评论

评论列表(0)

  1. 暂无评论