Simulation

This example demonstrates how to generate simulation plots from raw data as presented in the manuscript. In manuscript, we considered three simulation cases examining \(\mu\), \(\tau^2\), and \(\sigma_0^2\), along with two distribution variants: Gamma and Student’s \(t\) distributions. Below, we focus on the case of \(\mu\).

Generating simulation data

To generate the raw simulation data, we run the simulation script SIMU_MU.R, First, we import the required packages and configure the test values for \(\mu\):

library(BIT)
library(truncdist)

# Configuration parameters ---------------------------------------------------
TR_SIZES <- c(500, 1000, 1500)
MU_VALUES <- c(-5, -4.5, -4, -3.5, -3, -2.5)
TAU_SQ_VALUES <- c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5)
SIGMA0_VALUES <- c(1, 1.2, 1.4, 1.6, 1.8, 2.0)
BASE_DIR <- "./simulation/MU"
N_VALUE <- 30000
ITERATIONS <- 5000
BURN_IN <- 2500

Next, we define helper functions used to generate simulation data for TRs with either single or multiple observations (datasets):

# Helper functions --------------------------------------------------------
logistic_transform <- function(theta) {
  exp(theta) / (1 + exp(theta))
}

generate_TR_size <- function(num_TRs) {
  ceiling(rtrunc(num_TRs, spec = "lnorm",
                 meanlog = 1, sdlog = 1.75,
                 a = 0, b = 1000))
}

generate_multi_obs_TR <- function(n, mu, tau_sq, population) {
  TR_data <- list()
  TR_data$theta_i <- rnorm(1, mu, sqrt(tau_sq))
  TR_data$sigma_i <- rgamma(1, 0.75, 100)
  TR_data$theta_ij <- rnorm(n, TR_data$theta_i, sqrt(TR_data$sigma_i))
  TR_data$x_ij <- rbinom(n, population, logistic_transform(TR_data$theta_ij))
  TR_data$n_ij <- rep(population, n)
  TR_data
}

generate_single_obs_TR <- function(mu, tau_sq, sigma0_sq, population) {
  TR_data <- list()
  TR_data$theta_i <- rnorm(1, mu, sqrt(tau_sq))
  TR_data$theta_ij <- rnorm(1, TR_data$theta_i, sqrt(sigma0_sq))
  TR_data$x_ij <- rbinom(1, population, logistic_transform(TR_data$theta_ij))
  TR_data$n_ij <- population
  TR_data
}

With these helper functions in place, we define the main function for generating simulation data:

# Data generation functions -----------------------------------------------
generate_simulation_data <- function(
    num_TRs = 1000,
    population = 30000,
    mu = -4,
    tau_sq = 0.75,
    sigma0_sq = 1.5) {

  TR_sizes <- generate_TR_size(num_TRs)

  multi_obs_count <- sum(TR_sizes > 1)
  single_obs_count <- num_TRs - multi_obs_count
  ordered_sizes <- sort(TR_sizes[TR_sizes > 1])

  simulation_data <- vector("list", num_TRs)

  # Generate multi-observation TRs
  for (i in seq_len(multi_obs_count)) {
    simulation_data[[i]] <- generate_multi_obs_TR(
      ordered_sizes[i], mu, tau_sq, population
    )
  }

  # Generate single-observation TRs
  for (j in (multi_obs_count + 1):num_TRs) {
    simulation_data[[j]] <- generate_single_obs_TR(
      mu, tau_sq, sigma0_sq, population
    )
  }

  simulation_data
}

structure_simulation_data <- function(raw_data) {
  structured_data <- list(
    xij = unlist(lapply(raw_data, `[[`, "x_ij")),
    nij = unlist(lapply(raw_data, `[[`, "n_ij")),
    label_vec = rep(seq_along(raw_data), lengths(lapply(raw_data, `[[`, "x_ij"))),
    theta_i = unlist(lapply(raw_data, `[[`, "theta_i")),
    theta_ij = unlist(lapply(raw_data, `[[`, "theta_ij"))
  )
  structured_data
}

The simulation workflow consists of generating the data, running the main analysis, and saving the results:

# Simulation workflow ----------------------------------------------------
run_simulation <- function(mu_value, iterations, num_TRs, simulation_id) {
  data_dir <- file.path(BASE_DIR, "SIMU_DATA")
  result_dir <- file.path(BASE_DIR, "SIMU_RESULTS")
  log_dir <- file.path(BASE_DIR, "LOG")

  dir.create(data_dir, showWarnings = FALSE, recursive = TRUE)
  dir.create(result_dir, showWarnings = FALSE, recursive = TRUE)
  dir.create(log_dir, showWarnings = FALSE, recursive = TRUE)

  # Generate and save simulation data
  simulated_data <- generate_simulation_data(
    num_TRs, N_VALUE, mu_value, 0.75, 1.5
  )
  structured_data <- structure_simulation_data(simulated_data)

  data_path <- file.path(data_dir, sprintf("id_%d_data_sim_mu_%g_I_%d.rds",
                                           simulation_id, mu_value, num_TRs))
  log_path <- file.path(log_dir, sprintf("id_%d_data_sim_mu_%g_I_%d.txt",
                                         simulation_id, mu_value, num_TRs))
  saveRDS(structured_data, data_path)

  # Run main analysis
  analysis_results <- Main_Sampling(
    iterations,
    structured_data$xij,
    structured_data$nij,
    structured_data$label_vec,
    log_path
  )

  # Process and save results
  final_results <- list(
    mu = mean(analysis_results$mu0[(iterations - BURN_IN):iterations]),
    theta_i = rowMeans(analysis_results$theta_i[, (iterations - BURN_IN):iterations]),
    label_vec = structured_data$label_vec
  )

  result_path <- file.path(result_dir, sprintf("id_%d_res_sim_mu_%g_I_%d.rds",
                                               simulation_id, mu_value, num_TRs))
  saveRDS(final_results, result_path)
}

We run the simulation 100 times for each setting, using different total numbers of TRs (500, 1000, and 1500), and varying \(\mu\) range from \([-5,-2.5]\), while keeping \(\tau^2=0.75\) and \(\sigma_0^2=1.5\) as default values.

# Execution block ------------------------------------------------------
for (mu_value in MU_VALUES){
  for (sim_id in seq_len(100)) {
    for (tr_size in TR_SIZES) {
      run_simulation(
        mu_value = mu_value,
        iterations = ITERATIONS,
        num_TRs = tr_size,
        simulation_id = sim_id
      )
    }
  }
}

The simulation raw data and BIT derived results will be saved as *.rds files in two separate folders: ./simulation/MU/SIMU_DATA and ./simulation/MU/SIMU_RESULTS.

./simulation/MU/SIMU_DATA

./simulation/MU/SIMU_RESULTS

Next we need to calculate the mean squared error of \(\mu\) and spearman rho of estimated \(\hat{\theta}_i\) with true \(\theta_i\) from the raw data:

library(data.table)

logit_function<-function(x){
      return(log(x/(1-x)))
}

vec_logit<-Vectorize(logit_function)

Naive_Mu<-function(data){
  part1<-data$xij/data$nij
  part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
  part1<-vec_logit(part1)
  group_means <- tapply(part1, data$label_vec, mean)
  return(mean(group_means))
}

Naive_TAU2<-function(data){
      part1<-data$xij/data$nij
      part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
      part1<-vec_logit(part1)
      return(var(part1))
}

Naive_SIGMA0<-function(data,Mc){
      part1<-data$xij/data$nij
      part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
      part1<-vec_logit(part1)
      Mu_est<-Naive_Mu(data)
      part1_Mc<-part1[(length(part1)-Mc+1):length(part1)]
      return(sum((part1_Mc-Mu_est)^2)/(Mc-1))
}

Naive_Theta_i<-function(data){
      part1<-data$xij/data$nij
      part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
      part1<-vec_logit(part1)
      return(tapply(part1,data$label_vec,mean))
}


###################
work_dir_data<-"./simulation/MU/SIMU_DATA/"
work_dir_results<-"./simulation/MU/SIMU_RESULTS/"
output_dir<-"./simulation/RESULTS/MU/"
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE))

M_vec<-c(350,700,1050)
Mc_vec<-c(150,300,450)
MU<-c(-5,-4.5,-4,-3.5,-3,-2.5)


for(i in 1:3){
              output_mu_df<-data.frame(matrix(nrow=6,ncol=4))
              colnames(output_mu_df)<-c("BIT_Bias","BIT_MSE","Naive_Bias","Naive_MSE")
              for(k in 1:6){
                      output_theta_i_df<-data.frame(matrix(nrow=(M_vec[i]+Mc_vec[i]),ncol=6))
                      colnames(output_theta_i_df)<-c("BIT_Bias","BIT_MSE","Naive_Bias","Naive_MSE","Naive_Spearman","BIT_Spearman")

                      data_files<-list.files(work_dir_data,pattern=paste0("*_mu_",MU[k],"_I_",M_vec[i]+Mc_vec[i],".rds"))
                      results_files<-list.files(work_dir_results,pattern=paste0("*_mu_",MU[k],"_I_",M_vec[i]+Mc_vec[i],".rds"))

                      naive_MU_vec<-c()
                      naive_Theta_mat<-matrix(nrow=(M_vec[i]+Mc_vec[i]),ncol=100)

                      BIT_MU_vec<-c()
                      BIT_Theta_mat<-matrix(nrow=(M_vec[i]+Mc_vec[i]),ncol=100)

                      for(m in 1:100){
                      print(paste0(i,"_",k,"_",m))
                              data<-readRDS(paste0(work_dir_data,data_files[m]))
                              results<-readRDS(paste0(work_dir_results,results_files[m]))

                              label_rank<-rank(-data$theta_i)

                              true_mu<-MU[k]
                              true_Theta_i<-data$theta_i

                              naive_Mu<-Naive_Mu(data)
                              naive_Theta_i<-Naive_Theta_i(data)

                              naive_rank<-rank(-naive_Theta_i)
                              names(naive_rank)<-NULL

                              BIT_Mu<-results$mu
                              BIT_Theta_i<-results$theta_i[!duplicated(results$label_vec)]

                              BIT_rank<-rank(-BIT_Theta_i)

                              naive_MU_vec<-c(naive_MU_vec,naive_Mu-true_mu)
                              BIT_MU_vec<-c(BIT_MU_vec,BIT_Mu-true_mu)

                              naive_Theta_mat[,m]<-naive_Theta_i-true_Theta_i
                              BIT_Theta_mat[,m]<-BIT_Theta_i-true_Theta_i

                              spearman_naive<-cor(label_rank,naive_rank,method="spearman")
                              spearman_BIT<-cor(label_rank,BIT_rank,method="spearman")

                              output_theta_i_df[m,5]<-spearman_naive
                              output_theta_i_df[m,6]<-spearman_BIT
                      }
                      output_mu_df[k,1]<-mean(abs(BIT_MU_vec),na.rm=TRUE)
                      output_mu_df[k,2]<-mean(BIT_MU_vec^2,na.rm=TRUE)
                      output_mu_df[k,3]<-mean(abs(naive_MU_vec),na.rm=TRUE)
                      output_mu_df[k,4]<-mean(naive_MU_vec^2,na.rm=TRUE)

                      output_theta_i_df[,1]<-rowMeans(abs(BIT_Theta_mat),na.rm=TRUE)
                      output_theta_i_df[,2]<-rowMeans(BIT_Theta_mat^2,na.rm=TRUE)
                      output_theta_i_df[,3]<-rowMeans(abs(naive_Theta_mat),na.rm=TRUE)
                      output_theta_i_df[,4]<-rowMeans(naive_Theta_mat^2,na.rm=TRUE)

                      fwrite(output_theta_i_df,paste0(output_dir,"Theta_i_I_",M_vec[i]+Mc_vec[i],"_Mu_",MU[k],".csv"))
              }
              fwrite(output_mu_df,paste0(output_dir,"Mu_I_",M_vec[i]+Mc_vec[i],".csv"))
}

We will get tables as below:

./simulation/RESULTS/MU

Generating Figure. 2A

We generate the Fig2A_Mu.csv table from the summarized results:

work_dir_MU<-"./simulation/RESULTS/MU/"
I_vec<-c(500,1000,1500)

new_df<-data.frame(matrix(nrow=6,ncol=7))
colnames(new_df)[1]<-"mu"
new_df[,1]<-c(-5.0,-4.5,-4,-3.5,-3,-2.5)
for(i in 1:3){
  data_df<-read.csv(paste0(work_dir_MU,"Mu_I_",I_vec[i],".csv"))
  new_df[,i+1]<-data_df$BIT_MSE[c(1,2,3,4,5,6)]
  new_df[,3+i+1]<-data_df$Naive_MSE[c(1,2,3,4,5,6)]
}

colnames(new_df)<-c("MU","BIT500","BIT1000","BIT1500","Naive500","Naive1000","Naive1500")
write.csv(new_df,"./simulation/RESULTS/Fig2A_MU.csv",row.names=FALSE)

The Fig2A_MU.csv table should be:

Figure. 2A table
	MU	BIT500	BIT1000	BIT1500	Naive500	Naive1000	Naive1500
1	-5	0.0022580188135531	0.000818472327772802	0.000510569741043017	0.00245874128916261	0.00114969905279038	0.000769359289984737
2	-4.5	0.00212966515481292	0.00103666958740791	0.00064169937082568	0.00264156021151374	0.00101536653110342	0.000768920849747399
3	-4	0.00198518372879334	0.000681825887143333	0.000600056078685824	0.00212557745481653	0.000996199499101249	0.000692048219199762
4	-3.5	0.00165538478428067	0.000852297662144227	0.000485813251788291	0.00197356914261532	0.00112894933573882	0.000772942245152072
5	-3	0.00160390407139712	0.000971558635515894	0.000700264757159971	0.00198764288931968	0.00132907774386411	0.000847521525906551
6	-2.5	0.00188736351670382	0.000770553000802977	0.000572850943985385	0.00270199537402184	0.0010627703005949	0.000731624494748081

With the Fig2A_MU.csv table, we can now plot the MSE of BIT and naive methods:

MU_sim<-read.csv("./simulation/RESULTS/Fig2A_MU.csv")
long_data_mu <- pivot_longer(MU_sim, cols = -MU, names_to = "variable", values_to = "value")

p1<-ggplot(long_data_mu, aes(x = MU, y = value, color = variable, shape = variable)) +
  geom_line() +     # Add lines
  geom_point() +    # Add points
  scale_color_manual(values = c("BIT500" = colors_element1[1], "BIT1000" = colors_element1[2], "BIT1500" = colors_element1[3],"Naive500" = colors_element2[1], "Naive1000" = colors_element2[2], "Naive1500" = colors_element2[3]),
                     labels = c("BIT500" = "BIT (I=500)", "BIT1000" = "BIT (I=1000)", "BIT1500" = "BIT (I=1500)","Naive500" = "Naïve (I=500)", "Naive1000" = "Naïve (I=1000)", "Naive1500" = "Naïve (I=1500)"), name = "") +
  scale_shape_manual(values = c("BIT500" = 1, "BIT1000" = 2, "BIT1500" = 4,"Naive500" = 5, "Naive1000" = 8, "Naive1500" = 9),
                     labels = c("BIT500" = "BIT (I=500)", "BIT1000" = "BIT (I=1000)", "BIT1500" = "BIT (I=1500)","Naive500" = "Naïve (I=500)", "Naive1000" = "Naïve (I=1000)", "Naive1500" = "Naïve (I=1500)"),name = "") +
  labs(title = "", x = expression(bold(mu)), y = "MSE") +
  theme_bw() + theme(legend.position = "none",axis.text.x = element_text(size = 12,color="black"),  # Customizing x-axis tick labels
                     axis.text.y = element_text(size = 10,color="black"),  # Customizing y-axis tick labels
                     axis.title.x = element_text(size = 12,color="black"), # Customizing x-axis label
                     axis.title.y = element_text( size = 12,color="black"), # Customizing y-axis label
                     legend.text = element_text(size = 10,color="black"),  # Customizing legend text
                     legend.title = element_text( size = 12,color="black")  # Customizing legend title
  ) + scale_x_continuous(labels=c("-5","-4.5","-4","-3.5","-3","-2.5"))+scale_y_continuous(limits=c(0,0.0035),breaks=c(0,0.0010,0.0020,0.0030))

Which gives us:

Generating Figure. 2B

We generate the Fig2B_MU.csv table from the summarized results:

library(tidyverse)
work_dir_MU<-"./simulation/RESULTS/MU/"

I_vec<-c(500,1000,1500)
mu<-c(-5,-4.5,-4,-3.5,-3,-2.5)
THETA_MSE_MU<-data.frame(matrix(nrow=18,ncol=4))
colnames(THETA_MSE_MU)<-c("MU","I_SIZE","BIT_MSE_MEAN","Naive_MSE_MEAN")
THETA_MSE_MU$MU<-rep(mu,3)
THETA_MSE_MU$I_SIZE<-rep(I_vec,each=6)

for(i in 1:3){
  BIT_MSE_Mean<-c()
  Naive_MSE_Mean<-c()

  for(k in 1:6){
    data_df<-read.csv(paste0(work_dir_MU,"Theta_i_I_",I_vec[i],"_Mu_",mu[k],".csv"))
    BIT_MSE_Mean<-c(BIT_MSE_Mean,mean(data_df[,2],na.rm=TRUE))
    Naive_MSE_Mean<-c(Naive_MSE_Mean,mean(data_df[,4],na.rm=TRUE))
  }
  THETA_MSE_MU$BIT_MSE_MEAN[((1:6)+(i-1)*6)]<-BIT_MSE_Mean
  THETA_MSE_MU$Naive_MSE_MEAN[((1:6)+(i-1)*6)]<-Naive_MSE_Mean
}

THETA_MSE_MU <- THETA_MSE_MU %>%
    # Pivot the MSE columns to long format
    pivot_longer(
      cols = c(BIT_MSE_MEAN, Naive_MSE_MEAN),
      names_to = "method",
      values_to = "value"
    ) %>%
    # Create the group column by combining method and I_SIZE
    mutate(
      # Extract just "BIT" or "Naive" from the method names
      method = str_replace(method, "_MSE_MEAN", ""),
      # Create the group label in the desired format
      group = sprintf("%s (I=%d)", method, I_SIZE),
      # Rename mu column to match desired output
      mu = MU
    ) %>%
    # Select and arrange the final columns
    select(value, mu, group) %>%
    # Sort by mu and group
    arrange(mu, group)

write.csv(THETA_MSE_MU,"./simulation/RESULTS/Fig2B_MU.csv",row.names = FALSE)

The Fig2B_MU.csv table should be:

With the Fig2B_MU.csv table, we can now plot the MSE of BIT and naive methods:

Theta_MU_sim<-read.csv("./simulation/RESULTS/Fig2B_MU.csv")
plot1<-ggplot(Theta_MU_sim, aes(x = mu, y = value, color = group, shape = group)) +
  geom_line() +     # Add lines
  geom_point() +    # Add points
  scale_color_manual(values = c("BIT (I=500)" = colors_element1[1], "BIT (I=1000)" = colors_element1[2], "BIT (I=1500)" = colors_element1[3],"Naive (I=500)" = colors_element2[1], "Naive (I=1000)" = colors_element2[2], "Naive (I=1500)" = colors_element2[3]), name = "") +
  scale_shape_manual(values = c("BIT (I=500)" = 1, "BIT (I=1000)" = 2, "BIT (I=1500)" = 4,"Naive (I=500)" = 5, "Naive (I=1000)" = 8, "Naive (I=1500)" = 9),name = "") +
  labs(title = "", x = expression(bold(mu)), y = "Average of MSEs") +
  theme_bw() + theme(legend.position = "none",axis.text.x = element_text(size = 12,color="black"),  # Customizing x-axis tick labels
                     axis.text.y = element_text(size = 12,color="black"),  # Customizing y-axis tick labels
                     axis.title.x = element_text(size = 14,color="black"), # Customizing x-axis label
                     axis.title.y = element_text( size = 14,color="black"), # Customizing y-axis label
                     legend.text = element_text(size = 10,color="black"),  # Customizing legend text
                     legend.title = element_text( size = 12,color="black")  # Customizing legend title
  ) + scale_x_continuous(labels=c("-5","-4.5","-4","-3.5","-3","-2.5"))+scale_y_continuous(limits=c(0.1,0.5),breaks=c(0.1,0.2,0.3,0.4,0.5))

which gives the plot:

Generating Figure. 2C

We can also generate the Fig2C_MU.csv table:

work_dir_MU<-"./simulation/RESULTS/MU/"
work_files_MU<-list.files(work_dir_MU,pattern="Theta_i_I_*")
work_files_MU
I_vec<-c(500,1000,1500)
TR_level<-c("500","1000","1500")

mu<-c(-5,-4.5,-4,-3.5,-3,-2.5)
MU_spearman_df<-data.frame(matrix(nrow=3600,ncol=3))
colnames(MU_spearman_df)<-c("value","mu","group")
for(i in 1:6){
  for(j in 1:3){
    Theta_df<-as.data.frame(fread(paste0(work_dir_MU,"Theta_i_I_",I_vec[j],"_Mu_",mu[i],".csv")))
    index1<-(i-1)*600+(j-1)*200+1
    index2<-(i-1)*600+(j-1)*200+100
    index3<-(i-1)*600+(j-1)*200+101
    index4<-(i-1)*600+(j-1)*200+200
    MU_spearman_df[index1:index2,1]<-Theta_df$Naive_Spearman[1:100]
    MU_spearman_df[index3:index4,1]<-Theta_df$BIT_Spearman[1:100]
    MU_spearman_df[index1:index2,2]<-mu[i]
    MU_spearman_df[index3:index4,2]<-mu[i]
    MU_spearman_df[index1:index2,3]<-paste0("Naive (I=",TR_level[j],")")
    MU_spearman_df[index3:index4,3]<-paste0("BIT (I=",TR_level[j],")")
  }
}

df1<-MU_spearman_df
df1$mu<-as.factor(df1$mu)
df1$group<-factor(df1$group,levels=c("BIT (I=1500)","BIT (I=1000)","BIT (I=500)","Naive (I=1500)","Naive (I=1000)","Naive (I=500)"))

write.csv(df1,"./simulation/RESULTS/Fig2C_MU.csv",row.names=FALSE)

The Fig2C_MU.csv table should be:

With the Fig2C_MU.csv table, we can now plot the MSE of BIT and naive methods:

df1<-read.csv(paste0(DATA_DIR,"Fig2C_MU.csv"))
df1$mu<-as.factor(df1$mu)
df1$group<-factor(df1$group,levels=c("BIT (I=1500)","BIT (I=1000)","BIT (I=500)","Naive (I=1500)","Naive (I=1000)","Naive (I=500)"))

plot1<-ggplot(df1, aes(x = mu, y = value, fill = group)) +
  geom_boxplot(width = 0.7, size = 0.3,position = position_dodge(0.8), outlier.shape=NA)+
  ylim(c(0.7,1))+theme_bw()+theme(legend.position = "none",axis.text.x = element_text(size = 10,color="black"),  # Customizing x-axis tick labels
                                   axis.text.y = element_text( size = 10,color="black"),  # Customizing y-axis tick labels
                                   axis.title.x = element_text( size = 12,color="black"), # Customizing x-axis label
                                   axis.title.y = element_text( size = 12,color="black"), # Customizing y-axis label
                                   legend.text = element_text( size = 8,color="black"),  # Customizing legend text
                                   legend.title = element_text(size = 10,color="black")  # Customizing legend title
  )+
  scale_fill_manual(values = c(colors_element1[3:1], colors_element2[3:1]))+
  xlab(expression(mu))+ylab(expression(paste("Spearman ",rho)))

which gives the plot: