Simulation
====================

This example demonstrates how to generate simulation plots from raw data as presented in the manuscript. In manuscript, we considered three simulation cases examining :math:`\mu`, :math:`\tau^2`, and :math:`\sigma_0^2`, along with two distribution variants: Gamma and Student’s :math:`t` distributions. Below, we focus on the case of :math:`\mu`.

Generating simulation data
------------------------

To generate the raw simulation data, we run the simulation script ``SIMU_MU.R``, First, we import the required packages and configure the test values for :math:`\mu`:

.. code-block:: r

  library(BIT)
  library(truncdist)

  # Configuration parameters ---------------------------------------------------
  TR_SIZES <- c(500, 1000, 1500)
  MU_VALUES <- c(-5, -4.5, -4, -3.5, -3, -2.5)
  TAU_SQ_VALUES <- c(0.5, 0.7, 0.9, 1.1, 1.3, 1.5)
  SIGMA0_VALUES <- c(1, 1.2, 1.4, 1.6, 1.8, 2.0)
  BASE_DIR <- "./simulation/MU"
  N_VALUE <- 30000
  ITERATIONS <- 5000
  BURN_IN <- 2500


Next, we define helper functions used to generate simulation data for TRs with either single or multiple observations (datasets):

.. code-block:: r

  # Helper functions --------------------------------------------------------
  logistic_transform <- function(theta) {
    exp(theta) / (1 + exp(theta))
  }

  generate_TR_size <- function(num_TRs) {
    ceiling(rtrunc(num_TRs, spec = "lnorm",
                   meanlog = 1, sdlog = 1.75,
                   a = 0, b = 1000))
  }

  generate_multi_obs_TR <- function(n, mu, tau_sq, population) {
    TR_data <- list()
    TR_data$theta_i <- rnorm(1, mu, sqrt(tau_sq))
    TR_data$sigma_i <- rgamma(1, 0.75, 100)
    TR_data$theta_ij <- rnorm(n, TR_data$theta_i, sqrt(TR_data$sigma_i))
    TR_data$x_ij <- rbinom(n, population, logistic_transform(TR_data$theta_ij))
    TR_data$n_ij <- rep(population, n)
    TR_data
  }

  generate_single_obs_TR <- function(mu, tau_sq, sigma0_sq, population) {
    TR_data <- list()
    TR_data$theta_i <- rnorm(1, mu, sqrt(tau_sq))
    TR_data$theta_ij <- rnorm(1, TR_data$theta_i, sqrt(sigma0_sq))
    TR_data$x_ij <- rbinom(1, population, logistic_transform(TR_data$theta_ij))
    TR_data$n_ij <- population
    TR_data
  }


With these helper functions in place, we define the main function for generating simulation data:

.. code-block:: r

  # Data generation functions -----------------------------------------------
  generate_simulation_data <- function(
      num_TRs = 1000,
      population = 30000,
      mu = -4,
      tau_sq = 0.75,
      sigma0_sq = 1.5) {

    TR_sizes <- generate_TR_size(num_TRs)

    multi_obs_count <- sum(TR_sizes > 1)
    single_obs_count <- num_TRs - multi_obs_count
    ordered_sizes <- sort(TR_sizes[TR_sizes > 1])

    simulation_data <- vector("list", num_TRs)

    # Generate multi-observation TRs
    for (i in seq_len(multi_obs_count)) {
      simulation_data[[i]] <- generate_multi_obs_TR(
        ordered_sizes[i], mu, tau_sq, population
      )
    }

    # Generate single-observation TRs
    for (j in (multi_obs_count + 1):num_TRs) {
      simulation_data[[j]] <- generate_single_obs_TR(
        mu, tau_sq, sigma0_sq, population
      )
    }

    simulation_data
  }

  structure_simulation_data <- function(raw_data) {
    structured_data <- list(
      xij = unlist(lapply(raw_data, `[[`, "x_ij")),
      nij = unlist(lapply(raw_data, `[[`, "n_ij")),
      label_vec = rep(seq_along(raw_data), lengths(lapply(raw_data, `[[`, "x_ij"))),
      theta_i = unlist(lapply(raw_data, `[[`, "theta_i")),
      theta_ij = unlist(lapply(raw_data, `[[`, "theta_ij"))
    )
    structured_data
  }

The simulation workflow consists of generating the data, running the main analysis, and saving the results:

.. code-block:: r

  # Simulation workflow ----------------------------------------------------
  run_simulation <- function(mu_value, iterations, num_TRs, simulation_id) {
    data_dir <- file.path(BASE_DIR, "SIMU_DATA")
    result_dir <- file.path(BASE_DIR, "SIMU_RESULTS")
    log_dir <- file.path(BASE_DIR, "LOG")

    dir.create(data_dir, showWarnings = FALSE, recursive = TRUE)
    dir.create(result_dir, showWarnings = FALSE, recursive = TRUE)
    dir.create(log_dir, showWarnings = FALSE, recursive = TRUE)

    # Generate and save simulation data
    simulated_data <- generate_simulation_data(
      num_TRs, N_VALUE, mu_value, 0.75, 1.5
    )
    structured_data <- structure_simulation_data(simulated_data)

    data_path <- file.path(data_dir, sprintf("id_%d_data_sim_mu_%g_I_%d.rds",
                                             simulation_id, mu_value, num_TRs))
    log_path <- file.path(log_dir, sprintf("id_%d_data_sim_mu_%g_I_%d.txt",
                                           simulation_id, mu_value, num_TRs))
    saveRDS(structured_data, data_path)

    # Run main analysis
    analysis_results <- Main_Sampling(
      iterations,
      structured_data$xij,
      structured_data$nij,
      structured_data$label_vec,
      log_path
    )

    # Process and save results
    final_results <- list(
      mu = mean(analysis_results$mu0[(iterations - BURN_IN):iterations]),
      theta_i = rowMeans(analysis_results$theta_i[, (iterations - BURN_IN):iterations]),
      label_vec = structured_data$label_vec
    )

    result_path <- file.path(result_dir, sprintf("id_%d_res_sim_mu_%g_I_%d.rds",
                                                 simulation_id, mu_value, num_TRs))
    saveRDS(final_results, result_path)
  }


We run the simulation 100 times for each setting, using different total numbers of TRs (500, 1000, and 1500), and varying :math:`\mu` range from :math:`[-5,-2.5]`, while keeping :math:`\tau^2=0.75` and :math:`\sigma_0^2=1.5` as default values.

.. code-block:: r

  # Execution block ------------------------------------------------------
  for (mu_value in MU_VALUES){
    for (sim_id in seq_len(100)) {
      for (tr_size in TR_SIZES) {
        run_simulation(
          mu_value = mu_value,
          iterations = ITERATIONS,
          num_TRs = tr_size,
          simulation_id = sim_id
        )
      }
    }
  }


The simulation raw data and BIT derived results will be saved as ``*.rds`` files in two separate folders: ``./simulation/MU/SIMU_DATA`` and ``./simulation/MU/SIMU_RESULTS``.

``./simulation/MU/SIMU_DATA``

.. image:: ../images/Examples/Simulation/Pic1.png

``./simulation/MU/SIMU_RESULTS``

.. image:: ../images/Examples/Simulation/Pic2.png


Next we need to calculate the mean squared error of :math:`\mu` and spearman rho of estimated :math:`\hat{\theta}_i` with true :math:`\theta_i` from the raw data:

.. code-block:: r

  library(data.table)

  logit_function<-function(x){
  	return(log(x/(1-x)))
  }

  vec_logit<-Vectorize(logit_function)

  Naive_Mu<-function(data){
    part1<-data$xij/data$nij
    part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
    part1<-vec_logit(part1)
    group_means <- tapply(part1, data$label_vec, mean)
    return(mean(group_means))
  }

  Naive_TAU2<-function(data){
  	part1<-data$xij/data$nij
  	part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
  	part1<-vec_logit(part1)
  	return(var(part1))
  }

  Naive_SIGMA0<-function(data,Mc){
  	part1<-data$xij/data$nij
  	part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
  	part1<-vec_logit(part1)
  	Mu_est<-Naive_Mu(data)
  	part1_Mc<-part1[(length(part1)-Mc+1):length(part1)]
  	return(sum((part1_Mc-Mu_est)^2)/(Mc-1))
  }

  Naive_Theta_i<-function(data){
  	part1<-data$xij/data$nij
  	part1[which(part1==0)]<-part1[which(part1==0)]+0.000001
  	part1<-vec_logit(part1)
  	return(tapply(part1,data$label_vec,mean))
  }


  ###################
  work_dir_data<-"./simulation/MU/SIMU_DATA/"
  work_dir_results<-"./simulation/MU/SIMU_RESULTS/"
  output_dir<-"./simulation/RESULTS/MU/"
  dir.create(output_dir, showWarnings = FALSE, recursive = TRUE))

  M_vec<-c(350,700,1050)
  Mc_vec<-c(150,300,450)
  MU<-c(-5,-4.5,-4,-3.5,-3,-2.5)


  for(i in 1:3){
  		output_mu_df<-data.frame(matrix(nrow=6,ncol=4))
  		colnames(output_mu_df)<-c("BIT_Bias","BIT_MSE","Naive_Bias","Naive_MSE")
  		for(k in 1:6){
  			output_theta_i_df<-data.frame(matrix(nrow=(M_vec[i]+Mc_vec[i]),ncol=6))
  			colnames(output_theta_i_df)<-c("BIT_Bias","BIT_MSE","Naive_Bias","Naive_MSE","Naive_Spearman","BIT_Spearman")

  			data_files<-list.files(work_dir_data,pattern=paste0("*_mu_",MU[k],"_I_",M_vec[i]+Mc_vec[i],".rds"))
  			results_files<-list.files(work_dir_results,pattern=paste0("*_mu_",MU[k],"_I_",M_vec[i]+Mc_vec[i],".rds"))

  			naive_MU_vec<-c()
  			naive_Theta_mat<-matrix(nrow=(M_vec[i]+Mc_vec[i]),ncol=100)

  			BIT_MU_vec<-c()
  			BIT_Theta_mat<-matrix(nrow=(M_vec[i]+Mc_vec[i]),ncol=100)

  			for(m in 1:100){
  			print(paste0(i,"_",k,"_",m))
  				data<-readRDS(paste0(work_dir_data,data_files[m]))
  				results<-readRDS(paste0(work_dir_results,results_files[m]))

  				label_rank<-rank(-data$theta_i)

  				true_mu<-MU[k]
  				true_Theta_i<-data$theta_i

  				naive_Mu<-Naive_Mu(data)
  				naive_Theta_i<-Naive_Theta_i(data)

  				naive_rank<-rank(-naive_Theta_i)
  				names(naive_rank)<-NULL

  				BIT_Mu<-results$mu
  				BIT_Theta_i<-results$theta_i[!duplicated(results$label_vec)]

  				BIT_rank<-rank(-BIT_Theta_i)

  				naive_MU_vec<-c(naive_MU_vec,naive_Mu-true_mu)
  				BIT_MU_vec<-c(BIT_MU_vec,BIT_Mu-true_mu)

  				naive_Theta_mat[,m]<-naive_Theta_i-true_Theta_i
  				BIT_Theta_mat[,m]<-BIT_Theta_i-true_Theta_i

  				spearman_naive<-cor(label_rank,naive_rank,method="spearman")
  				spearman_BIT<-cor(label_rank,BIT_rank,method="spearman")

  				output_theta_i_df[m,5]<-spearman_naive
  				output_theta_i_df[m,6]<-spearman_BIT
  			}
  			output_mu_df[k,1]<-mean(abs(BIT_MU_vec),na.rm=TRUE)
  			output_mu_df[k,2]<-mean(BIT_MU_vec^2,na.rm=TRUE)
  			output_mu_df[k,3]<-mean(abs(naive_MU_vec),na.rm=TRUE)
  			output_mu_df[k,4]<-mean(naive_MU_vec^2,na.rm=TRUE)

  			output_theta_i_df[,1]<-rowMeans(abs(BIT_Theta_mat),na.rm=TRUE)
  			output_theta_i_df[,2]<-rowMeans(BIT_Theta_mat^2,na.rm=TRUE)
  			output_theta_i_df[,3]<-rowMeans(abs(naive_Theta_mat),na.rm=TRUE)
  			output_theta_i_df[,4]<-rowMeans(naive_Theta_mat^2,na.rm=TRUE)

  			fwrite(output_theta_i_df,paste0(output_dir,"Theta_i_I_",M_vec[i]+Mc_vec[i],"_Mu_",MU[k],".csv"))
  		}
  		fwrite(output_mu_df,paste0(output_dir,"Mu_I_",M_vec[i]+Mc_vec[i],".csv"))
  }


We will get tables as below:

``./simulation/RESULTS/MU``

.. image:: ../images/Examples/Simulation/Pic3.png


Generating Figure. 2A
------------------------

We generate the `Fig2A_Mu.csv` table from the summarized results:

.. code-block:: r

  work_dir_MU<-"./simulation/RESULTS/MU/"
  I_vec<-c(500,1000,1500)

  new_df<-data.frame(matrix(nrow=6,ncol=7))
  colnames(new_df)[1]<-"mu"
  new_df[,1]<-c(-5.0,-4.5,-4,-3.5,-3,-2.5)
  for(i in 1:3){
    data_df<-read.csv(paste0(work_dir_MU,"Mu_I_",I_vec[i],".csv"))
    new_df[,i+1]<-data_df$BIT_MSE[c(1,2,3,4,5,6)]
    new_df[,3+i+1]<-data_df$Naive_MSE[c(1,2,3,4,5,6)]
  }

  colnames(new_df)<-c("MU","BIT500","BIT1000","BIT1500","Naive500","Naive1000","Naive1500")
  write.csv(new_df,"./simulation/RESULTS/Fig2A_MU.csv",row.names=FALSE)

The ``Fig2A_MU.csv`` table should be:

.. csv-table:: Figure. 2A table
   :file: ../tables/Examples/Simulation/Fig2A_MU.csv
   :header-rows: 1


With the ``Fig2A_MU.csv`` table, we can now plot the MSE of BIT and naive methods:

.. code-block:: r

  MU_sim<-read.csv("./simulation/RESULTS/Fig2A_MU.csv")
  long_data_mu <- pivot_longer(MU_sim, cols = -MU, names_to = "variable", values_to = "value")

  p1<-ggplot(long_data_mu, aes(x = MU, y = value, color = variable, shape = variable)) +
    geom_line() +     # Add lines
    geom_point() +    # Add points
    scale_color_manual(values = c("BIT500" = colors_element1[1], "BIT1000" = colors_element1[2], "BIT1500" = colors_element1[3],"Naive500" = colors_element2[1], "Naive1000" = colors_element2[2], "Naive1500" = colors_element2[3]),
                       labels = c("BIT500" = "BIT (I=500)", "BIT1000" = "BIT (I=1000)", "BIT1500" = "BIT (I=1500)","Naive500" = "Naïve (I=500)", "Naive1000" = "Naïve (I=1000)", "Naive1500" = "Naïve (I=1500)"), name = "") +
    scale_shape_manual(values = c("BIT500" = 1, "BIT1000" = 2, "BIT1500" = 4,"Naive500" = 5, "Naive1000" = 8, "Naive1500" = 9),
                       labels = c("BIT500" = "BIT (I=500)", "BIT1000" = "BIT (I=1000)", "BIT1500" = "BIT (I=1500)","Naive500" = "Naïve (I=500)", "Naive1000" = "Naïve (I=1000)", "Naive1500" = "Naïve (I=1500)"),name = "") +
    labs(title = "", x = expression(bold(mu)), y = "MSE") +
    theme_bw() + theme(legend.position = "none",axis.text.x = element_text(size = 12,color="black"),  # Customizing x-axis tick labels
                       axis.text.y = element_text(size = 10,color="black"),  # Customizing y-axis tick labels
                       axis.title.x = element_text(size = 12,color="black"), # Customizing x-axis label
                       axis.title.y = element_text( size = 12,color="black"), # Customizing y-axis label
                       legend.text = element_text(size = 10,color="black"),  # Customizing legend text
                       legend.title = element_text( size = 12,color="black")  # Customizing legend title
    ) + scale_x_continuous(labels=c("-5","-4.5","-4","-3.5","-3","-2.5"))+scale_y_continuous(limits=c(0,0.0035),breaks=c(0,0.0010,0.0020,0.0030))

Which gives us:

.. image:: ../images/Examples/Simulation/Pic4.png


Generating Figure. 2B
------------------------

We generate the ``Fig2B_MU.csv`` table from the summarized results:

.. code-block:: r

  library(tidyverse)
  work_dir_MU<-"./simulation/RESULTS/MU/"

  I_vec<-c(500,1000,1500)
  mu<-c(-5,-4.5,-4,-3.5,-3,-2.5)
  THETA_MSE_MU<-data.frame(matrix(nrow=18,ncol=4))
  colnames(THETA_MSE_MU)<-c("MU","I_SIZE","BIT_MSE_MEAN","Naive_MSE_MEAN")
  THETA_MSE_MU$MU<-rep(mu,3)
  THETA_MSE_MU$I_SIZE<-rep(I_vec,each=6)

  for(i in 1:3){
    BIT_MSE_Mean<-c()
    Naive_MSE_Mean<-c()

    for(k in 1:6){
      data_df<-read.csv(paste0(work_dir_MU,"Theta_i_I_",I_vec[i],"_Mu_",mu[k],".csv"))
      BIT_MSE_Mean<-c(BIT_MSE_Mean,mean(data_df[,2],na.rm=TRUE))
      Naive_MSE_Mean<-c(Naive_MSE_Mean,mean(data_df[,4],na.rm=TRUE))
    }
    THETA_MSE_MU$BIT_MSE_MEAN[((1:6)+(i-1)*6)]<-BIT_MSE_Mean
    THETA_MSE_MU$Naive_MSE_MEAN[((1:6)+(i-1)*6)]<-Naive_MSE_Mean
  }

  THETA_MSE_MU <- THETA_MSE_MU %>%
      # Pivot the MSE columns to long format
      pivot_longer(
        cols = c(BIT_MSE_MEAN, Naive_MSE_MEAN),
        names_to = "method",
        values_to = "value"
      ) %>%
      # Create the group column by combining method and I_SIZE
      mutate(
        # Extract just "BIT" or "Naive" from the method names
        method = str_replace(method, "_MSE_MEAN", ""),
        # Create the group label in the desired format
        group = sprintf("%s (I=%d)", method, I_SIZE),
        # Rename mu column to match desired output
        mu = MU
      ) %>%
      # Select and arrange the final columns
      select(value, mu, group) %>%
      # Sort by mu and group
      arrange(mu, group)

  write.csv(THETA_MSE_MU,"./simulation/RESULTS/Fig2B_MU.csv",row.names = FALSE)


The ``Fig2B_MU.csv`` table should be:

.. csv-table:: Figure. 2B table
   :file: ../tables/Examples/Simulation/Fig2B_MU.csv
   :header-rows: 1
   :lines: 1-10


With the ``Fig2B_MU.csv`` table, we can now plot the MSE of BIT and naive methods:

.. code-block:: r

  Theta_MU_sim<-read.csv("./simulation/RESULTS/Fig2B_MU.csv")
  plot1<-ggplot(Theta_MU_sim, aes(x = mu, y = value, color = group, shape = group)) +
    geom_line() +     # Add lines
    geom_point() +    # Add points
    scale_color_manual(values = c("BIT (I=500)" = colors_element1[1], "BIT (I=1000)" = colors_element1[2], "BIT (I=1500)" = colors_element1[3],"Naive (I=500)" = colors_element2[1], "Naive (I=1000)" = colors_element2[2], "Naive (I=1500)" = colors_element2[3]), name = "") +
    scale_shape_manual(values = c("BIT (I=500)" = 1, "BIT (I=1000)" = 2, "BIT (I=1500)" = 4,"Naive (I=500)" = 5, "Naive (I=1000)" = 8, "Naive (I=1500)" = 9),name = "") +
    labs(title = "", x = expression(bold(mu)), y = "Average of MSEs") +
    theme_bw() + theme(legend.position = "none",axis.text.x = element_text(size = 12,color="black"),  # Customizing x-axis tick labels
                       axis.text.y = element_text(size = 12,color="black"),  # Customizing y-axis tick labels
                       axis.title.x = element_text(size = 14,color="black"), # Customizing x-axis label
                       axis.title.y = element_text( size = 14,color="black"), # Customizing y-axis label
                       legend.text = element_text(size = 10,color="black"),  # Customizing legend text
                       legend.title = element_text( size = 12,color="black")  # Customizing legend title
    ) + scale_x_continuous(labels=c("-5","-4.5","-4","-3.5","-3","-2.5"))+scale_y_continuous(limits=c(0.1,0.5),breaks=c(0.1,0.2,0.3,0.4,0.5))


which gives the plot:

.. image:: ../images/Examples/Simulation/Pic5.png


Generating Figure. 2C
------------------------

We can also generate the ``Fig2C_MU.csv`` table:

.. code-block:: r

  work_dir_MU<-"./simulation/RESULTS/MU/"
  work_files_MU<-list.files(work_dir_MU,pattern="Theta_i_I_*")
  work_files_MU
  I_vec<-c(500,1000,1500)
  TR_level<-c("500","1000","1500")

  mu<-c(-5,-4.5,-4,-3.5,-3,-2.5)
  MU_spearman_df<-data.frame(matrix(nrow=3600,ncol=3))
  colnames(MU_spearman_df)<-c("value","mu","group")
  for(i in 1:6){
    for(j in 1:3){
      Theta_df<-as.data.frame(fread(paste0(work_dir_MU,"Theta_i_I_",I_vec[j],"_Mu_",mu[i],".csv")))
      index1<-(i-1)*600+(j-1)*200+1
      index2<-(i-1)*600+(j-1)*200+100
      index3<-(i-1)*600+(j-1)*200+101
      index4<-(i-1)*600+(j-1)*200+200
      MU_spearman_df[index1:index2,1]<-Theta_df$Naive_Spearman[1:100]
      MU_spearman_df[index3:index4,1]<-Theta_df$BIT_Spearman[1:100]
      MU_spearman_df[index1:index2,2]<-mu[i]
      MU_spearman_df[index3:index4,2]<-mu[i]
      MU_spearman_df[index1:index2,3]<-paste0("Naive (I=",TR_level[j],")")
      MU_spearman_df[index3:index4,3]<-paste0("BIT (I=",TR_level[j],")")
    }
  }

  df1<-MU_spearman_df
  df1$mu<-as.factor(df1$mu)
  df1$group<-factor(df1$group,levels=c("BIT (I=1500)","BIT (I=1000)","BIT (I=500)","Naive (I=1500)","Naive (I=1000)","Naive (I=500)"))

  write.csv(df1,"./simulation/RESULTS/Fig2C_MU.csv",row.names=FALSE)

The ``Fig2C_MU.csv`` table should be:

.. csv-table:: Figure. 2C table
   :file: ../tables/Examples/Simulation/Fig2C_MU.csv
   :header-rows: 1
   :lines: 1-10


With the ``Fig2C_MU.csv`` table, we can now plot the MSE of BIT and naive methods:

.. code-block:: r

  df1<-read.csv(paste0(DATA_DIR,"Fig2C_MU.csv"))
  df1$mu<-as.factor(df1$mu)
  df1$group<-factor(df1$group,levels=c("BIT (I=1500)","BIT (I=1000)","BIT (I=500)","Naive (I=1500)","Naive (I=1000)","Naive (I=500)"))

  plot1<-ggplot(df1, aes(x = mu, y = value, fill = group)) +
    geom_boxplot(width = 0.7, size = 0.3,position = position_dodge(0.8), outlier.shape=NA)+
    ylim(c(0.7,1))+theme_bw()+theme(legend.position = "none",axis.text.x = element_text(size = 10,color="black"),  # Customizing x-axis tick labels
                                     axis.text.y = element_text( size = 10,color="black"),  # Customizing y-axis tick labels
                                     axis.title.x = element_text( size = 12,color="black"), # Customizing x-axis label
                                     axis.title.y = element_text( size = 12,color="black"), # Customizing y-axis label
                                     legend.text = element_text( size = 8,color="black"),  # Customizing legend text
                                     legend.title = element_text(size = 10,color="black")  # Customizing legend title
    )+
    scale_fill_manual(values = c(colors_element1[3:1], colors_element2[3:1]))+
    xlab(expression(mu))+ylab(expression(paste("Spearman ",rho)))

which gives the plot:

.. image:: ../images/Examples/Simulation/Pic6.png