RUNX1 Plots

The following example demonstrates how to process raw ATAC-seq data obtained from the Sequence Read Archive (SRA). Specifically, the workflow includes data download, quality checking, trimming, alignment with Bowtie2, format conversion with Samtools, duplicate removal, and peak calling using MACS2. This complete pipeline ensures high-quality data preparation for generating BIT plots.

For illustration purposes, the pipeline below demonstrates processing for a single sample (RUNX1_WT_DN3_Rep1). However, in a complete analysis, three additional samples are required, including two replicates from knockout conditions and one additional replicate from the wild-type control.

#!/bin/bash
#SBATCH -J Fully_Process_Scripts_ATAC_seq
#SBATCH -o ./output/output-%j.out
#SBATCH -e ./error/error-%j.out
#SBATCH -p standard-s --mem=64G

# Change the directory as necessary
LOG_DIR="/lustre/work/client/users/zeyul/BIT/mm10/data/log/"
MM10_REF_DIR="/lustre/work/client/users/BIT/tools/mm10_ref_bowtie2/mm10"
DATA_DIR="/lustre/work/client/users/zeyul/BIT/mm10/data/"
FastQC_DIR="/lustre/work/client/users/zeyul/BIT/mm10/FastQC/"
PEAK_DIR="/lustre/work/client/users/zeyul/BIT/mm10/results/"
DiffBind_DIR="/lustre/work/client/users/zeyul/BIT/mm10/DiffBind/"

MIDDLE_DIR="RUNX1_KO/"

# Specific the final ATAC-seq filename and SRX number
ATAC_NAME_1="RUNX1_WT_Rep1"
SRX_NUMBER="SRR24493103"
#SRX_NUMBER="SRR24493101" WT Rep2
#SRX_NUMBER="SRR24493100" cKO Rep1
#SRX_NUMBER="SRR24493099" cKO Rep2

mkdir -p "${DATA_DIR}${MIDDLE_DIR}"
mkdir -p "${LOG_DIR}${MIDDLE_DIR}"
mkdir -p "${FastQC_DIR}${MIDDLE_DIR}"
mkdir -p "${PEAK_DIR}${MIDDLE_DIR}"

## Download Data
fasterq-dump --progress "$SRX_NUMBER" -o "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}.fastq"

## FastQC before Trimming
fastqc --noextract --nogroup -o "${FastQC_DIR}${MIDDLE_DIR}" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_1.fastq" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_2.fastq"

## Trimming with trim-galore
trim_galore --paired -q 20 --phred33 --length 25 -e 0.1 --stringency 4 -o "${DATA_DIR}${MIDDLE_DIR}" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_1.fastq" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_2.fastq"
mkdir -p "${PEAK_DIR}${MIDDLE_DIR}"

## FastQC post-trimming
fastqc --noextract --nogroup -o "${FastQC_DIR}${MIDDLE_DIR}" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_1_val_1.fq" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_2_val_2.fq"

## bowtie2 Alignment
bowtie2 --very-sensitive -X 2000 -p 8 -q --local \
-x "${MM10_REF_DIR}" \
-1 "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_1_val_1.fq" -2 "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_2_val_2.fq" \
-S "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}.sam"

## Samtools transfer to BAM
samtools view -h -S -b \
-o "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}.bam" \
"${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}.sam"
samtools sort -n -o "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_sorted.bam" -O BAM "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}.bam"
samtools fixmate -m "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_sorted.bam" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_fixmate.bam"
rm "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_sorted.bam"
samtools sort -o "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_sorted.bam" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_fixmate.bam"
samtools markdup -r -s "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_sorted.bam" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_Final.bam"
samtools index "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_Final.bam"

samtools index "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_Final.bam"

## Peak Calling
macs2 callpeak -f BAM -g mm -n "${PEAK_DIR}${MIDDLE_DIR}${ATAC_NAME_1}" -B -q 0.01 -t "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_Final.bam"

# For DiffBind
mv "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_Final.bam" "${DATA_DIR}${MIDDLE_DIR}${ATAC_NAME_1}_peaks.narrowPeak" "${PEAK_DIR}${MIDDLE_DIR}"

The processed files need to be put in the same folder for the downstream DiffBind analysis

../_images/Pic1.png

For DiffBind, we need to first formulate a samplesheet to list the datasets:

DiffBind Samplesheet

SampleID

Factor

Condition

Treatment

Replicate

bamReads

Peaks

PeakCaller

1

RUNX1_WT1

RUNX1

WT

full

1

RUNX1_WT_Rep1_Final.bam

RUNX1_WT_Rep1_peaks.narrowPeak

narrow

2

RUNX1_WT2

RUNX1

WT

full

2

RUNX1_WT_Rep2_Final.bam

RUNX1_WT_Rep2_peaks.narrowPeak

narrow

3

RUNX1_KO1

RUNX1

KO

full

1

RUNX1_cKO_Rep1_Final.bam

RUNX1_cKO_Rep1_peaks.narrowPeak

narrow

4

RUNX1_KO2

RUNX1

KO

full

2

RUNX1_cKO_Rep2_Final.bam

RUNX1_cKO_Rep2_peaks.narrowPeak

narrow

Next we need to use DiffBind to process and generate the differentially accessible region set.

library(DiffBind)
library(rtracklayer)

print(getwd())
samplesheet <- read.csv("RUNX1_KO_samplesheet.csv")
RUNX1_KO <- dba(sampleSheet = samplesheet)
RUNX1_KO <- dba.count(RUNX1_KO, minOverlap = 2)
RUNX1_KO <- dba.blacklist(RUNX1_KO, blacklist = DBA_BLACKLIST_MM10)
RUNX1_KO$config$cores = 64
RUNX1_KO$config$bUsePval = TRUE
RUNX1_KO$th = 0.20
RUNX1_KO_norm <- dba.normalize(RUNX1_KO, method = DBA_DESEQ2,
                               normalize = DBA_NORM_NATIVE,
                               library = DBA_LIBSIZE_BACKGROUND,
                               background = TRUE)
RUNX1_KO_ct <- dba.contrast(RUNX1_KO_norm, categories = DBA_CONDITION, minMembers = 2)
RUNX1_KO_results <- dba.analyze(RUNX1_KO_ct, method = DBA_DESEQ2)
RUNX1_KO <- dba.report(RUNX1_KO_results, file = "RUNX1_cKO_report.csv", th = 0.1)
export.bed(RUNX1_KO, "mm10_RUNX1.bed")

The exported BED file can be shown as:

# Load required library
library(rtracklayer)

# Define file path
RUNX1_BED <- "/Users/zeyulu/Desktop/Project/BIT/Input_Data/DARs/mm10_RUNX1.bed"

# Import BED file
import(RUNX1_BED)
Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
Warning message:
package ‘GenomeInfoDb’ was built under R version 4.4.2

GRanges object with 15652 ranges and 2 metadata columns:
          seqnames              ranges strand |        name     score
             <Rle>           <IRanges>  <Rle> | <character> <numeric>
      [1]     chr3 143678121-143678521      * |       40711         0
      [2]    chr12   21480229-21480629      * |       12636         0
      [3]     chr1 192778787-192779187      * |        4468         0
      [4]    chr13     9547929-9548329      * |       15220         0
      [5]     chr7   88206431-88206831      * |       55040         0
      ...      ...                 ...    ... .         ...       ...
  [15648]     chr1   52799363-52799763      * |         796         0
  [15649]    chr14 105700293-105700693      * |       20509         0
  [15650]    chr12 111072186-111072586      * |       14669         0
  [15651]    chr16   93031654-93032366      * |       25272         0
  [15652]     chr4 117190116-117190516      * |       43363         0
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

We next apply BIT to the exported BED file and generate the TR ranked table:

     TR   Theta_i     lower     upper  BIT_score BIT_score_lower BIT_score_upper Rank
 1  Bcl11b -2.182627 -2.212789 -2.156569 0.10132149      0.09860792      0.10371893    1
 2    Cbfb -2.238118 -2.305173 -2.180754 0.09637935      0.09069547      0.10149211    2
 3   Runx1 -2.269733 -2.286438 -2.255767 0.09366088      0.09225243      0.09485314    3
 4   Hdac2 -2.312496 -2.366128 -2.267778 0.09009334      0.08579231      0.09382692    4
 5   Runx3 -2.378822 -2.421391 -2.343063 0.08480192      0.08155599      0.08761877    5
 6    Cdk6 -2.488816 -2.570101 -2.415317 0.07664596      0.07108760      0.08201211    6
 7   Satb1 -2.551729 -2.589306 -2.517789 0.07231043      0.06982987      0.07462051    7
 8   Taf12 -2.552330 -2.737449 -2.421156 0.07227008      0.06079942      0.08157363    8
 9   Tfap4 -2.575503 -2.745504 -2.441114 0.07073173      0.06034110      0.08009081    9
10   Rag2 -2.576603 -2.604533 -2.550052 0.07065946      0.06884723      0.07242299   10

And we can plot the top 10 TRs:

# Load required libraries
library(ggplot2)

# Read the CSV file
RUNX1_Tab <- read.csv("/Users/zeyulu/Desktop/Project/BIT/revision_data/bin_width/1000/mm10_RUNX1_rank_table.csv", row.names = 1)

# Prepare data frame for visualization
data <- data.frame(
    Group = "Runx1 Knockout Top 10 TRs",
    Label = RUNX1_Tab[1:10, "TR"],
    Value = RUNX1_Tab[1:10, "BIT_score"],
    Upper = RUNX1_Tab[1:10, "BIT_score_upper"],
    Lower = RUNX1_Tab[1:10, "BIT_score_lower"]
)

# Convert Label to factor for ordering in the plot
data$Label <- factor(data$Label, levels = rev(data$Label))

# Create the plot using ggplot2
p1 <- ggplot(data, aes(x = Label, y = Value)) +
    geom_col(fill = "#C1E0E4", colour = "black", size = 0.25) +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.5, color = "black", size = 0.35) +
    coord_flip() +  # Flip coordinates for a horizontal bar plot
    facet_grid(. ~ Group, scales = "free_y") +  # Facet by group
    labs(title = "", x = "Top 10 Identified TRs", y = "") +
    scale_y_continuous(limits = c(0, 0.13), breaks = seq(0, 0.12, by = 0.03), expand = c(0, 0)) +
    theme_bw() +  # Minimal theme
    theme(
        axis.text.x = element_text(color = "black", size = 12),
        axis.text.y = element_text(color = "black", size = 12),
        axis.title.y = element_text(size = 14),
        legend.position = "none",
        plot.margin = unit(c(0, 0.5, 0, 0), "cm"),
        strip.background = element_rect(fill = "#DBD1B6"),
        strip.text = element_text(size = 12, colour = "black")
    )

# Display the plot
print(p1)
../_images/Pic2.png

We can also generate the plot by comparing the BIT-identified TRs from using increased accessibility regions versus decreased accessibility regions. We separate the regions in the DiffBind generated report based on fold enrichment:

# Define file path
RUNX1_report <- "./test/RUNX1_cKO_report.csv"

# Read the CSV file
RUNX1_report_tab <- read.csv(RUNX1_report)

# Display the first six rows
head(RUNX1_report_tab)
           Chr     Start       End     Conc  Conc_KO  Conc_WT      Fold      p.value          FDR
1         chr3 143678121 143678521 7.277114 5.857603 7.978584 -1.986227 8.208806e-23 5.317866e-18
2        chr12  21480229  21480629 6.594209 4.497721 7.414836 -2.670207 1.638762e-22 5.317866e-18
3         chr1 192778787 192779187 7.073688 5.673339 7.770717 -1.954896 1.053971e-20 2.280126e-16
4        chr13   9547929   9548329 7.420042 6.200425 8.071361 -1.751839 1.479800e-20 2.401012e-16
5         chr7  88206431  88206831 7.940753 7.026219 8.496063 -1.409733 9.299981e-20 1.207156e-15
6 chrUn_JH584304     55478     55878 8.066727 7.232130 8.592054 -1.310974 1.948392e-19 2.107543e-15

We separate the table based on the fold enrichment change (Column: Fold)

# Load required library
library(dplyr)

# Separate rows based on the 'Fold' column
positive_fold <- RUNX1_report_tab %>%
  dplyr::filter(Fold > 0) %>%
  dplyr::select(Chr, Start, End)

negative_fold <- RUNX1_report_tab %>%
  dplyr::filter(Fold < 0) %>%
  dplyr::select(Chr, Start, End)

# Define output file paths
positive_bed_file <- "./test/RUNX1_KO_increased.bed"
negative_bed_file <- "./test/RUNX1_KO_decreased.bed"

# Export to .bed files without headers and row names
write.table(positive_fold, file = positive_bed_file, sep = "\t",
            row.names = FALSE, col.names = FALSE, quote = FALSE)

write.table(negative_fold, file = negative_bed_file, sep = "\t",
            row.names = FALSE, col.names = FALSE, quote = FALSE)

# Print messages for confirmation
cat("Positive fold BED file saved to:", positive_bed_file, "\n")
cat("Negative fold BED file saved to:", negative_bed_file, "\n")
# Load required library
library(rtracklayer)

# Import BED files
RUNX1_KO_increased <- import("./test/RUNX1_KO_increased.bed")
RUNX1_KO_decreased <- import("./test/RUNX1_KO_decreased.bed")

# Display summary of imported GRanges objects
RUNX1_KO_increased
RUNX1_KO_decreased

Console Output

After executing the script, the following output is displayed:

Increased RUNX1 Binding Regions:

 GRanges object with 4054 ranges and 0 metadata columns:
             seqnames              ranges strand
                <Rle>           <IRanges>  <Rle>
   [1] chrUn_JH584304         26924-27323      *
   [2]           chr2 105358113-105358512      *
   [3]           chr7   12987445-12987844      *
   [4]           chr3 126628458-126628857      *
   [5]           chr3 144260784-144261183      *
   ...            ...                 ...    ...
[4050]          chr15   83563327-83563726      *
[4051]           chr5 100070187-100070586      *
[4052]           chr5 122274739-122275138      *
[4053]          chr16   93031655-93032366      *
[4054]           chr4 117190117-117190516      *
-------
seqinfo: 21 sequences from an unspecified genome; no seqlengths

Decreased RUNX1 Binding Regions:

 GRanges object with 11598 ranges and 0 metadata columns:
        seqnames              ranges strand
           <Rle>           <IRanges>  <Rle>
    [1]     chr3 143678122-143678521      *
    [2]    chr12   21480230-21480629      *
    [3]     chr1 192778788-192779187      *
    [4]    chr13     9547930-9548329      *
    [5]     chr7   88206432-88206831      *
    ...      ...                 ...    ...
[11594]     chr3   69177462-69177861      *
[11595]    chr19   49358664-49359063      *
[11596]     chr1   52799364-52799763      *
[11597]    chr14 105700294-105700693      *
[11598]    chr12 111072187-111072586      *
-------
seqinfo: 24 sequences from an unspecified genome; no seqlengths
BIT("./test/RUNX1_KO_increased.bed",output_path = "./test",genome="mm10")
BIT("./test/RUNX1_KO_decreased.bed",output_path = "./test",genome="mm10")

RUNX1_KO_increased_table<-read.csv("./test/RUNX1_KO_increased_ranked_table.csv",row.names=1)
RUNX1_KO_decreased_table<-read.csv("./test/RUNX1_KO_decreased_ranked_table.csv",row.names=1)

head(RUNX1_KO_increased_table,10)
head(RUNX1_KO_decreased_table,10)

Top 10 Transcription Factors with Increased RUNX1 Binding:

    TR   Theta_i     lower     upper  BIT_score BIT_score_lower BIT_score_upper Rank
 1    Cdk6 -2.269732 -2.346700 -2.191284 0.09366096      0.08732846      0.10053591    1
 2   Tfap4 -2.300801 -2.476280 -2.151916 0.09105663      0.07753786      0.10415227    2
 3   Nelfa -2.323227 -2.383067 -2.269906 0.08921751      0.08447306      0.09364617    3
 4  Prdm13 -2.349848 -2.779716 -2.113548 0.08707787      0.05843018      0.10778697    4
 5   Gtf2b -2.359566 -2.438576 -2.282325 0.08630845      0.08027799      0.09259738    5
 6    Hopx -2.403542 -2.842773 -2.158268 0.08290301      0.05505610      0.10356115    6
 7   Taf12 -2.406057 -2.580908 -2.261903 0.08271200      0.07037728      0.09432769    7
 8   Hcfc1 -2.415879 -2.497010 -2.340737 0.08196985      0.07606804      0.08780486    8
 9     Sp1 -2.439917 -2.478637 -2.403042 0.08017906      0.07736944      0.08294104    9
10  Smyd3 -2.440939 -2.883030 -2.210205 0.08010367      0.05299888      0.09883785   10

Top 10 Transcription Factors with Decreased RUNX1 Binding:

    TR   Theta_i     lower     upper  BIT_score BIT_score_lower BIT_score_upper Rank
 1   Runx1 -2.636133 -2.652424 -2.620706 0.06684889      0.06583978      0.06781762    1
 2   Runx3 -2.649796 -2.687546 -2.615387 0.06600161      0.06371226      0.06815469    2
 3  Bcl11b -2.669601 -2.707113 -2.636979 0.06479117      0.06255495      0.06679608    3
 4    Cbfb -2.687242 -2.759915 -2.626894 0.06373039      0.05952911      0.06742752    4
 5   Hdac2 -2.711423 -2.768347 -2.660408 0.06230267      0.05905879      0.06535040    5
 6   Satb1 -2.941911 -2.979089 -2.906946 0.05012021      0.04837957      0.05181126    6
 7    Mta2 -3.006050 -3.069515 -2.949720 0.04715331      0.04438241      0.04974976    7
 8   Ikzf1 -3.142564 -3.176731 -3.112130 0.04138527      0.04005082      0.04260968    8
 9    Chd4 -3.143628 -3.212739 -3.077606 0.04134311      0.03868915      0.04404050    9
10   Tcf7 -3.293910 -3.413210 -3.195326 0.03578072      0.03188515      0.03934199   10
../_images/Pic3.png