10xGenomics PBMCs cell-type-specific TRs

We utilize SnapATAC2 to process single-cell ATAC-seq (scATAC-seq) data obtained from 10xGENOMICS. . The detailed usage guide for SnapATAC2 can be found in the SnapATAC2-manual. We used SnapATAC2 v2.6.0 for our analysis.

For improved efficiency, some steps require a GPU to accelerate model training.

# -----------------------------
# Import Required Libraries
# -----------------------------
import warnings
warnings.filterwarnings("ignore")  # Ignore warning messages for cleaner output

import snapatac2 as snap         # Library for scATAC-seq analysis
import anndata as ad             # Handling annotated data matrices
import pandas as pd              # Data manipulation and analysis
import scanpy as sc              # Single-cell analysis package
import scvi                     # Probabilistic modeling for single-cell omics data
import os                       # For directory operations
import matplotlib.pyplot as plt # For plotting

# -----------------------------
# Set Global Settings & Seed
# -----------------------------
scvi.settings.seed = 2          # Set random seed for reproducibility
print("i")                     # Print a simple indicator (optional)

# -----------------------------
# Data Loading and Preprocessing
# -----------------------------
# Load a reference dataset from the provided multiome dataset
reference = snap.read(snap.datasets.pbmc10k_multiome(), backed=None)

# Load an ATAC dataset from a local file
atac = snap.read("./10XGENOMICS/pbmc.h5ad", backed=None)

# Generate a gene expression matrix from the ATAC dataset using gene annotations for hg38
query = snap.pp.make_gene_matrix(atac, gene_anno=snap.genome.hg38)

# Initialize the cell type annotation for the query dataset as missing values
query.obs['cell_type'] = pd.NA

# Concatenate the reference and query datasets into one AnnData object
data = ad.concat(
    [reference, query],
    join='inner',                # Keep only features present in both datasets
    label='batch',               # Create a batch label to differentiate datasets
    keys=["reference", "query"],
    index_unique='_'
)

# -----------------------------
# Filter and Identify Highly Variable Genes
# -----------------------------
sc.pp.filter_genes(data, min_cells=5)  # Keep genes expressed in at least 5 cells

# Identify and subset to the top 5000 highly variable genes using Seurat v3 method
sc.pp.highly_variable_genes(
    data,
    n_top_genes=5000,
    flavor="seurat_v3",
    batch_key="batch",  # Consider batch effects when identifying variable genes
    subset=True
)

# -----------------------------
# Setup and Train SCVI Model
# -----------------------------
# Configure the AnnData object for SCVI with batch key
scvi.model.SCVI.setup_anndata(data, batch_key="batch")

# Initialize the SCVI model with 2 layers and 30 latent dimensions
vae = scvi.model.SCVI(
    data,
    n_layers=2,
    n_latent=30,
    gene_likelihood="nb",     # Negative binomial likelihood for gene expression
    dispersion="gene-batch"   # Model dispersion per gene and batch
)

# Train the SCVI model (using early stopping to prevent overfitting)
vae.train(max_epochs=2000, early_stopping=True)

# -----------------------------
# Prepare Data for SCANVI
# -----------------------------
# Initialize a new column for cell type labels with a default value 'Unknown'
data.obs["celltype_scanvi"] = 'Unknown'

# Identify the reference cells (where batch is "reference")
ref_idx = data.obs['batch'] == "reference"
# For reference cells, assign the known cell type labels
data.obs["celltype_scanvi"][ref_idx] = data.obs['cell_type'][ref_idx]

# Initialize the SCANVI model using the pretrained SCVI model and provided labels
lvae = scvi.model.SCANVI.from_scvi_model(
    vae,
    adata=data,
    labels_key="celltype_scanvi",
    unlabeled_category="Unknown"
)

# Train the SCANVI model further with a maximum of 2000 epochs and sample 100 cells per label
lvae.train(max_epochs=2000, n_samples_per_label=100)

# -----------------------------
# Obtain Predictions and Latent Representation
# -----------------------------
# Predict cell type labels using SCANVI and store the predictions in a new column
data.obs["C_scANVI"] = lvae.predict(data)
# Get the latent representation from SCANVI for downstream analysis and visualization
data.obsm["X_scANVI"] = lvae.get_latent_representation(data)

# -----------------------------
# Compute UMAP Embedding
# -----------------------------
# Compute the nearest neighbors using the SCANVI latent representation
sc.pp.neighbors(data, use_rep="X_scANVI")
# Calculate UMAP coordinates for visualization
sc.tl.umap(data)

# Save the processed AnnData object with UMAP embedding to disk (compressed)
data.write("./10XGENOMICSpbmc10k.h5ad", compression="gzip")

# -----------------------------
# Annotate ATAC Data with Predicted Cell Types
# -----------------------------
# Map predicted cell type labels from the integrated data to the original ATAC dataset
atac.obs['Cell_Types'] = data.obs.loc[atac.obs_names + '_query']['C_scANVI'].to_numpy()

# Save the ATAC data with annotations
atac.write("./10XGENOMICS/pbmc10k_annotated.h5ad", compression="gzip")

# -----------------------------
# Peak Calling and Matrix Generation
# -----------------------------
# Run MACS3 peak calling on the data grouped by merged cell types
snap.tl.macs3(data, groupby='Cell_Types')
# Merge peaks across groups using hg38 annotations
peaks = snap.tl.merge_peaks(data.uns['macs3'], snap.genome.hg38)
# Create a peak accessibility matrix using the merged peaks
peak_mat = snap.pp.make_peak_matrix(data, use_rep=peaks['Peaks'])

# Create a new directory to save CSV outputs, using the current seed in the folder name
os.mkdir("./10XGENOMICS/csv/scATAC_Peaks_" + str(scvi.settings.seed))

# -----------------------------
# Set Plotting Parameters for High Resolution
# -----------------------------
plt.rcParams['figure.dpi'] = 1000    # High resolution for displaying figures
plt.rcParams['savefig.dpi'] = 1000   # High resolution for saving figures
plt.rcParams['figure.figsize'] = [8, 8]  # Set default figure size

# -----------------------------
# UMAP Plotting of Cell Types
# -----------------------------
# Compute UMAP embedding with a fixed random state for reproducibility
snap.tl.umap(data, random_state=15)

# Plot the UMAP embedding colored by cell types with custom styling
sc.pl.umap(
    data,
    color="Cell_Types",
    size=15,                # Increase point size for visibility
    alpha=0.9,              # Slight transparency to visualize density
    legend_fontsize=20,
    legend_fontweight='bold',
    frameon=True,           # Display the frame for clarity
    ncols=2,                # Organize legend into two columns
    show=False,
    save='umap_plot_Test.pdf'
)

# -----------------------------
# Process and Save Gene Expression Matrix
# -----------------------------
# Create a gene expression matrix using hg38 annotations
gene_matrix = snap.pp.make_gene_matrix(data, snap.genome.hg38)
print(gene_matrix)

# Filter genes expressed in fewer than 5 cells
sc.pp.filter_genes(gene_matrix, min_cells=5)
# Normalize total counts per cell
sc.pp.normalize_total(gene_matrix)
# Log-transform the normalized counts
sc.pp.log1p(gene_matrix)

# Apply MAGIC imputation (approximate solver) to smooth gene expression data
sc.external.pp.magic(gene_matrix, solver="approximate")
# Transfer UMAP coordinates from 'data' to gene_matrix for consistent visualization
gene_matrix.obsm["X_umap"] = data.obsm["X_umap"]

# Save the processed gene matrix to disk (compressed)
gene_matrix.write("pbmc10k_gene_mat.h5ad", compression='gzip')

# Set global figure parameters for scanpy plots
sc.set_figure_params(scanpy=True, dpi=1000, dpi_save=1000, fontsize=24, figsize=[10, 10])

# -----------------------------
# Plot Marker Genes on UMAP
# -----------------------------
marker_genes = []  # Define marker genes to visualize on UMAP (fill in with gene names)

# Reload the gene matrix to ensure data consistency
gene_matrix = snap.read("pbmc10k_gene_mat.h5ad", backed=None)

# Loop through each marker gene and plot its expression on the UMAP
for i in range(len(marker_genes)):
    sc.pl.umap(
        gene_matrix,
        use_raw=False,
        color=marker_genes[i],
        size=15,                # Increase point size for better visibility
        alpha=0.9,              # Slight transparency to indicate density
        frameon=True,           # Show a frame around the plot
        ncols=5,                # Organize legends into 5 columns if needed
        show=False,
        save='umap_plot_Test_UMAP_PBMCs_' + marker_genes[i] + '.pdf',
        color_map="plasma"      # Use the 'plasma' color map for gene expression
    )

The previous pipeline can generate the below UMAP plots:

Colored by cell types:

Colored by PAX5 gene activity:

We also get the marker peaks of each cell type:

marker_peaks=snap.tl.marker_regions(peak_mat, groupby='Cell_Types', pvalue=0.01)
for keys in marker_peaks.keys():
        elements=marker_peaks[keys]
        chromosomes = []
        starts = []
        ends = []
        for element in elements:
            # Split each element into chromosome, start, and end
            chromosome, positions = element.split(':')
            start, end = positions.split('-')
            # Append the results to the corresponding lists
            chromosomes.append(chromosome)
            starts.append(start)
            ends.append(end)
        df = pd.DataFrame({'Chrom': chromosomes,'Start': starts,'End': ends})
        df.to_csv("./10XGENOMICS/csv/scATAC_Peaks_"+scvi.settings.seed+"/"+keys.replace(" ","_")+".csv",index=False)
    print("./10XGENOMICS/csv/scATAC_Peaks_"+scvi.settings.seed+" Done")

cell-type-specific marker regions:

We run BIT on each of the region set:

work_dir<-"./10XGENOMICS/csv/"
work_files<-list.files(work_dir)
output_path<-"./10XGENOMICS/bit/"
dir.create(output_path, showWarnings = FALSE, recursive = TRUE)
for(i in seq_along(work_files)){
  BIT(paste0(work_dir,work_files[i]), output_path=output_dir, format="csv", bin_width=1000, genome="hg38")
}

We plot the top 10 TRs identified by BIT in each cell type along with the 95% credible intervals:

library(patchwork)
work_dir<-"./10XGENOMICS/bit/"
work_files_BIT<-list.files(work_dir,pattern="*_rank_table.csv")
work_files_BIT<-work_files_BIT[c(1,8,9,2,3,4,5,6,7)]

cell_names<-sapply(strsplit(work_files_BIT,"_rank",fixed=TRUE),function(x){return(x[[1]])})
cell_names<-tools::toTitleCase(cell_names)
cell_names<-c("B cell","Monocyte","NK cell","CD4+ T cell","CD8+ T cell","Dendritic cell","gdT cell","HSPC","MAIT cell")
colors<-c("#9B3A4D","#FC8002","#394A92","#70A0AC","#D2352C","#8E549E","#BAAFD1","#497EB2","#ADDB88")
plot_list<-list()
for(i in 1:9){
  table<-read.csv(paste0(work_dir,work_files_BIT[i]),row.names=1)
  data<-data.frame(TR=table$TR[1:10],
                   Score=table$BIT_score[1:10],
                   Lower=table$BIT_score_lower[1:10],
                   Upper=table$BIT_score_upper[1:10],
                   group=cell_names[i])
  data$TR<-factor(data$TR,levels=rev(data$TR))
  plot_list[[i]]<-ggplot(data, aes(x = TR, y = Score)) +
    geom_bar(stat = "identity", fill = colors[i], color = "black") +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.4) +
    coord_flip() +
    theme_bw() + scale_y_continuous(limits = c(0, max(data$Upper)*1.2),breaks = scales::breaks_extended(n = 4),expand = c(0,0)) + facet_grid(.~group)+
    labs(
      title = "",
      x = "Transcriptional Regulator",
      y = "BIT Score"
    )+theme(title=element_text(size=9),plot.margin=unit(c(0.25,0.25,0.25,0.25),"cm"),axis.text.y = element_text(size=10,color="black"),
            axis.text.x = element_text(size=10,color="black"),
            axis.title.x = element_text(size=14,color="black"),
            axis.title.y = element_text(size=14,color="black"),
            strip.background = element_rect(fill="#DBD1B6"),
            strip.text = element_text(size=12, colour="black",margin=ggplot2::margin(1,1,1,1,"mm")))
}

p_combined<-plot_list[[1]]+plot_list[[2]]+ plot_list[[3]] +
  plot_list[[4]]+plot_list[[5]]+plot_list[[6]] +
  plot_list[[7]]+plot_list[[8]]+plot_list[[9]]+ plot_layout(ncol=3,guides="collect",axis_titles = "collect")
print(p_combined)

We can also plot the GO enrichment analysis results of each cell type:

split_go_term <- function(term) {
  words <- strsplit(term, " ")[[1]]
  n_words <- length(words)

  # Find split points for 2-3 lines
  if(n_words <= 3) {
    split_at <- ceiling(n_words/2)
  } else {
    split_at <- c(ceiling(n_words/3), ceiling(n_words*2/3))
  }

  # Split words into groups
  groups <- split(words, cut(seq_along(words), breaks = c(0, split_at, n_words)))

  # Combine with newlines
  paste(sapply(groups, paste, collapse = " "), collapse = "\n")
}

integer_breaks <- function(x, n = 4) {
  x <- x[!is.na(x)]
  if (length(x) == 0) return(numeric())
  rng <- range(x)
  breaks <- floor(rng[1]):ceiling(rng[2])
  breaks <- unique(round(breaks))
  if (length(breaks) > n) {
    breaks <- breaks[seq(1, length(breaks), length.out = n)]
  }
  breaks
}

work_files_BIT
plot_list<-list()

for(i in 1:9){
  table<-read.csv(paste0(work_dir,work_files_BIT[i]),row.names=1)
  data<-data.frame(TR=table$TR[1:20],
                   group=cell_names[i])
  BIT_gene_ids<-bitr(data$TR,fromType = "SYMBOL",toType = "ENTREZID",OrgDb="org.Hs.eg.db")
  BIT_Results=enrichGO(gene          = BIT_gene_ids$ENTREZID[1:20],
                       OrgDb = "org.Hs.eg.db",
                       ont = "BP",
                       pAdjustMethod = "BH",
                       pvalueCutoff  = 0.01,
                       qvalueCutoff  = 0.05)
  GO_BIT_table<-head(BIT_Results,5)
  GO_PLOT_Table_BIT<-data.frame(GO=GO_BIT_table$Description,
                                GeneRatio=Trans_to_double(GO_BIT_table),
                                Pvalue=GO_BIT_table$pvalue,
                                Count=GO_BIT_table$Count)
  GO_PLOT_Table_BIT$GO <- sapply(GO_PLOT_Table_BIT$GO, split_go_term)
  plot_list[[i]] <- ggplot(GO_PLOT_Table_BIT, aes(x = GeneRatio, y = reorder(GO, -Pvalue), size = Count, color = Pvalue)) +
    geom_point() +
    # Fix Pvalue color legend (4 breaks, no overlap)
    scale_color_gradient(
      low = "red",
      high = "blue",
      limits = c(min(GO_BIT_table$pvalue), max(GO_BIT_table$pvalue)),
      breaks = scales::breaks_pretty(n = 4), # 4 breaks # 2 decimal places
      guide = guide_colorbar(
        order = 1,
        title.position = "top",
        barheight = unit(1.6, "cm"),
        ticks = FALSE
      )
    ) +
    # Fix Count size legend (integer breaks)
    scale_size_continuous(
      breaks = integer_breaks(GO_PLOT_Table_BIT$Count, n = 4), # 4 integer breaks
      range = c(2, 5), # Adjust point sizes
      guide = guide_legend(
        order = 2,
        title.position = "top",
        override.aes = list(color = "black")
      )
    ) +
    theme_bw() +
    labs(y = "GO", x = "Gene Ratio", color = "P-Value", size = "Count") +
    theme(
      text = element_text(size = 12),
      legend.position = "right",
      legend.box = "vertical",
      legend.spacing.y = unit(0.1, "cm"),
      legend.margin = margin(0, 0, 0, 0),
      axis.text.y = element_text(color = "black")
    ) +
    xlim(c(0.0, 0.6))
}
p_combined<-plot_list[[1]]+plot_list[[2]]+ plot_list[[3]] +
plot_list[[4]]+plot_list[[5]]+plot_list[[6]] +
  plot_list[[7]]+plot_list[[8]]+plot_list[[9]]+ plot_layout(ncol=3,axis_titles = "collect")

print(p_combined)

We use SnapATAC2’s built-in motif enrichment analysis method to derive the corresponding motif enrichment results:

os.makedirs('./10XGENOMICS/motifs/', exist_ok=True)
motifs = snap.tl.motif_enrichment(
    motifs=snap.datasets.cis_bp(unique=True),
    regions=marker_peaks,
    genome_fasta=snap.genome.hg38,
)
for keys in motifs.keys():
                elements=motifs[keys]
                df=elements.to_pandas()
                df=df.sort_values(by="adjusted p-value",ascending=True)
                df.to_csv('./10XGENOMICS/motifs/pbmc10k_'+str(i)+"/"+keys+'_motifs.csv',index=False)

To generate results using ArchR and scBasset, you cannot use the marker peaks produced by SnapATAC2. Instead, you need to process the scATAC-seq data directly from the fragments. We recommend consulting the original manuals for further details: (1) ArchR manual (2) scBasset manual.

For ArchR:

# -----------------------------------------------
# Configuration & Library Setup
# -----------------------------------------------

library(ArchR)              # Load the ArchR package for scATAC-seq analysis
set.seed(42)                # Set seed for reproducibility
addArchRGenome("hg38")      # Add human genome reference (hg38)

# -----------------------------------------------
# Define Input Files
# -----------------------------------------------
# Specify input file(s) with sample names and paths.
inputFiles <- c("PBMCs10k" = "./10XGENOMICS/PBMCs/pbmc10k.tsv.gz")

# -----------------------------------------------
# Create Arrow Files
# -----------------------------------------------
# Arrow files are a specialized file format used by ArchR to store data.
ArrowFiles <- createArrowFiles(
  inputFiles = inputFiles,         # Input file(s)
  sampleNames = names(inputFiles), # Use the names from the inputFiles vector
  filterTSS = 4,                   # Minimum TSS enrichment score; avoid setting too high initially
  filterFrags = 1000,              # Minimum number of fragments per cell
  addTileMat = TRUE,               # Create a tile matrix for downstream analysis
  addGeneScoreMat = TRUE           # Generate a gene score matrix
)

# -----------------------------------------------
# Calculate Doublet Scores
# -----------------------------------------------
# Identify potential doublets (artificially merged cells) in the dataset.
doubScores <- addDoubletScores(
  input = ArrowFiles,              # Use the previously created Arrow files
  k = 10,                          # Number of neighbors considered for "pseudo-doublet" estimation
  knnMethod = "UMAP",              # Use UMAP embedding for nearest neighbor search
  LSIMethod = 1                    # Specify LSI method (typically 1)
)

# -----------------------------------------------
# Create ArchR Project
# -----------------------------------------------
# The ArchR project organizes data and metadata for further analysis.
proj <- ArchRProject(
  ArrowFiles = ArrowFiles,                                         # Input Arrow files
  outputDirectory = "/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC",  # Directory to store project outputs
  copyArrows = TRUE                                                 # Maintain an unaltered copy of the Arrow files
)

# -----------------------------------------------
# Filter Out Doublets
# -----------------------------------------------
proj <- filterDoublets(ArchRProj = proj)  # Remove cells flagged as doublets

# -----------------------------------------------
# Dimensionality Reduction and Clustering
# -----------------------------------------------
# 1. Perform iterative Latent Semantic Indexing (LSI) for dimensionality reduction
proj <- addIterativeLSI(ArchRProj = proj, useMatrix = "TileMatrix", name = "IterativeLSI")

# 2. Cluster cells using the reduced dimensions from IterativeLSI
proj <- addClusters(input = proj, reducedDims = "IterativeLSI")

# 3. Generate a UMAP embedding for visualization
proj <- addUMAP(ArchRProj = proj, reducedDims = "IterativeLSI")
p2 <- plotEmbedding(
  ArchRProj = proj,
  colorBy = "cellColData",  # Color cells by metadata (e.g., clusters)
  name = "Clusters",
  embedding = "UMAP"
)

# -----------------------------------------------
# Data Imputation and Saving Project
# -----------------------------------------------
proj <- addImputeWeights(proj)                # Impute missing values to smooth data visualization
proj <- saveArchRProject(ArchRProj = proj)      # Save the current state of the project

# -----------------------------------------------
# Peak Calling Preparation
# -----------------------------------------------
# Group cells by cell types for coverage estimation and peak calling.
proj <- addGroupCoverages(ArchRProj = proj, groupBy = "CellTypes", force = TRUE)

# Specify the path to the MACS2 executable (used for peak calling)
pathToMacs2 <- "/users/zeyul/.local/bin/macs2"  # Update with your actual MACS2 path

# Call reproducible peak sets for each cell type group using MACS2
proj <- addReproduciblePeakSet(
  ArchRProj = proj,
  groupBy = "CellTypes",
  pathToMacs2 = pathToMacs2
)

# Save the ArchR project in a new directory
saveArchRProject(
  ArchRProj = proj,
  outputDirectory = "/projects/dheitjan/BIT/zeyul/BIT/ArchR/Save-Proj2",
  load = FALSE
)

# -----------------------------------------------
# Create Peak Matrix and Identify Marker Peaks
# -----------------------------------------------
# Add the peak matrix which quantifies accessibility at each peak.
proj2 <- addPeakMatrix(proj)

# Identify marker peaks (differentially accessible regions) for each cell type.
markersPeaks <- getMarkerFeatures(
  ArchRProj = proj2,
  useMatrix = "PeakMatrix",
  groupBy = "CellTypes",
  bias = c("TSSEnrichment", "log10(nFrags)"),  # Adjust for biases
  testMethod = "wilcoxon"                      # Use Wilcoxon rank-sum test
)

# Filter marker peaks based on statistical thresholds (FDR and Log2 Fold Change)
markerList <- getMarkers(
  markersPeaks,
  cutOff = "FDR <= 0.01 & Log2FC >= 2",
  returnGR = TRUE  # Return as GRanges object
)

# Save marker peaks and marker list for future reference
saveRDS(markersPeaks, "/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC/markersPeaks.rds")
saveRDS(markerList, "/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC/markerList.rds")

# Save the updated ArchR project
saveArchRProject(ArchRProj = proj, load = FALSE)

# -----------------------------------------------
# Export Marker Peaks as BED Files
# -----------------------------------------------
# Export each cell type's marker peaks as a separate BED file.
cell_type_names <- names(markerList)
for(i in 1:length(cell_type_names)){
  export(
    markerList[[i]],
    paste0("/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC/MarkerList/", cell_type_names[i], ".bed"),
    format = "BED"
  )
}

# (Optional) Export an additional genomic regions object 'gr' as a BED file if defined.
export(gr, "output.bed", format = "BED")

# -----------------------------------------------
# Motif Annotation and Enrichment Analysis
# -----------------------------------------------
# Add motif annotations using the cisBP motif database.
proj <- addMotifAnnotations(ArchRProj = proj, motifSet = "cisbp", name = "Motif")

# Perform motif enrichment analysis on the marker peaks.
enrichMotifs <- peakAnnoEnrichment(
  seMarker = markersPeaks,
  ArchRProj = proj,
  peakAnnotation = "Motif",
  cutOff = "FDR <= 0.05 & Log2FC >= 2"
)

# Extract the negative log10 adjusted p-values for motif enrichment.
mlog10padj <- assays(enrichMotifs)$mlog10Padj
motif_symbols <- rownames(mlog10padj)  # Retrieve motif names

# Rank motifs for each cell type based on enrichment (from highest to lowest).
ranked_table <- lapply(colnames(mlog10padj), function(cell_type) {
  # Order motifs by mlog10Padj in descending order
  sorted_indices <- order(mlog10padj[, cell_type], decreasing = TRUE)
  motif_symbols[sorted_indices]  # Return the ranked motif symbols
})
names(ranked_table) <- colnames(mlog10padj)  # Assign cell type names to the ranked table

# Convert the ranked motifs list into a data frame and write to a CSV file.
ranked_df <- as.data.frame(ranked_table)
write.csv(ranked_df, "ranked_df_PBMCs.csv")

For scBasset:

import numpy as np
import pandas as pd
import h5py
import scipy
import scanpy as sc
import anndata
from scbasset.utils import *  # Import utility functions from scbasset

# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
import os

# -----------------------------
# Define Data Paths
# -----------------------------
data_path = './10XGENOMICS/scbasset/'
# File containing the 10x multi-modal (RNA + ATAC) dataset in H5 format
h5_file = data_path + 'pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5'
# BED file with ATAC peak coordinates
bed_file = data_path + 'pbmc_granulocyte_sorted_10k_atac_peaks.bed'

# -----------------------------
# Load ATAC Peak Information
# -----------------------------
# Read the BED file into a DataFrame with columns: 'chr', 'start', 'end'
peak = pd.read_csv(bed_file, sep='\t', names=['chr', 'start', 'end'])

# -----------------------------
# Load 10x Multi-modal Dataset
# -----------------------------
# Read the 10x H5 file; set gex_only=False to load both gene expression and ATAC data
ad = sc.read_10x_h5(h5_file, gex_only=False)

# -----------------------------
# Separate RNA and ATAC Modalities
# -----------------------------
# Subset the AnnData object to get RNA data (features labeled as 'Gene Expression')
ad_rna = ad[:, ad.var['feature_types'] == 'Gene Expression']
# Subset to get ATAC data (features labeled as 'Peaks')
ad_atac = ad[:, ad.var['feature_types'] == 'Peaks']

# -----------------------------
# Annotate ATAC Data with Peak Coordinates
# -----------------------------
# Add chromosome, start, and end coordinates from the BED file to the ATAC AnnData object
ad_atac.var['chr'] = peak['chr'].values
ad_atac.var['start'] = peak['start'].values
ad_atac.var['end'] = peak['end'].values

# -----------------------------
# Basic Quality Control and Filtering
# -----------------------------
# Apply basic filtering without dropping any cells or genes (min_genes/min_cells set to 0)
sc.pp.filter_cells(ad_rna, min_genes=0)
sc.pp.filter_genes(ad_rna, min_cells=0)
sc.pp.filter_cells(ad_atac, min_genes=0)
sc.pp.filter_genes(ad_atac, min_cells=0)

# -----------------------------
# Feature Filtering Based on Cell Frequency
# -----------------------------
# Define a threshold: features must be present in at least 5% of cells
thres = int(ad.shape[0] * 0.05)
# Filter RNA data: keep genes expressed in more than the threshold number of cells
ad_rna = ad_rna[:, ad_rna.var['n_cells'] > thres]
# Filter ATAC data: keep peaks accessible in more than the threshold number of cells
ad_atac = ad_atac[:, ad_atac.var['n_cells'] > thres]

# -----------------------------
# Filter ATAC Peaks by Chromosome
# -----------------------------
# Define a list of standard chromosomes (chr1 to chr22, chrX, chrY)
chrs = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY']
# Filter ATAC peaks to keep only those on the standard chromosomes
ad_atac = ad_atac[:, ad_atac.var['chr'].isin(chrs)]

# -----------------------------
# Save Processed ATAC Data
# -----------------------------
# Write the filtered ATAC AnnData object to a file for downstream analysis
ad_atac.write(data_path + 'pbmc10k.h5ad')

scBasset also includes steps that require executing command-line instructions (these steps require a GPU connection):

scbasset_preprocess.py --ad_file ./10XGENOMICS/scbasset/pbmc10k.h5ad --input_fasta ./10XGENOMICS/scbasset/scbasset/hg38.fa --out_path ./10XGENOMICS/scbasset/data/

scbasset_train.py --input_folder ./10XGENOMICS/scbasset/data/ --out_path ./10XGENOMICS/scbasset/PBMC/

Following steps continue in python:

# -----------------------------
# Import Required Libraries
# -----------------------------
import anndata                     # For handling AnnData objects
import tensorflow as tf            # For deep learning model operations
import numpy as np                 # For numerical operations
import h5py                       # For handling HDF5 file format
import matplotlib.pyplot as plt    # For plotting
import os                         # For operating system interfaces (e.g., file/directory operations)
import math                       # For mathematical functions
import pickle                     # For object serialization
import seaborn as sns             # For statistical data visualization
import scipy                      # For scientific computations
import sys                        # For system-specific parameters and functions
import scanpy as sc               # For single-cell analysis
import pandas as pd               # For data manipulation and analysis
from scbasset.utils import *      # Import custom utility functions from scbasset

# -----------------------------
# Define File Paths and Directories
# -----------------------------
ad_file = './10XGENOMICS/scbasset/data/pbmc10k.h5ad'
trained_model = './10XGENOMICS/scbasset/PBMC/best_model.h5'

# Create results directory if it doesn't already exist
os.makedirs("./10XGENOMICS/scbasset/results", exist_ok=True)

# -----------------------------
# Load the AnnData Object
# -----------------------------
# Read the single-cell dataset stored in H5AD format
ad = anndata.read_h5ad(ad_file)

# -----------------------------
# Build and Load the Pre-trained Model
# -----------------------------
# Build the model with a latent dimension of 32 and number of cells from ad.shape[0]
# 'show_summary=False' disables printing of the model summary
model = make_model(32, ad.shape[0], show_summary=False)
# Load pre-trained model weights from the specified file
model.load_weights(trained_model)

# -----------------------------
# Extract Model Intercept and Filter Cells
# -----------------------------
# Retrieve the model intercept using a custom function
intercept = get_intercept(model)  # Custom function from scbasset.utils

# Perform basic cell filtering (here, filtering threshold is 0 so no cells are removed)
sc.pp.filter_cells(ad, min_counts=0)

# -----------------------------
# Evaluate Model Intercept Correlation (Optional)
# -----------------------------
# Create a plot to visualize the correlation between the model intercept
# and the log10 of the number of genes detected per cell
f, ax = plt.subplots(figsize=(4, 4))
# Compute Pearson correlation between intercept and log10 of 'n_genes'
r = scipy.stats.pearsonr(intercept, np.log10(ad.obs['n_genes']))[0]
# (You can add a title or annotation to the plot if needed)

# -----------------------------
# Get Cell Embedding from the Model
# -----------------------------
# Obtain the cell embedding using a custom function (returns a projection)
proj = get_cell_embedding(model)  # Custom function from scbasset.utils

# Save the cell embedding (projection) to a CSV file for later use
pd.DataFrame(proj).to_csv('./10XGENOMICS/scbasset/results/projection_atac.csv')

# -----------------------------
# Integrate the Projection into the AnnData Object
# -----------------------------
# Create a new plot for UMAP visualization
f, ax = plt.subplots(figsize=(4, 4))
# Read the saved projection CSV and assign the values to the 'projection' key in ad.obsm
ad.obsm['projection'] = pd.read_csv('./10XGENOMICS/scbasset/results/projection_atac.csv',
                                      index_col=0).values

# -----------------------------
# Compute Neighbors, UMAP, and Leiden Clustering
# -----------------------------
# Use the 'projection' computed above as the representation for neighbor graph computation
sc.pp.neighbors(ad, use_rep='projection')
# Compute UMAP coordinates for visualization
sc.tl.umap(ad)
# Perform Leiden clustering to identify cell clusters
sc.tl.leiden(ad)
# Plot the UMAP embedding with cells colored by Leiden cluster assignments
sc.pl.umap(ad, color='leiden', ax=ax)

# -----------------------------
# Save the Updated AnnData Object
# -----------------------------
# Write the AnnData object with the computed embeddings and clustering results to file
ad.write('./10XGENOMICS/scbasset/results/pbmc10k_embedded.h5ad')

Finally, we need to annotate the cell types and export the cell type level motif scores:

ad_file = './10XGENOMICS/scbasset/results/pbmc10k_annotated.h5ad'
trained_model = './10XGENOMICS/BIT/scbasset/PBMC/best_model.h5'
motif_fasta_folder = './10XGENOMICS/scbasset/results/Homo_sapiens_motif_fasta'
Motif_files = os.listdir("./10XGENOMICS/scbasset/results/Homo_sapiens_motif_fasta/shuffled_peaks_motifs")
TF_names = [x.split(".")[0] for x in Motif_files]
ad = anndata.read_h5ad(ad_file)
model = make_model(32, ad.shape[0], show_summary=False)
model.load_weights(trained_model)
nrow=11909
ncol=733
TF_score_df = pd.DataFrame(index=range(nrow),columns=range(ncol))
TF_score_df.columns = TF_names
for tf in TF_names:
        try:
                scores = motif_score(tf, model, motif_fasta_folder=motif_fasta_folder)
                TF_score_df[tf] = scores
                print(f"{tf} finished")
        except (FileNotFoundError) as e:
                print(f"Skipping {tf}: {e}")
TF_score_df.index = ad.obs.index
TF_score_df.to_csv("./10XGENOMICS/scbasset/results/motif_cell_types.csv")

Following steps in R.

scbasset_motif<-read.csv("./10XGENOMICS/scbasset/results/motif_cell_types.csv")
motifs_cell_type_score<-data.frame(matrix(nrow=733,ncol=10))
Cell_type_names<-names(tapply(scbasset_motif$LHX5,scbasset_motif$CellType,mean))
colnames(motifs_cell_type_score)<-c("TR_Symbol",Cell_type_names)
motifs_cell_type_score$TR_Symbol<-colnames(scbasset_motif)[3:735]

tapply(scbasset_motif$LHX5,scbasset_motif$cell_type,mean)

for(i in 3:735){
  print(i-2)
  motifs_cell_type_score[i-2,2:10]<-tapply((scbasset_motif[,i]),scbasset_motif$CellType,mean)
}

motifs_cell_type_score
TR_ranks<-data.frame(matrix(nrow=733,ncol=14))
colnames(TR_ranks)<-rep(colnames(motifs_cell_type_score)[2:10],each=2)

for(i in 1:9){
  TR_ranks[,(2*(i)-1)]<-motifs_cell_type_score$TR_Symbol[order(-motifs_cell_type_score[,i+1])]
  TR_ranks[,(2*(i))]<-motifs_cell_type_score[,i+1][order(-motifs_cell_type_score[,i+1])]
}

write.csv(TR_ranks,"./10XGENOMICS/scbasset/results/PBMC_TR_ranks_scbasset.csv",row.names = FALSE)

We plot the GO enrichment analysis results of top TRs by each method:

# -----------------------------
# Define Working Directories
# -----------------------------
# Main working directory
work_dir <- "/Users/zeyulu/Desktop/Project/BIT/revision_data/comparison/"

# -----------------------------
# Read In Results from Different Methods
# -----------------------------
# Read BIT and ArchR results (set first column as row names)
BIT_result <- read.csv(paste0(work_dir, "BIT.csv"), row.names = 1)
ArchR_result <- read.csv(paste0(work_dir, "ArchR.csv"), row.names = 1)

# For ArchR results, remove additional annotations by splitting at '_' and keeping the first part for each column
for (i in 1:9) {
  ArchR_result[, i] <- sapply(strsplit(ArchR_result[, i], "_", fixed = TRUE), function(x) { return(x[[1]]) })
}

# Read scBasset results and select every other column (columns 1, 3, ..., 17)
scbasset_result <- read.csv(paste0(work_dir, "scbasset.csv"))
scbasset_result <- scbasset_result[, c(seq(1, 18, 2))]

# Read SnapATAC2 results (set first column as row names)
SnapATAC2_result <- read.csv(paste0(work_dir, "SnapATAC2.csv"), row.names = 1)

# -----------------------------
# Define Common Cell Type Names and Rename Columns
# -----------------------------
# List of cell types to use as column names for each method's result
Cell_Types <- c("B cell", "CD4+ cell", "CD8+ cell", "Dendritic cell", "gdT cell", "HSPC", "MAIT cell", "Monocyte", "NK cell")

# Assign cell type names to the result data frames
colnames(BIT_result) <- Cell_Types
colnames(ArchR_result) <- Cell_Types
colnames(scbasset_result) <- Cell_Types
colnames(SnapATAC2_result) <- Cell_Types

# -----------------------------
# Extract Top 20 Genes for "B cell" from Each Method
# -----------------------------
# Create a list containing the top 20 genes for B cell from each method
Top20_list <- list(
  "BIT" = BIT_result$`B cell`[1:20],
  "ArchR" = ArchR_result$`B cell`[1:20],
  "scBasset" = scbasset_result$`B cell`[1:20],
  "SnapATAC2" = SnapATAC2_result$`B cell`[1:20]
)

# Display the Top20_list
Top20_list

# -----------------------------
# Perform GO Enrichment and Plotting for Each Method
# -----------------------------
# Loop over each method in the Top20_list (4 methods)
for (i in 1:4) {
  # Convert gene symbols to ENTREZ IDs using the 'bitr' function
  BIT_gene_ids <- bitr(Top20_list[[i]], fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")

  # Perform Gene Ontology enrichment analysis on the converted ENTREZ IDs (only considering Biological Process: BP)
  BIT_Results <- enrichGO(
    gene          = BIT_gene_ids$ENTREZID[1:20],
    OrgDb         = "org.Hs.eg.db",
    ont           = "BP",
    pAdjustMethod = "BH",
    pvalueCutoff  = 0.01,
    qvalueCutoff  = 0.05
  )

  # Extract the top 5 GO terms from the enrichment results
  GO_BIT_table <- head(BIT_Results, 5)

  # Create a data frame for plotting with columns for GO term description, gene ratio, p-value, and count
  GO_PLOT_Table_BIT <- data.frame(
    GO = GO_BIT_table$Description,
    GeneRatio = Trans_to_double(GO_BIT_table),  # Custom function to transform gene ratio values
    Pvalue = GO_BIT_table$pvalue,
    Count = GO_BIT_table$Count
  )

  # Generate a ggplot for the GO enrichment results
  plot_list[[i]] <- ggplot(GO_PLOT_Table_BIT, aes(x = GeneRatio, y = reorder(GO, -Pvalue), size = Count, color = Pvalue)) +
    geom_point() +
    # Customize the color scale for P-values with 4 pretty breaks
    scale_color_gradient(
      low = "red",
      high = "blue",
      limits = c(min(GO_BIT_table$pvalue), max(GO_BIT_table$pvalue)),
      breaks = scales::breaks_pretty(n = 4),
      guide = guide_colorbar(
        order = 1,
        title.position = "top",
        barheight = unit(1.6, "cm"),
        ticks = FALSE
      )
    ) +
    # Customize the size scale for Count with integer breaks
    scale_size_continuous(
      breaks = integer_breaks(GO_PLOT_Table_BIT$Count, n = 4),
      range = c(2, 5),
      guide = guide_legend(
        order = 2,
        title.position = "top",
        override.aes = list(color = "black")
      )
    ) +
    theme_bw() +
    labs(y = "GO", x = "Gene Ratio", color = "P-Value", size = "Count") +
    theme(
      text = element_text(size = 12),
      legend.position = "right",
      legend.box = "vertical",
      legend.spacing.y = unit(0.1, "cm"),
      legend.margin = margin(0, 0, 0, 0),
      axis.text.y = element_text(color = "black")
    ) +
    xlim(c(0.0, 0.6))
}

# -----------------------------
# Combine and Display Plots
# -----------------------------
# Combine the four individual plots into one layout with 2 columns and shared axis titles
p_combined <- plot_list[[1]] + plot_list[[2]] + plot_list[[3]] + plot_list[[4]] +
  plot_layout(ncol = 1, axis_titles = "collect")

# Print the combined plot to display the GO enrichment results for each method
print(p_combined)