10xGenomics PBMCs cell-type-specific TRs ======================================== We utilize SnapATAC2 to process single-cell ATAC-seq (scATAC-seq) data obtained from `10xGENOMICS `_. . The detailed usage guide for SnapATAC2 can be found in the `SnapATAC2-manual `_. We used **SnapATAC2 v2.6.0** for our analysis. For improved efficiency, some steps require a GPU to accelerate model training. .. code-block:: python # ----------------------------- # Import Required Libraries # ----------------------------- import warnings warnings.filterwarnings("ignore") # Ignore warning messages for cleaner output import snapatac2 as snap # Library for scATAC-seq analysis import anndata as ad # Handling annotated data matrices import pandas as pd # Data manipulation and analysis import scanpy as sc # Single-cell analysis package import scvi # Probabilistic modeling for single-cell omics data import os # For directory operations import matplotlib.pyplot as plt # For plotting # ----------------------------- # Set Global Settings & Seed # ----------------------------- scvi.settings.seed = 2 # Set random seed for reproducibility print("i") # Print a simple indicator (optional) # ----------------------------- # Data Loading and Preprocessing # ----------------------------- # Load a reference dataset from the provided multiome dataset reference = snap.read(snap.datasets.pbmc10k_multiome(), backed=None) # Load an ATAC dataset from a local file atac = snap.read("./10XGENOMICS/pbmc.h5ad", backed=None) # Generate a gene expression matrix from the ATAC dataset using gene annotations for hg38 query = snap.pp.make_gene_matrix(atac, gene_anno=snap.genome.hg38) # Initialize the cell type annotation for the query dataset as missing values query.obs['cell_type'] = pd.NA # Concatenate the reference and query datasets into one AnnData object data = ad.concat( [reference, query], join='inner', # Keep only features present in both datasets label='batch', # Create a batch label to differentiate datasets keys=["reference", "query"], index_unique='_' ) # ----------------------------- # Filter and Identify Highly Variable Genes # ----------------------------- sc.pp.filter_genes(data, min_cells=5) # Keep genes expressed in at least 5 cells # Identify and subset to the top 5000 highly variable genes using Seurat v3 method sc.pp.highly_variable_genes( data, n_top_genes=5000, flavor="seurat_v3", batch_key="batch", # Consider batch effects when identifying variable genes subset=True ) # ----------------------------- # Setup and Train SCVI Model # ----------------------------- # Configure the AnnData object for SCVI with batch key scvi.model.SCVI.setup_anndata(data, batch_key="batch") # Initialize the SCVI model with 2 layers and 30 latent dimensions vae = scvi.model.SCVI( data, n_layers=2, n_latent=30, gene_likelihood="nb", # Negative binomial likelihood for gene expression dispersion="gene-batch" # Model dispersion per gene and batch ) # Train the SCVI model (using early stopping to prevent overfitting) vae.train(max_epochs=2000, early_stopping=True) # ----------------------------- # Prepare Data for SCANVI # ----------------------------- # Initialize a new column for cell type labels with a default value 'Unknown' data.obs["celltype_scanvi"] = 'Unknown' # Identify the reference cells (where batch is "reference") ref_idx = data.obs['batch'] == "reference" # For reference cells, assign the known cell type labels data.obs["celltype_scanvi"][ref_idx] = data.obs['cell_type'][ref_idx] # Initialize the SCANVI model using the pretrained SCVI model and provided labels lvae = scvi.model.SCANVI.from_scvi_model( vae, adata=data, labels_key="celltype_scanvi", unlabeled_category="Unknown" ) # Train the SCANVI model further with a maximum of 2000 epochs and sample 100 cells per label lvae.train(max_epochs=2000, n_samples_per_label=100) # ----------------------------- # Obtain Predictions and Latent Representation # ----------------------------- # Predict cell type labels using SCANVI and store the predictions in a new column data.obs["C_scANVI"] = lvae.predict(data) # Get the latent representation from SCANVI for downstream analysis and visualization data.obsm["X_scANVI"] = lvae.get_latent_representation(data) # ----------------------------- # Compute UMAP Embedding # ----------------------------- # Compute the nearest neighbors using the SCANVI latent representation sc.pp.neighbors(data, use_rep="X_scANVI") # Calculate UMAP coordinates for visualization sc.tl.umap(data) # Save the processed AnnData object with UMAP embedding to disk (compressed) data.write("./10XGENOMICSpbmc10k.h5ad", compression="gzip") # ----------------------------- # Annotate ATAC Data with Predicted Cell Types # ----------------------------- # Map predicted cell type labels from the integrated data to the original ATAC dataset atac.obs['Cell_Types'] = data.obs.loc[atac.obs_names + '_query']['C_scANVI'].to_numpy() # Save the ATAC data with annotations atac.write("./10XGENOMICS/pbmc10k_annotated.h5ad", compression="gzip") # ----------------------------- # Peak Calling and Matrix Generation # ----------------------------- # Run MACS3 peak calling on the data grouped by merged cell types snap.tl.macs3(data, groupby='Cell_Types') # Merge peaks across groups using hg38 annotations peaks = snap.tl.merge_peaks(data.uns['macs3'], snap.genome.hg38) # Create a peak accessibility matrix using the merged peaks peak_mat = snap.pp.make_peak_matrix(data, use_rep=peaks['Peaks']) # Create a new directory to save CSV outputs, using the current seed in the folder name os.mkdir("./10XGENOMICS/csv/scATAC_Peaks_" + str(scvi.settings.seed)) # ----------------------------- # Set Plotting Parameters for High Resolution # ----------------------------- plt.rcParams['figure.dpi'] = 1000 # High resolution for displaying figures plt.rcParams['savefig.dpi'] = 1000 # High resolution for saving figures plt.rcParams['figure.figsize'] = [8, 8] # Set default figure size # ----------------------------- # UMAP Plotting of Cell Types # ----------------------------- # Compute UMAP embedding with a fixed random state for reproducibility snap.tl.umap(data, random_state=15) # Plot the UMAP embedding colored by cell types with custom styling sc.pl.umap( data, color="Cell_Types", size=15, # Increase point size for visibility alpha=0.9, # Slight transparency to visualize density legend_fontsize=20, legend_fontweight='bold', frameon=True, # Display the frame for clarity ncols=2, # Organize legend into two columns show=False, save='umap_plot_Test.pdf' ) # ----------------------------- # Process and Save Gene Expression Matrix # ----------------------------- # Create a gene expression matrix using hg38 annotations gene_matrix = snap.pp.make_gene_matrix(data, snap.genome.hg38) print(gene_matrix) # Filter genes expressed in fewer than 5 cells sc.pp.filter_genes(gene_matrix, min_cells=5) # Normalize total counts per cell sc.pp.normalize_total(gene_matrix) # Log-transform the normalized counts sc.pp.log1p(gene_matrix) # Apply MAGIC imputation (approximate solver) to smooth gene expression data sc.external.pp.magic(gene_matrix, solver="approximate") # Transfer UMAP coordinates from 'data' to gene_matrix for consistent visualization gene_matrix.obsm["X_umap"] = data.obsm["X_umap"] # Save the processed gene matrix to disk (compressed) gene_matrix.write("pbmc10k_gene_mat.h5ad", compression='gzip') # Set global figure parameters for scanpy plots sc.set_figure_params(scanpy=True, dpi=1000, dpi_save=1000, fontsize=24, figsize=[10, 10]) # ----------------------------- # Plot Marker Genes on UMAP # ----------------------------- marker_genes = [] # Define marker genes to visualize on UMAP (fill in with gene names) # Reload the gene matrix to ensure data consistency gene_matrix = snap.read("pbmc10k_gene_mat.h5ad", backed=None) # Loop through each marker gene and plot its expression on the UMAP for i in range(len(marker_genes)): sc.pl.umap( gene_matrix, use_raw=False, color=marker_genes[i], size=15, # Increase point size for better visibility alpha=0.9, # Slight transparency to indicate density frameon=True, # Show a frame around the plot ncols=5, # Organize legends into 5 columns if needed show=False, save='umap_plot_Test_UMAP_PBMCs_' + marker_genes[i] + '.pdf', color_map="plasma" # Use the 'plasma' color map for gene expression ) The previous pipeline can generate the below UMAP plots: Colored by cell types: .. image:: ../images/Examples/singlecell/Pic1.png Colored by PAX5 gene activity: .. image:: ../images/Examples/singlecell/Pic2.png We also get the marker peaks of each cell type: .. code-block:: python marker_peaks=snap.tl.marker_regions(peak_mat, groupby='Cell_Types', pvalue=0.01) for keys in marker_peaks.keys(): elements=marker_peaks[keys] chromosomes = [] starts = [] ends = [] for element in elements: # Split each element into chromosome, start, and end chromosome, positions = element.split(':') start, end = positions.split('-') # Append the results to the corresponding lists chromosomes.append(chromosome) starts.append(start) ends.append(end) df = pd.DataFrame({'Chrom': chromosomes,'Start': starts,'End': ends}) df.to_csv("./10XGENOMICS/csv/scATAC_Peaks_"+scvi.settings.seed+"/"+keys.replace(" ","_")+".csv",index=False) print("./10XGENOMICS/csv/scATAC_Peaks_"+scvi.settings.seed+" Done") cell-type-specific marker regions: .. image:: ../images/Examples/singlecell/Pic3.png We run BIT on each of the region set: .. code-block:: r work_dir<-"./10XGENOMICS/csv/" work_files<-list.files(work_dir) output_path<-"./10XGENOMICS/bit/" dir.create(output_path, showWarnings = FALSE, recursive = TRUE) for(i in seq_along(work_files)){ BIT(paste0(work_dir,work_files[i]), output_path=output_dir, format="csv", bin_width=1000, genome="hg38") } We plot the top 10 TRs identified by BIT in each cell type along with the 95% credible intervals: .. code-block:: r library(patchwork) work_dir<-"./10XGENOMICS/bit/" work_files_BIT<-list.files(work_dir,pattern="*_rank_table.csv") work_files_BIT<-work_files_BIT[c(1,8,9,2,3,4,5,6,7)] cell_names<-sapply(strsplit(work_files_BIT,"_rank",fixed=TRUE),function(x){return(x[[1]])}) cell_names<-tools::toTitleCase(cell_names) cell_names<-c("B cell","Monocyte","NK cell","CD4+ T cell","CD8+ T cell","Dendritic cell","gdT cell","HSPC","MAIT cell") colors<-c("#9B3A4D","#FC8002","#394A92","#70A0AC","#D2352C","#8E549E","#BAAFD1","#497EB2","#ADDB88") plot_list<-list() for(i in 1:9){ table<-read.csv(paste0(work_dir,work_files_BIT[i]),row.names=1) data<-data.frame(TR=table$TR[1:10], Score=table$BIT_score[1:10], Lower=table$BIT_score_lower[1:10], Upper=table$BIT_score_upper[1:10], group=cell_names[i]) data$TR<-factor(data$TR,levels=rev(data$TR)) plot_list[[i]]<-ggplot(data, aes(x = TR, y = Score)) + geom_bar(stat = "identity", fill = colors[i], color = "black") + geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.4) + coord_flip() + theme_bw() + scale_y_continuous(limits = c(0, max(data$Upper)*1.2),breaks = scales::breaks_extended(n = 4),expand = c(0,0)) + facet_grid(.~group)+ labs( title = "", x = "Transcriptional Regulator", y = "BIT Score" )+theme(title=element_text(size=9),plot.margin=unit(c(0.25,0.25,0.25,0.25),"cm"),axis.text.y = element_text(size=10,color="black"), axis.text.x = element_text(size=10,color="black"), axis.title.x = element_text(size=14,color="black"), axis.title.y = element_text(size=14,color="black"), strip.background = element_rect(fill="#DBD1B6"), strip.text = element_text(size=12, colour="black",margin=ggplot2::margin(1,1,1,1,"mm"))) } p_combined<-plot_list[[1]]+plot_list[[2]]+ plot_list[[3]] + plot_list[[4]]+plot_list[[5]]+plot_list[[6]] + plot_list[[7]]+plot_list[[8]]+plot_list[[9]]+ plot_layout(ncol=3,guides="collect",axis_titles = "collect") print(p_combined) .. image:: ../images/Examples/singlecell/Pic4.png We can also plot the GO enrichment analysis results of each cell type: .. code-block:: r split_go_term <- function(term) { words <- strsplit(term, " ")[[1]] n_words <- length(words) # Find split points for 2-3 lines if(n_words <= 3) { split_at <- ceiling(n_words/2) } else { split_at <- c(ceiling(n_words/3), ceiling(n_words*2/3)) } # Split words into groups groups <- split(words, cut(seq_along(words), breaks = c(0, split_at, n_words))) # Combine with newlines paste(sapply(groups, paste, collapse = " "), collapse = "\n") } integer_breaks <- function(x, n = 4) { x <- x[!is.na(x)] if (length(x) == 0) return(numeric()) rng <- range(x) breaks <- floor(rng[1]):ceiling(rng[2]) breaks <- unique(round(breaks)) if (length(breaks) > n) { breaks <- breaks[seq(1, length(breaks), length.out = n)] } breaks } work_files_BIT plot_list<-list() for(i in 1:9){ table<-read.csv(paste0(work_dir,work_files_BIT[i]),row.names=1) data<-data.frame(TR=table$TR[1:20], group=cell_names[i]) BIT_gene_ids<-bitr(data$TR,fromType = "SYMBOL",toType = "ENTREZID",OrgDb="org.Hs.eg.db") BIT_Results=enrichGO(gene = BIT_gene_ids$ENTREZID[1:20], OrgDb = "org.Hs.eg.db", ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.05) GO_BIT_table<-head(BIT_Results,5) GO_PLOT_Table_BIT<-data.frame(GO=GO_BIT_table$Description, GeneRatio=Trans_to_double(GO_BIT_table), Pvalue=GO_BIT_table$pvalue, Count=GO_BIT_table$Count) GO_PLOT_Table_BIT$GO <- sapply(GO_PLOT_Table_BIT$GO, split_go_term) plot_list[[i]] <- ggplot(GO_PLOT_Table_BIT, aes(x = GeneRatio, y = reorder(GO, -Pvalue), size = Count, color = Pvalue)) + geom_point() + # Fix Pvalue color legend (4 breaks, no overlap) scale_color_gradient( low = "red", high = "blue", limits = c(min(GO_BIT_table$pvalue), max(GO_BIT_table$pvalue)), breaks = scales::breaks_pretty(n = 4), # 4 breaks # 2 decimal places guide = guide_colorbar( order = 1, title.position = "top", barheight = unit(1.6, "cm"), ticks = FALSE ) ) + # Fix Count size legend (integer breaks) scale_size_continuous( breaks = integer_breaks(GO_PLOT_Table_BIT$Count, n = 4), # 4 integer breaks range = c(2, 5), # Adjust point sizes guide = guide_legend( order = 2, title.position = "top", override.aes = list(color = "black") ) ) + theme_bw() + labs(y = "GO", x = "Gene Ratio", color = "P-Value", size = "Count") + theme( text = element_text(size = 12), legend.position = "right", legend.box = "vertical", legend.spacing.y = unit(0.1, "cm"), legend.margin = margin(0, 0, 0, 0), axis.text.y = element_text(color = "black") ) + xlim(c(0.0, 0.6)) } p_combined<-plot_list[[1]]+plot_list[[2]]+ plot_list[[3]] + plot_list[[4]]+plot_list[[5]]+plot_list[[6]] + plot_list[[7]]+plot_list[[8]]+plot_list[[9]]+ plot_layout(ncol=3,axis_titles = "collect") print(p_combined) .. image:: ../images/Examples/singlecell/Pic6.png We use SnapATAC2's built-in motif enrichment analysis method to derive the corresponding motif enrichment results: .. code-block:: python os.makedirs('./10XGENOMICS/motifs/', exist_ok=True) motifs = snap.tl.motif_enrichment( motifs=snap.datasets.cis_bp(unique=True), regions=marker_peaks, genome_fasta=snap.genome.hg38, ) for keys in motifs.keys(): elements=motifs[keys] df=elements.to_pandas() df=df.sort_values(by="adjusted p-value",ascending=True) df.to_csv('./10XGENOMICS/motifs/pbmc10k_'+str(i)+"/"+keys+'_motifs.csv',index=False) To generate results using ArchR and scBasset, you cannot use the marker peaks produced by SnapATAC2. Instead, you need to process the scATAC-seq data directly from the fragments. We recommend consulting the original manuals for further details: (1) `ArchR manual `_ (2) `scBasset manual `_. For ArchR: .. code-block:: r # ----------------------------------------------- # Configuration & Library Setup # ----------------------------------------------- library(ArchR) # Load the ArchR package for scATAC-seq analysis set.seed(42) # Set seed for reproducibility addArchRGenome("hg38") # Add human genome reference (hg38) # ----------------------------------------------- # Define Input Files # ----------------------------------------------- # Specify input file(s) with sample names and paths. inputFiles <- c("PBMCs10k" = "./10XGENOMICS/PBMCs/pbmc10k.tsv.gz") # ----------------------------------------------- # Create Arrow Files # ----------------------------------------------- # Arrow files are a specialized file format used by ArchR to store data. ArrowFiles <- createArrowFiles( inputFiles = inputFiles, # Input file(s) sampleNames = names(inputFiles), # Use the names from the inputFiles vector filterTSS = 4, # Minimum TSS enrichment score; avoid setting too high initially filterFrags = 1000, # Minimum number of fragments per cell addTileMat = TRUE, # Create a tile matrix for downstream analysis addGeneScoreMat = TRUE # Generate a gene score matrix ) # ----------------------------------------------- # Calculate Doublet Scores # ----------------------------------------------- # Identify potential doublets (artificially merged cells) in the dataset. doubScores <- addDoubletScores( input = ArrowFiles, # Use the previously created Arrow files k = 10, # Number of neighbors considered for "pseudo-doublet" estimation knnMethod = "UMAP", # Use UMAP embedding for nearest neighbor search LSIMethod = 1 # Specify LSI method (typically 1) ) # ----------------------------------------------- # Create ArchR Project # ----------------------------------------------- # The ArchR project organizes data and metadata for further analysis. proj <- ArchRProject( ArrowFiles = ArrowFiles, # Input Arrow files outputDirectory = "/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC", # Directory to store project outputs copyArrows = TRUE # Maintain an unaltered copy of the Arrow files ) # ----------------------------------------------- # Filter Out Doublets # ----------------------------------------------- proj <- filterDoublets(ArchRProj = proj) # Remove cells flagged as doublets # ----------------------------------------------- # Dimensionality Reduction and Clustering # ----------------------------------------------- # 1. Perform iterative Latent Semantic Indexing (LSI) for dimensionality reduction proj <- addIterativeLSI(ArchRProj = proj, useMatrix = "TileMatrix", name = "IterativeLSI") # 2. Cluster cells using the reduced dimensions from IterativeLSI proj <- addClusters(input = proj, reducedDims = "IterativeLSI") # 3. Generate a UMAP embedding for visualization proj <- addUMAP(ArchRProj = proj, reducedDims = "IterativeLSI") p2 <- plotEmbedding( ArchRProj = proj, colorBy = "cellColData", # Color cells by metadata (e.g., clusters) name = "Clusters", embedding = "UMAP" ) # ----------------------------------------------- # Data Imputation and Saving Project # ----------------------------------------------- proj <- addImputeWeights(proj) # Impute missing values to smooth data visualization proj <- saveArchRProject(ArchRProj = proj) # Save the current state of the project # ----------------------------------------------- # Peak Calling Preparation # ----------------------------------------------- # Group cells by cell types for coverage estimation and peak calling. proj <- addGroupCoverages(ArchRProj = proj, groupBy = "CellTypes", force = TRUE) # Specify the path to the MACS2 executable (used for peak calling) pathToMacs2 <- "/users/zeyul/.local/bin/macs2" # Update with your actual MACS2 path # Call reproducible peak sets for each cell type group using MACS2 proj <- addReproduciblePeakSet( ArchRProj = proj, groupBy = "CellTypes", pathToMacs2 = pathToMacs2 ) # Save the ArchR project in a new directory saveArchRProject( ArchRProj = proj, outputDirectory = "/projects/dheitjan/BIT/zeyul/BIT/ArchR/Save-Proj2", load = FALSE ) # ----------------------------------------------- # Create Peak Matrix and Identify Marker Peaks # ----------------------------------------------- # Add the peak matrix which quantifies accessibility at each peak. proj2 <- addPeakMatrix(proj) # Identify marker peaks (differentially accessible regions) for each cell type. markersPeaks <- getMarkerFeatures( ArchRProj = proj2, useMatrix = "PeakMatrix", groupBy = "CellTypes", bias = c("TSSEnrichment", "log10(nFrags)"), # Adjust for biases testMethod = "wilcoxon" # Use Wilcoxon rank-sum test ) # Filter marker peaks based on statistical thresholds (FDR and Log2 Fold Change) markerList <- getMarkers( markersPeaks, cutOff = "FDR <= 0.01 & Log2FC >= 2", returnGR = TRUE # Return as GRanges object ) # Save marker peaks and marker list for future reference saveRDS(markersPeaks, "/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC/markersPeaks.rds") saveRDS(markerList, "/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC/markerList.rds") # Save the updated ArchR project saveArchRProject(ArchRProj = proj, load = FALSE) # ----------------------------------------------- # Export Marker Peaks as BED Files # ----------------------------------------------- # Export each cell type's marker peaks as a separate BED file. cell_type_names <- names(markerList) for(i in 1:length(cell_type_names)){ export( markerList[[i]], paste0("/projects/dheitjan/BIT/zeyul/BIT/ArchR/PBMC/MarkerList/", cell_type_names[i], ".bed"), format = "BED" ) } # (Optional) Export an additional genomic regions object 'gr' as a BED file if defined. export(gr, "output.bed", format = "BED") # ----------------------------------------------- # Motif Annotation and Enrichment Analysis # ----------------------------------------------- # Add motif annotations using the cisBP motif database. proj <- addMotifAnnotations(ArchRProj = proj, motifSet = "cisbp", name = "Motif") # Perform motif enrichment analysis on the marker peaks. enrichMotifs <- peakAnnoEnrichment( seMarker = markersPeaks, ArchRProj = proj, peakAnnotation = "Motif", cutOff = "FDR <= 0.05 & Log2FC >= 2" ) # Extract the negative log10 adjusted p-values for motif enrichment. mlog10padj <- assays(enrichMotifs)$mlog10Padj motif_symbols <- rownames(mlog10padj) # Retrieve motif names # Rank motifs for each cell type based on enrichment (from highest to lowest). ranked_table <- lapply(colnames(mlog10padj), function(cell_type) { # Order motifs by mlog10Padj in descending order sorted_indices <- order(mlog10padj[, cell_type], decreasing = TRUE) motif_symbols[sorted_indices] # Return the ranked motif symbols }) names(ranked_table) <- colnames(mlog10padj) # Assign cell type names to the ranked table # Convert the ranked motifs list into a data frame and write to a CSV file. ranked_df <- as.data.frame(ranked_table) write.csv(ranked_df, "ranked_df_PBMCs.csv") For scBasset: .. code-block:: python import numpy as np import pandas as pd import h5py import scipy import scanpy as sc import anndata from scbasset.utils import * # Import utility functions from scbasset # Plotting libraries import seaborn as sns import matplotlib.pyplot as plt import os # ----------------------------- # Define Data Paths # ----------------------------- data_path = './10XGENOMICS/scbasset/' # File containing the 10x multi-modal (RNA + ATAC) dataset in H5 format h5_file = data_path + 'pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5' # BED file with ATAC peak coordinates bed_file = data_path + 'pbmc_granulocyte_sorted_10k_atac_peaks.bed' # ----------------------------- # Load ATAC Peak Information # ----------------------------- # Read the BED file into a DataFrame with columns: 'chr', 'start', 'end' peak = pd.read_csv(bed_file, sep='\t', names=['chr', 'start', 'end']) # ----------------------------- # Load 10x Multi-modal Dataset # ----------------------------- # Read the 10x H5 file; set gex_only=False to load both gene expression and ATAC data ad = sc.read_10x_h5(h5_file, gex_only=False) # ----------------------------- # Separate RNA and ATAC Modalities # ----------------------------- # Subset the AnnData object to get RNA data (features labeled as 'Gene Expression') ad_rna = ad[:, ad.var['feature_types'] == 'Gene Expression'] # Subset to get ATAC data (features labeled as 'Peaks') ad_atac = ad[:, ad.var['feature_types'] == 'Peaks'] # ----------------------------- # Annotate ATAC Data with Peak Coordinates # ----------------------------- # Add chromosome, start, and end coordinates from the BED file to the ATAC AnnData object ad_atac.var['chr'] = peak['chr'].values ad_atac.var['start'] = peak['start'].values ad_atac.var['end'] = peak['end'].values # ----------------------------- # Basic Quality Control and Filtering # ----------------------------- # Apply basic filtering without dropping any cells or genes (min_genes/min_cells set to 0) sc.pp.filter_cells(ad_rna, min_genes=0) sc.pp.filter_genes(ad_rna, min_cells=0) sc.pp.filter_cells(ad_atac, min_genes=0) sc.pp.filter_genes(ad_atac, min_cells=0) # ----------------------------- # Feature Filtering Based on Cell Frequency # ----------------------------- # Define a threshold: features must be present in at least 5% of cells thres = int(ad.shape[0] * 0.05) # Filter RNA data: keep genes expressed in more than the threshold number of cells ad_rna = ad_rna[:, ad_rna.var['n_cells'] > thres] # Filter ATAC data: keep peaks accessible in more than the threshold number of cells ad_atac = ad_atac[:, ad_atac.var['n_cells'] > thres] # ----------------------------- # Filter ATAC Peaks by Chromosome # ----------------------------- # Define a list of standard chromosomes (chr1 to chr22, chrX, chrY) chrs = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY'] # Filter ATAC peaks to keep only those on the standard chromosomes ad_atac = ad_atac[:, ad_atac.var['chr'].isin(chrs)] # ----------------------------- # Save Processed ATAC Data # ----------------------------- # Write the filtered ATAC AnnData object to a file for downstream analysis ad_atac.write(data_path + 'pbmc10k.h5ad') scBasset also includes steps that require executing command-line instructions (these steps require a GPU connection): .. code-block:: bash scbasset_preprocess.py --ad_file ./10XGENOMICS/scbasset/pbmc10k.h5ad --input_fasta ./10XGENOMICS/scbasset/scbasset/hg38.fa --out_path ./10XGENOMICS/scbasset/data/ scbasset_train.py --input_folder ./10XGENOMICS/scbasset/data/ --out_path ./10XGENOMICS/scbasset/PBMC/ Following steps continue in python: .. code-block:: python # ----------------------------- # Import Required Libraries # ----------------------------- import anndata # For handling AnnData objects import tensorflow as tf # For deep learning model operations import numpy as np # For numerical operations import h5py # For handling HDF5 file format import matplotlib.pyplot as plt # For plotting import os # For operating system interfaces (e.g., file/directory operations) import math # For mathematical functions import pickle # For object serialization import seaborn as sns # For statistical data visualization import scipy # For scientific computations import sys # For system-specific parameters and functions import scanpy as sc # For single-cell analysis import pandas as pd # For data manipulation and analysis from scbasset.utils import * # Import custom utility functions from scbasset # ----------------------------- # Define File Paths and Directories # ----------------------------- ad_file = './10XGENOMICS/scbasset/data/pbmc10k.h5ad' trained_model = './10XGENOMICS/scbasset/PBMC/best_model.h5' # Create results directory if it doesn't already exist os.makedirs("./10XGENOMICS/scbasset/results", exist_ok=True) # ----------------------------- # Load the AnnData Object # ----------------------------- # Read the single-cell dataset stored in H5AD format ad = anndata.read_h5ad(ad_file) # ----------------------------- # Build and Load the Pre-trained Model # ----------------------------- # Build the model with a latent dimension of 32 and number of cells from ad.shape[0] # 'show_summary=False' disables printing of the model summary model = make_model(32, ad.shape[0], show_summary=False) # Load pre-trained model weights from the specified file model.load_weights(trained_model) # ----------------------------- # Extract Model Intercept and Filter Cells # ----------------------------- # Retrieve the model intercept using a custom function intercept = get_intercept(model) # Custom function from scbasset.utils # Perform basic cell filtering (here, filtering threshold is 0 so no cells are removed) sc.pp.filter_cells(ad, min_counts=0) # ----------------------------- # Evaluate Model Intercept Correlation (Optional) # ----------------------------- # Create a plot to visualize the correlation between the model intercept # and the log10 of the number of genes detected per cell f, ax = plt.subplots(figsize=(4, 4)) # Compute Pearson correlation between intercept and log10 of 'n_genes' r = scipy.stats.pearsonr(intercept, np.log10(ad.obs['n_genes']))[0] # (You can add a title or annotation to the plot if needed) # ----------------------------- # Get Cell Embedding from the Model # ----------------------------- # Obtain the cell embedding using a custom function (returns a projection) proj = get_cell_embedding(model) # Custom function from scbasset.utils # Save the cell embedding (projection) to a CSV file for later use pd.DataFrame(proj).to_csv('./10XGENOMICS/scbasset/results/projection_atac.csv') # ----------------------------- # Integrate the Projection into the AnnData Object # ----------------------------- # Create a new plot for UMAP visualization f, ax = plt.subplots(figsize=(4, 4)) # Read the saved projection CSV and assign the values to the 'projection' key in ad.obsm ad.obsm['projection'] = pd.read_csv('./10XGENOMICS/scbasset/results/projection_atac.csv', index_col=0).values # ----------------------------- # Compute Neighbors, UMAP, and Leiden Clustering # ----------------------------- # Use the 'projection' computed above as the representation for neighbor graph computation sc.pp.neighbors(ad, use_rep='projection') # Compute UMAP coordinates for visualization sc.tl.umap(ad) # Perform Leiden clustering to identify cell clusters sc.tl.leiden(ad) # Plot the UMAP embedding with cells colored by Leiden cluster assignments sc.pl.umap(ad, color='leiden', ax=ax) # ----------------------------- # Save the Updated AnnData Object # ----------------------------- # Write the AnnData object with the computed embeddings and clustering results to file ad.write('./10XGENOMICS/scbasset/results/pbmc10k_embedded.h5ad') Finally, we need to annotate the cell types and export the cell type level motif scores: .. code-block:: python ad_file = './10XGENOMICS/scbasset/results/pbmc10k_annotated.h5ad' trained_model = './10XGENOMICS/BIT/scbasset/PBMC/best_model.h5' motif_fasta_folder = './10XGENOMICS/scbasset/results/Homo_sapiens_motif_fasta' Motif_files = os.listdir("./10XGENOMICS/scbasset/results/Homo_sapiens_motif_fasta/shuffled_peaks_motifs") TF_names = [x.split(".")[0] for x in Motif_files] ad = anndata.read_h5ad(ad_file) model = make_model(32, ad.shape[0], show_summary=False) model.load_weights(trained_model) nrow=11909 ncol=733 TF_score_df = pd.DataFrame(index=range(nrow),columns=range(ncol)) TF_score_df.columns = TF_names for tf in TF_names: try: scores = motif_score(tf, model, motif_fasta_folder=motif_fasta_folder) TF_score_df[tf] = scores print(f"{tf} finished") except (FileNotFoundError) as e: print(f"Skipping {tf}: {e}") TF_score_df.index = ad.obs.index TF_score_df.to_csv("./10XGENOMICS/scbasset/results/motif_cell_types.csv") Following steps in R. .. code-block:: r scbasset_motif<-read.csv("./10XGENOMICS/scbasset/results/motif_cell_types.csv") motifs_cell_type_score<-data.frame(matrix(nrow=733,ncol=10)) Cell_type_names<-names(tapply(scbasset_motif$LHX5,scbasset_motif$CellType,mean)) colnames(motifs_cell_type_score)<-c("TR_Symbol",Cell_type_names) motifs_cell_type_score$TR_Symbol<-colnames(scbasset_motif)[3:735] tapply(scbasset_motif$LHX5,scbasset_motif$cell_type,mean) for(i in 3:735){ print(i-2) motifs_cell_type_score[i-2,2:10]<-tapply((scbasset_motif[,i]),scbasset_motif$CellType,mean) } motifs_cell_type_score TR_ranks<-data.frame(matrix(nrow=733,ncol=14)) colnames(TR_ranks)<-rep(colnames(motifs_cell_type_score)[2:10],each=2) for(i in 1:9){ TR_ranks[,(2*(i)-1)]<-motifs_cell_type_score$TR_Symbol[order(-motifs_cell_type_score[,i+1])] TR_ranks[,(2*(i))]<-motifs_cell_type_score[,i+1][order(-motifs_cell_type_score[,i+1])] } write.csv(TR_ranks,"./10XGENOMICS/scbasset/results/PBMC_TR_ranks_scbasset.csv",row.names = FALSE) We plot the GO enrichment analysis results of top TRs by each method: .. code-block:: r # ----------------------------- # Define Working Directories # ----------------------------- # Main working directory work_dir <- "/Users/zeyulu/Desktop/Project/BIT/revision_data/comparison/" # ----------------------------- # Read In Results from Different Methods # ----------------------------- # Read BIT and ArchR results (set first column as row names) BIT_result <- read.csv(paste0(work_dir, "BIT.csv"), row.names = 1) ArchR_result <- read.csv(paste0(work_dir, "ArchR.csv"), row.names = 1) # For ArchR results, remove additional annotations by splitting at '_' and keeping the first part for each column for (i in 1:9) { ArchR_result[, i] <- sapply(strsplit(ArchR_result[, i], "_", fixed = TRUE), function(x) { return(x[[1]]) }) } # Read scBasset results and select every other column (columns 1, 3, ..., 17) scbasset_result <- read.csv(paste0(work_dir, "scbasset.csv")) scbasset_result <- scbasset_result[, c(seq(1, 18, 2))] # Read SnapATAC2 results (set first column as row names) SnapATAC2_result <- read.csv(paste0(work_dir, "SnapATAC2.csv"), row.names = 1) # ----------------------------- # Define Common Cell Type Names and Rename Columns # ----------------------------- # List of cell types to use as column names for each method's result Cell_Types <- c("B cell", "CD4+ cell", "CD8+ cell", "Dendritic cell", "gdT cell", "HSPC", "MAIT cell", "Monocyte", "NK cell") # Assign cell type names to the result data frames colnames(BIT_result) <- Cell_Types colnames(ArchR_result) <- Cell_Types colnames(scbasset_result) <- Cell_Types colnames(SnapATAC2_result) <- Cell_Types # ----------------------------- # Extract Top 20 Genes for "B cell" from Each Method # ----------------------------- # Create a list containing the top 20 genes for B cell from each method Top20_list <- list( "BIT" = BIT_result$`B cell`[1:20], "ArchR" = ArchR_result$`B cell`[1:20], "scBasset" = scbasset_result$`B cell`[1:20], "SnapATAC2" = SnapATAC2_result$`B cell`[1:20] ) # Display the Top20_list Top20_list # ----------------------------- # Perform GO Enrichment and Plotting for Each Method # ----------------------------- # Loop over each method in the Top20_list (4 methods) for (i in 1:4) { # Convert gene symbols to ENTREZ IDs using the 'bitr' function BIT_gene_ids <- bitr(Top20_list[[i]], fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db") # Perform Gene Ontology enrichment analysis on the converted ENTREZ IDs (only considering Biological Process: BP) BIT_Results <- enrichGO( gene = BIT_gene_ids$ENTREZID[1:20], OrgDb = "org.Hs.eg.db", ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.05 ) # Extract the top 5 GO terms from the enrichment results GO_BIT_table <- head(BIT_Results, 5) # Create a data frame for plotting with columns for GO term description, gene ratio, p-value, and count GO_PLOT_Table_BIT <- data.frame( GO = GO_BIT_table$Description, GeneRatio = Trans_to_double(GO_BIT_table), # Custom function to transform gene ratio values Pvalue = GO_BIT_table$pvalue, Count = GO_BIT_table$Count ) # Generate a ggplot for the GO enrichment results plot_list[[i]] <- ggplot(GO_PLOT_Table_BIT, aes(x = GeneRatio, y = reorder(GO, -Pvalue), size = Count, color = Pvalue)) + geom_point() + # Customize the color scale for P-values with 4 pretty breaks scale_color_gradient( low = "red", high = "blue", limits = c(min(GO_BIT_table$pvalue), max(GO_BIT_table$pvalue)), breaks = scales::breaks_pretty(n = 4), guide = guide_colorbar( order = 1, title.position = "top", barheight = unit(1.6, "cm"), ticks = FALSE ) ) + # Customize the size scale for Count with integer breaks scale_size_continuous( breaks = integer_breaks(GO_PLOT_Table_BIT$Count, n = 4), range = c(2, 5), guide = guide_legend( order = 2, title.position = "top", override.aes = list(color = "black") ) ) + theme_bw() + labs(y = "GO", x = "Gene Ratio", color = "P-Value", size = "Count") + theme( text = element_text(size = 12), legend.position = "right", legend.box = "vertical", legend.spacing.y = unit(0.1, "cm"), legend.margin = margin(0, 0, 0, 0), axis.text.y = element_text(color = "black") ) + xlim(c(0.0, 0.6)) } # ----------------------------- # Combine and Display Plots # ----------------------------- # Combine the four individual plots into one layout with 2 columns and shared axis titles p_combined <- plot_list[[1]] + plot_list[[2]] + plot_list[[3]] + plot_list[[4]] + plot_layout(ncol = 1, axis_titles = "collect") # Print the combined plot to display the GO enrichment results for each method print(p_combined) .. image:: ../images/Examples/singlecell/Pic5.png