TCGA Cancer-type-specific TRs

We credit to the data author:

  • Corces, M. R. et al. The chromatin accessibility landscape of primary human cancers. Science 362 (2018). 10.1126/science.aav1898

The original data was stored in a .xlsx format, which was first separated and converted to .bed format. The converted data can be retrieved from Zenodo.

../_images/Pic13.png

We apply BIT to each dataset, and plot the top 10 TRs along with their 95% credible intervals.

work_dir<-"./Cancer/Data/"
work_files<-list.files(work_dir)
output_dir<-"./Cancer/bit"

dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

for(i in seq_along(work_files)){
  BIT(paste0(work_dir,work_files[i]),output_dir,plot_bar=FALSE,genome="hg38")
}
DATA_DIR<-"./Cancer/bit"

cancer_files<-list.files(DATA_DIR)
cancer_files<-cancer_files[c(2,1,3,5,4,6,7,8,9)]
cancer_names<-sapply(strsplit(cancer_files,".",fixed=TRUE),function(x){return(x[[1]])})

for(i in 1:9){
  table<-read.csv(paste0(DATA_DIR,cancer_files[i]))
  data<-data.frame(TR=table$TR[1:10],
                   Score=table$BIT_score[1:10],
                   Lower=table$BIT_score_lower[1:10],
                   Upper=table$BIT_score_upper[1:10],
                   group=cancer_names[i])
  data$TR<-factor(data$TR,levels=rev(data$TR))
  plot_list[[i]]<-ggplot(data, aes(x = TR, y = Score)) +
    geom_bar(stat = "identity", fill = colors[i], color = "black") +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.4) +
    coord_flip() +
    theme_bw() + scale_y_continuous(limits = c(0, max(data$Upper)+0.005),expand = c(0,0)) + facet_grid(.~group)+
    labs(
      title = "",
      x = "Transcriptional Regulators",
      y = "BIT Score"
    )+theme(title=element_text(size=9),plot.margin=unit(c(0,0.1,0,0.1),"cm"),axis.text.y = element_text(size=10,color="black"),
            axis.text.x = element_text(size=10,color="black"),
            axis.title.x = element_text(size=14,color="black"),
            axis.title.y = element_text(size=14,color="black"),
            strip.background = element_rect(fill="#DBD1B6"),
            strip.text = element_text(size=12, colour="black",margin=ggplot2::margin(1,1,1,1,"mm")))
}

p_combined<-plot_list[[1]]+plot_list[[2]]+ plot_list[[3]] +
  plot_list[[4]]+plot_list[[5]]+plot_list[[6]] +
  plot_list[[7]]+plot_list[[8]]+plot_list[[9]]+ plot_layout(ncol=3,guides="collect",axis_titles = "collect")
print(p_combined)
../_images/Pic22.png

We can generate the jaccard similarity table as shown in Figure. 3C

library(BIT)
work_dir<-"./Cancer/Data/"
work_dir_bit_results<-"./Cancer/bit/"
work_files<-list.files(work_dir)
work_files_bit_results<-list.files(work_dir_bit_results)
work_files_bit_results

#reorder the cancer-types
work_files<-work_files[c(2,1,3,5,4,6,7,8,9)]
cancer_names<-cancer_names[c(2,1,3,5,4,6,7,8,9)]
work_files_bit_results<-work_files_bit_results[c(2,1,3,5,4,6,7,8,9)]

#Define jaccard function
jaccard<-function(v1,v2){
  return(length(intersect(v1,v2))/length(union(v1,v2)))
}

#initialize jaccard table
jac_table<-matrix(nrow=9,ncol=9)
rownames(jac_table)<-cancer_names
colnames(jac_table)<-cancer_names

#for upper-triangle fill by Jaccard of top 10 TRs BIT identified in each cancer type.
#for bottom-triangle fill by Jaccard of binarized cancer-type-specific accessible regions.

for(i in 1:9){
  for(j in 1:9){
    v1<-import_input_regions(paste0(work_dir,work_files[i]),bin_width = 1000, genome = "hg38")
    v2<-import_input_regions(paste0(work_dir,work_files[j]),bin_width = 1000, genome = "hg38")

    table1<-read.csv(paste0(work_dir_bit_results,work_files_bit_results[i]))
    table2<-read.csv(paste0(work_dir_bit_results,work_files_bit_results[j]))

    if(i>j){
      jac_table[i,j]<-jaccard(v1,v2)
    }else if(i<j){
      jac_table[i,j]<-jaccard(table1$TR[1:10],table2$TR[1:10])
    }
  }
}

jac_table
Jaccard table

Breast

Bladder

Colon

Lung

Liver

Mesothelium

Prostate

Squamous

Testicular

Breast

NA

0.176470588235294

0

0

0

0

0.25

0.0526315789473684

0

Bladder

0.0537453812562983

NA

0

0

0

0

0

0.25

0

Colon

0.00960080848913593

0.0410479294941446

NA

0

0.176470588235294

0

0.0526315789473684

0

0

Lung

0.0207134637514384

0.020222793487575

0.0118102219507229

NA

0

0

0.111111111111111

0

0

Liver

0.0242196651278164

0.0141432282479715

0.0576306357291077

0.0116044997039668

NA

0

0.0526315789473684

0

0

Mesothelium

0.00677551927135399

0.0069406025247699

0.004833120554441

0.0125595495885665

0.0214275213172596

NA

0

0

0

Prostate

0.131370509640771

0.0441176470588235

0.0385856239514776

0.0118838396518537

0.028598536188669

0.00707213578500707

NA

0

0

Squamous

0.00891668114179839

0.149880952380952

0.0131060354242842

0.015702416342709

0.00359379052276685

0.0113628942728401

0.01171875

NA

0.0526315789473684

Testicular

0.00936167136557507

0.0174708250870129

0.0130147012028257

0.0119351002378525

0.0102943152802043

0.0113640786145164

0.0076112412177986

0.0204625257295072

NA

We use the following code to generate the heatmap plot:

create_mask <- function(mat, direction = c("upper", "lower")) {
  direction <- match.arg(direction)
  n <- ncol(mat)
  dummy <- matrix(NA, n, n)  # Create an n x n matrix for dimensions
  mask <- if (direction == "upper") {
    upper.tri(dummy)
  } else {
    lower.tri(dummy)
  }
  mask[!mask] <- NA
  mask
}

# Generate a heatmap matrix by masking the table based on the specified triangle
plot_heatmap <- function(tbl, direction = "upper") {
  as.matrix(tbl) * create_mask(tbl, direction)
}

# Draw rectangles with overlaid formatted text for cells in the specified triangle
draw_rect <- function(n, table, direction = c("upper", "lower")) {
  direction <- match.arg(direction)
  condition <- if (direction == "upper") {
    function(i, j) j > i
  } else {
    function(i, j) j < i
  }
  for (i in 1:n) {
    for (j in 1:n) {
      if (condition(i, j)) {
        rect(i - 0.5, j - 0.5, i + 0.5, j + 0.5, lwd = 1)
        formatted <- sprintf("%.3f", table[i, j])
        number <- sub("^0+", "", formatted)
        text(i, j, number, col = "black", cex = 1, font = 2)
      }
    }
  }
}

# Plotting parameters and drawing the heatmaps with overlayed annotations
collab <- "black"
par(mfrow = c(1, 1), oma = c(2, 2, 2, 2), mar = c(6, 6, 6, 5))

# Plot the upper triangle heatmap
image(1:9, 1:9, plot_heatmap(jac_table, "upper"), ylab = "", axes = FALSE,
      col = colorRampPalette(c("#FFFFFF", "#EE4431"))(20))

# Overlay the lower triangle heatmap
image(1:9, 1:9, plot_heatmap(jac_table, "lower"), ylab = "", axes = FALSE, add = TRUE,
      col = colorRampPalette(c("#FFFFFF", "#4C8BC0"))(20))

# Draw rectangles and add text for both triangles
draw_rect(9, jac_table, "upper")
draw_rect(9, jac_table, "lower")

# Add axis labels
axis(3, at = 1:9, labels = colnames(jac_table)[1:9], las = 2, lty = 0, tck = 0,
     cex.axis = 1, tcl = -0.2, font.axis = 2, col.axis = collab)
axis(2, at = 1:9, labels = colnames(jac_table)[1:9], las = 1, lty = 0, tck = 0,
     cex.axis = 1, tcl = -0.2, font.axis = 2, col.axis = collab)
../_images/Pic33.png

To contrast with the DepMap CRISPR/cas9 TR knock-out screen data, we need extra datasets CRISPRGeneEffect.csv and Model.csv, which are now available on updated Zenodo data, in the folder of /DepMap/.

crispr_gene_effect<-as.data.frame(fread("./DepMap/CRISPRGeneEffect.csv"))
colnames(crispr_gene_effect)<-sapply(strsplit(colnames(crispr_gene_effect)," ",fixed=TRUE),function(x){return(x[[1]])})
Model<-read.csv("./DepMap/CRISPR/Model.csv")

#Filter out the cell lines belong to specific cancer type
cluster_code<-list("Breast"=c("BRCA"),
                   "Bladder"=c("BLCA"),
                   "Colon"=c("COAD"),
                   "Liver"=c("HCC"),
                   "Lung"=c("LUAD"),
                   "Mesothelium"=c("PLBMESO","PLEMESO","PLMESO"),
                   "Prostate"=c("PRAD"),
                   "Squamous"=c("HNSC","LUSC","ESCA","CESC"),
                   "Testis"=c("EMBCA","TT")
)

model_ids_func<-function(x){
  model_ids<-c()
  for(i in seq_along(x)){
    model_ids<-c(model_ids,Model$ModelID[which(Model$OncotreeCode==x[i])])
  }
  return(model_ids)
}

Model_ids<-lapply(cluster_code, model_ids_func)

Once we have the CRISPR/cas9 data, we can first replicate the Figure. 4D to show the total number of TRs ranked to top 10 / top 50 by putting a thrshold on Chronos score.

stepss<-seq(-2,0,0.2)
length(stepss)
start_color<-"#1663A9"
end_color<-"#FFFFFF"
color_palette <- colorRampPalette(c(start_color, end_color))
colors<-color_palette(length(stepss)+1)

mini_scores<-c()
method_tf<-bit_table

TR_TOP_10_thres_TABLE<-data.frame(matrix(ncol=9,nrow=11))
TR_TOP_50_thres_TABLE<-data.frame(matrix(ncol=9,nrow=11))

for(i in 1:9){
  model_ids<-Model_ids[[i]]
  cancer_name<-names(cluster_code)[i]
  subtable_10<-crispr_gene_effect[which(crispr_gene_effect$ModelID%in%model_ids),method_tf[which(method_tf[1:10,cancer_name]%in%colnames(crispr_gene_effect)),cancer_name]]
  subtable_50<-crispr_gene_effect[which(crispr_gene_effect$ModelID%in%model_ids),method_tf[which(method_tf[1:50,cancer_name]%in%colnames(crispr_gene_effect)),cancer_name]]
  vec_min_10<-c()
  for(k in 1:ncol(subtable_10)){
    vec_min_10<-c(c(vec_min_10,min(subtable_10[,k])))
  }
  vec_min_50<-c()
  for(k in 1:ncol(subtable_50)){
    vec_min_50<-c(c(vec_min_50,min(subtable_50[,k])))
  }
  for(j in 1:length(stepss)){
    TR_TOP_10_thres_TABLE[j,i]<-sum(vec_min_10<=(stepss[j]),na.rm=TRUE)
    TR_TOP_50_thres_TABLE[j,i]<-sum(vec_min_50<=(stepss[j]),na.rm=TRUE)
  }
}

#10
cumsums<-rowSums(TR_TOP_10_thres_TABLE)
cumsums<-c(cumsums,90)
cumsums
bp1<-barplot(cumsums,space=0,col=colors,axes=FALSE,ylim=c(0,100),xaxt="n")
box(bty="l")
text(bp1,cumsums+5,cumsums,cex=0.9,font=1)
tick_positions<-bp+0.5
axis(1,at=tick_positions[1:11],format(stepss,nsmall=1),las=3,cex=1.3)
axis(2,at=c(0,10,50,90),c(0,10,50,90),las=2,cex.axis=1)
abline(v=6,xpd=FALSE,lty=2)
abline(v=9,xpd=FALSE,lty=2)
title(main="Top 10 TRs",cex.main=1,font.main=1,line=1)

#50
cumsums<-c(rowSums(TR_TOP_50_thres_TABLE),450)
bp2<-barplot(cumsums,space=0,col=colors,axes=FALSE,ylim=c(0,500),bty="l")
text(bp2,cumsums+25,cumsums,cex=0.9,font=1)
box(bty="l")
tick_positions<-bp+0.5
axis(1,at=tick_positions[1:11],format(stepss,nsmall=1),las=3,cex=1.3)
axis(2,at=c(0,50,250,450),c(0,50,250,450),las=2,cex=1.3)
abline(v=6,xpd=FALSE,lty=2)
abline(v=9,xpd=FALSE,lty=2)
title(main="Top 50 TRs",cex.main=1,font.main=1,line=1)
title(xlab = "Chronos score threshold",ylab="Number of BIT identified TRs with\nscores below the threshold",outer = TRUE,line=1)
../_images/Pic42.png

Next, we apply state-of-the-art methods to analyze the generated *.bed files and extract outputs from each method. The following tools are used:

BART2

HOMER

WhichTF

ChIP-Atlas

i-cisTarget

We merged the outputs from each cancer-type to get the merged output table for each method.

#################Cancer
work_dir_cancer<-"./Cancer"

bart2_table<-read.csv(paste0(work_dir,"bart2_table.csv"))
chip_atlas_table<-read.csv(paste0(work_dir,"chip_atlas_table.csv"))
bit_table<-read.csv(paste0(work_dir,"bit_table.csv"))
icistarget_table<-read.csv(paste0(work_dir,"icistarget_table.csv"))
whichtf_files<-read.csv(paste0(work_dir,"whichtf_table.csv"))
homer_table<-read.csv(paste0(work_dir,"homer_table.csv"))

table_lists<-list("BIT"=bit_table,
                  "BART2"=bart2_table,
                  "ChIP-Atlas"=chip_atlas_table,
                  "HOMER"=homer_table,
                  "i-cisTarget"=icistarget_table,
                  "WhichTF"=whichtf_table)

We next contrat the results of BIT with the results from other state-of-the-art methods based on Chronos score threshold \(-0.4\):

TR_TOP_10_TABLE<-data.frame(matrix(ncol=9,nrow=6))
TR_TOP_50_TABLE<-data.frame(matrix(ncol=9,nrow=6))
rownames(TR_TOP_10_TABLE)<-names(table_lists)
rownames(TR_TOP_50_TABLE)<-names(table_lists)
colnames(TR_TOP_10_TABLE)<-names(cluster_code)
colnames(TR_TOP_50_TABLE)<-names(cluster_code)

for(i in 1:9){
  model_ids<-Model_ids[[i]]
  cancer_name<-names(cluster_code)[i]
  for(j in 1:6){
    method_tf<-table_lists[[j]]
    subtable_10<-crispr_gene_effect[which(crispr_gene_effect$ModelID%in%model_ids),method_tf[which(method_tf[1:10,cancer_name]%in%colnames(crispr_gene_effect)),cancer_name]]
    subtable_50<-crispr_gene_effect[which(crispr_gene_effect$ModelID%in%model_ids),method_tf[which(method_tf[1:50,cancer_name]%in%colnames(crispr_gene_effect)),cancer_name]]
    vec_min_10<-c()
    for(k in 1:ncol(subtable_10)){
      vec_min_10<-c(c(vec_min_10,min(subtable_10[,k])))
    }
    vec_min_50<-c()
    for(k in 1:ncol(subtable_50)){
      vec_min_50<-c(c(vec_min_50,min(subtable_50[,k])))
    }
    TR_TOP_10_TABLE[j,i]<-sum(vec_min_10<=(-0.4),na.rm=TRUE)
    TR_TOP_50_TABLE[j,i]<-sum(vec_min_50<=(-0.4),na.rm=TRUE)
  }
}

rowSums(TR_TOP_10_TABLE)
rowSums(TR_TOP_50_TABLE)
TR_TOP_50_TABLE
TR_TOP_10_TABLE

#> TR_TOP_50_TABLE
#            Breast Bladder Colon Liver Lung Mesothelium Prostate Squamous Testis      Method
#BIT             27      25    33    24   31          24       21       30     17         BIT
#BART2           24      24    27    17   24          18       20       22     14       BART2
#ChIP-Atlas      22      16    21    18   22           9       14       20      9  ChIP-Atlas
#HOMER           15      20    25    20   16          16       17       18      7       HOMER
#i-cisTarget     17      18    31    14    8          25       17       15     11 i-cisTarget
#WhichTF         15      16    17    13   19          20       13       15      6     WhichTF
#> TR_TOP_10_TABLE
#            Breast Bladder Colon Liver Lung Mesothelium Prostate Squamous Testis      Method
#BIT              6       8     5     3    7           6        6        7      4         BIT
#BART2            5       3     5     3    4           3        8        3      2       BART2
#ChIP-Atlas       6       5     3     3    5           3        5        4      0  ChIP-Atlas
#HOMER            5       5     5     2    4           3        6        3      2       HOMER
#i-cisTarget      4       8     7     4    3           6        6        3      3 i-cisTarget
#WhichTF          4       5     4     3    4           3        2        3      1     WhichTF

Next we plot the stacked barplot to show the total number of functionally essential TRs identified by each method:

TR_TOP_10_TABLE$Method<-rownames(TR_TOP_10_TABLE)
df_long <- pivot_longer(TR_TOP_10_TABLE,
                        cols = -Method,    # all columns except 'method'
                        names_to = "cancer",
                        values_to = "count")

colors<-c("#9B3A4D","#FC8002","#394A92","#70A0AC","#D2352C","#8E549E","#BAAFD1","#497EB2","#ADDB88")

df_long$Method<-factor(df_long$Method,levels=c("BIT","i-cisTarget","BART2","HOMER","ChIP-Atlas","WhichTF"))
df_long$cancer<-factor(df_long$cancer,levels=c("Breast","Bladder","Colon","Lung","Liver","Mesothelium","Prostate","Squamous","Testis"))
df_totals <- df_long %>%
  group_by(Method) %>%
  summarise(total = sum(count))

p1<-ggplot(df_long, aes(x = Method, y = count, fill = cancer)) +
  # Create the stacked bars with a black outline for each segment
  geom_bar(stat = "identity", color = "black", position = "stack") +
  # Add text labels for the count of each cancer in the stacked segments
  geom_text(aes(label = count),
            position = position_stack(vjust = 0.5),
            color = "white") +
  # Add the total sum above each bar in bold
  geom_text(data = df_totals,
            aes(x = Method, y = total, label = total),
            inherit.aes = FALSE,
            vjust = -0.5,  # Adjust this value as needed for spacing
            fontface = "bold") +
  # Remove extra space from the y-axis (starting at 0)
  scale_y_continuous(expand = c(0, 0),limits=c(0,60)) +
  # Manually assign specific colors for each cancer type.
  scale_fill_manual(values = c("Breast"      = "#9B3A4D",
                               "Bladder"     = "#FC8002",
                               "Colon"       = "#394A92",
                               "Liver"       = "#D2352C",
                               "Lung"        = "#70A0AC",
                               "Mesothelium" = "#8E549E",
                               "Prostate"    = "#BAAFD1",
                               "Squamous"    = "#497EB2",
                               "Testis"      = "#ADDB88")) +
  labs(x = "Method",
       y = "Count",
       title = "") +
  theme_bw()+theme(axis.text.x=element_text(angle=45,color="black",size=10,hjust=1),
                   axis.text.y=element_text(size=10,color="black"),
                   axis.title.x=element_text(size=12,color="black"),
                   axis.title.y=element_text(size=12,color="black"))

TR_TOP_50_TABLE$Method<-rownames(TR_TOP_50_TABLE)

df_long <- pivot_longer(TR_TOP_50_TABLE,
                        cols = -Method,    # all columns except 'method'
                        names_to = "cancer",
                        values_to = "count")

colors<-c("#9B3A4D","#FC8002","#394A92","#70A0AC","#D2352C","#8E549E","#BAAFD1","#497EB2","#ADDB88")

df_long$Method<-factor(df_long$Method,levels=c("BIT","i-cisTarget","BART2","HOMER","ChIP-Atlas","WhichTF"))
df_long$cancer<-factor(df_long$cancer,levels=c("Breast","Bladder","Colon","Lung","Liver","Mesothelium","Prostate","Squamous","Testis"))
df_totals <- df_long %>%
  group_by(Method) %>%
  summarise(total = sum(count))

p2<-ggplot(df_long, aes(x = Method, y = count, fill = cancer)) +
  # Create the stacked bars with a black outline for each segment
  geom_bar(stat = "identity", color = "black", position = "stack") +
  # Add text labels for the count of each cancer in the stacked segments
  geom_text(aes(label = count),
            position = position_stack(vjust = 0.5),
            color = "white") +
  # Add the total sum above each bar in bold
  geom_text(data = df_totals,
            aes(x = Method, y = total, label = total),
            inherit.aes = FALSE,
            vjust = -0.5,  # Adjust this value as needed for spacing
            fontface = "bold") +
  # Remove extra space from the y-axis (starting at 0)
  scale_y_continuous(expand = c(0, 0),limits=c(0,250)) +
  # Manually assign specific colors for each cancer type.
  scale_fill_manual(values = c("Breast"      = "#9B3A4D",
                               "Bladder"     = "#FC8002",
                               "Colon"       = "#394A92",
                               "Liver"       = "#D2352C",
                               "Lung"        = "#70A0AC",
                               "Mesothelium" = "#8E549E",
                               "Prostate"    = "#BAAFD1",
                               "Squamous"    = "#497EB2",
                               "Testis"      = "#ADDB88")) +
  labs(x = "Method",
       y = "Count",
       title = "") +
  theme_bw()+theme(axis.text.x=element_text(angle=45,color="black",size=10,hjust=1),
                   axis.text.y=element_text(size=10,color="black"),
                   axis.title.x=element_text(size=12,color="black"),
                   axis.title.y=element_text(size=12,color="black"))
library(patchwork)

plot_comb<-p1 + p2 + plot_layout(ncol=2,guides="collect",axes="collect")
print(plot_comb)
../_images/Pic52.png