2 Figure 1

Figure 1. Overview of the study cohort.
A. Schematic overview of multi-omics data obtained from our cohort.Ā Targeted exome sequencing (TES) and whole exome sequencing (WES) were performed on 362 and 11 AML samples, respectively. The analysis focuses on 35 genes that are most frequently mutated in AML.
B. Venn diagram depicting the number of samples within each data layer.
C. The French-American-British (FAB) subtyping, World Health Organization (WHO) classification, and multi-omics information of 374 AML patients.
D. The positive rate of common and other rare gene fusion events in different age groups. Considering the sample size and the balance of data, all patients were categorized into three age groups, namely, 15-44 years, 45-59 years, and 60-80 years.
E-F. Uniform Manifold Approximation and Projection (UMAP) of AML samples using 10,016 measured proteins (E) and 39,348 measured phosphosites (F), with colors denoting different WHO subtypes.
G. Gene set variation analysis (GSVA) scores for proteomics based on differentially regulated gene ontology (GO) pathways in WHO subtypes (left heatmap). Spearman correlation coefficients of the proteome and the transcriptome for the annotated pathways (right heatmap, asterisks mark significantly correlated pathways with P < 0.05, Benjamini-Hochberg-corrected).
H.Heatmap depicting kinome profiling with Z scores calculated by kinase-substrate enrichment analysis (KSEA) algorithm.
2.1 (B) Data layer
Venn diagram depicting the number of samples within each data layer.
rm(list=ls())
library(VennDiagram)
#-----------------------------------------------------------------------------
#Step 1: Load multi-omics data and sample information
#-----------------------------------------------------------------------------
AML_data <- readRDS("Input/AML_data.rds")
sampleinfo<-AML_data$sampleinfo
all.sample.name<-AML_data$sampleinfo$DIA_ID
prot.sample.name<-names(AML_data$CorrectProteomics)[names(AML_data$CorrectProteomics) %in% all.sample.name]
phospho.sample.name<-names(AML_data$Phosphoproteomics)[names(AML_data$Phosphoproteomics) %in% all.sample.name]
TES_WES.sample.name<-names(AML_data$TES_WES)[names(AML_data$TES_WES) %in% all.sample.name][-317]
RNA_seq.sample.name<-names(AML_data$RNA_seq)[names(AML_data$RNA_seq) %in% all.sample.name]
#-----------------------------------------------------------------------------
#Step 2: Venn Diagram
#-----------------------------------------------------------------------------
venn.plot <- venn.diagram(
x = list("Proteomics"=prot.sample.name,"RNA_seq"=RNA_seq.sample.name,"TES_WES"=TES_WES.sample.name,"Phosphoproteomics"=phospho.sample.name),
filename = NULL,
col = "transparent",
fill = c("#8ebfd5", "#ee8076", "#79c06c", "#f7bc6a"),
alpha = 0.50,
label.col = c("orange", "white", "darkorchid4", "white",
"white", "white", "white", "white", "darkblue", "white",
"white", "white", "white", "darkgreen", "white"),
cex = 1.5,
cat.col = c("black", "black", "black", "black"),
cat.cex = 1.5,
cat.pos = 0,
cat.dist = 0.07,
cat.fontfamily = "serif",
rotation.degree = 0,
margin = 0.2
)
pdf("Output/Figure1/Figure1B.pdf")
grid.draw(venn.plot)
2.2 (C) Data overview
The French-American-British (FAB) subtyping, World Health Organization (WHO) classification, and multi-omics information of 374 AML patients.
rm(list=ls())
library(readxl)
#-----------------------------------------------------------------------------
#Step 1: Load multi-omics data and sample information
#-----------------------------------------------------------------------------
AML_data <- readRDS("Input/AML_data.rds")
sampleinfo<-AML_data$sampleinfo
all.sample.name<-AML_data$sampleinfo$DIA_ID
prot.sample.name<-names(AML_data$CorrectProteomics)[names(AML_data$CorrectProteomics) %in% all.sample.name]
phospho.sample.name<-names(AML_data$Phosphoproteomics)[names(AML_data$Phosphoproteomics) %in% all.sample.name]
TES_WES.sample.name<-names(AML_data$TES_WES)[names(AML_data$TES_WES) %in% all.sample.name][-317]
RNA_seq.sample.name<-names(AML_data$RNA_seq)[names(AML_data$RNA_seq) %in% all.sample.name]
all.ID<- matrix(NA, nrow = length(all.sample.name), ncol = 4, dimnames = list(all.sample.name, c("Proteomics", "Phosphoproteomics", "RNA_seq","TES_WES")))
all.ID[match(prot.sample.name, all.sample.name), "Proteomics"] <- prot.sample.name
all.ID[match(phospho.sample.name, all.sample.name), "Phosphoproteomics"] <- phospho.sample.name
all.ID[match(RNA_seq.sample.name, all.sample.name), "RNA_seq"] <- RNA_seq.sample.name
all.ID[match(TES_WES.sample.name, all.sample.name), "TES_WES"] <- TES_WES.sample.name
all.ID[!is.na(all.ID)] <- 1
all.ID[is.na(all.ID)] <- 0
all.ID<-as.data.frame(all.ID)
all.ID.plot<-data.frame(apply(all.ID, 2, function(x) as.numeric(x)))
rownames(all.ID.plot)<-rownames(all.ID)
all.info<-sampleinfo[,c("DIA_ID", "WHO_classification_2022", "Diagnosis" )]
names(all.info)[2:3]<-c("WHO","FAB")
#-----------------------------------------------------------------------------
#Step 2: Heatmap
#-----------------------------------------------------------------------------
##annotation for heatmap
annotation_col <- data.frame(
FAB=factor(all.info$FAB),
WHO = factor(all.info$WHO),
row.names = all.info$DIA_ID
)
annotation_col <- annotation_col[order(annotation_col$WHO), ]
annotation_col$WHO <- factor(annotation_col$WHO, levels = c(
"PML-RARA", "RUNX1::RUNX1T1", "CBFB::MYH11", "KMT2A rearrangement",
"NUP98 rearrangement", "DEK::NUP214", "NPM1", "CEBPA",
"Myelodysplasia-related", "defined by differentiation"
))
WHO_color<- c("#999999" , "#e7dad2" , "#96c37d", "#8ecfc9","#FF7F00", "#c497b2","#ffbe7a", "#82b0d2" ,"#beb8dc", "#e984a2")
names(WHO_color) <- c(
"PML-RARA","RUNX1::RUNX1T1","CBFB::MYH11","KMT2A rearrangement","NUP98 rearrangement", "DEK::NUP214","NPM1","CEBPA","Myelodysplasia-related", "defined by differentiation"
)
FAB_color <- c("#DF9E9B","#99BADF","#D8E7CA","#99CDCE","#999ACD","#FFD0E9")
names(FAB_color) <- c("AML","M1","M2","M3","M4","M5")
ann_colors <- list(WHO = WHO_color,FAB=FAB_color)
color_series<-c("grey","#D6604D")
all.ID.plot<-all.ID.plot[row.names(annotation_col),]
plot<- pheatmap::pheatmap(
t(all.ID.plot),
scale = "none",
annotation_col = annotation_col,
annotation_colors = ann_colors,
color = color_series,
fontsize_col = 1,
cluster_rows = F,
cluster_cols = F,
show_rownames = T,
show_colnames = F,
fontsize = 10,
cellwidth = 2,
cellheight = 10,
filename =
"Output/Figure1/Figure1C.pdf",
width = 15,
height = 8
)
2.3 (D) Stacked bar chart
The positive rate of common and other rare gene fusion events in different age groups. Considering the sample size and the balance of data, all patients were categorized into three age groups, namely, 15-44 years, 45-59 years, and 60-80 years.
rm(list=ls())
library(plyr)
#-----------------------------------------------------------------------------
#Step 1: Load sample information
#-----------------------------------------------------------------------------
AML_data <- readRDS("Input/AML_data.rds")
data<-as.data.frame(AML_data$sampleinfo)
data$Age_group <- ifelse(data$Age < 45,"15-44","45-59")
data$Age_group[data$Age >= 60] <- "60-80"
data$Age_group <- factor(data$Age_group, levels = c("15-44","45-59","60-80"))
Age_WHO <- data.frame(table(data$Age_group,data$WHO_classification_2022))
Age_WHO <- ddply(Age_WHO,.(Var1),transform,percent=Freq/sum(Freq)*100)
Age_WHO$label = paste0(sprintf("%.1f", Age_WHO$percent), "%")
table(Age_WHO$Var2)
Age_WHO$Var2 <- factor(Age_WHO$Var2, levels = c("PML-RARA","RUNX1::RUNX1T1","CBFB::MYH11",
"KMT2A rearrangement","NUP98 rearrangement",
"DEK::NUP214","NPM1","CEBPA",
"Myelodysplasia-related",
"defined by differentiation"))
#-----------------------------------------------------------------------------
#Step 2: Stacked bar chart
#-----------------------------------------------------------------------------
ggplot(Age_WHO,aes(Var1,percent,fill=Var2))+
geom_bar(stat="identity",position = position_stack())+
labs(x="Age groups",y="Percentage",fill="WHO classification") +
scale_fill_manual(values = c("#999999" , "#e7dad2" , "#96c37d", "#8ecfc9","#FF7F00", "#c497b2","#ffbe7a", "#82b0d2" ,"#beb8dc", "#e984a2"))+
scale_y_continuous(labels = scales::percent_format(scale = 1))+
theme_classic()+
theme(legend.position = "right",
legend.text = element_text(size=10),
axis.text = element_text(size=10),
axis.title = element_text(size=10))+
geom_text(aes(label = sprintf("%1.1f%%", percent)),
stat = "identity",
position = position_stack(vjust = 0.5),
size = 3)
2.4 (E) UMAP for proteomics data
Uniform Manifold Approximation and Projection (UMAP) of AML samples using 10,017 measured proteins.
rm(list = ls())
library(readxl)
library(stringr)
library(umap)
library(RColorBrewer)
library(ggplot2)
#--------------------------------------------------------------------------------
#Step 1: Load proteomics data
#--------------------------------------------------------------------------------
AML_data<-readRDS("Input/AML_data.rds")
prot<-AML_data$CorrectProteomics
info<-AML_data$sampleinfo
prot1<-prot[,-c(1:2)]
prot2<-data.frame(t(prot1))
prot2$DIA_ID<-row.names(prot2)
prot3<-merge(info,prot2,by="DIA_ID",all = F)
prot_plot<-data.frame(prot3[,-c(1:198)])
prot_plot$label<-prot3$WHO_classification_2022
prot_plot<-prot_plot[,-c(1,2,4)]
row.names(prot_plot)<-prot3$DIA_ID
#-------------------------------------------------------------------------------
# Step 2: UMAP
#-------------------------------------------------------------------------------
drawUMAP <- function(M1,ptColors, strTitle="UMAP",rowNormalization=T,colNormalization=F){
if(!'label' %in% colnames(M1)){
print('A column with named label must existed in data frame')
return(NULL)
}
tmp <- M1[,colnames(M1)!='label']
if(rowNormalization){
tmp <- data.frame(t(apply(tmp,1,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)})),stringsAsFactors=F)
rownames(tmp) <- rownames(M1)
}
if(colNormalization){ tmp <- apply(tmp,2,function(v){(v-mean(v,na.rm=T))/sd(v,na.rm=T)}) }
tmp[is.na(tmp)] <- 0
obj = umap(d=tmp,method='naive')
clnames <- colnames(tmp)
df1 <- data.frame(obj$layout)
df1$label <- M1$label
colnames(df1) <- c('X','Y','label')
p <- ggplot(df1, aes(x=X, y=Y, colour=label)) + geom_point(size=4)
p <- p + theme( panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(size=16),
axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5),
panel.background = element_blank())
p <- p + labs(title =strTitle)
p <- p + scale_colour_manual(values=ptColors)
p
}
color_order<-c(
"PML-RARA","RUNX1::RUNX1T1","CBFB::MYH11","KMT2A rearrangement","NUP98 rearrangement", "DEK::NUP214","NPM1","CEBPA","Myelodysplasia-related", "defined by differentiation"
)
prot_plot$label<-factor(prot_plot$label,levels = color_order)
color<- c("#999999" , "#e7dad2" , "#96c37d", "#8ecfc9","#FF7F00", "#c497b2","#ffbe7a", "#82b0d2" ,"#beb8dc", "#e984a2")
names(color) <- color_order
set.seed(2024)
prot_umap<-drawUMAP(prot_plot,rowNormalization = F,colNormalization = T,ptColors = color)
ggsave("Output/Figure1/Figure1E.pdf",plot =prot_umap,device = NULL )
2.5 (F) UMAP for phosphoproteomics data
Uniform Manifold Approximation and Projection (UMAP) of AML samples using 47,457 measured phosphorylated peptides.
#--------------------------------------------------------------------------------
#Step 1: Load phosphoproteomics data
#--------------------------------------------------------------------------------
AML_data<-readRDS("Input/AML_data.rds")
phospho<-AML_data$Phosphoproteomics
info<-AML_data$sampleinfo
row.names(phospho)<-paste(phospho$PTM.FlankingRegion,phospho$PG.ProteinAccessions,phospho$PTM.SiteAA,phospho$PTM.SiteLocation,sep = "-")
phospho1<-phospho[,-c(1:6)]
phospho2<-data.frame(t(phospho1))
phospho2[is.na(phospho2)]<-1
phospho2<-log2(phospho2)
phospho2$DIA_ID<-row.names(phospho2)
phospho3<-merge(info,phospho2,by="DIA_ID",all = F)
phospho_plot<-data.frame(phospho3[,-c(1:198)])
phospho_plot$label<-phospho3$WHO_classification_2022
row.names(phospho_plot)<-phospho3$DIA_ID
WHO<- as.factor(unique(phospho_plot$label))
#-------------------------------------------------------------------------------
# Step 2: UMAP
#-------------------------------------------------------------------------------
phospho_plot$label<-factor(phospho_plot$label,levels = color_order)
phospho_plot<-phospho_plot[,-c(1,2,4)]
set.seed(2023)
phospho_umap<-drawUMAP(phospho_plot,rowNormalization = F,colNormalization = T,ptColors = color)
ggsave("Output/Figure1/Figure1F.pdf",plot =phospho_umap,device = NULL )
2.6 (G) Gene set variation analysis
Gene set variation analysis (GSVA) scores for proteomics based on differentially regulated gene ontology (GO) pathways in WHO subtypes (left heatmap). Spearman correlation coefficients of the proteome and the transcriptome for the annotated pathways (right heatmap, asterisks mark significantly correlated pathways with P < 0.05, Benjamini-Hochberg-corrected).
rm(list=ls())
library(limma)
library(readxl)
library(dplyr)
library(readr)
library(clusterProfiler)
options(stringsAsFactors = F)
library(tidyverse)
library(msigdbr)
library(GSVA)
library(GSEABase)
library(pheatmap)
library(BiocParallel)
library(RColorBrewer)
#-----------------------------------------------------------------------------
#Step 1: Load data and and set parameters
#-----------------------------------------------------------------------------
AML_data<-readRDS("Input/AML_data.rds")
sample_info<-AML_data$sampleinfo
prot0<-AML_data$CorrectProteomics
prot1<-prot0[-grep(";",prot0$PG.ProteinGroups),]
aml.prot<-data.frame(t(prot1[,-c(1:2)]))
names(aml.prot)<-prot1$PG.Genes
GO_df_C5 <- msigdbr(species = "Homo sapiens",
category = "C5")
GO_df_all<-GO_df_C5
GO_df <- dplyr::select(GO_df_all, gs_name, gene_symbol, gs_exact_source, gs_subcat)
GO_df <- GO_df[GO_df$gs_subcat!="HPO",]
go_list <- split(GO_df$gene_symbol, GO_df$gs_name)
geneset <- go_list
dat<-data.frame(t(aml.prot))
dat<-as.matrix(dat)
#-----------------------------------------------------------------------------
#Step 2: Gene set enrichment analysis
#-----------------------------------------------------------------------------
Pathways<-data.frame("topPathways","group",check.names = F)
names(Pathways)<-c("topPathways","group")
Pathways<-Pathways[-1,]
WHO<-data.frame("DIA_ID"=sample_info$DIA_ID,"WHO"=sample_info$WHO_classification_2022)
WHO$WHO<-gsub(" ","_",WHO$WHO)
WHO$WHO<-gsub("::","_",WHO$WHO)
WHO$WHO<-gsub("-","_",WHO$WHO)
row.names(WHO)<-WHO$DIA_ID
WHO<-WHO[colnames(dat),]
for (i in unique(WHO$WHO)[-10]) {
set.seed(2024)
cluster<-WHO
cluster$WHO<-ifelse(cluster$WHO == i,cluster$WHO,"others")
group_list<-cluster$WHO
design <- model.matrix(~0+factor(group_list))
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(dat)
contrast.matrix <- makeContrasts(contrasts=paste0(i,'-',"others"),
levels = design)
fit1 <- lmFit(dat,design)
fit2 <- contrasts.fit(fit1, contrast.matrix)
efit <- eBayes(fit2)
summary(decideTests(efit,lfc=1, p.value=0.05))
tempOutput <- topTable(efit, coef=paste0(i,'-',"others"), n=Inf)
degs0 <- na.omit(tempOutput)
degs0$group<- paste0(i,'-',"others")
degs0<-degs0 %>%
filter(abs(logFC)> 0.5)%>%
filter(adj.P.Val< 0.05)
mydata <-degs0[order(degs0$logFC,decreasing=TRUE),]
FCgenelist <-mydata$logFC
names(FCgenelist)<-row.names(mydata)
fgseaRes <- fgsea::fgsea(pathways= geneset,
stats= FCgenelist,
minSize=10,
maxSize=500,
nperm=10000, nproc = 1)
fgseaRes<-fgseaRes %>%
filter(pval<0.05)
topPathwaysUp.tmp <- fgseaRes[NES > 0]
topPathwaysDown.tmp <- fgseaRes[NES < 0]
if (length(topPathwaysUp.tmp) < 5 ){
up= length(topPathwaysUp.tmp)
} else{
up=5
}
if (length(topPathwaysDown.tmp) <5 ){
down= length(topPathwaysDown.tmp)
} else{
down=5
}
topPathwaysUp <- fgseaRes[NES > 0][head(order(NES,decreasing = T),n=5),pathway]
topPathwaysDown <- fgseaRes[NES < 0][head(order(NES,decreasing = F),n=5),pathway]
topPathways <-c(topPathwaysUp,rev(topPathwaysDown))
if (length(topPathways)== 0){
topPathways<-"NA"
}else{
topPathways<-topPathways
}
Pathways0<-data.frame(topPathways,"group")
Pathways0$X.group.<-paste0(i,'-',"others")
names(Pathways0)[2]<-"group"
Pathways<-rbind(Pathways,Pathways0)
}
geneset.filter<-geneset[names(geneset) %in% Pathways$topPathways]
#-----------------------------------------------------------------------------
#Step 3: Gene set variation analysis
#-----------------------------------------------------------------------------
prot.data<-dat
set.seed(123)
param <- gsvaParam(exprData = prot.data,
geneSets = geneset.filter,
kcdf = "Gaussian")
gsva.prot <- gsva(param,
verbose = TRUE,
#parallel.sz = parallel::detectCores()
)
degs<-data.frame("logFC", "AveExpr","t" ,"P.Value","adj.P.Val" ,"B", "group", "pathway")
names(degs)<-c("logFC", "AveExpr","t" ,"P.Value","adj.P.Val" ,"B", "group", "pathway")
degs<-degs[-1,]
for (i in unique(WHO$WHO)[-10]) {
set.seed(123)
cluster<-WHO
cluster$WHO<-ifelse(cluster$WHO == i,cluster$WHO,"others")
group_list<-cluster$WHO
design <- model.matrix(~0+factor(group_list))
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(gsva.prot)
contrast.matrix <- makeContrasts(contrasts=paste0(i,'-',"others"),
levels = design)
fit1 <- lmFit(gsva.prot,design)
fit2 <- contrasts.fit(fit1, contrast.matrix)
efit <- eBayes(fit2)
tempOutput <- topTable(efit, coef=paste0(i,'-',"others"), n=Inf)
degs0 <- na.omit(tempOutput)
degs0$group<- paste0(i,'-',"others")
degs0$pathway<-row.names(degs0)
degs<-rbind(degs0,degs)
}
degs.del.snf5<-degs
library(reshape2)
pathway.heatmap<- dcast(degs.del.snf5[,c(1,7,8)], pathway ~ group, value.var = "logFC")
pathway.heatmap.1<-pathway.heatmap[,-1]
row.names(pathway.heatmap.1)<-pathway.heatmap$pathway
colors<- c('#053061','#0F437C','#195797','#246AAE','#307AB6','#3C8ABE','#519CC7','#6EAED1','#8AC0DB','#A3CDE2','#BAD9E9','#D1E5F0',"white", "#FDDBC7", "#F4A582","#D6604D", "#C94E45", "#C23D3E", "#BA2C37", "#B21C30", "#A90B29", "#A10328", "#920226", "#820224", "#67001F")
color_series <- colorRampPalette(colors)(1000)
heatmap <- pheatmap::pheatmap(
pathway.heatmap.1,
color = color_series,
fontsize_col = 4,
cluster_rows = T,
cluster_cols = T,
show_rownames = T,
show_colnames = T,
fontsize = 2,
cellwidth = 10,
cellheight = 2,
filename =paste0("Output/Figure1/Figure1G_1.pdf"))
all.prot<-names(aml.prot)
prot.in.genenset<-data.frame(1:5000)
for (i in row.names(pathway.heatmap.1)) {
prot.in.genenset0<- geneset.filter[[i]][geneset.filter[[i]] %in% all.prot]
prot.in.genenset0<-data.frame(prot.in.genenset0)
names(prot.in.genenset0)<-i
row<- nrow(prot.in.genenset0)+1
prot.in.genenset0[row:5000,]<-NA
prot.in.genenset<- cbind (prot.in.genenset, prot.in.genenset0)
}
prot.in.genenset <- prot.in.genenset[, -1]
row_na_percentage <-
rowSums(is.na(prot.in.genenset)) / ncol(prot.in.genenset)
threshold <- 0.999999
selected_rows <- which(row_na_percentage < threshold)
prot.in.genenset.delte.na <- prot.in.genenset[selected_rows,]
trans<-AML_data$RNA_seq
trans.data<-as.matrix((trans[,-c(1:2)]))
row.names(trans.data)<-trans$symbol
trans.data<-data.frame(trans.data)
trans.data<-trans.data[,names(trans.data) %in% row.names(WHO)]
trans.data<-as.matrix(trans.data)
set.seed(123)
param2<-gsvaParam(exprData = trans.data,
geneSets = geneset.filter,
kcdf = "Gaussian")
gsva.trans <- gsva(param2,
verbose = TRUE)
#parallel.sz = parallel::detectCores())
gsva.trans<-data.frame(gsva.trans)
gsva.prot<-data.frame(gsva.prot)
gsva.prot<-gsva.prot[,names(gsva.trans)]
trans.data<-data.frame(gsva.trans)
prot.data<-data.frame(gsva.prot)
df<-data.frame("corr","corr_p","pathway","cluster")
names(df)<-c("corr","corr_p","pathway","cluster")
df<-df[-1,]
for (cluster in unique(WHO$WHO)[-10]) {
cluster.sample <- row.names(WHO[WHO$WHO == cluster, ])
trans.cluster <-
trans.data[, names(trans.data) %in% cluster.sample]
prot.cluster <- prot.data[, names(prot.data) %in% cluster.sample]
for (i in 1:nrow(trans.cluster)) {
pathway <- row.names(trans.cluster)[i]
trans.pathway <- c(as.matrix(trans.cluster[i, ]))
prot.pathway <- c(as.matrix(prot.cluster[i, ]))
corr <- cor(trans.pathway, prot.pathway, method = "spearman")
corr_p <-cor.test(trans.pathway, prot.pathway, method = "spearman")$p.value
df0 <- data.frame(corr, corr_p, pathway, cluster, check.names = F)
df <- rbind(df0, df)
}
}
row.order<- heatmap$tree_row$order
col.order<-heatmap$tree_col$order
corr.heatmap<- dcast(df, pathway ~ cluster, value.var = "corr")
corr.heatmap.1<-corr.heatmap[,-c(1)]
row.names(corr.heatmap.1)<-corr.heatmap$pathway
corr.heatmap.1<-corr.heatmap.1[row.order,col.order]
corr.heatmap.p<- dcast(df, pathway ~ cluster, value.var = "corr_p")
corr.heatmap.p.1<- corr.heatmap.p[,-c(1)]
row.names(corr.heatmap.p.1)<-corr.heatmap.p$pathway
corr.heatmap.p.1<-corr.heatmap.p.1[row.order,col.order]
significance_level = 0.05
significant_matrix <-ifelse(corr.heatmap.p.1 < significance_level, "*", "")
significant_matrix[is.na(significant_matrix)] <- ""
colors <- brewer.pal(name = "RdYlGn", n = 10)[10:1]
heatmap <- pheatmap::pheatmap(
corr.heatmap.1,
color = colors,
fontsize_col = 4,
display_numbers = significant_matrix,
cluster_rows = F,
cluster_cols = F,
show_rownames = T,
show_colnames = T,
fontsize = 2,
cellwidth = 10,
cellheight = 2,
filename = paste0( "Output/Figure1/Figure1G_2.pdf")
)
2.7 (H) kinase-substrate enrichment analysis
Heatmap depicting kinome profiling with Z scores calculated by kinase-substrate enrichment analysis (KSEA) algorithm.
rm(list=ls())
library(dplyr)
library(MNet)
library(stringr)
library(ggplot2)
library(RColorBrewer)
library(readr)
library(dplyr)
library(MNet)
library(stringr)
library(ggplot2)
library(RColorBrewer)
library(KSEAapp)
#-----------------------------------------------------------------------------
#Step 1: Load data and and set parameters
#-----------------------------------------------------------------------------
AML<-read_rds("Input/AML_data.rds")
info<-AML$sampleinfo
phospho<-AML$Phosphoproteomics
row.names(phospho)<- paste(sep = "_", phospho$PTM.FlankingRegion,phospho$PTM.SiteAA,phospho$PTM.SiteLocation,phospho$PG.Genes)
phospho1<-phospho[,-c(1:6)]
phosphoNA20<-apply(phospho1,1, function (x) {sum(is.na(x))/(ncol(phospho1))})
phospho2<-phospho1[names(phosphoNA20)[phosphoNA20<0.8],]
phospho_name<-names(phospho2)
names(phospho2)<-phospho_name
phospho3<-phospho2
phospho4<-log2(phospho3)
phospho4[is.na(phospho4)]<-0
phospho5<-data.frame(t(phospho4),check.names = F)
phospho5$DIA_ID<-row.names(phospho5)
df2<-merge(info[,c(5,31)],phospho5,by="DIA_ID",all.x=F,all.y = F)
names(df2)[2]<-"WHO"
df3<-data.frame(t(df2[,-c(1:2)]))
names(df3)<-df2$DIA_ID
df2$WHO<-gsub(" ","_",df2$WHO)
df2$WHO<-gsub("::","_",df2$WHO)
df2$WHO<-gsub("-","_",df2$WHO)
matrix_raw<- data.frame(apply(df3,2,function (x) {as.numeric(x)}))
row.names(matrix_raw)<- row.names(df3)
iN<- unique(df2$WHO)[-10]
for (i in iN) {
if ( i %in% iN) {
type <- c(as.matrix(df2$WHO))
type[!type == i] <- "normal"
type[type == i] <- "tumor"
aml_matrix <- matrix_raw
}
#-----------------------------------------------------------------------------
#Step 2: Kinaseāsubstrate enrichment analysis
#-----------------------------------------------------------------------------
diff_phospho_KSEA<- MNet::mlimma(aml_matrix,type)
phosphoname<-data.frame(row.names(phospho),phospho[,c(2,3,1,5,6)])
phosphoname$site<-paste0(phosphoname$PTM.SiteAA,phosphoname$PTM.SiteLocation)
names(phosphoname)[1]<-c("name")
diff_phospho_KSEA1<-merge(diff_phospho_KSEA,phosphoname,by="name",all.x = T,all.y = F)
diff_phospho_KSEA2<-diff_phospho_KSEA1[,c(9:11,14,5,2)]
names(diff_phospho_KSEA2)<-c("Protein","Gene","Peptide","Residue.Both", "p","FC")
diff_phospho_KSEA2$FC<-2^diff_phospho_KSEA2$FC
KSData=read.csv("Input/PSP&NetworKIN_Kinase_Substrate_Dataset_July2016.csv", header = T)
PX=diff_phospho_KSEA2
path<-paste0("Output/Figure1")
KSEA.Complete(KSData,
PX,
NetworKIN = T,
NetworKIN.cutoff = 5,
m.cutoff = 5,
p.cutoff = 0.05)
KSEA.Scores=KSEA.Scores(KSData,
PX,
NetworKIN = T,
NetworKIN.cutoff = 20
)
write.csv(KSEA.Scores, paste0(path,i,"_KSEA.Scores.csv"),row.names = FALSE)
}
CBFB_MYH11<-read.csv(paste0(path,"CBFB_MYH11_KSEA.Scores.csv"))
KMT2A_rearrangement<-read.csv(paste0(path,"KMT2A_rearrangement_KSEA.Scores.csv"))
NPM1<-read.csv(paste0(path,"NPM1_KSEA.Scores.csv"))
defined_by_differentiation<-read.csv(paste0(path,"defined_by_differentiation_KSEA.Scores.csv"))
CEBPA<-read.csv(paste0(path,"CEBPA_KSEA.Scores.csv"))
Myelodysplasia_related<-read.csv(paste0(path,"Myelodysplasia_related_KSEA.Scores.csv"))
RUNX1_RUNX1T1<-read.csv(paste0(path,"RUNX1_RUNX1T1_KSEA.Scores.csv"))
PML_RARA<-read.csv(paste0(path,"PML_RARA_KSEA.Scores.csv"))
NUP98_rearrangement<-read.csv(paste0(path,"NUP98_rearrangement_KSEA.Scores.csv"))
KSEA.Heatmap.new <- function(score.list, sample.labels, stats, m.cutoff, p.cutoff, sample.cluster) {
library(gplots)
filter.m <- function(dataset, m.cutoff) {
filtered <- dataset[(dataset$m >= m.cutoff), ]
return(filtered)
}
score.list.m <- lapply(score.list, function(...) filter.m(..., m.cutoff))
for (i in 1:length(score.list.m)) {
names <- colnames(score.list.m[[i]])[c(2:7)]
colnames(score.list.m[[i]])[c(2:7)] <- paste(names, i, sep = ".")
}
master <- Reduce(function(...) merge(..., by = "Kinase.Gene", all = F), score.list.m)
row.names(master) <- master$Kinase.Gene
columns <- as.character(colnames(master))
merged.scores <- as.matrix(master[, grep("z.score", columns)])
colnames(merged.scores) <- sample.labels
merged.stats <- as.matrix(master[, grep(stats, columns)])
asterisk <- function(matrix) {
new <- data.frame()
for (i in 1:nrow(matrix)) {
for (j in 1:ncol(matrix)) {
if (matrix[i, j] < p.cutoff) {
new[i, j] <- "*"
} else {
new[i, j] <- ""
}
}
}
return(new)
}
merged.asterisk <- as.matrix(asterisk(merged.stats))
row.names(merged.asterisk) <- row.names(master)
p_detect <- data.frame(asterisk(merged.stats), row.names = row.names(master))
p_detect[p_detect != "*"] <- NA
p_detect.1 <- p_detect[!apply(p_detect, 1, function(row) all(is.na(row))), ]
merged.asterisk <- data.frame(merged.asterisk, row.names = row.names(master))
merged.asterisk <- merged.asterisk[row.names(merged.asterisk) %in% row.names(p_detect.1), ]
merged.asterisk <- as.matrix(merged.asterisk)
create.breaks <- function(merged.scores) {
if (min(merged.scores) < -1.6) {
breaks.neg <- seq(-1.6, 0, length.out = 30)
breaks.neg <- append(seq(min(merged.scores), -1.6, length.out = 10), breaks.neg)
breaks.neg <- sort(unique(breaks.neg))
} else {
breaks.neg <- seq(-1.6, 0, length.out = 30)
}
if (max(merged.scores) > 1.6) {
breaks.pos <- seq(0, 1.6, length.out = 30)
breaks.pos <- append(breaks.pos, seq(1.6, max(merged.scores), length.out = 10))
breaks.pos <- sort(unique(breaks.pos))
} else {
breaks.pos <- seq(0, 1.6, length.out = 30)
}
breaks.all <- unique(append(breaks.neg, breaks.pos))
mycol.neg <- colorpanel(n = length(breaks.neg), low = "blue", high = "white")
mycol.pos <- colorpanel(n = length(breaks.pos) - 1, low = "white", high = "red")
mycol <- unique(append(mycol.neg, mycol.pos))
color.breaks <- list(breaks.all, mycol)
return(color.breaks)
}
color.breaks <- create.breaks(merged.scores)
plot.height <- nrow(merged.scores)^0.55
plot.width <- ncol(merged.scores)^0.7
merged.scores <- merged.scores[row.names(merged.scores) %in% row.names(p_detect.1), ]
write.csv(merged.scores, paste0(path, "SNF_KSEA.Merged.Scores.csv"))
pdf(paste0(path, "Figure1H.pdf"), width = 10, height = 30)
heatmap.2(
merged.scores,
Colv = sample.cluster,
scale = "none",
cellnote = merged.asterisk,
notecol = "white",
cexCol = 0.9,
cexRow = 0.9,
srtCol = 45,
notecex = 1.4,
col = color.breaks[[2]],
density.info = "none",
trace = "none",
key = TRUE,
keysize = 0.2,
key.par = list(cex = 0.7),
breaks = color.breaks[[1]],
lmat = rbind(c(4, 3), c(2, 1)),
lhei = c(0.5, 9),
lwid = c(1.5, 4),
margins = c(2, 6)
)
dev.off()
}
KSEA.Heatmap.new(
list(
CBFB_MYH11,
KMT2A_rearrangement,
NPM1,
defined_by_differentiation,
CEBPA,
Myelodysplasia_related,
RUNX1_RUNX1T1,
PML_RARA,
NUP98_rearrangement
),
sample.labels = c(
"CBFB_MYH11",
"KMT2A_rearrangement",
"NPM1",
"defined_by_differentiation",
"CEBPA",
"Myelodysplasia_related",
"RUNX1_RUNX1T1",
"PML_RARA",
"NUP98_rearrangement"
),
stats = 'p.value',
m.cutoff = 3,
p.cutoff = 0.01,
sample.cluster = TRUE
)