In this demo, the PIPE is applied to cross-disease pleiotropic associations for neuropsychiatric disorders involving eight specific diseases; see PMID:31835028. The validity is demonstrated in recovering clinical therapeutics, with the usefulness illustrated to identify diseases where pleiotropy can inform clinical therapeutics and to determine disease relationships based on shared therapeutic targets. Also demonstrated is the identification of a network of highly prioritised genes that mediate crosstalk between molecular pathways as well as the identification of critical network nodes through crosstalk-based effect-by-removal analysis.
The results showcased in this demo might deviate slightly from original reports. This is mainly because of the regular updates to the built-in datasets (see below): GWAS_LD.EUR
for European LD SNPs, dbSNP_Common
for dbSNP common SNPs, UCSC_knownGene
for UCSC known genes, STRING_fin
for STRING functional interactions, ig.EF.ChEMBL
for ChEMBL known drug targets, ig.KEGG.category
for KEGG pathway-derived interactions, and many more.
# Please install R (version 4.4.1 or above)
# see https://cran.r-project.org
# if the package 'BiocManager' not installed, please do so
if(!("BiocManager" %in% rownames(installed.packages()))) install.packages("BiocManager")
# first, install basic packages: remotes, tidyverse, pbapply, ComplexHeatmap, patchwork, Rgraphviz, caret, randomForest
BiocManager::install(c('remotes','tidyverse','pbapply','ComplexHeatmap','patchwork','Rgraphviz','caret','randomForest'), dependencies=T)
# then, install the package 'PIPE' (now hosted at github)
BiocManager::install("hfang-bristol/PIPE", dependencies=T, force=T)
# check the package 'PIPE' successfully installed
library(PIPE)
# load packages
library(PIPE)
library(tidyverse)
library(patchwork)
# specify built-in data placeholder
placeholder <- "http://www.comptransmed.pro/bigdata_pipe"
# create the output folder 'pipe_showcase'
outdir <- 'pipe_showcase'
if(!dir.exists(outdir)) dir.create(outdir)
# read GWAS summary data as input
data.file <- file.path(placeholder, "GWAS.cross.SNP.txt")
input_data <- read_delim(data.file, delim='\t') %>% filter(pubmedid==31835028) %>% distinct(snp_id_current,pvalue)
# LD.customised
LD.customised <- oRDS('GWAS_LD.EUR', placeholder=placeholder)
# All common SNPs (primarily sourced from dbSNP)
options(timeout=120) # please increase the timeout if failed to download
GR.SNP <- oRDS("dbSNP_Common", placeholder=placeholder)
# All known genes (primarily sourced from UCSC)
GR.Gene <- oRDS("UCSC_knownGene", placeholder=placeholder)
# Functional interaction network (primarily sourced from STRING)
network.customised <- oRDS("STRING_fin", placeholder=placeholder)
# include.RGB
include.RGB <- c("PCHiC_PMID27863249_Activated_total_CD4_T_cells","PCHiC_PMID27863249_Macrophages_M0","PCHiC_PMID27863249_Macrophages_M1","PCHiC_PMID27863249_Macrophages_M2","PCHiC_PMID27863249_Megakaryocytes","PCHiC_PMID27863249_Monocytes","PCHiC_PMID27863249_Naive_B_cells","PCHiC_PMID27863249_Naive_CD4_T_cells","PCHiC_PMID27863249_Naive_CD8_T_cells","PCHiC_PMID27863249_Neutrophils","PCHiC_PMID27863249_Nonactivated_total_CD4_T_cells","PCHiC_PMID27863249_Total_B_cells","PCHiC_PMID27863249_Total_CD4_T_cells","PCHiC_PMID27863249_Total_CD8_T_cells","PCHiC_PMID31501517_DorsolateralPrefrontalCortex","PCHiC_PMID31501517_H1NeuralProgenitorCell","PCHiC_PMID31501517_Hippocampus","PCHiC_PMID31501517_LCL", "PCHiC_PMID31367015_astrocytes","PCHiC_PMID31367015_excitatory","PCHiC_PMID31367015_hippocampal","PCHiC_PMID31367015_motor", "PCHiC_PMID25938943_CD34","PCHiC_PMID25938943_GM12878")
# include.QTL
include.QTL <- c("eQTL_eQTLGen", "eQTL_eQTLCatalogue_Alasoo_2018_macrophage_IFNg","eQTL_eQTLCatalogue_Alasoo_2018_macrophage_IFNgSalmonella","eQTL_eQTLCatalogue_Alasoo_2018_macrophage_Salmonella","eQTL_eQTLCatalogue_Alasoo_2018_macrophage_naive","eQTL_eQTLCatalogue_BLUEPRINT_Tcell","eQTL_eQTLCatalogue_BLUEPRINT_monocyte","eQTL_eQTLCatalogue_BLUEPRINT_neutrophil","eQTL_eQTLCatalogue_BrainSeq_brain","eQTL_eQTLCatalogue_CEDAR_microarray_Bcell_CD19","eQTL_eQTLCatalogue_CEDAR_microarray_Tcell_CD4","eQTL_eQTLCatalogue_CEDAR_microarray_Tcell_CD8","eQTL_eQTLCatalogue_CEDAR_microarray_monocyte_CD14","eQTL_eQTLCatalogue_CEDAR_microarray_neutrophil_CD15","eQTL_eQTLCatalogue_CEDAR_microarray_platelet","eQTL_eQTLCatalogue_Fairfax_2012_microarray_Bcell_CD19","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_IFN24","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_LPS2","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_LPS24","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_naive","eQTL_eQTLCatalogue_GENCORD_LCL","eQTL_eQTLCatalogue_GENCORD_Tcell","eQTL_eQTLCatalogue_GEUVADIS_LCL","eQTL_eQTLCatalogue_GTEx_V8_Brain_Amygdala","eQTL_eQTLCatalogue_GTEx_V8_Brain_Anterior_cingulate_cortex_BA24","eQTL_eQTLCatalogue_GTEx_V8_Brain_Caudate_basal_ganglia","eQTL_eQTLCatalogue_GTEx_V8_Brain_Cerebellar_Hemisphere","eQTL_eQTLCatalogue_GTEx_V8_Brain_Cerebellum","eQTL_eQTLCatalogue_GTEx_V8_Brain_Cortex","eQTL_eQTLCatalogue_GTEx_V8_Brain_Frontal_Cortex_BA9","eQTL_eQTLCatalogue_GTEx_V8_Brain_Hippocampus","eQTL_eQTLCatalogue_GTEx_V8_Brain_Hypothalamus","eQTL_eQTLCatalogue_GTEx_V8_Brain_Nucleus_accumbens_basal_ganglia","eQTL_eQTLCatalogue_GTEx_V8_Brain_Putamen_basal_ganglia","eQTL_eQTLCatalogue_GTEx_V8_Brain_Spinal_cord_cervical_c1","eQTL_eQTLCatalogue_GTEx_V8_Brain_Substantia_nigra","eQTL_eQTLCatalogue_GTEx_V8_Nerve_Tibial","eQTL_eQTLCatalogue_GTEx_V8_Pituitary","eQTL_eQTLCatalogue_GTEx_V8_Whole_Blood","eQTL_eQTLCatalogue_Kasela_2017_microarray_Tcell_CD4","eQTL_eQTLCatalogue_Kasela_2017_microarray_Tcell_CD8","eQTL_eQTLCatalogue_Lepik_2017_blood","eQTL_eQTLCatalogue_Naranbhai_2015_microarray_neutrophil_CD16","eQTL_eQTLCatalogue_Nedelec_2016_macrophage_Listeria","eQTL_eQTLCatalogue_Nedelec_2016_macrophage_Salmonella","eQTL_eQTLCatalogue_Nedelec_2016_macrophage_naive","eQTL_eQTLCatalogue_Quach_2016_monocyte_IAV","eQTL_eQTLCatalogue_Quach_2016_monocyte_LPS","eQTL_eQTLCatalogue_Quach_2016_monocyte_Pam3CSK4","eQTL_eQTLCatalogue_Quach_2016_monocyte_R848","eQTL_eQTLCatalogue_Quach_2016_monocyte_naive","eQTL_eQTLCatalogue_ROSMAP_brain_naive","eQTL_eQTLCatalogue_Schmiedel_2018_Bcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_CD4_Tcell_antiCD3CD28","eQTL_eQTLCatalogue_Schmiedel_2018_CD4_Tcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_CD8_Tcell_antiCD3CD28","eQTL_eQTLCatalogue_Schmiedel_2018_CD8_Tcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_NKcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_Tfh_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th117_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th17_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th1_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th2_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Treg_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Treg_naive","eQTL_eQTLCatalogue_Schmiedel_2018_monocyte_CD16_naive","eQTL_eQTLCatalogue_Schmiedel_2018_monocyte_naive","eQTL_eQTLCatalogue_Schwartzentruber_2018_sensory_neuron")
# prepare predictors
ls_pNode_genomic <- oPierSNPsAdv(data=input_data, LD.customised=LD.customised, significance.threshold=5e-8, GR.SNP=GR.SNP, GR.Gene=GR.Gene, distance.max=20000, decay.kernel="constant", include.QTL=include.QTL, include.RGB=include.RGB, network.customised=network.customised, placeholder=placeholder, verbose.details=F)
# do prioritisation(the initial round)
ls_pNode <- Filter(Negate(is.null), ls_pNode_genomic)
dTarget0 <- oPierMatrix(ls_pNode, displayBy="pvalue", aggregateBy="fishers", rangeMax=10, GR.Gene=GR.Gene, placeholder=placeholder)
# GSP (phase 2 or above drug targets for neuropsychiatric disorders)
ig.EF.ChEMBL <- oRDS('ig.EF.ChEMBL', placeholder=placeholder)
codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")
ind <- match(codes_name, V(ig.EF.ChEMBL)$name)
GSP <- base::Reduce(union, V(ig.EF.ChEMBL)$GSP[ind])
# GSN
druggable_targets <- do.call(rbind, V(ig.EF.ChEMBL)$phase) %>% pull(target) %>% unique()
g1 <- oRDS('org.Hs.string_medium', placeholder=placeholder)
g2 <- oRDS('org.Hs.PCommons_UN', placeholder=placeholder)
g <- oCombineNet(list(g1,g2), combineBy='union', attrBy="intersect")
sGS <- oGSsimulator(GSP=GSP, population=druggable_targets, network.customised=g, neighbor.order=1, verbose=F)
GSN <- sGS$GSN
message(sprintf("GSP (%d), GSN (%d)", length(GSP), length(GSN)), appendLF=T)
# predictor matrix
df_predictor <- oPierMatrix(ls_pNode, displayBy="score", aggregateBy="none", GR.Gene=GR.Gene, placeholder=placeholder)
# informative predictors
sClass <- oClassifyRF(df_predictor, GSP=GSP, GSN=GSN, nfold=3, nrepeat=10, mtry=NULL, ntree=1000, cv.aggregateBy="fishers")
df_imp <- sClass$importance %>% as_tibble(rownames='predictor')
df_perf <- sClass$performance %>% as_tibble(rownames='predictor')
imp_perf <- df_imp %>% inner_join(df_perf, by='predictor') %>% mutate(group=str_replace_all(predictor,'_.*','')) %>% arrange(group,-MeanDecreaseAccuracy)
xintercept <- imp_perf %>% filter(group=='nGene') %>% pull(MeanDecreaseAccuracy)
yintercept <- imp_perf %>% filter(group=='nGene') %>% pull(MeanDecreaseGini)
zintercept <- imp_perf %>% filter(group=='nGene') %>% pull(auroc)
imp_perf2 <- imp_perf %>% filter(MeanDecreaseAccuracy>=0, MeanDecreaseGini>=0, direction=='+') %>% mutate(x=MeanDecreaseAccuracy>=xintercept, y=MeanDecreaseGini>=yintercept, z=auroc>=zintercept) %>% arrange(-MeanDecreaseAccuracy)
# predictors selected based the importance no worse than nGene
vec_predictor_selected <- imp_perf2 %>% filter(x|y|z) %>% pull(predictor)
vec_predictor_selected
# do prioritisation(based on informative predictors)
dTarget <- oPierMatrix(ls_pNode[vec_predictor_selected], displayBy="pvalue", aggregateBy="fishers", rangeMax=10, GR.Gene=GR.Gene, placeholder=placeholder)
# write into a file 'CRM_NPD_priority.txt.gz' under the folder 'pipe_showcase'
dTarget$priority %>% select(name,rating,rank,seed,nGene,eGene,cGene,description) %>% write_delim(file.path(outdir,'CRM_NPD_priority.txt.gz'), delim='\t')
# save into a file 'dTarget_NPD.CRM.RData' under the folder 'pipe_showcase'
save(list='dTarget', file=file.path(outdir,'dTarget_NPD.CRM.RData'))
Target genes stored in the output file CRM_NPD_priority.txt.gz
above can be explored below. Notes, genes are ranked by rating (credit score ranged 0-10; see the column rating
).
top.label.query <- dTarget$priority %>% top_frac(0.5,rating) %>% pull(name)
label.query.only <- T
gp_manhattan <- oPierManhattan(dTarget, color=c("darkred", "darkorange"), point.size=0.2, top=30, top.label.type="text", top.label.size=2, top.label.col="black", top.label.query=top.label.query, label.query.only=label.query.only, y.lab="Credit score\n(neuropsychiatric disorders)", GR.Gene=GR.Gene, placeholder=placeholder)
# saved into a file 'CRM_NPD_manhattan.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_manhattan.pdf'), gp_manhattan, width=10, height=3)
Figure 5.1: Manhattan plot illustrates credit score (y-axis) for prioritised target genes (color-coded by chromosomes; x-axis), with top 30 genes named.
codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")
ls_tb <- pbapply::pblapply(1:length(codes_name), function(j){
message(sprintf("Processing %s ...", codes_name[j]), appendLF=T)
ind <- match(codes_name[j], V(ig.EF.ChEMBL)$name)
if(sum(is.na(ind))==0){
GSP <- base::Reduce(union, V(ig.EF.ChEMBL)$GSP[ind])
GSN <- base::Reduce(union, V(ig.EF.ChEMBL)$GSN[ind])
message(sprintf("%s, GSP (%d), GSN (%d): (%s)", codes_name[j], length(GSP), length(GSN), as.character(Sys.time())), appendLF=T)
}
ls_pNode_selected <- list('PIPE'=dTarget, 'PIPE0'=dTarget0)
ls_pPerf <- lapply(ls_pNode_selected, function(x){
if (is(x,"pNode")){
prediction <- x$priority %>% select(name,priority)
}else if(is(x,"dTarget")){
prediction <- x$priority %>% select(name,rating)
}
pPerf <- oClassifyPerf(prediction=prediction, GSP=GSP, GSN=GSN, verbose=F)
})
tb <- tibble(name=codes_name[j], methods=names(ls_pPerf), pPerf=ls_pPerf)
})
# do extracting
ls_auroc <- lapply(1:length(ls_tb), function(j){
tb <- ls_tb[[j]] %>% filter(methods %in% c("PIPE","PIPE0"))
auroc <- sapply(1:nrow(tb), function(i){
tb$pPerf[[i]]$auroc
})
tb %>% mutate(auroc=auroc) %>% select(-pPerf)
})
df_auroc <- do.call(rbind, ls_auroc)
mat <- df_auroc %>% select(name,methods,auroc) %>% pivot_wider(names_from=methods, values_from=auroc) %>% column_to_rownames('name') %>% arrange(-PIPE)
data.label <- round(mat*1000) / 1000
gp_auc <- oHeatmap(mat, zlim=c(0,1), colormap="brewer.YlOrRd", x.rotate=45, x.text.size=8, y.text.size=8, legend.title="AUC", legend.text.size=6, legend.title.size=8, shape=19, size=12, data.label=data.label, label.size=3, label.color="white")
# saved into a file 'CRM_NPD_AUC.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_AUC.pdf'), gp_auc, width=4, height=4)
Figure 6.1: Heatmap-like comparisons of the performance between PIPE (based on informative predictors) and PIPE0 (based on all predictors), measured by area under the ROC curve (AUC) separating clinical proof-of-concept targets from simulated negative targets.
Leading prioritisation analysis (LPA) is performed to quantify the degree to which a target set (e.g. clinical proof-of-concept targets) is enriched at the ‘leading prioritisation’ of the prioritised gene list ranked by credit scores. The leading prioritisation (or the leading edge) contains the left-most subset of the highly prioritised/credited gene list, with a normalised enrichment score (NES) calculated as the observed running enrichment score divided by the expected one, and the significance level (P-value) estimated by the permutation test 50,000 times.
# df_priority
df_priority <- dTarget$priority %>% top_frac(1,rating) %>% transmute(priority=rating, rank=rank)
df_priority <- dTarget$priority %>% column_to_rownames("name") %>% top_frac(1,rating) %>% transmute(priority=rating, rank=rank)
# prepare SET
codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")
ind <- match(codes_name, V(ig.EF.ChEMBL)$name)
codes <- c('AN','ADH','ASD','BIP','MD','OCD','SCZ','TS')
ls_gsp <- V(ig.EF.ChEMBL)$GSP[ind]
names(ls_gsp) <- codes
info <- ls_gsp %>% enframe() %>% transmute(id=name, member=value)
info <- info %>% mutate(name=codes_name, namespace='NPD', distance=NA) %>% mutate(n=map_dbl(member,length)) %>% select(name,id,namespace,distance,member,n)
set <- list(info=info, stamp=as.Date(Sys.time()))
class(set) <- "SET"
# do leading prioritisation analysis (LPA)
set.seed(825)
eLPA <- oPierGSEA(df_priority, customised.genesets=set, size.range=c(10,5000), nperm=50000)
df_summary <- eLPA$df_summary %>% mutate(flag=ifelse(adjp<0.05,'Y','N')) %>% mutate(frac=nLead/nAnno) %>% filter(frac<=1) %>% arrange(tolower(setID))
df_leading <- eLPA$leading[df_summary %>% pull(setID)] %>% enframe() %>% transmute(setID=name,members=value) %>% mutate(rank_members=map_chr(members, function(x) str_c(names(x),' (',x,')', collapse=', ')))
df_summary_leading <- df_summary %>% inner_join(df_leading, by='setID')
# save into a file 'CRM_NPD_lpa.txt' under the folder 'pipe_showcase'
df_summary_leading %>% select(-members) %>% write_delim(file.path(outdir,'CRM_NPD_lpa.txt'), delim='\t')
Target genes stored in the output file CRM_NPD_lpa.txt
above can be explored below. Notes, the column frac
refers to the fraction of clinical proof-of-concept targets recovered at the leading prioritisation of the prioritised list.
Pleiotropy Informing Clinical Therapeutics (PICT) is defined as the tendency of the prioritised gene list to be clinical proof-of-concept targets. A disease, if having a more significant enrichment (FDR; P-value adjusted accounting for the multiple tests), a higher NES and a higher coverage (the fraction of clinical proof-of-concept targets recovered at the ‘leading prioritisation’ of the prioritised list), will have a higher PICT potential.
pict <- df_summary_leading %>% mutate(pict=log10(nes * frac / adjp)) %>% select(setID, nes, adjp, frac, nAnno, nLead, pict) %>% arrange(tolower(setID))
gp_pict <- oPolarBar(pict %>% transmute(name=setID,value=pict), colormap='sci_locuszoom', size.name=9, size.value=4)
# saved into a file 'CRM_NPD_pict.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_pict.pdf'), gp_pict, width=7, height=6)
Figure 6.2: Pie chart sized by PICT for individual diseases arranged in clockwise order.
Leading prioritisation plot is shown for attention deficit hyperactivity disorder (ADHD).
leading.query <- df_priority %>% as_tibble(rownames='target') %>% top_frac(1,priority) %>% pull(target)
gp_leading <- oGSEAdotplot(eLPA, top='attention deficit hyperactivity disorder', x.scale="normal", leading=T, leading.edge.only=T, leading.query.only=T, leading.query=leading.query, colormap="jet.top", zlim=c(0,10), peak.color='black', leading.size=2, clab='Credit\nscore', leading.label.direction="left") + theme(plot.title=element_text(hjust=0.5,size=10), plot.subtitle=element_text(hjust=0.5,size=8))
# saved into a file 'CRM_NPD_leading_adhd.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_leading_adhd.pdf'), gp_leading, width=5, height=5)
Figure 6.3: Leading prioritisation plot for attention deficit hyperactivity disorder (ADHD), where clinical proof-of-concept targets recovered are indicated in vertical lines, labelled in gene symbols, and color-coded by the credit score.
The relationships between diseases are presented as a network, with diseases as nodes and their estimated relations as edges. Member genes for each disease are defined as clinical proof-of-concept targets recovered at the leading prioritisation. The edges are initially inferred if any member genes are shared between diseases, followed by the filtering by identifying the minimum spanning tree to keep only edges found in the resulting tree. The thickness of edges is proportional to the number of member genes shared between two-endpoint diseases.
# Member genes for each disease defined as clinical proof-of-concept targets recovered at the leading prioritisation
data <- df_summary_leading %>% transmute(name=setID, members=rank_members)
# remove (rank)
data0 <- data %>% separate_rows(members, sep=", ") %>% mutate(members=str_replace_all(members, ' \\(.*', '')) %>% nest(data=-name) %>% mutate(members=map_chr(data, function(x) str_c(unlist(x), collapse=', '))) %>% select(-data)
# bigraph
big <- data %>% separate_rows(members, sep=", ") %>% mutate(flag=1) %>% select(members,name,flag) %>% pivot_wider(names_from=members, values_from=flag, values_fill=list(flag=0)) %>% column_to_rownames("name") %>% oBicreate() %>% oBigraph()
big <- big %>% oLayout("graphlayouts.layout_with_stress")
igraph::degree(big) -> V(big)$degree
if(1){
V(big)$rating <- str_replace_all(V(big)$name, '.*\\(|\\).*', '') %>% as.numeric()
ifelse(V(big)$type=='xnode', V(big)$name, "") -> V(big)$label
ifelse(V(big)$type=='ynode' & V(big)$rating<=nrow(df_priority) * 0.02 | V(big)$type=='xnode', V(big)$name, "") -> V(big)$label
}
gp_big <- big %>% oGGnetwork(node.label="label", node.label.size=2, node.label.force=0.02, node.xcoord="xcoord", node.ycoord="ycoord", colormap='steelblue-steelblue', node.shape="type", node.size="degree", node.size.range=c(0.5,6),edge.color="grey80", edge.color.alpha=0.3, edge.arrow.gap=0, edge.curve=0.05)
## saved into a file 'CRM_NPD_big.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_big.pdf'), gp_big, width=5, height=5)
# identify the minimum spanning tree (mst)
tig <- data %>% oTIG(method='empirical', empirical.cutoff=0.8)
tig <- tig %>% oLayout("gplot.layout.fruchtermanreingold")
gp_mst <- oGGnetwork(tig, node.label='name', node.label.size=2.5, node.label.color='steelblue', node.label.force=1, node.xcoord='xcoord', node.ycoord='ycoord', colormap="steelblue-steelblue", edge.size='weight', node.shape=18, node.size='n_members', node.size.title="Num of\ngenes", slim=c(0,25), node.size.range=c(2,5), edge.color='gray80', edge.color.alpha=0.3, edge.curve=0,edge.arrow.gap=0.01)
## saved into a file 'CRM_NPD_mst.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_mst.pdf'), gp_mst, width=5, height=5)
Figure 6.4: Network-like representation of diseases (in circle) and member genes (in triangle). Member genes shared between any two diseases are also labelled, including target ranks (in the parentheses).
Figure 6.5: Inter-disease relationships inferred by sharing of member genes. Nodes are sized by the number of member genes, and edges for relationships (with the thickness proportional to the number of genes shared between two-endpoint diseases).
A heatmap is used to illustrate clinical proof-of-concept therapeutic targets recovered at the leading prioritisation for each individual disease within neuropsychiatric disorders
library(ComplexHeatmap)
# df_priority_leading
data <- df_summary_leading %>% transmute(name=setID,members) %>% mutate(target=map(members, function(x) names(x))) %>% mutate(rating=map(members, function(x) x)) %>% select(name,target,rating) %>% unnest(c(target,rating))
df_x <- data %>% pivot_wider(names_from=name, values_from=rating)
df_y <- df_priority %>% as_tibble(rownames='target')
df_priority_leading <- df_y %>% inner_join(df_x, by='target') %>% arrange(-priority)
# write into a file 'CRM_NPD_priority_leading.txt' under the folder 'pipe_showcase'
df_priority_leading %>% write_delim(file.path(outdir,'CRM_NPD_priority_leading.txt'), delim='\t')
# ht_main
D <- df_priority_leading %>% mutate(rname=str_c(target,' (',rank,')')) %>% select(rname, priority) %>% column_to_rownames('rname')
col_fun <- circlize::colorRamp2(seq(0,6,length=64), oColormap('spectral')(64))
#vec_group <- df_priority_leading %>% pull(rank)
#vec_group <- as.numeric(cut(df_priority_leading$rank, breaks=seq(0,10000,100)))
vec_group <- as.numeric(cut(df_priority_leading$rank, breaks=ceiling(nrow(df_priority) * seq(0,0.5,0.05))))
ht_main <- Heatmap(D, name="Credit\nscore", col=col_fun, border=T,
row_split=vec_group, cluster_rows=F, show_row_dend=T, row_title=NULL, show_row_names=T, row_names_gp=gpar(fontsize=5), row_names_rot=0, row_names_side="left", clustering_distance_rows="euclidean", clustering_method_rows="average",
cluster_columns=F, column_order=1:ncol(D), show_column_names=F, column_names_side="top", column_names_gp=gpar(fontsize=8), column_names_rot=0
)
# row_anno: ls_ha_mark
colormap <- "sci_locuszoom"
ncolor <- ncol(df_priority_leading) - 3
ls_ha_mark <- lapply(1:ncolor, function(j){
node.color <- oColormap(colormap)(ncolor)[j]
vec_at <- df_priority_leading %>% pull(j+3)
vec_at[!is.na(vec_at)] <- 1
vec_at[is.na(vec_at)] <- 0
at <- which(vec_at==1)
ha_mark_at <- rowAnnotation(
D=anno_simple(vec_at, col=c("0"="grey95", "1"=node.color), width=unit(1.5,"mm")),
mark=anno_mark(at=at, labels=df_priority_leading$target[at], labels_gp=gpar(fontsize=5,col="black"), lines_gp=gpar(col=node.color))
)
# D1, D2, D3, ...
ha_mark_at@anno_list$D@label <- str_c('D',j)
ha_mark_at
})
names(ls_ha_mark) <- colnames(df_priority_leading)[4:(3+ncolor)]
names(ls_ha_mark)
#ht <- ht_main + ls_ha_mark[[1]] + ls_ha_mark[[2]] + ls_ha_mark[[3]] + ls_ha_mark[[4]] + ls_ha_mark[[5]] + ls_ha_mark[[6]] + ls_ha_mark[[7]] + ls_ha_mark[[8]]
#draw(ht, heatmap_legend_side="left", annotation_legend_side="left")
# row_anno: ha_mark_approved
codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")
ChEMBL <- oRDS("ChEMBL_v33", placeholder=placeholder)
ChEMBL_subset <- ChEMBL %>% filter(efo_term %in% codes_name)
### df_target_label
df_target_label <- ChEMBL_subset %>% filter(phase==4,target_number<=5) %>% select(Symbol,pref_name_drug,efo_term) %>% nest(data=pref_name_drug) %>% mutate(drugs=map_chr(data, function(x) {
y <- x %>% pull(pref_name_drug)
if(length(y)>1){
y <- c(y %>% head(1), '...')
str_c(y,collapse=" | ")
}else{
str_c(y %>% head(1),collapse=" | ")
}
})) %>% mutate(label=str_c(efo_term,' (',drugs,')')) %>% select(Symbol,label) %>% group_by(Symbol) %>% summarise(label=str_c(label,collapse='\n ')) %>% ungroup() %>% mutate(label=str_c(Symbol,':\n ',label))
### df_label
df_label <- df_priority_leading %>% select(target) %>% left_join(df_target_label, by=c('target'='Symbol'))
vec_at <- df_label$label
at <- which(!is.na(vec_at))
ha_mark_approved <- rowAnnotation(
A=anno_simple(0+!is.na(vec_at), col=c("0"="grey90", "1"="black"), width=unit(3,"mm")),
mark=anno_mark(at=at, labels=df_label$label[at], labels_gp=gpar(fontsize=5,col='black'), lines_gp=gpar(col='black'))
)
#ht <- ht_main + ha_mark_approved
#draw(ht, heatmap_legend_side="left", annotation_legend_side="left")
# do visualisation
## saved into a file 'CRM_NPD_heatmap_therapeutics.pdf' under the folder 'pipe_showcase'
ht <- ht_main + ls_ha_mark[[1]] + ls_ha_mark[[2]] + ls_ha_mark[[3]] + ls_ha_mark[[4]] + ls_ha_mark[[5]] + ls_ha_mark[[6]] + ls_ha_mark[[7]] + ls_ha_mark[[8]] + ha_mark_approved
pdf(file.path(outdir, 'CRM_NPD_heatmap_therapeutics.pdf'), width=9.8, height=9.5)
draw(ht, heatmap_legend_side="left", annotation_legend_side="left")
dev.off()
Figure 6.6: Illustration of clinical proof-of-concept therapeutic targets recovered at the leading prioritisation for each of eight individual diseases (D1 - D8) within neuropsychiatric disorders, with labellings for approved drug targets (the column ‘A’).
From the list of all prioritised target genes, a concise and manageable list of genes are identified, likely mediating the crosstalk between molecular pathways (sourced from KEGG), that is, targets at the pathway crosstalk level.
ig.KEGG.category <- oRDS('ig.KEGG.category', placeholder=placeholder)
ig <- ig.KEGG.category %>% filter(category %in% c("Environmental Process")) %>% deframe() %>% oCombineNet()
ig2 <- oNetInduce(ig, nodes_query=V(ig)$name, largest.comp=T) %>% as.undirected()
subg <- oPierSubnet(dTarget, network.customised=ig2, subnet.size=50)
# do permutation test
#subg_perm <- oPierSubnet(dTarget, network.customised=ig2, subnet.size=50, test.permutation=T, num.permutation=100)
# save into a file 'subg_NPD.CRM.RData' under the folder 'pipe_showcase'
save(list='subg', file=file.path(outdir,'subg_NPD.CRM.RData'))
# write into a file 'CRM_NPD_priority_subg.txt' under the folder 'pipe_showcase'
dTarget$priority %>% select(name,rating,rank,seed,nGene,eGene,cGene,description) %>% semi_join(tibble(name=V(subg)$name), by='name') %>% write_delim(file.path(outdir,'CRM_NPD_priority_subg.txt'), delim='\t')
Pathway crosstalk genes stored in the output file CRM_NPD_priority_subg.txt
above can be explored below. Notes, genes are ranked by rating (credit score ranged 0-10; see the column rating
).
subg <- subg %>% oLayout("graphlayouts.layout_with_stress")
gp_subg_evidence <- oVisEvidenceAdv(dTarget, nodes=V(subg)$name, g=subg, node.info="simple", colormap="jet.top", zlim=c(0,10), node.color.alpha=0.7, node.size.range=4, node.color.title="Credit\nscore", edge.color='steelblue', edge.color.alpha=0.2, edge.curve=0.05)
# saved into a file 'CRM_NPD_subg_evidence.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_subg_evidence.pdf'), gp_subg_evidence, width=7, height=7)
Figure 7.1: Visualisation of the pathway crosstalk, with nodes labelled by gene symbols (ranks), and if identified as genomic/pleiotropic seed genes, also highlighted as pie segments.
Removal analysis is conducted to evaluate the effect of nodes on the crosstalk, done so: (i) removed individually (i.e. single-node removal), and (ii) removed in combination (e.g. combinatorial removal involving any two nodes). Removing nodes, if critical for the network, would result in a large fraction of nodes disconnected from the largest network component. These critical nodes are identified based on the optimal removal with the largest effect when removing single nodes or any two nodes in combination.
levels <- subg %>% oIG2TB('nodes') %>% arrange(name) %>% pull(name)
ls_df <- lapply(seq(2), function(i){
combn(levels,i,simplify=F) -> nodes.combine
df <- oAttack(subg, measure="combine", nodes.combine=nodes.combine)
df %>% mutate(i=i)
})
df_removal <- do.call(rbind, ls_df) %>% arrange(i,-frac.disconnected)
## df_single
df_single <- df_removal %>% filter(i==1) %>% arrange(-frac.disconnected)
## df_comb_best
df_comb_best <- df_removal %>% filter(i>1) %>% group_by(i) %>% top_n(1, frac.disconnected) %>% ungroup()
## gp_optimal
data <- rbind(df_comb_best %>% filter(i<=2), df_single) %>% distinct() %>% arrange(i,frac.disconnected) %>% mutate(x=fct_inorder(nodes.removed)) %>% transmute(value=frac.disconnected, member=nodes.removed)
levels.customised <- data %>% separate_rows(member,sep=',') %>% pull(member) %>% unique()
gp <- data %>% oUpsetAdv(member.levels='customised',levels.customised=levels.customised, label.height.unit=6.5)
gp <- gp + scale_y_continuous(limits=NULL) + geom_point(aes(fill=as.factor(i)), shape=22, size=2, color="transparent") + geom_line(aes(group=i,color=as.factor(i))) + guides(color='none')
gp_optimal <- gp + ylab('Effect on the crosstalk measured \nby fraction of nodes disconnected\nupon node removal') + scale_fill_discrete(guide=guide_legend('Optimal combination',title.position="top",nrow=1)) + theme(legend.position="bottom", axis.title.y=element_text(size=8), axis.text.y=element_text(size=7))
### saved into a file 'CRM_NPD_subg_removal_optimal.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_subg_removal_optimal.pdf'), gp_optimal, width=7, height=9)
## gp_highlight
### visualise the crosstalk highlighted by nodes
vec_hightlight <- df_comb_best %>% separate_rows(nodes.removed, sep=',') %>% pull(nodes.removed) %>% unique()
### degree
ig <- subg
igraph::degree(ig) -> V(ig)$degree
### highlight: label and color
df_hightlight <- tibble(name=vec_hightlight, flag=1)
df_hightlight_label <- tibble(name=V(ig)$name) %>% left_join(df_hightlight, by='name') %>% mutate(flag=ifelse(is.na(flag),0,flag)) %>% mutate(label=ifelse(flag==1,name,''))
V(ig)$label <- df_hightlight_label %>% pull(label)
V(ig)$color <- df_hightlight_label %>% pull(flag)
gp_hightlight <- ig %>% oGGnetwork(node.label='label', node.label.size=3, node.label.color='black', node.label.alpha=1, node.label.padding=0.1, node.label.arrow=0, node.label.force=1, node.xcoord="xcoord", node.ycoord="ycoord", node.color="color", node.color.title='rating', colormap="steelblue4-darkred", zlim=c(0,1), node.color.alpha=0.7, node.size='degree', node.size.title='degree', node.size.range=c(1.5,5), edge.color="steelblue4", edge.color.alpha=0.5, edge.size=0.3, edge.curve=0.05, edge.arrow.gap=0.01)
gp_hightlight <- gp_hightlight + guides(color="none",size="none")
### saved into a file 'CRM_NPD_subg_removal_optimal_highlight.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_subg_removal_optimal_highlight.pdf'), gp_hightlight, width=5, height=5)
Figure 7.2: Effect (fraction of disconnected nodes in the y-axis) of node removals, individually or in combination (indicated by colored circles beneath in the x-axis), on the crosstalk. Notably, only the optimal removal with the largest effect is illustrated for two-node combinatorial removal.
Figure 7.3: Visualisation of the crosstalk, labelled for genes with optimal removals.