1 Overview

In this demo, the PIPE is applied to cross-disease pleiotropic associations for neuropsychiatric disorders involving eight specific diseases; see PMID:31835028. The validity is demonstrated in recovering clinical therapeutics, with the usefulness illustrated to identify diseases where pleiotropy can inform clinical therapeutics and to determine disease relationships based on shared therapeutic targets. Also demonstrated is the identification of a network of highly prioritised genes that mediate crosstalk between molecular pathways as well as the identification of critical network nodes through crosstalk-based effect-by-removal analysis.

2 Important note

The results showcased in this demo might deviate slightly from original reports. This is mainly because of the regular updates to the built-in datasets (see below): GWAS_LD.EUR for European LD SNPs, dbSNP_Common for dbSNP common SNPs, UCSC_knownGene for UCSC known genes, STRING_fin for STRING functional interactions, ig.EF.ChEMBL for ChEMBL known drug targets, ig.KEGG.category for KEGG pathway-derived interactions, and many more.

3 R and Package installation

# Please install R (version 4.4.1 or above)
# see https://cran.r-project.org

# if the package 'BiocManager' not installed, please do so
if(!("BiocManager" %in% rownames(installed.packages()))) install.packages("BiocManager")

# first, install basic packages: remotes, tidyverse, pbapply, ComplexHeatmap, patchwork, Rgraphviz, caret, randomForest
BiocManager::install(c('remotes','tidyverse','pbapply','ComplexHeatmap','patchwork','Rgraphviz','caret','randomForest'), dependencies=T)

# then, install the package 'PIPE' (now hosted at github)
BiocManager::install("hfang-bristol/PIPE", dependencies=T, force=T)

# check the package 'PIPE' successfully installed
library(PIPE)

4 Package loading and built-in data placeholder

# load packages
library(PIPE)
library(tidyverse)
library(patchwork)

# specify built-in data placeholder
placeholder <- "http://www.comptransmed.pro/bigdata_pipe"

# create the output folder 'pipe_showcase'
outdir <- 'pipe_showcase'
if(!dir.exists(outdir)) dir.create(outdir)

5 Pleiotropy-informing prioritisation

5.1 Input data

# read GWAS summary data as input
data.file <- file.path(placeholder, "GWAS.cross.SNP.txt")
input_data <- read_delim(data.file, delim='\t') %>% filter(pubmedid==31835028) %>% distinct(snp_id_current,pvalue)

5.2 Parameters and built-in datasets

# LD.customised
LD.customised <- oRDS('GWAS_LD.EUR', placeholder=placeholder)

# All common SNPs (primarily sourced from dbSNP)
options(timeout=120) # please increase the timeout if failed to download
GR.SNP <- oRDS("dbSNP_Common", placeholder=placeholder)

# All known genes (primarily sourced from UCSC)
GR.Gene <- oRDS("UCSC_knownGene", placeholder=placeholder)

# Functional interaction network (primarily sourced from STRING)
network.customised <- oRDS("STRING_fin", placeholder=placeholder)

# include.RGB
include.RGB <- c("PCHiC_PMID27863249_Activated_total_CD4_T_cells","PCHiC_PMID27863249_Macrophages_M0","PCHiC_PMID27863249_Macrophages_M1","PCHiC_PMID27863249_Macrophages_M2","PCHiC_PMID27863249_Megakaryocytes","PCHiC_PMID27863249_Monocytes","PCHiC_PMID27863249_Naive_B_cells","PCHiC_PMID27863249_Naive_CD4_T_cells","PCHiC_PMID27863249_Naive_CD8_T_cells","PCHiC_PMID27863249_Neutrophils","PCHiC_PMID27863249_Nonactivated_total_CD4_T_cells","PCHiC_PMID27863249_Total_B_cells","PCHiC_PMID27863249_Total_CD4_T_cells","PCHiC_PMID27863249_Total_CD8_T_cells","PCHiC_PMID31501517_DorsolateralPrefrontalCortex","PCHiC_PMID31501517_H1NeuralProgenitorCell","PCHiC_PMID31501517_Hippocampus","PCHiC_PMID31501517_LCL", "PCHiC_PMID31367015_astrocytes","PCHiC_PMID31367015_excitatory","PCHiC_PMID31367015_hippocampal","PCHiC_PMID31367015_motor", "PCHiC_PMID25938943_CD34","PCHiC_PMID25938943_GM12878")

# include.QTL
include.QTL <- c("eQTL_eQTLGen", "eQTL_eQTLCatalogue_Alasoo_2018_macrophage_IFNg","eQTL_eQTLCatalogue_Alasoo_2018_macrophage_IFNgSalmonella","eQTL_eQTLCatalogue_Alasoo_2018_macrophage_Salmonella","eQTL_eQTLCatalogue_Alasoo_2018_macrophage_naive","eQTL_eQTLCatalogue_BLUEPRINT_Tcell","eQTL_eQTLCatalogue_BLUEPRINT_monocyte","eQTL_eQTLCatalogue_BLUEPRINT_neutrophil","eQTL_eQTLCatalogue_BrainSeq_brain","eQTL_eQTLCatalogue_CEDAR_microarray_Bcell_CD19","eQTL_eQTLCatalogue_CEDAR_microarray_Tcell_CD4","eQTL_eQTLCatalogue_CEDAR_microarray_Tcell_CD8","eQTL_eQTLCatalogue_CEDAR_microarray_monocyte_CD14","eQTL_eQTLCatalogue_CEDAR_microarray_neutrophil_CD15","eQTL_eQTLCatalogue_CEDAR_microarray_platelet","eQTL_eQTLCatalogue_Fairfax_2012_microarray_Bcell_CD19","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_IFN24","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_LPS2","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_LPS24","eQTL_eQTLCatalogue_Fairfax_2014_microarray_monocyte_naive","eQTL_eQTLCatalogue_GENCORD_LCL","eQTL_eQTLCatalogue_GENCORD_Tcell","eQTL_eQTLCatalogue_GEUVADIS_LCL","eQTL_eQTLCatalogue_GTEx_V8_Brain_Amygdala","eQTL_eQTLCatalogue_GTEx_V8_Brain_Anterior_cingulate_cortex_BA24","eQTL_eQTLCatalogue_GTEx_V8_Brain_Caudate_basal_ganglia","eQTL_eQTLCatalogue_GTEx_V8_Brain_Cerebellar_Hemisphere","eQTL_eQTLCatalogue_GTEx_V8_Brain_Cerebellum","eQTL_eQTLCatalogue_GTEx_V8_Brain_Cortex","eQTL_eQTLCatalogue_GTEx_V8_Brain_Frontal_Cortex_BA9","eQTL_eQTLCatalogue_GTEx_V8_Brain_Hippocampus","eQTL_eQTLCatalogue_GTEx_V8_Brain_Hypothalamus","eQTL_eQTLCatalogue_GTEx_V8_Brain_Nucleus_accumbens_basal_ganglia","eQTL_eQTLCatalogue_GTEx_V8_Brain_Putamen_basal_ganglia","eQTL_eQTLCatalogue_GTEx_V8_Brain_Spinal_cord_cervical_c1","eQTL_eQTLCatalogue_GTEx_V8_Brain_Substantia_nigra","eQTL_eQTLCatalogue_GTEx_V8_Nerve_Tibial","eQTL_eQTLCatalogue_GTEx_V8_Pituitary","eQTL_eQTLCatalogue_GTEx_V8_Whole_Blood","eQTL_eQTLCatalogue_Kasela_2017_microarray_Tcell_CD4","eQTL_eQTLCatalogue_Kasela_2017_microarray_Tcell_CD8","eQTL_eQTLCatalogue_Lepik_2017_blood","eQTL_eQTLCatalogue_Naranbhai_2015_microarray_neutrophil_CD16","eQTL_eQTLCatalogue_Nedelec_2016_macrophage_Listeria","eQTL_eQTLCatalogue_Nedelec_2016_macrophage_Salmonella","eQTL_eQTLCatalogue_Nedelec_2016_macrophage_naive","eQTL_eQTLCatalogue_Quach_2016_monocyte_IAV","eQTL_eQTLCatalogue_Quach_2016_monocyte_LPS","eQTL_eQTLCatalogue_Quach_2016_monocyte_Pam3CSK4","eQTL_eQTLCatalogue_Quach_2016_monocyte_R848","eQTL_eQTLCatalogue_Quach_2016_monocyte_naive","eQTL_eQTLCatalogue_ROSMAP_brain_naive","eQTL_eQTLCatalogue_Schmiedel_2018_Bcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_CD4_Tcell_antiCD3CD28","eQTL_eQTLCatalogue_Schmiedel_2018_CD4_Tcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_CD8_Tcell_antiCD3CD28","eQTL_eQTLCatalogue_Schmiedel_2018_CD8_Tcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_NKcell_naive","eQTL_eQTLCatalogue_Schmiedel_2018_Tfh_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th117_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th17_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th1_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Th2_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Treg_memory","eQTL_eQTLCatalogue_Schmiedel_2018_Treg_naive","eQTL_eQTLCatalogue_Schmiedel_2018_monocyte_CD16_naive","eQTL_eQTLCatalogue_Schmiedel_2018_monocyte_naive","eQTL_eQTLCatalogue_Schwartzentruber_2018_sensory_neuron")

5.3 Initial round of prioritisation

# prepare predictors
ls_pNode_genomic <- oPierSNPsAdv(data=input_data, LD.customised=LD.customised, significance.threshold=5e-8, GR.SNP=GR.SNP, GR.Gene=GR.Gene, distance.max=20000, decay.kernel="constant", include.QTL=include.QTL, include.RGB=include.RGB, network.customised=network.customised, placeholder=placeholder, verbose.details=F)

# do prioritisation（the initial round）
ls_pNode <- Filter(Negate(is.null), ls_pNode_genomic)
dTarget0 <- oPierMatrix(ls_pNode, displayBy="pvalue", aggregateBy="fishers", rangeMax=10, GR.Gene=GR.Gene, placeholder=placeholder)

5.4 Informative predictors

# GSP (phase 2 or above drug targets for neuropsychiatric disorders)
ig.EF.ChEMBL <- oRDS('ig.EF.ChEMBL', placeholder=placeholder)

codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")
ind <- match(codes_name, V(ig.EF.ChEMBL)$name)
GSP <- base::Reduce(union, V(ig.EF.ChEMBL)$GSP[ind])

# GSN
druggable_targets <- do.call(rbind, V(ig.EF.ChEMBL)$phase) %>% pull(target) %>% unique()
g1 <- oRDS('org.Hs.string_medium', placeholder=placeholder)
g2 <- oRDS('org.Hs.PCommons_UN', placeholder=placeholder)
g <- oCombineNet(list(g1,g2), combineBy='union', attrBy="intersect")
sGS <- oGSsimulator(GSP=GSP, population=druggable_targets, network.customised=g, neighbor.order=1, verbose=F)
GSN <- sGS$GSN
message(sprintf("GSP (%d), GSN (%d)", length(GSP), length(GSN)), appendLF=T)

# predictor matrix
df_predictor <- oPierMatrix(ls_pNode, displayBy="score", aggregateBy="none", GR.Gene=GR.Gene, placeholder=placeholder)

# informative predictors
sClass <- oClassifyRF(df_predictor, GSP=GSP, GSN=GSN, nfold=3, nrepeat=10, mtry=NULL, ntree=1000, cv.aggregateBy="fishers")
df_imp <- sClass$importance %>% as_tibble(rownames='predictor')
df_perf <- sClass$performance %>% as_tibble(rownames='predictor')
imp_perf <- df_imp %>% inner_join(df_perf, by='predictor') %>% mutate(group=str_replace_all(predictor,'_.*','')) %>% arrange(group,-MeanDecreaseAccuracy)
xintercept <- imp_perf %>% filter(group=='nGene') %>% pull(MeanDecreaseAccuracy)
yintercept <- imp_perf %>% filter(group=='nGene') %>% pull(MeanDecreaseGini)
zintercept <- imp_perf %>% filter(group=='nGene') %>% pull(auroc)
imp_perf2 <- imp_perf %>% filter(MeanDecreaseAccuracy>=0, MeanDecreaseGini>=0, direction=='+') %>% mutate(x=MeanDecreaseAccuracy>=xintercept, y=MeanDecreaseGini>=yintercept, z=auroc>=zintercept) %>% arrange(-MeanDecreaseAccuracy)

# predictors selected based the importance no worse than nGene
vec_predictor_selected <- imp_perf2 %>% filter(x|y|z) %>% pull(predictor)
vec_predictor_selected

5.5 Prioritisation based on informative predictors

# do prioritisation（based on informative predictors）
dTarget <- oPierMatrix(ls_pNode[vec_predictor_selected], displayBy="pvalue", aggregateBy="fishers", rangeMax=10, GR.Gene=GR.Gene, placeholder=placeholder)

# write into a file 'CRM_NPD_priority.txt.gz' under the folder 'pipe_showcase'
dTarget$priority %>% select(name,rating,rank,seed,nGene,eGene,cGene,description) %>% write_delim(file.path(outdir,'CRM_NPD_priority.txt.gz'), delim='\t')

# save into a file 'dTarget_NPD.CRM.RData' under the folder 'pipe_showcase'
save(list='dTarget', file=file.path(outdir,'dTarget_NPD.CRM.RData'))

5.6 Prioritised target genes

Target genes stored in the output file CRM_NPD_priority.txt.gz above can be explored below. Notes, genes are ranked by rating (credit score ranged 0-10; see the column rating).

5.7 Manhattan plot

top.label.query <- dTarget$priority %>% top_frac(0.5,rating) %>% pull(name)
label.query.only <- T
gp_manhattan <- oPierManhattan(dTarget, color=c("darkred", "darkorange"), point.size=0.2, top=30, top.label.type="text", top.label.size=2, top.label.col="black", top.label.query=top.label.query, label.query.only=label.query.only, y.lab="Credit score\n(neuropsychiatric disorders)", GR.Gene=GR.Gene, placeholder=placeholder)

# saved into a file 'CRM_NPD_manhattan.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_manhattan.pdf'), gp_manhattan, width=10, height=3)

Figure 5.1: Manhattan plot illustrates credit score (y-axis) for prioritised target genes (color-coded by chromosomes; x-axis), with top 30 genes named.

6 Performance evaluation

6.1 Benchmarking

codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")

ls_tb <- pbapply::pblapply(1:length(codes_name), function(j){
    
    message(sprintf("Processing %s ...", codes_name[j]), appendLF=T)

    ind <- match(codes_name[j], V(ig.EF.ChEMBL)$name)
    if(sum(is.na(ind))==0){
        GSP <- base::Reduce(union, V(ig.EF.ChEMBL)$GSP[ind])
        GSN <- base::Reduce(union, V(ig.EF.ChEMBL)$GSN[ind])
        message(sprintf("%s, GSP (%d), GSN (%d): (%s)", codes_name[j], length(GSP), length(GSN), as.character(Sys.time())), appendLF=T)
    }
    
    ls_pNode_selected <- list('PIPE'=dTarget, 'PIPE0'=dTarget0)
    ls_pPerf <- lapply(ls_pNode_selected, function(x){
        if (is(x,"pNode")){
            prediction <- x$priority %>% select(name,priority)
        }else if(is(x,"dTarget")){
            prediction <- x$priority %>% select(name,rating)
        }
        pPerf <- oClassifyPerf(prediction=prediction, GSP=GSP, GSN=GSN, verbose=F)
    })
    
    tb <- tibble(name=codes_name[j], methods=names(ls_pPerf), pPerf=ls_pPerf)
})

# do extracting
ls_auroc <- lapply(1:length(ls_tb), function(j){
    tb <- ls_tb[[j]] %>% filter(methods %in% c("PIPE","PIPE0"))
    auroc <- sapply(1:nrow(tb), function(i){
        tb$pPerf[[i]]$auroc
    })
    tb %>% mutate(auroc=auroc) %>% select(-pPerf)
})
df_auroc <- do.call(rbind, ls_auroc)
mat <- df_auroc %>% select(name,methods,auroc) %>% pivot_wider(names_from=methods, values_from=auroc) %>% column_to_rownames('name') %>% arrange(-PIPE)
data.label <- round(mat*1000) / 1000
gp_auc <- oHeatmap(mat, zlim=c(0,1), colormap="brewer.YlOrRd", x.rotate=45, x.text.size=8, y.text.size=8, legend.title="AUC", legend.text.size=6, legend.title.size=8, shape=19, size=12, data.label=data.label, label.size=3, label.color="white")

# saved into a file 'CRM_NPD_AUC.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_AUC.pdf'), gp_auc, width=4, height=4)

Heatmap-like comparisons of the performance between PIPE (based on informative predictors) and PIPE0 (based on all predictors), measured by area under the ROC curve (AUC) separating clinical proof-of-concept targets from simulated negative targets.

Figure 6.1: Heatmap-like comparisons of the performance between PIPE (based on informative predictors) and PIPE0 (based on all predictors), measured by area under the ROC curve (AUC) separating clinical proof-of-concept targets from simulated negative targets.

6.2 Leading prioritisation analysis

Leading prioritisation analysis (LPA) is performed to quantify the degree to which a target set (e.g. clinical proof-of-concept targets) is enriched at the ‘leading prioritisation’ of the prioritised gene list ranked by credit scores. The leading prioritisation (or the leading edge) contains the left-most subset of the highly prioritised/credited gene list, with a normalised enrichment score (NES) calculated as the observed running enrichment score divided by the expected one, and the significance level (P-value) estimated by the permutation test 50,000 times.

# df_priority
df_priority <- dTarget$priority %>% top_frac(1,rating) %>% transmute(priority=rating, rank=rank)
df_priority <- dTarget$priority %>% column_to_rownames("name") %>% top_frac(1,rating) %>% transmute(priority=rating, rank=rank)

# prepare SET
codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")
ind <- match(codes_name, V(ig.EF.ChEMBL)$name)
codes <- c('AN','ADH','ASD','BIP','MD','OCD','SCZ','TS')
ls_gsp <- V(ig.EF.ChEMBL)$GSP[ind]
names(ls_gsp) <- codes
info <- ls_gsp %>% enframe() %>% transmute(id=name, member=value)
info <- info %>% mutate(name=codes_name, namespace='NPD', distance=NA) %>% mutate(n=map_dbl(member,length)) %>% select(name,id,namespace,distance,member,n)
set <- list(info=info, stamp=as.Date(Sys.time()))
class(set) <- "SET"

# do leading prioritisation analysis (LPA)
set.seed(825)
eLPA <- oPierGSEA(df_priority, customised.genesets=set, size.range=c(10,5000), nperm=50000)
df_summary <- eLPA$df_summary %>% mutate(flag=ifelse(adjp<0.05,'Y','N')) %>% mutate(frac=nLead/nAnno) %>% filter(frac<=1) %>% arrange(tolower(setID))
df_leading <- eLPA$leading[df_summary %>% pull(setID)] %>% enframe() %>% transmute(setID=name,members=value) %>% mutate(rank_members=map_chr(members, function(x) str_c(names(x),' (',x,')', collapse=', ')))
df_summary_leading <- df_summary %>% inner_join(df_leading, by='setID')

# save into a file 'CRM_NPD_lpa.txt' under the folder 'pipe_showcase'
df_summary_leading %>% select(-members) %>% write_delim(file.path(outdir,'CRM_NPD_lpa.txt'), delim='\t')

Target genes stored in the output file CRM_NPD_lpa.txt above can be explored below. Notes, the column frac refers to the fraction of clinical proof-of-concept targets recovered at the leading prioritisation of the prioritised list.

Pleiotropy Informing Clinical Therapeutics (PICT) is defined as the tendency of the prioritised gene list to be clinical proof-of-concept targets. A disease, if having a more significant enrichment (FDR; P-value adjusted accounting for the multiple tests), a higher NES and a higher coverage (the fraction of clinical proof-of-concept targets recovered at the ‘leading prioritisation’ of the prioritised list), will have a higher PICT potential.

pict <- df_summary_leading %>% mutate(pict=log10(nes * frac / adjp)) %>% select(setID, nes, adjp, frac, nAnno, nLead, pict) %>% arrange(tolower(setID))
gp_pict <- oPolarBar(pict %>% transmute(name=setID,value=pict), colormap='sci_locuszoom', size.name=9, size.value=4)

# saved into a file 'CRM_NPD_pict.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_pict.pdf'), gp_pict, width=7, height=6)

Figure 6.2: Pie chart sized by PICT for individual diseases arranged in clockwise order.

Leading prioritisation plot is shown for attention deficit hyperactivity disorder (ADHD).

leading.query <- df_priority %>% as_tibble(rownames='target') %>% top_frac(1,priority) %>% pull(target)
gp_leading <- oGSEAdotplot(eLPA, top='attention deficit hyperactivity disorder', x.scale="normal", leading=T, leading.edge.only=T, leading.query.only=T, leading.query=leading.query, colormap="jet.top", zlim=c(0,10), peak.color='black', leading.size=2, clab='Credit\nscore', leading.label.direction="left") + theme(plot.title=element_text(hjust=0.5,size=10), plot.subtitle=element_text(hjust=0.5,size=8))

# saved into a file 'CRM_NPD_leading_adhd.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_leading_adhd.pdf'), gp_leading, width=5, height=5)

Leading prioritisation plot for attention deficit hyperactivity disorder (ADHD), where clinical proof-of-concept targets recovered are indicated in vertical lines, labelled in gene symbols, and color-coded by the credit score.

Figure 6.3: Leading prioritisation plot for attention deficit hyperactivity disorder (ADHD), where clinical proof-of-concept targets recovered are indicated in vertical lines, labelled in gene symbols, and color-coded by the credit score.

6.3 Determining target-level disease relationships

The relationships between diseases are presented as a network, with diseases as nodes and their estimated relations as edges. Member genes for each disease are defined as clinical proof-of-concept targets recovered at the leading prioritisation. The edges are initially inferred if any member genes are shared between diseases, followed by the filtering by identifying the minimum spanning tree to keep only edges found in the resulting tree. The thickness of edges is proportional to the number of member genes shared between two-endpoint diseases.

# Member genes for each disease defined as clinical proof-of-concept targets recovered at the leading prioritisation
data <- df_summary_leading %>% transmute(name=setID, members=rank_members)

# remove (rank)
data0 <- data %>% separate_rows(members, sep=", ") %>% mutate(members=str_replace_all(members, ' \\(.*', '')) %>% nest(data=-name) %>% mutate(members=map_chr(data, function(x) str_c(unlist(x), collapse=', '))) %>% select(-data)

# bigraph
big <- data %>% separate_rows(members, sep=", ") %>% mutate(flag=1) %>% select(members,name,flag) %>% pivot_wider(names_from=members, values_from=flag, values_fill=list(flag=0)) %>% column_to_rownames("name") %>% oBicreate() %>% oBigraph()
big <- big %>% oLayout("graphlayouts.layout_with_stress")
igraph::degree(big) -> V(big)$degree
if(1){
    V(big)$rating <- str_replace_all(V(big)$name, '.*\\(|\\).*', '') %>% as.numeric()
    ifelse(V(big)$type=='xnode', V(big)$name, "") -> V(big)$label
    ifelse(V(big)$type=='ynode' & V(big)$rating<=nrow(df_priority) * 0.02 | V(big)$type=='xnode', V(big)$name, "") -> V(big)$label
}
gp_big <- big %>% oGGnetwork(node.label="label", node.label.size=2, node.label.force=0.02, node.xcoord="xcoord", node.ycoord="ycoord", colormap='steelblue-steelblue', node.shape="type", node.size="degree", node.size.range=c(0.5,6),edge.color="grey80", edge.color.alpha=0.3, edge.arrow.gap=0, edge.curve=0.05)
## saved into a file 'CRM_NPD_big.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_big.pdf'), gp_big, width=5, height=5)

# identify the minimum spanning tree (mst)
tig <- data %>% oTIG(method='empirical', empirical.cutoff=0.8)
tig <- tig %>% oLayout("gplot.layout.fruchtermanreingold")
gp_mst <- oGGnetwork(tig, node.label='name', node.label.size=2.5, node.label.color='steelblue', node.label.force=1, node.xcoord='xcoord', node.ycoord='ycoord', colormap="steelblue-steelblue", edge.size='weight', node.shape=18, node.size='n_members', node.size.title="Num of\ngenes", slim=c(0,25), node.size.range=c(2,5), edge.color='gray80', edge.color.alpha=0.3, edge.curve=0,edge.arrow.gap=0.01)
## saved into a file 'CRM_NPD_mst.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_mst.pdf'), gp_mst, width=5, height=5)

Network-like representation of diseases (in circle) and member genes (in triangle). Member genes shared between any two diseases are also labelled, including target ranks (in the parentheses).

Figure 6.4: Network-like representation of diseases (in circle) and member genes (in triangle). Member genes shared between any two diseases are also labelled, including target ranks (in the parentheses).

Inter-disease relationships inferred by sharing of member genes. Nodes are sized by the number of member genes, and edges for relationships (with the thickness proportional to the number of genes shared between two-endpoint diseases).

Figure 6.5: Inter-disease relationships inferred by sharing of member genes. Nodes are sized by the number of member genes, and edges for relationships (with the thickness proportional to the number of genes shared between two-endpoint diseases).

6.4 Side-by-side comparisons of recovered therapeutics

A heatmap is used to illustrate clinical proof-of-concept therapeutic targets recovered at the leading prioritisation for each individual disease within neuropsychiatric disorders

library(ComplexHeatmap)

# df_priority_leading
data <- df_summary_leading %>% transmute(name=setID,members) %>% mutate(target=map(members, function(x) names(x))) %>% mutate(rating=map(members, function(x) x)) %>% select(name,target,rating) %>% unnest(c(target,rating))
df_x <- data %>% pivot_wider(names_from=name, values_from=rating)
df_y <- df_priority %>% as_tibble(rownames='target')
df_priority_leading <- df_y %>% inner_join(df_x, by='target') %>% arrange(-priority)

# write into a file 'CRM_NPD_priority_leading.txt' under the folder 'pipe_showcase'
df_priority_leading %>% write_delim(file.path(outdir,'CRM_NPD_priority_leading.txt'), delim='\t')

# ht_main
D <- df_priority_leading %>% mutate(rname=str_c(target,' (',rank,')')) %>% select(rname, priority) %>% column_to_rownames('rname')
col_fun <- circlize::colorRamp2(seq(0,6,length=64), oColormap('spectral')(64))
#vec_group <- df_priority_leading %>% pull(rank)
#vec_group <- as.numeric(cut(df_priority_leading$rank, breaks=seq(0,10000,100)))
vec_group <- as.numeric(cut(df_priority_leading$rank, breaks=ceiling(nrow(df_priority) * seq(0,0.5,0.05))))
ht_main <- Heatmap(D, name="Credit\nscore", col=col_fun, border=T, 
    row_split=vec_group, cluster_rows=F, show_row_dend=T, row_title=NULL, show_row_names=T, row_names_gp=gpar(fontsize=5), row_names_rot=0, row_names_side="left", clustering_distance_rows="euclidean", clustering_method_rows="average",
    cluster_columns=F, column_order=1:ncol(D), show_column_names=F, column_names_side="top", column_names_gp=gpar(fontsize=8), column_names_rot=0
)

# row_anno: ls_ha_mark
colormap <- "sci_locuszoom"
ncolor <- ncol(df_priority_leading) - 3
ls_ha_mark <- lapply(1:ncolor, function(j){
    node.color <- oColormap(colormap)(ncolor)[j]
    vec_at <- df_priority_leading %>% pull(j+3)
    vec_at[!is.na(vec_at)] <- 1
    vec_at[is.na(vec_at)] <- 0
    at <- which(vec_at==1)
    ha_mark_at <- rowAnnotation(
        D=anno_simple(vec_at, col=c("0"="grey95", "1"=node.color), width=unit(1.5,"mm")),
        mark=anno_mark(at=at, labels=df_priority_leading$target[at], labels_gp=gpar(fontsize=5,col="black"), lines_gp=gpar(col=node.color))
    )
    # D1, D2, D3, ...
    ha_mark_at@anno_list$D@label <- str_c('D',j)
    ha_mark_at
})
names(ls_ha_mark) <- colnames(df_priority_leading)[4:(3+ncolor)]
names(ls_ha_mark)
#ht <- ht_main + ls_ha_mark[[1]] + ls_ha_mark[[2]] + ls_ha_mark[[3]] + ls_ha_mark[[4]] + ls_ha_mark[[5]] + ls_ha_mark[[6]] + ls_ha_mark[[7]] + ls_ha_mark[[8]]
#draw(ht, heatmap_legend_side="left", annotation_legend_side="left")

# row_anno: ha_mark_approved
codes_name <- c("anorexia nervosa","attention deficit hyperactivity disorder","autism spectrum disorder","bipolar disorder","unipolar depression","obsessive-compulsive disorder","schizophrenia","Tourette syndrome")
ChEMBL <- oRDS("ChEMBL_v33", placeholder=placeholder)
ChEMBL_subset <- ChEMBL %>% filter(efo_term %in% codes_name)
### df_target_label
df_target_label <- ChEMBL_subset %>% filter(phase==4,target_number<=5) %>% select(Symbol,pref_name_drug,efo_term) %>% nest(data=pref_name_drug) %>% mutate(drugs=map_chr(data, function(x) {
    y <- x %>% pull(pref_name_drug)
    if(length(y)>1){
        y <- c(y %>% head(1), '...')
        str_c(y,collapse=" | ")
    }else{
        str_c(y %>% head(1),collapse=" | ")
    }
})) %>% mutate(label=str_c(efo_term,' (',drugs,')')) %>% select(Symbol,label) %>% group_by(Symbol) %>% summarise(label=str_c(label,collapse='\n    ')) %>% ungroup() %>% mutate(label=str_c(Symbol,':\n    ',label))
### df_label
df_label <- df_priority_leading %>% select(target) %>% left_join(df_target_label, by=c('target'='Symbol'))
vec_at <- df_label$label
at <- which(!is.na(vec_at))
ha_mark_approved <- rowAnnotation(
    A=anno_simple(0+!is.na(vec_at), col=c("0"="grey90", "1"="black"), width=unit(3,"mm")),
    mark=anno_mark(at=at, labels=df_label$label[at], labels_gp=gpar(fontsize=5,col='black'), lines_gp=gpar(col='black'))
)
#ht <- ht_main + ha_mark_approved
#draw(ht, heatmap_legend_side="left", annotation_legend_side="left")

# do visualisation
## saved into a file 'CRM_NPD_heatmap_therapeutics.pdf' under the folder 'pipe_showcase'
ht <- ht_main + ls_ha_mark[[1]] + ls_ha_mark[[2]] + ls_ha_mark[[3]] + ls_ha_mark[[4]] + ls_ha_mark[[5]] + ls_ha_mark[[6]] + ls_ha_mark[[7]] + ls_ha_mark[[8]] + ha_mark_approved
pdf(file.path(outdir, 'CRM_NPD_heatmap_therapeutics.pdf'), width=9.8, height=9.5)
draw(ht, heatmap_legend_side="left", annotation_legend_side="left")
dev.off()

Illustration of clinical proof-of-concept therapeutic targets recovered at the leading prioritisation for each of eight individual diseases (D1 - D8) within neuropsychiatric disorders, with labellings for approved drug targets (the column 'A').

Figure 6.6: Illustration of clinical proof-of-concept therapeutic targets recovered at the leading prioritisation for each of eight individual diseases (D1 - D8) within neuropsychiatric disorders, with labellings for approved drug targets (the column ‘A’).

7 Identification of targets at the pathway crosstalk level

From the list of all prioritised target genes, a concise and manageable list of genes are identified, likely mediating the crosstalk between molecular pathways (sourced from KEGG), that is, targets at the pathway crosstalk level.

ig.KEGG.category <- oRDS('ig.KEGG.category', placeholder=placeholder)
ig <- ig.KEGG.category %>% filter(category %in% c("Environmental Process")) %>% deframe() %>% oCombineNet()
ig2 <- oNetInduce(ig, nodes_query=V(ig)$name, largest.comp=T) %>% as.undirected()
subg <- oPierSubnet(dTarget, network.customised=ig2, subnet.size=50)

# do permutation test
#subg_perm <- oPierSubnet(dTarget, network.customised=ig2, subnet.size=50, test.permutation=T, num.permutation=100)

# save into a file 'subg_NPD.CRM.RData' under the folder 'pipe_showcase'
save(list='subg', file=file.path(outdir,'subg_NPD.CRM.RData'))

# write into a file 'CRM_NPD_priority_subg.txt' under the folder 'pipe_showcase'
dTarget$priority %>% select(name,rating,rank,seed,nGene,eGene,cGene,description)  %>% semi_join(tibble(name=V(subg)$name), by='name') %>% write_delim(file.path(outdir,'CRM_NPD_priority_subg.txt'), delim='\t')

7.1 Pathway crosstalk genes

Pathway crosstalk genes stored in the output file CRM_NPD_priority_subg.txt above can be explored below. Notes, genes are ranked by rating (credit score ranged 0-10; see the column rating).

7.2 Pathway crosstalk visualisation

subg <- subg %>% oLayout("graphlayouts.layout_with_stress")
gp_subg_evidence <- oVisEvidenceAdv(dTarget, nodes=V(subg)$name, g=subg, node.info="simple", colormap="jet.top", zlim=c(0,10), node.color.alpha=0.7, node.size.range=4, node.color.title="Credit\nscore", edge.color='steelblue', edge.color.alpha=0.2, edge.curve=0.05)

# saved into a file 'CRM_NPD_subg_evidence.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_subg_evidence.pdf'), gp_subg_evidence, width=7, height=7)

Visualisation of the pathway crosstalk, with nodes labelled by gene symbols (ranks), and if identified as genomic/pleiotropic seed genes, also highlighted as pie segments.

Figure 7.1: Visualisation of the pathway crosstalk, with nodes labelled by gene symbols (ranks), and if identified as genomic/pleiotropic seed genes, also highlighted as pie segments.

7.3 Crosstalk-based effect-by-removal analysis

Removal analysis is conducted to evaluate the effect of nodes on the crosstalk, done so: (i) removed individually (i.e. single-node removal), and (ii) removed in combination (e.g. combinatorial removal involving any two nodes). Removing nodes, if critical for the network, would result in a large fraction of nodes disconnected from the largest network component. These critical nodes are identified based on the optimal removal with the largest effect when removing single nodes or any two nodes in combination.

levels <- subg %>% oIG2TB('nodes') %>% arrange(name) %>% pull(name)
ls_df <- lapply(seq(2), function(i){
    combn(levels,i,simplify=F) -> nodes.combine
    df <- oAttack(subg, measure="combine", nodes.combine=nodes.combine)
    df %>% mutate(i=i)
})
df_removal <- do.call(rbind, ls_df) %>% arrange(i,-frac.disconnected) 

## df_single
df_single <- df_removal %>% filter(i==1) %>% arrange(-frac.disconnected)
## df_comb_best
df_comb_best <- df_removal %>% filter(i>1) %>% group_by(i) %>% top_n(1, frac.disconnected) %>% ungroup()

## gp_optimal
data <- rbind(df_comb_best %>% filter(i<=2), df_single) %>% distinct() %>% arrange(i,frac.disconnected) %>% mutate(x=fct_inorder(nodes.removed)) %>% transmute(value=frac.disconnected, member=nodes.removed)
levels.customised <- data %>% separate_rows(member,sep=',') %>% pull(member) %>% unique()
gp <- data %>% oUpsetAdv(member.levels='customised',levels.customised=levels.customised, label.height.unit=6.5)
gp <- gp + scale_y_continuous(limits=NULL) + geom_point(aes(fill=as.factor(i)), shape=22, size=2, color="transparent") + geom_line(aes(group=i,color=as.factor(i))) + guides(color='none')
gp_optimal <- gp + ylab('Effect on the crosstalk measured \nby fraction of nodes disconnected\nupon node removal') + scale_fill_discrete(guide=guide_legend('Optimal combination',title.position="top",nrow=1)) + theme(legend.position="bottom", axis.title.y=element_text(size=8), axis.text.y=element_text(size=7))
### saved into a file 'CRM_NPD_subg_removal_optimal.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_subg_removal_optimal.pdf'), gp_optimal, width=7, height=9)

## gp_highlight
### visualise the crosstalk highlighted by nodes
vec_hightlight <- df_comb_best %>% separate_rows(nodes.removed, sep=',') %>% pull(nodes.removed) %>% unique()
### degree
ig <- subg
igraph::degree(ig) -> V(ig)$degree
### highlight: label and color
df_hightlight <- tibble(name=vec_hightlight, flag=1)
df_hightlight_label <- tibble(name=V(ig)$name) %>% left_join(df_hightlight, by='name') %>% mutate(flag=ifelse(is.na(flag),0,flag)) %>% mutate(label=ifelse(flag==1,name,''))
V(ig)$label <- df_hightlight_label %>% pull(label)
V(ig)$color <- df_hightlight_label %>% pull(flag)
gp_hightlight <- ig %>% oGGnetwork(node.label='label', node.label.size=3, node.label.color='black', node.label.alpha=1, node.label.padding=0.1, node.label.arrow=0, node.label.force=1, node.xcoord="xcoord", node.ycoord="ycoord", node.color="color", node.color.title='rating', colormap="steelblue4-darkred", zlim=c(0,1), node.color.alpha=0.7, node.size='degree', node.size.title='degree', node.size.range=c(1.5,5), edge.color="steelblue4", edge.color.alpha=0.5, edge.size=0.3, edge.curve=0.05, edge.arrow.gap=0.01)
gp_hightlight <- gp_hightlight + guides(color="none",size="none")
### saved into a file 'CRM_NPD_subg_removal_optimal_highlight.pdf' under the folder 'pipe_showcase'
ggsave(file.path(outdir,'CRM_NPD_subg_removal_optimal_highlight.pdf'), gp_hightlight, width=5, height=5)

$Effect (fraction of disconnected nodes in the y-axis) of node removals, individually or in combination (indicated by colored circles beneath in the x-axis), on the crosstalk. Notably, only the optimal removal with the largest effect is illustrated for two-node combinatorial removal.$

Figure 7.2: Effect (fraction of disconnected nodes in the y-axis) of node removals, individually or in combination (indicated by colored circles beneath in the x-axis), on the crosstalk. Notably, only the optimal removal with the largest effect is illustrated for two-node combinatorial removal.

Figure 7.3: Visualisation of the crosstalk, labelled for genes with optimal removals.

Showcasing neuropsychiatric disorders (NPD)

2024-09-15