Citation
When using any data for research or training purposes, please cite our paper The impact of age and number of mutations on the size of clonal hematopoiesis
(see PNAS 2024).
Abstract
Clonal hematopoiesis (CH)
represents the clonal expansion of hematopoietic stem cells and their progeny driven by somatic mutations. Accurate risk assessment of CH is critical for disease prevention and clinical decision-making. The size of CH has been showed to associate with higher disease risk, yet, factors influencing the size of CH are unknown. In addition, the characteristics of CH in long-lived individuals are not well documented. Here, we report an in-depth analysis of CH in longevous (≥90 y old) and common (60~89 y old) elderly groups. Utilizing targeted deep sequencing, we found that the development of CH is closely related to age and the expression of aging biomarkers. The longevous elderly group exhibited a significantly higher incidence of CH and significantly higher frequency of TET2 and ASXL1 mutations, suggesting that certain CH could be beneficial to prolong life. Intriguingly, the size of CH neither correlates significantly to age, in the range of 60 to 110 y old, nor to the expression of aging biomarkers. Instead, we identified a strong correlation between large CH size and the number of mutations per individual. These findings provide a risk assessment biomarker for CH and also suggest that the evolution of the CH is influenced by factor(s) in addition to age.
# Please install R (version 4.3.1 or above)
# see https://cran.r-project.org
# if the package 'BiocManager' not installed, please do so
if(!("BiocManager" %in% rownames(installed.packages()))) install.packages("BiocManager")
# install packages: tidyverse, patchwork, remotes
BiocManager::install(c('tidyverse','patchwork','remotes','ggpubr'), dependencies=T)
BiocManager::install("hfang-bristol/PICH", dependencies=T)
# load packages
library(tidyverse)
library(patchwork)
# specify built-in data placeholder
placeholder <- "http://www.comptransmed.pro/bigdata_pich"
# create the output folder 'CH_output'
outdir <- 'CH_output'
if(!dir.exists(outdir)) dir.create(outdir)
df_input <- openxlsx::read.xlsx(file.path(placeholder,'RRB_CH.xlsx')) %>% as_tibble()
##############
## excluding
df_input <- df_input %>% filter(ID!="DZ199")
##############
df_input <- df_input %>% mutate(Sex=ifelse(Sex=='女','Female','Male')) %>% select(ID, Sex, Age, DNMT3A:ZRSR2) %>% pivot_longer(cols=DNMT3A:ZRSR2, names_to='gene', values_to='mutation')
# add age group ("agrp")
df_input <- df_input %>% mutate(agrp=cut(Age, breaks=seq(60,90,10), right=F) %>% as.character()) %>% mutate(agrp=ifelse(is.na(agrp),'[90,+)',agrp))
# add longevity group ("lngv")
df_input <- df_input %>% mutate(lngv=ifelse(agrp=='[90,+)','Longevity: [90,+)','Others: [60,90)'))
df_input <- df_input %>% select(ID,Sex,Age,agrp,lngv,gene,mutation)
# convert into mutation-level data
df_full <- df_input %>% separate_rows(mutation,sep=';|, ') %>% mutate(vaf=str_replace_all(mutation,'.*\\(|\\%.*','') %>% as.numeric()) %>% mutate(type=ifelse(str_detect(mutation,'^c.'), 'Splicing', mutation)) %>% mutate(type=ifelse(str_detect(type,'del'), 'Deletion', type)) %>% mutate(type=ifelse(str_detect(type,'fs'), 'Frameshift', type)) %>% mutate(type=ifelse(str_detect(type,'\\*'), 'Nonsense', type)) %>% mutate(type=ifelse(str_detect(type,'^p.'), 'Missense', type))
# output into 'RRB_CH_summary.xlsx'
df_full %>% openxlsx::write.xlsx(file.path(outdir,'RRB_CH_summary.xlsx'))
df_ch <- df_full %>% mutate(CH=ifelse(is.na(vaf),'N','Y')) %>% select(ID,Age,agrp,lngv,CH) %>% arrange(desc(CH)) %>% distinct(ID,Age,agrp,lngv,.keep_all=T)
# gp_ch
df_tmp <- df_ch %>% count(agrp) %>% rename(total=n)
df_ch_freq <- df_ch %>% count(CH,agrp) %>% inner_join(df_tmp, by='agrp') %>% mutate(freq=n/total)
p <- df_ch_freq %>% ggplot(aes(x=freq, y=agrp, fill=CH))
gp_ch <- p + geom_bar(stat="identity", width=0.8) + scale_fill_brewer(palette="Blues") + theme_classic() + xlab('Fraction per age group') + ylab('Age group')
# gp_ch_num
df_ch_num <- df_ch %>% count(CH,lngv)
p <- df_ch_num %>% ggplot(aes(x=lngv, y=n, fill=CH))
p <- p + geom_bar(stat="identity", position=position_dodge(), width=0.8) + scale_fill_brewer(palette="Blues") + theme_classic() + xlab('') + ylab('Number of volunteers')
gp_ch_num <- p + geom_text(aes(label=n), vjust=1.6, position=position_dodge(0.9), size=3)
# saved into a file 'Barplot_CH_age.pdf'
gp_both <- list(gp_ch,gp_ch_num) %>% wrap_plots(ncol=2, widths=c(2,1), guides="collect")
file.path(outdir,"Barplot_CH_age.pdf") %>% ggsave(gp_both, width=9, height=3)
Figure 4.1: Barplot for CH.
df_chip <- df_full %>% mutate(CHIP=ifelse(is.na(vaf) | vaf<2,'N','Y')) %>% select(ID,Age,agrp,lngv,CHIP) %>% arrange(desc(CHIP)) %>% distinct(ID,Age,agrp,lngv,.keep_all=T)
# gp_chip
df_tmp <- df_chip %>% count(agrp) %>% rename(total=n)
df_chip_freq <- df_chip %>% count(CHIP,agrp) %>% inner_join(df_tmp, by='agrp') %>% mutate(freq=n/total)
p <- df_chip_freq %>% ggplot(aes(x=freq, y=agrp, fill=CHIP))
gp_chip <- p + geom_bar(stat="identity", width=0.8) + scale_fill_brewer(palette="Reds") + theme_classic() + xlab('Fraction per age group') + ylab('Age group')
# gp_chip_num
df_chip_num <- df_chip %>% count(CHIP,lngv)
p <- df_chip_num %>% ggplot(aes(x=lngv, y=n, fill=CHIP))
p <- p + geom_bar(stat="identity", position=position_dodge(), width=0.8) + scale_fill_brewer(palette="Reds") + theme_classic() + xlab('') + ylab('Number of volunteers')
gp_chip_num <- p + geom_text(aes(label=n), vjust=1.6, position=position_dodge(0.9), size=3)
# saved into a file 'Barplot_CHIP_age.pdf'
gp_both <- list(gp_chip,gp_chip_num) %>% wrap_plots(ncol=2, widths=c(2,1), guides="collect")
file.path(outdir,"Barplot_CHIP_age.pdf") %>% ggsave(gp_both, width=9, height=3)
Figure 4.2: Barplot for CHIP.
df_ctrl <- df_input %>% distinct(lngv,gene,ID) %>% count(lngv,gene) %>% rename(total=n)
df_lngv <- df_full %>% filter(!is.na(mutation)) %>% distinct(lngv,gene,ID) %>% count(lngv,gene)
df_ctrl_lngv <- df_ctrl %>% full_join(df_lngv, by=c('lngv','gene')) %>% mutate(n=ifelse(is.na(n),0,n)) %>% mutate(freq=n/total) %>% mutate(Group=str_c(lngv,'\nn= ', total)) %>% arrange(Group,-freq,gene)
# saved into a file 'Barplot_mutatedGenes_longevity.pdf'
data <- df_ctrl_lngv %>% mutate(gene=factor(gene,levels=unique(gene) %>% rev()))
p <- data %>% ggplot(aes(x=freq, y=gene, fill=Group))
p <- p + geom_bar(stat="identity", position=position_dodge(), width=0.8) + scale_fill_manual(values=c('springgreen4','springgreen')) + theme_classic() + ylab('Mutated genes (ordered by fraction in the longevity group)') + xlab('Fraction of mutated genes per group') + scale_x_continuous(position='top') + theme(legend.position="top")
gp_bar <- p + geom_text(aes(label=n), hjust=-0.2, position=position_dodge(0.9), size=2.5)
file.path(outdir,"Barplot_mutatedGenes_longevity.pdf") %>% ggsave(gp_bar, width=5, height=7.5)
Figure 5.1: Barplot comparing mutated genes between longevity and others.
df_nummut <- df_full %>% group_by(ID,gene) %>% summarize(nummut=sum(vaf>0)) %>% ungroup() %>% mutate(nummut=ifelse(is.na(nummut),0,nummut))
df_maxvaf <- df_full %>% group_by(ID,gene) %>% summarize(maxvaf=max(vaf)) %>% ungroup() %>% mutate(maxvaf=ifelse(is.na(maxvaf),0,maxvaf))
df_all <- df_input %>% inner_join(df_nummut, by=c('ID','gene')) %>% inner_join(df_maxvaf, by=c('ID','gene'))
# regression (account for gender)
vec_genes <- df_all %>% distinct(gene) %>% pull(gene)
ls_res <- lapply(vec_genes, function(x){
#x <- 'TET2'
df_tmp <- df_all %>% mutate(y=ifelse(Age>=90,1,0)) %>% filter(gene==x)
model <- glm(y ~ Sex + nummut, family="binomial", data=df_tmp)
res1 <- cbind(OR=coef(model), confint(model)) %>% exp()
# ANOVA (Chisq test) (account for gender)
res2 <- anova(model, test="Chisq") %>% as.matrix()
# output
res <- tibble(gene=x, OR=res1[3,1], CIl=res1[3,2], CIh=res1[3,3], pval=res2[3,5])
})
df_res <- do.call(rbind, ls_res)
# at least 2 events per group
df_tmp1 <- df_ctrl_lngv %>% distinct(lngv,gene,n) %>% pivot_wider(names_from=lngv, values_from=n) %>% filter(`Longevity: [90,+)`>=2, `Others: [60,90)`>=2)
df_res_mut <- df_res %>% semi_join(df_tmp1, by='gene')
# saved into a file 'Univariate_logistic_longevity_mutations.pdf'
data <- df_res_mut %>% arrange(pval,gene) %>% mutate(gene=factor(gene,levels=rev(gene))) %>% mutate(label=str_c('P: ',signif(pval,2),'; OR: ',signif(OR,3),' [',signif(CIl,3),',',signif(CIh,3),']'))
p <- data %>% ggplot(aes(x=-log10(pval), y=gene, fill=-log10(pval)))
p <- p + geom_bar(stat="identity", width=0.7) + theme_classic() + ylab('') + xlab('Univariate logistic regression: longevity ~ gender + mutations per gene\n\n-log10(pval)') + scale_x_continuous(position='top', limits=c(0,8)) + theme(legend.position="bottom") + scale_fill_gradient2(guide=guide_colorbar(barheight=0.6,direction="horizontal"))
gp_logi_mut <- p + geom_text(aes(label=label), hjust=-0.02, position=position_dodge(0.9), size=2.5)
file.path(outdir,"Univariate_logistic_longevity_mutations.pdf") %>% ggsave(gp_logi_mut, width=6, height=3)
Figure 6.1: Univariate logistic regression model of longevity in terms of mutations (per gene), accounting for gender.
# regression (account for gender)
vec_genes <- df_all %>% distinct(gene) %>% pull(gene)
ls_res <- lapply(vec_genes, function(x){
df_tmp <- df_all %>% mutate(y=ifelse(Age>=90,1,0)) %>% filter(gene==x)
model <- glm(y ~ Sex + maxvaf, family="binomial", data=df_tmp)
res1 <- cbind(OR=coef(model), confint(model)) %>% exp()
res2 <- anova(model, test="Chisq") %>% as.matrix()
res <- tibble(gene=x, OR=res1[3,1], CIl=res1[3,2], CIh=res1[3,3], pval=res2[3,5])
})
df_res <- do.call(rbind, ls_res)
# at least 2 events per group
df_tmp1 <- df_ctrl_lngv %>% distinct(lngv,gene,n) %>% pivot_wider(names_from=lngv, values_from=n) %>% filter(`Longevity: [90,+)`>=2, `Others: [60,90)`>=2)
df_res_maxvaf <- df_res %>% semi_join(df_tmp1, by='gene')
# saved into a file 'Univariate_logistic_longevity_maxvaf.pdf'
data <- df_res_maxvaf %>% arrange(pval,gene) %>% mutate(gene=factor(gene,levels=rev(gene))) %>% mutate(label=str_c('P: ',signif(pval,2),'; OR: ',signif(OR,3),' [',signif(CIl,3),',',signif(CIh,3),']'))
p <- data %>% ggplot(aes(x=-log10(pval), y=gene, fill=-log10(pval)))
p <- p + geom_bar(stat="identity", width=0.7) + theme_classic() + ylab('') + xlab('Univariate logistic regression: longevity ~ gender + maxvaf per gene\n\n-log10(pval)') + scale_x_continuous(position='top', limits=c(0,8)) + theme(legend.position="bottom") + scale_fill_gradient2(guide=guide_colorbar(barheight=0.6,direction="horizontal"))
gp_logi_maxvaf <- p + geom_text(aes(label=label), hjust=-0.02, position=position_dodge(0.9), size=2.5)
file.path(outdir,"Univariate_logistic_longevity_maxvaf.pdf") %>% ggsave(gp_logi_maxvaf, width=6, height=3)
Figure 6.2: Univariate logistic regression model of longevity in terms of maximum VAF (per gene), accounting for gender.
df_serum <- openxlsx::read.xlsx(file.path(placeholder,'RRB_Serum_New.xlsx')) %>% as_tibble() %>% mutate(Sex=ifelse(Sex=='女','Female','Male')) %>% pivot_longer(cols=CHIT1:CXCL8, names_to='gene', values_to='value') %>% mutate(ID=as.character(ID)) %>% mutate(value=log(value))
# CXCL8 (IL8)
# TNF (TNF-alpha)
# CXCL11 (I-TAC)
# S100A8
# CHIT1
# regression (account for gender)
vec_genes <- df_serum %>% distinct(gene) %>% pull(gene)
ls_res <- lapply(vec_genes, function(x){
df_tmp <- df_serum %>% mutate(y=CH) %>% filter(gene==x)
model <- glm(y ~ Sex + value, family="binomial", data=df_tmp)
res1 <- cbind(OR=coef(model), confint(model)) %>% exp()
res2 <- anova(model, test="Chisq") %>% as.matrix()
res <- tibble(gene=x, OR=res1[3,1], CIl=res1[3,2], CIh=res1[3,3], pval=res2[3,5])
})
df_res_elisa <- do.call(rbind, ls_res)
# saved into a file 'Univariate_logistic_ch_elisa.pdf'
data <- df_res_elisa %>% arrange(pval,gene) %>% mutate(gene=factor(gene,levels=rev(gene))) %>% mutate(label=str_c('P: ',signif(pval,2),'; OR: ',signif(OR,3),' [',signif(CIl,3),',',signif(CIh,3),']'))
p <- data %>% ggplot(aes(x=-log10(pval), y=gene, fill=-log10(pval)))
p <- p + geom_bar(stat="identity", width=0.7) + theme_classic() + ylab('') + xlab('Univariate logistic regression: CH ~ gender + ELISA per gene\n\n-log10(pval)') + scale_x_continuous(position='top', limits=c(0,3)) + theme(legend.position="bottom") + scale_fill_gradient2(guide=guide_colorbar(barheight=0.6,direction="horizontal"))
gp_logi_elisa <- p + geom_text(aes(label=label), hjust=-0.02, position=position_dodge(0.9), size=3)
file.path(outdir,"Univariate_logistic_ch_elisa.pdf") %>% ggsave(gp_logi_elisa, width=8, height=3.5)
Figure 7.1: Univariate logistic regression model of CH in terms of ELISA (per gene), accounting for gender.
# regression (account for gender)
vec_genes <- df_serum %>% distinct(gene) %>% pull(gene)
ls_res <- lapply(vec_genes, function(x){
df_tmp <- df_serum %>% mutate(y=CHIP) %>% filter(gene==x)
model <- glm(y ~ Sex + value, family="binomial", data=df_tmp)
res1 <- cbind(OR=coef(model), confint(model)) %>% exp()
res2 <- anova(model, test="Chisq") %>% as.matrix()
res <- tibble(gene=x, OR=res1[3,1], CIl=res1[3,2], CIh=res1[3,3], pval=res2[3,5])
})
df_res_elisa <- do.call(rbind, ls_res)
# saved into a file 'Univariate_logistic_chip_elisa.pdf'
data <- df_res_elisa %>% arrange(pval,gene) %>% mutate(gene=factor(gene,levels=rev(gene))) %>% mutate(label=str_c('P: ',signif(pval,2),'; OR: ',signif(OR,3),' [',signif(CIl,3),',',signif(CIh,3),']'))
p <- data %>% ggplot(aes(x=-log10(pval), y=gene, fill=-log10(pval)))
p <- p + geom_bar(stat="identity", width=0.7) + theme_classic() + ylab('') + xlab('Univariate logistic regression: CHIP ~ gender + ELISA per gene\n\n-log10(pval)') + scale_x_continuous(position='top', limits=c(0,3)) + theme(legend.position="bottom") + scale_fill_gradient2(guide=guide_colorbar(barheight=0.6,direction="horizontal"))
gp_logi_elisa <- p + geom_text(aes(label=label), hjust=-0.02, position=position_dodge(0.9), size=3)
file.path(outdir,"Univariate_logistic_chip_elisa.pdf") %>% ggsave(gp_logi_elisa, width=8, height=3.5)
Figure 7.2: Univariate logistic regression model of CHIP in terms of ELISA (per gene), accounting for gender.
library(PICH)
STRING_fin <- oRDS("STRING_fin", placeholder=placeholder)
ig <- STRING_fin
# calculates a single shortest path (keeps edges in shorted pathes)
data <- rbind(df_res_mut,df_res_elisa) %>% select(gene,pval)
vec_genes <- data %>% inner_join(tibble(gene=V(ig)$name), by='gene') %>% pull(gene)
res <- igraph::shortest_paths(ig, from=vec_genes, to=vec_genes, mode='all', output="epath")
subg <- igraph::subgraph.edges(ig, eids=res$epath %>% unlist())
df_subg <- tibble(gene=V(subg)$name) %>% left_join(data, by='gene') %>% mutate(score=-log10(pval))
V(subg)$score <- df_subg$score
subg <- subg %>% oLayout("graphlayouts.layout_with_stress")
gp_subg <- subg %>% oGGnetwork(node.label="name", node.label.size=3, node.xcoord="xcoord", node.ycoord="ycoord", node.color="score", node.color.title="-log10(pval)", colormap="spectral.top", zlim=c(0,2))
# saved into a file 'PPI_subnetwork.pdf'
file.path(outdir,"PPI_subnetwork.pdf") %>% ggsave(gp_subg, width=5, height=4.5)
Figure 8.1: PPI visualisation illustrating the connectivity between CH genes and aging-related genes. Genes are color-coded by the significant level comparing longevity and others.
library(ggpubr)
# Calculate the score per person
df_tmp <- df_all %>% inner_join(tibble(gene=V(subg)$name), by='gene') %>% group_by(ID,Sex,Age,agrp,lngv) %>% summarise(score=sum(nummut)) %>% ungroup()
# density plot (drawn separately by gender)
gp <- df_tmp %>% ggdensity(x="score", add="mean", rug=TRUE, color="lngv", fill="lngv", palette=c("springgreen4","springgreen"))
gp_den <- facet(gp, facet.by="Sex")
# saved into a file 'Density_longevity.pdf'
file.path(outdir,"Density_longevity.pdf") %>% ggsave(gp_den, width=10, height=4)
# boxplot (drawn separately by gender)
gp <- df_tmp %>% ggboxplot(x="lngv", y="score", color="lngv", palette=c("springgreen4","springgreen"))
my_comparisons <- list(c("Longevity: [90,+)", "Others: [60,90)") )
gp <- gp + stat_compare_means(comparisons=my_comparisons, method="t.test") + stat_compare_means(label.y=7)
gp_box <- facet(gp, facet.by="Sex")
# saved into a file 'Boxplot_longevity.pdf'
file.path(outdir,"Boxplot_longevity.pdf") %>% ggsave(gp_box, width=10, height=4.5)
Figure 8.2: Density plot for person-wise subnetwork scores, drawn separately by gender.
Figure 8.3: Boxplot for person-wise subnetwork scores, drawn separately by gender.
# boxplot across age groups (drawn separately by gender)
gp <- df_tmp %>% ggboxplot(x="agrp", y="score", color="agrp")
my_comparisons <- list( c("[90,+)","[80,90)"), c("[90,+)","[70,80)"), c("[90,+)","[60,70)") )
gp <- gp + stat_compare_means(comparisons=my_comparisons, method="t.test") + stat_compare_means(label.y=8, method="anova")
gp_box <- facet(gp, facet.by="Sex")
# saved into a file 'Boxplot_longevity_age.pdf'
file.path(outdir,"Boxplot_longevity_age.pdf") %>% ggsave(gp_box, width=10, height=4.5)
Figure 8.4: Boxplot for person-wise subnetwork scores (across age groups), drawn separately by gender.
df_tmp <- df_all %>% group_by(ID,Sex,Age) %>% summarize(maxvaf=max(maxvaf)) %>% ungroup()
model <- glm(maxvaf ~ Sex + Age + Sex*Age, family="gaussian", data=df_tmp)
summary(model)$coefficients
# pvalue for Sex*Age: 0.1156482
# pvalue for Age: 0.0891449
# Curve across ages (drawn separately by gender)
gp <- df_tmp %>% ggplot(aes(x=Age, y=maxvaf)) + geom_point(aes(group=Sex,fill=Sex)) + geom_line(aes(group=Sex,color=Sex)) + theme_classic() + xlab('Age (year)') + ylab('VAF (%) maximised across genes per person')
gp_curve <- gp + facet_grid(Sex~.) + guides(fill="none")
# saved into a file 'Curve_Age_maxvaf.pdf'
file.path(outdir,"Curve_Age_maxvaf.pdf") %>% ggsave(gp_curve, width=10, height=5)
Figure 8.5: Illustrating the relation between maxVAF and ages, drawn separately by gender.
# regression (account for gender, and its interaction with age)
vec_genes <- df_all %>% distinct(gene) %>% pull(gene)
ls_res <- lapply(vec_genes, function(x){
df_tmp <- df_all %>% filter(gene==x)
model <- glm(maxvaf ~ Sex + Age + Sex*Age, family="gaussian", data=df_tmp)
res <- summary(model)$coefficients
# output
res <- tibble(gene=x, effect=res[3,1], pval=res[3,4])
})
df_res <- do.call(rbind, ls_res)
# at least 5 non-zero maxVAF per gene
df_tmp1 <- df_all %>% filter(maxvaf!=0) %>% distinct(gene,ID) %>% count(gene) %>% filter(n>=5)
df_res <- df_res %>% semi_join(df_tmp1, by='gene')
df_res_maxvaf <- df_res %>% semi_join(df_tmp1, by='gene')
# saved into a file 'Gene_Age_maxvaf.pdf'
data <- df_res %>% arrange(pval,gene) %>% mutate(gene=factor(gene,levels=rev(gene))) %>% mutate(label=str_c('P: ',signif(pval,2),'; age effect: ',signif(effect,3)))
p <- data %>% ggplot(aes(x=-log10(pval), y=gene, fill=-log10(pval)))
p <- p + geom_bar(stat="identity", width=0.7) + theme_classic() + ylab('') + xlab('Regression: maxvaf per gene ~ gender + age + gender*age\n\n-log10(pval)') + scale_x_continuous(position='top', limits=c(0,3)) + theme(legend.position="bottom") + scale_fill_gradient2(guide=guide_colorbar(barheight=0.6,direction="horizontal"))
gp_maxvaf_gene <- p + geom_text(aes(label=label), hjust=-0.02, position=position_dodge(0.9), size=3)
file.path(outdir,"Gene_Age_maxvaf.pdf") %>% ggsave(gp_maxvaf_gene, width=6, height=3)
Figure 9.1: Regression model of gene-specific maximum vaf in terms of age, accounting for gender and its interaction with age.
data <- df_all %>% select(ID,Sex,nummut,maxvaf) %>% group_by(ID,Sex) %>% summarize(nummut=sum(nummut>0), maxvaf=max(maxvaf)) %>% ungroup() %>% arrange(Sex, -maxvaf)
data <- data %>% filter(maxvaf>=2) %>% mutate(vafgrp=cut(maxvaf, breaks=c(2,5,10,100), right=F))
df1 <- data %>% count(vafgrp,nummut)
df2 <- data %>% count(vafgrp) %>% rename(total=n)
df <- df1 %>% inner_join(df2, by='vafgrp') %>% mutate(freq=n/total) %>% mutate(nummut=as.character(nummut)) %>% mutate(nummut=ifelse(nummut %in% c("3","4"),">=3",nummut)) %>% mutate(vafgrp=as.character(vafgrp)) %>% mutate(vafgrp=ifelse(vafgrp=="[10,100)", "[10,+)", vafgrp)) %>% mutate(vafgrp=fct_inorder(vafgrp))
# saved into a file 'maxvaf_nummut.pdf'
gp_maxvaf_nummut <- df %>% ggplot(aes(x="", y=freq, fill=nummut)) + geom_bar(width=1, stat="identity") + scale_fill_brewer(palette="Blues") + theme_classic() + xlab('') + ylab('Percentage') + facet_grid(~vafgrp)
file.path(outdir,"maxvaf_nummut.pdf") %>% ggsave(gp_maxvaf_nummut, width=5, height=4)
Figure 10.1: Barplot showing the distribution of volunteers grouped by the maximum vaf as well as the number of mutated genes.