I am a Senior Bioinformatician with over 6 years of experience bridging the gap between computational biology and pharmaceutical R&D. Currently at 7N, I develop of scalable R/Shiny applications and analyze NGS and MS data for a Big Pharma client.
I hold a Ph.D.ย in Biological Sciences and am currently pursuing an MBA in Healthcare Management, combining deep technical expertise with strategic business insight.
R & Shiny Python WDL & Nextflow Docker SQL & Snowflake NGS Analysis Bash Mass Spectrometry Git Agile / Scrum Reproducible work
Click on a publication card below to reveal the specific Bioinformatics & Statistics methods used.
Below is an interactive gallery of 15 analytical modules demonstrating my coding expertise using realistic, simulated biological datasets. Click the tabs to explore.
Focus: Population structure, GWAS, Variant Quality Control, and Evolution.
Population structure visualization simulating realistic genotype noise.
# Parameters
n_samples <- 300
n_snps <- 1000
# Generate base matrix (noise)
G <- matrix(rnorm(n_samples * n_snps), nrow = n_samples)
# Define 3 populations with subtle shifts in allele frequencies
# Pop 1 (Rows 1-100): Shift in first 100 SNPs
G[1:100, 1:100] <- G[1:100, 1:100] + 0.5
# Pop 2 (Rows 101-200): Shift in SNPs 50-150
G[101:200, 50:150] <- G[101:200, 50:150] - 0.6
# Pop 3 (Rows 201-300): Shift in SNPs 800-900
G[201:300, 800:900] <- G[201:300, 800:900] + 0.7
groups <- c(rep("Holstein", 100), rep("Jersey", 100), rep("Angus", 100))
# Perform PCA
pca_res <- prcomp(G, scale. = TRUE)
var_explained <- round(summary(pca_res)$importance[2, 1:2] * 100, 1)
df_pca <- data.frame(PC1 = pca_res$x[,1], PC2 = pca_res$x[,2], Breed = groups)
ggplot(df_pca, aes(x = PC1, y = PC2, fill = Breed)) +
geom_point(size = 3, shape = 21, color = "white", alpha = 0.7) +
stat_ellipse(aes(color = Breed), geom = "polygon", alpha = 0.1, level = 0.95, show.legend = FALSE) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73")) +
scale_color_manual(values = c("#E69F00", "#56B4E9", "#009E73")) +
theme_minimal() +
labs(title = "Population Structure (PCA)",
subtitle = "Simulated genotype matrix (300 samples x 1k SNPs)",
x = paste0("PC1 (", var_explained[1], "% Var)"),
y = paste0("PC2 (", var_explained[2], "% Var)"))
Genome-Wide Association Study with LD-like signal peaks.
n_snps <- 5000
gwas_df <- data.frame(
SNP = paste0("rs", 1:n_snps),
Chr = rep(1:5, each = 1000),
Pos = rep(1:1000, 5),
# Background noise (uniform distribution of p-values)
PVal = runif(n_snps)
)
# Simulate Linkage Disequilibrium (LD) Peak on Chr 2
center <- 1500
width <- 50
peak_signal <- 10^-seq(1, 8, length.out = width)
gwas_df$PVal[(center-width/2):(center+width/2-1)] <- peak_signal * runif(width, 0.1, 1)
gwas_df <- gwas_df %>% mutate(logP = -log10(PVal))
ggplot(gwas_df, aes(x = Pos, y = logP, color = as.factor(Chr))) +
geom_point(alpha = 0.6, size = 1) +
scale_color_brewer(palette = "Set1") +
geom_hline(yintercept = -log10(5e-8), linetype = "dashed", color = "red", alpha = 0.5) +
theme_minimal() +
facet_grid(. ~ Chr, scales = "free_x", space = "free_x", switch = "x") +
theme(axis.text.x = element_blank(), panel.grid.major.x = element_blank(), legend.position = "none") +
labs(title = "Manhattan Plot", y = "-Log10 P-Value", x = "Chromosome")
Quality Control: Realistic skewed distribution of Read Depth.
# Simulate sequencing depth (Negative Binomial distribution is common for read counts)
dp_values <- rnbinom(5000, size = 5, mu = 40)
df_qc <- data.frame(Depth = dp_values)
ggplot(df_qc, aes(x = Depth)) +
geom_histogram(aes(y = ..density..), binwidth = 2, fill = "#34495e", color = "white", alpha = 0.8) +
geom_density(color = "#e74c3c", size = 1, adjust = 2) +
scale_x_continuous(limits = c(0, 150)) +
theme_light() +
labs(title = "Variant Quality Control (DP)",
subtitle = "Distribution of sequencing depth across variants",
x = "Read Depth (DP)", y = "Density") +
geom_vline(xintercept = 40, linetype="dashed", color="orange") +
annotate("text", x = 60, y = 0.02, label = "Mean Depth ~ 40x", color = "orange")
Hierarchical clustering of genetic distances.
# Simulate distance matrix with some structure
m <- matrix(rnorm(100), nrow = 10)
rownames(m) <- paste0("Species_", LETTERS[1:10])
dist_mat <- dist(m)
hc <- hclust(dist_mat, method = "ward.D2")
par(mar=c(2,2,2,2))
plot(hc, hang = -1, main = "Phylogenetic Tree Reconstruction",
sub = "Ward's method clustering", xlab = "",
col = "#2980b9", lwd = 2)
rect.hclust(hc, k = 3, border = "red") # Highlight 3 clades
Focus: Expression profiling, Pathways, and Single-Cell.
Differential Expression: Realistic relationship between Fold Change and P-value.
n_genes <- 3000
# Generate LogFC: mostly close to 0, heavy tails
logfc <- c(rnorm(2500, 0, 0.5), rnorm(250, 2, 1), rnorm(250, -2, 1))
# Generate P-values dependent on LogFC (stronger effect -> smaller p-value)
noise <- rnorm(n_genes, 0, 2)
log_pval <- -(abs(logfc) * 3) + noise
pvals <- 10^log_pval
pvals[pvals > 1] <- 1
df_vol <- data.frame(
Gene = paste0("G", 1:n_genes),
log2FoldChange = logfc,
padj = p.adjust(pvals, method = "BH")
) %>%
mutate(
Group = case_when(
log2FoldChange > 1 & padj < 0.05 ~ "Up-regulated",
log2FoldChange < -1 & padj < 0.05 ~ "Down-regulated",
TRUE ~ "NS"
)
)
ggplot(df_vol, aes(x = log2FoldChange, y = -log10(padj), color = Group)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_manual(values = c("steelblue", "grey85", "firebrick")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", alpha = 0.5) +
theme_minimal() +
labs(title = "Differential Expression (Volcano Plot)",
subtitle = "Simulated DESeq2 output with FDR correction",
x = "Log2 Fold Change", y = "-Log10 Adjusted P-value")
Simulating cellular trajectories/clusters.
# Create a "trajectory" shape rather than just blobs
t <- seq(0, 2*pi, length.out = 300)
x <- c(cos(t), cos(t) + 2.5, cos(t) + 1.2) + rnorm(900, 0, 0.2)
y <- c(sin(t), sin(t) + 1, sin(t) - 2) + rnorm(900, 0, 0.2)
clusters <- c(rep("Stem", 300), rep("Progenitor", 300), rep("Differentiated", 300))
df_umap <- data.frame(UMAP1 = x, UMAP2 = y, CellType = clusters)
ggplot(df_umap, aes(UMAP1, UMAP2, color = CellType)) +
geom_point(size = 0.8, alpha = 0.6) +
scale_color_manual(values = c("#9b59b6", "#3498db", "#e74c3c")) +
theme_void() +
theme(legend.position = "right") +
labs(title = "Single-Cell UMAP Projection", subtitle = "Developmental Trajectory Simulation")
Dotplot for GSEA results.
df_path <- data.frame(
Pathway = c("Cell Cycle", "DNA Replication", "P53 Signaling", "Apoptosis", "MAPK Signaling", "Ribosome"),
Count = c(45, 30, 25, 20, 15, 10),
GeneRatio = c(0.15, 0.12, 0.09, 0.08, 0.05, 0.03),
p.adjust = c(1e-9, 1e-6, 0.001, 0.01, 0.03, 0.045)
)
df_path$Pathway <- factor(df_path$Pathway, levels = df_path$Pathway[order(df_path$GeneRatio)])
ggplot(df_path, aes(x = GeneRatio, y = Pathway)) +
geom_point(aes(size = Count, color = p.adjust)) +
scale_color_gradient(low = "red", high = "blue") +
theme_light() +
labs(title = "KEGG Pathway Enrichment", x = "Gene Ratio", y = "")
Visualizing co-expression modules.
# Create structured matrix
mat <- matrix(rnorm(400), nrow = 20, ncol = 20)
# Add "modules" of correlation
mat[1:10, 1:10] <- mat[1:10, 1:10] + 2
mat[11:20, 11:20] <- mat[11:20, 11:20] - 2
df_h <- as.data.frame(mat)
colnames(df_h) <- paste0("S", 1:20)
df_h$Gene <- paste0("G", 1:20)
df_h_long <- pivot_longer(df_h, cols = -Gene, names_to = "Sample", values_to = "Exp")
ggplot(df_h_long, aes(Sample, Gene, fill = Exp)) +
geom_tile() +
scale_fill_gradient2(low = "#2c7bb6", mid = "#ffffbf", high = "#d7191c") +
theme_minimal() +
theme(axis.text.x = element_blank()) +
labs(title = "Gene Expression Heatmap", x = "Samples (n=20)", y = "Genes (n=20)")
Differentiation analysis (Monocle style).
set.seed(101)
t <- seq(0, 10, length.out=400)
branch1 <- data.frame(Time=t, Expr=sin(t) + rnorm(400, 0, 0.2), Branch="Lineage A")
branch2 <- data.frame(Time=t, Expr=sin(t) + 0.5*t + rnorm(400, 0, 0.2), Branch="Lineage B")
pseudo <- rbind(branch1, branch2)
ggplot(pseudo, aes(Time, Expr, color=Branch)) +
geom_point(alpha=0.4) + geom_smooth(se=FALSE, size=1.5) +
theme_classic() + scale_color_manual(values=c("#8e44ad", "#27ae60")) +
labs(title="Pseudotime Trajectory", subtitle="Gene Expression during Differentiation", x="Pseudotime", y="Expression Level")
Expression intensity and percentage (Seurat style).
genes <- c("CD3D", "CD19", "CD14", "MS4A1", "GNLY", "NKG7")
clusters <- c("T-Cells", "B-Cells", "Monocytes", "NK-Cells")
df_dot <- expand.grid(Gene=genes, Cluster=clusters)
set.seed(5)
df_dot$AvgExp <- runif(nrow(df_dot), 0, 3)
df_dot$Pct <- runif(nrow(df_dot), 0, 100)
df_dot$AvgExp[df_dot$Gene=="CD3D" & df_dot$Cluster=="T-Cells"] <- 2.8
df_dot$Pct[df_dot$Gene=="CD3D" & df_dot$Cluster=="T-Cells"] <- 90
ggplot(df_dot, aes(Gene, Cluster)) +
geom_point(aes(size=Pct, color=AvgExp)) +
scale_color_gradient(low="lightgrey", high="blue") +
theme_bw() + labs(title="Marker Gene Expression (DotPlot)", size="% Expressed", color="Avg Exp")
Focus: Mass Spec Intensity, Motif Analysis, Chromatin Accessibility, and Methylation.
Rank-Abundance plot typical for Mass Spectrometry data.
n_prot <- 2000
# Simulate log-normal intensities typical for MS
intensities <- rlnorm(n_prot, meanlog = 10, sdlog = 2)
intensities <- sort(intensities, decreasing = TRUE)
df_range <- data.frame(
Rank = 1:n_prot,
Intensity = intensities,
Label = NA
)
# Highlight high abundance (e.g. Albumin) and low abundance (Biomarkers)
df_range$Label[c(1, 50, 1950, 2000)] <- c("Albumin", "ApoA1", "IL-6", "TP53")
ggplot(df_range, aes(x = Rank, y = Intensity)) +
geom_line(color = "grey", size = 0.5) +
geom_point(alpha = 0.6, color = "#34495e") +
scale_y_log10() +
geom_text(aes(label = Label), vjust = -0.5, hjust = -0.1, color = "#e74c3c", fontface="bold") +
theme_bw() +
labs(title = "Proteome Dynamic Range",
subtitle = "Mass Spec Intensity Distribution (Rank-Abundance)",
x = "Protein Rank", y = "Intensity (Log10 LFQ)")
Differential Protein Abundance.
df_p <- data.frame(FC=rnorm(1000), P=runif(1000))
df_p$P[1:5] <- c(1e-6, 1e-5, 1e-5, 1e-4, 1e-4); df_p$FC[1:5] <- c(3, -3, 2.5, -2.5, 4)
df_p$Label <- NA; df_p$Label[1:5] <- c("TP53", "KRAS", "MYC", "PTEN", "EGFR")
ggplot(df_p, aes(FC, -log10(P))) +
geom_point(aes(color=P<0.01), alpha=0.5) +
scale_color_manual(values=c("grey", "purple")) +
geom_text(aes(label=Label), vjust=-1, size=3) +
theme_minimal() + theme(legend.position="none") + labs(title="Proteomics Differential Abundance")
Transcription Factor Binding Motif (CTCF-like).
# NOTE: Requires 'ggseqlogo' package
if(requireNamespace("ggseqlogo", quietly=TRUE)){
library(ggseqlogo)
# Simulated Position Weight Matrix (PWM) for a CTCF-like motif
# Rows: A, C, G, T; Cols: Positions
pwm_matrix <- matrix(c(
0.1, 0.9, 0.0, 0.0, # Pos 1: C dominant
0.0, 0.1, 0.0, 0.9, # Pos 2: T dominant
0.8, 0.1, 0.1, 0.0, # Pos 3: A dominant
0.9, 0.0, 0.1, 0.0, # Pos 4: A dominant
0.0, 0.0, 1.0, 0.0, # Pos 5: G conserved
0.1, 0.8, 0.1, 0.0, # Pos 6: C dominant
0.0, 0.0, 0.9, 0.1, # Pos 7: G dominant
0.0, 0.1, 0.0, 0.9 # Pos 8: T dominant
), nrow=4, dimnames=list(c("A","C","G","T")))
ggseqlogo(pwm_matrix) +
theme_minimal() +
labs(title="Transcription Factor Motif (SeqLogo)", subtitle="Enriched in ATAC-seq peaks (CTCF-like)")
} else {
print("Please install 'ggseqlogo' to view this plot.")
}
Volcano plot for CpG sites.
# Simulate Delta Beta values (-1 to 1)
delta_beta <- rnorm(5000, 0, 0.1)
# Add some hyper/hypo methylated sites
delta_beta[1:50] <- runif(50, 0.2, 0.6)
delta_beta[51:100] <- runif(50, -0.6, -0.2)
pvals <- runif(5000)
pvals[1:100] <- 10^-runif(100, 3, 10) # Significant
df_meth <- data.frame(DeltaBeta=delta_beta, P=pvals)
df_meth$Sig <- ifelse(df_meth$P < 0.01 & abs(df_meth$DeltaBeta) > 0.2, "DMR", "NS")
ggplot(df_meth, aes(DeltaBeta, -log10(P), color=Sig)) +
geom_point(alpha=0.5, size=1.2) +
scale_color_manual(values=c("orange", "grey")) +
geom_vline(xintercept=c(-0.2, 0.2), linetype="dashed") +
theme_classic() +
labs(title="Differential Methylation Analysis", x="Delta Beta (Methylation Difference)", y="-Log10 P-value")
Chromatin accessibility around Transcription Start Sites.
x <- seq(-2000, 2000, 20)
# Gaussian peak for accessible chromatin
y <- 60 * exp(-x^2 / (2*300^2)) + rnorm(length(x), 5, 2)
ggplot(data.frame(x,y), aes(x,y)) +
geom_area(fill="#27ae60", alpha=0.5) +
geom_line(color="#2c3e50") +
theme_classic() +
labs(title="TSS Enrichment Profile (ATAC-seq)", x="Distance from TSS (bp)", y="Normalized Signal")
ChromHMM style heatmap.
states <- c("Promoter", "Enhancer", "Transcribed", "Repressed", "Heterochrom")
samples <- paste0("Sample_", 1:10)
df_chrom <- expand.grid(State=states, Sample=samples)
df_chrom$Enrichment <- runif(50, 0, 10)
df_chrom$Enrichment[df_chrom$State=="Promoter"] <- df_chrom$Enrichment[df_chrom$State=="Promoter"] + 5
ggplot(df_chrom, aes(Sample, State, fill=Enrichment)) +
geom_tile() +
scale_fill_viridis_c(option="magma") +
theme_minimal() +
labs(title="Chromatin State Enrichment (ChromHMM)", x="", y="")
Focus: Drug Discovery, CRISPR Screens, and Machine Learning.
Genome-wide ranking of gene essentiality (MAGeCK style).
n_genes <- 2000
scores <- c(rnorm(1800, mean = 0, sd = 0.3),
rnorm(150, mean = -1.5, sd = 0.6),
rnorm(50, mean = 1.2, sd = 0.5))
df_rank <- data.frame(Score = scores)
df_rank <- df_rank[order(df_rank$Score), , drop = FALSE]
df_rank$Rank <- 1:nrow(df_rank)
df_rank$Type <- "Neutral"
df_rank$Type[df_rank$Score < -1] <- "Essential"
df_rank$Type[df_rank$Score > 1] <- "Resistance"
df_rank$Label <- NA
df_rank$Label[1:5] <- c("RPA1", "PCNA", "POLR2A", "MYC", "MCM2")
n <- nrow(df_rank)
df_rank$Label[(n-4):n] <- c("PTEN", "NF1", "KEAP1", "TP53", "RB1")
ggplot(df_rank, aes(x = Rank, y = Score, color = Type)) +
geom_point(size = 1.5, alpha = 0.7) +
scale_color_manual(values = c("firebrick", "grey85", "dodgerblue")) +
geom_text(aes(label = Label), vjust = ifelse(df_rank$Score > 0, -1, 1.5),
size = 3, fontface = "bold", color = "black", check_overlap = TRUE) +
geom_hline(yintercept = c(-1, 1), linetype = "dashed", color = "grey50") +
theme_classic() +
labs(title = "CRISPR Screen Gene Ranking",
subtitle = "Genome-wide Essentiality (Negative Selection) vs Resistance (Positive Selection)",
x = "Gene Rank",
y = "Beta Score (Log2FC)",
color = "Gene Category") +
theme(legend.position = "top")
### Feature Importance (RF)
Biomarker discovery.
imp <- data.frame(Feat=paste0("M_", 1:10), Val=sort(rnorm(10, 10, 2)))
ggplot(imp, aes(reorder(Feat, Val), Val)) + geom_col(fill="steelblue") + coord_flip() +
theme_minimal() + labs(title="Feature Importance (Random Forest)", x="", y="Mean Decrease Gini")
Pharmacodynamics using 4-Parameter Logistic Model.
# Define 4-Parameter Logistic Function
f_4pl <- function(x, b, c, d, e) { c + (d - c) / (1 + exp(b * (log(x) - log(e)))) }
concs <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000)
# Parameters: b=Slope, c=Bottom, d=Top, e=IC50
true_y <- f_4pl(concs, b = -1.5, c = 5, d = 100, e = 50)
# Add biological replicates and noise
df_dr <- data.frame(
Conc = rep(concs, each=3)
)
df_dr$Resp <- rep(true_y, each=3) + rnorm(nrow(df_dr), 0, 5) # Noise
ggplot(df_dr, aes(x=Conc, y=Resp)) +
geom_point(size=2.5, alpha=0.6, color="#8e44ad") +
geom_function(fun = function(x) f_4pl(x, -1.5, 5, 100, 50), color="#2c3e50", size=1) +
scale_x_log10() +
theme_bw() +
labs(title="Drug Dose-Response (Sigmoidal)",
subtitle="IC50 = 50 nM (4-Parameter Logistic Fit)",
x="Concentration (nM) [Log]", y="Cell Viability (%)") +
geom_vline(xintercept=50, linetype="dashed", color="red")
Model performance metrics.
# Simulate scores for Positive and Negative classes
neg_scores <- rnorm(500, mean=0.3, sd=0.15)
pos_scores <- rnorm(500, mean=0.7, sd=0.15) # Better separation
# Calculate ROC points manually
thresholds <- seq(0, 1, 0.01)
tpr <- sapply(thresholds, function(t) mean(pos_scores > t))
fpr <- sapply(thresholds, function(t) mean(neg_scores > t))
df_roc <- data.frame(FPR=fpr, TPR=tpr)
ggplot(df_roc, aes(x=FPR, y=TPR)) +
geom_path(color="#27ae60", size=1.5) +
geom_abline(linetype="dashed", color="grey") +
theme_light() +
annotate("text", x=0.75, y=0.25, label="AUC = 0.96", size=6, color="#27ae60") +
labs(title="ROC Analysis: SNP Classifier", x="1 - Specificity (FPR)", y="Sensitivity (TPR)")
Focus: Survival, Longitudinal Models, Meta-Analysis, and Correlations.
Odds Ratios visualization.
df_f <- data.frame(
Study=c("Study A", "Study B", "Study C", "Pooled"),
OR=c(1.2, 0.8, 1.5, 1.1),
Low=c(0.9, 0.5, 1.1, 0.95),
High=c(1.5, 1.2, 2.0, 1.3)
)
ggplot(df_f, aes(y=Study, x=OR, xmin=Low, xmax=High)) +
geom_point(size=3, shape=15) + geom_errorbarh(height=0.2) +
geom_vline(xintercept=1, linetype="dashed", color="red") +
theme_bw() + labs(title="Forest Plot (Meta-Analysis)", x="Odds Ratio (95% CI)", y="")
Time-to-event analysis with realistic censoring.
n <- 100
# Generate Weibull distributed times (more realistic for biological failure)
T_treat <- rweibull(n, shape = 1.5, scale = 100)
T_placebo <- rweibull(n, shape = 1.5, scale = 60) # Placebo dies faster
# Combine
df_surv <- data.frame(
Time = c(T_treat, T_placebo),
Group = rep(c("Treatment", "Placebo"), each = n)
)
# Add random censoring (patients leaving study)
cens_time <- runif(2*n, 0, 150)
df_surv$Status <- ifelse(df_surv$Time < cens_time, 1, 0) # 1=Event, 0=Censored
df_surv$TimeObs <- pmin(df_surv$Time, cens_time)
fit <- survfit(Surv(TimeObs, Status) ~ Group, data = df_surv)
# Plot
plot(fit, col=c("red", "blue"), lwd=3, xlab="Time (Months)", ylab="Survival Probability",
main="Kaplan-Meier Analysis (Simulated Clinical Trial)", frame.plot=FALSE)
legend("topright", levels(factor(df_surv$Group)), col=c("red", "blue"), lwd=3, bty="n")
grid(col="grey90")
Longitudinal data with realistic subject variance.
subjects <- 20
times <- 0:5
# Random intercepts and slopes
intercepts <- rnorm(subjects, 10, 2)
slopes <- rnorm(subjects, 0.5, 0.2)
df_lmm <- data.frame()
for(i in 1:subjects){
# Add group effect: Group B increases faster
grp <- ifelse(i > 10, "B", "A")
slope_eff <- slopes[i] + ifelse(grp=="B", 1.5, 0)
y <- intercepts[i] + slope_eff * times + rnorm(length(times), 0, 1)
df_lmm <- rbind(df_lmm, data.frame(ID=paste0("S",i), Time=times, Value=y, Group=grp))
}
ggplot(df_lmm, aes(x=Time, y=Value, group=ID, color=Group)) +
geom_line(alpha=0.4) +
stat_summary(aes(group=Group), fun=mean, geom="line", size=2) +
scale_color_manual(values=c("#95a5a6", "#e74c3c")) +
theme_bw() +
labs(title="Longitudinal Response (LMM)", subtitle="Group A vs B treatment trajectories")
Clinical variable relationships.
# Simulate correlated multivariate data
sigma <- matrix(c(1, 0.8, -0.5, 0.8, 1, -0.3, -0.5, -0.3, 1), 3, 3)
data_corr <- mvrnorm(n = 50, mu = c(0,0,0), Sigma = sigma)
colnames(data_corr) <- c("BMI", "Insulin", "Activity")
cormat <- round(cor(data_corr), 2)
melted <- as.data.frame(as.table(cormat))
ggplot(melted, aes(Var1, Var2, fill=Freq)) +
geom_tile(color="white") +
geom_text(aes(label=Freq)) +
scale_fill_gradient2(low="#2980b9", mid="white", high="#c0392b") +
theme_minimal() + labs(title="Clinical Correlations", x="", y="")
Focus: Community Diversity and Composition.
Comparing species richness (Non-normal distributions).
# Generate Gamma distributed data (common for counts/diversity)
grp_a <- rgamma(40, shape=20, scale=0.2)
grp_b <- rgamma(40, shape=15, scale=0.2)
df_alpha <- data.frame(Index = c(grp_a, grp_b), Group = rep(c("Healthy", "Dysbiosis"), each=40))
ggplot(df_alpha, aes(x=Group, y=Index, fill=Group)) +
geom_violin(alpha=0.3, trim=FALSE) +
geom_boxplot(width=0.2, alpha=0.8) +
scale_fill_manual(values=c("#e74c3c", "#2ecc71")) +
theme_classic() +
labs(title="Microbiome Alpha Diversity (Shannon)", y="Diversity Index")
Visualizing community separation.
# Create two multivariate normal distributions
g1 <- mvrnorm(25, mu=c(2, 2), Sigma=matrix(c(1,0.5,0.5,1),2))
g2 <- mvrnorm(25, mu=c(-1, -1), Sigma=matrix(c(1,-0.2,-0.2,1),2))
df_beta <- rbind(data.frame(g1, Grp="Ctrl"), data.frame(g2, Grp="Treat"))
colnames(df_beta)[1:2] <- c("PCoA1", "PCoA2")
ggplot(df_beta, aes(PCoA1, PCoA2, color=Grp)) +
geom_point(size=3) +
stat_ellipse() +
theme_minimal() +
labs(title="Beta Diversity (Bray-Curtis PCoA)", subtitle="Clustering of microbial communities")