12 Publications
100+ Citations
5 h-index
I am a Senior Bioinformatician, Computational Biologist, and R/Shiny developer with over 7 years of experience bridging the gap between computational biology and pharmaceutical R&D. Currently at 7N, I develop scalable R/Shiny applications and analyze complex NGS and MS data for a Big Pharma client.
I hold a Ph.D. in Biological Sciences and am currently pursuing an MBA in Healthcare Management (with elements of Clinical Research), combining deep technical expertise with strategic business insight.
Scrum Product Ownership Jira Trello
Click on a publication card below to reveal the specific Bioinformatics & Statistics methods used.
Below is an interactive gallery of 15 analytical modules demonstrating my coding expertise using realistic, simulated biological datasets. Click the tabs to explore.
Focus: Population structure, GWAS, Variant Quality Control, and Evolution.
Population structure visualization simulating realistic genotype noise.
# Parameters
n_samples <- 300
n_snps <- 1000
# Generate base matrix (noise)
G <- matrix(rnorm(n_samples * n_snps), nrow = n_samples)
# Define 3 populations with subtle shifts in allele frequencies
# Pop 1 (Rows 1-100): Shift in first 100 SNPs
G[1:100, 1:100] <- G[1:100, 1:100] + 0.5
# Pop 2 (Rows 101-200): Shift in SNPs 50-150
G[101:200, 50:150] <- G[101:200, 50:150] - 0.6
# Pop 3 (Rows 201-300): Shift in SNPs 800-900
G[201:300, 800:900] <- G[201:300, 800:900] + 0.7
groups <- c(rep("Holstein", 100), rep("Jersey", 100), rep("Angus", 100))
# Perform PCA
pca_res <- prcomp(G, scale. = TRUE)
var_explained <- round(summary(pca_res)$importance[2, 1:2] * 100, 1)
df_pca <- data.frame(PC1 = pca_res$x[,1], PC2 = pca_res$x[,2], Breed = groups)
ggplot(df_pca, aes(x = PC1, y = PC2, fill = Breed)) +
geom_point(size = 3, shape = 21, color = "white", alpha = 0.7) +
stat_ellipse(aes(color = Breed), geom = "polygon", alpha = 0.1, level = 0.95, show.legend = FALSE) +
scale_fill_manual(values = c("#E69F00", "#56B4E9", "#009E73")) +
scale_color_manual(values = c("#E69F00", "#56B4E9", "#009E73")) +
theme_minimal() +
labs(title = "Population Structure (PCA)",
subtitle = "Simulated genotype matrix (300 samples x 1k SNPs)",
x = paste0("PC1 (", var_explained[1], "% Var)"),
y = paste0("PC2 (", var_explained[2], "% Var)"))
Genome-Wide Association Study with LD-like signal peaks.
n_snps <- 5000
gwas_df <- data.frame(
SNP = paste0("rs", 1:n_snps),
Chr = rep(1:5, each = 1000),
Pos = rep(1:1000, 5),
# Background noise (uniform distribution of p-values)
PVal = runif(n_snps)
)
# Simulate Linkage Disequilibrium (LD) Peak on Chr 2
center <- 1500
width <- 50
peak_signal <- 10^-seq(1, 8, length.out = width)
gwas_df$PVal[(center-width/2):(center+width/2-1)] <- peak_signal * runif(width, 0.1, 1)
gwas_df <- gwas_df %>% mutate(logP = -log10(PVal))
ggplot(gwas_df, aes(x = Pos, y = logP, color = as.factor(Chr))) +
geom_point(alpha = 0.6, size = 1) +
scale_color_brewer(palette = "Set1") +
geom_hline(yintercept = -log10(5e-8), linetype = "dashed", color = "red", alpha = 0.5) +
theme_minimal() +
facet_grid(. ~ Chr, scales = "free_x", space = "free_x", switch = "x") +
theme(axis.text.x = element_blank(), panel.grid.major.x = element_blank(), legend.position = "none") +
labs(title = "Manhattan Plot", y = "-Log10 P-Value", x = "Chromosome")
Quality Control: Realistic skewed distribution of Read Depth.
# Simulate sequencing depth (Negative Binomial distribution is common for read counts)
dp_values <- rnbinom(5000, size = 5, mu = 40)
df_qc <- data.frame(Depth = dp_values)
ggplot(df_qc, aes(x = Depth)) +
geom_histogram(aes(y = ..density..), binwidth = 2, fill = "#34495e", color = "white", alpha = 0.8) +
geom_density(color = "#e74c3c", size = 1, adjust = 2) +
scale_x_continuous(limits = c(0, 150)) +
theme_light() +
labs(title = "Variant Quality Control (DP)",
subtitle = "Distribution of sequencing depth across variants",
x = "Read Depth (DP)", y = "Density") +
geom_vline(xintercept = 40, linetype="dashed", color="orange") +
annotate("text", x = 60, y = 0.02, label = "Mean Depth ~ 40x", color = "orange")
Hierarchical clustering of genetic distances.
# Simulate distance matrix with some structure
m <- matrix(rnorm(100), nrow = 10)
rownames(m) <- paste0("Species_", LETTERS[1:10])
dist_mat <- dist(m)
hc <- hclust(dist_mat, method = "ward.D2")
par(mar=c(2,2,2,2))
plot(hc, hang = -1, main = "Phylogenetic Tree Reconstruction",
sub = "Ward's method clustering", xlab = "",
col = "#2980b9", lwd = 2)
rect.hclust(hc, k = 3, border = "red") # Highlight 3 clades
Focus: Expression profiling, Pathways, and Single-Cell.
Differential Expression: Realistic relationship between Fold Change and P-value.
n_genes <- 3000
# Generate LogFC: mostly close to 0, heavy tails
logfc <- c(rnorm(2500, 0, 0.5), rnorm(250, 2, 1), rnorm(250, -2, 1))
# Generate P-values dependent on LogFC (stronger effect -> smaller p-value)
noise <- rnorm(n_genes, 0, 2)
log_pval <- -(abs(logfc) * 3) + noise
pvals <- 10^log_pval
pvals[pvals > 1] <- 1
df_vol <- data.frame(
Gene = paste0("G", 1:n_genes),
log2FoldChange = logfc,
padj = p.adjust(pvals, method = "BH")
) %>%
mutate(
Group = case_when(
log2FoldChange > 1 & padj < 0.05 ~ "Up-regulated",
log2FoldChange < -1 & padj < 0.05 ~ "Down-regulated",
TRUE ~ "NS"
)
)
ggplot(df_vol, aes(x = log2FoldChange, y = -log10(padj), color = Group)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_manual(values = c("steelblue", "grey85", "firebrick")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", alpha = 0.5) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", alpha = 0.5) +
theme_minimal() +
labs(title = "Differential Expression (Volcano Plot)",
subtitle = "Simulated DESeq2 output with FDR correction",
x = "Log2 Fold Change", y = "-Log10 Adjusted P-value")
Simulating cellular trajectories/clusters.
# Create a "trajectory" shape rather than just blobs
t <- seq(0, 2*pi, length.out = 300)
x <- c(cos(t), cos(t) + 2.5, cos(t) + 1.2) + rnorm(900, 0, 0.2)
y <- c(sin(t), sin(t) + 1, sin(t) - 2) + rnorm(900, 0, 0.2)
clusters <- c(rep("Stem", 300), rep("Progenitor", 300), rep("Differentiated", 300))
df_umap <- data.frame(UMAP1 = x, UMAP2 = y, CellType = clusters)
ggplot(df_umap, aes(UMAP1, UMAP2, color = CellType)) +
geom_point(size = 0.8, alpha = 0.6) +
scale_color_manual(values = c("#9b59b6", "#3498db", "#e74c3c")) +
theme_void() +
theme(legend.position = "right") +
labs(title = "Single-Cell UMAP Projection", subtitle = "Developmental Trajectory Simulation")
Dotplot for GSEA results.
df_path <- data.frame(
Pathway = c("Cell Cycle", "DNA Replication", "P53 Signaling", "Apoptosis", "MAPK Signaling", "Ribosome"),
Count = c(45, 30, 25, 20, 15, 10),
GeneRatio = c(0.15, 0.12, 0.09, 0.08, 0.05, 0.03),
p.adjust = c(1e-9, 1e-6, 0.001, 0.01, 0.03, 0.045)
)
df_path$Pathway <- factor(df_path$Pathway, levels = df_path$Pathway[order(df_path$GeneRatio)])
ggplot(df_path, aes(x = GeneRatio, y = Pathway)) +
geom_point(aes(size = Count, color = p.adjust)) +
scale_color_gradient(low = "red", high = "blue") +
theme_light() +
labs(title = "KEGG Pathway Enrichment", x = "Gene Ratio", y = "")
Visualizing co-expression modules.
# Create structured matrix
mat <- matrix(rnorm(400), nrow = 20, ncol = 20)
# Add "modules" of correlation
mat[1:10, 1:10] <- mat[1:10, 1:10] + 2
mat[11:20, 11:20] <- mat[11:20, 11:20] - 2
df_h <- as.data.frame(mat)
colnames(df_h) <- paste0("S", 1:20)
df_h$Gene <- paste0("G", 1:20)
df_h_long <- pivot_longer(df_h, cols = -Gene, names_to = "Sample", values_to = "Exp")
ggplot(df_h_long, aes(Sample, Gene, fill = Exp)) +
geom_tile() +
scale_fill_gradient2(low = "#2c7bb6", mid = "#ffffbf", high = "#d7191c") +
theme_minimal() +
theme(axis.text.x = element_blank()) +
labs(title = "Gene Expression Heatmap", x = "Samples (n=20)", y = "Genes (n=20)")
Differentiation analysis (Monocle style).
set.seed(101)
t <- seq(0, 10, length.out=400)
branch1 <- data.frame(Time=t, Expr=sin(t) + rnorm(400, 0, 0.2), Branch="Lineage A")
branch2 <- data.frame(Time=t, Expr=sin(t) + 0.5*t + rnorm(400, 0, 0.2), Branch="Lineage B")
pseudo <- rbind(branch1, branch2)
ggplot(pseudo, aes(Time, Expr, color=Branch)) +
geom_point(alpha=0.4) + geom_smooth(se=FALSE, size=1.5) +
theme_classic() + scale_color_manual(values=c("#8e44ad", "#27ae60")) +
labs(title="Pseudotime Trajectory", subtitle="Gene Expression during Differentiation", x="Pseudotime", y="Expression Level")
Expression intensity and percentage (Seurat style).
genes <- c("CD3D", "CD19", "CD14", "MS4A1", "GNLY", "NKG7")
clusters <- c("T-Cells", "B-Cells", "Monocytes", "NK-Cells")
df_dot <- expand.grid(Gene=genes, Cluster=clusters)
set.seed(5)
df_dot$AvgExp <- runif(nrow(df_dot), 0, 3)
df_dot$Pct <- runif(nrow(df_dot), 0, 100)
df_dot$AvgExp[df_dot$Gene=="CD3D" & df_dot$Cluster=="T-Cells"] <- 2.8
df_dot$Pct[df_dot$Gene=="CD3D" & df_dot$Cluster=="T-Cells"] <- 90
ggplot(df_dot, aes(Gene, Cluster)) +
geom_point(aes(size=Pct, color=AvgExp)) +
scale_color_gradient(low="lightgrey", high="blue") +
theme_bw() + labs(title="Marker Gene Expression (DotPlot)", size="% Expressed", color="Avg Exp")
Focus: Mass Spec Intensity, Motif Analysis, Chromatin Accessibility, and Methylation.
Rank-Abundance plot typical for Mass Spectrometry data.
n_prot <- 2000
# Simulate log-normal intensities typical for MS
intensities <- rlnorm(n_prot, meanlog = 10, sdlog = 2)
intensities <- sort(intensities, decreasing = TRUE)
df_range <- data.frame(
Rank = 1:n_prot,
Intensity = intensities,
Label = NA
)
# Highlight high abundance (e.g. Albumin) and low abundance (Biomarkers)
df_range$Label[c(1, 50, 1950, 2000)] <- c("Albumin", "ApoA1", "IL-6", "TP53")
ggplot(df_range, aes(x = Rank, y = Intensity)) +
geom_line(color = "grey", size = 0.5) +
geom_point(alpha = 0.6, color = "#34495e") +
scale_y_log10() +
geom_text(aes(label = Label), vjust = -0.5, hjust = -0.1, color = "#e74c3c", fontface="bold") +
theme_bw() +
labs(title = "Proteome Dynamic Range",
subtitle = "Mass Spec Intensity Distribution (Rank-Abundance)",
x = "Protein Rank", y = "Intensity (Log10 LFQ)")
Differential Protein Abundance.
df_p <- data.frame(FC=rnorm(1000), P=runif(1000))
df_p$P[1:5] <- c(1e-6, 1e-5, 1e-5, 1e-4, 1e-4); df_p$FC[1:5] <- c(3, -3, 2.5, -2.5, 4)
df_p$Label <- NA; df_p$Label[1:5] <- c("TP53", "KRAS", "MYC", "PTEN", "EGFR")
ggplot(df_p, aes(FC, -log10(P))) +
geom_point(aes(color=P<0.01), alpha=0.5) +
scale_color_manual(values=c("grey", "purple")) +
geom_text(aes(label=Label), vjust=-1, size=3) +
theme_minimal() + theme(legend.position="none") + labs(title="Proteomics Differential Abundance")
Transcription Factor Binding Motif (CTCF-like).
# NOTE: Requires 'ggseqlogo' package
if(requireNamespace("ggseqlogo", quietly=TRUE)){
library(ggseqlogo)
# Simulated Position Weight Matrix (PWM) for a CTCF-like motif
# Rows: A, C, G, T; Cols: Positions
pwm_matrix <- matrix(c(
0.1, 0.9, 0.0, 0.0, # Pos 1: C dominant
0.0, 0.1, 0.0, 0.9, # Pos 2: T dominant
0.8, 0.1, 0.1, 0.0, # Pos 3: A dominant
0.9, 0.0, 0.1, 0.0, # Pos 4: A dominant
0.0, 0.0, 1.0, 0.0, # Pos 5: G conserved
0.1, 0.8, 0.1, 0.0, # Pos 6: C dominant
0.0, 0.0, 0.9, 0.1, # Pos 7: G dominant
0.0, 0.1, 0.0, 0.9 # Pos 8: T dominant
), nrow=4, dimnames=list(c("A","C","G","T")))
ggseqlogo(pwm_matrix) +
theme_minimal() +
labs(title="Transcription Factor Motif (SeqLogo)", subtitle="Enriched in ATAC-seq peaks (CTCF-like)")
} else {
print("Please install 'ggseqlogo' to view this plot.")
}
Volcano plot for CpG sites.
# Simulate Delta Beta values (-1 to 1)
delta_beta <- rnorm(5000, 0, 0.1)
# Add some hyper/hypo methylated sites
delta_beta[1:50] <- runif(50, 0.2, 0.6)
delta_beta[51:100] <- runif(50, -0.6, -0.2)
pvals <- runif(5000)
pvals[1:100] <- 10^-runif(100, 3, 10) # Significant
df_meth <- data.frame(DeltaBeta=delta_beta, P=pvals)
df_meth$Sig <- ifelse(df_meth$P < 0.01 & abs(df_meth$DeltaBeta) > 0.2, "DMR", "NS")
ggplot(df_meth, aes(DeltaBeta, -log10(P), color=Sig)) +
geom_point(alpha=0.5, size=1.2) +
scale_color_manual(values=c("orange", "grey")) +
geom_vline(xintercept=c(-0.2, 0.2), linetype="dashed") +
theme_classic() +
labs(title="Differential Methylation Analysis", x="Delta Beta (Methylation Difference)", y="-Log10 P-value")
Chromatin accessibility around Transcription Start Sites.
x <- seq(-2000, 2000, 20)
# Gaussian peak for accessible chromatin
y <- 60 * exp(-x^2 / (2*300^2)) + rnorm(length(x), 5, 2)
ggplot(data.frame(x,y), aes(x,y)) +
geom_area(fill="#27ae60", alpha=0.5) +
geom_line(color="#2c3e50") +
theme_classic() +
labs(title="TSS Enrichment Profile (ATAC-seq)", x="Distance from TSS (bp)", y="Normalized Signal")
ChromHMM style heatmap.
states <- c("Promoter", "Enhancer", "Transcribed", "Repressed", "Heterochrom")
samples <- paste0("Sample_", 1:10)
df_chrom <- expand.grid(State=states, Sample=samples)
df_chrom$Enrichment <- runif(50, 0, 10)
df_chrom$Enrichment[df_chrom$State=="Promoter"] <- df_chrom$Enrichment[df_chrom$State=="Promoter"] + 5
ggplot(df_chrom, aes(Sample, State, fill=Enrichment)) +
geom_tile() +
scale_fill_viridis_c(option="magma") +
theme_minimal() +
labs(title="Chromatin State Enrichment (ChromHMM)", x="", y="")
Focus: Drug Discovery, CRISPR Screens, and Machine Learning.
Genome-wide ranking of gene essentiality (MAGeCK style).
n_genes <- 2000
scores <- c(rnorm(1800, mean = 0, sd = 0.3),
rnorm(150, mean = -1.5, sd = 0.6),
rnorm(50, mean = 1.2, sd = 0.5))
df_rank <- data.frame(Score = scores)
df_rank <- df_rank[order(df_rank$Score), , drop = FALSE]
df_rank$Rank <- 1:nrow(df_rank)
df_rank$Type <- "Neutral"
df_rank$Type[df_rank$Score < -1] <- "Essential"
df_rank$Type[df_rank$Score > 1] <- "Resistance"
df_rank$Label <- NA
df_rank$Label[1:5] <- c("RPA1", "PCNA", "POLR2A", "MYC", "MCM2")
n <- nrow(df_rank)
df_rank$Label[(n-4):n] <- c("PTEN", "NF1", "KEAP1", "TP53", "RB1")
ggplot(df_rank, aes(x = Rank, y = Score, color = Type)) +
geom_point(size = 1.5, alpha = 0.7) +
scale_color_manual(values = c("firebrick", "grey85", "dodgerblue")) +
geom_text(aes(label = Label), vjust = ifelse(df_rank$Score > 0, -1, 1.5),
size = 3, fontface = "bold", color = "black", check_overlap = TRUE) +
geom_hline(yintercept = c(-1, 1), linetype = "dashed", color = "grey50") +
theme_classic() +
labs(title = "CRISPR Screen Gene Ranking",
subtitle = "Genome-wide Essentiality (Negative Selection) vs Resistance (Positive Selection)",
x = "Gene Rank",
y = "Beta Score (Log2FC)",
color = "Gene Category") +
theme(legend.position = "top")
Biomarker discovery.
imp <- data.frame(Feat=paste0("M_", 1:10), Val=sort(rnorm(10, 10, 2)))
ggplot(imp, aes(reorder(Feat, Val), Val)) + geom_col(fill="steelblue") + coord_flip() +
theme_minimal() + labs(title="Feature Importance (Random Forest)", x="", y="Mean Decrease Gini")
Pharmacodynamics using 4-Parameter Logistic Model.
# Define 4-Parameter Logistic Function
f_4pl <- function(x, b, c, d, e) { c + (d - c) / (1 + exp(b * (log(x) - log(e)))) }
concs <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000)
# Parameters: b=Slope, c=Bottom, d=Top, e=IC50
true_y <- f_4pl(concs, b = -1.5, c = 5, d = 100, e = 50)
# Add biological replicates and noise
df_dr <- data.frame(
Conc = rep(concs, each=3)
)
df_dr$Resp <- rep(true_y, each=3) + rnorm(nrow(df_dr), 0, 5) # Noise
ggplot(df_dr, aes(x=Conc, y=Resp)) +
geom_point(size=2.5, alpha=0.6, color="#8e44ad") +
geom_function(fun = function(x) f_4pl(x, -1.5, 5, 100, 50), color="#2c3e50", size=1) +
scale_x_log10() +
theme_bw() +
labs(title="Drug Dose-Response (Sigmoidal)",
subtitle="IC50 = 50 nM (4-Parameter Logistic Fit)",
x="Concentration (nM) [Log]", y="Cell Viability (%)") +
geom_vline(xintercept=50, linetype="dashed", color="red")
Model performance metrics.
# Simulate scores for Positive and Negative classes
neg_scores <- rnorm(500, mean=0.3, sd=0.15)
pos_scores <- rnorm(500, mean=0.7, sd=0.15) # Better separation
# Calculate ROC points manually
thresholds <- seq(0, 1, 0.01)
tpr <- sapply(thresholds, function(t) mean(pos_scores > t))
fpr <- sapply(thresholds, function(t) mean(neg_scores > t))
df_roc <- data.frame(FPR=fpr, TPR=tpr)
ggplot(df_roc, aes(x=FPR, y=TPR)) +
geom_path(color="#27ae60", size=1.5) +
geom_abline(linetype="dashed", color="grey") +
theme_light() +
annotate("text", x=0.75, y=0.25, label="AUC = 0.96", size=6, color="#27ae60") +
labs(title="ROC Analysis: SNP Classifier", x="1 - Specificity (FPR)", y="Sensitivity (TPR)")
Focus: Survival, Longitudinal Models, Meta-Analysis, and Correlations.
Odds Ratios visualization.
df_f <- data.frame(
Study=c("Study A", "Study B", "Study C", "Pooled"),
OR=c(1.2, 0.8, 1.5, 1.1),
Low=c(0.9, 0.5, 1.1, 0.95),
High=c(1.5, 1.2, 2.0, 1.3)
)
ggplot(df_f, aes(y=Study, x=OR, xmin=Low, xmax=High)) +
geom_point(size=3, shape=15) + geom_errorbarh(height=0.2) +
geom_vline(xintercept=1, linetype="dashed", color="red") +
theme_bw() + labs(title="Forest Plot (Meta-Analysis)", x="Odds Ratio (95% CI)", y="")
Time-to-event analysis with realistic censoring.
n <- 100
# Generate Weibull distributed times (more realistic for biological failure)
T_treat <- rweibull(n, shape = 1.5, scale = 100)
T_placebo <- rweibull(n, shape = 1.5, scale = 60) # Placebo dies faster
# Combine
df_surv <- data.frame(
Time = c(T_treat, T_placebo),
Group = rep(c("Treatment", "Placebo"), each = n)
)
# Add random censoring (patients leaving study)
cens_time <- runif(2*n, 0, 150)
df_surv$Status <- ifelse(df_surv$Time < cens_time, 1, 0) # 1=Event, 0=Censored
df_surv$TimeObs <- pmin(df_surv$Time, cens_time)
fit <- survfit(Surv(TimeObs, Status) ~ Group, data = df_surv)
# Plot
plot(fit, col=c("red", "blue"), lwd=3, xlab="Time (Months)", ylab="Survival Probability",
main="Kaplan-Meier Analysis (Simulated Clinical Trial)", frame.plot=FALSE)
legend("topright", levels(factor(df_surv$Group)), col=c("red", "blue"), lwd=3, bty="n")
grid(col="grey90")
Longitudinal data with realistic subject variance.
subjects <- 20
times <- 0:5
# Random intercepts and slopes
intercepts <- rnorm(subjects, 10, 2)
slopes <- rnorm(subjects, 0.5, 0.2)
df_lmm <- data.frame()
for(i in 1:subjects){
# Add group effect: Group B increases faster
grp <- ifelse(i > 10, "B", "A")
slope_eff <- slopes[i] + ifelse(grp=="B", 1.5, 0)
y <- intercepts[i] + slope_eff * times + rnorm(length(times), 0, 1)
df_lmm <- rbind(df_lmm, data.frame(ID=paste0("S",i), Time=times, Value=y, Group=grp))
}
ggplot(df_lmm, aes(x=Time, y=Value, group=ID, color=Group)) +
geom_line(alpha=0.4) +
stat_summary(aes(group=Group), fun=mean, geom="line", size=2) +
scale_color_manual(values=c("#95a5a6", "#e74c3c")) +
theme_bw() +
labs(title="Longitudinal Response (LMM)", subtitle="Group A vs B treatment trajectories")
Clinical variable relationships.
# Simulate correlated multivariate data
sigma <- matrix(c(1, 0.8, -0.5, 0.8, 1, -0.3, -0.5, -0.3, 1), 3, 3)
data_corr <- mvrnorm(n = 50, mu = c(0,0,0), Sigma = sigma)
colnames(data_corr) <- c("BMI", "Insulin", "Activity")
cormat <- round(cor(data_corr), 2)
melted <- as.data.frame(as.table(cormat))
ggplot(melted, aes(Var1, Var2, fill=Freq)) +
geom_tile(color="white") +
geom_text(aes(label=Freq)) +
scale_fill_gradient2(low="#2980b9", mid="white", high="#c0392b") +
theme_minimal() + labs(title="Clinical Correlations", x="", y="")
Focus: Community Diversity and Composition.
Comparing species richness (Non-normal distributions).
# Generate Gamma distributed data (common for counts/diversity)
grp_a <- rgamma(40, shape=20, scale=0.2)
grp_b <- rgamma(40, shape=15, scale=0.2)
df_alpha <- data.frame(Index = c(grp_a, grp_b), Group = rep(c("Healthy", "Dysbiosis"), each=40))
ggplot(df_alpha, aes(x=Group, y=Index, fill=Group)) +
geom_violin(alpha=0.3, trim=FALSE) +
geom_boxplot(width=0.2, alpha=0.8) +
scale_fill_manual(values=c("#e74c3c", "#2ecc71")) +
theme_classic() +
labs(title="Microbiome Alpha Diversity (Shannon)", y="Diversity Index")
Visualizing community separation.
# Create two multivariate normal distributions
g1 <- mvrnorm(25, mu=c(2, 2), Sigma=matrix(c(1,0.5,0.5,1),2))
g2 <- mvrnorm(25, mu=c(-1, -1), Sigma=matrix(c(1,-0.2,-0.2,1),2))
df_beta <- rbind(data.frame(g1, Grp="Ctrl"), data.frame(g2, Grp="Treat"))
colnames(df_beta)[1:2] <- c("PCoA1", "PCoA2")
ggplot(df_beta, aes(PCoA1, PCoA2, color=Grp)) +
geom_point(size=3) +
stat_ellipse() +
theme_minimal() +
labs(title="Beta Diversity (Bray-Curtis PCoA)", subtitle="Clustering of microbial communities")
© 2026 Bartosz Czech. All rights reserved.
Built with R Markdown. Let’s connect: bartosz.w.czech@gmail.com