This task uses the DIABLO framework from mixOmics to integrate DNA methylation and gene expression data for classifying Multiple Sclerosis samples (lesion vs. NAWM). The workflow includes:
The final model identifies multi-omics signatures distinguishing lesion status, supporting downstream biomarker discovery.
##Load Input Data from Previous Parts
# ---- Load Input Data from Previous Parts ----
# ---- Load RNA-seq count matrix and metadata ----
# Load RNA-seq counts
rna_counts <- read.delim("GSE224377_raw_counts_GRCh38.p13_NCBI (1).tsv", row.names = 1)
# Load metadata
rna_meta <- read.csv("metadata-new.csv")
# Match sample names in counts matrix using metadata
colnames(rna_counts) <- rna_meta$X[match(colnames(rna_counts), rna_meta$Accession)]
# Extract patient and sample type
library(dplyr)
library(tidyr)
rna_meta <- rna_meta %>%
separate(Title, into = c("patient", "sample.type"), sep = ",", fill = "right") %>%
mutate(sample.type = trimws(sample.type))
# Load methylation matrix and remove rows with missing values
meth <- read.csv("Methylation_20000_Top_expressed.csv", row.names = 1, check.names = FALSE)
meth <- meth[complete.cases(meth), ]
# Ensure metadata is available (assumed to be already loaded as rna_meta)
meth_meta <- rna_meta
# Match samples between metadata and methylation data
valid_samples <- intersect(meth_meta$X, colnames(meth))
# Filter metadata and methylation matrix based on available matching samples
meth_meta_filtered <- meth_meta[meth_meta$X %in% valid_samples, ]
meth_filtered <- meth[, valid_samples]
# Define NAWM and lesion sample names from filtered metadata
nawm_names <- meth_meta_filtered$X[meth_meta_filtered$sample.type == "NAWM"]
lesion_names <- meth_meta_filtered$X[meth_meta_filtered$sample.type == "lesion"]
# Create final methylation matrix with ordered samples
meth_mat <- meth_filtered[, c(nawm_names, lesion_names)]
# Define group indices (optional, if needed later)
ctrl.idx <- 1:length(nawm_names)
case.idx <- (length(nawm_names) + 1):ncol(meth_mat)
group_factor <- factor(c(rep(1, length(ctrl.idx)), rep(2, length(case.idx))))
##ready to start DIABLO model
# ---- Load Required Libraries ----
library(mixOmics)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:clusterProfiler':
##
## select
## The following object is masked from 'package:genefilter':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:clusterProfiler':
##
## dotplot
##
## Loaded mixOmics 6.28.0
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us: citation('mixOmics')
##
## Attaching package: 'mixOmics'
## The following object is masked from 'package:purrr':
##
## map
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:IRanges':
##
## cov, var
## The following objects are masked from 'package:S4Vectors':
##
## cov, var
## The following object is masked from 'package:BiocGenerics':
##
## var
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret) # For nearZeroVar
## Warning: package 'caret' was built under R version 4.4.3
##
## Attaching package: 'caret'
## The following objects are masked from 'package:mixOmics':
##
## nearZeroVar, plsda, splsda
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
# Load GE_z and fix column names
GE_z <- read.csv("RNA_TMM_log2_zscore.csv", row.names = 1)
# Remove leading 'X' added by write.csv() to numeric-starting column names
colnames(GE_z) <- gsub("^X", "", colnames(GE_z))
# Check and reorder columns
GE_z <- GE_z[, colnames(meth_mat)]
# ---- Load and Prepare Data ----
# Transpose data so rows = samples, columns = features
methyl_data <- t(meth)
rna_data <- t(GE_z)
# Ensure all data is numeric
methyl_data <- as.data.frame(lapply(as.data.frame(methyl_data), as.numeric))
rna_data <- as.data.frame(lapply(as.data.frame(rna_data), as.numeric))
# Define outcome variable (factor with >1 level)
Y <- as.factor(meth_meta$sample.type)
# Check Y has >1 level
if (nlevels(Y) < 2) {
stop("Error: Outcome variable Y must have at least two classes.")
}
# ---- Remove Near-Zero Variance Features ----
nzv_methyl <- nearZeroVar(methyl_data, saveMetrics = TRUE)
methyl_data <- methyl_data[, !nzv_methyl$nzv] # Remove all near-zero variance columns
nzv_rna <- nearZeroVar(rna_data, saveMetrics = TRUE)
rna_data <- rna_data[, !nzv_rna$nzv]
# ---- Ensure Matching Sample Order ----
stopifnot(rownames(methyl_data) == rownames(rna_data))
# ---- Create Multi-Omics Block ----
data_list <- list(methyl = methyl_data, rna = rna_data)
# ---- DIABLO Design Matrix ----
design <- matrix(0.1, ncol = length(data_list), nrow = length(data_list),
dimnames = list(names(data_list), names(data_list)))
diag(design) <- 0
# ---- Tuning Step ----
list.keepX <- list(
methyl = c(10, 25, 50),
rna = c(10, 25, 50)
)
# Fix randomness for reproducibility
set.seed(42)
tune <- tune.block.splsda(
X = data_list,
Y = Y,
ncomp = 2,
test.keepX = list.keepX,
design = design,
validation = 'Mfold',
folds = 5,
nrepeat = 1,
dist = "centroids.dist"
)
## Design matrix has changed to include Y; each block will be
## linked to Y.
##
## You have provided a sequence of keepX of length: 3 for block methyl and 3 for block rna.
## This results in 9 models being fitted for each component and each nrepeat, this may take some time to run, be patient!
##
## You can look into the 'BPPARAM' argument to speed up computation time.
## Warning: The SGCCA algorithm did not converge
# ---- Optimal keepX ----
keepX.optimal <- tune$choice.keepX
# ---- Final DIABLO Model ----
diablo_model <- block.splsda(
X = data_list,
Y = Y,
ncomp = 2,
keepX = keepX.optimal,
design = design,
near.zero.var = TRUE
)
## Design matrix has changed to include Y; each block will be
## linked to Y.
# ---- Sample Projections ----
plotIndiv(diablo_model, legend = TRUE, title = "DIABLO: Sample Projection")
plotDiablo(diablo_model)
circosPlot(diablo_model, comp = 1, cutoff = 0.7)
# ---- Extract Selected Features (Per Block and Component) ----
# Helper function to extract names robustly
getSelected <- function(model, block, comp) {
out <- selectVar(model, block = block, comp = comp)
if (!is.null(out) && !is.null(out[[block]]$name)) {
return(out[[block]]$name)
} else if (!is.null(out$name)) {
return(out$name)
} else {
return(NULL)
}
}
selected_genes_c1 <- getSelected(diablo_model, "rna", comp = 1)
selected_cpgs_c1 <- getSelected(diablo_model, "methyl", comp = 1)
selected_genes_c2 <- getSelected(diablo_model, "rna", comp = 2)
selected_cpgs_c2 <- getSelected(diablo_model, "methyl", comp = 2)
# ---- Export Selected Features ----
write.csv(selected_genes_c1, "selected_genes_component1.csv", row.names = FALSE)
write.csv(selected_cpgs_c1, "selected_cpgs_component1.csv", row.names = FALSE)
write.csv(selected_genes_c2, "selected_genes_component2.csv", row.names = FALSE)
write.csv(selected_cpgs_c2, "selected_cpgs_component2.csv", row.names = FALSE)
# ---- Loadings Plot ----
plotLoadings(diablo_model, comp = 1, block = "rna", method = 'mean', contrib = 'max')
plotLoadings(diablo_model, comp = 1, block = "methyl", method = 'mean', contrib = 'max')
# ---- Model Performance ----
perf_diablo <- perf(
diablo_model,
validation = "Mfold",
folds = 5,
nrepeat = 10
)
print(perf_diablo$error.rate)
## $methyl
## max.dist centroids.dist mahalanobis.dist
## comp1 0.2333333 0.2277778 0.2277778
## comp2 0.4333333 0.3666667 0.4333333
##
## $rna
## max.dist centroids.dist mahalanobis.dist
## comp1 0.4944444 0.4944444 0.4944444
## comp2 0.5111111 0.5055556 0.5111111
plot(perf_diablo)
# ---- ROC and AUC ----
aucs <- auroc(diablo_model)
## $methyl
## $methyl$comp1
## AUC p-value
## lesion vs NAWM 1 0.0003486
##
## $methyl$comp2
## AUC p-value
## lesion vs NAWM 1 0.0003486
##
##
## $rna
## $rna$comp1
## AUC p-value
## lesion vs NAWM 1 0.0003486
##
## $rna$comp2
## AUC p-value
## lesion vs NAWM 1 0.0003486
print(aucs)
## $methyl
## $methyl$comp1
## AUC p-value
## lesion vs NAWM 1 0.0003486
##
## $methyl$comp2
## AUC p-value
## lesion vs NAWM 1 0.0003486
##
##
## $rna
## $rna$comp1
## AUC p-value
## lesion vs NAWM 1 0.0003486
##
## $rna$comp2
## AUC p-value
## lesion vs NAWM 1 0.0003486
##
##
## $graph.methyl
## $graph.methyl$comp2
# ---- Confusion Matrix ----
# Use test data structured the same way as data_list
diablo_model <- block.splsda(X = data_list, Y = Y, design = design, ncomp = 2)
## Design matrix has changed to include Y; each block will be
## linked to Y.
# Step 1: Predict using trained model
pred <- predict(diablo_model, newdata = data_list)
# Extract predicted labels for comp1
pred_labels <- pred$MajorityVote$max.dist[, "comp1"]
# Check the prediction
print(pred_labels)
## 1 2 3 4 5 6 7 8
## "lesion" NA "lesion" NA "NAWM" "NAWM" "lesion" NA
## 9 10 11 12 13 14 15 16
## "lesion" "NAWM" "lesion" "NAWM" "NAWM" "NAWM" "lesion" "NAWM"
## 17 18
## "lesion" NA
# Create confusion matrix against actual labels
conf_matrix <- table(Predicted = pred_labels, Actual = Y)
print(conf_matrix)
## Actual
## Predicted lesion NAWM
## lesion 7 0
## NAWM 2 5
# ---- Accuracy ----
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy:", round(accuracy, 3), "\n")
## Accuracy: 0.857