Task 8: Multi-Omics Integration via DIABLO

This task uses the DIABLO framework from mixOmics to integrate DNA methylation and gene expression data for classifying Multiple Sclerosis samples (lesion vs. NAWM). The workflow includes:

The final model identifies multi-omics signatures distinguishing lesion status, supporting downstream biomarker discovery.


Load All Required Libraries for Multi-Omics Analysis

##Load Input Data from Previous Parts

# ---- Load Input Data from Previous Parts ----
# ---- Load RNA-seq count matrix and metadata ----

# Load RNA-seq counts
rna_counts <- read.delim("GSE224377_raw_counts_GRCh38.p13_NCBI (1).tsv", row.names = 1)

# Load metadata
rna_meta <- read.csv("metadata-new.csv")

# Match sample names in counts matrix using metadata
colnames(rna_counts) <- rna_meta$X[match(colnames(rna_counts), rna_meta$Accession)]

# Extract patient and sample type
library(dplyr)
library(tidyr)

rna_meta <- rna_meta %>%
  separate(Title, into = c("patient", "sample.type"), sep = ",", fill = "right") %>%
  mutate(sample.type = trimws(sample.type))

# Load methylation matrix and remove rows with missing values
meth <- read.csv("Methylation_20000_Top_expressed.csv", row.names = 1, check.names = FALSE)
meth <- meth[complete.cases(meth), ]

# Ensure metadata is available (assumed to be already loaded as rna_meta)
meth_meta <- rna_meta

# Match samples between metadata and methylation data
valid_samples <- intersect(meth_meta$X, colnames(meth))

# Filter metadata and methylation matrix based on available matching samples
meth_meta_filtered <- meth_meta[meth_meta$X %in% valid_samples, ]
meth_filtered <- meth[, valid_samples]

# Define NAWM and lesion sample names from filtered metadata
nawm_names <- meth_meta_filtered$X[meth_meta_filtered$sample.type == "NAWM"]
lesion_names <- meth_meta_filtered$X[meth_meta_filtered$sample.type == "lesion"]

# Create final methylation matrix with ordered samples
meth_mat <- meth_filtered[, c(nawm_names, lesion_names)]

# Define group indices (optional, if needed later)
ctrl.idx <- 1:length(nawm_names)
case.idx <- (length(nawm_names) + 1):ncol(meth_mat)
group_factor <- factor(c(rep(1, length(ctrl.idx)), rep(2, length(case.idx))))

##ready to start DIABLO model

# ---- Load Required Libraries ----
library(mixOmics)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:clusterProfiler':
## 
##     select
## The following object is masked from 'package:genefilter':
## 
##     area
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:clusterProfiler':
## 
##     dotplot
## 
## Loaded mixOmics 6.28.0
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us:  citation('mixOmics')
## 
## Attaching package: 'mixOmics'
## The following object is masked from 'package:purrr':
## 
##     map
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:IRanges':
## 
##     cov, var
## The following objects are masked from 'package:S4Vectors':
## 
##     cov, var
## The following object is masked from 'package:BiocGenerics':
## 
##     var
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)     # For nearZeroVar
## Warning: package 'caret' was built under R version 4.4.3
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:mixOmics':
## 
##     nearZeroVar, plsda, splsda
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggplot2)
# Load GE_z and fix column names
GE_z <- read.csv("RNA_TMM_log2_zscore.csv", row.names = 1)

# Remove leading 'X' added by write.csv() to numeric-starting column names
colnames(GE_z) <- gsub("^X", "", colnames(GE_z))

# Check and reorder columns
GE_z <- GE_z[, colnames(meth_mat)]


# ---- Load and Prepare Data ----

# Transpose data so rows = samples, columns = features
methyl_data <- t(meth)
rna_data <- t(GE_z)

# Ensure all data is numeric
methyl_data <- as.data.frame(lapply(as.data.frame(methyl_data), as.numeric))
rna_data <- as.data.frame(lapply(as.data.frame(rna_data), as.numeric))

# Define outcome variable (factor with >1 level)
Y <- as.factor(meth_meta$sample.type)

# Check Y has >1 level
if (nlevels(Y) < 2) {
  stop("Error: Outcome variable Y must have at least two classes.")
}

# ---- Remove Near-Zero Variance Features ----
nzv_methyl <- nearZeroVar(methyl_data, saveMetrics = TRUE)
methyl_data <- methyl_data[, !nzv_methyl$nzv]  # Remove all near-zero variance columns


nzv_rna <- nearZeroVar(rna_data, saveMetrics = TRUE)
rna_data <- rna_data[, !nzv_rna$nzv]

# ---- Ensure Matching Sample Order ----
stopifnot(rownames(methyl_data) == rownames(rna_data))

# ---- Create Multi-Omics Block ----
data_list <- list(methyl = methyl_data, rna = rna_data)

# ---- DIABLO Design Matrix ----
design <- matrix(0.1, ncol = length(data_list), nrow = length(data_list),
                 dimnames = list(names(data_list), names(data_list)))
diag(design) <- 0

# ---- Tuning Step ----
list.keepX <- list(
  methyl = c(10, 25, 50),
  rna = c(10, 25, 50)
)
# Fix randomness for reproducibility
set.seed(42)

tune <- tune.block.splsda(
  X = data_list,
  Y = Y,
  ncomp = 2,
  test.keepX = list.keepX,
  design = design,
  validation = 'Mfold',
  folds = 5,
  nrepeat = 1,
  dist = "centroids.dist"
)
## Design matrix has changed to include Y; each block will be
##             linked to Y.
## 
## You have provided a sequence of keepX of length: 3 for block methyl and 3 for block rna.
## This results in 9 models being fitted for each component and each nrepeat, this may take some time to run, be patient!
## 
## You can look into the 'BPPARAM' argument to speed up computation time.
## Warning: The SGCCA algorithm did not converge
# ---- Optimal keepX ----
keepX.optimal <- tune$choice.keepX

# ---- Final DIABLO Model ----
diablo_model <- block.splsda(
  X = data_list,
  Y = Y,
  ncomp = 2,
  keepX = keepX.optimal,
  design = design,
  near.zero.var = TRUE
)
## Design matrix has changed to include Y; each block will be
##             linked to Y.
# ---- Sample Projections ----
plotIndiv(diablo_model, legend = TRUE, title = "DIABLO: Sample Projection")

plotDiablo(diablo_model)

circosPlot(diablo_model, comp = 1, cutoff = 0.7)

# ---- Extract Selected Features (Per Block and Component) ----

# Helper function to extract names robustly
getSelected <- function(model, block, comp) {
  out <- selectVar(model, block = block, comp = comp)
  if (!is.null(out) && !is.null(out[[block]]$name)) {
    return(out[[block]]$name)
  } else if (!is.null(out$name)) {
    return(out$name)
  } else {
    return(NULL)
  }
}

selected_genes_c1  <- getSelected(diablo_model, "rna", comp = 1)
selected_cpgs_c1   <- getSelected(diablo_model, "methyl", comp = 1)
selected_genes_c2  <- getSelected(diablo_model, "rna", comp = 2)
selected_cpgs_c2   <- getSelected(diablo_model, "methyl", comp = 2)

# ---- Export Selected Features ----
write.csv(selected_genes_c1, "selected_genes_component1.csv", row.names = FALSE)
write.csv(selected_cpgs_c1,  "selected_cpgs_component1.csv", row.names = FALSE)
write.csv(selected_genes_c2, "selected_genes_component2.csv", row.names = FALSE)
write.csv(selected_cpgs_c2,  "selected_cpgs_component2.csv", row.names = FALSE)

# ---- Loadings Plot ----
plotLoadings(diablo_model, comp = 1, block = "rna", method = 'mean', contrib = 'max')

plotLoadings(diablo_model, comp = 1, block = "methyl", method = 'mean', contrib = 'max')

# ---- Model Performance ----
perf_diablo <- perf(
  diablo_model,
  validation = "Mfold",
  folds = 5,
  nrepeat = 10
)
print(perf_diablo$error.rate)
## $methyl
##        max.dist centroids.dist mahalanobis.dist
## comp1 0.2333333      0.2277778        0.2277778
## comp2 0.4333333      0.3666667        0.4333333
## 
## $rna
##        max.dist centroids.dist mahalanobis.dist
## comp1 0.4944444      0.4944444        0.4944444
## comp2 0.5111111      0.5055556        0.5111111
plot(perf_diablo)

# ---- ROC and AUC ----
aucs <- auroc(diablo_model)

## $methyl
## $methyl$comp1
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
## 
## $methyl$comp2
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
## 
## 
## $rna
## $rna$comp1
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
## 
## $rna$comp2
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
print(aucs)
## $methyl
## $methyl$comp1
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
## 
## $methyl$comp2
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
## 
## 
## $rna
## $rna$comp1
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
## 
## $rna$comp2
##                AUC   p-value
## lesion vs NAWM   1 0.0003486
## 
## 
## $graph.methyl
## $graph.methyl$comp2

# ---- Confusion Matrix ----
# Use test data structured the same way as data_list
diablo_model <- block.splsda(X = data_list, Y = Y, design = design, ncomp = 2)
## Design matrix has changed to include Y; each block will be
##             linked to Y.
# Step 1: Predict using trained model
pred <- predict(diablo_model, newdata = data_list)
# Extract predicted labels for comp1
pred_labels <- pred$MajorityVote$max.dist[, "comp1"]

# Check the prediction
print(pred_labels)
##        1        2        3        4        5        6        7        8 
## "lesion"       NA "lesion"       NA   "NAWM"   "NAWM" "lesion"       NA 
##        9       10       11       12       13       14       15       16 
## "lesion"   "NAWM" "lesion"   "NAWM"   "NAWM"   "NAWM" "lesion"   "NAWM" 
##       17       18 
## "lesion"       NA
# Create confusion matrix against actual labels
conf_matrix <- table(Predicted = pred_labels, Actual = Y)
print(conf_matrix)
##          Actual
## Predicted lesion NAWM
##    lesion      7    0
##    NAWM        2    5
# ---- Accuracy ----
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy:", round(accuracy, 3), "\n")
## Accuracy: 0.857