diff --git a/session_nmf/NMF_main.html b/session_nmf/NMF_main.html index 41694d3..5981b42 100644 --- a/session_nmf/NMF_main.html +++ b/session_nmf/NMF_main.html @@ -7516,14 +7516,14 @@
by Sergiu Netotea, PhD, NBIS, Chalmers
+by Sergiu Netotea, NBIS, Chalmers
Matrix factorization (MF): - + [Credit: Wikipedia]
Latent (hidden) factors:
+Latent (hidden) factors:
Model constraints: $V\in\mathbb{R}_+^{m \times n}, V \approx WH, W \in \mathbb{R}_+^{m \times k}, H \in \mathbb{R}_+^{k \times n}$
+Model constraints: +$$ +X\in\mathbb{R}_+^{m \times n}, \\ +X \approx WH, \\ +W \in \mathbb{R}_+^{m \times k}, \\ +H \in \mathbb{R}_+^{k \times n} +$$
As an optimization problem: $ min~\|V-WH\|_F, V \ge 0, W \ge 0, H \ge 0$
-The Frobenius norm
+import pandas as pd
@@ -7826,9 +7848,10 @@ Toy dataset¶[0,0,0,0,1,0]])
dataset = pd.DataFrame(m, columns=['John', 'Alice', 'Mary', 'Greg', 'Peter', 'Jennifer'])
-
dataset.index = ['diabetes_gene1', 'diabetes_gene2', 'cancer_protein1', 'unclear', 'melancholy_gene', 'cofee_dependency_gene']
-print(groceries)
+V = dataset # we call the initial matrix V
+print("\n\n V - Initial Data matrix (features x samples):")
+print(V)
John Alice Mary Greg Peter Jennifer ++ + V - Initial Data matrix (features x samples): + John Alice Mary Greg Peter Jennifer diabetes_gene1 0 1 0 1 2 2 diabetes_gene2 0 1 1 1 3 4 cancer_protein1 2 3 1 1 2 2 @@ -7869,10 +7895,11 @@Toy dataset¶
latent_factors = ['latent1', 'latent2', 'latent3']
+# we estimate k = 3 hidden features
+latent_factors = ['latent1', 'latent2', 'latent3']
from sklearn.decomposition import NMF
+# For k = 3 we solve the NMF problem
+from sklearn.decomposition import NMF
-nmf = NMF(3)
-V = dataset
-nmf.fit(V)
-
-H = pd.DataFrame(np.round(nmf.components_,2), columns=V.columns)
-H.index = latent_factors
-
-W = pd.DataFrame(np.round(nmf.transform(V),2), columns=H.index)
-W.index = V.index
-
-
print("\n\n V - Initial Data matrix (features x samples):")
-print(V)
+model = NMF(n_components=3, init='random', random_state=0) # define the model
+#r = nmf.fit(V)
+W = model.fit_transform(V)
+H = model.components_
- - V - Initial Data matrix (features x samples): - John Alice Mary Greg Peter Jennifer -diabetes_gene1 0 1 0 1 2 2 -diabetes_gene2 0 1 1 1 3 4 -cancer_protein1 2 3 1 1 2 2 -unclear 1 1 1 0 1 1 -melancholy_gene 0 2 3 4 1 1 -cofee_dependency_gene 0 0 0 0 1 0 --
print("\n\n W - factors matrix (features, factors):")
+W = pd.DataFrame(np.round(nmf.transform(V),2), columns=H.index)
+W.index = V.index
+print("\n\n W - factors matrix (features, factors):")
print(W)
print("\n\n H - coefficients matrix (factors, samples):")
+H = pd.DataFrame(np.round(H,2), columns=V.columns)
+H.index = latent_factors
+print("\n\n H - coefficients matrix (factors, samples):")
print(H)
latent_factors = ['Diabetes', 'Cancer', 'Melancholy']
@@ -8081,7 +8071,7 @@ Toy dataset¶What disease is NMF suspecting for Jennifer? Indeed, it is diabetes. We would not know this from a PCA study, because some of the scores can be negative and so are some of the loadings. The cumulative effect is obscured by the linear transformations.
Hipothesis hunting: W x H is an approximation of V, so by transforming the dataset based on the NMF model we can learn some new things.
+Hypothesis hunting: W x H is an approximation of V, so by transforming the dataset based on the NMF model we can learn some new things.
reconstructed = pd.DataFrame(np.round(np.dot(W,H),2), columns=V.columns)
+reconstructed = np.dot(W,H)
+reconstructed = pd.DataFrame(np.round(reconstructed,2), columns=V.columns)
reconstructed.index = V.index
print(reconstructed)
@@ -8129,6 +8120,7 @@ Toy dataset¶
+- This is the matrix that the model learned by performing NMF. Compared to PCA we kept the original dimensionality. WH is the equivalent of a compression-decompression operation.
- Jennifer and Peter are both suspected of diabetes based on their H values.
- Peter is showing signal on the coffee dependency gene in the initial dataset. The model infers that maybe the signal for that gene was lost during processing. The model predicts even higher signal on that gene than Peter has. This is based on how similar Jennifer is to Peter compared to all the other patients.
- This is the essence of collaborative filtering: People that share the same signals in certain kind of diseases will also share the same signals in some other kind of features.
@@ -8227,6 +8219,74 @@ Missingness and regularization
+Paper study¶
+Huizing, GJ., Deutschmann, I.M., Peyré, G. et al. Paired single-cell multi-omics data integration with Mowgli. Nat Commun 14, 7711 (2023). https://doi.org/10.1038/s41467-023-43019-2
+
+
+- NMF based method for integrating paired single-cell multi-omics data.
+- Advancements in single-cell technologies now allow the simultaneous profiling of multiple omics layers (like RNA, chromatin accessibility, and proteins) from the same cells, which raises the need for effective tools to jointly analyze this complex data.
+- Combination of Matrix Factorization and Optimal Transport: Mowgli integrates Non-Negative Matrix Factorization (NMF) with Optimal Transport (OT), allowing for both efficient dimensionality reduction and robust alignment of paired omics data. This combination enhances both the clustering accuracy and biological interpretability of the data.
+- Integration Across Omics Types: Mowgli is designed to handle various types of omics data, such as scRNA-seq, scATAC-seq, and protein data from modalities like CITE-seq and TEA-seq. This makes it a versatile tool for multi-omics data integration.
+- Benchmarking Against State-of-the-Art Methods: Mowgli was benchmarked against other leading methods like Seurat, MOFA+, and Cobolt. It outperformed these methods in embedding and clustering tasks, particularly in scenarios with noisy or sparse data. Mowgli demonstrated superior performance in dealing with real-world challenges like rare cell populations and high dropout rates typical of single-cell data.
+- User-Friendly Implementation: Mowgli is implemented as a Python package that integrates seamlessly with popular single-cell analysis frameworks like Scanpy and Muon, making it accessible for researchers.
+
+
+
+
+
++Kriebel, A.R., Welch, J.D. UINMF performs mosaic integration of single-cell multi-omic datasets using nonnegative matrix factorization. Nat Commun 13, 780 (2022). https://doi.org/10.1038/s41467-022-28431-4
+
+
- Deep architecture: CNN, with backpropagation, each NMF layer performs a hierarchical decomposition -
+Sergiu Netotea, PhD, NBIS, Chalmers
+Sergiu Netotea, PhD, NBIS, Chalmers
Network models are a very complex representation of data:
+Network-Based Approaches: +- Graph Construction, Multi-Modal Networks: Integrating multiple omics datasets into one comprehensive graph that allows for the analysis of cross-layer interactions. +- Node/Edge Weighting: Some methods apply weighting strategies to nodes and edges to emphasize biological relevance, which can assist in identifying key components within the network.
+Algorithmic Methods:
+Machine Learning Approaches:
+Read more:
+++Agamah FE, Bayjanov JR, Niehues A, Njoku KF, Skelton M, Mazandu GK, Ederveen THA, Mulder N, Chimusa ER, 't Hoen PAC. Computational approaches for network-based integrative multi-omics analysis. Front Mol Biosci. 2022 Nov 14;9:967205. doi: 10.3389/fmolb.2022.967205. PMID: 36452456; PMCID: PMC9703081.
+
Network models are a very complex representation of data:
++MoGCN, a multi-omics integration model based on graph convolutional network (GCN) +- https://github.com/Lifoof/MoGCN +- Li X, Ma J, Leng L, Han M, Li M, He F, Zhu Y. MoGCN: A Multi-Omics Integration Method Based on Graph Convolutional Network for Cancer Subtype Analysis. Front Genet. 2022 Feb 2;13:806842. doi: 10.3389/fgene.2022.806842. PMID: 35186034; PMCID: PMC8847688
+
++Wang, C., Lue, W., Kaalia, R. et al. Network-based integration of multi-omics data for clinical outcome prediction in neuroblastoma. Sci Rep 12, 15425 (2022). https://doi.org/10.1038/s41598-022-19019-5
+
Aim: integrate multi-omics data (like gene expression and DNA methylation) for predicting clinical outcomes in neuroblastoma, a pediatric cancer.
+Using Patient Similarity Networks (PSNs) derived from omics features, they create networks where patients are nodes and edges represent their similarity based on omics data. They apply two methods for data fusion: at feature level and at network level
+Their results show that network-level fusion generally outperforms feature-level fusion for integrating diverse omics datasets, while feature-level fusion is effective when combining different features within the same omics dataset.
+Feature-level fusion: Combines features derived from each omics dataset into a single feature set by concatenating or averaging features like centrality and modularity from PSNs. For each omics dataset m, a Patient Similarity Network (PSN) is constructed. Let x_m represent the feature vector of the m-th omics dataset for a subject. The feature-level fusion is performed as follows:
+The fused feature vector $x_{\text{fused}}$ is used as input to machine learning classifiers for clinical outcome prediction.
+++Wang, J., Liao, N., Du, X. et al. A semi-supervised approach for the integration of multi-omics data based on transformer multi-head self-attention mechanism and graph convolutional networks. BMC Genomics 25, 86 (2024). https://doi.org/10.1186/s12864-024-09985-7 +Searched 2 sites
+