diff --git a/notebooks/NORDic Network Identification (NI) Part I.ipynb b/notebooks/NORDic Network Identification (NI) Part I.ipynb index 6417787..077ae6b 100644 --- a/notebooks/NORDic Network Identification (NI) Part I.ipynb +++ b/notebooks/NORDic Network Identification (NI) Part I.ipynb @@ -13,7 +13,7 @@ "id": "08971fcc", "metadata": {}, "source": [ - "This vignette displays some examples of what can be achieved using **NORDic** in order to identify gene regulatory networks." + "This vignette displays some examples of what can be achieved using **NORDic (version 2.2.0)** in order to identify gene regulatory networks." ] }, { @@ -27,18 +27,6 @@ { "cell_type": "code", "execution_count": 1, - "id": "3e1e7bf8", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "! pip install -q NORDic==2.0.0" - ] - }, - { - "cell_type": "code", - "execution_count": 2, "id": "9f33464d", "metadata": {}, "outputs": [], @@ -83,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "7ef9c599", "metadata": {}, "outputs": [], @@ -109,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "6d011531", "metadata": {}, "outputs": [], @@ -139,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "48a51e2c", "metadata": {}, "outputs": [], @@ -182,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "4e820199", "metadata": {}, "outputs": [], @@ -202,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "c9781222", "metadata": {}, "outputs": [], @@ -220,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "276befcb", "metadata": {}, "outputs": [], @@ -239,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "dafe7523", "metadata": {}, "outputs": [], @@ -265,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "bd010501", "metadata": {}, "outputs": [], @@ -294,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "98db93b5", "metadata": {}, "outputs": [], @@ -315,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "479a10bc", "metadata": {}, "outputs": [], @@ -327,6 +315,24 @@ "}" ] }, + { + "cell_type": "markdown", + "id": "bfda6fda", + "metadata": {}, + "source": [ + "Since version 2.2.0, one can preserve nodes which are not measured by LINCS L1000 or which do not match an EntrezGene ID (which allows considering miRNA nodes, for instance). In order to do so, set the following parameter to True (to preserve the initial behavior, which is the default one, set it to False):" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "41830889", + "metadata": {}, + "outputs": [], + "source": [ + "accept_nonRNA=True" + ] + }, { "cell_type": "markdown", "id": "dba260fa", @@ -382,6 +388,24 @@ "}" ] }, + { + "cell_type": "markdown", + "id": "635d4108", + "metadata": {}, + "source": [ + "Since version 2.2.0, NORDic can try and infer a network -without returning an error- even when no experimental data is available. In that case, there are no experimental constraints, and it boils down to choosing appropriate parameters in Section B.2 to filter out (or not) edges. In order to do so, set the following parameter to False (to preserve the initial behavior, which is the default one, set it to True):" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "2a009b35", + "metadata": {}, + "outputs": [], + "source": [ + "force_experiments=False" + ] + }, { "cell_type": "markdown", "id": "c0e0e128", @@ -402,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "ba743a63", "metadata": {}, "outputs": [], @@ -432,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "c730283f", "metadata": {}, "outputs": [], @@ -458,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "1fd1a733", "metadata": {}, "outputs": [], @@ -490,6 +514,8 @@ " \"tau\": 0, \"filter\": True, \"connected\": True, \n", "}\n", "\n", + "accept_nonRNA=True\n", + "\n", "## Selection of parameters relative to experimental constraints\n", "LINCS_args = {\n", " \"path_to_lincs\": \"../lincs/\", \"credentials\": LINCS_credentials,\n", @@ -499,6 +525,8 @@ " \"bin_thres\": 0.5,\n", "}\n", "\n", + "force_experiments=False\n", + "\n", "## Selection of parameters relative to the inference of networks\n", "BONESIS_args = {\n", " \"limit\": 1, \"exact\": True, \"max_maxclause\": 3,\n", @@ -540,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "61103c94", "metadata": {}, "outputs": [ @@ -548,9 +576,150 @@ "name": "stdout", "output_type": "stream", "text": [ - "1 solutions are already generated, and saved at MODELS/Ondine/SOLUTIONS-1_binthres=0.500_score=0.00_maxclause=3_{1,...1}.zip\n", + "--- DATA IMPORT ---\n", + "1. Import gene set from DisGeNET... Retrieving genes... 1/1\n", + "... 7 genes imported.\n", + "2. Import network from STRING... ... 15 edges in model (including 0 directed edges) with a total of 6 non-isolated genes\n", + "3. Import experiments from LINCS L1000... \n", + "\t2 cell lines are considered (['NPC', 'SHSY5Y'])\n", + " Gene Symbol -> Gene ID (6 probes)\n", + " 1/1\n", + "6 probes (successful 6, unsuccessful 0)\n", + "\t6 genes available (convertable to EntrezIDs)\n", + " Entrez ID -> LINCS L1000 (1/6)\n", + "429\tASCL1\t1\t6\n", + " Entrez ID -> LINCS L1000 (2/6)\n", + "497258; 627\tBDNF\t2\t6\n", + " Entrez ID -> LINCS L1000 (3/6)\n", + "1908\tEDN3\t3\t6\n", + " Entrez ID -> LINCS L1000 (4/6)\n", + "2668\tGDNF\t4\t6\n", + " Entrez ID -> LINCS L1000 (5/6)\n", + "8929\tPHOX2B\t5\t6\n", + " Entrez ID -> LINCS L1000 (6/6)\n", + "5979\tRET\t6\t6\n", + "\t\t6/6 genes retrieved in LINCS L1000\n", + " Gene 1/6 Cell 1/2 Type 1/3\n", + " Gene 2/6 Cell 1/2 Type 1/3\n", + " Gene 3/6 Cell 1/2 Type 1/3\n", + " Gene 4/6 Cell 1/2 Type 1/3\n", + " Gene 5/6 Cell 1/2 Type 1/3\n", + " Gene 6/6 Cell 1/2 Type 1/3\n", + " Gene 1/6 Cell 1/2 Type 2/3\n", + " Gene 2/6 Cell 1/2 Type 2/3\n", + " Gene 3/6 Cell 1/2 Type 2/3\n", + " Gene 4/6 Cell 1/2 Type 2/3\n", + " Gene 5/6 Cell 1/2 Type 2/3\n", + " Gene 6/6 Cell 1/2 Type 2/3\n", + " Gene 1/6 Cell 1/2 Type 3/3\n", + " Gene 2/6 Cell 1/2 Type 3/3\n", + " Gene 3/6 Cell 1/2 Type 3/3\n", + " Gene 4/6 Cell 1/2 Type 3/3\n", + " Gene 5/6 Cell 1/2 Type 3/3\n", + " Gene 6/6 Cell 1/2 Type 3/3\n", + " Gene 1/6 Cell 2/2 Type 1/3\n", + " Gene 2/6 Cell 2/2 Type 1/3\n", + "\t (BDNF,SHSY5Y,trt_sh): 2\n", + " Gene 3/6 Cell 2/2 Type 1/3\n", + " Gene 4/6 Cell 2/2 Type 1/3\n", + " Gene 5/6 Cell 2/2 Type 1/3\n", + " Gene 6/6 Cell 2/2 Type 1/3\n", + " 0 experiments so far\n", + "\t Treatment BDNF (entrez_id 627)... 1/1\n", + " 2 TREATED experiments ('brew_prefix') (3 profiles 'distil_id' in average)\n", + " Selected distil_ss = 3.76598\n", + " 3 same-plate profiles\n", + " 18 CONTROL experiments ('brew_prefix') (5 profiles 'distil_id' in average)\n", + " Selected distil_ss = 3.75211\n", + " 3 same-plate profiles\n", + " KD Perturbed 627 = 1.01391 || Most stable control 7846 = 1.00000\n", + " -0.01391\n", + "... 5 genes, 6 profiles\n", + " 1 experiments so far\n", + "\t Duplicated treatment:BDNF, cell:SHSY5Y, type:trt_sh\n", + " Gene 1/6 Cell 2/2 Type 2/3\n", + " Gene 2/6 Cell 2/2 Type 2/3\n", + " Gene 3/6 Cell 2/2 Type 2/3\n", + " Gene 4/6 Cell 2/2 Type 2/3\n", + " Gene 5/6 Cell 2/2 Type 2/3\n", + " Gene 6/6 Cell 2/2 Type 2/3\n", + " Gene 1/6 Cell 2/2 Type 3/3\n", + " Gene 2/6 Cell 2/2 Type 3/3\n", + " Gene 3/6 Cell 2/2 Type 3/3\n", + " Gene 4/6 Cell 2/2 Type 3/3\n", + " Gene 5/6 Cell 2/2 Type 3/3\n", + " Gene 6/6 Cell 2/2 Type 3/3\n", + "\t\t1 unique experiments (including 1 with criterion thres_iscale > None, min_value None)\n", + "... 6 genes in 6 profiles (1 experiments)\n", + "\n", + "--- CONSTRAINT BUILDING ---\n", + "1. Filtering out edges by minimum set of edges with highest score which preserve connectivity... ... 15 unique edges involving genes both in experiments (6 genes in total)\n", + "... score_STRING 0.000000\t6 genes (non isolated in PPI)\t60 edges in PPI\n", + "2. Build topological constraints from filtered edges using gene expression data... ...... 14 negative, 6 positive undirected interactions (20 edges in total), 6 non isolated genes in experiments\n", + "3. Build dynamical constraints by binarization of experimental profiles... ... Cell SHSY5Y (1/1)... 303 (good) profiles | 50 (best) profiles (capped at 50 or min>=4) (distil_ss max=6.206, min=4.141)\n", + " BDNF_KD_SHSY5Y initial_SHSY5Y\n", + "ASCL1 0.0 0.0\n", + "GDNF 0.0 1.0\n", + "EDN3 1.0 1.0\n", + "RET 1.0 1.0\n", + "PHOX2B 1.0 1.0\n", + "BDNF 0.0 0.0\n", + "... 1 experiments on 1 cell lines and 6/6 genes (Frobenius norm signature matrix: 3.464, 5 possibly constant genes: 83.3%, 0 genes with undetermined status\n", + "\n", + "--- INFER BOOLEAN NETWORK ---\n", + "1. Generate solutions from topological & dynamical constraints... ... Maximum possible #activators=2\n", + " 20 interactions (maximum # of clauses = 2)\n", + "\n", + " 1 experiments\n", + " ASCL1 GDNF EDN3 PHOX2B BDNF\n", + "Exp1_init 0 1 1 1 0\n", + "Exp1_final 0 0 1 1 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/1 [00:00 Minimal solution: 20 edges\n", " Maximal solution: 20 edges\n", @@ -560,7 +729,7 @@ "BDNF <- EDN3|(!ASCL1&!GDNF&!PHOX2B)\n", "EDN3 <- !ASCL1|(BDNF&GDNF&!PHOX2B)\n", "GDNF <- !ASCL1&!BDNF&EDN3&!PHOX2B\n", - "PHOX2B <- !BDNF|(ASCL1&!EDN3&!GDNF)\n", + "PHOX2B <- !GDNF|(ASCL1&!BDNF&!EDN3)\n", "\n", "... saved!\n" ] @@ -575,7 +744,9 @@ " lincs_args=LINCS_args, sig_args=SIG_args, \n", " bonesis_args=BONESIS_args, \n", " weights=DESIRABILITY, \n", - " seed=seed_number, njobs=njobs)" + " seed=seed_number, njobs=njobs,\n", + " force_experiments=force_experiments, \n", + " accept_nonRNA=accept_nonRNA)" ] }, { @@ -596,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "9d01cbdb", "metadata": {}, "outputs": [ @@ -607,7 +778,7 @@ "" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -627,7 +798,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "18371086", "metadata": {}, "outputs": [ @@ -639,7 +810,7 @@ "BDNF <- EDN3|(!ASCL1&!GDNF&!PHOX2B)\n", "EDN3 <- !ASCL1|(BDNF&GDNF&!PHOX2B)\n", "GDNF <- !ASCL1&!BDNF&EDN3&!PHOX2B\n", - "PHOX2B <- !BDNF|(ASCL1&!EDN3&!GDNF)\n" + "PHOX2B <- !GDNF|(ASCL1&!BDNF&!EDN3)\n" ] } ], @@ -659,7 +830,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "5cd6b411", "metadata": {}, "outputs": [], diff --git a/setup.py b/setup.py index ca9b246..a89ac2f 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from subprocess import check_call NAME = "NORDic" -VERSION = "2.1.2" +VERSION = "2.2.0" setup(name=NAME, version=VERSION, diff --git a/src/NORDic/NORDic_DR/functions.py b/src/NORDic/NORDic_DR/functions.py index 12e381c..0e2cd58 100755 --- a/src/NORDic/NORDic_DR/functions.py +++ b/src/NORDic/NORDic_DR/functions.py @@ -7,14 +7,14 @@ import NORDic.NORDic_DR.bandits as bandits import NORDic.NORDic_DR.utils as utils -def adaptive_testing(network_name, signatures, targets, frontier, states, simu_params={}, bandit_args={}, +def adaptive_testing(network_name, signatures, targets, score, states, simu_params={}, bandit_args={}, reward_fname=None, quiet=False): ''' Perform adaptive testing and recommends most promising treatments (=maximizing score) @param\tnetwork_name\tPython character string: (relative) path to a network .BNET file @param\tsignatures\tPandas DataFrame: rows/[features] x columns/[drugs to test] @param\ttargets\tPandas DataFrame: rows/[genes] x columns/[drugs to test] (either 1: active expression, -1: inactive expression, 0: undetermined expression) - @param\tfrontier\tPython object with a function "predict" that returns predictions (1: control, or 2: treated) on phenotypes + @param\tscore\tPython object: scoring of attractors @param\tstates\tPandas DataFrame: rows/[gene] x columns/[patient samples] (either 1: activatory, -1: inhibitory, 0: no regulation). @param\tsimu_params\tPython dictionary[default={}]: arguments to MPBN-SIM @param\tbandit_params\tPython dictionary[default={}]: arguments to the bandit algorithms @@ -42,7 +42,7 @@ def adaptive_testing(network_name, signatures, targets, frontier, states, simu_p "network_name": network_name, "reward_fname": reward_fname, "targets": targets, - "frontier": frontier, + "score": score, "states": states, "simu_params": simu_params, } @@ -101,7 +101,7 @@ def reward(self, arm): if (not np.isnan(self.memoization[arm, patient])): return float(self.memoization[arm, patient]) result = float(simulate_treatment(self.network_name, self.targets[[self.targets.columns[arm]]], - self.frontier, self.states.iloc[:,patient], + self.score, self.states.iloc[:,patient], self.simu_params, quiet=False)[0]) self.memoization[arm, patient] = result return result diff --git a/src/NORDic/NORDic_DS/functions.py b/src/NORDic/NORDic_DS/functions.py index 4e4bcad..a41b5c2 100755 --- a/src/NORDic/NORDic_DS/functions.py +++ b/src/NORDic/NORDic_DS/functions.py @@ -34,8 +34,8 @@ def baseline(signatures, phenotype, is_binary=False): PC[PC<0] = -1 SC[pd.isnull(SC)] = 0 PC[pd.isnull(PC)] = 0 - C = SC.join(PC, how="inner") - cosscores = pairwise_distances(C[signatures.columns].T,C[phenotype.columns].T, metric="cosine") + C = SC.join(PC, how="outer").fillna(0) + cosscores = 1-pairwise_distances(C[signatures.columns].T,C[phenotype.columns].T, metric="cosine") baseline_df = pd.DataFrame(cosscores, columns=["Cosine Score"], index=signatures.columns) return baseline_df @@ -94,13 +94,13 @@ def compute_metrics(rewards, ground_truth, K=[2,5,10], use_negative_class=False, ## DRUG SIMULATOR ## ###################### -def simulate(network_fname, targets, phenotypes, simu_params={}, nbseed=0, quiet=False): +def simulate(network_fname, targets, patients, score, simu_params={}, nbseed=0, quiet=False): ''' Simulate and score the individual effects of drugs on patient phenotypes, compared to controls @param\tnetwork_fname\tPython character string: (relative) path to a network .BNET file @param\ttargets\tPandas DataFrame: rows/[genes] x columns/[drugs to test] (either 1: active expression, -1: inactive expression, 0: undetermined expression) - @param\tphenotypes\tPandas DataFrame: rows/[genes+annotation patient/control] x columns/[samples] (either 1: activatory, -1: inhibitory, 0: no regulation). - The last line "annotation" is 1 (healthy sample) or 2 (patient sample). + @param\tpatients\tPandas DataFrame: rows/[genes] x columns/[samples] (either 1: activatory, -1: inhibitory, 0: no regulation). + @param\tscore\tPython object: scoring of attractors @param\tsimu_params\tPython dictionary[default={}]: arguments to MPBN-SIM @param\tnbseed\tPython integer[default=0] @param\tquiet\tPython bool[default=False] @@ -122,7 +122,7 @@ def simulate(network_fname, targets, phenotypes, simu_params={}, nbseed=0, quiet dfdata = phenotypes.loc[list(set([g for g in genes if (g in phenotypes.index)]))] samples = phenotypes.loc["annotation"] frontier = compute_frontier(dfdata, samples) - patients = dfdata[[c for c in dfdata.columns if (phenotypes.loc["annotation"][c]==2)]] + patients = dfdata[[c for c in dfdata.columns if (samples[c]==2)]] ## 2. Compute one score per drug and per patients if (simu_params.get('thread_count', 1)==1): scores = [simulate_treatment(network_fname, targets.loc[[g for g in targets.index if (g in genes)]], frontier, patients[[Patient]], simu_params, quiet=quiet) for Patient in patients.columns] @@ -150,13 +150,13 @@ def compute_frontier(df, samples, nbseed=0, quiet=False): print(" Accuracy of the model %.2f" % acc) return model -def compute_score(f, x0, A, frontier, genes, nb_sims, experiments, repeat=1, exp_name="", quiet=False): +def compute_score(f, x0, A, score, genes, nb_sims, experiments, repeat=1, exp_name="", quiet=False): ''' Compute similarities between any attractor in WT and in mutants, weighted by their probabilities @param\tf\tBoolean Network (MPBN) object: the mutated network @param\tx0\tMPBN object: initial state @param\tA\tAttractor list: list of attractors in mutant network - @param\tfrontier\tPython object: model to classify phenotypes + @param\tscore\tPython object: scoring of attractors @param\tgenes\tPython character string list: list of genes in the model @frontier @param\tnb_sims\tPython integer: number of iterations to compute the probabilities @param\texperiments\tPython dictionary list: list of experiments (different rates/depths) @@ -179,17 +179,17 @@ def compute_score(f, x0, A, frontier, genes, nb_sims, experiments, repeat=1, exp probs = mpbn_sim.estimate_reachable_attractor_probabilities(f, x0, A, nb_sims, depth(f, **depth_args), rates(f, **rates_args)) attrs = pd.DataFrame({"MUT_%d"%ia: a for ia, a in enumerate(A)}).replace("*",np.nan).astype(float).loc[genes] probs = np.array([probs[ia]/100. for ia in range(attrs.shape[1])]) - classification_attrs = (frontier.predict(attrs.values.T)==1).astype(int) #classifies into 1:control, 2:patient + classification_attrs = score(attrs) drug_score = probs.T.dot(classification_attrs) d_scores.append(drug_score) return np.mean(d_scores) if (repeat>1) else d_scores[0] -def simulate_treatment(network_name, targets, frontier, state, simu_params={}, quiet=False): +def simulate_treatment(network_name, targets, score, state, simu_params={}, quiet=False): ''' Compute the score assigned to a drug with targets in @targets[[drug]] in network @param\tnetwork_name\tPython character string: filename of the network in .bnet (needs to be pickable) @param\ttargets\tPandas DataFrame: rows/[genes] x columns/[columns] - @param\tfrontier\tPython object: model to classify phenotypes + @param\tscore\tPython object: scoring of attractors @param\tstate\tPandas DataFrame: binary patient initial state rows/[genes] x columns/[values in {-1,0,1}] @param\tsimu_params\tPython dictionary[default={}]: arguments to MPBN-SIM @param\tseednb\tPython integer[default=0] diff --git a/src/NORDic/NORDic_NI/functions.py b/src/NORDic/NORDic_NI/functions.py index f6bed55..55e3f96 100644 --- a/src/NORDic/NORDic_NI/functions.py +++ b/src/NORDic/NORDic_NI/functions.py @@ -21,9 +21,29 @@ from NORDic.UTILS.utils_grn import get_grfs_from_solution from NORDic.NORDic_NI.cytoscape_style import style_file -def network_identification(file_folder, taxon_id, path_to_genes=None, disgenet_args=None, network_fname=None, string_args=None, experiments_fname=None, lincs_args=None, edge_args=None, sig_args=None, bonesis_args=None, weights=None, seed=0, njobs=1): - - solution_fname, nsol = solution_generation(file_folder, taxon_id, path_to_genes, disgenet_args, network_fname, string_args, experiments_fname, lincs_args, edge_args, sig_args, bonesis_args, seed, njobs) +def network_identification(file_folder, taxon_id, path_to_genes=None, disgenet_args=None, network_fname=None, string_args=None, experiments_fname=None, lincs_args=None, edge_args=None, sig_args=None, bonesis_args=None, weights=None, seed=0, njobs=1, force_experiments=True, accept_nonRNA=False): + ''' + Generates or retrieves the optimal network model + @param\tfile_folder\tPython character string: path to which files should be saved + @param\ttaxon_id\tPython integer: NCBI Taxonomy ID for the considered species + @param\tpath_to_genes\tPython character string or None[default=None]: path to the file containing gene names (one per line) + @param\tdisgenet_args\tPython dictionary or None[default=None]: arguments to the DisGeNET API (see test files) + @param\tnetwork_fname\tPython character string or None[default=None]: path to the file containing the prior knowledge network (see test files) + @param\tstring_args\tPython dictionary or None[default=None]: arguments to the STRING API (see test files) + @param\texperiments_fname\tPython character string or None[default=None]: path to the file containing the matrix of gene expression data (genes x samples) (see test files for format) + @param\tlincs_args\tPython dictionary or None[default=None]: arguments to the LINCS L1000 API (see test files) + @param\tedge_args\tPython dictionary or None[default=None]: arguments to the processing of edges (see test files) + @param\tsig_args\tPython dictionary or None[default=None]: arguments to the processing of signatures (see test files) + @param\tbonesis_args\tPython dictionary or None[default=None]: arguments to building constraints and generating solutions using BoneSIS (see test files) + @param\tweights\tPython dictionary or None[default=None]: weights to the optimal model selection procedure (see test files) + @param\tseed\tPython integer[default=0] + @param\tnjobs\tPython integer[default=1] + @param\tnjobs\tPython integer[default=1] + @param\tforce_experiments\tPython bool[default=True]: if set to True, returns an error if no experimental profile associated with the genes is found + @param\taccept_nonRNA\tPython bool[default=False]: if set to False, ignores gene names which cannot be converted to EntrezIDs or which are not present in LINCS L1000 + @return\tsolution\tPython character string: optimal model selected by the procedure + ''' + solution_fname, nsol = solution_generation(file_folder, taxon_id, path_to_genes=path_to_genes, disgenet_args=disgenet_args, network_fname=network_fname, string_args=string_args, experiments_fname=experiments_fname, lincs_args=lincs_args, edge_args=edge_args, sig_args=sig_args, bonesis_args=bonesis_args, seed=seed, njobs=njobs, force_experiments=force_experiments, accept_nonRNA=accept_nonRNA) if (nsol==0): return None @@ -41,10 +61,33 @@ def network_identification(file_folder, taxon_id, path_to_genes=None, disgenet_a return solution -def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args=None, network_fname=None, string_args=None, experiments_fname=None, lincs_args=None, edge_args=None, sig_args=None, bonesis_args=None, weights=None, seed=0, njobs=1): +def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args=None, network_fname=None, string_args=None, experiments_fname=None, lincs_args=None, edge_args=None, sig_args=None, bonesis_args=None, weights=None, seed=0, njobs=1, force_experiments=True, accept_nonRNA=False): + ''' + Generates or retrieves the optimal network model + @param\tfile_folder\tPython character string: path to which files should be saved + @param\ttaxon_id\tPython integer: NCBI Taxonomy ID for the considered species + @param\tpath_to_genes\tPython character string or None: path to the file containing gene names (one per line) + @param\tdisgenet_args\tPython dictionary or None: arguments to the DisGeNET API (see test files) + @param\tnetwork_fname\tPython character string or None: path to the file containing the prior knowledge network (see test files) + @param\tstring_args\tPython dictionary or None: arguments to the STRING API (see test files) + @param\texperiments_fname\tPython character string or None: path to the file containing the matrix of gene expression data (genes x samples) (see test files for format) + @param\tlincs_args\tPython dictionary or None: arguments to the LINCS L1000 API (see test files) + @param\tedge_args\tPython dictionary or None: arguments to the processing of edges (see test files) + @param\tsig_args\tPython dictionary or None: arguments to the processing of signatures (see test files) + @param\tbonesis_args\tPython dictionary or None: arguments to building constraints and generating solutions using BoneSIS (see test files) + @param\tseed\tPython integer[default=0] + @param\tnjobs\tPython integer[default=1] + @param\tnjobs\tPython integer[default=1] + @param\tforce_experiments\tPython bool[default=True]: if set to True, returns an error if no experimental profile associated with the genes is found + @param\taccept_nonRNA\tPython bool[default=False]: if set to False, ignores gene names which cannot be converted to EntrezIDs or which are not present in LINCS L1000 + @return\tsolution\tPython character string: optimal model selected by the procedure + ''' sbcall("mkdir -p "+file_folder, shell=True) solution_fname=file_folder+("SOLUTIONS-%d_binthres=%.3f_score=%.2f_maxclause=%d" % (bonesis_args.get("limit", 1), sig_args.get("bin_thres", 0.5), string_args.get("score", 1), bonesis_args.get("max_maxclause", 5))) solution_fname_ls, solution_ls = glob(solution_fname+"_*.zip"), [] + ##################################### + ## RETRIEVE EXISTING RESULTS ## + ##################################### if (len(solution_fname_ls)>0): for fname in solution_fname_ls: nbits = int(sbcheck_output("ls -l "+fname+" | cut -d\" \" -f5", shell=True).decode("utf-8")) @@ -117,34 +160,47 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args print("2. Import network from %s" % ("STRING" if (network_fname is None) else network_fname), end="... ") network_file = file_folder+"NETWORK.tsv" - if (network_fname is None): - ## Import from STRING - app_name = get_app_name_STRING(string_args["credentials"]) - network = get_network_from_STRING(model_genes, + if (not os.path.exists(network_file)): + if (network_fname is None): + ## Import from STRING + app_name = get_app_name_STRING(string_args["credentials"]) + network = get_network_from_STRING(model_genes, taxon_id, min_score=0, app_name=app_name, quiet=True - ) - else: - ## Import prepared network: four columns ["preferredName_A","preferred_B","sign","directed","score"], sep="\t" - ## preferredName_A, preferredName_B are HUGO gene symbols - ## score in [0,1], directed in {1,0}, sign in {1,2,-1} - network = pd.read_csv(network_fname, sep="\t") - cols = ["preferredName_A","preferredName_B","sign","directed","score"] - assert network.shape[1] == len(cols) - for coli, col in enumerate(network.columns): - assert col == cols[coli] - for col in cols: - assert col in network.columns - assert all([v in [1,2,-1] for v in list(network["sign"])]) - assert all([v in [1,0] for v in list(network["directed"])]) - assert all([v <= 1 and v >= 0 for v in list(network["score"])]) - - model_genes = list(set([g for a in ["preferredName_A","preferredName_B"] for g in list(network[a])])) - network.to_csv(network_file, index=None, sep="\t") - - print("... %d edges in model (including %d directed edges) with a total of %d non-isolated genes" % (network.shape[0], network[network["directed"]==1].shape[0], len(model_genes))) + ) + else: + ## Import prepared network: four columns ["preferredName_A","preferred_B","sign","directed","score"], sep="\t" + ## preferredName_A, preferredName_B are HUGO gene symbols + ## score in [0,1], directed in {1,0}, sign in {1,2,-1} + network = pd.read_csv(network_fname, sep="\t") + cols = ["preferredName_A","preferredName_B","sign","directed","score"] + assert network.shape[1] == len(cols) + for coli, col in enumerate(network.columns): + assert col == cols[coli] + for col in cols: + assert col in network.columns + assert all([v in [1,2,-1] for v in list(network["sign"])]) + assert all([v in [1,0] for v in list(network["directed"])]) + assert all([v <= 1 and v >= 0 for v in list(network["score"])]) + + network.to_csv(network_file, index=None, sep="\t") + network = pd.read_csv(network_file, sep="\t") + network_genes = list(set([g for a in ["preferredName_A","preferredName_B"] for g in list(network[a])])) + + print("... %d edges in model (including %d directed edges) with a total of %d non-isolated genes" % (network.shape[0], network[network["directed"]==1].shape[0], len(network_genes))) + + ## Write down the list of missing genes in PPI + if (len(model_genes)>len(network_genes)): + with open(file_folder+"missing_genes_in_PPI.txt", "w") as f: + missing_genes = [g for g in model_genes if (g not in network_genes)] + f.write("\n".join(missing_genes)) + + if (network.shape[0]==0): + raise ValueError("No network could be found.") + assert network.shape[0]>0 + model_genes = network_genes ## trim out isolated nodes ########################### ## IMPORT EXPERIMENTS ## @@ -158,15 +214,11 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args else: cell_lines = lincs_args["cell_lines"] - print("\n\t%d cell lines in which at least one of the genes has been perturbed (%s)" % (len(cell_lines), cell_lines)) + print("\n\t%d cell lines are considered (%s)" % (len(cell_lines), cell_lines)) + if (force_experiments and (experiments_fname is None)): + assert len(cell_lines)>0 - profiles_fname = file_folder+"PROFILES_"+"-".join(cell_lines[:4])+".csv" if (experiments_fname is None): - assert string_args - assert string_args.get("credentials", False) - assert lincs_args - assert lincs_args.get("credentials", False) - ## Convert gene names in EntrezGene IDs entrezgene_fname=file_folder+"ENTREZGENES.csv" if (not os.path.exists(entrezgene_fname)): @@ -175,15 +227,18 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args probes.to_csv(entrezgene_fname) probes = pd.read_csv(entrezgene_fname,index_col=0) - if (len(list(probes[probes["Gene ID"]=="-"].index))>0): + if (probes.shape[0]>probes[probes["Gene ID"]!="-"].shape[0]): + with open(file_folder+"missing_genes_in_EntrezID.txt", "w") as f: + missing_genes = [g for g in probes.index if (g not in list(probes[probes["Gene ID"]!="-"].index))] + f.write("\n".join(missing_genes)) + + if (probes[probes["Gene ID"]=="-"].shape[0]>0 and not accept_nonRNA): print("\tNot found genes: %s" % str(list(probes[probes["Gene ID"]=="-"].index))) probes = probes[probes["Gene ID"]!="-"] print("\t%d genes available (convertable to EntrezIDs)" % probes.shape[0]) - if (probes.shape[0]0): + with open(file_folder+"missing_genes_in_LINCSL1000.txt", "w") as f: + f.write("\n".join(missing_genes)) + + add_rows_profiles = ["annotation", "perturbed", "perturbation", "cell_line", "sigid", "interference_scale"] + thres_iscale = lincs_args.get("thres_iscale", None) + profiles_fname = file_folder+"PROFILES_"+"-".join(cell_lines[:4])+"_thres=%s.csv" % str(thres_iscale) + if (not os.path.exists(profiles_fname)): + if (experiments_fname is None): + full_profiles_fname = file_folder+"FULL_PROFILES_"+"-".join(cell_lines[:4])+".csv" + assert string_args + assert string_args.get("credentials", False) + assert lincs_args + assert lincs_args.get("credentials", False) + + ## Get experimental profiles + pert_types = lincs_args.get("pert_types", ["trt_sh","trt_oe","trt_xpr"]) + selection = lincs_args.get("selection", "distil_ss") + path_to_lincs = lincs_args.get("path_to_lincs", "./") + nsigs = lincs_args.get("nsigs", 2) user_key = get_user_key(lincs_args["credentials"]) pert_di = pert_df[["Entrez ID","Gene Symbol"]].to_dict()["Entrez ID"] - profiles = get_experimental_constraints(cell_lines, pert_types, pert_di, taxon_id, selection, user_key, path_to_lincs, thres_iscale=thres_iscale, nsigs=nsigs, quiet=False) - profiles.to_csv(profiles_fname) - - else: + if (not os.path.exists(full_profiles_fname)): + full_profiles = get_experimental_constraints(cell_lines, pert_types, pert_di, taxon_id, selection, user_key, path_to_lincs, thres_iscale=thres_iscale, nsigs=nsigs, quiet=False) + full_profiles.to_csv(full_profiles_fname) + else: + full_profiles = pd.read_csv(full_profiles_fname, index_col=0) - profiles = pd.read_csv(experiments_fname, sep=",", index_col=0) - ## Pandas DataFrame with index=HUGO gene symbols, columns=sample names, and 3 additional rows - ## 'annotation' 2 if the sample is treated, 1 otherwise - ## 'signame' experiment UNIQUE identifier, common to samples from the same experiment - ## 'sigid' sample unique identifier from LINCS L1000 (can be filled with NaNs) - ## /!\ SHOULD ALREADY BE BINARIZED - assert list(profiles.index)[-3] == "annotation" - assert list(profiles.index)[-2] == "signame" - assert list(profiles.index)[-1] == "sigid" - assert all([v in [0,1,np.nan] for v in list(np.unique(profiles.values))]) + if (full_profiles.shape[1]==0): + profiles = None + else: + if (thres_iscale is not None): + profiles_ = full_profiles[[c for ic, c in enumerate(full_profiles.columns) if (float(list(full_profiles.loc["interference_scale"])[ic])>thres_iscale)]] + if (profiles_.shape[1]==0): + raise ValueError("The selected value of thres_iscale (%s) is too large, maximum value in collected experiments=%s" % (thres_iscale, np.max(full_profiles.loc["interference_scale"].values))) + min_iscale = np.min(profiles_.loc["interference_scale"].values) + else: + profiles_ = full_profiles + min_iscale = None + profiles = profiles_.loc[[i for i in profiles_.index if (i != "interference_scale")]] + + Nfullprofs = len(list(set(["_".join(full_profiles[c].loc[["perturbed", "perturbation", "cell_line"]]) for c in full_profiles.columns]))) + Nprofs = len(list(set(["_".join(profiles[c].loc[["perturbed", "perturbation", "cell_line"]]) for c in profiles.columns]))) + print("\t\t%d unique experiments (including %d with criterion thres_iscale > %s, min_value %s)" % (Nfullprofs, Nprofs, thres_iscale, min_iscale)) + profiles = profiles.loc[~profiles.index.duplicated()] + profiles.index = [g if (g in add_rows_profiles) else pert_df.loc[pert_df["Entrez ID"]==float(g)]["Gene Symbol"].values[0] for g in list(profiles.index)] + + if (profiles is None): + profiles = pd.DataFrame([], index=pert_inames, columns=["0","1"]).astype(str) + for a in add_rows_profiles: + profiles.loc[a] = ["nan"]*profiles.shape[1] - if (experiments_fname is None): - thres_iscale = lincs_args.get("thres_iscale", None) - full_profiles = pd.read_csv(profiles_fname, header=0, index_col=0) - if (thres_iscale is not None): - profiles_ = full_profiles[[c for ic, c in enumerate(full_profiles.columns) if (float(list(full_profiles.loc["interference_scale"])[ic])>thres_iscale)]] - if (profiles_.shape[1]==0): - raise ValueError("The selected value of thres_iscale (%s) is too large, maximum value in collected experiments=%s" % (thres_iscale, np.max(full_profiles.loc["interference_scale"].values))) - min_iscale = np.min(profiles_.loc["interference_scale"].values) else: - profiles_ = full_profiles - min_iscale = None - profiles = profiles_.loc[[i for i in profiles_.index if (i != "interference_scale")]] - - Nfullprofs = len(list(set(list(full_profiles.loc["signame"])))) - Nprofs = len(list(set(list(profiles.loc["signame"])))) - print("\t\t%d unique experiments (including %d with criterion thres_iscale > %s, min_value %s)" % (Nfullprofs, Nprofs, thres_iscale, min_iscale)) - profiles.index = [list(pert_df.index)[list(pert_df["Entrez ID"]).index(int(idx))] for idx in list(profiles.index)[:-3]]+["annotation", "signame", "sigid"] - profiles = profiles.loc[~profiles.index.duplicated()] - - assert all([g in model_genes for g in list(profiles.index)[:-3]]) - model_genes = list(set(list(profiles.index)[:-3])) - print("... %d genes in %d profiles (%d experiments)" % (len(model_genes), profiles.shape[1], len(set(list(profiles.loc["signame"]))))) + profiles = pd.read_csv(experiments_fname, sep=",", index_col=0) + ## Pandas DataFrame with index=HUGO gene symbols, columns=sample names, and 5 additional rows + ## 'annotation' 2 if the sample is treated, 1 otherwise + ## 'perturbed' gene which is perturbed + ## 'perturbation' experiment UNIQUE identifier, common to samples from the same experiment + ## 'cell_line' experiment UNIQUE identifier, common to samples from the same experiment + ## 'sigid' sample unique identifier from LINCS L1000 (can be filled with NaNs) + ## /!\ SHOULD ALREADY BE BINARIZED + assert "annotation" in list(profiles.index) and all([v in [1, 2] for v in profiles.loc["annotation"]]) + assert "perturbed" in list(profiles.index) + assert "perturbation" in list(profiles.index) and all([v in ["KD", "OE"] for v in profiles.loc["perturbation"]]) + assert "cell_line" in list(profiles.index) + assert "sigid" in list(profiles.index) + assert all([v in [0,1,np.nan] for v in list(np.unique(profiles.values))]) + profiles = profiles[[c for c in profiles.columns if ((profiles.loc["perturbed"][c] in pert_inames) and (profiles.loc["cell_line"][c] in cell_lines))]] + profiles = profiles.loc[[g for g in list(profiles.index) if (g in model_genes+add_rows_profiles)]] + profiles.to_csv(profiles_fname) + + profiles = pd.read_csv(profiles_fname, index_col=0) + if (not pd.isnull(profiles.values).all()): + nconds = len(set(["_".join([profiles.loc[v][c] for v in ["perturbed","perturbation","cell_line"]]) for c in profiles.columns])) + else: + nconds = 0 + ngenes = len([p for p in profiles.index if (p not in add_rows_profiles)]) + print("... %d genes in %d profiles (%d experiments)" % (0 if (profiles is None) else ngenes, 0 if (profiles is None) else profiles.shape[1], 0 if (profiles is None) else nconds)) + if (profiles is None and force_experiments): + raise ValueError("No experimental profile could be found.") + assert (not force_experiments) or ((profiles is not None) and profiles.shape[1]>0) + assert (not force_experiments) or ((profiles is not None) and profiles.shape[0]-len(add_rows_profiles)>0) + + ## Write down the list of missing genes in profiles + if ((profiles is not None) and (pert_df.shape[0]>ngenes)): + with open(file_folder+"missing_genes_in_profiles.txt", "w") as f: + missing_genes = [g for g in pert_inames if (g not in list(profiles.index))] + f.write("\n".join(missing_genes)) print("\n--- CONSTRAINT BUILDING ---") @@ -255,16 +351,24 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args ##################################### print("1. Filtering out edges by minimum set of edges with highest score which preserve connectivity...", end=" ") + network = pd.read_csv(network_file, sep="\t") + ## Filter out edges which do not involve nodes which are both present in the experiments - test_in_profiles = np.vectorize(lambda x : x in model_genes) - network = network.loc[(test_in_profiles(network["preferredName_A"])&test_in_profiles(network["preferredName_B"]))] + if (not accept_nonRNA and profiles is not None): + test_in_profiles = np.vectorize(lambda x : x in [p for p in list(profiles.index) if (p not in add_rows_profiles)]) + network = network.loc[(test_in_profiles(network["preferredName_A"])&test_in_profiles(network["preferredName_B"]))] + ## Write down the list of missing genes in profiles + genes_in_network = list(set(list(network["preferredName_A"])+list(network["preferredName_B"]))) + if (len(model_genes)>len(genes_in_network)): + with open(file_folder+"missing_genes_in_filtered_network.txt", "w") as f: + missing_genes = [g for g in list(profiles.index) if (g not in genes_in_network+add_rows_profiles)] + f.write("\n".join(missing_genes)) + model_genes = genes_in_network ## Ensure that all considered edges are unique network = network.drop_duplicates(keep="first") - print("... %d genes, %d unique edges involving genes in experiments" % (len(model_genes), network.shape[0])) + print("... %d unique edges involving genes both in experiments (%d genes in total)" % (network.shape[0], len(model_genes))) plot_influence_graph(network, "preferredName_A","preferredName_B","sign",file_folder+"network",True) - from copy import deepcopy - score_thres = 0 if (string_args is None) else string_args.get("score", 0) edges_file = file_folder+"EDGES_score=%f.tsv" % (score_thres) if (not os.path.exists(edges_file) and ((network["sign"]==2).any() or (network["directed"]==0).any())): @@ -277,12 +381,21 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args network_df.columns = ["Input", "Output", "SSign"] network_df.to_csv(edges_file, sep="\t", index=None) network_df = pd.read_csv(edges_file, sep="\t") - model_genes = list(set(list(network_df["Input"])+list(network_df["Output"]))) + ## Restrict genes to non-isolated genes + model_genes = [m for m in list(set(list(network_df["Input"])+list(network_df["Output"]))) if ((profiles is None) or (m in model_genes))] + + print("... score_STRING %f\t%d genes (non isolated in PPI)\t%d edges in PPI" % (string_args["score"], len(model_genes), network_df.shape[0])) - print("... score_STRING %f\t#genes (non isolated in PPI) %d\t#edges in PPI %d" % (string_args["score"], len(model_genes), network_df.shape[0])) + ## Restrict profiles to non isolated genes + if (profiles is not None): + profiles = profiles.loc[[m for m in model_genes if (m in profiles.index)]+[a for a in add_rows_profiles if (a in profiles.index)]] - ## Restrict to non isolated genes - profiles = profiles.loc[[m for m in model_genes if (m in profiles.index)]+list(profiles.index)[-3:]] + ## Write down the list of missing genes in network + ngenes = len([p for p in profiles.index if (p not in add_rows_profiles)]) + if ((profiles is not None) and (len(model_genes)>ngenes)): + with open(file_folder+"missing_genes_in_network.txt", "w") as f: + missing_genes = [g for g in model_genes if (g not in list(profiles.index))] + f.write("\n".join(missing_genes)) ################################### ## BUILD TOPOLOGICAL CONSTRAINTS ## @@ -291,21 +404,30 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args influences_fname = file_folder+"INFLUENCES_"+"-".join(cell_lines[:4])+"_tau="+str(edge_args.get("tau", 0))+"_beta="+str(edge_args.get("beta", 1))+"_score_thres="+str(score_thres)+".csv" if (not os.path.exists(influences_fname) and (network_fname is None or edge_args.get("tau", False))): - expr_df = profiles.iloc[:-3,:].apply(pd.to_numeric) - influences = build_influences(network_df, edge_args.get("tau", 0), beta=edge_args.get("beta", 1), cor_method=edge_args.get("cor_method", "pearson"), expr_df=expr_df) + expr_df = None if (profiles is None) else profiles.loc[[g for g in profiles.index if (g not in add_rows_profiles)]].apply(pd.to_numeric) + influences = build_influences(network_df, edge_args.get("tau", 0), beta=edge_args.get("beta", 1), cor_method=edge_args.get("cor_method", "pearson"), expr_df=expr_df if (not pd.isnull(expr_df.values).all()) else None, accept_nonRNA=accept_nonRNA) influences.to_csv(influences_fname) elif (not os.path.exists(influences_fname)): network_df.index = ["-".join(list(map(str, list(network_df.loc[idx][["Input","Output"]])))) for idx in network_df.index] influences = network_df.groupby(level=0).max() network_df.index = range(network_df.shape[0]) influences = influences.pivot_table(index="Input",columns="Output",values="SSign", fill_value=0) + missing_index = [g for g in influences.columns if (g not in influences.index)] + if (len(missing_index)>0): + missing_index_df = pd.DataFrame([], index=missing_index, columns=influences.columns).fillna(0) + influences = pd.concat((influences, missing_index_df), axis=0) + missing_columns = [g for g in influences.index if (g not in influences.columns)] + if (len(missing_columns)>0): + missing_columns_df = pd.DataFrame([], index=influences.index, columns=missing_columns).fillna(0) + influences = pd.concat((influences, missing_columns_df), axis=1) + assert influences.shape[0]==influences.shape[1] influences[influences<0] = -1 influences[influences>0] = 1 influences.to_csv(influences_fname) influences = pd.read_csv(influences_fname, index_col=0, header=0) - model_genes = list(influences.index) # - profiles = profiles.loc[[m for m in model_genes if (m in profiles.index)]+list(profiles.index)[-3:]] - print("... %d negative, %d positive interactions (%d edges in total), %d non isolated genes in experiments" % (np.sum(influences.values==1), np.sum(influences.values==-1), np.sum(influences.abs().values), len(model_genes))) + if (not pd.isnull(profiles.values).all() is not None): + profiles = profiles.loc[[m for m in list(influences.index) if (m in profiles.index)]+[a for a in add_rows_profiles if (a in profiles.index)]] + print("... %d negative, %d positive undirected interactions (%d edges in total), %d non isolated genes in experiments" % (np.sum(influences.values==-1), np.sum(influences.values==1), np.sum(influences.abs().values), influences.shape[0])) assert np.sum(influences.abs().values) > 0 influences_df = influences.melt(ignore_index=False) @@ -320,7 +442,7 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args ## Signatures are vectors containing values {NaN,0,1} sigs_fname = file_folder+"SIGNATURES_"+"-".join(cell_lines[:4])+"_binthres="+str(sig_args.get("bin_thres", 0.5))+"_thres_iscale="+str(lincs_args.get("thres_iscale", None))+".csv" - if (not os.path.exists(sigs_fname)): + if (not pd.isnull(profiles.values).all() and not os.path.exists(sigs_fname)): save_fname=file_folder+"CELL" if (experiments_fname is None): assert lincs_args @@ -329,29 +451,39 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args save_fname=file_folder+"CELL" user_key = get_user_key(lincs_args["credentials"]) profiles_index = list(profiles.index) - profiles.index = [pert_df.loc[g]["Entrez ID"] for g in profiles_index[:-3]]+profiles_index[-3:] + profiles.index = [int(pert_df.loc[g]["Entrez ID"]) if (g not in add_rows_profiles) else g for g in profiles_index] signatures = profiles2signatures(profiles, user_key, lincs_args["path_to_lincs"], save_fname, thres=sig_args.get("bin_thres", 0.5), selection=lincs_args.get("selection", "distil_ss"), backgroundfile=True, bin_method=sig_args.get("bin_method", "binary"), nbackground_limits=(sig_args.get("min_selection", 4), sig_args.get("max_selection", 50))) profiles.index = profiles_index - signatures.index = [list(pert_df.index)[list(pert_df["Entrez ID"]).index(e)] for e in signatures.index] + signatures.index = [pert_inames[list(pert_df["Entrez ID"]).index(e)] for e in signatures.index] else: - signatures = profiles.loc[profiles.index[:-3]] + signatures = profiles.loc[[g for g in profiles.index if (g not in add_rows_profiles)]] signatures.to_csv(sigs_fname) - signatures = pd.read_csv(sigs_fname, index_col=0, header=0).dropna(how="all") - - from copy import deepcopy - signatures_copy = deepcopy(signatures) - signatures_copy[signatures_copy==0] = -1 - signatures_copy = signatures_copy.fillna(0) - fnorm = np.linalg.norm(signatures_copy.values) + elif (pd.isnull(profiles.values).all()): + signatures = pd.DataFrame([], index=model_genes, columns=["0","1"]) + nb_absent_genes = len(model_genes) + nb_constant_genes = 0 + nb_undetermined_genes, nexps, ncells, ngenes = [0]*4 + fnorm = np.nan + else: + signatures = pd.read_csv(sigs_fname, index_col=0, header=0).dropna(how="all") - nb_absent_genes = len(model_genes)-signatures.shape[0] - nb_constant_genes = signatures.loc[(signatures.mean(axis=1, skipna=True)==0)|(signatures.mean(axis=1, skipna=True)==1)].shape[0]+nb_absent_genes - nb_undetermined_genes = signatures.loc[pd.isnull(signatures.mean(axis=1, skipna=True))].shape[0]+nb_absent_genes + print(signatures) + if (not pd.isnull(signatures.values).all()): + signatures_copy = deepcopy(signatures) + signatures_copy[signatures_copy==0] = -1 + signatures_copy = signatures_copy.fillna(0) + fnorm = np.linalg.norm(signatures_copy.values) - print(("... %d experiments on %d cell lines and %d/%d genes (Frobenius norm signature matrix: %.3f, %d possibly constant genes: %.1f" % (len(set([c for c in signatures.columns if ("initial" not in c)])), len(set([c for c in signatures.columns if ("initial" in c)])), signatures.shape[0], len(model_genes), fnorm, nb_constant_genes, nb_constant_genes*100/len(model_genes)))+"%, "+str(nb_undetermined_genes)+" genes with undetermined status") + nb_absent_genes = len(model_genes)-signatures.shape[0] + nb_constant_genes = signatures.loc[(signatures.mean(axis=1, skipna=True)==0)|(signatures.mean(axis=1, skipna=True)==1)].shape[0]+nb_absent_genes + nb_undetermined_genes = signatures.loc[pd.isnull(signatures.mean(axis=1, skipna=True))].shape[0]+nb_absent_genes + nexps, ncells, ngenes = len(set([c for c in signatures.columns if ("initial" not in c)])), len(set([c for c in signatures.columns if ("initial" in c)])), signatures.shape[0] - plot_signatures(signatures, fname=file_folder+"signatures_binthres="+str(sig_args.get("bin_thres",0.5))) + plot_signatures(signatures, fname=file_folder+"signatures_binthres="+str(sig_args.get("bin_thres",0.5))) + print(("... %d experiments on %d cell lines and %d/%d genes (Frobenius norm signature matrix: %.3f, %d possibly constant genes: %.1f" % (nexps, ncells, ngenes, len(model_genes), fnorm, nb_constant_genes, nb_constant_genes*100/len(model_genes)))+"%, "+str(nb_undetermined_genes)+" genes with undetermined status") + else: + print("... no experiments.") print("\n--- INFER BOOLEAN NETWORK ---") ########################### @@ -364,13 +496,14 @@ def solution_generation(file_folder, taxon_id, path_to_genes=None, disgenet_args if (len(solution_ls)0): + elif (len(bonesis_args.get("exp_ids", []))>0): exps = [cols[i] for i in bonesis_args["exp_ids"]] signatures = signatures[[e for e in exps]]+signatures[["initial_"+cell for cell in list(set([c.split("_")[-1] for c in cols]))]] BO = build_observations(grn, signatures) diff --git a/src/NORDic/UTILS/STRING_utils.py b/src/NORDic/UTILS/STRING_utils.py index 95e4281..d43f925 100755 --- a/src/NORDic/UTILS/STRING_utils.py +++ b/src/NORDic/UTILS/STRING_utils.py @@ -49,6 +49,8 @@ def get_protein_names_from_STRING(gene_list, taxon_id, app_name=None, quiet=Fals sleep(1) from io import StringIO res_df = pd.read_csv(StringIO(results), sep="\t") + if ("Error" in res_df.columns): + return None return res_df[["queryItem", "stringId", "preferredName", "annotation"]] def get_network_from_STRING(gene_list, taxon_id, min_score=0, app_name=None, quiet=False): diff --git a/src/NORDic/UTILS/utils_data.py b/src/NORDic/UTILS/utils_data.py index 386cee5..5edd213 100755 --- a/src/NORDic/UTILS/utils_data.py +++ b/src/NORDic/UTILS/utils_data.py @@ -34,7 +34,7 @@ def request_biodbnet(probe_list, from_, to_, taxon_id, chunksize=500, quiet=Fals args_di = { "method":"db2db", "format": "row", - "inputValues":",".join([x.upper() for x in chunk]), + "inputValues":",".join(chunk), "input":from_, "outputs":to_, "taxonId":str(taxon_id), @@ -88,17 +88,21 @@ def convert_genes_EntrezGene(gene_list, taxon_id, app_name, chunksize=100, missi if (not quiet): print(" STRING ID -> Gene ID (%d probes)" % len(other_ids)) res_df = get_protein_names_from_STRING(other_ids, taxon_id, app_name) - other_ids = [] ## if can't be found automatically... - for idx in list(res_df["queryItem"]): - other_ids.append(missing_genes.get(idx, idx)) - res_df = request_biodbnet(other_ids, "Gene Symbol and Synonyms", "Gene ID", taxon_id, chunksize=chunksize, quiet=quiet) - df_list.append(res_df) + if (res_df is not None): + other_ids = [] + for idx in list(res_df["queryItem"]): + other_ids.append(missing_genes.get(idx, idx)) + res_df = request_biodbnet(other_ids, "Gene Symbol and Synonyms", "Gene ID", taxon_id, chunksize=chunksize, quiet=quiet) + df_list.append(res_df) + if (len(other_ids)>0): + df_list.append(pd.DataFrame([], index=other_ids, columns=["Gene ID"]).fillna("-")) if (len(df_list)>0): probes = pd.concat(tuple(df_list), axis=0) else: probes = None return probes + probes = probes.sort_index(ascending=True) if (not quiet): print("%d probes (successful %d, unsuccessful %d)" % (len(probes), len(probes.loc[probes["Gene ID"]!='-']), len(probes.loc[probes["Gene ID"]=="-"]))) return probes @@ -131,10 +135,13 @@ def convert_EntrezGene_LINCSL1000(EntrezGenes, user_key, quiet=False): entrez_ids[ig] = entrezid if (pert_inames[ig]==g): break - print("\t".join([g, pert_inames[ig], str(ig+1), str(len(EntrezGenes))])) + if (pert_inames[ig] is not None): + print("\t".join([g, pert_inames[ig], str(ig+1), str(len(EntrezGenes))])) pert_inames_ = [p if (p is not None) else "-" for p in pert_inames] entrez_ids_ = [entrez_ids[i] if (pert_inames[i] is not None) else "-" for i in range(len(EntrezGenes))] pert_df = pd.DataFrame([pert_inames_, entrez_ids], columns=EntrezGenes, index=["Gene Symbol", "Entrez ID"]).T + pert_df = pert_df.sort_values(by="Gene Symbol", ascending=True) + pert_df = pert_df.loc[~pert_df.duplicated()] return pert_df def get_all_celllines(pert_inames, user_key, quiet=False): diff --git a/src/NORDic/UTILS/utils_exp.py b/src/NORDic/UTILS/utils_exp.py index d1c3d48..7865aad 100755 --- a/src/NORDic/UTILS/utils_exp.py +++ b/src/NORDic/UTILS/utils_exp.py @@ -32,68 +32,71 @@ def profiles2signatures(profiles_df, user_key, path_to_lincs, save_fname, backgr selection_min, selection_max = nbackground_limits assert selection_min > 0 assert selection_max >= selection_min - cell_lines = list(set([x.split("_")[1] for x in list(profiles_df.loc["sigid"])])) - signatures_list = [] + cell_lines = list(set([x for x in list(profiles_df.loc["cell_line"])])) + signatures_list, conditions_spec = [], ["perturbed", "perturbation"] + add_rows_profiles = ["annotation", "perturbed", "perturbation", "cell_line", "sigid", "interference_scale"] for icell, cell in enumerate(cell_lines): - cell_save_fname = save_fname+"_"+cell+".csv" + cell_save_fname = save_fname+"_"+cell+"_selection="+selection+".csv" ## 1. For each cell line, split control and treated samples into two Pandas DataFrames - profiles__df = profiles_df[[profiles_df.columns[ix] for ix, x in enumerate(profiles_df.loc["sigid"]) if (x.split("_")[1]==cell)]] + profiles__df = profiles_df[[profiles_df.columns[ix] for ix, x in enumerate(profiles_df.loc["cell_line"]) if (x==cell)]] initial_cols = profiles__df.loc["annotation"].apply(pd.to_numeric).values==1 - final_profiles = profiles__df[profiles__df.columns[~initial_cols]] - conditions = list(set(final_profiles.loc["signame"])) - initial_profiles = profiles__df[profiles__df.columns[initial_cols]].iloc[:-3,:].apply(pd.to_numeric) + final_profiles_df = profiles__df[profiles__df.columns[~initial_cols]] + conditions = ["_".join(list(final_profiles_df[idx].loc[conditions_spec])) for idx in final_profiles_df.columns] + initial_profiles = profiles__df[profiles__df.columns[initial_cols]].loc[[idx for idx in profiles__df.index if (idx not in add_rows_profiles)]].apply(pd.to_numeric) initial_profiles.columns = ["Ctrl_rep%d" % (i+1) for i in range(initial_profiles.shape[1])] - final_profiles.columns = [x+"_%d" % (ix+1) for ix, x in enumerate(list(final_profiles.loc["signame"]))] - final_profiles = final_profiles.iloc[:-3,:].apply(pd.to_numeric) + final_profiles_df.columns = [x+"_%d" % (ix+1) for ix, x in enumerate(conditions)] + final_profiles = final_profiles_df.loc[[idx for idx in final_profiles_df.index if (idx not in add_rows_profiles)]].apply(pd.to_numeric) assert final_profiles.shape[1]==np.sum(profiles__df.loc["annotation"].apply(pd.to_numeric).values==2) assert initial_profiles.shape[1]==np.sum(profiles__df.loc["annotation"].apply(pd.to_numeric).values==1) - if (not os.path.exists(cell_save_fname)): + if (not quiet): + print(" Cell %s (%d/%d)" % (cell, icell+1, len(cell_lines)), end="... ") + ## 1'. If required, retrieve background expression data corresponding to the considered cell line + if (not os.path.exists(cell_save_fname) and backgroundfile): + endpoint = "sigs" + method = "filter" + params = { + "where": {"cell_id": cell, "pert_type": "trt_sh"}, + "fields": ["distil_cc_q75", selection, "pct_self_rank_q25", "distil_id", "brew_prefix"] + } + request_url = build_url(endpoint, method, params=params, user_key=user_key) + response = requests.get(request_url) + assert response.status_code == 200 + data = json.loads(response.text) + data = [dt for dt in data if (("distil_id" in dt) and ("distil_cc_q75" in dt) and ("pct_self_rank_q25" in dt))] + ## Select only "gold" signatures, as defined by LINCS L1000 + ## https://clue.io/connectopedia/glossary#is_gold + data = [dt for dt in data if (len(dt["distil_id"])>1)] + assert len(data)>0 + data_gold = [dt for dt in data if ((dt["distil_cc_q75"]>=0.2) and (dt["pct_self_rank_q25"]<=0.05))] + if (len(data_gold)>0): + data = data_gold + ## Select profiles maximizing the "selection" criterion + mselection = np.min([dt[selection] for dt in data]) + max_selection = np.argsort(np.array([dt[selection] if (dt[selection]>=selection_min) else mselection for dt in data])) + max_selection = max_selection[-min(selection_max,len(max_selection)):] + vals_selection = [dt[selection] for dt in [data[i] for i in max_selection]] if (not quiet): - print(" Cell %s (%d/%d)" % (cell, icell+1, len(cell_lines)), end="... ") - ## 1'. If required, retrieve background expression data corresponding to the considered cell line - if (backgroundfile): - endpoint = "sigs" - method = "filter" - params = { - "where": {"cell_id": cell, "pert_type": "trt_sh"}, - "fields": ["distil_cc_q75", selection, "pct_self_rank_q25", "distil_id", "brew_prefix"] - } - request_url = build_url(endpoint, method, params=params, user_key=user_key) - response = requests.get(request_url) - assert response.status_code == 200 - data = json.loads(response.text) - data = [dt for dt in data if (("distil_id" in dt) and ("distil_cc_q75" in dt) and ("pct_self_rank_q25" in dt))] - ## Select only "gold" signatures, as defined by LINCS L1000 - ## https://clue.io/connectopedia/glossary#is_gold - data = [dt for dt in data if ((len(dt["distil_id"])>1) and (dt["distil_cc_q75"]>=0.2) and (dt["pct_self_rank_q25"]<=0.05))] - assert len(data)>0 - ## Select profiles maximizing the "selection" criterion - mselection = np.min([dt[selection] for dt in data]) - max_selection = np.argsort(np.array([dt[selection] if (dt[selection]>=selection_min) else mselection for dt in data])) - max_selection = max_selection[-min(selection_max,len(max_selection)):] - vals_selection = [dt[selection] for dt in [data[i] for i in max_selection]] - if (not quiet): - print(" %d (good) profiles | %d (best) profiles (capped at 50 or min>=%d) (%s max=%.3f, min=%.3f)" % (len(data), len(max_selection), selection_min, selection, np.max(vals_selection), np.min(vals_selection))) - bkgrnd = create_restricted_drug_signatures([did for dt in [data[i] for i in max_selection] for did in dt["distil_id"]], [int(g) for g in list(profiles__df.index)[:-3]], path_to_lincs, which_lvl=[3], strict=False) - bkgrnd.index = [int(g) for g in bkgrnd.index] - ## 2. Aggregate replicates by median values for signature ~ initial condition - initial_profile = initial_profiles.median(axis=1) - initial_profile = initial_profile.loc[~initial_profile.duplicated()] - ## 3. Aggregate values across probes of the same gene - final_profile = final_profiles.T - final_profile.index = ["_".join(idx.split("_")[:-1]) for idx in final_profile.index] - data = final_profile.groupby(level=0).median().T - data.columns = conditions - data = data.loc[~data.index.duplicated()] - data["initial"] = initial_profile.loc[[i for i in data.index if (i in initial_profile.index)]] - data.index = data.index.astype(int) - if (backgroundfile): - data = data.join(bkgrnd, how="inner") - data.to_csv(cell_save_fname) - data = pd.read_csv(cell_save_fname, index_col=0) + print(" %d (good) profiles | %d (best) profiles (capped at 50 or min>=%d) (%s max=%.3f, min=%.3f)" % (len(data), len(max_selection), selection_min, selection, np.max(vals_selection), np.min(vals_selection))) + bkgrnd = create_restricted_drug_signatures([did for dt in [data[i] for i in max_selection] for did in dt["distil_id"]], [int(g) for g in list(profiles__df.index) if (g not in add_rows_profiles)], path_to_lincs, which_lvl=[3], strict=False) + bkgrnd.index = [int(g) for g in bkgrnd.index] + bkgrnd.to_csv(cell_save_fname) + elif (backgroundfile): + bkgrnd = pd.read_csv(cell_save_fname, index_col=0) + ## 2. Aggregate replicates by median values for signature ~ initial condition + initial_profile = initial_profiles.median(axis=1) + initial_profile = initial_profile.loc[~initial_profile.duplicated()] + ## 3. Aggregate values across probes of the same gene + final_profile = final_profiles.T + final_profile.index = ["_".join(list(final_profiles_df[idx].loc[conditions_spec])) for idx in final_profiles_df.columns] + data = final_profile.groupby(level=0).median().T + data = data.loc[~data.index.duplicated()] + data["initial"] = initial_profile.loc[[i for i in data.index if (i in initial_profile.index)]] + data.index = data.index.astype(int) + if (backgroundfile): + data = data.join(bkgrnd, how="inner") ## 4. Binarize profiles signatures = binarize_experiments(data, thres=thres, method=bin_method.split("_CD")[0], strict=not ('CD' in bin_method)) - signatures = signatures[conditions+["initial"]] + signatures = signatures[list(set(conditions))+["initial"]] if ("_CD" in bin_method): for c in conditions: df = pd.concat((final_profiles[[col for col in final_profiles.columns if (c in col)]], initial_profiles), axis=1) @@ -159,8 +162,7 @@ def get_experimental_constraints(cell_lines, pert_types, pert_di, taxon_id, sele entrez_id = pert_di[data["pert_iname"]] if (not quiet): print(" %d experiments so far" % len(signatures)) - signame = "_".join([str(data["pert_iname"]), "OE" if ("_oe" in pert_type) else "KD", data["pert_idose"] if (data["pert_idose"]!="-666") else "NA"]) - treatment = str(data["pert_iname"]) + treatment, perturbation = str(data["pert_iname"]), "OE" if ("_oe" in pert_type) else "KD" ## avoid duplicates if (treatment in perturbed_genes and not quiet): print("\t Duplicated treatment:%s, cell:%s, type:%s" % (treatment, str(data["cell_id"]), str(data["pert_type"]))) @@ -175,11 +177,15 @@ def get_experimental_constraints(cell_lines, pert_types, pert_di, taxon_id, sele if (not quiet): print("... %d genes, %d profiles" % (sigs.shape[0]-3, sigs.shape[1])) perturbed_genes.append(treatment) - sigs.loc["signame"] = [signame]*sigs.shape[1] + sigs.loc["perturbed"] = [treatment]*sigs.shape[1] + sigs.loc["perturbation"] = [perturbation]*sigs.shape[1] + sigs.loc["cell_line"] = [line]*sigs.shape[1] sigs.loc["sigid"] = list(sigs.columns) nexp = len(signatures)+1 sigs.columns = ["Exp"+str(nexp)+":"+str(i)+"-rep"+str(ai+1) for ai, i in enumerate(list(sigs.loc["annotation"]))] signatures.append(sigs) + if (len(signatures)==0): + return pd.DataFrame([], index=pert_inames) signatures = signatures[0].join(signatures[1:], how="outer") signatures = signatures.loc[~signatures.index.duplicated()] return signatures diff --git a/src/NORDic/UTILS/utils_grn.py b/src/NORDic/UTILS/utils_grn.py index 3d02218..d731492 100755 --- a/src/NORDic/UTILS/utils_grn.py +++ b/src/NORDic/UTILS/utils_grn.py @@ -148,8 +148,9 @@ def get_genes_interactions_from_PPI(ppi, connected=False, score=0, filtering=Tru assert all([x in [-1,1,2] for x in list(ppi["sign"])]) assert all([x in [0,1] for x in list(ppi["directed"])]) assert all([x <= 1 and x >= 0 for x in list(ppi["score"])]) - assert all([col in ['preferredName_A', 'preferredName_B', 'sign', 'directed', 'score'] for col in ppi.columns]) - assert ppi.shape[1]==5 + cols = ['preferredName_A', 'preferredName_B', 'sign', 'directed', 'score'] + assert all([col in cols for col in ppi.columns]) + assert ppi.shape[1]==len(cols) ## 1. Double edges depending on whether they are marked as "directed" undirected_edges = ppi.loc[ppi["directed"]==0] undirected_edges.columns = ['preferredName_B', 'preferredName_A', 'sign', 'directed', 'score'] @@ -210,7 +211,7 @@ def get_genes_interactions_from_PPI(ppi, connected=False, score=0, filtering=Tru ppi_accepted.columns = ["Input","Output","SSign"] return ppi_accepted -def build_influences(network_df, tau, beta=1, cor_method="pearson", expr_df=None, quiet=False): +def build_influences(network_df, tau, beta=1, cor_method="pearson", expr_df=None, accept_nonRNA=False, quiet=False): ''' Filters out sign of edges based on gene expression @param\tnetwork_df\tPandas DataFrame: rows/[index] x columns/[["Input", "Output", "SSign"]] interactions @@ -218,6 +219,7 @@ def build_influences(network_df, tau, beta=1, cor_method="pearson", expr_df=None @param\tbeta\tPython integer[default=1]: power applied to the adjacency matrix @param\tcor_method\tPython character string[default="pearson"]: type of correlation @param\texpr_df\tPandas DataFrame[default=None]: rows/[genes] x columns/[samples] gene expression data + @param\taccept_nonRNA\tPython bool[default=False]: if set to False, ignores gene names which are not present in expr_df @param\tquiet\tPython bool[default=False] @return\tinfluences\tPandas DataFrame: rows/[genes] x columns/[genes] signed adjacency matrix with only interactions s.t. corr^beta>=tau ''' @@ -237,19 +239,29 @@ def build_influences(network_df, tau, beta=1, cor_method="pearson", expr_df=None network = network.fillna(0) network[network<0] = -1 network[(network>0)&(network<2)] = 1 + network = network.astype(int) from copy import deepcopy - network_undirected, network_directed = [deepcopy(network) for _ in range(2)] - network_undirected[network_undirected!=2] = 0 - network_directed[network_directed==2] = 0 + network_unsigned, network_signed = [deepcopy(network) for _ in range(2)] + network_unsigned[network_unsigned!=2] = 0 + network_signed[network_signed==2] = 0 # Correlation matrix - df = quantile_normalize(expr_df) - df = df.loc[[g for g in network.index if (g in df.index)]] - coexpr = np.power(df.T.corr(method=cor_method), beta) - coexpr[coexpr.abs()0).astype(float)-(net_mat<0).astype(float) net_mat[net_mat==0] = np.nan - influences = pd.DataFrame(net_mat+network_directed.values, index=network.index, columns=network.columns) + influences = pd.DataFrame(net_mat+network_signed.values, index=network.index, columns=network.columns) influences[np.isnan(influences)] = 0 return influences diff --git a/src/NORDic/UTILS/utils_plot.py b/src/NORDic/UTILS/utils_plot.py index 7ad6d92..71ce1db 100644 --- a/src/NORDic/UTILS/utils_plot.py +++ b/src/NORDic/UTILS/utils_plot.py @@ -19,15 +19,15 @@ def influences2graph(influences, fname, optional=False, compile2png=True, engine graph = ["digraph {"] for idx in influences.columns: if (len(idx)>0): - graph += [idx+" [shape=circle,fillcolor=grey,style=filled];"] + graph += ["\""+idx+"\" [shape=circle,fillcolor=grey,style=filled];"] directed, undirected = ["subgraph D {"], ["subgraph U {"] in_undirected = [] - for x,y in np.argwhere(influences.values != 0).tolist(): + for x,y in np.argwhere(influences.values!=0).tolist(): nx, ny = influences.index[x], influences.columns[y] sign = influences.values[x,y] if ((y,x,sign) in in_undirected): continue - edge = " ".join([nx, "->",ny]) + edge = " ".join(["\""+nx+"\"", "->", "\""+ny+"\""]) edge += " [" if (influences.values[y,x]!=0): edge += "dir=none," @@ -122,7 +122,8 @@ def plot_distributions(profiles, fname="gene_expression_distribution.png", thres @param\tthres\tPython float or None[default=None]: binarization threshold @return\tNone\t ''' - bp = profiles.iloc[:-3,:].T.apply(pd.to_numeric).boxplot(rot=90, figsize=(25,15)) + add_rows_profiles = ["annotation", "perturbed", "perturbation", "cell_line", "sigid"] + bp = profiles.loc[[g for g in profiles.index if (g not in add_rows_profiles)]].T.apply(pd.to_numeric).boxplot(rot=90, figsize=(25,15)) if (str(thres)!="None"): K = profiles.shape[0]-3 for t in [thres, 1-thres]: diff --git a/tests/test_epilepsy_NI.py b/tests/test_epilepsy_NI.py index 6e6e554..55bc81d 100644 --- a/tests/test_epilepsy_NI.py +++ b/tests/test_epilepsy_NI.py @@ -18,7 +18,7 @@ STRING_args = { "credentials": "credentials_STRING.txt", - "score": 0.31, + "score": 0, "version": "v10.5", } @@ -28,7 +28,7 @@ "cell_lines": ["NPC", "SHSY5Y"], "pert_types": ["trt_sh", "trt_oe", "trt_xpr"], "selection": "distil_ss", - "thres_iscale": 0, #0.1, + "thres_iscale": 0, "nsigs": 2, } @@ -41,7 +41,7 @@ } SIG_args = { - "bin_thres": 0.26,#0.155, + "bin_thres": 0.26, "bin_method": "binary", } @@ -50,12 +50,21 @@ BONESIS_args = { "exp_ids": [], "use_diverse": True, - "limit": 1000, + "limit": 1, "niterations": 1, - "exact": False, - "max_maxclause": 4, + "exact": True, + "max_maxclause": 3, } +STRING_args.update({"score": 0.31, "beta": 1}) +EDGE_args.update({"tau": 0, "filter": True, "connected": True}) +SIG_args.update({"bin_thres": 0.26}) #0.155 +LINCS_args.update({"thres_iscale": 0}) #0.1 +BONESIS_args.update({"limit": 1, "exact": False, "max_maxclause": 10}) + +force_experiments=True +accept_nonRNA=False + ## Download epilepsy-related data from download_Refractory_Epilepsy_Data import get_EPILEPSY_genes, file_folder, path_to_genes get_EPILEPSY_genes() @@ -65,7 +74,7 @@ njobs=max(1,cpu_count()-2) NETWORK_fname = None -solution = network_identification(file_folder, taxon_id, path_to_genes, disgenet_args=DISGENET_args, string_args=STRING_args, lincs_args=LINCS_args, edge_args=EDGE_args, sig_args=SIG_args, bonesis_args=BONESIS_args, weights=DESIRABILITY, seed=seed_number, network_fname=NETWORK_fname, njobs=njobs) +solution = network_identification(file_folder, taxon_id, path_to_genes, disgenet_args=DISGENET_args, string_args=STRING_args, lincs_args=LINCS_args, edge_args=EDGE_args, sig_args=SIG_args, bonesis_args=BONESIS_args, weights=DESIRABILITY, seed=seed_number, network_fname=NETWORK_fname, njobs=njobs, force_experiments=force_experiments, accept_nonRNA=accept_nonRNA) if (solution is not None): solution = load_grn(file_folder+"solution.bnet") ## Convert to Cytoscape-readable file diff --git a/tests/test_psy_NI.py b/tests/test_psy_NI.py new file mode 100644 index 0000000..ed20897 --- /dev/null +++ b/tests/test_psy_NI.py @@ -0,0 +1,103 @@ +#coding: utf-8 + +import imports + +import pandas as pd +from subprocess import call as sbcall +import os + +data_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/NetworkOrientedRepurposingofDrugs/" +root_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/MDD/" +#file_folder=root_folder+"MDDFemale_JURKAT/" ####"MDDMale_onlyRNA/" ### "MDD_MaleBrain/" ## "MDD_MaleJURKAT" +file_folder=root_folder+"MDDMale_JURKAT/" +if ("MDDFemale_JURKAT/" in file_folder): + module_fname=root_folder+"PourClemence/FEMALE_ME_129.txt" +if ("MDDMale_JURKAT/" in file_folder): + module_fname=root_folder+"PourClemence/MALE_ME_48.txt" + +sbcall("mkdir -p "+file_folder, shell=True) + +## Create network accepted by NORDic +network = pd.read_csv(module_fname, index_col=0, sep=",") +#network = network.loc[network["Origin"]=="mRNA"] #### +network = network[[c for c in network.columns if (c!="Origin")]] +network.columns = ["preferredName_A", "preferredName_B", "score"] +network["sign"] = [(-1)**int(network.loc[i]["score"]<0) for i in network.index] +network["score"] = network["score"].abs() +network["directed"] = 1 +network = network[["preferredName_A", "preferredName_B", "sign", "directed", "score"]] + +network_fname = file_folder+"network.tsv" +network.to_csv(network_fname, index=None, sep="\t") + +## Registration to databases +DisGeNET_credentials = data_folder+"tests/credentials_DISGENET.txt" +STRING_credentials = data_folder+"tests/credentials_STRING.txt" +LINCS_credentials = data_folder+"tests/credentials_LINCS.txt" +path_to_genes = None +network_fname = file_folder+"network.tsv" +cell_lines = ["JURKAT"] #### ["NPC","SHSY-5Y"] ## ["JURKAT"] + +## Parameters +seed_number=123456 +from multiprocessing import cpu_count +njobs=max(1,cpu_count()-2) +taxon_id=9606 # human species +disease_cids=["C1269683"] # major depressive disorder + +## Information about the disease +DISGENET_args = {"credentials": DisGeNET_credentials, "disease_cids": disease_cids} + +## Selection of parameters relative to the prior knowledge network +STRING_args = {"credentials": STRING_credentials, "score": 0} +EDGE_args = {"tau": 0, "filter": True, "connected": True} + +## Selection of parameters relative to experimental constraints +LINCS_args = {"path_to_lincs": data_folder+"lincs/", "credentials": LINCS_credentials, + "cell_lines": cell_lines, "thres_iscale": 0} +SIG_args = {"bin_thres": 0.5} + +## Selection of parameters relative to the inference of networks +BONESIS_args = {"limit": 1, "exact": True, "max_maxclause": 3} + +## Advanced +DESIRABILITY = {"DS": 3, "CL": 3, "Centr": 3, "GT": 1} + +if ("MDDFemale_JURKAT/" in file_folder): + STRING_args.update({"score": 0, "beta": 1}) + EDGE_args.update({"tau": 0, "filter": False, "connected": False}) + SIG_args.update({"bin_thres": 0.27}) + LINCS_args.update({"thres_iscale": 0.2}) + BONESIS_args.update({"limit": 1000, "exact": False, "max_maxclause": 20}) + +####STRING_args.update({"score": 0, "beta": 1}) +####EDGE_args.update({"tau": 0, "filter": False, "connected": False}) +####SIG_args.update({"bin_thres": 0.125}) +####LINCS_args.update({"thres_iscale": 0}) +####BONESIS_args.update({"limit": 1, "exact": False, "max_maxclause": 10}) + +###STRING_args.update({"score": 0, "beta": 1}) +###EDGE_args.update({"tau": 0, "filter": False, "connected": False}) +###SIG_args.update({"bin_thres": 0.16}) ## 0.165 +###LINCS_args.update({"thres_iscale": 0}) +###BONESIS_args.update({"limit": 200, "exact": False, "max_maxclause": 10}) + +if ("MDDMale_JURKAT" in file_folder): + STRING_args.update({"score": 0, "beta": 1}) + EDGE_args.update({"tau": 0, "filter": False, "connected": False}) + SIG_args.update({"bin_thres": 0.11}) + LINCS_args.update({"thres_iscale": 0}) + BONESIS_args.update({"limit": 50, "exact": True, "max_maxclause": 10}) + +from NORDic.NORDic_NI.functions import network_identification + +MME48_solution = network_identification(file_folder, taxon_id, + path_to_genes=None, disgenet_args=DISGENET_args, + string_args=STRING_args, lincs_args=LINCS_args, edge_args=EDGE_args, + sig_args=SIG_args, bonesis_args=BONESIS_args, weights=DESIRABILITY, + seed=seed_number, network_fname=network_fname, njobs=njobs, force_experiments=True, accept_nonRNA=True) + +if (os.path.exists(file_folder+"solution.bnet")): + with open(file_folder+"solution.bnet", "r") as f: + network = f.read().split("\n") + print(network) diff --git a/tests/test_psy_PMR.py b/tests/test_psy_PMR.py new file mode 100644 index 0000000..7341b39 --- /dev/null +++ b/tests/test_psy_PMR.py @@ -0,0 +1,112 @@ +# coding:utf-8 + +import imports +import os +from subprocess import call as sbcall +from glob import glob +import pandas as pd +import numpy as np + +LINCS_args = { + "path_to_lincs": "../lincs/", +} + +data_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/NetworkOrientedRepurposingofDrugs/" +root_folder="/media/kali/1b80f30d-2803-4260-a792-9ae206084252/Code/M30/MDD/" +file_folder=root_folder+"MDDFemale_JURKAT/" ##"MDDMale_JURKAT/" +if ("MDDFemale_JURKAT/" in file_folder): + sex="F" +else: + sex="M" +module_path=root_folder+"PourClemence/" + +seed_number=0 +k=1 +solution_fname=file_folder+"solution.bnet" + +from multiprocessing import cpu_count +njobs=max(1,cpu_count()-2) + +from NORDic.NORDic_PMR.functions import greedy +from NORDic.UTILS.utils_state import binarize_experiments +from NORDic.UTILS.utils_grn import load_grn +from NORDic.UTILS.utils_data import request_biodbnet + +## 1. Get genes +with open(solution_fname, "r") as f: + network = str(f.read()) +genes = [x.split(" <- ")[0] for x in network.split("\n")] +gene_outputs = [x.split(" <- ")[0] for x in network.split("\n") if (x.split(" <- ")[1] not in [x.split(" <- ")[0], "0", "1"])] + +## 2. Create states (mRNA, miRNA, conditions) +if (not os.path.exists(root_folder+"MDD_omics_"+sex+".csv")): + cmd_contents = "; ".join([ + "MDD_omics <- readRDS(\""+module_path+"ALL_MDD_omic_miRNAmRNADNAm.RDS\")", + "df <- merge(x=MDD_omics$miRNA, y=MDD_omics$mRNA, by=\"row.names\", all=TRUE)", + "rownames(df) <- rownames(MDD_omics$miRNA)", + "metadata <- read.csv(\""+module_path+"CovariatesCommon.csv\", sep=\";\", row.names=1)", + "metadata <- metadata[metadata$SEX==\""+sex+"\",]", + "df <- df[rownames(metadata),]", + "df$annotation <- sapply(metadata[rownames(df),]$GROUP, function(x) 1+as.integer(x==\"MDD\"))", + "df$Row.names <- NULL", + "write.csv(t(df), \""+root_folder+"MDD_omics_"+sex+".csv\")", + ]) + sbcall('R -e \''+cmd_contents+'\'', shell=True) +MDD_omics = pd.read_csv(root_folder+"MDD_omics_"+sex+".csv", index_col=0, dtype=str) +samples = list(MDD_omics.loc["annotation"].astype(int)) +MDD_omics = MDD_omics.loc[[a for a in MDD_omics.index if (a not in ["annotation"])]].apply(pd.to_numeric) +MDD_omics = MDD_omics.loc[~MDD_omics.index.duplicated()] +states = binarize_experiments(MDD_omics, thres=0.5, method="binary").astype(int) +states = states[[c for ic, c in enumerate(states.columns) if (samples[ic]==2)]] +states.columns = ["State%d" % i for i in range(states.shape[1])] +states.index = [".".join(g.split("-")) for g in states.index] +if (not os.path.exists(root_folder+"matches.csv")): + probes = request_biodbnet(["-".join(g.split(".")) for g in list(states.index)], from_="Ensembl Gene ID", to_="Gene Symbol", taxon_id=9606) + probes.to_csv(root_folder+"matches.csv") +probes = pd.read_csv(root_folder+"matches.csv", index_col=0) +probes.loc["ENSG00000163156"] = "SCNM1" +probes.loc["ENSG00000137767"] = "SQOR" +probes.loc["ENSG00000234506"] = "LINC01506" +probes.loc["ENSG00000164032"] = "H2AZ1" +probes.index = [".".join(g.split("-")) for g in probes.index] +probes = probes.loc[(np.vectorize(lambda x : "miR" in x)(list(probes.index)))|(probes["Gene Symbol"]!="-")] +states = states.loc[probes.index] +states.index = [".".join(x.split("-")) if ("miR" in x) else ".".join(probes.loc[x]["Gene Symbol"].split("-")) for x in states.index] +states = states.loc[[".".join(g.split("-")) for g in genes if (".".join(g.split("-")) in states.index)]] +if ("MDDMale_JURKAT/" in file_folder): + states.loc["SQRDL"] = states.loc["SQOR"] +else: + states.loc["H2AFZ"] = states.loc["H2AZ1"] + +states.index = ["/".join(g.split("-")) for g in states.index] +genes = [".".join(g.split("-")) for g in genes] +gene_outputs = [".".join(g.split("-")) for g in gene_outputs] + +## 3. Convert to an accepted format for GRNs +grn_fname=file_folder+"grn.bnet" +if (not os.path.exists(grn_fname)): + fname_ls = glob(solution_fname) + assert len(fname_ls)>0 + sbcall("sed 's/ <- /, /g' "+fname_ls[0]+" | sed 's/-/./g' > "+grn_fname, shell=True) + +IM_params = { + "seed": seed_number, + "gene_inputs": genes, # genes to be perturbed + "gene_outputs": gene_outputs # genes to be observed +} +SIMU_params = { + 'nb_sims': 1000, + 'rates': "fully_asynchronous", + 'thread_count': njobs, + 'depth': "constant_unitary", +} + +######################################### +## Test for master regulator detection ## +######################################### + +S, spreads = greedy(grn_fname, k, states, IM_params, SIMU_params, save_folder=file_folder) + +print("ANSWER:\n%s" % str(S)) + +print(spreads.sort_values(by=spreads.columns[0], ascending=False))