diff --git a/code/Netrem_model_builder.py b/code/Netrem_model_builder.py
index 3ba8e69..f25576f 100644
--- a/code/Netrem_model_builder.py
+++ b/code/Netrem_model_builder.py
@@ -379,6 +379,8 @@ def fit(self, X, y): # fits a model Function used for model training
         if len(selected_cols) == 0:
             self.model_nonzero_coef_df = None
             self.num_final_predictors = 0
+            self.final_corr_vs_coef_df = pd.DataFrame()
+            self.combined_df = pd.DataFrame()
         else:
             self.model_nonzero_coef_df = self.model_coef_df[selected_cols]
             if len(selected_cols) > 1: # and self.model_type != "Linear":
@@ -386,17 +388,23 @@ def fit(self, X, y): # fits a model Function used for model training
                 self.num_final_predictors = len(selected_cols)
                 if "y_intercept" in selected_cols:
                     self.num_final_predictors = self.num_final_predictors - 1
+            else:
+                self.final_corr_vs_coef_df = pd.DataFrame()
+                self.combined_df = pd.DataFrame()
+        self.organize_B_interaction_list()
+        self.TF_coord_scores_pairwise_df = return_TF_coord_scores_df(self)
         return self

     def netrem_model_predictor_results(self, y): # olders
         """ :) Please note that this function by Saniya works on a netrem model and returns information about the predictors
         such as their Pearson correlations with y, their rankings as well.
-        It returns: sorted_df, final_corr_vs_coef_df, combined_df """
+        It returns: sorted_df, final_corr_vs_coef_df, combined_df """
         abs_df = self.model_nonzero_coef_df.replace("None", np.nan).apply(pd.to_numeric, errors='coerce').abs()
         if abs_df.shape[0] == 1:
             abs_df = pd.DataFrame([abs_df.squeeze()])
+
         sorted_series = abs_df.squeeze().sort_values(ascending=False)
         sorted_df = pd.DataFrame(sorted_series) # convert the sorted series back to a DataFrame
         sorted_df['Rank'] = range(1, len(sorted_df) + 1) # add a column for the rank
@@ -417,11 +425,11 @@ def netrem_model_predictor_results(self, y): # olders
         sorting["input_data"] = "X_train"
         all_df = pd.concat([all_df, sorting])
         self.corr_vs_coef_df = all_df
-        self.final_corr_vs_coef_df = self.corr_vs_coef_df[["info", "input_data"] + self.model_nonzero_coef_df.columns.tolist()[1:]]
         netrem_model_df = self.model_nonzero_coef_df.transpose()
         netrem_model_df.columns = ["coef"]
         netrem_model_df["TF"] = netrem_model_df.index.tolist()
         netrem_model_df["TG"] = tg
+        self.final_corr_vs_coef_df = self.corr_vs_coef_df[["info", "input_data"] + self.model_nonzero_coef_df.columns.tolist()[1:]]
         if self.y_intercept:
             netrem_model_df["info"] = "netrem_with_intercept"
         else:
@@ -1167,6 +1175,13 @@ def netremCV(edge_list, X, y,
     return newest_netrem


+# Function to look up the fitted coefficient of a given TF (predictor) in a NetREm model
+def lookup_coef(tf, netrem_model):
+    model_nonzero_coef_df = netrem_model.model_nonzero_coef_df
+    coef_series = model_nonzero_coef_df.iloc[0].drop('y_intercept', errors='ignore') # errors='ignore' tolerates models fit without an intercept
+    return coef_series.get(tf, 0) # returns 0 if the TF is not found
+
+
 def organize_predictor_interaction_network(netrem_model):
     if "model_nonzero_coef_df" not in vars(netrem_model).keys():
         print(":( No NetREm model was built")
     if "model_type" in TF_interaction_df.columns.tolist():
         TF_interaction_df = TF_interaction_df.drop(columns = ["model_type"])
     num_TFs = TF_interaction_df.shape[0]
+    TF_coord_scores_pairwise_df = netrem_model.TF_coord_scores_pairwise_df
+    if TF_coord_scores_pairwise_df.shape[0] == 0:
+        return None
     TF_interaction_df = netrem_model.TF_coord_scores_pairwise_df.drop(columns = ["absVal_coord_score"])
     TF_interaction_df = TF_interaction_df.rename(columns = {"coordination_score":"coord_score_cs"})
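
A minimal usage sketch of the new lookup_coef helper (hedged: netrem_demo stands in for any fitted NetREm model whose model_nonzero_coef_df holds one row of nonzero coefficients, and the TF names are illustrative):

    import Netrem_model_builder as nm
    c1 = nm.lookup_coef("TF1", netrem_demo)  # fitted coefficient for predictor TF1
    c9 = nm.lookup_coef("TF9", netrem_demo)  # TF9 was not selected, so this returns 0
    print(c1, c9)
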
@@ -1205,8 +1223,9 @@ def organize_predictor_interaction_network(netrem_model):
     # Step 3: Calculate the percentile
     TF_interaction_df['cs_magnitude_percentile'] = (1 - (TF_interaction_df['cs_magnitude_rank'] / TF_interaction_df['absVal_coordScore'].count())) * 100
+    TF_interaction_df["TF_TF"] = TF_interaction_df["node_1"] + "_" + TF_interaction_df["node_2"]

-    TF_interaction_df["TF_TF"] = TF_interaction_df["TF1"] + "_" + TF_interaction_df["TF2"]
+    #TF_interaction_df["TF_TF"] = TF_interaction_df["TF1"] + "_" + TF_interaction_df["TF2"]

     standardized_data = True
     if "old_X_df" in vars(netrem_model).keys():
@@ -1216,43 +1235,36 @@ def organize_predictor_interaction_network(netrem_model):
         original_X_corr_mat = netrem_model.X_df.corr()
         standardized_data = False
     # Melting the DataFrame into a 3-column edge list
-    original_X_corr_df = original_X_corr_mat.reset_index().melt(id_vars=["index"], var_name="TF2", value_name="original_corr")
-    original_X_corr_df.rename(columns={"index": "TF1"}, inplace=True)
-    original_X_corr_df = original_X_corr_df[original_X_corr_df["TF1"] != original_X_corr_df["TF2"]]
-    original_X_corr_df["TF_TF"] = original_X_corr_df["TF1"] + "_" + original_X_corr_df["TF2"]
+    original_X_corr_df = original_X_corr_mat.reset_index().melt(id_vars=["index"], var_name="node_2", value_name="original_corr")
+    original_X_corr_df.rename(columns={"index": "node_1"}, inplace=True)
+    original_X_corr_df = original_X_corr_df[original_X_corr_df["node_1"] != original_X_corr_df["node_2"]]
+    original_X_corr_df["TF_TF"] = original_X_corr_df["node_1"] + "_" + original_X_corr_df["node_2"]
     # Display the first few rows to verify the format
     if standardized_data:
         # Melting the DataFrame into a 3-column edge list
-        standardized_X_corr_df = standardized_X_corr_mat.reset_index().melt(id_vars=["index"], var_name="TF2", value_name="standardized_corr")
-        standardized_X_corr_df.rename(columns={"index": "TF1"}, inplace=True)
-        standardized_X_corr_df = standardized_X_corr_df[standardized_X_corr_df["TF1"] != standardized_X_corr_df["TF2"]]
-        standardized_X_corr_df["TF_TF"] = standardized_X_corr_df["TF1"] + "_" + standardized_X_corr_df["TF2"]
-        standardized_X_corr_df.drop(columns = ["TF1", "TF2"], inplace = True)
-        original_X_corr_df = pd.merge(original_X_corr_df, standardized_X_corr_df).drop(columns = ["TF1", "TF2"])
-
-
+        standardized_X_corr_df = standardized_X_corr_mat.reset_index().melt(id_vars=["index"], var_name="node_2", value_name="standardized_corr")
+        standardized_X_corr_df.rename(columns={"index": "node_1"}, inplace=True)
+        standardized_X_corr_df = standardized_X_corr_df[standardized_X_corr_df["node_1"] != standardized_X_corr_df["node_2"]]
+        standardized_X_corr_df["TF_TF"] = standardized_X_corr_df["node_1"] + "_" + standardized_X_corr_df["node_2"]
+        standardized_X_corr_df.drop(columns = ["node_1", "node_2"], inplace = True)
+        original_X_corr_df = pd.merge(original_X_corr_df, standardized_X_corr_df).drop(columns = ["node_1", "node_2"])
     TF_interaction_df = pd.merge(TF_interaction_df, original_X_corr_df)
-
     default_edge_w = netrem_model.network.default_edge_weight
     if "W_df" in vars(netrem_model.network).keys():
         W_df = netrem_model.network.W_df
     else:
         W_df = netrem_model.W_df
-
-    ppi_net_df = W_df.reset_index().melt(id_vars=["index"], var_name="TF2", value_name="PPI_score")
-    ppi_net_df.rename(columns={"index": "TF1"}, inplace=True)
-
+    ppi_net_df = W_df.reset_index().melt(id_vars=["index"], var_name="node_2", value_name="PPI_score")
+    ppi_net_df.rename(columns={"index": "node_1"}, inplace=True)
     ppi_net_df["novel_link"] = np.where((ppi_net_df.PPI_score == default_edge_w), "yes", "no")
-    ppi_net_df["TF_TF"] = ppi_net_df["TF1"] + "_" + ppi_net_df["TF2"]
-    ppi_net_df = ppi_net_df[ppi_net_df["TF1"] != ppi_net_df["TF2"]] # 42849 rows × 3 columns
-    ppi_net_df = ppi_net_df.drop(columns = ["TF1", "TF2"])
-
+    ppi_net_df["TF_TF"] = ppi_net_df["node_1"] + "_" + ppi_net_df["node_2"]
+    ppi_net_df = ppi_net_df[ppi_net_df["node_1"] != ppi_net_df["node_2"]] # 42849 rows × 3 columns
+    ppi_net_df = ppi_net_df.drop(columns = ["node_1", "node_2"])
     TF_interaction_df = pd.merge(TF_interaction_df, ppi_net_df)
     TF_interaction_df["absVal_diff_cs_and_originalCorr"] = abs(TF_interaction_df["coord_score_cs"] - TF_interaction_df["standardized_corr"])
-
-    TF_interaction_df['c_1'] = TF_interaction_df['TF1'].apply(lookup_coef, args=(netrem_model,))
-    TF_interaction_df['c_2'] = TF_interaction_df['TF2'].apply(lookup_coef, args=(netrem_model,))
+    TF_interaction_df['c_1'] = TF_interaction_df['node_1'].apply(lookup_coef, args=(netrem_model,))
+    TF_interaction_df['c_2'] = TF_interaction_df['node_2'].apply(lookup_coef, args=(netrem_model,))

     return TF_interaction_df
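
The node_1/node_2 renaming above standardizes a melt pattern that organize_predictor_interaction_network now uses three times (original correlations, standardized correlations, and the PPI weights W_df). A self-contained sketch of the pattern, with a toy 3x3 matrix standing in for original_X_corr_mat:

    import numpy as np
    import pandas as pd

    # toy symmetric matrix standing in for a TF-TF correlation matrix
    mat = pd.DataFrame([[1.0, 0.8, 0.2],
                        [0.8, 1.0, 0.5],
                        [0.2, 0.5, 1.0]],
                       index=["TF1", "TF2", "TF3"],
                       columns=["TF1", "TF2", "TF3"])

    # melt the square matrix into a 3-column edge list: node_1, node_2, value
    edge_df = mat.reset_index().melt(id_vars=["index"], var_name="node_2", value_name="original_corr")
    edge_df.rename(columns={"index": "node_1"}, inplace=True)
    edge_df = edge_df[edge_df["node_1"] != edge_df["node_2"]]       # drop self-edges
    edge_df["TF_TF"] = edge_df["node_1"] + "_" + edge_df["node_2"]  # merge key used above
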
diff --git a/code/netrem_evaluation_functions.py b/code/netrem_evaluation_functions.py
index ff0d723..4d63c35 100644
--- a/code/netrem_evaluation_functions.py
+++ b/code/netrem_evaluation_functions.py
@@ -36,8 +36,8 @@ from skopt.utils import use_named_args
 #import Netrem_model_builder as nm
 from sklearn import linear_model, preprocessing # 9/19
-import NetremmerFinal2024 as nm
-from NetremmerFinal2024 import *
+import Netrem_model_builder as nm
+from Netrem_model_builder import *

 class BayesianObjective_Lasso:
     def __init__(self, X, y, cv_folds, model, scorer="mse", print_network=False):
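
A quick sanity check for the corrected import (a sketch; assumes the code/ directory is on sys.path):

    import Netrem_model_builder as nm
    import netrem_evaluation_functions as nm_eval
    print(nm.netrem)                        # the model builder resolves
    print(nm_eval.BayesianObjective_Lasso)  # the evaluation module imports cleanly again
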
diff --git a/code/previous_version/demo_toy.py b/code/previous_version/demo_toy.py
new file mode 100644
index 0000000..51a81e0
--- /dev/null
+++ b/code/previous_version/demo_toy.py
@@ -0,0 +1,65 @@
+import sys
+sys.path.append("../code") # assuming "code" is one directory up and then down into "code"
+
+from DemoDataBuilderXandY import generate_dummy_data
+from Netrem_model_builder import netrem, netremCV
+import PriorGraphNetwork as graph
+import error_metrics as em
+import essential_functions as ef
+import netrem_evaluation_functions as nm_eval
+import Netrem_model_builder as nm
+
+dummy_data = generate_dummy_data(corrVals = [0.9, 0.5, 0.4, -0.3, -0.8], # the # of elements in corrVals is the # of predictors (X)
+                                 num_samples_M = 100000, # the number of samples M
+                                 train_data_percent = 70) # the remainder out of 100,000 will be kept for testing. If 100, then ALL data is used for training and testing.
+
+X_df = dummy_data.X_df
+X_df.head()
+
+y_df = dummy_data.y_df
+y_df.head()
+
+# 70,000 samples for training data (used to train and fit GRegulNet model)
+X_train = dummy_data.view_X_train_df()
+y_train = dummy_data.view_y_train_df()
+
+# 30,000 samples for testing data
+X_test = dummy_data.view_X_test_df()
+y_test = dummy_data.view_y_test_df()
+
+X_train.corr() # pairwise correlations among the training samples
+X_test.corr() # pairwise correlations among the testing samples
+
+
+# prior network edge_list (missing edges or edges with no edge weight will be assigned the default_edge_weight so the network is fully-connected):
+edge_list = [["TF1", "TF2", 0.8], ["TF4", "TF5", 0.95], ["TF1", "TF3"], ["TF1", "TF4"], ["TF1", "TF5"],
+             ["TF2", "TF3"], ["TF2", "TF4"], ["TF2", "TF5"], ["TF3", "TF4"], ["TF3", "TF5"]]
+
+beta_network_val = 1
+# by default, model_type is Lasso, so alpha_lasso_val will be specified for the alpha_lasso parameter.
+# However, we will specify model_type = LassoCV, so alpha_lasso is determined by cross-validation on the training data.
+
+# Building the network regularized regression model:
+# By default, edges are constructed between all of the nodes; nodes with a missing edge are assigned the default_edge_weight.
+netrem_demo = netrem(edge_list = edge_list,
+                     beta_net = beta_network_val,
+                     model_type = "LassoCV",
+                     view_network = True)
+
+# Fitting the NetREm model on training data: X_train and y_train:
+netrem_demo.fit(X_train, y_train)
+
+
+pred_y_test = netrem_demo.predict(X_test) # predicted values for y_test
+mse_test = netrem_demo.test_mse(X_test, y_test)
+print(f"Please note that the testing Mean Square Error (MSE) is {mse_test}")
+
+# To view and extract the predicted model coefficients for the predictors:
+netrem_demo.model_coef_df
+
+netrem_demo.B_interaction_df
+
+
+netrem_demo.final_corr_vs_coef_df
+netrem_demo.combined_df
+organize_B_interaction_network(netrem_demo)
diff --git a/demo/demo_toy.py b/demo/demo_toy.py
index 51a81e0..59145a3 100644
--- a/demo/demo_toy.py
+++ b/demo/demo_toy.py
@@ -8,10 +8,12 @@ import essential_functions as ef
 import netrem_evaluation_functions as nm_eval
 import Netrem_model_builder as nm
-
-dummy_data = generate_dummy_data(corrVals = [0.9, 0.5, 0.4, -0.3, -0.8], # the # of elements in corrVals is the # of predictors (X)
+M = 10000 # number of samples (e.g. single-cells)
+corrs_list = [0.9, 0.5, 0.4, -0.3, -0.8] # the individual correlations (r) of each X variable with y
+dummy_data = generate_dummy_data(corrVals = corrs_list, # the # of elements in corrVals is the # of predictors (X)
-                                 num_samples_M = 100000, # the number of samples M
+                                 num_samples_M = M, # the number of samples M
-                                 train_data_percent = 70) # the remainder out of 100,000 will be kept for testing. If 100, then ALL data is used for training and testing.
+                                 sparsity_factor_perc = 40, # approximately what % of data for each variable is sparse (0)
+                                 train_data_percent = 70) # the remainder out of 10,000 will be kept for testing. If 100, then ALL data is used for training and testing.

 X_df = dummy_data.X_df
 X_df.head()
@@ -19,11 +21,11 @@ y_df = dummy_data.y_df
 y_df.head()

-# 70,000 samples for training data (used to train and fit GRegulNet model)
+# 7,000 samples for training data (used to train and fit GRegulNet model)
 X_train = dummy_data.view_X_train_df()
 y_train = dummy_data.view_y_train_df()

-# 30,000 samples for testing data
+# 3,000 samples for testing data
 X_test = dummy_data.view_X_test_df()
 y_test = dummy_data.view_y_test_df()

 X_train.corr() # pairwise correlations among the training samples
-X_test.corr() # pairwise correlations among the training samples
+X_test.corr() # pairwise correlations among the testing samples
@@ -36,6 +38,7 @@ ["TF2", "TF3"], ["TF2", "TF4"], ["TF2", "TF5"], ["TF3", "TF4"], ["TF3", "TF5"]]

 beta_network_val = 1
+alpha_lasso_val = 0.1
 # by default, model_type is Lasso, so alpha_lasso_val will be specified for the alpha_lasso parameter.
-# However, we will specify model_type = LassoCV, so our alpha_lasso is determined by cross-validation on training data).
+# Here we specify model_type = "Lasso", so alpha_lasso is set directly to alpha_lasso_val rather than by cross-validation.

@@ -43,7 +46,8 @@ # Building the network regularized regression model:
 # By default, edges are constructed between all of the nodes; nodes with a missing edge are assigned the default_edge_weight.
 netrem_demo = netrem(edge_list = edge_list,
                      beta_net = beta_network_val,
-                     model_type = "LassoCV",
+                     alpha_lasso = alpha_lasso_val,
+                     model_type = "Lasso",
                      view_network = True)

 # Fitting the NetREm model on training data: X_train and y_train:
@@ -57,9 +61,9 @@
 # To view and extract the predicted model coefficients for the predictors:
 netrem_demo.model_coef_df

-netrem_demo.B_interaction_df
+netrem_demo.TF_interaction_df

 netrem_demo.final_corr_vs_coef_df
 netrem_demo.combined_df
-organize_B_interaction_network(netrem_demo)
+nm.organize_predictor_interaction_network(netrem_demo)
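
As a follow-on to the updated demo, a short sketch tying the renamed pieces together once netrem_demo is fitted (column names as produced by organize_predictor_interaction_network above; the None check mirrors its early return when no coordination scores are available):

    interaction_df = nm.organize_predictor_interaction_network(netrem_demo)
    if interaction_df is not None:
        print(interaction_df[["TF_TF", "coord_score_cs", "PPI_score", "novel_link"]].head())
    print(nm.lookup_coef("TF1", netrem_demo))  # fitted coefficient for TF1 (0 if not selected)
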