renames

asherp7 · Jan 21, 2019 · 782eec4
1 parent 44b7374
commit 782eec4
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 23 deletions.
diff --git a/hack_proj/lior_test.py b/hack_proj/lior_test.py
@@ -50,8 +50,8 @@ def create_data_01(train_list, test_list):
 
 
 def test_new_markov():
-    markov_order = 2
-    win_size = 51
+    markov_order = 3
+    win_size = 201
     island_markov_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\markov_model_island_%d.pkl' % markov_order
     other_markov_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\markov_model_other_%d.pkl' % markov_order
     test_seq_pkl_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\chr1_125124_145563.pkl'
@@ -61,32 +61,32 @@ def test_new_markov():
     llr_markov = pickle.load(open(island_markov_path, 'rb')) # type: MarkovModel
     other_markov = pickle.load(open(other_markov_path, 'rb')) # type: MarkovModel
     seq, labels = pickle.load(open(test_seq_pkl_path, 'rb'))
-    seq = seq[:100000]
-    labels = labels[:100000]
+    seq = seq[:300000]
+    labels = labels[:300000]
     llr_markov.log_prob_mat = island_markov.log_prob_mat - other_markov.log_prob_mat
     llr_markov.log_transition_mat = island_markov.log_transition_mat - other_markov.log_transition_mat
-    print_transition_mat(island_markov)
+    # print_transition_mat(island_markov)
 
     llr_score = llr_markov.get_ll([seq])
-    llr_score_island = island_markov.get_ll([seq])
-    llr_score_other = other_markov.get_ll([seq])
 
     print('N Count: %d' % count_substr(seq, 'N'))
     new_labels = labels[llr_markov.order:]
     window_score, window_labels = apply_window(llr_score, new_labels, win_size=win_size)
 
     show_roc_curve(window_labels, window_score, print_data=False)
     y_pred = np.zeros_like(window_score)
-    y_pred[window_score > 17.3] = 1
+    y_pred[window_score > 30] = 1
     print_prediction_stats(window_labels, y_pred)
     idx = np.array(range(window_score.size))
 
     plt.figure()
     plot_scores_and_labels(window_score, window_labels, 'LLR', 'Log Likeklihood Ratio - Markov order = %d, Win size = %d' % (markov_order, win_size))
     # plt.figure()
+    # llr_score_other = other_markov.get_ll([seq])
     # window_score_other, window_labels_other = apply_window(llr_score_other, new_labels, win_size=win_size)
     # plot_scores_and_labels(window_score_other, window_labels, 'Other LL', 'Other Sequence Log Likelihood - Markov order %d, Win size = %d' % (markov_order, win_size))
     # plt.figure()
+    # llr_score_island = island_markov.get_ll([seq])
     # window_score_island, window_labels_island = apply_window(llr_score_island, new_labels, win_size=win_size)
     # plot_scores_and_labels(window_score_island, window_labels, 'Island LL', 'Island Sequence Log Likelihood - Markov order %d, Win size = %d' % (markov_order, win_size))
     plt.show()
@@ -110,7 +110,7 @@ def train_markov_models():
                    r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\train_near.pkl']
     output_dir = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01'
 
-    markov_order = 5
+    markov_order = 6
     print('----------------Island data--------------')
     island_data = []
     for p in island_paths:

diff --git a/hack_proj/markov_model.py b/hack_proj/markov_model.py
@@ -17,12 +17,12 @@ def __init__(self, valid_states, order):
         self.valid_states = valid_states
         self.valid_states_dict = list2dict(self.valid_states)
         self.states_num = len(self.valid_states)
-        self.count_states = self._create_order_states(self.order + 1)
-        self.count_states_dict = list2dict(self.count_states)
-        self.base_states = self._create_order_states(self.order)
+        self.full_states = self._create_combined_states(self.order + 1)
+        self.full_states_dict = list2dict(self.full_states)
+        self.base_states = self._create_combined_states(self.order)
         self.base_states_dict = list2dict(self.base_states)
 
-        self.log_prob_mat = np.log(np.ones(len(self.count_states), dtype=float) / len(self.count_states))
+        self.log_prob_mat = np.log(np.ones(len(self.full_states), dtype=float) / len(self.full_states))
         self.log_transition_mat = np.log(np.ones((pow(self.states_num, self.order), self.states_num), dtype=float) / self.states_num)
 
     def get_prob_mat(self):
@@ -31,12 +31,12 @@ def get_prob_mat(self):
     def get_transition_mat(self):
         return np.exp(self.log_transition_mat)
 
-    def _create_order_states(self, len):
+    def _create_combined_states(self, len):
         return list([''.join(p) for p in itertools.product(self.valid_states, repeat=len)])
 
-    def _count_order_states(self, x, smooth=1):
+    def _count_full_states(self, x, smooth=1):
         count_dict = {}
-        for k in self.count_states:
+        for k in self.full_states:
             count_dict[k] = smooth
 
         for val in x:
@@ -48,11 +48,11 @@ def _count_order_states(self, x, smooth=1):
         return count_dict
 
     def fit_transition(self, x):
-        prob_mat = np.ones(len(self.count_states), dtype=float) / len(self.count_states)
+        prob_mat = np.ones(len(self.full_states), dtype=float) / len(self.full_states)
         transition_mat = np.ones((pow(self.states_num, self.order), self.states_num), dtype=float) / self.states_num
 
-        # Count all count states
-        count_dict = self._count_order_states(x)
+        # Count all full states
+        count_dict = self._count_full_states(x)
 
         # Count base states to normalize by
         norm_dict = {}
@@ -62,7 +62,7 @@ def fit_transition(self, x):
             norm_dict[k[:-1]] += v
 
         # Create probability mat for the full states
-        for i, k in enumerate(self.count_states):
+        for i, k in enumerate(self.full_states):
             prob_mat[i] = count_dict[k] / norm_dict[k[:-1]]
 
         # Create transition mat from base states
@@ -79,7 +79,7 @@ def str_to_order_states(self, x):
             curr_ret = []
             for i in range(len(val) - self.order):
                 try:
-                    curr_ret.append(self.count_states_dict[val[i:i + self.order + 1]])
+                    curr_ret.append(self.full_states_dict[val[i:i + self.order + 1]])
                 except:
                     curr_ret.append(MarkovModel.UNKNOWN_VAL)
             ret.append(np.array(curr_ret))
@@ -90,7 +90,7 @@ def get_ll(self, x):
         all_ll = []
         for arr in x_states:
             curr_ll = np.full_like(arr, fill_value=np.nan, dtype=float)
-            for i in range(len(self.count_states)):
+            for i in range(len(self.full_states)):
                 curr_ll[arr == i] = self.log_prob_mat[i]
             self.nan_to_prev(curr_ll, np.mean(self.log_prob_mat))
 
@@ -114,7 +114,7 @@ def nan_to_prev(self, arr, start_val):
 
 def print_prob_mat(markov_model: MarkovModel):
     prob_mat = markov_model.get_prob_mat()
-    for i, k in enumerate(markov_model.count_states):
+    for i, k in enumerate(markov_model.full_states):
         print('%s: %f' % (k, prob_mat[i]))