diff --git a/.gitignore b/.gitignore
index ba00295..1f36604 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,7 +54,4 @@ dist/
 build/
 
 # Ignore data folder
-data/
-
-# Ignore test files
-hack_proj/lior_test.py
\ No newline at end of file
+data/
\ No newline at end of file
diff --git a/hack_proj/lior_test.py b/hack_proj/lior_test.py
new file mode 100644
index 0000000..13897df
--- /dev/null
+++ b/hack_proj/lior_test.py
@@ -0,0 +1,251 @@
+from utils import *
+from Bio import SeqIO, SeqUtils
+import numpy as np
+import pickle
+import time
+from markov_model import *
+
+
+def create_data_01(train_list, test_list):
+    data_dir_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data/'
+    sizes_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\hg19.chrom.sizes'
+    tags_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\CGI.hg19.bed'
+    output_dir = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01'
+
+    genome_tag_range = GenomeTagRange(tags_path, sizes_path)
+    reader = DataReader.DataReader(data_dir_path)
+    data_extractor = DataExtractor(reader, genome_tag_range)
+
+    print('Create train_islands...')
+    pkl_path = os.path.join(output_dir, 'train_islands.pkl')
+    if not os.path.isfile(pkl_path):
+        train_islands, _ = create_island_overlap_data(train_list, data_extractor, pad_size=0, seq_num_list=None)
+        pickle.dump(train_islands, open(os.path.join(output_dir, 'train_islands.pkl'), 'wb'))
+        train_islands.clear()
+
+    print('Create train_random...')
+    pkl_path = os.path.join(output_dir, 'train_random.pkl')
+    if not os.path.isfile(pkl_path):
+        train_random, _ = create_non_island_random_data(train_list, data_extractor, seq_len=2001, min_distance=5000, seq_num_list=500)
+        pickle.dump(train_random, open(os.path.join(output_dir, 'train_random.pkl'), 'wb'))
+        train_random.clear()
+
+    print('Create train_near...')
+    pkl_path = os.path.join(output_dir, 'train_near.pkl')
+    if not os.path.isfile(pkl_path):
+        train_near, _ = create_near_island_data(train_list, data_extractor, seq_num_list=500, distance=1000, seq_len=2001)
+        pickle.dump(train_near, open(os.path.join(output_dir, 'train_near.pkl'), 'wb'))
+        train_near.clear()
+
+    print('Create test padded_islands...')
+    pkl_path = os.path.join(output_dir, 'test_padded_islands_02.pkl')
+    if not os.path.isfile(pkl_path):
+        test_padded_islands, labels = create_island_overlap_data(test_list, data_extractor, pad_size=1500, seq_num_list=None)
+        full_str = ''
+        full_labels = []
+        for i in range(len(test_padded_islands)):
+            full_str = full_str + test_padded_islands[i]
+            full_labels.extend(labels[i])
+        pickle.dump((full_str, full_labels), open(pkl_path, 'wb'))
+
+
+def test_new_markov():
+    markov_order = 2
+    win_size = 51
+    island_markov_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\markov_model_island_%d.pkl' % markov_order
+    other_markov_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\markov_model_other_%d.pkl' % markov_order
+    test_seq_pkl_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\chr1_125124_145563.pkl'
+    test_seq_pkl_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\test_padded_islands_02.pkl'
+
+    island_markov = pickle.load(open(island_markov_path, 'rb'))  # type: MarkovModel
+    llr_markov = pickle.load(open(island_markov_path, 'rb'))  # type: MarkovModel
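+    # llr_markov starts as a second copy of the island model; its matrices are replaced
+    # below with (island - other) log probabilities, so llr_markov.get_ll() returns
+    # per-position log-likelihood-ratio scores rather than plain log-likelihoods.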
+    other_markov = pickle.load(open(other_markov_path, 'rb'))  # type: MarkovModel
+    seq, labels = pickle.load(open(test_seq_pkl_path, 'rb'))
+    seq = seq[:100000]
+    labels = labels[:100000]
+    llr_markov.log_prob_mat = island_markov.log_prob_mat - other_markov.log_prob_mat
+    llr_markov.log_transition_mat = island_markov.log_transition_mat - other_markov.log_transition_mat
+    print_transition_mat(island_markov)
+
+    llr_score = llr_markov.get_ll([seq])
+    llr_score_island = island_markov.get_ll([seq])
+    llr_score_other = other_markov.get_ll([seq])
+
+    print('N Count: %d' % count_substr(seq, 'N'))
+    new_labels = labels[llr_markov.order:]
+    window_score, window_labels = apply_window(llr_score, new_labels, win_size=win_size)
+
+    show_roc_curve(window_labels, window_score, print_data=False)
+    y_pred = np.zeros_like(window_score)
+    y_pred[window_score > 17.3] = 1
+    print_prediction_stats(window_labels, y_pred)
+    idx = np.array(range(window_score.size))
+
+    plt.figure()
+    plot_scores_and_labels(window_score, window_labels, 'LLR', 'Log Likelihood Ratio - Markov order = %d, Win size = %d' % (markov_order, win_size))
+    # plt.figure()
+    # window_score_other, window_labels_other = apply_window(llr_score_other, new_labels, win_size=win_size)
+    # plot_scores_and_labels(window_score_other, window_labels, 'Other LL', 'Other Sequence Log Likelihood - Markov order %d, Win size = %d' % (markov_order, win_size))
+    # plt.figure()
+    # window_score_island, window_labels_island = apply_window(llr_score_island, new_labels, win_size=win_size)
+    # plot_scores_and_labels(window_score_island, window_labels, 'Island LL', 'Island Sequence Log Likelihood - Markov order %d, Win size = %d' % (markov_order, win_size))
+    plt.show()
+
+
+def plot_scores_and_labels(score, labels, score_type, title):
+    idx = np.array(range(score.size))
+    # plt.scatter(range(score.size), normalize_arr(score), marker='.', label=score_type)
+    plt.plot(range(score.size), normalize_arr(score), label=score_type)
+    plt.scatter(idx[labels == 1], labels[labels == 1], marker='.', label='CpG Island', c='orange')
+    plt.scatter(idx[labels == 0], labels[labels == 0], marker='.', label='Other', c='green')
+    plt.xlabel('DNA Sequence location')
+    plt.ylabel('CpG Island score')
+    plt.title(title)
+    plt.legend()
+
+
+def train_markov_models():
+    island_paths = [r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\train_islands.pkl']
+    other_paths = [r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\train_random.pkl',
+                   r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01\train_near.pkl']
+    output_dir = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\create_data_01'
+
+    markov_order = 5
+    print('----------------Island data--------------')
+    island_data = []
+    for p in island_paths:
+        island_data.extend(pickle.load(open(p, 'rb')))
+    markov_model_island = MarkovModel(ALL_BASES, order=markov_order)
+    # markov_model_island.fit_transition(island_data[:10])
+    # print_transition_mat(markov_model_island)
+    # markov_model_island.fit_transition(island_data[:100])
+    # print_transition_mat(markov_model_island)
+    # markov_model_island.fit_transition(island_data[:1000])
+    # print_transition_mat(markov_model_island)
+    markov_model_island.fit_transition(island_data)
+    print_transition_mat(markov_model_island)
+    pickle.dump(markov_model_island, open(os.path.join(output_dir, 'markov_model_island_%d.pkl' % markov_order), 'wb'))
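+    # The background ("other") model is fit the same way, on the random and
+    # near-island training sequences listed in other_paths.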
+    print('----------------Other data--------------')
+    other_data = []
+    for p in other_paths:
+        other_data.extend(pickle.load(open(p, 'rb')))
+    markov_model_other = MarkovModel(ALL_BASES, order=markov_order)
+    # markov_model_other.fit_transition(other_data[:10])
+    # print_transition_mat(markov_model_other)
+    # markov_model_other.fit_transition(other_data[:100])
+    # print_transition_mat(markov_model_other)
+    # markov_model_other.fit_transition(other_data[:1000])
+    # print_transition_mat(markov_model_other)
+    markov_model_other.fit_transition(other_data)
+    print_transition_mat(markov_model_other)
+    pickle.dump(markov_model_other, open(os.path.join(output_dir, 'markov_model_other_%d.pkl' % markov_order), 'wb'))
+
+
+def extract_data_01():
+    data_dir_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data/'
+    sizes_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\hg19.chrom.sizes'
+    tags_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\CGI.hg19.bed'
+    data_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\clf data\all_data.pkl'
+
+    genome_tag_range = GenomeTagRange(tags_path, sizes_path)
+    reader = DataReader.DataReader(data_dir_path)
+    data_extractor = DataExtractor(reader, genome_tag_range)
+
+    start = 125124
+    end = 145563
+    chr = 'chr1'
+    ranges_dict = {RANGE_START: [start], RANGE_END: [end]}
+
+    pkl_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\tmp\%s_%d_%d.pkl' % (chr, start, end)
+    all_seq, all_labels = data_extractor.get_data_from_ranges(chr, ranges_dict, pad_size=0, with_unknown=True)
+    pickle.dump((all_seq[0], all_labels[0]), open(pkl_path, 'wb'))
+
+
+def test_all_01():
+    data_dir_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data/'
+    sizes_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\hg19.chrom.sizes'
+    tags_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\CGI.hg19.bed'
+    data_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\clf data\all_data.pkl'
+
+    genome_tag_range = GenomeTagRange(tags_path, sizes_path)
+    reader = DataReader.DataReader(data_dir_path)
+    data_extractor = DataExtractor(reader, genome_tag_range)
+
+    t = time.time()
+    all_seq, all_labels = create_island_overlap_data(train_chrs, data_extractor, pad_size=1000, seq_num_list=None)
+    print('Found %d sequences' % len(all_seq))
+    print('create_island_overlap_data Time: %f seconds' % (time.time() - t))
+
+    t = time.time()
+    all_seq, all_labels = create_near_island_data(train_chrs, data_extractor, None)
+    print('Found %d sequences' % len(all_seq))
+    print('create_near_island_data Time: %f seconds' % (time.time() - t))
+
+    t = time.time()
+    all_seq, all_labels = create_non_island_random_data(train_chrs, data_extractor, 2000)
+    print('Found %d sequences' % len(all_seq))
+    print('create_non_island_random_data Time: %f seconds' % (time.time() - t))
+
+    exit(1)
+    all_data = pickle.load(open(data_path, 'rb'))
+    all_seq = []
+    all_label = []
+    for seq, label, seq_label in all_data:
+        all_seq.append(seq)
+        all_label.append(int(label))
+    # clf = CGFreqClassifier(seq_window_len=501)
+    clf = SubstrFreqClassifier(substr_list=['G', 'C'], seq_window_len=501)
+    clf = SubstrFreqClassifier(substr_list=['GC', 'CG'], seq_window_len=501)
+    train_ratio = 0.9
+    test_ratio = 0.9
+    clf.fit(all_seq[:int(len(all_seq) * train_ratio)], all_label[:int(len(all_seq) * train_ratio)])
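+    # train_ratio and test_ratio are both 0.9: the classifier is fit on the first 90%
+    # of the sequences and scored on the held-out last 10% below.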
+    print('Predict test...')
+
+    y = all_label[int(len(all_seq) * test_ratio):]
+    test_data = all_seq[int(len(all_seq) * test_ratio):]
+    y_pred = clf.get_seq_prob(test_data)
+    show_roc_curve(y, y_pred)
+
+    thresh = 0.3
+    clf.set_threshold(thresh)
+
+    # all_seq, all_labels = extractor.get_base_tagged_seq('chr1', pad_size=2)
+
+
+def test_data_01():
+    data_dir_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data/'
+    sizes_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\hg19.chrom.sizes'
+    tags_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\CGI.hg19.bed'
+
+    genome_tag_range = GenomeTagRange(tags_path, sizes_path)
+    reader = DataReader.DataReader(data_dir_path)
+    extractor = DataExtractor(reader, genome_tag_range)
+    all_seq, all_labels = extractor.get_base_tagged_seq('chr1', pad_size=2)
+
+    print(all_seq)
+    print(all_labels)
+
+
+def test_classifier_01():
+    data_path = r'C:\liorz\school\76558 Algorithms in Computational Biology\hackathon\data\clf data\all_data.pkl'
+    all_data = pickle.load(open(data_path, 'rb'))
+    all_seq = []
+    all_label = []
+    for seq, label, seq_label in all_data:
+        all_seq.append(seq)
+        all_label.append(int(label))
+    clf = CGFreqClassifier(100)
+    clf.fit(all_seq[:int(len(all_seq) * 0.9)], all_label[:int(len(all_seq) * 0.9)])
+    y = all_label[int(len(all_seq) * 0.9):]
+    test_data = all_seq[int(len(all_seq) * 0.9):]
+    y_pred = clf.get_seq_prob(test_data)
+    show_roc_curve(y, y_pred)
+
+
+if __name__ == "__main__":
+    # train_markov_models()
+    # create_data_01(train_chrs, test_chrs)
+    test_new_markov()
+    # extract_data_01()
+    exit(1)
diff --git a/hack_proj/markov_model.py b/hack_proj/markov_model.py
new file mode 100644
index 0000000..37b221a
--- /dev/null
+++ b/hack_proj/markov_model.py
@@ -0,0 +1,144 @@
+import re
+import itertools
+from utils import *
+
+
+def list2dict(lst):
+    d = {}
+    for i in range(len(lst)):
+        d[lst[i]] = i
+    return d
+
+
+class MarkovModel(object):
+    UNKNOWN_VAL = -1
+
+    def __init__(self, valid_states, order):
+        self.order = order
+        self.valid_states = valid_states
+        self.valid_states_dict = list2dict(self.valid_states)
+        self.states_num = len(self.valid_states)
+        self.count_states = self._create_order_states(self.order + 1)
+        self.count_states_dict = list2dict(self.count_states)
+        self.base_states = self._create_order_states(self.order)
+        self.base_states_dict = list2dict(self.base_states)
+
+        self.log_prob_mat = np.log(np.ones(len(self.count_states), dtype=float) / len(self.count_states))
+        self.log_transition_mat = np.log(np.ones((pow(self.states_num, self.order), self.states_num), dtype=float) / self.states_num)
+
+    def get_prob_mat(self):
+        return np.exp(self.log_prob_mat)
+
+    def get_transition_mat(self):
+        return np.exp(self.log_transition_mat)
+
+    def _create_order_states(self, length):
+        return list([''.join(p) for p in itertools.product(self.valid_states, repeat=length)])
+
+    def _count_order_states(self, x, smooth=1):
+        count_dict = {}
+        for k in self.count_states:
+            count_dict[k] = smooth
+
+        for val in x:
+            for i in range(len(val) - self.order):
+                try:
+                    count_dict[val[i:i + self.order + 1]] += 1
+                except KeyError:
+                    # Skip windows containing characters outside valid_states (e.g. 'N').
+                    continue
+        return count_dict
+
+    def fit_transition(self, x):
+        prob_mat = np.ones(len(self.count_states), dtype=float) / len(self.count_states)
+        transition_mat = np.ones((pow(self.states_num, self.order), self.states_num), dtype=float) / self.states_num
+
+        # Count all count states
+        count_dict = self._count_order_states(x)
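+        # Each (order+1)-gram count (add-one smoothed in _count_order_states) is divided
+        # below by the total count of its length-`order` prefix, i.e.
+        # P(next state | previous `order` states) = count(prefix + next) / sum_s count(prefix + s).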
+
+        # Count base states to normalize by
+        norm_dict = {}
+        for k in self.base_states:
+            norm_dict[k] = 0
+        for k, v in count_dict.items():
+            norm_dict[k[:-1]] += v
+
+        # Create probability mat for the count states
+        for i, k in enumerate(self.count_states):
+            prob_mat[i] = count_dict[k] / norm_dict[k[:-1]]
+
+        # Create transition mat from base states
+        for i, k in enumerate(self.base_states):
+            for j, s in enumerate(self.valid_states):
+                transition_mat[i, j] = count_dict[k + s] / norm_dict[k]
+
+        self.log_prob_mat = np.log(prob_mat)
+        self.log_transition_mat = np.log(transition_mat)
+
+    def str_to_order_states(self, x):
+        ret = []
+        for val in x:
+            curr_ret = []
+            for i in range(len(val) - self.order):
+                try:
+                    curr_ret.append(self.count_states_dict[val[i:i + self.order + 1]])
+                except KeyError:
+                    curr_ret.append(MarkovModel.UNKNOWN_VAL)
+            ret.append(np.array(curr_ret))
+        return ret
+
+    def get_ll(self, x):
+        x_states = self.str_to_order_states(x)
+        all_ll = []
+        for arr in x_states:
+            curr_ll = np.full_like(arr, fill_value=np.nan, dtype=float)
+            for i in range(len(self.count_states)):
+                curr_ll[arr == i] = self.log_prob_mat[i]
+            self.nan_to_prev(curr_ll, np.mean(self.log_prob_mat))
+
+            all_ll.append(curr_ll)
+        return all_ll
+
+    def nan_to_prev(self, arr, start_val):
+        nan_idx = np.where(np.isnan(arr))[0]
+        if nan_idx.size > 0:
+            if np.isnan(arr[0]):
+                arr[0] = start_val
+            if nan_idx[0] == 0:
+                if nan_idx.size == 1:
+                    nan_idx = None
+                else:
+                    nan_idx = nan_idx[1:]
+            if nan_idx is not None and nan_idx.size > 0:
+                for i in nan_idx:
+                    arr[i] = arr[i - 1]
+
+
+def print_prob_mat(markov_model: MarkovModel):
+    prob_mat = markov_model.get_prob_mat()
+    for i, k in enumerate(markov_model.count_states):
+        print('%s: %f' % (k, prob_mat[i]))
+
+
+def print_transition_mat(markov_model: MarkovModel):
+    states_str = ''
+    transition_mat = markov_model.get_transition_mat()
+    for j in range(transition_mat.shape[1]):
+        states_str += '\t%s' % markov_model.valid_states[j]
+    print(states_str)
+    for i in range(transition_mat.shape[0]):
+        print_str = markov_model.base_states[i]
+        for j in range(transition_mat.shape[1]):
+            print_str += '\t%.3f' % transition_mat[i, j]
+        print(print_str)
+
+
+if __name__ == "__main__":
+    m = MarkovModel(['A', 'B'], 1)
+    train = ['12AAG', 'AABBAA', 'ABABAAAAAABBBBBNNABABABA', 'AAXXABBBAA', 'ABBBBBBXBBBAABBBBBBBBBBAAAAAAAAAAAAAAAAAAAAAAAAA']
+    print_prob_mat(m)
+    print_transition_mat(m)
+    m.fit_transition(train)
+    print('-----------------------')
+    print_prob_mat(m)
+    print_transition_mat(m)
+
+    ll = m.get_ll(train)
\ No newline at end of file
diff --git a/hack_proj/utils.py b/hack_proj/utils.py
index 9dc7013..18f743c 100644
--- a/hack_proj/utils.py
+++ b/hack_proj/utils.py
@@ -265,13 +265,13 @@ def create_island_overlap_data(chr_list, data_extractor: DataExtractor, pad_size
     all_seq = []
     all_labels = []
     for i, chr in enumerate(chr_list):
-        curr_all_seq, curr_all_labels = data_extractor.get_base_tagged_seq(chr, seq_num_list[i], pad_size, with_unknown=False)
+        curr_all_seq, curr_all_labels = data_extractor.get_base_tagged_seq(chr, seq_num_list[i], pad_size, with_unknown=True)
         all_seq.extend(curr_all_seq)
         all_labels.extend(curr_all_labels)
     return all_seq, all_labels
 
 
-def create_non_island_random_data(chr_list, data_extractor: DataExtractor, seq_num_list=None, seq_len=2001):
+def create_non_island_random_data(chr_list, data_extractor: DataExtractor, seq_num_list=None, seq_len=2001, min_distance=1000):
     if seq_num_list is None:
         seq_num_list = 1
     if isinstance(seq_num_list, int):
@@ -281,8 +281,8 @@ def create_non_island_random_data(chr_list, data_extractor: DataExtractor, seq_n
     all_seq = []
     all_labels = []
     for i, chr in enumerate(chr_list):
         curr_all_seq, curr_all_labels = data_extractor.get_non_island_random_data(chr, seq_num_list[i],
-                                                                                  seq_len=seq_len, min_distance=1000,
-                                                                                  with_unknown=False)
+                                                                                  seq_len=seq_len, min_distance=min_distance,
+                                                                                  with_unknown=True)
         all_seq.extend(curr_all_seq)
         all_labels.extend(curr_all_labels)
     return all_seq, all_labels
@@ -299,7 +299,7 @@ def create_near_island_data(chr_list, data_extractor: DataExtractor, seq_num_lis
     for i, chr in enumerate(chr_list):
         curr_all_seq, curr_all_labels = data_extractor.get_near_island_data(chr, seq_num_list[i], seq_len,
                                                                             distance=distance,
-                                                                            with_unknown=False)
+                                                                            with_unknown=True)
         all_seq.extend(curr_all_seq)
         all_labels.extend(curr_all_labels)
     return all_seq, all_labels
@@ -307,7 +307,7 @@ def create_near_island_data(chr_list, data_extractor: DataExtractor, seq_num_lis
 
 # Results stats
 def show_roc_curve(y, y_score, title='ROC Curve', show_grid=True, print_data=True):
-    fpr, tpr, thresholds = roc_curve(y, y_score)
+    fpr, tpr, thresholds = roc_curve(y, y_score, drop_intermediate=False)
     roc_auc = auc(fpr, tpr)
 
     plt.figure()
@@ -500,8 +500,15 @@ def classify_seq(self, seq, labels=None):
         y_pred[y_prob >= self.threshold] = 1
         return y_pred
 
+class MarkovLLRClassifier(IslandClassifier):
+    pass
+
 # General utils
+def normalize_arr(arr):
+    arr = arr - np.mean(arr)
+    arr = arr / np.std(arr)
+    return arr
+
 def count_substr(s, sub_str):
     count = 0
     start = 0