From aea3b60c53929ea30cd456a670732aa919584ca7 Mon Sep 17 00:00:00 2001
From: asherp7
Date: Tue, 29 Jan 2019 23:59:15 +0200
Subject: [PATCH] work from hackathon that wasn't committed

---
 hack_proj/train_svm.py | 75 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/hack_proj/train_svm.py b/hack_proj/train_svm.py
index 2181b66..495b4fc 100644
--- a/hack_proj/train_svm.py
+++ b/hack_proj/train_svm.py
@@ -1,15 +1,45 @@
 import itertools
 from collections import Counter
 import numpy as np
-np.set_printoptions(threshold=np.nan)
 from sklearn import svm
 import pickle
+import matplotlib.pyplot as plt
+np.set_printoptions(threshold=np.nan)
 
-CHROMOSOME = 'chr1'
-K_MER_LEN = 6
+K_MER_LEN = 5
 TESTING_RATIO = 0.1
 
+# Results stats
+def show_recall_precision_curve(recall, precision, title='SVM performance as function of K-mer length', show_grid=True, print_data=True):
+    plt.figure()
+    # lw = 2
+    plt.plot(range(1, len(precision) + 1), precision, label='Precision')
+    plt.plot(range(1, len(recall) + 1), recall, label='Recall')
+    plt.xlabel('K')
+    plt.ylabel('Percent')
+    plt.legend()
+    plt.title(title)
+    # plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+    # plt.xlim([0.0, 1.0])
+    # plt.ylim([0.0, 1.05])
+    # plt.xlabel('K')
+    # # plt.ylabel('')
+    # plt.title(title)
+    # plt.legend(loc="lower right")
+    # plt.grid(show_grid)
+    #
+    # # create the axis of thresholds (scores)
+    # ax2 = plt.gca().twinx()
+    # ax2.plot(fpr, thresholds, markeredgecolor='r', linestyle='dashed', color='g')
+    # ax2.set_ylabel('Threshold', color='g')
+    # ax2.set_ylim([thresholds[-1], thresholds[0]])
+    # ax2.set_xlim([fpr[0], fpr[-1]])
+    # if print_data:
+    #     for i in range(fpr.size):
+    #         print('FPR: %.3f, TPR: %.3f: Threshold = %.4f' % (fpr[i], tpr[i], thresholds[i]))
+    plt.show()
+
 
 def svm_testing_loss(clf, test_data, test_labels, k):
     predictions = clf.predict(test_data)
     fn, fp, tn, tp = 0, 0, 0, 0
@@ -77,20 +107,51 @@ def train_svm(data_path, k):
     size_of_test_set = int(len(Y) * TESTING_RATIO)
 
-    clf = svm.SVC(gamma='scale')
+    clf = svm.SVC(kernel='linear', gamma='scale')
     # print(' Started SVM Training...')
     clf.fit(X[:-size_of_test_set], Y[:-size_of_test_set])
     results = svm_testing_loss(clf, X[-size_of_test_set:], Y[-size_of_test_set:], k)
+    bases = ['A', 'C', 'G', 'T']
+    kmers = [''.join(p) for p in itertools.product(bases, repeat=k)]
+    plot_coefficients(clf, kmers)
     return results
 
 
-def compare_svm_on_k_list(data_path, k_list=range(1,7)):
+def compare_svm_on_k_list(data_path, k_list=range(1, 7)):
     results = []
     for k in k_list:
         results.append(train_svm(data_path, k))
     return results
 
 
+def f_importances(coef, names):
+    print(names)
+    imp = coef
+    imp, names = zip(*sorted(zip(imp, names)))
+    plt.barh(range(len(names)), imp, align='center')
+    plt.yticks(range(len(names)), names)
+    plt.show()
+
+
+def plot_coefficients(classifier, feature_names, top_features=15):
+    coef = classifier.coef_.ravel()
+    top_positive_coefficients = np.argsort(coef)[-top_features:]
+    top_negative_coefficients = np.argsort(coef)[:top_features]
+    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
+    # create plot
+    plt.figure(figsize=(15, 5))
+    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
+    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
+    feature_names = np.array(feature_names)
+    plt.xticks(np.arange(2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
+    plt.show()
+
+
 if __name__ == '__main__':
-    # train_svm('data/all_data', K_MER_LEN)
-    print(compare_svm_on_k_list('data/all_data'))
+    train_svm('data/all_data', K_MER_LEN)
+    # results = compare_svm_on_k_list('data/data500/data_without_chr1')
+    # recall = [x for (_, _, _, _, x, _, _) in results]
+    # precision = [x for (_, _, _, _, _, x, _) in results]
+    # recall.append(0.98185)
+    # precision.append(0.99591)
+    # show_recall_precision_curve(recall, precision)
\ No newline at end of file
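
Reviewer note: plot_coefficients reads classifier.coef_, which scikit-learn only exposes for a linear kernel; that is presumably why this patch changes svm.SVC(gamma='scale') to svm.SVC(kernel='linear', gamma='scale'). Below is a minimal sketch that exercises plot_coefficients without the genomic data. The toy dataset, k=2, and the 'CG' labelling rule are illustrative assumptions only; the feature layout mirrors train_svm (one count column per k-mer, ordered by itertools.product over 'ACGT').

    # Sketch (not part of the patch): sanity-check plot_coefficients on
    # synthetic k-mer counts.  Assumes plot_coefficients from the patched
    # train_svm.py is in scope.
    import itertools
    import numpy as np
    from sklearn import svm

    k = 2
    bases = ['A', 'C', 'G', 'T']
    kmers = [''.join(p) for p in itertools.product(bases, repeat=k)]  # 16 features

    rng = np.random.default_rng(0)
    X = rng.integers(0, 10, size=(200, len(kmers))).astype(float)  # fake counts
    y = (X[:, kmers.index('CG')] > 4).astype(int)  # label driven by one k-mer

    # coef_ is only defined for kernel='linear'; the default RBF kernel
    # would raise an AttributeError inside plot_coefficients.
    clf = svm.SVC(kernel='linear', gamma='scale')
    clf.fit(X, y)
    plot_coefficients(clf, kmers, top_features=8)  # 16 bars, 'CG' should dominate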