From aea3b60c53929ea30cd456a670732aa919584ca7 Mon Sep 17 00:00:00 2001
From: asherp7
Date: Tue, 29 Jan 2019 23:59:15 +0200
Subject: [PATCH] work from hackathon that wasn't committed

---
 hack_proj/train_svm.py | 75 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/hack_proj/train_svm.py b/hack_proj/train_svm.py
index 2181b66..495b4fc 100644
--- a/hack_proj/train_svm.py
+++ b/hack_proj/train_svm.py
@@ -1,15 +1,45 @@
 import itertools
 from collections import Counter
 import numpy as np
-np.set_printoptions(threshold=np.nan)
 from sklearn import svm
 import pickle
+import matplotlib.pyplot as plt
+np.set_printoptions(threshold=np.nan)
 
-CHROMOSOME = 'chr1'
-K_MER_LEN = 6
+K_MER_LEN = 5
 TESTING_RATIO = 0.1
 
+# Results stats
+def show_recall_precision_curve(recall, precision, title='SVM performance as function of K-mer length', show_grid=True, print_data=True):
+    plt.figure()
+    # lw = 2
+    plt.plot(range(1, len(precision) + 1), precision, label='Precision')
+    plt.plot(range(1, len(recall) + 1), recall, label='Recall')
+    plt.xlabel('K')
+    plt.ylabel('Percent')
+    plt.legend()
+    plt.title(title)
+    # plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+    # plt.xlim([0.0, 1.0])
+    # plt.ylim([0.0, 1.05])
+    # plt.xlabel('K')
+    # # plt.ylabel('')
+    # plt.title(title)
+    # plt.legend(loc="lower right")
+    # plt.grid(show_grid)
+    #
+    # # create the axis of thresholds (scores)
+    # ax2 = plt.gca().twinx()
+    # ax2.plot(fpr, thresholds, markeredgecolor='r', linestyle='dashed', color='g')
+    # ax2.set_ylabel('Threshold', color='g')
+    # ax2.set_ylim([thresholds[-1], thresholds[0]])
+    # ax2.set_xlim([fpr[0], fpr[-1]])
+    # if print_data:
+    #     for i in range(fpr.size):
+    #         print('FPR: %.3f, TPR: %.3f: Threshold = %.4f' % (fpr[i], tpr[i], thresholds[i]))
+    plt.show()
+
 
 def svm_testing_loss(clf, test_data, test_labels, k):
     predictions = clf.predict(test_data)
     fn, fp, tn, tp = 0, 0, 0, 0
@@ -77,20 +107,51 @@ def train_svm(data_path, k):
     size_of_test_set = int(len(Y) * TESTING_RATIO)
 
-    clf = svm.SVC(gamma='scale')
+    clf = svm.SVC(kernel='linear', gamma='scale')
     # print(' Started SVM Training...')
     clf.fit(X[:-size_of_test_set], Y[:-size_of_test_set])
     results = svm_testing_loss(clf, X[-size_of_test_set:], Y[-size_of_test_set:], k)
+    bases = ['A', 'C', 'G', 'T']
+    kmers = [''.join(p) for p in itertools.product(bases, repeat=k)]
+    plot_coefficients(clf, kmers)
     return results
 
 
-def compare_svm_on_k_list(data_path, k_list=range(1,7)):
+def compare_svm_on_k_list(data_path, k_list=range(1, 7)):
     results = []
     for k in k_list:
         results.append(train_svm(data_path, k))
     return results
 
 
+def f_importances(coef, names):
+    print(names)
+    imp = coef
+    imp, names = zip(*sorted(zip(imp, names)))
+    plt.barh(range(len(names)), imp, align='center')
+    plt.yticks(range(len(names)), names)
+    plt.show()
+
+
+def plot_coefficients(classifier, feature_names, top_features=15):
+    coef = classifier.coef_.ravel()
+    top_positive_coefficients = np.argsort(coef)[-top_features:]
+    top_negative_coefficients = np.argsort(coef)[:top_features]
+    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
+    # create plot
+    plt.figure(figsize=(15, 5))
+    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
+    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
+    feature_names = np.array(feature_names)
+    plt.xticks(np.arange(2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
+    plt.show()
+
+
 if __name__ == '__main__':
-    # train_svm('data/all_data', K_MER_LEN)
-    print(compare_svm_on_k_list('data/all_data'))
+    train_svm('data/all_data', K_MER_LEN)
+    # results = compare_svm_on_k_list('data/data500/data_without_chr1')
+    # recall = [x for (_, _, _, _, x, _, _) in results]
+    # precision = [x for (_, _, _, _, _, x, _) in results]
+    # recall.append(0.98185)
+    # precision.append(0.99591)
+    # show_recall_precision_curve(recall, precision)
\ No newline at end of file
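
Reviewer note: plot_coefficients reads classifier.coef_, which scikit-learn only exposes for a linear kernel; that is presumably why this patch changes svm.SVC(gamma='scale') to svm.SVC(kernel='linear', gamma='scale'). Below is a minimal sketch that exercises plot_coefficients without the genomic data. The toy dataset, k=2, and the 'CG' labelling rule are illustrative assumptions only; the feature layout mirrors train_svm (one count column per k-mer, ordered by itertools.product over 'ACGT').

    # Sketch (not part of the patch): sanity-check plot_coefficients on
    # synthetic k-mer counts.  Assumes plot_coefficients from the patched
    # train_svm.py is in scope.
    import itertools
    import numpy as np
    from sklearn import svm

    k = 2
    bases = ['A', 'C', 'G', 'T']
    kmers = [''.join(p) for p in itertools.product(bases, repeat=k)]  # 16 features

    rng = np.random.default_rng(0)
    X = rng.integers(0, 10, size=(200, len(kmers))).astype(float)  # fake counts
    y = (X[:, kmers.index('CG')] > 4).astype(int)  # label driven by one k-mer

    # coef_ is only defined for kernel='linear'; the default RBF kernel
    # would raise an AttributeError inside plot_coefficients.
    clf = svm.SVC(kernel='linear', gamma='scale')
    clf.fit(X, y)
    plot_coefficients(clf, kmers, top_features=8)  # 16 bars, 'CG' should dominate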