Implementation of the algorithm, report, and README file was completed.
yilmaz-baysal committed Nov 11, 2019
1 parent 776db0f commit cb3d319
Showing 7 changed files with 206 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .gitignore
@@ -102,3 +102,7 @@ venv.bak/

# mypy
.mypy_cache/

# Custom
.idea/
dataset/
17 changes: 16 additions & 1 deletion README.md
@@ -1 +1,16 @@
# short-text-clustering
# Short-Text Clustering with K-Means

Python version:
`Python 3.6.5`

To install the requirements, use the following command:
`pip3 install -r requirements.pip`

To run the code, use the following command:
`python3 src/main.py`

The dataset is a CSV file of Stack Overflow questions in the following format:
```
"python","Question Title","Question content."
...
```
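As a quick illustration of how such a row can be read (a minimal sketch; the path `dataset/stck_data.csv` and the label/title/content column order are taken from `src/main.py` and `src/data_preparer.py`):
```
import csv

# Each row is expected to hold: label (tag), question title, question content.
with open('dataset/stck_data.csv', 'r') as f:
    for label, title, content in csv.reader(f):
        print(label, title)
        break  # only peek at the first row
```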
Binary file added report.pdf
2 changes: 2 additions & 0 deletions requirements.pip
@@ -0,0 +1,2 @@
numpy==1.17.4
scipy==1.3.2
93 changes: 93 additions & 0 deletions src/data_preparer.py
@@ -0,0 +1,93 @@
import re
import csv
from collections import defaultdict

import numpy as np
from scipy.sparse import csr_matrix


class DataPreparer:

    # Initialize the bookkeeping structures and read the dataset
def __init__(self, file_path):
self.documents = []
self.classes = set()
self.document_labels = {}
self.document_count = 0
self.vocabulary_set = set()
self.vocabulary_size = 0
self.indexed_vocabulary = {}

self.word_document_count = defaultdict(int)
self.document_word_count = defaultdict(int)

        self.__read_dataset(file_path)  # Read the CSV file and populate the attributes above

    # Read the CSV dataset; each row is (label, question title, question content)
def __read_dataset(self, file_path):
with open(file_path, 'r') as f:
for document in csv.reader(f):
self.classes.add(document[0])
self.document_labels[self.document_count] = document[0]
self.documents.append(" ".join(re.split(r'\W+', '{} {}'.format(document[1], document[2]).lower())))
self.document_count += 1

    # Build the vocabulary, dropping rare, overly frequent, and purely numeric tokens
def build_vocabulary(self):
# Count the words in all documents
for document in self.documents:
tmp_set = set()
for word in document.split():
if word not in tmp_set:
self.word_document_count[word] += 1
tmp_set.add(word)

# Find redundant words
redundant_words_set = set()
for key, value in self.word_document_count.items():
if value < 3 or value > self.document_count * 0.4 or key.isdigit():
redundant_words_set.add(key)

# Vocabulary properties
self.vocabulary_set = self.word_document_count.keys() - redundant_words_set
self.vocabulary_size = len(self.vocabulary_set)

# Build indexed vocabulary
vocabulary_dict = {}
for i, word in enumerate(sorted(list(self.vocabulary_set))):
vocabulary_dict[word] = i
self.indexed_vocabulary[i] = word

return vocabulary_dict

    # Build a sparse {(document_id, term_id): count} dictionary over the vocabulary
def generate_document_term_matrix(self):
vocabulary_dict = self.build_vocabulary()

sparse_matrix_dict = defaultdict(int)
for i, document in enumerate(self.documents):
for word in document.split():
if word in self.vocabulary_set:
sparse_matrix_dict[(i, vocabulary_dict[word])] += 1
self.document_word_count[i] += 1

return sparse_matrix_dict

    # Re-weight the raw counts with TF-IDF and return a sparse document-term matrix
def apply_tf_idf(self):
sparse_matrix_dict = self.generate_document_term_matrix()

# Calculate TF-IDF weights
for (document_id, term_id), count in sparse_matrix_dict.items():
tf = count / self.document_word_count[document_id]
idf = np.log(self.document_count / self.word_document_count[self.indexed_vocabulary[term_id]])
sparse_matrix_dict[(document_id, term_id)] = tf * idf

return self.__dict_to_sparse_matrix(sparse_matrix_dict)

    # Convert the {(row, col): value} dictionary into a scipy CSR matrix
    def __dict_to_sparse_matrix(self, document_term_dict):
        rows, cols = zip(*document_term_dict.keys())
        return csr_matrix(
            (list(document_term_dict.values()), (rows, cols)),
            shape=(self.document_count, self.vocabulary_size)
        )
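For reference, a minimal standalone sketch of the weight that `apply_tf_idf` stores for one (document, term) cell; the counts below are hypothetical and only illustrate the formula used above (tf = term count / words in document, idf = log(total documents / documents containing the term)):
```
import numpy as np

# Hypothetical counts, for illustration only
count = 3                  # occurrences of the term in the document
words_in_document = 60     # self.document_word_count[document_id]
total_documents = 20000    # self.document_count
documents_with_term = 150  # self.word_document_count[term]

tf = count / words_in_document
idf = np.log(total_documents / documents_with_term)
print(tf * idf)  # the TF-IDF weight written into the sparse matrix
```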
79 changes: 79 additions & 0 deletions src/k_means.py
@@ -0,0 +1,79 @@
import random
from collections import defaultdict

import numpy as np
from scipy.sparse import csr_matrix


class K_Means:

    # Store the number of clusters, the iteration count, and the TF-IDF document-term matrix
def __init__(self, k: int, iterations: int, data: csr_matrix, data_length: int):
self.k = k
self.iterations = iterations
self.data = data
self.data_length = data_length

    # Pick k random documents from the data as the initial centroids
    def __initialize_centroids(self):
        centroids = []
        for random_index in random.sample(range(self.data_length), self.k):
            centroid = self.data.getrow(random_index).toarray()[0]
            centroids.append(centroid)

        return centroids

    # Run k-means with cosine distance, reporting cluster purity after every iteration
def cluster(self, document_labels):
        centroids = self.__initialize_centroids()

# Main loop
clusters = defaultdict(set)
for i in range(self.iterations):
clusters = defaultdict(set) # Reset the clusters

            # Assign each document to the centroid with the smallest cosine distance
            for row_index, sparse_row in enumerate(self.data):
                vector = sparse_row.toarray()[0]  # Dense 1-D TF-IDF vector of the document

closest_centroid = (float('inf'), None)
for centroid_index, centroid in enumerate(centroids):
                    # Cosine distance: 1 minus the cosine similarity of the document and the centroid
dist = 1 - np.true_divide(np.dot(vector, centroid), np.multiply(np.linalg.norm(vector), np.linalg.norm(centroid)))

if dist <= closest_centroid[0]:
closest_centroid = (dist, centroid_index)

clusters[closest_centroid[1]].add(row_index)

# Re-calculate centroids
for centroid_index, vector_indices in clusters.items():
avg_vector = None
for vector_index in vector_indices:
if avg_vector is None:
avg_vector = self.data.getrow(vector_index).toarray()
else:
avg_vector = np.add(avg_vector, self.data.getrow(vector_index).toarray())

centroids[centroid_index] = (avg_vector / len(vector_indices))[0]

# Calculate purity
majority_sum = 0
for cluster in clusters.values():
# Count cluster items with respect to their labels
labeled_document_counts = defaultdict(int)
for document_index in cluster:
labeled_document_counts[document_labels[document_index]] += 1

# Find majority class
majority_class = (0, None)
for label, count in labeled_document_counts.items():
if count > majority_class[0]:
majority_class = (count, label)

majority_sum += majority_class[0] # Add majority to global sum

print('Purity at iteration {} is\t{}'.format(i, (majority_sum / self.data_length)))

return clusters
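The purity printed each iteration can also be read as a standalone helper; this sketch simply mirrors the inline computation above (majority-class count per cluster, summed and divided by the number of documents) and is not part of the committed file:
```
from collections import defaultdict

def purity(clusters, document_labels, n_documents):
    # Sum the size of each cluster's majority class, then normalize
    majority_sum = 0
    for members in clusters.values():
        label_counts = defaultdict(int)
        for document_index in members:
            label_counts[document_labels[document_index]] += 1
        majority_sum += max(label_counts.values())
    return majority_sum / n_documents
```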
12 changes: 12 additions & 0 deletions src/main.py
@@ -0,0 +1,12 @@
from data_preparer import DataPreparer
from k_means import K_Means

# Prepare the data before calculations
dp = DataPreparer(file_path='dataset/stck_data.csv')

# Create document-term matrix
dt_matrix = dp.apply_tf_idf()

# Run K-Means algorithm
k_means = K_Means(len(dp.classes), iterations=10, data=dt_matrix, data_length=dp.document_count)
clusters = k_means.cluster(dp.document_labels)
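A possible way to inspect the result (a sketch; it only assumes the return value of `K_Means.cluster`, a dict mapping each centroid index to the set of assigned document row indices):
```
# Print the size of each resulting cluster
for centroid_index, members in clusters.items():
    print('Cluster {}: {} documents'.format(centroid_index, len(members)))
```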
