Implementation of the algorithm, report, and README file was completed.
yilmaz-baysal committed Nov 11, 2019
1 parent 776db0f commit cb3d319
Showing 7 changed files with 206 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .gitignore
@@ -102,3 +102,7 @@ venv.bak/

# mypy
.mypy_cache/

# Custom
.idea/
dataset/
17 changes: 16 additions & 1 deletion README.md
@@ -1 +1,16 @@
# short-text-clustering
# Short-Text Clustering with K-Means

Python version:
`Python 3.6.5`

To install the requirements, use the following command:
`pip3 install -r requirements.pip`

To run the code, use the following command:
`python3 src/main.py`

The dataset is a CSV file of Stack Overflow questions in the following format:
```
"python","Question Title","Question content."
...
```
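As a quick illustration of how such a row can be read (a minimal sketch; the path `dataset/stck_data.csv` and the label/title/content column order are taken from `src/main.py` and `src/data_preparer.py`):
```
import csv

# Each row is expected to hold: label (tag), question title, question content.
with open('dataset/stck_data.csv', 'r') as f:
    for label, title, content in csv.reader(f):
        print(label, title)
        break  # only peek at the first row
```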
Binary file added report.pdf
2 changes: 2 additions & 0 deletions requirements.pip
@@ -0,0 +1,2 @@
numpy==1.17.4
scipy==1.3.2
93 changes: 93 additions & 0 deletions src/data_preparer.py
@@ -0,0 +1,93 @@
import re
import csv
from collections import defaultdict

import numpy as np
from scipy.sparse import csr_matrix


class DataPreparer:

    # Initialize the bookkeeping structures and read the dataset
def __init__(self, file_path):
self.documents = []
self.classes = set()
self.document_labels = {}
self.document_count = 0
self.vocabulary_set = set()
self.vocabulary_size = 0
self.indexed_vocabulary = {}

self.word_document_count = defaultdict(int)
self.document_word_count = defaultdict(int)

        self.__read_dataset(file_path)  # Read the CSV file and populate the attributes above

    # Read the CSV dataset; each row is (label, question title, question content)
def __read_dataset(self, file_path):
with open(file_path, 'r') as f:
for document in csv.reader(f):
self.classes.add(document[0])
self.document_labels[self.document_count] = document[0]
self.documents.append(" ".join(re.split(r'\W+', '{} {}'.format(document[1], document[2]).lower())))
self.document_count += 1

    # Build the vocabulary, dropping rare, overly frequent, and purely numeric tokens
def build_vocabulary(self):
# Count the words in all documents
for document in self.documents:
tmp_set = set()
for word in document.split():
if word not in tmp_set:
self.word_document_count[word] += 1
tmp_set.add(word)

# Find redundant words
redundant_words_set = set()
for key, value in self.word_document_count.items():
if value < 3 or value > self.document_count * 0.4 or key.isdigit():
redundant_words_set.add(key)

# Vocabulary properties
self.vocabulary_set = self.word_document_count.keys() - redundant_words_set
self.vocabulary_size = len(self.vocabulary_set)

# Build indexed vocabulary
vocabulary_dict = {}
for i, word in enumerate(sorted(list(self.vocabulary_set))):
vocabulary_dict[word] = i
self.indexed_vocabulary[i] = word

return vocabulary_dict

    # Build a sparse {(document_id, term_id): count} dictionary over the vocabulary
def generate_document_term_matrix(self):
vocabulary_dict = self.build_vocabulary()

sparse_matrix_dict = defaultdict(int)
for i, document in enumerate(self.documents):
for word in document.split():
if word in self.vocabulary_set:
sparse_matrix_dict[(i, vocabulary_dict[word])] += 1
self.document_word_count[i] += 1

return sparse_matrix_dict

    # Re-weight the raw counts with TF-IDF and return a sparse document-term matrix
def apply_tf_idf(self):
sparse_matrix_dict = self.generate_document_term_matrix()

# Calculate TF-IDF weights
for (document_id, term_id), count in sparse_matrix_dict.items():
tf = count / self.document_word_count[document_id]
idf = np.log(self.document_count / self.word_document_count[self.indexed_vocabulary[term_id]])
sparse_matrix_dict[(document_id, term_id)] = tf * idf

return self.__dict_to_sparse_matrix(sparse_matrix_dict)

    # Convert the {(row, col): value} dictionary into a scipy CSR matrix
    def __dict_to_sparse_matrix(self, document_term_dict):
        rows, cols = zip(*document_term_dict.keys())
        return csr_matrix(
            (list(document_term_dict.values()), (rows, cols)),
            shape=(self.document_count, self.vocabulary_size)
        )
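For reference, a minimal standalone sketch of the weight that `apply_tf_idf` stores for one (document, term) cell; the counts below are hypothetical and only illustrate the formula used above (tf = term count / words in document, idf = log(total documents / documents containing the term)):
```
import numpy as np

# Hypothetical counts, for illustration only
count = 3                  # occurrences of the term in the document
words_in_document = 60     # self.document_word_count[document_id]
total_documents = 20000    # self.document_count
documents_with_term = 150  # self.word_document_count[term]

tf = count / words_in_document
idf = np.log(total_documents / documents_with_term)
print(tf * idf)  # the TF-IDF weight written into the sparse matrix
```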
79 changes: 79 additions & 0 deletions src/k_means.py
@@ -0,0 +1,79 @@
import random
from collections import defaultdict

import numpy as np
from scipy.sparse import csr_matrix


class K_Means:

    # Store the number of clusters, the iteration count, and the TF-IDF document-term matrix
def __init__(self, k: int, iterations: int, data: csr_matrix, data_length: int):
self.k = k
self.iterations = iterations
self.data = data
self.data_length = data_length

    # Pick k random documents from the data as the initial centroids
    def __initialize_centroids(self):
        centroids = []
        for random_index in random.sample(range(self.data_length), self.k):
            centroid = self.data.getrow(random_index).toarray()[0]
            centroids.append(centroid)

        return centroids

    # Run k-means with cosine distance, reporting cluster purity after every iteration
def cluster(self, document_labels):
        centroids = self.__initialize_centroids()

# Main loop
clusters = defaultdict(set)
for i in range(self.iterations):
clusters = defaultdict(set) # Reset the clusters

            # Assign each document to the centroid with the smallest cosine distance
            for row_index, sparse_row in enumerate(self.data):
                vector = sparse_row.toarray()[0]  # Dense 1-D TF-IDF vector of the document

closest_centroid = (float('inf'), None)
for centroid_index, centroid in enumerate(centroids):
                    # Cosine distance: 1 minus the cosine similarity of the document and the centroid
dist = 1 - np.true_divide(np.dot(vector, centroid), np.multiply(np.linalg.norm(vector), np.linalg.norm(centroid)))

if dist <= closest_centroid[0]:
closest_centroid = (dist, centroid_index)

clusters[closest_centroid[1]].add(row_index)

# Re-calculate centroids
for centroid_index, vector_indices in clusters.items():
avg_vector = None
for vector_index in vector_indices:
if avg_vector is None:
avg_vector = self.data.getrow(vector_index).toarray()
else:
avg_vector = np.add(avg_vector, self.data.getrow(vector_index).toarray())

centroids[centroid_index] = (avg_vector / len(vector_indices))[0]

# Calculate purity
majority_sum = 0
for cluster in clusters.values():
# Count cluster items with respect to their labels
labeled_document_counts = defaultdict(int)
for document_index in cluster:
labeled_document_counts[document_labels[document_index]] += 1

# Find majority class
majority_class = (0, None)
for label, count in labeled_document_counts.items():
if count > majority_class[0]:
majority_class = (count, label)

majority_sum += majority_class[0] # Add majority to global sum

print('Purity at iteration {} is\t{}'.format(i, (majority_sum / self.data_length)))

return clusters
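The purity printed each iteration can also be read as a standalone helper; this sketch simply mirrors the inline computation above (majority-class count per cluster, summed and divided by the number of documents) and is not part of the committed file:
```
from collections import defaultdict

def purity(clusters, document_labels, n_documents):
    # Sum the size of each cluster's majority class, then normalize
    majority_sum = 0
    for members in clusters.values():
        label_counts = defaultdict(int)
        for document_index in members:
            label_counts[document_labels[document_index]] += 1
        majority_sum += max(label_counts.values())
    return majority_sum / n_documents
```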
12 changes: 12 additions & 0 deletions src/main.py
@@ -0,0 +1,12 @@
from data_preparer import DataPreparer
from k_means import K_Means

# Prepare the data before calculations
dp = DataPreparer(file_path='dataset/stck_data.csv')

# Create document-term matrix
dt_matrix = dp.apply_tf_idf()

# Run K-Means algorithm
k_means = K_Means(len(dp.classes), iterations=10, data=dt_matrix, data_length=dp.document_count)
clusters = k_means.cluster(dp.document_labels)
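A possible way to inspect the result (a sketch; it only assumes the return value of `K_Means.cluster`, a dict mapping each centroid index to the set of assigned document row indices):
```
# Print the size of each resulting cluster
for centroid_index, members in clusters.items():
    print('Cluster {}: {} documents'.format(centroid_index, len(members)))
```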
