Implementation of the algorithm, reports, and README file was completed.
1 parent 776db0f · commit cb3d319
Showing 7 changed files with 206 additions and 1 deletion.
.gitignore
@@ -102,3 +102,7 @@ venv.bak/

# mypy
.mypy_cache/

# Custom
.idea/
dataset/
README.md
@@ -1 +1,16 @@
-# short-text-clustering
+# Short-Text Clustering with K-Means

Python version:
`Python 3.6.5`

To install the requirements, use the following command:
`pip3 install -r requirements.pip`

To run the code, use the following command:
`python3 src/main.py`

The dataset is a file that contains Stack Overflow questions in the following CSV format:
```
"python","Question Title","Question content."
...
```
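Before running the pipeline, it can help to check that a dataset file actually follows this three-column layout. The sketch below is not part of the commit; it assumes the dataset sits at `dataset/stck_data.csv`, the path hard-coded in `src/main.py`.

```python
import csv

# Minimal sanity check for the expected "tag","title","content" CSV layout.
# The path below is the one used by src/main.py; adjust it if your file differs.
with open('dataset/stck_data.csv', 'r') as f:
    for record_number, row in enumerate(csv.reader(f), start=1):
        if len(row) != 3:
            raise ValueError('Record {}: expected 3 columns, got {}'.format(record_number, len(row)))

print('Dataset looks well-formed.')
```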
Binary file not shown.
requirements.pip
@@ -0,0 +1,2 @@
numpy==1.17.4
scipy==1.3.2
src/data_preparer.py
@@ -0,0 +1,93 @@
import re
import csv
from collections import defaultdict

import numpy as np
from scipy.sparse import csr_matrix


class DataPreparer:

    # Set up document/vocabulary containers and load the dataset
    def __init__(self, file_path):
        self.documents = []
        self.classes = set()
        self.document_labels = {}
        self.document_count = 0
        self.vocabulary_set = set()
        self.vocabulary_size = 0
        self.indexed_vocabulary = {}

        self.word_document_count = defaultdict(int)
        self.document_word_count = defaultdict(int)

        self.__read_dataset(file_path)  # Read the CSV file and store it in the attributes above

    # Read the CSV file: column 0 is the label, columns 1 and 2 are the question title and content
    def __read_dataset(self, file_path):
        with open(file_path, 'r') as f:
            for document in csv.reader(f):
                self.classes.add(document[0])
                self.document_labels[self.document_count] = document[0]
                self.documents.append(" ".join(re.split(r'\W+', '{} {}'.format(document[1], document[2]).lower())))
                self.document_count += 1

    # Build the vocabulary and a word -> index mapping, dropping rare, overly common, and numeric tokens
    def build_vocabulary(self):
        # Count in how many documents each word appears
        for document in self.documents:
            tmp_set = set()
            for word in document.split():
                if word not in tmp_set:
                    self.word_document_count[word] += 1
                    tmp_set.add(word)

        # Find redundant words: document frequency below 3, above 40% of documents, or purely numeric
        redundant_words_set = set()
        for key, value in self.word_document_count.items():
            if value < 3 or value > self.document_count * 0.4 or key.isdigit():
                redundant_words_set.add(key)

        # Vocabulary properties
        self.vocabulary_set = self.word_document_count.keys() - redundant_words_set
        self.vocabulary_size = len(self.vocabulary_set)

        # Build indexed vocabulary
        vocabulary_dict = {}
        for i, word in enumerate(sorted(list(self.vocabulary_set))):
            vocabulary_dict[word] = i
            self.indexed_vocabulary[i] = word

        return vocabulary_dict

    # Count in-vocabulary terms per document into a {(document_id, term_id): count} dictionary
    def generate_document_term_matrix(self):
        vocabulary_dict = self.build_vocabulary()

        sparse_matrix_dict = defaultdict(int)
        for i, document in enumerate(self.documents):
            for word in document.split():
                if word in self.vocabulary_set:
                    sparse_matrix_dict[(i, vocabulary_dict[word])] += 1
                    self.document_word_count[i] += 1

        return sparse_matrix_dict

    # Weight the term counts with TF-IDF and return the document-term matrix in CSR form
    def apply_tf_idf(self):
        sparse_matrix_dict = self.generate_document_term_matrix()

        # Calculate TF-IDF weights
        for (document_id, term_id), count in sparse_matrix_dict.items():
            tf = count / self.document_word_count[document_id]
            idf = np.log(self.document_count / self.word_document_count[self.indexed_vocabulary[term_id]])
            sparse_matrix_dict[(document_id, term_id)] = tf * idf

        return self.__dict_to_sparse_matrix(sparse_matrix_dict)

    # Convert the {(row, col): value} dictionary into a SciPy CSR matrix
    def __dict_to_sparse_matrix(self, document_term_dict):
        return csr_matrix(
            (list(document_term_dict.values()), zip(*list(document_term_dict.keys()))),
            shape=(self.document_count, self.vocabulary_size)
        )
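For reference, `apply_tf_idf` uses raw term frequency normalized by the document's in-vocabulary word count, multiplied by a natural-log inverse document frequency. The snippet below is not from the commit; it is a toy computation of the same formula with made-up numbers, just to show how a single weight comes out.

```python
import numpy as np

# Toy illustration of the weighting used in apply_tf_idf (illustrative numbers only):
# a corpus of 1,000 documents, a document with 50 in-vocabulary word occurrences,
# and a term that occurs 3 times in that document and appears in 20 documents overall.
document_count = 1000        # corresponds to self.document_count
document_word_count = 50     # in-vocabulary word occurrences in this document
term_count = 3               # occurrences of the term in this document
word_document_count = 20     # documents containing the term

tf = term_count / document_word_count               # 0.06
idf = np.log(document_count / word_document_count)  # ln(50), about 3.912
print('tf-idf weight = {:.4f}'.format(tf * idf))    # about 0.2347
```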
src/k_means.py
@@ -0,0 +1,79 @@
import random
from collections import defaultdict

import numpy as np
from scipy.sparse import csr_matrix


class K_Means:

    # Store the number of clusters, the iteration budget, and the document-term matrix
    def __init__(self, k: int, iterations: int, data: csr_matrix, data_length: int):
        self.k = k
        self.iterations = iterations
        self.data = data
        self.data_length = data_length

    # Pick k random document vectors as the initial centroids
    def __initialize_centroids(self):
        centroids = []
        for random_index in random.sample(range(self.data_length), self.k):
            centroid = self.data.getrow(random_index).toarray()[0]
            centroids.append(centroid)

        return centroids

    # Assign documents to the nearest centroid, update centroids, and report purity each iteration
    def cluster(self, document_labels):
        centroids = self.__initialize_centroids()

        # Main loop
        clusters = defaultdict(set)
        for i in range(self.iterations):
            clusters = defaultdict(set)  # Reset the clusters

            # Find the distance between each document and every centroid; choose the nearest centroid
            for row_index, sparse_row in enumerate(self.data):
                vector = sparse_row.toarray()[0]

                closest_centroid = (float('inf'), None)
                for centroid_index, centroid in enumerate(centroids):
                    # Cosine distance: 1 - cosine similarity
                    dist = 1 - np.true_divide(np.dot(vector, centroid), np.multiply(np.linalg.norm(vector), np.linalg.norm(centroid)))

                    if dist <= closest_centroid[0]:
                        closest_centroid = (dist, centroid_index)

                clusters[closest_centroid[1]].add(row_index)

            # Re-calculate each centroid as the mean of its assigned document vectors
            for centroid_index, vector_indices in clusters.items():
                avg_vector = None
                for vector_index in vector_indices:
                    if avg_vector is None:
                        avg_vector = self.data.getrow(vector_index).toarray()
                    else:
                        avg_vector = np.add(avg_vector, self.data.getrow(vector_index).toarray())

                centroids[centroid_index] = (avg_vector / len(vector_indices))[0]

            # Calculate purity: the fraction of documents matching the majority label of their cluster
            majority_sum = 0
            for cluster in clusters.values():
                # Count cluster items with respect to their labels
                labeled_document_counts = defaultdict(int)
                for document_index in cluster:
                    labeled_document_counts[document_labels[document_index]] += 1

                # Find majority class
                majority_class = (0, None)
                for label, count in labeled_document_counts.items():
                    if count > majority_class[0]:
                        majority_class = (count, label)

                majority_sum += majority_class[0]  # Add majority count to the global sum

            print('Purity at iteration {} is\t{}'.format(i, (majority_sum / self.data_length)))

        return clusters
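The purity printed each iteration is the fraction of documents whose label matches the majority label of their cluster. The following hand-sized example is not from the commit or the dataset; it only shows, with made-up clusters, how that number is obtained.

```python
from collections import defaultdict

# Made-up example: 10 documents split into two clusters, with their true labels.
document_labels = {0: 'python', 1: 'python', 2: 'java', 3: 'python', 4: 'java',
                   5: 'java', 6: 'java', 7: 'python', 8: 'java', 9: 'java'}
clusters = {0: {0, 1, 2, 3, 7}, 1: {4, 5, 6, 8, 9}}

majority_sum = 0
for members in clusters.values():
    counts = defaultdict(int)
    for doc in members:
        counts[document_labels[doc]] += 1
    majority_sum += max(counts.values())  # size of the majority class in this cluster

# Cluster 0 majority: 4 'python'; cluster 1 majority: 5 'java' -> purity = 9/10
print('Purity:', majority_sum / len(document_labels))  # 0.9
```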
src/main.py
@@ -0,0 +1,12 @@
from data_preparer import DataPreparer
from k_means import K_Means

# Prepare the data before calculations
dp = DataPreparer(file_path='dataset/stck_data.csv')

# Create document-term matrix
dt_matrix = dp.apply_tf_idf()

# Run K-Means algorithm
k_means = K_Means(len(dp.classes), iterations=10, data=dt_matrix, data_length=dp.document_count)
clusters = k_means.cluster(dp.document_labels)
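`cluster()` returns a `defaultdict` mapping each centroid index to the set of document row indices assigned to it. A short follow-up sketch, not part of the commit, could summarize the label mix per cluster using the `dp.document_labels` mapping built by `DataPreparer`:

```python
from collections import Counter

# Hypothetical follow-up: summarize the label distribution of each cluster
# produced above (clusters maps centroid index -> set of document row indices).
for centroid_index, document_indices in clusters.items():
    label_counts = Counter(dp.document_labels[i] for i in document_indices)
    print('Cluster {}: {} documents, top labels: {}'.format(
        centroid_index, len(document_indices), label_counts.most_common(3)))
```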