-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
97 lines (63 loc) · 3.03 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import CranReader as CR
import Preprocessing as PR
import VectorSpaceModel as VSM
import LanguageModel as LM
if __name__ == '__main__':
    # Script entry point: load the Cranfield documents and queries, run both
    # through the preprocessing step, then rank the documents for one chosen
    # query with the Vector Space Model and with the Language Model.

    # Load the document collection.
    corpus_reader = CR.CranReader('./CRANFIELD/cran.all.1400', "read_text")
    corpus_reader.read()

    # Preprocess the document collection.
    corpus_prep = PR.Preprocessing(corpus_reader.txts)
    corpus_prep.perform_preprocessing()

    # Load the queries.
    query_reader = CR.CranReader('./CRANFIELD/cran.qry', "read_text")
    query_reader.read()

    # Preprocess the queries.
    query_prep = PR.Preprocessing(query_reader.txts)
    query_prep.perform_preprocessing()

    # Pick the query to run. `query_index` is 1-based, while the lists filled
    # by the reader and the preprocessor are indexed from 0.
    query_index = 8
    query_pos = query_index - 1
    clean_query = query_prep.clean_corpus[query_pos]
    print("Clean query = ", clean_query)
    print("Initial query = ", query_reader.txts[query_pos])

    # Number of best matches displayed for each model.
    top_k = 3

    # Build the Vector Space Model on the preprocessed documents.
    print("Creating Vector Space Model with preprocessed corpus")
    vector_model = VSM.VectorSpaceModel(corpus_prep.clean_corpus)
    vector_model.calculate_inverted_index()

    # Query the VSM. The result is sliced and its items are used as positions
    # into the raw documents (presumably ordered best match first; confirm
    # against VectorSpaceModel.calculate_cos_similarity).
    vsm_ranking = vector_model.calculate_cos_similarity(clean_query)

    # Show the first matches from the VSM, each wrapped in a one-item list.
    for doc_pos in vsm_ranking[:top_k]:
        print([corpus_reader.txts[doc_pos]])

    # Build the Language Model on the preprocessed documents.
    print("Creating Language Model with preprocessed corpus")
    language_model = LM.LanguageModel(
        corpus_prep.stem_clean_tokens,
        corpus_prep.clean_corpus,
    )
    language_model.calculate_TF_IDF()
    lm_ranking = language_model.query_likelihood(
        query_prep.stem_clean_tokens[query_pos]
    )

    # Show the first matches from the query-likelihood ranking.
    for doc_pos in lm_ranking[:top_k]:
        print([corpus_reader.txts[doc_pos]])
if __name__ == '__test_main__':
    # Manual smoke test on a tiny inline corpus. This branch is skipped both
    # when the file is run as a script (where __name__ is '__main__') and on a
    # regular import; it only executes if the module is run under the name
    # '__test_main__'.
    sample_docs = [
        'In computer science artificial intelligence sometimes called machine intelligence is intelligence demonstrated by machines',
        'Experimentation calculation and Observation is called science',
        'Physics is a natural science that involves the study of matter and its motion through space and time, along with related concepts such as energy and force',
        'In mathematics and computer science an algorithm is a finite sequence of well-defined computer-implementable instructions',
        'Chemistry is the scientific discipline involved with elements and compounds composed of atoms, molecules and ions',
        'Biochemistry is the branch of science that explores the chemical processes within and related to living organisms',
        'Sociology is the study of society, patterns of social relationships, social interaction, and culture that surrounds everyday life',
    ]
    sample_queries = ['computer science']

    # Preprocess the sample documents.
    docs_prep = PR.Preprocessing(sample_docs)
    docs_prep.perform_preprocessing()

    # Build the Vector Space Model on the preprocessed sample documents.
    vector_model = VSM.VectorSpaceModel(docs_prep.clean_corpus)
    vector_model.vectorizer()

    # Preprocess the sample query.
    queries_prep = PR.Preprocessing(sample_queries)
    queries_prep.perform_preprocessing()

    # Run the similarity computation for the single query; the returned value
    # is not used here.
    first_clean_query = queries_prep.clean_corpus[0]
    vector_model.calculate_cos_similarity(first_clean_query)

    # Print the cleaned query on its own, then the whole cleaned query list
    # (both lines use the same label).
    print("Clean query = ", first_clean_query)
    print("Clean query = ", queries_prep.clean_corpus)