-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOntoPT.py
59 lines (44 loc) · 1.37 KB
/
OntoPT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 27 03:05:47 2020
@author: Eduardo Vicente
"""
import requests
import re,string
from nltk.corpus import stopwords
def find_uppercase(a_str):
    """Tell whether *a_str* contains at least one ASCII uppercase letter.

    Only the characters in ``string.ascii_uppercase`` (``A``-``Z``) count;
    accented capitals are not detected.

    Returns ``True`` as soon as a match is found, ``False`` otherwise
    (including for an empty string).
    """
    return any(ch in string.ascii_uppercase for ch in a_str)
def remove_stopwords(words):
    """Return the items of *words* that are not Portuguese stop words.

    The stop-word list comes from ``stopwords.words('portuguese')`` (NLTK).
    Matching is exact, so it is case-sensitive. The order of *words* and any
    duplicates are preserved, and a new list is always returned.
    """
    portuguese_stops = frozenset(stopwords.words('portuguese'))
    return [token for token in words if token not in portuguese_stops]
def synsets(word, *, timeout=None):
    """Query the OntoBusca servlet for *word* and return its cleaned tokens.

    The response text is stripped of markup tags and character entities,
    every ASCII punctuation or whitespace character is turned into a space,
    and the result is split on spaces. Empty tokens, tokens containing an
    ASCII uppercase letter (see ``find_uppercase``) and Portuguese stop words
    (see ``remove_stopwords``) are discarded.

    Args:
        word: Term sent to the service as the ``pesquisa`` query parameter.
        timeout: Optional timeout forwarded unchanged to ``requests.get``.
            The default ``None`` keeps the previous behaviour of waiting
            without a limit.

    Returns:
        list: The remaining tokens in response order, duplicates included.
        NOTE(review): presumably these are the words of the synsets that
        contain *word* -- confirm against the service's response format.

    Raises:
        Whatever ``requests.get`` raises; no exception is caught here. The
        HTTP status code is not checked.
    """
    payload = {"pesquisa": word}
    response = requests.get(
        "http://ontobusca.dei.uc.pt:8080/ontobusca/ServletXML",
        params=payload,
        timeout=timeout,
    )

    # Replace tags (<...>) and entities (&name; &#123; &#x1f;) with a space.
    markup = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    text = markup.sub(' ', response.text)

    # Map every punctuation/whitespace character to a space in a single pass
    # instead of one full-string str.replace() call per character.
    separators = string.punctuation + string.whitespace
    text = text.translate(str.maketrans(separators, " " * len(separators)))

    # Build a new list instead of calling list.remove() while iterating:
    # removing during iteration skipped the token that followed each removed
    # one, so some uppercase tokens survived.
    tokens = [
        token
        for token in text.split(" ")
        if token and not find_uppercase(token)
    ]
    return remove_stopwords(tokens)