utils.py
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])

stop_words = stopwords.words('english')


def tokenize(text):
    """Normalize, tokenize, lemmatize, and stem a raw text string."""
    # Replace any URLs with a placeholder token so they don't pollute the vocabulary.
    # Raw string avoids invalid-escape-sequence warnings in the character classes.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Lowercase and strip out everything except alphanumeric characters.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize and drop English stop words.
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize, then stem, each remaining token.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).strip()
        clean_tok = stemmer.stem(clean_tok)
        clean_tokens.append(clean_tok)

    return clean_tokens
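

# Example usage sketch: the sample message below is hypothetical, chosen only
# to exercise the URL replacement, stop-word removal, and lemmatize/stem steps.
if __name__ == "__main__":
    sample = "Flooding reported near the river, see updates at http://example.com/alerts"
    # Prints the cleaned tokens, with the URL replaced by "urlplaceholder"
    # before normalization.
    print(tokenize(sample))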