-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngramDFcreate.py
61 lines (54 loc) · 2.25 KB
/
ngramDFcreate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import csv
import re
from collections import Counter
from nltk import ngrams
stream_stopwords = ['list','of','custom','stopwords']
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
stop_words = stopwords.words('english')
stop_words_list = list(set(stop_words + stream_stopwords))
filepath = 'PATH/TO/PROJECT/FOLDER/'
df = pd.read_csv(filepath + 'CSV_INFILE.csv',sep='\t',dtype=str)
def remove_url(txt):
return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", str(txt)).split())
def clean_text(text, stop_words_list):
if text:
no_urls = remove_url(text)
no_xrtalines = no_urls.replace('\n','')
only_alphanum = re.sub(r'[A-Z]+(?![a-z])', '', no_xrtalines)
no_nums = re.sub(r'\d+', '', only_alphanum)
clean_list = [i.lower() for i in tokenizer.tokenize(no_nums)]
lemmatized_list = [lemmatizer.lemmatize(x) for x in clean_list]
no_stops = [word for word in lemmatized_list if not word in stop_words_list]
else:
no_stops = ['']
return ' '.join(no_stops)
def format_ngram_counter(n_grams):
plist = []
for x in n_grams:
value = n_grams[x]
num_ngrams =range(len(x))
terms = ', '.join([x[i] for i in num_ngrams])
ntype = str(len(x))
xlist = [terms,value,ntype]
plist.append(xlist)
pdf = pd.DataFrame(plist,columns = ['ngrams','count','type'])
return pdf
def text_ngram_process(texts):
token = nltk.word_tokenize(texts)
bigrams = format_ngram_counter(Counter(ngrams(token,2)))
trigrams = format_ngram_counter(Counter(ngrams(token,3)))
fourgrams = format_ngram_counter(Counter(ngrams(token,4)))
fivegrams = format_ngram_counter(Counter(ngrams(token,5)))
frames = [bigrams,trigrams,fourgrams,fivegrams]
ngram_df = pd.concat(frames)
return ngram_df
df['clean_text'] = df['text'].apply(lambda x: clean_tweet(x, stop_words2))
texts = df['clean_text'].tolist()
ngram_df = text_ngram_process(' '.join(texts))
ngram_df.sort_values(by='count', ascending=False,inplace=True)
ngram_df.to_csv(filepath +'NGRAM_DF_OUTFILE.csv', sep='\t',index=False, quoting=csv.QUOTE_ALL)