-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinguistic_analysis.py
170 lines (150 loc) · 4.96 KB
/
linguistic_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 22 17:12:00 2017
Linguistic analysis of tweets
@author: duc
"""
import utils
import re
import json
from nltk.corpus import cmudict
from tweepy import Cursor
from twitter_api_setup import get_twitter_client
from collections import Counter
def get_max_amount_tweets(user):
    """
    Fetch a user's tweets from the Twitter API and cache them on disk.

    Requests up to 3200 items of the user's timeline in extended tweet mode,
    keeps the full text of every tweet and dumps the resulting list as JSON
    to ``<user>_tweets.json`` (path relative to the working directory).

    The cache file is opened only after all tweets have been collected, so
    an error raised while fetching does not truncate an existing cache file.

    :param user: Twitter screen name whose timeline is requested
    :return: list with the full text of every fetched tweet
    """
    api = get_twitter_client()
    print("\nfetch", user,"'s tweets")
    # request to get most recent 3200 tweets
    timeline = Cursor(
        api.user_timeline, screen_name=user, tweet_mode="extended"
    ).items(3200)
    maxTweets = [tweet.full_text for tweet in timeline]
    # write to the file last: opening it with 'w' empties it immediately
    with open(user + '_tweets.json', 'w') as f:
        json.dump(maxTweets, f)
    print("done fetching", len(maxTweets),"tweets")
    return maxTweets
def get_tweets_from_file(user):
    """
    Load previously cached tweets from ``<user>_tweets.json``.

    :param user: name prefix of the cache file (the Twitter screen name)
    :return: whatever JSON value the file contains
    """
    with open(user + '_tweets.json', 'r') as cache:
        print('read tweets from file ' + user + '_tweets.json ...')
        loaded = json.load(cache)
        print('finish reading')
    return loaded
def get_average_exclamation_marks(tweets):
    """
    Return the mean number of exclamation marks per tweet.

    :param tweets: list of raw tweet texts (strings)
    :return: total count of "!" divided by the number of tweets, or 0.0
             for an empty list (instead of raising ZeroDivisionError)
    """
    if not tweets:
        return 0.0
    # a plain substring count is enough for a single literal character
    return sum(tweet.count("!") for tweet in tweets) / len(tweets)
def get_exclamation_marks(tweet):
    """
    Count the exclamation marks in one raw tweet text.

    :param tweet: tweet text as a string
    :return: number of "!" characters found
    """
    matches = re.findall(r"\!", tweet)
    return len(matches)
def get_average_question_marks(tweets):
    """
    Return the mean number of question marks per tweet.

    :param tweets: list of raw tweet texts (strings)
    :return: total count of "?" divided by the number of tweets, or 0.0
             for an empty list (instead of raising ZeroDivisionError)
    """
    if not tweets:
        return 0.0
    # a plain substring count is enough for a single literal character
    return sum(tweet.count("?") for tweet in tweets) / len(tweets)
#------ functions below require preprocessed and tokenized text ---------------
def get_average_word_characters(tweets):
    """
    Return the mean number of characters per word.

    :param tweets: preprocessed tweets, iterated as tweet -> sentence -> word
    :return: summed word lengths divided by the number of words, or 0.0
             when there is no word at all (instead of raising
             ZeroDivisionError)
    """
    wordLengths = [
        len(word)
        for tweet in tweets
        for sentence in tweet
        for word in sentence
    ]
    if not wordLengths:
        return 0.0
    return sum(wordLengths) / len(wordLengths)
def get_average_word_syllables(tweets):
    """
    Return the mean number of syllables per word.

    Syllables are counted by ``utils.get_word_syllables_offline`` using the
    dictionary obtained from ``cmudict.dict()``.

    :param tweets: preprocessed tweets, iterated as tweet -> sentence -> word
    :return: summed syllable counts divided by the number of words, or 0.0
             when there is no word at all (instead of raising
             ZeroDivisionError)
    """
    words = [
        word
        for tweet in tweets
        for sentence in tweet
        for word in sentence
    ]
    if not words:
        # nothing to count, so skip loading the pronouncing dictionary
        return 0.0
    pronouncingDict = cmudict.dict()
    syl = sum(
        utils.get_word_syllables_offline(word, pronouncingDict)
        for word in words
    )
    return syl / len(words)
def get_average_sentence_length(tweets):
    """
    Return the mean number of words per sentence over all tweets.

    :param tweets: preprocessed tweets, iterated as tweet -> sentence, where
                   ``len(sentence)`` is the sentence's word count
    :return: total word count divided by the number of sentences, or 0.0
             when there is no sentence (instead of raising
             ZeroDivisionError)
    """
    sentenceLengths = [
        len(sentence) for tweet in tweets for sentence in tweet
    ]
    if not sentenceLengths:
        return 0.0
    return sum(sentenceLengths) / len(sentenceLengths)
def get_sentence_length(tweet):
    """
    Return the mean number of words per sentence of a single tweet.

    :param tweet: one preprocessed tweet, iterated sentence by sentence,
                  where ``len(sentence)`` is the sentence's word count
    :return: total word count divided by the number of sentences, or 0.0
             for a tweet without sentences (instead of raising
             ZeroDivisionError)
    """
    sentenceLengths = [len(sentence) for sentence in tweet]
    if not sentenceLengths:
        return 0.0
    return sum(sentenceLengths) / len(sentenceLengths)
def get_average_tweet_length(tweets):
    """
    Return the mean number of words per tweet.

    :param tweets: list of preprocessed tweets, iterated as
                   tweet -> sentence, where ``len(sentence)`` is the
                   sentence's word count
    :return: total word count divided by the number of tweets, or 0.0 for
             an empty list (instead of raising ZeroDivisionError)
    """
    if not tweets:
        return 0.0
    wordCount = sum(len(sentence) for tweet in tweets for sentence in tweet)
    return wordCount / len(tweets)
def get_average_flesch_grade_level(tweets):
    """
    Return the mean Flesch grade level per tweet.

    The level of each tweet is computed by ``utils.get_flesch_grade_level``
    using the dictionary obtained from ``cmudict.dict()``.

    :param tweets: list of preprocessed tweets
    :return: summed grade levels divided by the number of tweets, or 0.0
             for an empty list (instead of raising ZeroDivisionError)
    """
    if not tweets:
        # nothing to grade, so skip loading the pronouncing dictionary
        return 0.0
    pronouncingDict = cmudict.dict()
    level = sum(
        utils.get_flesch_grade_level(tweet, pronouncingDict)
        for tweet in tweets
    )
    return level / len(tweets)
# inspired by Marco Bonzanini - Mastering Social Media Mining with Python,
# p. 74
def get_most_frequent_keywords(tweets, limit=25):
    """
    Return the most frequent words of the tweets, ignoring stopwords.

    Every tweet is passed through ``utils.remove_stopwords``; the words of
    all remaining sentences are then counted together.
    NOTE(review): assumes ``utils.remove_stopwords`` returns the tweet as an
    iterable of sentences, each an iterable of words - confirm in utils.

    :param tweets: preprocessed and tokenized tweets
    :param limit: how many of the most common words to return (default 25)
    :return: list of ``(word, count)`` pairs as produced by
             ``Counter.most_common``
    """
    counter = Counter()
    for tweet in tweets:
        # count sentence by sentence rather than flattening each tweet with
        # sum(tweet, []), which copies the growing list for every sentence
        for sentence in utils.remove_stopwords(tweet):
            counter.update(sentence)
    return counter.most_common(limit)
#------------------------------------------------------------------------------
def get_linguistic_analysis(user, fromFile):
    """
    Print a linguistic analysis of a user's tweets to stdout.

    The tweets are read from the local JSON cache or fetched from the
    Twitter API, retweets are removed, and the remaining tweets are
    preprocessed. Word, sentence and readability statistics are computed on
    the preprocessed tweets; question and exclamation mark averages are
    computed on the raw tweet texts.

    :param user: Twitter screen name of the account to analyse
    :param fromFile: if true, read ``<user>_tweets.json`` instead of calling
                     the API
    :return: None
    """
    if fromFile:
        tweets = get_tweets_from_file(user)
    else:
        tweets = get_max_amount_tweets(user)
    tweets = utils.remove_retweets(tweets)

    # preprocess every tweet exactly once and keep only non-empty results
    norm = []
    for tweet in tweets:
        if utils.is_retweet(tweet):
            continue
        processed = utils.preprocess(tweet)
        if len(processed):
            norm.append(processed)

    print("\nLinguistic Analysis of ", user, "'s tweets\n")
    if not norm:
        # the averages below divide by the number of words/sentences/tweets
        print("No tweets left to analyse")
        return
    print(
        "Average word length: ",
        get_average_word_characters(norm),
        " characters"
    )
    print("Average syllables per word: ", get_average_word_syllables(norm))
    print(
        "Average sentence length: ",
        get_average_sentence_length(norm),
        " words"
    )
    print("Average tweet length: ", get_average_tweet_length(norm), " words")
    print(
        "Average question marks per tweet: ",
        get_average_question_marks(tweets)
    )
    print(
        "Average exclamation marks per tweet: ",
        get_average_exclamation_marks(tweets)
    )
    print("Average flesch grade level: ", get_average_flesch_grade_level(norm))
    print("\nMost frequent 25 keywords:")
    for tag, count in get_most_frequent_keywords(norm):
        print("{}: {}".format(tag, count))
if __name__ == '__main__':
    # Script entry point: ask for a screen name, then analyse the tweets
    # read from the local JSON file rather than from the Twitter API.
    user = input(
        "Enter the Twitter username of the person you want to analyse:\n"
    )
    fromFile = True
    get_linguistic_analysis(user, fromFile)