-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping_blm_tweets.py
74 lines (61 loc) · 2.85 KB
/
scraping_blm_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
"""Scraping BLM Tweets.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1kl2d0Hd8lBDPtgEYfosLHHGNZsJP82Ge
### **Loading Libraries**
"""
# Install the scraper dependency before it is imported below.
# The notebook export used the IPython shell escape `!pip install ...`, which
# is a SyntaxError when this file is run as a plain Python script. Invoking
# pip through the current interpreter works in both a notebook and a script
# and guarantees the package lands in the environment that is running us.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "GetOldTweets3"]
)
import pandas as pd
import GetOldTweets3 as got
"""### **From Normal People**"""
# Search terms used for every query in this script: hashtag names (without
# the leading '#') plus the equivalent plain-text phrases.
tags = [
    "blm", "blacklivesmatter", "georgefloyd", "justiceforgeorgefloyd",
    "icantbreathe", 'black lives matter', 'george floyd',
]
since_date = '2020-05-25'  # George Floyd/Date of death
until_date = '2020-08-01'  # Date of Project Creation
count = 1500  # value handed to setMaxTweets() for each individual query
columns = ['Date', 'Name', 'Text', 'Hashtags']

# Run one "top tweets" search per tag and keep each result as its own frame.
# The frames are concatenated once after the loop instead of growing a
# DataFrame inside it, which avoids re-copying all earlier rows on every
# iteration.
tag_frames = []
for tg in tags:
    # Creation of query object
    tweetCriteria = (
        got.manager.TweetCriteria()
        .setQuerySearch(tg)
        .setTopTweets(True)
        .setSince(since_date)
        .setUntil(until_date)
        .setMaxTweets(count)
        .setEmoji("unicode")
    )
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    temp = [[tw.date, tw.username, tw.text, tw.hashtags] for tw in tweets]
    tag_frames.append(pd.DataFrame(temp, columns=columns))

# Skip empty results so they cannot influence the column dtypes; fall back to
# an empty, correctly-labelled frame when no query returned anything.
non_empty_frames = [frame for frame in tag_frames if not frame.empty]
if non_empty_frames:
    citizen_tweets = pd.concat(non_empty_frames, ignore_index=True)
else:
    citizen_tweets = pd.DataFrame(columns=columns)

# The same tweet is commonly returned for several tags. Keep its first
# occurrence: keep=False would discard *every* copy, silently removing exactly
# the tweets that match more than one search term.
citizen_tweets.drop_duplicates(subset="Text", keep="first", inplace=True)
citizen_tweets.reset_index(drop=True, inplace=True)
citizen_tweets.to_csv("blm_tweets.csv")
"""### **From News Channels**"""
# Twitter handles whose timelines are searched for each tag.
news_sources = [
    'nytimes', 'bbcbreaking', 'bbcnews', 'bbcworld',
    'theeconomist', 'reuters', 'wsj', 'financialtimes',
    'guardian', 'realdailywire', "cnn", "msnbc", "blazetv",
    "freespeechtv", "hgtv",
]
columns = ['Date', 'Name', 'Text', 'Hashtags']
count = 1500  # value handed to setMaxTweets() for each individual query
since_date = '2020-05-25'  # George Floyd/Date of death
until_date = '2020-08-01'  # Date of Project Creation

# One query per (tag, account) pair. `tags` is the search-term list defined
# earlier in this script. Results are collected and concatenated once after
# the loops rather than growing a DataFrame on every iteration.
news_frames = []
for tf in tags:
    for ns in news_sources:
        tweetCriteria = (
            got.manager.TweetCriteria()
            .setUsername(ns)
            .setQuerySearch(tf)
            .setSince(since_date)
            .setUntil(until_date)
            .setMaxTweets(count)
        )
        tweets = got.manager.TweetManager.getTweets(tweetCriteria)
        temp = [[tw.date, tw.username, tw.text, tw.hashtags] for tw in tweets]
        news_frames.append(pd.DataFrame(temp, columns=columns))

# Skip empty results so they cannot influence the column dtypes; fall back to
# an empty, correctly-labelled frame when no query returned anything.
non_empty_frames = [frame for frame in news_frames if not frame.empty]
if non_empty_frames:
    text_tweets = pd.concat(non_empty_frames, ignore_index=True)
else:
    text_tweets = pd.DataFrame(columns=columns)

# De-duplicate the *news* frame. The original de-duplicated `citizen_tweets`
# here (copy-paste slip), so blm_news.csv was written with every tweet that
# matched several tags repeated. keep="first" retains one copy of each tweet
# instead of discarding all of them.
text_tweets.drop_duplicates(subset="Text", keep="first", inplace=True)
text_tweets.reset_index(drop=True, inplace=True)

# Notebook previews: these only display output when run as notebook cells and
# have no effect when the file is executed as a script.
text_tweets.head()
text_tweets.tail()

text_tweets.to_csv("blm_news.csv")