-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun.py
58 lines (46 loc) · 2.12 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv, set_key
from src.get_tweets_snscrape import get_tweets_by_term_since, get_tweets_by_user_since, merge_sns_files
from src.get_tweets_tweepy import TwitterAPI
def _require_env(name):
    """Return the stripped value of environment variable *name*.

    Raises:
        RuntimeError: if the variable is unset or contains only whitespace.
    """
    value = os.getenv(name)
    if value is None or not value.strip():
        raise RuntimeError(
            f"Required environment variable {name!r} is missing or empty; "
            "set it in the environment or in .env"
        )
    return value.strip()


def _split_terms(raw):
    """Split a comma-separated string into stripped, non-empty terms."""
    return [term.strip() for term in raw.split(',') if term.strip()]


def main():
    """Scrape tweet ids for the configured terms and fetch their details.

    Configuration is read from the environment after ``load_dotenv()``:

    * ``DATE_SINCE`` (required) - start date passed to the scraper; also
      used as the suffix of the statuses output path.
    * ``DATE_UNTIL`` (optional) - forwarded as ``until``; ``None`` if unset.
    * ``TERMS`` (required) - comma-separated search terms.
    * ``CONS_API_KEY``, ``CONS_API_SEC``, ``ACCESS_TOKEN``,
      ``ACCESS_SECRET`` - credentials handed to ``TwitterAPI`` unchanged.

    Raises:
        RuntimeError: if ``DATE_SINCE`` or ``TERMS`` is missing or empty.
    """
    # setup
    load_dotenv()
    # Validate up front: previously a missing TERMS raised AttributeError on
    # ``None.split`` and a missing DATE_SINCE only failed with TypeError when
    # building the output path, i.e. after the whole scrape had already run.
    date_since = _require_env('DATE_SINCE')
    date_until = os.getenv('DATE_UNTIL')
    terms = _split_terms(_require_env('TERMS'))
    if not terms:
        raise RuntimeError(
            "Environment variable 'TERMS' contains no usable search terms"
        )
    temp_data_path = "./temp-data/"

    # get past tweets with snscrape
    # NOTE: get_tweets_by_user_since (driven by the USERS variable) is
    # deliberately not called here - the users are already included in TERMS.
    get_tweets_by_term_since(terms, date_since,
                             temp_data_path, until=date_until)
    new_ids = merge_sns_files(temp_data_path)

    # remove duplicates first to save on tweepy limit
    # (dict.fromkeys keeps the first occurrence and preserves order)
    new_ids = list(dict.fromkeys(new_ids))
    # NOTE: as an alternative, the ids can be read from a previously merged
    # file (one id per line) and de-duplicated the same way.

    # tweepy to get details from snscrape tweet ids
    twitter_api = TwitterAPI(api_key=os.getenv('CONS_API_KEY'),
                             api_secret=os.getenv('CONS_API_SEC'),
                             acc_token=os.getenv('ACCESS_TOKEN'),
                             acc_secret=os.getenv('ACCESS_SECRET'))
    twitter_api.get_statuses(
        new_ids, is_extended=True, add_to_csv=True,
        filepath=temp_data_path + "tp-statuses-" + date_since)

    # TODO: store in postgres
    #   - does the table exist, create if not, use dataframe column names
    #   - loop INSERT and catch duplicate primary keys (id_str)
    # TODO: reset env for the next run, i.e. write DATE_SINCE = yesterday and
    #   DATE_UNTIL = today (both '%Y-%m-%d') back to .env via set_key.
# Run the scrape pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()