-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreddit.py
91 lines (76 loc) · 3.23 KB
/
reddit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
from prawcore.exceptions import PrawcoreException
import praw
import os
from dotenv import load_dotenv
import time
# Load Reddit API credentials from a local .env file into the process
# environment, then read them. Each value is None if the variable is unset;
# praw will fail later at request time if credentials are missing.
load_dotenv()
client_id = os.getenv('REDDIT_CLIENT_ID')
client_secret = os.getenv('REDDIT_CLIENT_SECRET')
user_agent = os.getenv('REDDIT_USER_AGENT')
def scrape_reddit_comments(reddit: praw.Reddit, submission: praw.models.reddit.submission.Submission) -> list:
    """Fetch every comment on *submission*, retrying on HTTP 429 rate limits.

    Args:
        reddit: Authenticated praw.Reddit instance (kept for interface
            compatibility; not used directly in this function).
        submission: The submission whose full comment tree is scraped.

    Returns:
        A list of dicts with keys 'text', 'author', 'score', 'depth' and
        'created_utc'; an empty list on unrecoverable error or when the
        retry budget is exhausted.
    """
    max_attempts = 5
    current_attempt = 0
    while current_attempt < max_attempts:
        try:
            print("Loading comments...")
            # Expand every "MoreComments" placeholder so the whole tree is
            # available; limit=None means keep fetching until none remain.
            submission.comments.replace_more(limit=None)
            comments = []
            # Total count up front so progress can be reported as a percentage.
            total_comments = len(submission.comments.list())
            print(f"Found {total_comments} comments to process")
            for index, comment in enumerate(submission.comments.list(), 1):
                comment_data = {
                    'text': comment.body,
                    # Deleted/suspended accounts have author == None.
                    'author': comment.author.name if comment.author else '[deleted]',
                    'score': comment.score,
                    'depth': comment.depth,
                    'created_utc': comment.created_utc,
                }
                comments.append(comment_data)
                # Print progress every 100 comments.
                if index % 100 == 0:
                    progress = (index / total_comments) * 100
                    print(f"Progress: {progress:.1f}% ({index}/{total_comments} comments processed)")
            print(f"Completed! Total comments fetched: {len(comments)}")
            return comments
        except PrawcoreException as e:
            # BUG FIX: the PrawcoreException base class has no .response
            # attribute (only ResponseException subclasses carry one), so the
            # original `e.response` could itself raise AttributeError for
            # connection-level errors. getattr() handles both cases safely.
            response = getattr(e, 'response', None)
            if response is not None and response.status_code == 429:
                # Exponential backoff: 1, 2, 4, 8, 16 seconds.
                delay = 2 ** current_attempt
                print(f"Rate limit exceeded. Retrying in {delay} seconds...")
                time.sleep(delay)
                current_attempt += 1
            else:
                print(f"Error scraping comments: {e}")
                return []
    print("Max retry attempts reached. Could not fetch comments.")
    return []
def get_reddit_instance(client_id: str, client_secret: str, user_agent: str) -> praw.Reddit:
    """Create and return a praw.Reddit client from the given OAuth credentials."""
    return praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
    )
def main():
    """Prompt for a Reddit post URL, scrape its comments, and save them to CSV."""
    reddit_url = input("Enter Reddit post URL: ")
    # Initialize Reddit instance from module-level env credentials.
    reddit = get_reddit_instance(client_id, client_secret, user_agent)
    submission = reddit.submission(url=reddit_url)
    # Scrape all comments for the post.
    comments = scrape_reddit_comments(reddit, submission)
    # Convert to DataFrame and save as CSV.
    if comments:
        df = pd.DataFrame(comments)
        # Timestamped filename avoids clobbering output from earlier runs.
        filename = f"reddit_comments_{int(time.time())}.csv"
        df.to_csv(filename, index=False)
        # BUG FIX: original printed the literal text "(unknown)" instead of
        # interpolating the output filename.
        print(f"Comments saved to {filename}")
    else:
        print("No comments were scraped.")


if __name__ == "__main__":
    main()