# finalwritetest.py
# Scrapes posts matching happy/sad keywords from a blog-style site and saves
# them to an Excel file.

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
from urllib.parse import urljoin
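
# Third-party dependencies assumed here: requests, beautifulsoup4, and pandas
# (plus openpyxl for the Excel export at the end of the script).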


def fetch_html(url):
    """Fetch the HTML content of a given URL; return parsed soup or None."""
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    if response.status_code == 200:
        return BeautifulSoup(response.content, 'html.parser')
    print(f"Failed to retrieve content from the URL: {url} (status {response.status_code})")
    return None
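

# Optional hardening (a sketch, not wired into the functions below): a
# requests Session that retries transient errors with backoff. The helper
# name and the retry values are illustrative assumptions, not part of the
# original script.
def make_retrying_session(total_retries=3, backoff_factor=0.5):
    """Return a Session that retries transient HTTP errors (illustrative)."""
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    session = requests.Session()
    retry = Retry(total=total_retries, backoff_factor=backoff_factor,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session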


def extract_post_content(post_html):
    """Extract the post content from the HTML."""
    content_tag = post_html.find('p')
    if content_tag:
        return content_tag.get_text(strip=True)
    return "No Content"


def extract_posts(soup, keyword, page_number, base_url):
    """Extract posts from the page, skipping bot-like or garbled entries."""
    posts = soup.find_all('div', class_='post')
    bot_indicators = ["download for free", "survey", "http", "https", "www.",
                      "torrent", "hack tool", "hack"]
    special_chars = set("ÜÄÝÞßþÙ")  # characters typical of mis-encoded spam
    data = []
    for post in posts:
        heading = post.find('h1')
        title_tag = heading.find('a') if heading else None
        title = title_tag.text.strip() if title_tag else "No Title"
        content_html = str(post.find('p'))
        # Skip bot-like content and check for special characters
        if (any(indicator in title.lower() for indicator in bot_indicators) or
                any(indicator in content_html.lower() for indicator in bot_indicators) or
                any(char in content_html for char in special_chars)):
            continue
        # Follow the "read more" link, if any, to fetch the full post body
        more_link = post.find('a', href=re.compile(r'/\d+'))
        if more_link:
            full_content_url = urljoin(base_url, more_link['href'])
            full_content_soup = fetch_html(full_content_url)
            content = extract_post_content(full_content_soup) if full_content_soup else "No Content"
        else:
            content = extract_post_content(post)
        # Parse the post timestamp out of the post details line
        details_tag = post.find('div', class_='postdetails')
        details = details_tag.get_text() if details_tag else ""
        date_time_match = re.search(r'on (\w+ \d{1,2}, \d{4} - \d{1,2}:\d{2} [ap]m)', details)
        date_time = datetime.strptime(date_time_match.group(1), '%B %d, %Y - %I:%M %p') if date_time_match else None
        if date_time:
            post_data = {
                'Keyword': keyword,
                'Page': page_number,
                'Title': title,
                'Post Content': content,
                'year': date_time.year,
                'month': date_time.strftime('%B'),
                'day_of_week': date_time.strftime('%A'),
                'Time': date_time.strftime('%H:%M'),
            }
            data.append(post_data)
    return data
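

# Illustration of the date parsing above, against a made-up details string
# (the real markup of the target site is assumed, not verified):
#
#   details = "Posted by admin on January 5, 2014 - 3:42 pm"
#   -> the regex captures "January 5, 2014 - 3:42 pm"
#   -> strptime(..., '%B %d, %Y - %I:%M %p') gives datetime(2014, 1, 5, 15, 42)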


def scrape_all_posts(keywords, base_url):
    """Scrape results pages for each keyword until a page yields no posts."""
    all_posts = []
    for keyword in keywords:
        page = 1
        while True:
            url = f"{base_url}/search/{keyword}/page/{page}"
            soup = fetch_html(url)
            if soup is None:
                print(f"Failed to fetch page {page} for keyword '{keyword}'.")
                break
            posts = extract_posts(soup, keyword, page, base_url)
            if not posts:
                print(f"No more relevant posts found for keyword '{keyword}' at page {page}. Stopping.")
                break
            all_posts.extend(posts)
            page += 1
    return all_posts
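

# A polite crawler would normally pause between page fetches; a minimal
# sketch (the delay value is an arbitrary assumption, not from this script):
#
#   import time
#   time.sleep(1.0)  # inside the while-loop, before fetching the next page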


# Configuration for scraping
base_url = "https://www.somewheretowrite.com"
happy_keywords = [
    "joyous", "celebration", "wonderful", "amazing", "delightful", "ecstatic",
    "blissful", "cheerful", "exuberant", "jubilant", "euphoric", "thrilled",
    "content", "elated", "gleeful", "grateful", "happy", "merry", "radiant",
    "sunny", "upbeat", "victorious", "vivacious", "zestful", "blessed",
    "fortunate", "lucky", "jolly", "smiling", "joy", "happiness", "excited",
    "positive",
]
sad_keywords = [
    "death", "sad", "depressed", "worst", "miserable", "hate", "unhappy",
    "tragic", "grief", "heartbroken", "sorrow", "melancholy", "gloomy",
    "grief-stricken", "despair", "disheartened", "tearful", "unfortunate",
    "bleak", "desolate", "forlorn", "dejected", "woeful", "anguish", "dismal",
    "unbearable", "painful", "distraught", "regretful", "bereaved", "pain",
    "suffering", "negative", "downcast",
]
keywords = happy_keywords + sad_keywords

# Start scraping process
all_posts = scrape_all_posts(keywords, base_url)

# Save data to a DataFrame and write it to Excel
df = pd.DataFrame(all_posts)
df.to_excel('posts_data.xlsx', index=False)
print("All data has been compiled and saved successfully.")