scrape.py
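"""Scrape a Project Gutenberg author index and download Plain Text UTF-8 books.

The script fetches the browse-by-author page, follows each book link, looks
for the "Plain Text UTF-8" download link on the book page, and saves each
file locally with a .txt extension.
"""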
import os

import requests
from bs4 import BeautifulSoup

# Base URL for constructing full download links
base_url = "https://www.gutenberg.org"

# URL of the author index page to scrape
url = "https://www.gutenberg.org/browse/authors/c"

# Path to save the downloaded plain text files
save_path = "/home/adesoji/Downloads/visis-backend-assessment-Adesoji/bookpdf"

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Send a GET request to the URL
response = requests.get(url)
response.raise_for_status()  # Check that the request was successful

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all 'a' tags whose href attribute matches the book links
book_links = soup.select('li.pgdbetext a[href*="/ebooks/"]')

# Loop through each book link to find and download the "Plain Text UTF-8" file
for book_link in book_links:
    book_url = base_url + book_link['href']

    # Request the book page
    book_response = requests.get(book_url)
    book_response.raise_for_status()

    # Parse the book page
    book_soup = BeautifulSoup(book_response.content, 'html.parser')

    # Find the "Plain Text UTF-8" download link
    download_link = book_soup.select_one('a[href*=".txt.utf-8"]')
    if download_link:
        file_url = base_url + download_link['href']

        # Get the file name from the URL and change the extension to .txt
        file_name = os.path.basename(file_url).replace('.txt.utf-8', '.txt')
        save_file_path = os.path.join(save_path, file_name)

        # Download the file
        file_response = requests.get(file_url)
        file_response.raise_for_status()

        # Save the file to the specified path
        with open(save_file_path, 'wb') as file:
            file.write(file_response.content)
        print(f"Downloaded and saved: {save_file_path}")

print("All files have been downloaded and saved successfully.")

# ----- Alternative version: loop through several author pages with a retry mechanism -----
# import os
#
# import requests
# from bs4 import BeautifulSoup
# from tqdm import tqdm
# from tenacity import retry, wait_fixed, stop_after_attempt
#
# # Base URL for constructing full download links
# base_url = "https://www.gutenberg.org"
#
# # Author index pages to scrape (letters a, c, d, e and f)
# author_urls = [
#     f"https://www.gutenberg.org/browse/authors/{letter}" for letter in "acdef"
# ]
#
# # Path to save the downloaded plain text files
# save_path = "/home/adesoji/Downloads/visis-backend-assessment-Adesoji/bookpdf"
#
# # Create the directory if it doesn't exist
# os.makedirs(save_path, exist_ok=True)
#
#
# @retry(wait=wait_fixed(2), stop=stop_after_attempt(3))
# def download_file(file_url, save_file_path):
#     response = requests.get(file_url)
#     response.raise_for_status()
#     with open(save_file_path, 'wb') as file:
#         file.write(response.content)
#
#
# for url in author_urls:
#     # Send a GET request to the URL
#     response = requests.get(url)
#     response.raise_for_status()  # Check that the request was successful
#
#     # Parse the HTML content using BeautifulSoup
#     soup = BeautifulSoup(response.content, 'html.parser')
#
#     # Find all 'a' tags whose href attribute matches the book links
#     book_links = soup.select('li.pgdbetext a[href*="/ebooks/"]')
#
#     # Loop through each book link to find and download the "Plain Text UTF-8" file
#     for book_link in tqdm(book_links, desc=f"Processing {url}"):
#         book_url = base_url + book_link['href']
#
#         # Request the book page
#         book_response = requests.get(book_url)
#         book_response.raise_for_status()
#
#         # Parse the book page
#         book_soup = BeautifulSoup(book_response.content, 'html.parser')
#
#         # Find the "Plain Text UTF-8" download link
#         download_link = book_soup.select_one('a[href*=".txt.utf-8"]')
#         if download_link:
#             file_url = base_url + download_link['href']
#
#             # Get the file name from the URL and change the extension to .txt
#             file_name = os.path.basename(file_url).replace('.txt.utf-8', '.txt')
#             save_file_path = os.path.join(save_path, file_name)
#
#             # Download the file with the retry mechanism
#             try:
#                 download_file(file_url, save_file_path)
#                 print(f"Downloaded and saved: {save_file_path}")
#             except Exception as e:
#                 print(f"Failed to download {file_url}: {e}")
#
# print("All files have been downloaded and saved successfully.")
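
# To try the alternative version, install the extra dependencies
# (pip install tqdm tenacity) and uncomment the block above: tenacity's
# @retry decorator retries each failed download up to 3 times with a fixed
# 2-second wait, and tqdm draws a progress bar per author page.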