crawler.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from colorama import Fore, Style, init
import time
import random
import argparse
from user_agents import USER_AGENTS
import os
import signal
from blacklist import is_blacklisted, manage_blacklist
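# Local helper modules expected alongside this script:
#   user_agents.py -> USER_AGENTS, a list of User-Agent strings to rotate through
#   blacklist.py   -> is_blacklisted(domain) and the interactive manage_blacklist()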
main_logo = '''
\033[96m                 c r a w l e r\033[0m
\033[92m github.com/noarche/crawler \033[0m
\033[92m Build Date: Dec. 30 2024\033[0m
\033[93m Version 6.9\033[0m
'''
infoabt = '''
\033[92m This script crawls links. Originally created to gather links for Noisy.py\033[0m
\033[32m To view the noisy script, visit github.com/noarche/noisy\033[0m
\033[33m For more information, run with the -help flag.\033[0m
\033[33m To load domains from a list, run with -i textfile.txt or type 'filename.txt' instead of 'website.com'\033[0m
\033[96m Enter a starting link.\033[0m
\033[36m Enter a sleep time. Leave blank and press Enter for a very fast crawl.\033[0m
'''
print(main_logo)
print(infoabt)
init(autoreset=True)
output_file = 'sites_found.txt'
total_bandwidth = 0
MAX_REQUESTS_PER_LINK = 999999
DEFAULT_MAX_LINKS = 999999
exit_flag = False
restart_flag = False
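# restart_flag is set by the SIGINT handler so Ctrl+C returns to the main prompt
# instead of exiting; exit_flag is declared but never checked.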
def signal_handler(signum, frame):
global restart_flag
print(Fore.YELLOW + "\nInterrupt received. Returning to the main menu.")
restart_flag = True
signal.signal(signal.SIGINT, signal_handler)
def parse_arguments():
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(description="Crawl and extract links from provided URLs.")
parser.add_argument('-i', '--input', type=str, help="Path to a text file containing URLs (one per line).")
parser.add_argument('-help', action='store_true', help="Show detailed help information about the script.")
return parser.parse_args()
def show_help():
"""Display help information."""
help_text = """
This script crawls websites and extracts links matching the original TLD of the provided domains.
Features:
- Takes input as a comma-separated list of URLs or reads from a file (one URL per line).
- Allows setting a delay between requests.
- Limits the number of links to parse per URL.
- Saves found links to a file (sites_found.txt).
Usage:
- Provide URLs as input separated by commas, or type 'domains.txt' to read from a file.
- Use -i argument to specify a file containing domains.
"""
print(Fore.CYAN + help_text)
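# Example invocations:
#   python crawler.py                  # interactive prompts for links, delay and max links
#   python crawler.py -i domains.txt   # seed the crawl from a file, one URL per line
#   python crawler.py -help            # print the detailed help text above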
def prompt_for_links(file_path=None):
"""Prompt the user for starting links or exit command."""
if file_path:
try:
with open(file_path, 'r') as file:
                links = [line.strip() if line.strip().startswith("http") else "https://" + line.strip()
                         for line in file if line.strip()]
return links
except FileNotFoundError:
print(Fore.RED + f"File not found: {file_path}")
exit()
while True:
try:
user_input = input(Fore.CYAN + "Enter starting links (comma-separated) or type a '.txt' file name (type 'exit' to quit): ").strip()
if user_input.lower() == 'exit':
exit()
if user_input.lower().endswith('.txt'):
if os.path.isfile(user_input):
return prompt_for_links(user_input)
else:
print(Fore.RED + f"File not found: {user_input}")
continue
            links = [link.strip() if link.strip().startswith("http") else "https://" + link.strip()
                     for link in user_input.split(',') if link.strip()]
return links
except KeyboardInterrupt:
print(Fore.RED + "\nProcess interrupted. Returning to the main menu.")
return []
def prompt_for_delay():
"""Prompt the user for a delay between requests."""
while True:
try:
delay_input = input(Fore.CYAN + "Enter delay in seconds between requests (leave empty for 0.02, type 'exit' to quit): ").strip()
if delay_input.lower() == 'exit':
exit()
if delay_input == '':
return 0.02
delay = float(delay_input)
if delay >= 0:
return delay
print(Fore.RED + "Delay must be a non-negative number.")
except ValueError:
print(Fore.RED + "Invalid input. Please enter a number.")
def prompt_for_max_links():
"""Prompt the user for the maximum number of links to parse per URL."""
while True:
try:
max_links_input = input(Fore.CYAN + f"Enter the maximum number of links to parse per URL (leave empty for {DEFAULT_MAX_LINKS}): ").strip()
if max_links_input.lower() == 'exit':
exit()
if max_links_input == '':
return DEFAULT_MAX_LINKS
max_links = int(max_links_input)
if max_links > 0:
return max_links
print(Fore.RED + "The maximum number of links must be a positive integer.")
except ValueError:
print(Fore.RED + "Invalid input. Please enter an integer.")
def save_links(link):
"""Append a unique .net or .com link to sites_found.txt."""
with open(output_file, 'a') as file:
file.write(link + '\n')
def get_random_user_agent():
"""Return a random user agent from the USER_AGENTS list."""
return random.choice(USER_AGENTS)
# Offer interactive blacklist management (uses manage_blacklist from blacklist.py)
def prompt_for_blacklist():
"""Prompt user to manage the blacklist."""
while True:
manage_choice = input("Would you like to manage the blacklist? (yes/no): ").strip().lower()
if manage_choice == 'yes':
manage_blacklist()
return
elif manage_choice == 'no':
return
else:
print("Invalid input. Please type 'yes' or 'no'.")
# crawl_website skips blacklisted domains both before fetching and while extracting links
def crawl_website(url, visited_links, links_to_visit, original_tld):
    """Fetch a single page and queue every discovered link that matches the original TLD."""
global total_bandwidth
parsed_url = urlparse(url)
domain = parsed_url.netloc
if is_blacklisted(domain):
print(Fore.YELLOW + f"Skipping blacklisted domain: {domain}")
return False
try:
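        # Rotate the User-Agent on every request and use a short timeout so slow hosts
        # do not stall the crawl.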
headers = {"User-Agent": get_random_user_agent()}
response = requests.get(url, headers=headers, timeout=2.69)
response.raise_for_status()
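        # Rough bandwidth accounting: size of the decoded response body, in megabytes.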
bandwidth_used = len(response.text.encode('utf-8')) / (1024 * 1024)
total_bandwidth += bandwidth_used
soup = BeautifulSoup(response.text, 'html.parser')
for a_tag in soup.find_all('a', href=True):
href = a_tag['href']
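            # Skip links that point at static assets or binary downloads rather than pages.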
if any(href.endswith(ext) for ext in (
'.ico', '.png', '.jpg', '.webp', '.webm', '.pdf', '.gif',
'.doc', '.docx', '.svg', '.iso', '.ts', '.srt', '.jpeg',
'.json', '.onion', '.i2p', '.safetensors', '.rar', '.zip',
                '.gguf', '.ggml', '.shp', '.avi', '.mp3', '.wav',
'.mkv', '.mov', '.heif', '.heic', '.txt', '.xml', '.js',
'.m4b', '.mp4', '.m4a', '.flac', '.ogg', '.opus', '.avif',
'.hc', '.tc', '.xyz', '.exe', '.msi', '.tar', '.7z', '.tif',
'.css', '.csv'
)):
continue
full_url = urljoin(url, href)
parsed_url = urlparse(full_url)
domain = parsed_url.netloc
if is_blacklisted(domain):
print(Fore.YELLOW + f"Skipping blacklisted domain: {domain}")
continue
if parsed_url.scheme in ('http', 'https') and parsed_url.netloc.endswith(original_tld):
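                # Normalize to scheme://host/path (query strings and fragments are dropped)
                # before checking the visited and pending sets.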
clean_url = parsed_url.scheme + '://' + parsed_url.netloc + parsed_url.path
if clean_url not in visited_links and clean_url not in links_to_visit:
links_to_visit.add(clean_url)
except Exception as e:
print(Fore.RED + f"Error crawling {url}: {e}")
return False
return True
# Before crawling, allow the user to manage the blacklist
prompt_for_blacklist()
args = parse_arguments()
if args.help:
show_help()
exit()
while True:
try:
visited_links = set()
links_to_visit = set(prompt_for_links(args.input))
if not links_to_visit:
continue
delay = prompt_for_delay()
max_links_per_url = prompt_for_max_links()
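        # Crawl loop: pop a queued URL, fetch it, queue any same-TLD links it contains,
        # and append each successfully crawled URL to sites_found.txt.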
while links_to_visit and not restart_flag:
url = links_to_visit.pop()
request_count = 0
parsed_links_count = 0
original_tld = '.' + urlparse(url).netloc.split('.')[-1]
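            # e.g. "https://example.com/page" -> ".com"; only hosts ending in this TLD are followed.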
while url not in visited_links and request_count < MAX_REQUESTS_PER_LINK and parsed_links_count < max_links_per_url:
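                # Note: the URL is added to visited_links on the first pass, so this inner
                # loop effectively runs once per URL.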
print(Fore.BLUE + f"Crawling: {url}")
visited_links.add(url)
if crawl_website(url, visited_links, links_to_visit, original_tld):
save_links(url)
parsed_links_count += 1
print(Fore.RED + f"Total bandwidth used: {total_bandwidth:.2f} MB")
time.sleep(delay)
request_count += 1
if restart_flag:
restart_flag = False
print(Fore.CYAN + "Returning to the main menu.")
except KeyboardInterrupt:
print(Fore.YELLOW + "\nProcess interrupted. Returning to the main menu.")
restart_flag = True
continue
except Exception as e:
print(Fore.RED + f"\nScript crashed: {e}. Please provide a new link.")
continue