scrape_step_1_links.py
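
"""Step 1 of the scraping pipeline: sweep the bounding box of a place (from
scrape_step_0_location.main) on a roughly 0.1-degree grid, run a Google Maps
search at each grid point, and append every Maps result link found to an
Excel workbook.
"""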

import asyncio
import urllib.parse

import openpyxl
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

import scrape_step_0_location
import scrape_step_4_check


async def setup_browser():
    """Set up and return a configured Playwright browser instance."""
    playwright = await async_playwright().start()
    browser = await playwright.chromium.launch(headless=True)  # Set headless=False to watch the browser
    context = await browser.new_context()
    return playwright, browser, context
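
# Note: an equivalent, more structured alternative (a sketch, not what main()
# uses) is Playwright's async context manager, which handles start/stop for you:
#
#     async with async_playwright() as p:
#         browser = await p.chromium.launch(headless=True)
#         context = await browser.new_context()
#         ...  # browser and playwright are cleaned up on exit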


async def scroll_within_div(page, search_term):
    """Scroll within the results panel to load more results."""
    div_selector = f"div[aria-label='Results for {search_term} near me']"
    try:
        await page.wait_for_selector(div_selector, timeout=20000)
        scroll_amount = 10000  # Pixels to scroll on each pass
        for _ in range(50):  # Adjust the number of scroll actions as needed
            await page.evaluate(
                f"""
                var element = document.querySelector("{div_selector}");
                if (element) {{
                    element.scrollTop += {scroll_amount};
                }}
                """
            )
            await asyncio.sleep(0.1)  # Give newly loaded results time to render
    except PlaywrightTimeoutError:
        print(f"Timeout while loading or finding the results for {search_term}.")


async def scrape_links(page, search_term, lat, lng, worksheet):
    """Scrape result links for a given search term and location using Playwright."""
    try:
        # Encode the search term and construct the Maps search URL
        encoded_search_term = urllib.parse.quote_plus(search_term + " near me")
        url = f"https://www.google.com/maps/search/{encoded_search_term}/@{lat},{lng},14z/data=!4m2!2m1!6e1?entry=ttu"
        await page.goto(url)
        print(f"Loading URL: {page.url}")

        # Scroll through the results panel to load more data
        await scroll_within_div(page, search_term)

        # Extract and save every link that points back into Maps
        links = await page.query_selector_all("a[href]")
        found_links = 0
        for link in links:
            href = await link.get_attribute("href")
            if href and "maps" in href:
                worksheet.append([href])
                print(f"Found link: {href}")
                found_links += 1
        if found_links == 0:
            print(f"No valid links found for {search_term} at {lat}, {lng}")
    except Exception as e:
        print(f"Error: {e}")
        worksheet.append([f"Error: {e}"])


async def main(input_file_path=r"fx.xlsx", search_term="Aesthetics Clinic", place_name="Varanasi"):
    min_latitude, max_latitude, min_longitude, max_longitude = scrape_step_0_location.main(place_name)

    # Set up Playwright
    playwright, browser, context = await setup_browser()
    page = await context.new_page()
    workbook = None
    try:
        # Load the Excel workbook and worksheet
        workbook = openpyxl.load_workbook(input_file_path)
        worksheet = workbook.active

        # Sweep the bounding box on a 0.1-degree grid, scraping each cell
        lat = min_latitude
        while lat <= max_latitude:
            lng = min_longitude
            while lng <= max_longitude:
                await asyncio.sleep(1)
                if scrape_step_4_check.main(place_name, lat, lng):
                    await scrape_links(page, search_term, lat, lng, worksheet)
                lng = round(lng + 0.1, 2)  # Adjust longitude increment as needed
            lat = round(lat + 0.1, 2)  # Adjust latitude increment as needed

        # Save the workbook
        workbook.save(input_file_path)
        print("Links saved successfully to Excel.")
    except Exception as e:
        print(f"Error while scraping or saving workbook: {e}")
    finally:
        # Ensure all resources are properly closed
        if workbook:
            workbook.close()
        await context.close()
        await browser.close()
        await playwright.stop()


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("Script interrupted by user.")
    except Exception as e:
        print(f"Error occurred: {e}")