-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTask.py
116 lines (98 loc) · 4.25 KB
/
Task.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import time
import csv
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
def configure_driver():
    """Build a Chrome WebDriver for the scraping session.

    The browser is launched with the ``--start-maximized`` and
    ``--disable-notifications`` command-line flags.

    Returns:
        The ``webdriver.Chrome`` instance created from those options.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ("--start-maximized", "--disable-notifications"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)
def authenticate(driver, email, password):
    """Sign in to amazon.in through its two-step e-mail / password form.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        email: Text typed into the ``ap_email`` field.
        password: Text typed into the ``ap_password`` field.

    Raises:
        SystemExit: With exit status 1 when an expected form element does
            not appear within 10 seconds or cannot be found. The driver is
            quit before the exception is raised.
    """
    wait = WebDriverWait(driver, 10)
    driver.get("https://www.amazon.in/ap/signin")
    try:
        email_field = wait.until(
            EC.presence_of_element_located((By.ID, "ap_email"))
        )
        email_field.send_keys(email)
        driver.find_element(By.ID, "continue").click()

        password_field = wait.until(
            EC.presence_of_element_located((By.ID, "ap_password"))
        )
        password_field.send_keys(password)
        driver.find_element(By.ID, "signInSubmit").click()
    except (TimeoutException, NoSuchElementException):
        # find_element() raises NoSuchElementException rather than
        # TimeoutException, so both are treated as "login form not usable".
        print("Login elements not found. Check your credentials or internet connection.")
        driver.quit()
        # Raise SystemExit directly instead of calling exit(): exit() is a
        # helper injected by the `site` module (missing under `python -S`)
        # and would report a success status (0) for a failed login.
        raise SystemExit(1) from None
def scrape_category(driver, category_url, max_products=1500, timeout=10):
    """Scrape name, price and rating of the products listed on a category page.

    Args:
        driver: Selenium WebDriver used to load the page.
        category_url: URL of the category page to open.
        max_products: Upper bound on the number of product tiles processed.
        timeout: Seconds to wait for the first product tiles to appear.

    Returns:
        A list of dicts with the keys ``Name``, ``Price``, ``Discount``,
        ``Rating``, ``Ship From``, ``Sold By``, ``Description``,
        ``Number Bought`` and ``Images``. Only ``Name``, ``Price`` and
        ``Rating`` are read from the page; the other fields are not scraped
        here and are always ``None``. An empty list is returned when no
        product tile shows up within ``timeout`` seconds.
    """
    driver.get(category_url)
    product_xpath = "//div[contains(@class, 'zg-grid-general-faceout')]"

    # An explicit wait replaces a fixed sleep: it returns as soon as tiles
    # are present and raises TimeoutException when none ever appear.
    try:
        products = WebDriverWait(driver, timeout).until(
            EC.presence_of_all_elements_located((By.XPATH, product_xpath))
        )
    except TimeoutException:
        print("Failed to load products in the category. Check your connection or the URL.")
        return []

    scraped_data = []
    for product in products[:max_products]:
        try:
            name = product.find_element(
                By.CSS_SELECTOR, ".p13n-sc-truncate-desktop-type2"
            ).text
            price = product.find_element(By.CSS_SELECTOR, ".p13n-sc-price").text
            rating = product.find_element(By.CSS_SELECTOR, "span.a-icon-alt").text
        except NoSuchElementException:
            # Tiles missing any of the three scraped fields are skipped.
            continue

        scraped_data.append({
            "Name": name,
            "Price": price,
            "Discount": None,
            "Rating": rating,
            "Ship From": None,
            "Sold By": None,
            "Description": None,
            "Number Bought": None,
            "Images": None,
        })
    return scraped_data
def save_data(data, file_format="csv", filename="amazon_best_sellers"):
    """Write scraped rows to ``<filename>.csv`` or ``<filename>.json``.

    Args:
        data: List of dicts, one per row. May be empty.
        file_format: Either ``"csv"`` or ``"json"``.
        filename: Output path without the extension.

    Raises:
        ValueError: If ``file_format`` is neither ``"csv"`` nor ``"json"``.
    """
    # Validate before touching the filesystem so a typo in the format does
    # not pass silently without writing anything.
    if file_format not in ("csv", "json"):
        raise ValueError(f"Unsupported file format: {file_format!r}")

    if file_format == "json":
        with open(f"{filename}.json", "w", encoding="utf-8") as output_file:
            json.dump(data, output_file, indent=4)
        return

    # Column names are the union of all row keys in first-seen order, so
    # empty input (no first row) and rows with differing keys both work.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(f"{filename}.csv", "w", newline="", encoding="utf-8") as output_file:
        if not fieldnames:
            # Nothing to write: leave an empty file instead of raising.
            return
        dict_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(data)
def main():
    """Log in, scrape each configured best-seller category and save a CSV.

    The driver is always quit on exit, whether scraping succeeds or fails.
    Nothing is written when no product was scraped.
    """
    # Amazon credentials
    # NOTE(review): placeholders only. Real credentials should not be
    # committed to source control.
    email = "your_email@example.com"
    password = "your_password"
    categories = [
        "https://www.amazon.in/gp/bestsellers/kitchen/ref=zg_bs_nav_kitchen_0",
        "https://www.amazon.in/gp/bestsellers/shoes/ref=zg_bs_nav_shoes_0",
        "https://www.amazon.in/gp/bestsellers/computers/ref=zg_bs_nav_computers_0",
        "https://www.amazon.in/gp/bestsellers/electronics/ref=zg_bs_nav_electronics_0",
    ]

    driver = configure_driver()
    try:
        authenticate(driver, email, password)
        all_scraped_data = []
        for category_url in categories[:10]:  # Limit to 10 categories
            print(f"Scraping category: {category_url}")
            all_scraped_data.extend(scrape_category(driver, category_url))

        # Only save, and only report success, when there is something to
        # save. An empty result must not reach the CSV writer or be
        # announced as a successful run.
        if all_scraped_data:
            save_data(all_scraped_data, file_format="csv", filename="amazon_best_sellers")
            print("Scraping completed successfully. Data saved to file.")
        else:
            print("No products were scraped. Nothing was saved.")
    finally:
        driver.quit()


if __name__ == "__main__":
    main()