-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
75 lines (53 loc) · 1.97 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
"""This script serves as a skeleton template for synchronous AgentQL scripts."""
import logging
import agentql
from agentql.ext.playwright.sync_api import Page
from playwright.sync_api import sync_playwright
import polars as pl
from pydantic import BaseModel
from typing import List
# Set up logging: configure the root logger at INFO level and create this
# module's logger, which the functions below use for progress messages.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
# Set the URL to the desired website (the page the scraper opens)
URL = "https://webscraper.io/test-sites/e-commerce/allinone"
class Product(BaseModel):
    """Pydantic model describing a single scraped product record.

    ``write_response_to_xlsx`` builds one instance per scraped item to
    validate it before export. The field names match the fields requested
    by the AgentQL query in ``get_response``.
    """

    # Name of the product (``product_name`` in the query).
    product_name: str
    # Price of the product, validated as a float (``product_price``).
    product_price: float
    # Descriptive text for the product (``product_description``).
    product_description: str
class ProductList(BaseModel):
    """Pydantic model wrapping a list of ``Product`` records.

    NOTE(review): nothing in the visible code references this model;
    confirm whether it is still needed.
    """

    # Uses the same key name (``products``) as the list in the AgentQL query.
    products: List[Product]
def main(url: str = URL, output_path: str = "output.xlsx", *, headless: bool = False):
    """Scrape the product list from ``url`` and export it to an Excel file.

    Launches Chromium through Playwright, wraps a new page with AgentQL,
    navigates to ``url``, runs the product query via ``get_response`` and
    passes the ``products`` entries to ``write_response_to_xlsx``.

    Calling ``main()`` with no arguments behaves exactly as before: it opens
    the module-level ``URL`` in a visible browser and writes ``output.xlsx``.

    Args:
        url: Page to scrape. Defaults to the module-level ``URL``.
        output_path: Destination Excel file. Defaults to ``"output.xlsx"``.
        headless: Whether to run the browser without a visible window.
            Defaults to ``False`` (browser window shown).
    """
    with sync_playwright() as p, p.chromium.launch(headless=headless) as browser:
        # Create a new page in the browser and wrap it to get access to
        # AgentQL's querying API.
        page = agentql.wrap(browser.new_page())

        # Navigate to the desired URL.
        page.goto(url)

        response = get_response(page)

        # "products" is the top-level key used in the AgentQL query, so the
        # scraped items are found under that key in the response.
        data = response["products"]

        # Write the data to an Excel file. We are using Excel because we
        # need to support nested data.
        write_response_to_xlsx(data, output_path)
def get_response(page: Page):
    """Run the product query on ``page`` and return AgentQL's result.

    The query asks for a ``products`` list whose entries carry
    ``product_name``, ``product_price`` and ``product_description``.
    """
    products_query = """
{
products[] {
product_name,
product_price,
product_description
}
}
"""
    result = page.query_data(products_query)
    return result
def write_response_to_xlsx(data: List[dict], filename: str = "output.xlsx"):
    """Validate scraped product records and save them as an Excel workbook.

    Each item in ``data`` is passed through the ``Product`` model, the dumped
    records are loaded into a polars DataFrame, and that frame is written to
    ``filename``. Progress is reported through the module logger.
    """
    # Step 1: run every raw record through the Pydantic model.
    log.info("Validating data with Product model...")
    rows = []
    for raw_item in data:
        product = Product(**raw_item)
        rows.append(product.model_dump())

    # Step 2: build the tabular representation.
    log.info("Converting validated data to DataFrame...")
    frame = pl.DataFrame(rows)

    # Step 3: persist to disk.
    log.info(f"Writing data to {filename}...")
    frame.write_excel(filename)
    log.info("Data written successfully!")
# Run the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    main()