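"""firecrawl_ex.py

Scrape a listings page with Firecrawl, extract structured fields from the raw
markdown with the OpenAI chat completions API, and save the results as raw
markdown, JSON, and CSV.

Requires a .env file (or environment) providing FIRECRAWL_API_KEY and OPENAI_APIKEY.
"""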
from firecrawl import FirecrawlApp
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import pandas as pd
from datetime import datetime


def scrape_data(url):
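    """Scrape a single URL with Firecrawl and return its markdown content."""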
    load_dotenv()
    # Initialize the FirecrawlApp with your API key
    app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))
    # Scrape a single URL, keeping only the main page content
    scraped_data = app.scrape_url(url, {'pageOptions': {'onlyMainContent': True}})
    # Check if 'markdown' key exists in the scraped data
    if 'markdown' in scraped_data:
        return scraped_data['markdown']
    else:
        raise KeyError("The key 'markdown' does not exist in the scraped data.")


def save_raw_data(raw_data, timestamp, output_folder='output'):
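    """Write the raw markdown to a timestamped .md file in the output folder."""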
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    # Save the raw markdown data with a timestamp in the filename
    raw_output_path = os.path.join(output_folder, f'rawData_{timestamp}.md')
    with open(raw_output_path, 'w', encoding='utf-8') as f:
        f.write(raw_data)
    print(f"Raw data saved to {raw_output_path}")


def format_data(data, fields=None):
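    """Use the OpenAI API to extract the requested fields from the scraped text and return them as parsed JSON."""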
    load_dotenv()
    # Instantiate the OpenAI client
    client = OpenAI(api_key=os.getenv('OPENAI_APIKEY'))
    # Assign default fields if none are provided
    if fields is None:
        fields = ["Address", "Real Estate Agency", "Price", "Beds", "Baths", "Sqft", "Home Type", "Listing Age", "Picture of home URL", "Listing URL"]
    # Define the system message content
    system_message = """You are an intelligent text extraction and conversion assistant. Your task is to extract structured information
from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text,
with no additional commentary, explanations, or extraneous information.
You may encounter cases where you cannot find the data for the fields you have to extract, or where the data is in a foreign language.
Please process the following text and provide the output in pure JSON format with no words before or after the JSON:"""
    # Define the user message content
    user_message = f"Extract the following information from the provided text:\nPage content:\n\n{data}\n\nInformation to extract: {fields}"
    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": user_message
            }
        ]
    )
    # Check that the response contains the expected data
    if response and response.choices:
        formatted_data = response.choices[0].message.content.strip()
        print(f"Formatted data received from API: {formatted_data}")
        try:
            parsed_json = json.loads(formatted_data)
        except json.JSONDecodeError as e:
            print(f"JSON decoding error: {e}")
            print(f"Formatted data that caused the error: {formatted_data}")
            raise ValueError("The formatted data could not be decoded into JSON.")
        return parsed_json
    else:
        raise ValueError("The OpenAI API response did not contain the expected choices data.")


def save_formatted_data(formatted_data, timestamp, output_folder='output'):
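    """Persist the extracted data as a timestamped JSON file and a flat CSV."""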
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    # Save the formatted data as JSON with a timestamp in the filename
    output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.json')
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(formatted_data, f, indent=4)
    print(f"Formatted data saved to {output_path}")
    # If the data is a dictionary with exactly one key, unwrap it to get the list of records
    if isinstance(formatted_data, dict) and len(formatted_data) == 1:
        key = next(iter(formatted_data))  # Get the single key
        formatted_data = formatted_data[key]
    # Convert the formatted data to a pandas DataFrame (wrap a lone dict in a list first)
    if isinstance(formatted_data, dict):
        formatted_data = [formatted_data]
    df = pd.DataFrame(formatted_data)
    # Save the DataFrame to a CSV file in the output folder
    csv_output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.csv')
    df.to_csv(csv_output_path, index=False)


if __name__ == "__main__":
    # Scrape a single URL
    url = 'https://www.nobroker.in/properties-for-sale-in-bangalore-bangalore'
    try:
        # Generate timestamp
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Scrape data
        raw_data = scrape_data(url)
        # Save raw data
        save_raw_data(raw_data, timestamp)
        # Format data
        formatted_data = format_data(raw_data)
        # Save formatted data
        save_formatted_data(formatted_data, timestamp)
    except Exception as e:
        print(f"An error occurred: {e}")