Skip to content

Commit

Permalink
feat: add new map extraction feature
Browse files Browse the repository at this point in the history
  • Loading branch information
Aparup Ganguly authored and Aparup Ganguly committed Feb 24, 2025
1 parent 74b273f commit c8c4131
Show file tree
Hide file tree
Showing 6 changed files with 262 additions and 14 deletions.
4 changes: 3 additions & 1 deletion src/backend/base/langflow/components/firecrawl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from .firecrawl_crawl_api import FirecrawlCrawlApi
from .firecrawl_extract_api import FirecrawlExtractApi
from .firecrawl_map_api import FirecrawlMapApi
from .firecrawl_scrape_api import FirecrawlScrapeApi

# Public API of the firecrawl component package; kept sorted (ruff RUF022).
__all__ = ["FirecrawlCrawlApi", "FirecrawlExtractApi", "FirecrawlMapApi", "FirecrawlScrapeApi"]

Check failure on line 6 in src/backend/base/langflow/components/firecrawl/__init__.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (RUF022)

src/backend/base/langflow/components/firecrawl/__init__.py:6:11: RUF022 `__all__` is not sorted

Check failure on line 6 in src/backend/base/langflow/components/firecrawl/__init__.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (W292)

src/backend/base/langflow/components/firecrawl/__init__.py:6:96: W292 No newline at end of file
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
import uuid

from langflow.custom import Component
from langflow.io import DataInput, IntInput, MultilineInput, Output, SecretStrInput, StrInput
from langflow.schema import Data

Check failure on line 4 in src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (I001)

src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py:1:1: I001 Import block is un-sorted or un-formatted


class FirecrawlCrawlApi(Component):
display_name: str = "FirecrawlCrawlApi"
description: str = "Firecrawl Crawl API."
name = "FirecrawlCrawlApi"

output_types: list[str] = ["Document"]
documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl-post"
documentation: str = "https://docs.firecrawl.dev/v1/api-reference/endpoint/crawl-post"

inputs = [
SecretStrInput(
Expand Down Expand Up @@ -57,7 +55,7 @@ class FirecrawlCrawlApi(Component):

def crawl(self) -> Data:
try:
from firecrawl.firecrawl import FirecrawlApp
from firecrawl import FirecrawlApp
except ImportError as e:
msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
raise ImportError(msg) from e
Expand All @@ -67,9 +65,23 @@ def crawl(self) -> Data:
if scrape_options_dict:
params["scrapeOptions"] = scrape_options_dict

# Set default values for new parameters in v1
params.setdefault("maxDepth", 2)
params.setdefault("limit", 10000)
params.setdefault("allowExternalLinks", False)
params.setdefault("allowBackwardLinks", False)
params.setdefault("ignoreSitemap", False)
params.setdefault("ignoreQueryParameters", False)

# Ensure onlyMainContent is explicitly set if not provided
if "scrapeOptions" in params:
params["scrapeOptions"].setdefault("onlyMainContent", True)
else:
params["scrapeOptions"] = {"onlyMainContent": True}

if not self.idempotency_key:
self.idempotency_key = str(uuid.uuid4())

app = FirecrawlApp(api_key=self.api_key)
crawl_result = app.crawl_url(self.url, params=params, idempotency_key=self.idempotency_key)
return Data(data={"results": crawl_result})
return Data(data={"results": crawl_result})

Check failure on line 87 in src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (W292)

src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py:87:52: W292 No newline at end of file
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
from langflow.custom import Component
from langflow.io import (
DataInput,
MultilineInput,
Output,
SecretStrInput,
StrInput,

Check failure on line 7 in src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (F401)

src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py:7:5: F401 `langflow.io.StrInput` imported but unused
BoolInput,
)
from langflow.schema import Data

Check failure on line 10 in src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (I001)

src/backend/base/langflow/components/firecrawl/firecrawl_extract_api.py:1:1: I001 Import block is un-sorted or un-formatted

class FirecrawlExtractApi(Component):
    """Extract structured data from one or more URLs via the Firecrawl Extract API."""

    display_name: str = "FirecrawlExtractApi"
    description: str = "Firecrawl Extract API."
    name = "FirecrawlExtractApi"

    output_types: list[str] = ["Document"]
    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/extract"

    inputs = [
        SecretStrInput(
            name="api_key",
            display_name="API Key",
            required=True,
            password=True,
            info="The API key to use Firecrawl API.",
        ),
        MultilineInput(
            name="urls",
            display_name="URLs",
            required=True,
            info="List of URLs to extract data from (separated by commas or new lines).",
            tool_mode=True,
        ),
        MultilineInput(
            name="prompt",
            display_name="Prompt",
            required=True,
            info="Prompt to guide the extraction process.",
            tool_mode=True,
        ),
        DataInput(
            name="schema",
            display_name="Schema",
            required=False,
            info="Schema to define the structure of the extracted data.",
        ),
        BoolInput(
            name="enable_web_search",
            display_name="Enable Web Search",
            info="When true, the extraction will use web search to find additional data.",
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="extract"),
    ]

    def extract(self) -> Data:
        """Run a Firecrawl extraction over the configured URLs and return the result.

        Returns:
            Data: the raw extract-API response wrapped in a Data object.

        Raises:
            ImportError: if the `firecrawl-py` package is not installed.
            ValueError: if a required input is missing or the API call fails.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError as e:
            msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
            raise ImportError(msg) from e

        if not self.api_key:
            msg = "API key is required"
            raise ValueError(msg)
        if not self.urls:
            msg = "URLs are required"
            raise ValueError(msg)

        # Accept URLs separated by commas and/or newlines.
        urls = [url.strip() for url in self.urls.replace("\n", ",").split(",") if url.strip()]
        if not urls:
            msg = "No valid URLs provided"
            raise ValueError(msg)

        if not self.prompt:
            msg = "Prompt is required"
            raise ValueError(msg)

        prompt_text = self.prompt.strip()
        # Nudge the extraction toward exhaustive, structured output unless the
        # prompt already mentions a schema of its own.
        enhanced_prompt = prompt_text
        if "schema" not in prompt_text.lower():
            enhanced_prompt = f"{prompt_text}. Please extract all instances in a comprehensive, structured format."

        params = {
            "prompt": enhanced_prompt,
            "enableWebSearch": self.enable_web_search,
            "timeout": 300,  # seconds; multi-URL extraction can be slow
        }
        # NOTE: ignoreSitemap / includeSubdomains / showSources are deliberately
        # not forwarded: their BoolInputs are not exposed on this component, so
        # reading self.ignore_sitemap etc. would raise AttributeError at runtime.

        # Forward the schema only when it looks like a valid JSON-schema dict
        # (top-level "type" key); otherwise proceed best-effort without it.
        if self.schema:
            try:
                if isinstance(self.schema, dict) and "type" in self.schema:
                    params["schema"] = self.schema
                elif hasattr(self.schema, "dict") and "type" in self.schema.dict():
                    params["schema"] = self.schema.dict()
            except Exception:  # noqa: BLE001 - a malformed schema must not abort extraction
                pass

        try:
            app = FirecrawlApp(api_key=self.api_key)
            extract_result = app.extract(urls, params=params)
        except Exception as e:
            msg = f"Error during extraction: {e}"
            raise ValueError(msg) from e
        return Data(data=extract_result)
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from langflow.custom import Component
from langflow.io import (
DataInput,
MultilineInput,
Output,
SecretStrInput,
StrInput,
BoolInput,
)
from langflow.schema import Data

class FirecrawlMapApi(Component):
    """Generate a map of links for one or more websites via the Firecrawl Map API."""

    display_name: str = "FirecrawlMapApi"
    description: str = "Firecrawl Map API."
    name = "FirecrawlMapApi"

    output_types: list[str] = ["Document"]
    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/map"

    inputs = [
        SecretStrInput(
            name="api_key",
            display_name="API Key",
            required=True,
            password=True,
            info="The API key to use Firecrawl API.",
        ),
        MultilineInput(
            name="urls",
            display_name="URLs",
            required=True,
            info="List of URLs to create maps from (separated by commas or new lines).",
            tool_mode=True,
        ),
        BoolInput(
            name="ignore_sitemap",
            display_name="Ignore Sitemap",
            info="When true, the sitemap.xml file will be ignored during crawling.",
        ),
        BoolInput(
            name="sitemap_only",
            display_name="Sitemap Only",
            info="When true, only links found in the sitemap will be returned.",
        ),
        BoolInput(
            name="include_subdomains",
            display_name="Include Subdomains",
            info="When true, subdomains of the provided URL will also be scanned.",
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="map"),
    ]

    def map(self) -> Data:
        """Map every configured URL and return the merged set of discovered links.

        Returns:
            Data: ``{"success": True, "links": [...]}`` combining all URLs' links.

        Raises:
            ImportError: if the `firecrawl-py` package is not installed.
            ValueError: if no valid URLs were provided.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError as e:
            msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
            raise ImportError(msg) from e

        if not self.urls:
            msg = "URLs are required"
            raise ValueError(msg)

        # Accept URLs separated by commas and/or newlines.
        urls = [url.strip() for url in self.urls.replace("\n", ",").split(",") if url.strip()]
        if not urls:
            msg = "No valid URLs provided"
            raise ValueError(msg)

        params = {
            "ignoreSitemap": self.ignore_sitemap,
            "sitemapOnly": self.sitemap_only,
            "includeSubdomains": self.include_subdomains,
        }

        app = FirecrawlApp(api_key=self.api_key)

        # Map each URL in turn and merge the discovered links into one list.
        combined_links: list = []
        for url in urls:
            result = app.map_url(url, params=params)
            if isinstance(result, dict) and "links" in result:
                combined_links.extend(result["links"])

        return Data(data={"success": True, "links": combined_links})
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
)
from langflow.schema import Data


class FirecrawlScrapeApi(Component):
display_name: str = "FirecrawlScrapeApi"
description: str = "Firecrawl Scrape API."
Expand Down Expand Up @@ -42,29 +41,33 @@ class FirecrawlScrapeApi(Component):
display_name="Scrape Options",
info="The page options to send with the request.",
),
DataInput( # https://docs.firecrawl.dev/features/extract
DataInput(
name="extractorOptions",
display_name="Extractor Options",
info="The extractor options to send with the request.",
),
]

outputs = [
Output(display_name="Data", name="data", method="crawl"),
Output(display_name="Data", name="data", method="scrape"),
]

def scrape(self) -> Data:
    """Scrape a single URL with the Firecrawl Scrape API and return the result.

    Returns:
        Data: the raw scrape-API response wrapped in a Data object.

    Raises:
        ImportError: if the `firecrawl-py` package is not installed.
    """
    try:
        from firecrawl import FirecrawlApp
    except ImportError as e:
        msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
        raise ImportError(msg) from e

    # Pull the option dicts off the Data inputs; fall back to empty dicts so the
    # request still works when no options are wired in.
    params = self.scrapeOptions.__dict__.get("data", {}) if self.scrapeOptions else {}
    extractor_options_dict = self.extractorOptions.__dict__.get("data", {}) if self.extractorOptions else {}
    if extractor_options_dict:
        params["extract"] = extractor_options_dict

    # Defaults for the v1 scrape endpoint when the caller did not set them.
    params.setdefault("formats", ["markdown"])  # default output format
    params.setdefault("onlyMainContent", True)  # strip nav/boilerplate by default

    app = FirecrawlApp(api_key=self.api_key)
    results = app.scrape_url(self.url, params=params)
    return Data(data=results)
2 changes: 2 additions & 0 deletions src/frontend/src/utils/styleUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,8 @@ export const nodeIconsLucide: iconsType = {
FacebookChatLoader: FBIcon,
FirecrawlCrawlApi: FirecrawlIcon,
FirecrawlScrapeApi: FirecrawlIcon,
FirecrawlMapApi: FirecrawlIcon,
FirecrawlExtractApi: FirecrawlIcon,
GitbookLoader: GitBookIcon,
GoogleSearchAPIWrapper: GoogleIcon,
GoogleSearchResults: GoogleIcon,
Expand Down

0 comments on commit c8c4131

Please sign in to comment.