Skip to content

Commit

Permalink
Fix ZyteWebReader article parsing (run-llama#16401)
Browse files Browse the repository at this point in the history
  • Loading branch information
rakeshmehta0308 authored Oct 7, 2024
1 parent 0061d8b commit 54f5a0b
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 16 deletions.
75 changes: 63 additions & 12 deletions docs/docs/examples/data_connectors/WebPageDemo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -620,36 +620,78 @@
"pip install zyte-api\n",
"```\n",
"\n",
"To get access to your ZYTE API key please visit: https://www.zyte.com/zyte-api/"
"To get access to your ZYTE API key please visit: https://docs.zyte.com/zyte-api/get-started.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f49f22bf",
"id": "31e1aaa5-8bfc-452f-9c72-15def22f872f",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5871\n"
]
}
],
"source": [
"from llama_index.readers.web import ZyteWebReader\n",
"\n",
"# Required to run it in notebook\n",
"# import nest_asyncio\n",
"# nest_asyncio.apply()\n",
"\n",
"\n",
"# Initiate ZyteWebReader with your Zyte API key\n",
"zyte_reader = ZyteWebReader(\n",
" api_key=\"your ZYTE API key here\",\n",
" mode=\"article\", # or \"html-text\" or \"html\"\n",
")\n",
"\n",
"urls = [\n",
" \"https://www.zyte.com/blog/web-scraping-apis/\",\n",
" \"https://www.zyte.com/blog/system-integrators-extract-big-data/\",\n",
"]\n",
"\n",
"documents = zyte_reader.load_data(\n",
" urls=urls,\n",
")\n",
"\n",
"print(len(documents[0].text))"
]
},
{
"cell_type": "markdown",
"id": "c21ae76e-1b2c-480e-a58f-9f9becce15a6",
"metadata": {},
"source": [
"Browser rendering and JavaScript can be enabled by setting the corresponding parameters during initialization. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f49f22bf",
"metadata": {},
"outputs": [],
"source": [
"zyte_dw_params = {\n",
" \"browserHtml\": True, # Enable browser rendering\n",
" \"javascript\": True, # Enable JavaScript\n",
"}\n",
"\n",
"# Initiate ZyteWebReader with your Zyte API key\n",
"# Initiate ZyteWebReader with your Zyte API key and use default \"article\" mode\n",
"zyte_reader = ZyteWebReader(\n",
" api_key=\"Your Zyte API Key\",\n",
" api_key=\"your ZYTE API key here\",\n",
" download_kwargs=zyte_dw_params,\n",
")\n",
"\n",
"# Load documents from URLs as markdown\n",
"# Load documents from URLs\n",
"documents = zyte_reader.load_data(\n",
" urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
" urls=urls,\n",
")"
]
},
Expand All @@ -662,7 +704,7 @@
{
"data": {
"text/plain": [
"7150"
"4355"
]
},
"execution_count": null,
Expand All @@ -674,6 +716,14 @@
"len(documents[0].text)"
]
},
{
"cell_type": "markdown",
"id": "133d26d7-c26d-40b2-b08f-6c838fd3a6b6",
"metadata": {},
"source": [
"Set \"continue_on_failure\" to False if you'd like to stop when any request fails."
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -682,14 +732,15 @@
"outputs": [],
"source": [
"zyte_reader = ZyteWebReader(\n",
" api_key=\"Your API Key\",\n",
" api_key=\"your ZYTE API key here\",\n",
" mode=\"html-text\",\n",
" download_kwargs=zyte_dw_params,\n",
" continue_on_failure=False,\n",
")\n",
"\n",
"# Load documents from URLs as markdown\n",
"# Load documents from URLs\n",
"documents = zyte_reader.load_data(\n",
" urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
" urls=urls,\n",
")"
]
},
Expand All @@ -702,7 +753,7 @@
{
"data": {
"text/plain": [
"19554"
"17488"
]
},
"execution_count": null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,23 @@ class ZyteWebReader(BasePydanticReader):
mode: str
n_conn: int
download_kwargs: Optional[dict]
continue_on_failure: bool

def __init__(
self,
api_key: str,
mode: Literal["article", "html", "html-text"] = "article",
n_conn: int = 15,
download_kwargs: Optional[Dict[str, Any]] = None,
continue_on_failure: bool = True,
) -> None:
"""Initialize with file path."""
super().__init__(
api_key=api_key, mode=mode, n_conn=n_conn, download_kwargs=download_kwargs
api_key=api_key,
mode=mode,
n_conn=n_conn,
download_kwargs=download_kwargs,
continue_on_failure=continue_on_failure,
)
try:
from zyte_api import AsyncZyteAPI
Expand All @@ -81,12 +87,14 @@ def class_name(cls) -> str:
return "ZyteWebReader"

def _zyte_html_option(self) -> str:
if "browserHtml" in self.download_kwargs:
if self.download_kwargs and "browserHtml" in self.download_kwargs:
return "browserHtml"
return "httpResponseBody"

def _get_article(self, page: Dict) -> str:
return page["article"]["headline"] + "\n\n" + page["article"]["articleBody"]
headline = page["article"].get("headline", "")
article_body = page["article"].get("articleBody", "")
return headline + "\n\n" + article_body

def _zyte_request_params(self, url: str) -> dict:
request_params: Dict[str, Any] = {"url": url}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ license = "MIT"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.2.3"
version = "0.2.4"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Expand Down

0 comments on commit 54f5a0b

Please sign in to comment.