From 54f5a0bbea46c49f653f628f18964a9a0cb66455 Mon Sep 17 00:00:00 2001 From: Rakesh Mehta <46493063+rakeshmehta0308@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:40:55 +0100 Subject: [PATCH] Fix ZyteWebReader article parsing (#16401) --- .../data_connectors/WebPageDemo.ipynb | 75 ++++++++++++++++--- .../llama_index/readers/web/zyte_web/base.py | 14 +++- .../llama-index-readers-web/pyproject.toml | 2 +- 3 files changed, 75 insertions(+), 16 deletions(-) diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb index 3330cdea468cf..197f4a08fc72c 100644 --- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb +++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb @@ -620,15 +620,23 @@ "pip install zyte-api\n", "```\n", "\n", - "To get access to your ZYTE API key please visit: https://www.zyte.com/zyte-api/" + "To get access to your ZYTE API key please visit: https://docs.zyte.com/zyte-api/get-started.html" ] }, { "cell_type": "code", "execution_count": null, - "id": "f49f22bf", + "id": "31e1aaa5-8bfc-452f-9c72-15def22f872f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5871\n" + ] + } + ], "source": [ "from llama_index.readers.web import ZyteWebReader\n", "\n", @@ -636,20 +644,54 @@ "# import nest_asyncio\n", "# nest_asyncio.apply()\n", "\n", + "\n", + "# Initiate ZyteWebReader with your Zyte API key\n", + "zyte_reader = ZyteWebReader(\n", + " api_key=\"your ZYTE API key here\",\n", + " mode=\"article\", # or \"html-text\" or \"html\"\n", + ")\n", + "\n", + "urls = [\n", + " \"https://www.zyte.com/blog/web-scraping-apis/\",\n", + " \"https://www.zyte.com/blog/system-integrators-extract-big-data/\",\n", + "]\n", + "\n", + "documents = zyte_reader.load_data(\n", + " urls=urls,\n", + ")\n", + "\n", + "print(len(documents[0].text))" + ] + }, + { + "cell_type": "markdown", + "id": "c21ae76e-1b2c-480e-a58f-9f9becce15a6", + "metadata": {}, + "source": [ + "Browser rendering and javascript can be enabled by passing setting corresponding parameters during initialization. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f49f22bf", + "metadata": {}, + "outputs": [], + "source": [ "zyte_dw_params = {\n", " \"browserHtml\": True, # Enable browser rendering\n", " \"javascript\": True, # Enable JavaScript\n", "}\n", "\n", - "# Initiate ZyteWebReader with your Zyte API key\n", + "# Initiate ZyteWebReader with your Zyte API key and use default \"article\" mode\n", "zyte_reader = ZyteWebReader(\n", - " api_key=\"Your Zyte API Key\",\n", + " api_key=\"your ZYTE API key here\",\n", " download_kwargs=zyte_dw_params,\n", ")\n", "\n", - "# Load documents from URLs as markdown\n", + "# Load documents from URLs\n", "documents = zyte_reader.load_data(\n", - " urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n", + " urls=urls,\n", ")" ] }, @@ -662,7 +704,7 @@ { "data": { "text/plain": [ - "7150" + "4355" ] }, "execution_count": null, @@ -674,6 +716,14 @@ "len(documents[0].text)" ] }, + { + "cell_type": "markdown", + "id": "133d26d7-c26d-40b2-b08f-6c838fd3a6b6", + "metadata": {}, + "source": [ + "Set \"continue_on_failure\" to False if you'd like to stop when any request fails." + ] + }, { "cell_type": "code", "execution_count": null, @@ -682,14 +732,15 @@ "outputs": [], "source": [ "zyte_reader = ZyteWebReader(\n", - " api_key=\"Your API Key\",\n", + " api_key=\"your ZYTE API key here\",\n", " mode=\"html-text\",\n", " download_kwargs=zyte_dw_params,\n", + " continue_on_failure=False,\n", ")\n", "\n", - "# Load documents from URLs as markdown\n", + "# Load documents from URLs\n", "documents = zyte_reader.load_data(\n", - " urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n", + " urls=urls,\n", ")" ] }, @@ -702,7 +753,7 @@ { "data": { "text/plain": [ - "19554" + "17488" ] }, "execution_count": null, diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py index cf11bc5925bdc..10e59a733a604 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py @@ -44,6 +44,7 @@ class ZyteWebReader(BasePydanticReader): mode: str n_conn: int download_kwargs: Optional[dict] + continue_on_failure: bool def __init__( self, @@ -51,10 +52,15 @@ def __init__( mode: Literal["article", "html", "html-text"] = "article", n_conn: int = 15, download_kwargs: Optional[Dict[str, Any]] = None, + continue_on_failure: bool = True, ) -> None: """Initialize with file path.""" super().__init__( - api_key=api_key, mode=mode, n_conn=n_conn, download_kwargs=download_kwargs + api_key=api_key, + mode=mode, + n_conn=n_conn, + download_kwargs=download_kwargs, + continue_on_failure=continue_on_failure, ) try: from zyte_api import AsyncZyteAPI @@ -81,12 +87,14 @@ def class_name(cls) -> str: return "ZyteWebReader" def _zyte_html_option(self) -> str: - if "browserHtml" in self.download_kwargs: + if self.download_kwargs and "browserHtml" in self.download_kwargs: return "browserHtml" return "httpResponseBody" def _get_article(self, page: Dict) -> str: - return page["article"]["headline"] + "\n\n" + page["article"]["articleBody"] + headline = page["article"].get("headline", "") + article_body = page["article"].get("articleBody", "") + return headline + "\n\n" + article_body def _zyte_request_params(self, url: str) -> dict: request_params: Dict[str, Any] = {"url": url} diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml index c410a02d9d0f5..132eb64a4b03d 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml @@ -45,7 +45,7 @@ license = "MIT" maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"] name = "llama-index-readers-web" readme = "README.md" -version = "0.2.3" +version = "0.2.4" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"