Fix ZyteWebReader article parsing (run-llama#16401)

jzhao62 · Oct 7, 2024 · 54f5a0b · 54f5a0b
1 parent 0061d8b
commit 54f5a0b
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 16 deletions.
diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -620,36 +620,78 @@
     "pip install zyte-api\n",
     "```\n",
     "\n",
-    "To get access to your ZYTE API key please visit: https://www.zyte.com/zyte-api/"
+    "To get access to your ZYTE API key please visit: https://docs.zyte.com/zyte-api/get-started.html"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f49f22bf",
+   "id": "31e1aaa5-8bfc-452f-9c72-15def22f872f",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5871\n"
+     ]
+    }
+   ],
    "source": [
     "from llama_index.readers.web import ZyteWebReader\n",
     "\n",
     "# Required to run it in notebook\n",
     "# import nest_asyncio\n",
     "# nest_asyncio.apply()\n",
     "\n",
+    "\n",
+    "# Initiate ZyteWebReader with your Zyte API key\n",
+    "zyte_reader = ZyteWebReader(\n",
+    "    api_key=\"your ZYTE API key here\",\n",
+    "    mode=\"article\",  # or \"html-text\" or \"html\"\n",
+    ")\n",
+    "\n",
+    "urls = [\n",
+    "    \"https://www.zyte.com/blog/web-scraping-apis/\",\n",
+    "    \"https://www.zyte.com/blog/system-integrators-extract-big-data/\",\n",
+    "]\n",
+    "\n",
+    "documents = zyte_reader.load_data(\n",
+    "    urls=urls,\n",
+    ")\n",
+    "\n",
+    "print(len(documents[0].text))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c21ae76e-1b2c-480e-a58f-9f9becce15a6",
+   "metadata": {},
+   "source": [
+    "Browser rendering and JavaScript can be enabled by setting the corresponding parameters during initialization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f49f22bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "zyte_dw_params = {\n",
     "    \"browserHtml\": True,  # Enable browser rendering\n",
     "    \"javascript\": True,  # Enable JavaScript\n",
     "}\n",
     "\n",
-    "# Initiate ZyteWebReader with your Zyte API key\n",
+    "# Initiate ZyteWebReader with your Zyte API key and use default \"article\" mode\n",
     "zyte_reader = ZyteWebReader(\n",
-    "    api_key=\"Your Zyte API Key\",\n",
+    "    api_key=\"your ZYTE API key here\",\n",
     "    download_kwargs=zyte_dw_params,\n",
     ")\n",
     "\n",
-    "# Load documents from URLs as markdown\n",
+    "# Load documents from URLs\n",
     "documents = zyte_reader.load_data(\n",
-    "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+    "    urls=urls,\n",
     ")"
    ]
   },
@@ -662,7 +704,7 @@
     {
      "data": {
       "text/plain": [
-       "7150"
+       "4355"
       ]
      },
      "execution_count": null,
@@ -674,6 +716,14 @@
     "len(documents[0].text)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "133d26d7-c26d-40b2-b08f-6c838fd3a6b6",
+   "metadata": {},
+   "source": [
+    "Set \"continue_on_failure\" to False if you'd like to stop when any request fails."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -682,14 +732,15 @@
    "outputs": [],
    "source": [
     "zyte_reader = ZyteWebReader(\n",
-    "    api_key=\"Your API Key\",\n",
+    "    api_key=\"your ZYTE API key here\",\n",
     "    mode=\"html-text\",\n",
     "    download_kwargs=zyte_dw_params,\n",
+    "    continue_on_failure=False,\n",
     ")\n",
     "\n",
-    "# Load documents from URLs as markdown\n",
+    "# Load documents from URLs\n",
     "documents = zyte_reader.load_data(\n",
-    "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+    "    urls=urls,\n",
     ")"
    ]
   },
@@ -702,7 +753,7 @@
     {
      "data": {
       "text/plain": [
-       "19554"
+       "17488"
       ]
      },
      "execution_count": null,

diff --git a/...dex-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py b/...dex-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py
@@ -44,17 +44,23 @@ class ZyteWebReader(BasePydanticReader):
     mode: str
     n_conn: int
     download_kwargs: Optional[dict]
+    continue_on_failure: bool
 
     def __init__(
         self,
         api_key: str,
         mode: Literal["article", "html", "html-text"] = "article",
         n_conn: int = 15,
         download_kwargs: Optional[Dict[str, Any]] = None,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with file path."""
         super().__init__(
-            api_key=api_key, mode=mode, n_conn=n_conn, download_kwargs=download_kwargs
+            api_key=api_key,
+            mode=mode,
+            n_conn=n_conn,
+            download_kwargs=download_kwargs,
+            continue_on_failure=continue_on_failure,
         )
         try:
             from zyte_api import AsyncZyteAPI
@@ -81,12 +87,14 @@ def class_name(cls) -> str:
         return "ZyteWebReader"
 
     def _zyte_html_option(self) -> str:
-        if "browserHtml" in self.download_kwargs:
+        if self.download_kwargs and "browserHtml" in self.download_kwargs:
             return "browserHtml"
         return "httpResponseBody"
 
     def _get_article(self, page: Dict) -> str:
-        return page["article"]["headline"] + "\n\n" + page["article"]["articleBody"]
+        headline = page["article"].get("headline", "")
+        article_body = page["article"].get("articleBody", "")
+        return headline + "\n\n" + article_body
 
     def _zyte_request_params(self, url: str) -> dict:
         request_params: Dict[str, Any] = {"url": url}

diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -45,7 +45,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.2.3"
+version = "0.2.4"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"