From 19892efc3f6297757035d5d73d8380e7b28c40fe Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Thu, 3 Oct 2024 09:30:35 +0100 Subject: [PATCH 01/10] Add zyte serp --- .../llama-index-readers-zyte-serp/.gitignore | 153 ++++++++++++++++++ .../llama-index-readers-zyte-serp/BUILD | 7 + .../llama-index-readers-zyte-serp/Makefile | 17 ++ .../llama-index-readers-zyte-serp/README.md | 53 ++++++ .../llama_index/readers/zyte_serp/__init__.py | 4 + .../llama_index/readers/zyte_serp/base.py | 82 ++++++++++ .../pyproject.toml | 55 +++++++ .../requirements.txt | 1 + .../tests/__init__.py | 0 .../tests/test_readers_zyte-serp.py | 0 10 files changed, 372 insertions(+) create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/.gitignore create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/Makefile create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte-serp.py diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/.gitignore b/llama-index-integrations/readers/llama-index-readers-zyte-serp/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/.gitignore @@ 
-0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD b/llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD new file mode 100644 index 0000000000000..2d3d88d1eab9c --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD @@ -0,0 +1,7 @@ +poetry_requirements( + name="poetry", +) + +python_requirements( + name="reqs", +) diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/Makefile b/llama-index-integrations/readers/llama-index-readers-zyte-serp/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md b/llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md new file mode 100644 index 0000000000000..65bb162d364ee --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md @@ -0,0 +1,53 @@ +# LlamaIndex Readers Integration: Zyte-Serp + +ZyteSerp can be used to add organic search results from Google Search. It takes a `query` and returns top search results urls. + +## Instructions for ZyteSerpReader + +### Setup and Installation + +`pip install llama-index-readers-zyte-serp` + +1. **Install zyte-api Package**: Ensure the `zyte-api` package is installed to use the ZyteSerpReader. Install it via pip with the following command: + + ```bash + pip install zyte-api + ``` + +2. **API Key**: Secure an API key from [Zyte](https://www.zyte.com/zyte-api/) to access the Zyte services. + +### Using ZyteSerpReader + +- **Initialization**: Initialize the ZyteWebReader by providing the API key, the desired mode of operation (`article`, `html-text`, or `html`), and any optional parameters for the Zyte API. + + ```python + from llama_index.readers.zyte_serp import ZyteWebReader + + zyte_serp = ZyteSerpReader( + api_key="your_api_key_here", + extract_from="httpResponseBody", # or "browserHtml" + ) + ``` + +- **Loading Data**: To load data, use the `load_data` method with the URLs you wish to process. + +```python +documents = zyte_serp.load_data(query="llama index docs") +``` + +### Example Usage + +Here is an example demonstrating how to initialize the ZyteWebReader, load document from a URL. 
+ +```python +# Initialize the ZyteSerpReader with your API key +zyte_serp = ZyteSerpReader( + api_key="your_api_key_here", # Replace with your actual API key +) + +# Load documents from Paul G +documents = zyte_serp.load_data(urls="llama index docs") + +# Display the document +print(documents) +``` diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/__init__.py b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/__init__.py new file mode 100644 index 0000000000000..a458b33875961 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/__init__.py @@ -0,0 +1,4 @@ +from llama_index.readers.zyte_serp.base import ZyteSerpReader + + +__all__ = ["ZyteSerpReader"] diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py new file mode 100644 index 0000000000000..2a6fd42d294a8 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py @@ -0,0 +1,82 @@ +from typing import Literal, Optional +from pydantic import Field + +from llama_index.core.readers.base import BasePydanticReader +from llama_index.core.schema import Document + + +class ZyteSerpReader(BasePydanticReader): + """Get google search results URLs for a search query. + + Args: + api_key: Zyte API key. + extract_from: Determines the mode while extracting the . + It can take one of the following values: 'html', 'html-text', 'article' + + Example: + .. 
code-block:: python + + from llama_index.readers.zyte_serp import ZyteSerpReader + + reader = ZyteSerpReader( + api_key="ZYTE_API_KEY", + ) + docs = reader.load_data( + "search query", + ) + + Zyte-API reference: + https://www.zyte.com/zyte-api/ + + """ + + client: Optional[object] = Field(None) + api_key: str + extract_from: str + + def __init__( + self, + api_key: str, + extract_from: Literal["httpResponseBody", "browserHtml"] = "httpResponseBody", + ) -> None: + """Initialize with file path.""" + super().__init__( + api_key=api_key, + extract_from=extract_from, + ) + try: + from zyte_api import ZyteAPI + from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT + + except ImportError: + raise ImportError( + "zyte-api package not found, please install it with " + "`pip install zyte-api`" + ) + + user_agent = f"llama-index-zyte-api/{PYTHON_ZYTE_API_USER_AGENT}" + self.client = ZyteAPI( + api_key=api_key, + user_agent=user_agent, + ) + + def _serp_url(self, query: str): + base_url = "https://www.google.com/search?q=" + return base_url + query.replace(" ", "+") + + def load_data(self, query: str): + serp_url = self._serp_url(query) + serp_request = { + "url": serp_url, + "serp": True, + "serpOptions": {"extractFrom": self.extract_from}, + } + results = self.client.get(serp_request) + docs = [] + for result in results["serp"]["organicResults"]: + doc = Document( + text=result["url"], + metadata={"name": result["name"], "rank": result["rank"]}, + ) + docs.append(doc) + return docs diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml new file mode 100644 index 0000000000000..f630ef09c3d45 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml @@ -0,0 +1,55 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden 
= true +# [tool.llamahub] +contains_example = false +import_path = "llama_index.readers.zyte_serp" +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub.class_authors] +ZyteSerpReader = "rakeshmehta0308" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index readers zyte-serp integration" +license = "MIT" +name = "llama-index-readers-zyte-serp" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.0" + +[tool.poetry.group.dev.dependencies] +black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} +codespell = {extras = ["toml"], version = ">=v2.2.6"} +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt b/llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt new file mode 100644 index 0000000000000..0773eeff4d0e5 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt @@ -0,0 +1 @@ +zyte-api diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/__init__.py 
b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte-serp.py b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte-serp.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 3982a61843220114b6c05db0a75dc9aed905beee Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Thu, 3 Oct 2024 09:37:12 +0100 Subject: [PATCH 02/10] Add serp example notebook --- .../data_connectors/ZyteSerpDemo.ipynb | 317 ++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb diff --git a/docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb b/docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb new file mode 100644 index 0000000000000..d0eb3108e71a0 --- /dev/null +++ b/docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb @@ -0,0 +1,317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "39207008-8ea1-4a0e-bbe1-a8b5e73f0de0", + "metadata": {}, + "source": [ + "# Zyte Serp Reader" + ] + }, + { + "cell_type": "markdown", + "id": "d8503fec-8c5f-49cd-acb9-a128b701ddc6", + "metadata": {}, + "source": [ + "Zyte Serp Reader allows you to access the organic results from google search. Given a query string, it provides the URLs of the top search results and the text string associated with those." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98850676-106a-484e-85df-eaf6e1ea52d0", + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install llama-index llama-index-readers-zyte-serp llama-index-readers-web" + ] + }, + { + "cell_type": "markdown", + "id": "50861c50-b62c-4c13-8d38-9562a4c4093e", + "metadata": {}, + "source": [ + "In this notebook we show how Zyte Serp Reader (along with web reader) can be used to collect information about a particular topic. 
Given these documents we can perform queries on this topic. \n", + "\n", + "Recently the Govt. of Ireland announced its fiscal budget for 2025 and here we show how we can query information regarding the budget. First we get the relevant information using the Zyte Serp Reader, then the information from these URLs is extracted using web reader and finally queries are answered using an OpenAI ChatGPT model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a5f674f-5be8-48da-b447-63edd84a6779", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from llama_index.readers.zyte_serp import ZyteSerpReader\n", + "from llama_index.readers.web.zyte_web.base import ZyteWebReader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d787224-245e-41fa-aa6e-eca68d06d92c", + "metadata": {}, + "outputs": [], + "source": [ + "# This is needed to run it in juypter notebook\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86685d68-8b88-4883-afae-4767c41b1b88", + "metadata": {}, + "outputs": [], + "source": [ + "zyte_api_key = os.environ.get(\"ZYTE_API_KEY\")" + ] + }, + { + "cell_type": "markdown", + "id": "4a45e4ae-2ee5-41e4-aa23-a5b927154cdb", + "metadata": {}, + "source": [ + "### Get relevant resources (using ZyteSerp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52ce02e0-9ed0-452a-876d-ecaab50e94dd", + "metadata": {}, + "outputs": [], + "source": [ + "topic = \"Ireland Budget 2025\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1eaf5a9c-c597-4cff-9446-287529b9e15a", + "metadata": {}, + "outputs": [], + "source": [ + "serp_reader = ZyteSerpReader(api_key=zyte_api_key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6c5e727-9d1c-470c-a839-690d433ca3cf", + "metadata": {}, + "outputs": [], + "source": [ + "search_results = serp_reader.load_data(topic)" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "id": "5c838258-219a-4ffb-96a3-263a0f08829e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(search_results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03ef22f3-295a-4e46-820b-357559d8324c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://www.gov.ie/en/publication/e8315-budget-2025/\n", + "{'name': 'Budget 2025', 'rank': 1}\n", + "https://www.citizensinformation.ie/en/money-and-tax/budgets/budget-2025/\n", + "{'name': 'Budget 2025', 'rank': 2}\n", + "https://www.gov.ie/en/publication/cb193-your-guide-to-budget-2025/\n", + "{'name': 'Your guide to Budget 2025', 'rank': 3}\n", + "https://www.irishtimes.com/your-money/2024/10/01/budget-2025-ireland-main-points/\n", + "{'name': 'Budget 2025 main points: Energy credits, bonus welfare ...', 'rank': 4}\n" + ] + } + ], + "source": [ + "for r in search_results[:4]:\n", + " print(r.text)\n", + " print(r.metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f02511f-6edc-4022-a5b4-ca4b5dbbd9d8", + "metadata": {}, + "outputs": [], + "source": [ + "urls = [r.text for r in search_results]" + ] + }, + { + "cell_type": "markdown", + "id": "ec61510f-3717-4805-b9b2-5af7405bc4f1", + "metadata": {}, + "source": [ + "Seems we have a list of relevant URLs with regard to our topic (\"Ireland Budget 2025\"). Metadata also shows the text and rank associated with the search result entry. Next we get the content of these webpages using web reader." 
+ ] + }, + { + "cell_type": "markdown", + "id": "edb07727-2111-4aa7-b52f-51943a42f4bd", + "metadata": {}, + "source": [ + "### Get topic content" + ] + }, + { + "cell_type": "markdown", + "id": "f0249f60-3262-46f3-bfea-d8299e55faa9", + "metadata": {}, + "source": [ + "Given the urls of the webpages which contain information about the topic, we get the content. Since the webpages contain a lot of non-relevant content, we can obtain the filtered content using the \"article\" mode of the ZyteWebReader which returns only the article content from the webpage." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20426fd7-5b8d-40e0-aeb9-b16ee58ea036", + "metadata": {}, + "outputs": [], + "source": [ + "web_reader = ZyteWebReader(api_key=zyte_api_key, mode=\"article\")\n", + "documents = web_reader.load_data(urls)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e7f2b75-d583-4f86-9d9d-9bb7cfc58d1d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Budget 2025 - Tax Highlights Ireland\n", + "\n", + "Budget 2025 announced on 1 October 2024 included a substantial \"cost-of-living\" package including many one-off payments, as well as outlining a framework to direc\n" + ] + } + ], + "source": [ + "print(documents[0].text[:200])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f4abb80-9838-4335-b384-304abce0fde9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(documents)" + ] + }, + { + "cell_type": "markdown", + "id": "377d4120-f21a-4003-82cd-c225892b595f", + "metadata": {}, + "source": [ + "### Query engine" + ] + }, + { + "cell_type": "markdown", + "id": "e635e402-8ce8-4ebe-8144-948647ff6d5a", + "metadata": {}, + "source": [ + "Here a very basic query is performed using VectorStoreIndex. 
Please make sure that the OPENAI_API_KEY env variable is set before running the following code. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e870bde0-f956-4d2a-a922-1653f0ab1a06", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import VectorStoreIndex\n", + "\n", + "index = VectorStoreIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6aece266-f08a-42eb-91ba-8d6c2a0237d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Two €125 electricity credits will be provided - one this year and one in 2025.\n" + ] + } + ], + "source": [ + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\n", + " \"What kind of energy credits are provided in the budget?\"\n", + ")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "forked-llama", + "language": "python", + "name": "forked-llama" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0600609ab1e13e0d0b00311d9c6cdbba25a9f094 Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Thu, 3 Oct 2024 09:43:38 +0100 Subject: [PATCH 03/10] add test --- .../tests/test_readers_zyte-serp.py | 0 .../tests/test_readers_zyte_serp.py | 7 +++++++ 2 files changed, 7 insertions(+) delete mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte-serp.py create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte_serp.py diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte-serp.py 
b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte-serp.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte_serp.py b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte_serp.py new file mode 100644 index 0000000000000..015c8de0e5c70 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/test_readers_zyte_serp.py @@ -0,0 +1,7 @@ +from llama_index.core.readers.base import BaseReader +from llama_index.readers.zyte_serp import ZyteSerpReader + + +def test_class(): + names_of_base_classes = [b.__name__ for b in ZyteSerpReader.__mro__] + assert BaseReader.__name__ in names_of_base_classes From b697ae045d62265623baa5cf8639fdcaf791fa7a Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Thu, 3 Oct 2024 10:00:53 +0100 Subject: [PATCH 04/10] Update notebook --- .../examples/data_connectors/ZyteSerpDemo.ipynb | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb b/docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb index d0eb3108e71a0..999362c6539ac 100644 --- a/docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb +++ b/docs/docs/examples/data_connectors/ZyteSerpDemo.ipynb @@ -56,9 +56,8 @@ "outputs": [], "source": [ "# This is needed to run it in juypter notebook\n", - "import nest_asyncio\n", - "\n", - "nest_asyncio.apply()" + "# import nest_asyncio\n", + "# nest_asyncio.apply()" ] }, { @@ -79,6 +78,14 @@ "### Get relevant resources (using ZyteSerp)" ] }, + { + "cell_type": "markdown", + "id": "7889631c-4650-423c-93fa-dd08473c7848", + "metadata": {}, + "source": [ + "Given a topic, we use the search results from google to get the links to the relevant pages. 
" + ] + }, { "cell_type": "code", "execution_count": null, From 40327859218b16f08507cdef937bf8e6e987f3c6 Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Fri, 4 Oct 2024 16:24:16 +0100 Subject: [PATCH 05/10] Update README --- .../llama-index-readers-zyte-serp/README.md | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md b/llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md index 65bb162d364ee..e8e8a64b7ad08 100644 --- a/llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/README.md @@ -8,20 +8,14 @@ ZyteSerp can be used to add organic search results from Google Search. It takes `pip install llama-index-readers-zyte-serp` -1. **Install zyte-api Package**: Ensure the `zyte-api` package is installed to use the ZyteSerpReader. Install it via pip with the following command: - - ```bash - pip install zyte-api - ``` - -2. **API Key**: Secure an API key from [Zyte](https://www.zyte.com/zyte-api/) to access the Zyte services. +Secure an API key from [Zyte](https://www.zyte.com/zyte-api/) to access the Zyte services. ### Using ZyteSerpReader -- **Initialization**: Initialize the ZyteWebReader by providing the API key, the desired mode of operation (`article`, `html-text`, or `html`), and any optional parameters for the Zyte API. +- **Initialization**: Initialize the ZyteSerpReader by providing the API key and the option for extraction ("httpResponseBody" or "browserHtml"). ```python - from llama_index.readers.zyte_serp import ZyteWebReader + from llama_index.readers.zyte_serp import ZyteSerpReader zyte_serp = ZyteSerpReader( api_key="your_api_key_here", @@ -29,7 +23,7 @@ ZyteSerp can be used to add organic search results from Google Search. It takes ) ``` -- **Loading Data**: To load data, use the `load_data` method with the URLs you wish to process. 
+- **Loading Data**: To load search results, use the `load_data` method with the query you wish to search. ```python documents = zyte_serp.load_data(query="llama index docs") @@ -37,17 +31,32 @@ documents = zyte_serp.load_data(query="llama index docs") ### Example Usage -Here is an example demonstrating how to initialize the ZyteWebReader, load document from a URL. +Here is an example demonstrating how to initialize the ZyteSerpReader and get top search URLs. +Further, the content from these URLs can be loaded using ZyteWebReader in "article" mode to obtain just the article content from the webpage. ```python +from llama_index.readers.zyte_serp import ZyteSerpReader +from llama_index.readers.web.zyte_web.base import ZyteWebReader + # Initialize the ZyteSerpReader with your API key zyte_serp = ZyteSerpReader( api_key="your_api_key_here", # Replace with your actual API key ) -# Load documents from Paul G -documents = zyte_serp.load_data(urls="llama index docs") +# Get the search results (URLs from google search results) +search_urls = zyte_serp.load_data(query="llama index docs") + +# Display the results +print(search_urls) + +urls = [result.text for result in search_urls] + +# Initialize the ZyteWebReader to load the content from search results +zyte_web = ZyteWebReader( + api_key="your_api_key_here", # Replace with your actual API key + mode="article", +) -# Display the document +documents = zyte_web.load_data(urls) print(documents) ``` From 63ecfd4e4e437e007edc94d5bca05fa38b6a354a Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Mon, 7 Oct 2024 11:03:49 +0100 Subject: [PATCH 06/10] Minor fixes --- .../llama_index/readers/zyte_serp/base.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py index 2a6fd42d294a8..6c52a6c9f193c 100644 --- 
a/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py @@ -1,4 +1,4 @@ -from typing import Literal, Optional +from typing import Optional from pydantic import Field from llama_index.core.readers.base import BasePydanticReader @@ -10,8 +10,8 @@ class ZyteSerpReader(BasePydanticReader): Args: api_key: Zyte API key. - extract_from: Determines the mode while extracting the . - It can take one of the following values: 'html', 'html-text', 'article' + extract_from: Determines the mode while extracting the search results. + It can take one of the following values: 'httpResponseBody', 'browserHtml' Example: .. code-block:: python @@ -26,33 +26,26 @@ class ZyteSerpReader(BasePydanticReader): ) Zyte-API reference: - https://www.zyte.com/zyte-api/ + https://docs.zyte.com/zyte-api/get-started.html """ client: Optional[object] = Field(None) api_key: str - extract_from: str + extract_from: Optional[str] def __init__( self, api_key: str, - extract_from: Literal["httpResponseBody", "browserHtml"] = "httpResponseBody", + extract_from: Optional[str] = None, ) -> None: """Initialize with file path.""" super().__init__( api_key=api_key, extract_from=extract_from, ) - try: - from zyte_api import ZyteAPI - from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT - - except ImportError: - raise ImportError( - "zyte-api package not found, please install it with " - "`pip install zyte-api`" - ) + from zyte_api import ZyteAPI + from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT user_agent = f"llama-index-zyte-api/{PYTHON_ZYTE_API_USER_AGENT}" self.client = ZyteAPI( @@ -61,16 +54,19 @@ def __init__( ) def _serp_url(self, query: str): + from urllib.parse import quote_plus + base_url = "https://www.google.com/search?q=" - return base_url + query.replace(" ", "+") + return base_url + quote_plus(query) def 
load_data(self, query: str): serp_url = self._serp_url(query) serp_request = { "url": serp_url, "serp": True, - "serpOptions": {"extractFrom": self.extract_from}, } + if self.extract_from: + serp_request.update({"serpOptions": {"extractFrom": self.extract_from}}) results = self.client.get(serp_request) docs = [] for result in results["serp"]["organicResults"]: From ac25c18fe20b7e29fd38fa7f1e71f803fc3730ed Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Tue, 8 Oct 2024 09:57:18 +0100 Subject: [PATCH 07/10] Add build files --- .../llama_index/readers/zyte_serp/BUILD | 1 + .../readers/llama-index-readers-zyte-serp/tests/BUILD | 1 + 2 files changed, 2 insertions(+) create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/BUILD b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD @@ -0,0 +1 @@ +python_sources() From 14770c7e45b982cea3af47228f4af190aeedca02 Mon Sep 17 00:00:00 2001 From: Rakesh mehta Date: Tue, 8 Oct 2024 11:10:16 +0100 Subject: [PATCH 08/10] Update tools.llamahub section --- .../readers/llama-index-readers-zyte-serp/pyproject.toml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml index f630ef09c3d45..508b5709ac210 100644 --- a/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml @@ -5,13 +5,14 @@ requires = ["poetry-core"] [tool.codespell] check-filenames = true check-hidden = true -# [tool.llamahub] -contains_example = false -import_path = "llama_index.readers.zyte_serp" # Feel free to un-skip examples, and experimental, you will just need to # work through many typos (--write-changes and --interactive will help) skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" +[tool.llamahub] +contains_example = false +import_path = "llama_index.readers.zyte_serp" + [tool.llamahub.class_authors] ZyteSerpReader = "rakeshmehta0308" From 3e64de46f32bb8e06e852a2cb253ca55dc40af5e Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 8 Oct 2024 15:43:11 -0600 Subject: [PATCH 09/10] build files --- .../readers/llama-index-readers-zyte-serp/BUILD | 4 ---- .../readers/llama-index-readers-zyte-serp/pyproject.toml | 3 ++- .../readers/llama-index-readers-zyte-serp/requirements.txt | 1 - .../readers/llama-index-readers-zyte-serp/tests/BUILD | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) delete mode 100644 llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD b/llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD index 2d3d88d1eab9c..0896ca890d8bf 100644 --- a/llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/BUILD @@ -1,7 +1,3 @@ poetry_requirements( name="poetry", ) - -python_requirements( - name="reqs", -) diff --git 
a/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml index 508b5709ac210..f37a513668be6 100644 --- a/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/pyproject.toml @@ -34,7 +34,8 @@ version = "0.1.0" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" -llama-index-core = "^0.10.0" +llama-index-core = "^0.11.0" +zyte-api = "*" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt b/llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt deleted file mode 100644 index 0773eeff4d0e5..0000000000000 --- a/llama-index-integrations/readers/llama-index-readers-zyte-serp/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -zyte-api diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD index db46e8d6c978c..dabf212d7e716 100644 --- a/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/tests/BUILD @@ -1 +1 @@ -python_sources() +python_tests() From 9d2c25be41ee994582ee71f369ddaf4423e0c943 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 8 Oct 2024 15:46:57 -0600 Subject: [PATCH 10/10] types --- .../llama_index/readers/zyte_serp/base.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py index 6c52a6c9f193c..e0e26b9dff546 100644 --- 
a/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py +++ b/llama-index-integrations/readers/llama-index-readers-zyte-serp/llama_index/readers/zyte_serp/base.py @@ -1,9 +1,11 @@ from typing import Optional -from pydantic import Field from llama_index.core.readers.base import BasePydanticReader from llama_index.core.schema import Document +from zyte_api import ZyteAPI +from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT + class ZyteSerpReader(BasePydanticReader): """Get google search results URLs for a search query. @@ -30,7 +32,7 @@ class ZyteSerpReader(BasePydanticReader): """ - client: Optional[object] = Field(None) + client: ZyteAPI api_key: str extract_from: Optional[str] @@ -40,17 +42,17 @@ def __init__( extract_from: Optional[str] = None, ) -> None: """Initialize with file path.""" - super().__init__( + user_agent = f"llama-index-zyte-api/{PYTHON_ZYTE_API_USER_AGENT}" + + client = ZyteAPI( api_key=api_key, - extract_from=extract_from, + user_agent=user_agent, ) - from zyte_api import ZyteAPI - from zyte_api.utils import USER_AGENT as PYTHON_ZYTE_API_USER_AGENT - user_agent = f"llama-index-zyte-api/{PYTHON_ZYTE_API_USER_AGENT}" - self.client = ZyteAPI( + super().__init__( api_key=api_key, - user_agent=user_agent, + extract_from=extract_from, + client=client, ) def _serp_url(self, query: str):