diff --git a/Makefile b/Makefile
index 2ec1704..834b5d9 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ lint:
 	ruff check . --fix
 
 format-nb:
-	black --ipynb examples/*.ipynb
+	black --ipynb docs/examples/*.ipynb
 
 format: format-nb
 	black .
diff --git a/docs/examples/deepseek_demo.ipynb b/docs/examples/deepseek_demo.ipynb
new file mode 100644
index 0000000..db09bb5
--- /dev/null
+++ b/docs/examples/deepseek_demo.ipynb
@@ -0,0 +1,68 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using Vision Parse with DeepSeek"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook demonstrates how to use Vision Parse with DeepSeek. DeepSeek is served through an OpenAI-compatible API, so the openai extra is installed below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install 'vision-parse[openai]' -Uqq  # install vision-parse with the OpenAI extra (used for DeepSeek)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from vision_parse import VisionParser\n",
+    "import os\n",
+    "\n",
+    "# Initialize parser\n",
+    "parser = VisionParser(\n",
+    "    model_name=\"deepseek-chat\",\n",
+    "    api_key=os.getenv(\"DEEPSEEK_API_KEY\"),\n",
+    "    temperature=0.9,\n",
+    "    top_p=0.4,\n",
+    "    image_mode=None,\n",
+    "    detailed_extraction=True,\n",
+    "    enable_concurrency=True,\n",
+    ")\n",
+    "\n",
+    "pdf_path = \"../../tests/Texas-Holdem-Rules.pdf\"\n",
+    "markdown_pages = parser.convert_pdf(pdf_path)\n",
+    "\n",
+    "# Print the markdown pages\n",
+    "for i, page_content in enumerate(markdown_pages):\n",
+    "    print(f\"\\n--- Page {i+1} ---\\n{page_content}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "personal",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pyproject.toml b/pyproject.toml
index e4e0a7a..d8f4fe0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,11 +61,11 @@ gemini = [
     "google-generativeai==0.8.3",
 ]
 openai = [
-    "openai>=1.59.8",
+    "openai==1.59.8",
 ]
 all = [
     "google-generativeai==0.8.3",
-    "openai>=1.59.8",
+    "openai==1.59.8",
 ]
 
 [tool.hatch.build.targets.wheel]
diff --git a/src/vision_parse/llm.py b/src/vision_parse/llm.py
index 3a1dd6b..7690f49 100644
--- a/src/vision_parse/llm.py
+++ b/src/vision_parse/llm.py
@@ -306,7 +306,7 @@ async def _get_response(
     ):
         if self.provider == "ollama":
             return await self._ollama(base64_encoded, prompt, structured)
-        elif self.provider == "openai":
+        elif self.provider == "openai" or self.provider == "deepseek":
            return await self._openai(base64_encoded, prompt, structured)
         elif self.provider == "gemini":
            return await self._gemini(base64_encoded, prompt, structured)
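The llm.py change above routes the new "deepseek" provider through the existing `_openai` path. This works because DeepSeek serves an OpenAI-compatible chat-completions API, so the stock openai client can talk to it when pointed at DeepSeek's endpoint. A minimal standalone sketch (not part of this diff; the base_url is DeepSeek's documented endpoint and is an assumption here, as is the prompt text):

    import asyncio
    import os

    from openai import AsyncOpenAI

    async def main() -> None:
        # Reuse the stock OpenAI client against DeepSeek's OpenAI-compatible endpoint.
        client = AsyncOpenAI(
            api_key=os.getenv("DEEPSEEK_API_KEY"),
            base_url="https://api.deepseek.com",  # assumed endpoint, not taken from this diff
        )
        response = await client.chat.completions.create(
            model="deepseek-chat",
            messages=[{"role": "user", "content": "Say hello."}],
        )
        print(response.choices[0].message.content)

    asyncio.run(main())

Reusing the OpenAI client keeps DeepSeek support to a one-line provider check instead of a new SDK dependency, which is also why the tests below can mock `openai.AsyncOpenAI` for the DeepSeek case.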
"latex_equations_detected": "No", + "extracted_text": "Test content", + "confidence_score_text": 0.9, + } + ) + ) + ) + ] + + # Mock markdown conversion response + mock_create = AsyncMock() + mock_create.choices = [ + AsyncMock(message=AsyncMock(content="# Test Header\n\nTest content")) + ] + # Set up side effects to return mock_parse first, then mock_create + mock_client.chat.completions.create = AsyncMock( + side_effect=[mock_parse, mock_create] + ) + + llm = LLM( + model_name="deepseek-chat", + api_key="test-key", + temperature=0.7, + top_p=0.7, + ollama_config=None, + openai_config=None, + gemini_config=None, + image_mode=None, + custom_prompt=None, + detailed_extraction=True, + enable_concurrency=True, + device=None, + num_workers=1, + ) + result = await llm.generate_markdown(sample_base64_image, mock_pixmap, 0) + + assert isinstance(result, str) + assert "Test content" in result + assert mock_client.chat.completions.create.call_count == 2 + + @pytest.mark.asyncio @patch("ollama.AsyncClient") async def test_ollama_base64_image_mode( @@ -389,65 +449,3 @@ async def test_ollama_llm_error(mock_async_client, sample_base64_image, mock_pix await llm.generate_markdown(sample_base64_image, mock_pixmap, 0) assert "Ollama Model processing failed" in str(exc_info.value) assert mock_client.chat.call_count == 1 - - -@pytest.mark.asyncio -@patch("openai.AsyncOpenAI") -async def test_openai_llm_error(MockAsyncOpenAI, sample_base64_image, mock_pixmap): - """Test LLMError handling for OpenAI.""" - mock_client = AsyncMock() - MockAsyncOpenAI.return_value = mock_client - - # Mock API error for markdown generation - mock_client.chat.completions.create.side_effect = Exception("OpenAI API error") - - llm = LLM( - model_name="gpt-4o", - api_key="test-key", - temperature=0.7, - top_p=0.7, - ollama_config=None, - openai_config=None, - gemini_config=None, - image_mode=None, - custom_prompt=None, - detailed_extraction=True, - enable_concurrency=True, - device=None, - num_workers=1, - ) - - with pytest.raises(LLMError) as exc_info: - await llm.generate_markdown(sample_base64_image, mock_pixmap, 0) - assert "OpenAI Model processing failed" in str(exc_info.value) - - -@pytest.mark.asyncio -@patch("google.generativeai.GenerativeModel") -async def test_gemini_llm_error(MockGenerativeModel, sample_base64_image, mock_pixmap): - """Test LLMError handling for Gemini.""" - mock_client = AsyncMock() - MockGenerativeModel.return_value = mock_client - - # Mock API error - mock_client.generate_content.side_effect = Exception("Gemini API error") - - llm = LLM( - model_name="gemini-1.5-pro", - api_key="test-key", - temperature=0.7, - top_p=0.7, - ollama_config=None, - openai_config=None, - gemini_config=None, - image_mode=None, - custom_prompt=None, - detailed_extraction=True, - enable_concurrency=True, - device=None, - num_workers=1, - ) - - with pytest.raises(LLMError) as exc_info: - await llm.generate_markdown(sample_base64_image, mock_pixmap, 0) - assert "Gemini Model processing failed" in str(exc_info.value) diff --git a/uv.lock b/uv.lock index c26234a..e441ab0 100644 --- a/uv.lock +++ b/uv.lock @@ -1162,7 +1162,7 @@ wheels = [ [[package]] name = "openai" -version = "1.59.9" +version = "1.59.8" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio", version = "4.5.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -1175,9 +1175,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/ec/2d/04faa92bac0341649223398503db4415d2f658a757d9d32bb68f3378ddd0/openai-1.59.9.tar.gz", hash = "sha256:ec1a20b0351b4c3e65c6292db71d8233515437c6065efd4fd50edeb55df5f5d2", size = 347134 } +sdist = { url = "https://files.pythonhosted.org/packages/e0/c4/b4482784de63c7158f6c0afcb07fd66450ea6c912d6bddf9d7599f2eda25/openai-1.59.8.tar.gz", hash = "sha256:ac4bda5fa9819fdc6127e8ea8a63501f425c587244bc653c7c11a8ad84f953e1", size = 346775 } wheels = [ - { url = "https://files.pythonhosted.org/packages/07/b4/57f1954a4560092ad8c45f07ad183eab9c8e093e0a1db829f9b506b2d5d1/openai-1.59.9-py3-none-any.whl", hash = "sha256:61a0608a1313c08ddf92fe793b6dbd1630675a1fe3866b2f96447ce30050c448", size = 455527 }, + { url = "https://files.pythonhosted.org/packages/8c/cf/5b235e12ead3cd2098f9792776c966994c1bc558cba5799e12f3045227df/openai-1.59.8-py3-none-any.whl", hash = "sha256:a8b8ee35c4083b88e6da45406d883cf6bd91a98ab7dd79178b8bc24c8bfb09d9", size = 455567 }, ] [[package]] @@ -1808,7 +1808,7 @@ wheels = [ [[package]] name = "vision-parse" -version = "0.1.12.dev2+g6d52db1.d20250122" +version = "0.1.12.dev15+g595bb42" source = { editable = "." } dependencies = [ { name = "jinja2" }, @@ -1854,8 +1854,8 @@ requires-dist = [ { name = "nest-asyncio", specifier = ">=1.6.0" }, { name = "numpy", specifier = ">=2.0.0" }, { name = "ollama", specifier = ">=0.4.4" }, - { name = "openai", marker = "extra == 'all'", specifier = ">=1.59.8" }, - { name = "openai", marker = "extra == 'openai'", specifier = ">=1.59.8" }, + { name = "openai", marker = "extra == 'all'", specifier = "==1.59.8" }, + { name = "openai", marker = "extra == 'openai'", specifier = "==1.59.8" }, { name = "opencv-python", specifier = ">=4.10.0.84" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pymupdf", specifier = ">=1.22.0" },