Merge pull request #200 from LitZeus/add-ai-pdf-chatbot

Added ChatDocs AI: An AI chatbot for PDF interactions
UppuluriKalyani · Oct 10, 2024 · 1598368 · 1598368
2 parents 19d1558 + 0677a92
commit 1598368
Show file tree

Hide file tree

Showing 7 changed files with 260 additions and 0 deletions.
diff --git a/Generative Models/ChatDocs-AI/README.md b/Generative Models/ChatDocs-AI/README.md
@@ -0,0 +1,41 @@
+# ChatDocs AI
+RAG-based generative AI application enabling interactive communication with PDF documents. Developed using Streamlit and Groq AI Inference technology.
+
+# Supported Models
+This application uses the following models:
+  - Chat Models — Groq AI:
+      - Llama3-8b-8192
+      - Llama3-70b-8192
+      - Mixtral-8x7b-32768
+      - Gemma-7b-it
+  - Embeddings — OpenAI:
+      - Text-embedding-ada-002-v2
+
+# System Requirements
+- Python 3.9 or later (earlier versions are not compatible).
+
+# Installation
+1. Fork the repository and then follow the steps given below!
+
+2. Clone the repository
+```bash
+git clone https://github.com/<your-username>/ML-Nexus.git
+cd "ML-Nexus/Generative Models/ChatDocs-AI"
+```
+3. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+4. Install required Python packages:
+```bash
+pip install -r requirements.txt
+```
+
+5. Run the application:
+```bash
+streamlit run main.py
+```
+
+# Snapshots
+![image](https://github.com/user-attachments/assets/7e518fc4-c70c-44fd-9719-24b78bf0e5c9)
diff --git a/Generative Models/ChatDocs-AI/chat_handler.py b/Generative Models/ChatDocs-AI/chat_handler.py
@@ -0,0 +1,9 @@
+import streamlit as st
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+
+# Get LLM response to user query
+def get_llm_response(llm, prompt_template, question):
+    doc_chain = create_stuff_documents_chain(llm, prompt_template)
+    retrieval_chain = create_retrieval_chain(st.session_state.vector_store.as_retriever(), doc_chain)
+    response = retrieval_chain.invoke({'input': question})
+    return response
diff --git a/Generative Models/ChatDocs-AI/config.py b/Generative Models/ChatDocs-AI/config.py
@@ -0,0 +1,31 @@
+import streamlit as st
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Function for API configuration
+def sidebar_api_key_configuration():
+    st.sidebar.subheader("API Keys")
+    openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key 🗝️", type="password", help='Get API Key here')
+    groq_api_key = st.sidebar.text_input("Enter your Groq API Key 🗝️", type="password", help='Get API Key here')
+
+    if not all([openai_api_key, groq_api_key]):
+        st.sidebar.warning('Enter both API Keys')
+        st.session_state.prompt_activation = False
+    elif valid_keys(openai_api_key, groq_api_key):
+        st.sidebar.success('Keys valid. Ready to proceed!')
+        st.session_state.prompt_activation = True
+    else:
+        st.sidebar.warning('Invalid API keys')
+        st.session_state.prompt_activation = False
+
+    return openai_api_key, groq_api_key
+
+
+def valid_keys(openai_key, groq_key):
+    return openai_key.startswith('sk-') and groq_key.startswith('gsk_')
+
+# Model Selection in sidebar
+def sidebar_groq_model_selection():
+    st.sidebar.subheader("Model Selection")
+    return st.sidebar.selectbox('Select Model', ('Llama3-8b', 'Llama3-70b', 'Mixtral-8x7b', 'Gemma-7b'))
diff --git a/Generative Models/ChatDocs-AI/main.py b/Generative Models/ChatDocs-AI/main.py
@@ -0,0 +1,50 @@
+import streamlit as st
+from config import sidebar_api_key_configuration, sidebar_groq_model_selection
+from pdf_processing import create_vectorstore
+from chat_handler import get_llm_response
+from streamlit_option_menu import option_menu
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_groq import ChatGroq
+
+# Page Configuration
+st.set_page_config(page_title="ChatDocs AI", page_icon=":robot_face:", layout="centered")
+
+# Session State Variables
+for key in ["vector_store", "response", "prompt_activation", "conversation", "chat_history", "prompt"]:
+    if key not in st.session_state:
+        st.session_state[key] = None
+
+openai_api_key, groq_api_key = sidebar_api_key_configuration()
+model = sidebar_groq_model_selection()
+
+# Main App Interface
+st.title("ChatDocs AI :robot_face:")
+st.write("*Interrogate Documents, Ignite Insights*")
+
+selected = option_menu(menu_title=None, options=["ChatDocs AI", "Reference", "About"], icons=["robot", "bi-file-text-fill", "app"], orientation="horizontal")
+llm = ChatGroq(groq_api_key=groq_api_key, model_name=model)
+prompt_template = ChatPromptTemplate.from_template("Answer based on provided context only: {context} Questions: {input}")
+
+# ChatDocs AI Section
+if selected == "PDF ChatDocs AI":
+    st.subheader("Upload PDF(s)")
+    pdf_docs = st.file_uploader("Upload PDFs", type=['pdf'], accept_multiple_files=True, disabled=not st.session_state.prompt_activation)
+    process = st.button("Process", type="primary", disabled=not pdf_docs)
+
+    if process:
+        st.session_state.vector_store = create_vectorstore(openai_api_key, pdf_docs)
+        st.session_state.prompt = True
+        st.success('Database is ready')
+
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]
+
+    for msg in st.session_state.messages:
+        st.chat_message(msg["role"]).write(msg["content"])
+
+    if question := st.chat_input(placeholder="Ask a document-related question", disabled=not st.session_state.prompt):
+        st.session_state.messages.append({"role": "user", "content": question})
+        with st.spinner('Processing...'):
+            st.session_state.response = get_llm_response(llm, prompt_template, question)
+            st.session_state.messages.append({"role": "assistant", "content": st.session_state.response['answer']})
+            st.chat_message("assistant").write(st.session_state.response['answer'])
diff --git a/Generative Models/ChatDocs-AI/pdf_processing.py b/Generative Models/ChatDocs-AI/pdf_processing.py
@@ -0,0 +1,20 @@
+from pypdf import PdfReader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import FAISS
+
+# Read PDF data
+def read_pdf_data(pdf_docs):
+    text = "".join([page.extract_text() for pdf in pdf_docs for page in PdfReader(pdf).pages])
+    return text
+
+# Split data into chunks
+def split_data(text):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    return splitter.split_text(text)
+
+# Create vector store from PDF data
+def create_vectorstore(openai_api_key, pdf_docs):
+    text_chunks = split_data(read_pdf_data(pdf_docs))
+    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
diff --git a/Generative Models/ChatDocs-AI/requirements.txt b/Generative Models/ChatDocs-AI/requirements.txt
@@ -0,0 +1,73 @@
+# --- Core Libraries ---
+numpy==1.26.4              # Core numerical computations
+pandas==2.2.2              # Data manipulation and analysis
+python-dateutil==2.9.0.post0  # Date and time handling
+pytz==2024.1               # Timezone handling
+
+# --- Web Frameworks & Streamlit ---
+streamlit==1.35.0          # Web app framework for interactive data applications
+streamlit-camera-input-live==0.2.0  # Camera input for Streamlit apps
+streamlit-option-menu==0.3.12  # Option menu component (used for the app's navigation bar)
+streamlit-extras==0.4.2     # Extra Streamlit widgets
+streamlit-card==1.0.2       # Streamlit widget to create card components
+streamlit-toggle-switch==1.0.2  # Toggle switch widget
+streamlit-faker==0.0.3      # Streamlit widget to generate fake data for testing
+st-annotated-text==4.0.1    # Streamlit annotated text widget for UI
+streamlit-vertical-slider==2.5.5  # Custom slider for Streamlit
+streamlit-image-coordinates==0.1.6  # Capture image coordinates in Streamlit
+
+# --- PDF and Document Processing ---
+pypdf==4.2.0               # PDF file reading and manipulation
+lxml==5.2.2                # XML and HTML processing
+faiss-cpu==1.8.0           # FAISS for similarity search and vector stores
+
+# --- LangChain & Language Models ---
+langchain==0.2.1           # LangChain framework for LLMs
+langchain-community==0.2.1 # LangChain extensions by the community
+langchain-core==0.2.1      # Core LangChain package
+langchain-openai==0.1.7    # OpenAI embeddings for LangChain
+langchain-groq==0.1.4      # Groq AI integration for LangChain
+
+# --- Large Language Model Embeddings & Similarity ---
+openai==1.30.2             # OpenAI's API wrapper
+tiktoken==0.7.0            # Tokenization library for LLMs
+Faker==25.2.0              # Fake data generation for testing
+
+# --- Asynchronous & HTTP Requests ---
+aiohttp==3.9.5             # Asynchronous HTTP client/server framework
+httpx==0.27.0              # HTTP client for Python, async-friendly
+certifi==2024.2.2          # Certificate handling for secure HTTPS connections
+
+# --- API Utilities & Environment ---
+python-dotenv==1.0.1       # Load environment variables from .env files
+requests==2.32.2           # Simplified HTTP request library
+validators==0.28.2         # Data validation utilities
+
+# --- Visualization & UI ---
+altair==5.3.0              # Declarative statistical visualization library
+matplotlib==3.9.0          # Plotting library for static, animated, and interactive visualizations
+pydeck==0.9.1              # 3D map visualization
+pillow==10.3.0             # Image processing capabilities
+
+# --- Schema Validation & Dataclasses ---
+dataclasses-json==0.6.6    # Serialization and deserialization for dataclasses
+pydantic==2.7.1            # Data validation and settings management using Python type annotations
+jsonschema==4.22.0         # JSON schema validation
+
+# --- Utility Libraries ---
+tqdm==4.66.4               # Progress bars for loops
+tenacity==8.3.0            # Retry library for dealing with unreliable APIs
+toolz==0.12.1              # Functional programming utilities
+more-itertools==10.2.0     # Additional tools for Python's `itertools`
+
+# --- Others (Logging, Markup, Miscellaneous) ---
+blinker==1.8.2             # Event-driven architecture support
+Markdown==3.6              # Support for Markdown in Python
+rich==13.7.1               # Rich text formatting for the terminal
+GitPython==3.1.43          # Git version control via Python
+watchdog==4.0.1            # File system monitoring
+
+# --- Serialization & Data Interchange ---
+protobuf==4.25.3           # Protocol buffers serialization
+orjson==3.10.3             # Fast JSON parsing
+pyarrow==16.1.0            # High-performance columnar data processing
diff --git a/Generative Models/ChatDocs-AI/util.py b/Generative Models/ChatDocs-AI/util.py
@@ -0,0 +1,36 @@
+import os
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import FAISS
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Utility function for vector store creation
+def create_vectorstore(text_chunks, openai_api_key):
+    """
+    Create a FAISS vector store from text chunks using OpenAI embeddings.
+    """
+    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    return vectorstore
+
+# API key validation utility
+def valid_api_keys(openai_key, groq_key):
+    """
+    Validate OpenAI and Groq API keys.
+    """
+    return openai_key.startswith('sk-') and groq_key.startswith('gsk_')
+
+# Load API keys from environment variables
+def load_api_keys():
+    """
+    Load API keys from environment variables or a .env file.
+    """
+    openai_api_key = os.getenv('OPENAI_API_KEY')
+    groq_api_key = os.getenv('GROQ_API_KEY')
+
+    if not openai_api_key or not groq_api_key:
+        raise ValueError("API keys are missing! Please provide valid OpenAI and Groq API keys.")
+
+    return openai_api_key, groq_api_key