Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ChatDocs AI: An AI chatbot for PDF interactions #200

Merged
merged 4 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions Generative Models/ChatDocs-AI/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# ChatDocs AI
RAG-based generative AI application enabling interactive communication with PDF documents. Developed using Streamlit and Groq AI Inference technology.

# Supported Models
This application makes use of the following LLMs:
- Chat Models — Groq AI:
- Llama3-8b-8192
- Llama3-70b-8192
- Mixtral-8x7b-32768
- Gemma-7b-it
- Embeddings — OpenAI:
- Text-embedding-ada-002-v2

# System Requirements
- Python 3.9 or later (earlier versions are not compatible).

# Installation
1. Fork the repository and then follow the steps given below!

2. Clone the repository
```bash
git clone https://github.com/<your-username>/ML-Nexus.git
cd "ML-Nexus/Generative Models/ChatDocs-AI"
```
3. Create and activate a virtual environment:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
4. Install required Python packages:
```bash
pip install -r requirements.txt
```

5. Run the application:
```bash
streamlit run main.py
```

# Snapshots
![image](https://github.com/user-attachments/assets/7e518fc4-c70c-44fd-9719-24b78bf0e5c9)
9 changes: 9 additions & 0 deletions Generative Models/ChatDocs-AI/chat_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Get LLM response to user query
def get_llm_response(llm, prompt_template, question):
    """Answer ``question`` from the documents indexed in the session vector store.

    Args:
        llm: Chat model handed to ``create_stuff_documents_chain``.
        prompt_template: Prompt passed to ``create_stuff_documents_chain``
            together with ``llm``.
        question: The user's query; sent to the chain under the ``'input'`` key.

    Returns:
        Whatever ``retrieval_chain.invoke`` returns, unmodified.

    Raises:
        ValueError: If ``st.session_state`` holds no vector store yet.
    """
    # Imported locally: this module has no file-level Streamlit import, so the
    # bare reference to ``st`` used to fail with NameError on the first query.
    import streamlit as st

    vector_store = st.session_state.get("vector_store")
    if vector_store is None:
        raise ValueError("No vector store available. Process at least one PDF before asking a question.")

    doc_chain = create_stuff_documents_chain(llm, prompt_template)
    retrieval_chain = create_retrieval_chain(vector_store.as_retriever(), doc_chain)
    return retrieval_chain.invoke({'input': question})
31 changes: 31 additions & 0 deletions Generative Models/ChatDocs-AI/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import streamlit as st
from dotenv import load_dotenv

load_dotenv()

# Function for API configuration
def sidebar_api_key_configuration():
    """Render the API-key inputs in the sidebar and record whether they are usable.

    Sets ``st.session_state.prompt_activation`` to ``True`` only when both keys
    are present and pass ``valid_keys``; otherwise sets it to ``False`` and
    shows a warning in the sidebar.

    Returns:
        The ``(openai_api_key, groq_api_key)`` pair exactly as entered.
    """
    sidebar = st.sidebar
    sidebar.subheader("API Keys")
    openai_api_key = sidebar.text_input("Enter your OpenAI API Key 🗝️", type="password", help='Get API Key here')
    groq_api_key = sidebar.text_input("Enter your Groq API Key 🗝️", type="password", help='Get API Key here')

    if not (openai_api_key and groq_api_key):
        # At least one field is still empty.
        sidebar.warning('Enter both API Keys')
        activated = False
    elif valid_keys(openai_api_key, groq_api_key):
        sidebar.success('Keys valid. Ready to proceed!')
        activated = True
    else:
        sidebar.warning('Invalid API keys')
        activated = False

    st.session_state.prompt_activation = activated
    return openai_api_key, groq_api_key


def valid_keys(openai_key, groq_key):
    """Return True when both keys carry their expected prefix.

    This is a format check only (``sk-`` for OpenAI, ``gsk_`` for Groq); no
    request is made to either service.
    """
    if not openai_key.startswith('sk-'):
        return False
    return groq_key.startswith('gsk_')

# Model Selection in sidebar
def sidebar_groq_model_selection():
    """Render the model picker in the sidebar and return the chosen Groq model ID.

    The sidebar shows short display labels, but the return value is the full
    Groq model identifier so it can be passed directly as a model name.

    Returns:
        The model identifier string for the selected label.
    """
    # Display label -> Groq model identifier. Previously the label itself was
    # returned, which is not a model name the Groq API recognises.
    groq_models = {
        'Llama3-8b': 'llama3-8b-8192',
        'Llama3-70b': 'llama3-70b-8192',
        'Mixtral-8x7b': 'mixtral-8x7b-32768',
        'Gemma-7b': 'gemma-7b-it',
    }
    st.sidebar.subheader("Model Selection")
    label = st.sidebar.selectbox('Select Model', tuple(groq_models))
    return groq_models[label]
50 changes: 50 additions & 0 deletions Generative Models/ChatDocs-AI/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import streamlit as st
from config import sidebar_api_key_configuration, sidebar_groq_model_selection
from pdf_processing import create_vectorstore
from chat_handler import get_llm_response
from streamlit_option_menu import option_menu
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Page Configuration
st.set_page_config(page_title="ChatDocs AI", page_icon=":robot_face:", layout="centered")

# Session State Variables: create every key up front (as None) so later
# attribute reads on st.session_state never fail on a fresh session.
for key in ["vector_store", "response", "prompt_activation", "conversation", "chat_history", "prompt"]:
    if key not in st.session_state:
        st.session_state[key] = None

openai_api_key, groq_api_key = sidebar_api_key_configuration()
model = sidebar_groq_model_selection()

# Main App Interface
st.title("ChatDocs AI :robot_face:")
st.write("*Interrogate Documents, Ignite Insights*")

selected = option_menu(menu_title=None, options=["ChatDocs AI", "Reference", "About"], icons=["robot", "bi-file-text-fill", "app"], orientation="horizontal")
prompt_template = ChatPromptTemplate.from_template("Answer based on provided context only: {context} Questions: {input}")

# ChatDocs AI Section
# The comparison must use the exact label given to option_menu above; the
# previous value ("PDF ChatDocs AI") never matched, so this section never ran.
if selected == "ChatDocs AI":
    st.subheader("Upload PDF(s)")
    pdf_docs = st.file_uploader("Upload PDFs", type=['pdf'], accept_multiple_files=True, disabled=not st.session_state.prompt_activation)
    process = st.button("Process", type="primary", disabled=not pdf_docs)

    if process:
        st.session_state.vector_store = create_vectorstore(openai_api_key, pdf_docs)
        st.session_state.prompt = True
        st.success('Database is ready')

    if "messages" not in st.session_state:
        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]

    for msg in st.session_state.messages:
        st.chat_message(msg["role"]).write(msg["content"])

    # Chatting needs both a processed document set and currently valid keys.
    chat_ready = bool(st.session_state.prompt and st.session_state.prompt_activation)

    if question := st.chat_input(placeholder="Ask a document-related question", disabled=not chat_ready):
        st.session_state.messages.append({"role": "user", "content": question})
        # Echo the question now; otherwise it only appears on the next rerun.
        st.chat_message("user").write(question)

        # Build the Groq client only when it is needed: constructing it at
        # import time fails while the API-key field is still empty.
        llm = ChatGroq(groq_api_key=groq_api_key, model_name=model)
        with st.spinner('Processing...'):
            st.session_state.response = get_llm_response(llm, prompt_template, question)

        answer = st.session_state.response['answer']
        st.session_state.messages.append({"role": "assistant", "content": answer})
        st.chat_message("assistant").write(answer)
20 changes: 20 additions & 0 deletions Generative Models/ChatDocs-AI/pdf_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Read PDF data
def read_pdf_data(pdf_docs):
    """Extract and concatenate the text of every page in ``pdf_docs``.

    Args:
        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``.

    Returns:
        A single string with one page's text per segment, separated by
        newlines. Pages with no extractable text contribute an empty segment.
    """
    page_texts = (
        # Treat a falsy extraction result as empty so the join cannot fail.
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
    # Join with a newline rather than "" so the last word of one page is not
    # fused with the first word of the next.
    return "\n".join(page_texts)

# Split data into chunks
def split_data(text):
    """Split ``text`` into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=1000`` and ``chunk_overlap=200`` and returns the result of
    its ``split_text`` call.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_text(text)

# Create vector store from PDF data
def create_vectorstore(openai_api_key, pdf_docs):
    """Build a FAISS vector store from the text of the given PDFs.

    Args:
        openai_api_key: Key passed to ``OpenAIEmbeddings``.
        pdf_docs: PDF sources forwarded to ``read_pdf_data``.

    Returns:
        The store produced by ``FAISS.from_texts`` over the text chunks.

    Raises:
        ValueError: If no text chunks could be produced from the PDFs.
    """
    text_chunks = split_data(read_pdf_data(pdf_docs))
    if not text_chunks:
        # Fail early with an actionable message instead of an opaque error
        # from index construction over an empty list.
        raise ValueError("No extractable text was found in the uploaded PDF(s).")

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
73 changes: 73 additions & 0 deletions Generative Models/ChatDocs-AI/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# --- Core Libraries ---
numpy==1.26.4 # Core numerical computations
pandas==2.2.2 # Data manipulation and analysis
python-dateutil==2.9.0.post0 # Date and time handling
pytz==2024.1 # Timezone handling

# --- Web Frameworks & Streamlit ---
streamlit==1.35.0 # Web app framework for interactive data applications
streamlit-camera-input-live==0.2.0 # Camera input for Streamlit apps
streamlit-option-menu==0.3.12 # Sidebar navigation menu
streamlit-extras==0.4.2 # Extra Streamlit widgets
streamlit-card==1.0.2 # Streamlit widget to create card components
streamlit-toggle-switch==1.0.2 # Toggle switch widget
streamlit-faker==0.0.3 # Streamlit widget to generate fake data for testing
st-annotated-text==4.0.1 # Streamlit annotated text widget for UI
streamlit-vertical-slider==2.5.5 # Custom slider for Streamlit
streamlit-image-coordinates==0.1.6 # Capture image coordinates in Streamlit

# --- PDF and Document Processing ---
pypdf==4.2.0 # PDF file reading and manipulation
lxml==5.2.2 # XML and HTML processing
faiss-cpu==1.8.0 # FAISS for similarity search and vector stores

# --- LangChain & Language Models ---
langchain==0.2.1 # LangChain framework for LLMs
langchain-community==0.2.1 # LangChain extensions by the community
langchain-core==0.2.1 # Core LangChain package
langchain-openai==0.1.7 # OpenAI embeddings for LangChain
langchain-groq==0.1.4 # Groq AI integration for LangChain

# --- Large Language Model Embeddings & Similarity ---
openai==1.30.2 # OpenAI's API wrapper
tiktoken==0.7.0 # Tokenization library for LLMs
Faker==25.2.0 # Fake data generation for testing

# --- Asynchronous & HTTP Requests ---
aiohttp==3.9.5 # Asynchronous HTTP client/server framework
httpx==0.27.0 # HTTP client for Python, async-friendly
certifi==2024.2.2 # Certificate handling for secure HTTPS connections

# --- API Utilities & Environment ---
python-dotenv==1.0.1 # Load environment variables from .env files
requests==2.32.2 # Simplified HTTP request library
validators==0.28.2 # Data validation utilities

# --- Visualization & UI ---
altair==5.3.0 # Declarative statistical visualization library
matplotlib==3.9.0 # Plotting library for static, animated, and interactive visualizations
pydeck==0.9.1 # 3D map visualization
pillow==10.3.0 # Image processing capabilities

# --- Schema Validation & Dataclasses ---
dataclasses-json==0.6.6 # Serialization and deserialization for dataclasses
pydantic==2.7.1 # Data validation and settings management using Python type annotations
jsonschema==4.22.0 # JSON schema validation

# --- Utility Libraries ---
tqdm==4.66.4 # Progress bars for loops
tenacity==8.3.0 # Retry library for dealing with unreliable APIs
toolz==0.12.1 # Functional programming utilities
more-itertools==10.2.0 # Additional tools for Python's `itertools`

# --- Others (Logging, Markup, Miscellaneous) ---
blinker==1.8.2 # Event-driven architecture support
Markdown==3.6 # Support for Markdown in Python
rich==13.7.1 # Rich text formatting for the terminal
GitPython==3.1.43 # Git version control via Python
watchdog==4.0.1 # File system monitoring

# --- Serialization & Data Interchange (transitive dependencies, not FAISS-specific) ---
protobuf==4.25.3 # Protocol buffers; required by Streamlit
orjson==3.10.3 # Fast JSON parsing
pyarrow==16.1.0 # Columnar data interchange; required by Streamlit
36 changes: 36 additions & 0 deletions Generative Models/ChatDocs-AI/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Utility function for vector store creation
def create_vectorstore(text_chunks, openai_api_key):
    """
    Embed ``text_chunks`` with OpenAI embeddings and index them in FAISS.

    Returns the store produced by ``FAISS.from_texts``.
    """
    return FAISS.from_texts(
        texts=text_chunks,
        embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
    )

# API key validation utility
def valid_api_keys(openai_key, groq_key):
    """
    Check that both keys start with their expected prefix.

    Only the format is checked (``sk-`` for OpenAI, ``gsk_`` for Groq); the
    keys are not verified against either service.
    """
    return groq_key.startswith('gsk_') if openai_key.startswith('sk-') else False

# Load API keys from environment variables
def load_api_keys():
    """
    Read the OpenAI and Groq API keys from the process environment.

    Returns ``(openai_api_key, groq_api_key)`` taken from ``OPENAI_API_KEY``
    and ``GROQ_API_KEY``. Raises ``ValueError`` when either is unset or empty.
    """
    keys = tuple(os.getenv(name) for name in ('OPENAI_API_KEY', 'GROQ_API_KEY'))

    if not all(keys):
        raise ValueError("API keys are missing! Please provide valid OpenAI and Groq API keys.")

    return keys
Loading