Skip to content

Commit

Permalink
Merge pull request #200 from LitZeus/add-ai-pdf-chatbot
Browse files Browse the repository at this point in the history
Added ChatDocs AI: An AI chatbot for PDF interactions
  • Loading branch information
UppuluriKalyani authored Oct 10, 2024
2 parents 19d1558 + 0677a92 commit 1598368
Show file tree
Hide file tree
Showing 7 changed files with 260 additions and 0 deletions.
41 changes: 41 additions & 0 deletions Generative Models/ChatDocs-AI/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# ChatDocs AI
RAG-based generative AI application enabling interactive communication with PDF documents. Developed using Streamlit and Groq AI Inference technology.

# Supported Models
This application makes use of the following models:
- Chat Models — Groq AI:
- Llama3-8b-8192
- Llama3-70b-8192
- Mixtral-8x7b-32768
- Gemma-7b-it
- Embeddings — OpenAI:
- Text-embedding-ada-002-v2

# System Requirements
- Python 3.9 or later (earlier versions are not compatible).

# Installation
1. Fork the repository and then follow the steps given below!

2. Clone the repository
```bash
git clone https://github.com/<your-username>/ML-Nexus.git
cd "ML-Nexus/Generative Models/ChatDocs-AI"
```
3. Create and activate a virtual environment:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
4. Install required Python packages:
```bash
pip install -r requirements.txt
```

5. Run the application:
```bash
streamlit run main.py
```

# Snapshots
![image](https://github.com/user-attachments/assets/7e518fc4-c70c-44fd-9719-24b78bf0e5c9)
9 changes: 9 additions & 0 deletions Generative Models/ChatDocs-AI/chat_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Get LLM response to user query
def get_llm_response(llm, prompt_template, question):
    """Answer ``question`` from the documents held in the session's vector store.

    Args:
        llm: Chat model handed to ``create_stuff_documents_chain``.
        prompt_template: Prompt handed to ``create_stuff_documents_chain``.
        question: User query; sent to the retrieval chain under the
            ``'input'`` key.

    Returns:
        Whatever ``retrieval_chain.invoke`` returns for the query.
    """
    # This module's imports do not include Streamlit, so the bare ``st``
    # reference used to raise NameError on the first question. Importing it
    # here keeps the fix self-contained within the function.
    import streamlit as st

    doc_chain = create_stuff_documents_chain(llm, prompt_template)
    # The retriever is built from the vector store kept in Streamlit's
    # session state under ``vector_store``.
    retriever = st.session_state.vector_store.as_retriever()
    retrieval_chain = create_retrieval_chain(retriever, doc_chain)
    return retrieval_chain.invoke({'input': question})
31 changes: 31 additions & 0 deletions Generative Models/ChatDocs-AI/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import streamlit as st
from dotenv import load_dotenv

load_dotenv()

# Function for API configuration
def sidebar_api_key_configuration():
    """Collect the OpenAI and Groq API keys in the sidebar and record readiness.

    ``st.session_state.prompt_activation`` is set to ``True`` only when both
    keys are filled in and accepted by ``valid_keys``; in every other case it
    is set to ``False`` and a sidebar warning is shown.

    Returns:
        The ``(openai_api_key, groq_api_key)`` pair exactly as entered.
    """
    sidebar = st.sidebar
    sidebar.subheader("API Keys")
    openai_api_key = sidebar.text_input("Enter your OpenAI API Key 🗝️", type="password", help='Get API Key here')
    groq_api_key = sidebar.text_input("Enter your Groq API Key 🗝️", type="password", help='Get API Key here')

    # The prefix check only runs once both fields are non-empty.
    both_entered = bool(openai_api_key) and bool(groq_api_key)
    keys_ok = both_entered and valid_keys(openai_api_key, groq_api_key)

    if keys_ok:
        sidebar.success('Keys valid. Ready to proceed!')
    elif both_entered:
        sidebar.warning('Invalid API keys')
    else:
        sidebar.warning('Enter both API Keys')

    st.session_state.prompt_activation = bool(keys_ok)
    return openai_api_key, groq_api_key


def valid_keys(openai_key, groq_key):
    """Return whether both keys carry the expected prefixes.

    The OpenAI key must start with ``'sk-'`` and the Groq key with
    ``'gsk_'``. Only the prefixes are checked; no request is made.
    """
    if not openai_key.startswith('sk-'):
        return False
    return groq_key.startswith('gsk_')

# Model Selection in sidebar
def sidebar_groq_model_selection():
    """Render the model picker in the sidebar and return the chosen model ID.

    The sidebar shows short labels, but the returned string is the full
    model identifier so it can be used directly as a ``model_name``.

    Returns:
        The model identifier mapped from the selected label.
    """
    # Display label -> model identifier. The short labels on their own omit
    # the suffixes that the project README lists for each supported model.
    # NOTE(review): identifiers follow the README's model list in lowercase;
    # confirm they match the names the Groq API currently serves.
    model_ids = {
        'Llama3-8b': 'llama3-8b-8192',
        'Llama3-70b': 'llama3-70b-8192',
        'Mixtral-8x7b': 'mixtral-8x7b-32768',
        'Gemma-7b': 'gemma-7b-it',
    }
    st.sidebar.subheader("Model Selection")
    label = st.sidebar.selectbox('Select Model', tuple(model_ids))
    return model_ids[label]
50 changes: 50 additions & 0 deletions Generative Models/ChatDocs-AI/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import streamlit as st
from config import sidebar_api_key_configuration, sidebar_groq_model_selection
from pdf_processing import create_vectorstore
from chat_handler import get_llm_response
from streamlit_option_menu import option_menu
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Page Configuration
st.set_page_config(page_title="ChatDocs AI", page_icon=":robot_face:", layout="centered")

# Session State Variables: make sure every key read below exists before first use.
for key in ["vector_store", "response", "prompt_activation", "conversation", "chat_history", "prompt"]:
    if key not in st.session_state:
        st.session_state[key] = None

# Sidebar: API keys and model choice.
openai_api_key, groq_api_key = sidebar_api_key_configuration()
model = sidebar_groq_model_selection()

# Main App Interface
st.title("ChatDocs AI :robot_face:")
st.write("*Interrogate Documents, Ignite Insights*")

# The menu labels live in one list so the section check below cannot drift
# away from what the menu actually offers.
menu_options = ["ChatDocs AI", "Reference", "About"]
selected = option_menu(menu_title=None, options=menu_options, icons=["robot", "bi-file-text-fill", "app"], orientation="horizontal")
prompt_template = ChatPromptTemplate.from_template("Answer based on provided context only: {context} Questions: {input}")

# ChatDocs AI Section
# This used to compare against "PDF ChatDocs AI", which is not one of the menu
# options, so the upload and chat UI below never rendered.
# NOTE(review): the chat widgets are kept inside this section so the other
# menu entries show no chat box; confirm that is the intended layout.
if selected == menu_options[0]:
    st.subheader("Upload PDF(s)")
    # Uploading stays disabled until the sidebar reports usable API keys.
    pdf_docs = st.file_uploader("Upload PDFs", type=['pdf'], accept_multiple_files=True, disabled=not st.session_state.prompt_activation)
    process = st.button("Process", type="primary", disabled=not pdf_docs)

    if process:
        st.session_state.vector_store = create_vectorstore(openai_api_key, pdf_docs)
        # Setting this flag enables the chat input below.
        st.session_state.prompt = True
        st.success('Database is ready')

    if "messages" not in st.session_state:
        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]

    # Replay the conversation so far; Streamlit reruns this script on every interaction.
    for msg in st.session_state.messages:
        st.chat_message(msg["role"]).write(msg["content"])

    if question := st.chat_input(placeholder="Ask a document-related question", disabled=not st.session_state.prompt):
        st.session_state.messages.append({"role": "user", "content": question})
        # The replay loop above ran before this message was appended, so show
        # it now rather than only on the next rerun.
        st.chat_message("user").write(question)
        with st.spinner('Processing...'):
            # Build the client only when a question is actually asked, so no
            # client is constructed while the sidebar key is still empty.
            llm = ChatGroq(groq_api_key=groq_api_key, model_name=model)
            st.session_state.response = get_llm_response(llm, prompt_template, question)
        answer = st.session_state.response['answer']
        st.session_state.messages.append({"role": "assistant", "content": answer})
        st.chat_message("assistant").write(answer)
20 changes: 20 additions & 0 deletions Generative Models/ChatDocs-AI/pdf_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Read PDF data
def read_pdf_data(pdf_docs):
    """Return the text of every page of every PDF, concatenated in order.

    Args:
        pdf_docs: Iterable of objects accepted by ``PdfReader``.

    Returns:
        One string containing each page's ``extract_text()`` output, joined
        with no separator.
    """
    page_texts = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            page_texts.append(page.extract_text())
    return "".join(page_texts)

# Split data into chunks
def split_data(text):
    """Split ``text`` into chunks with ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=1000`` and
    ``chunk_overlap=200``.

    Returns:
        The result of the splitter's ``split_text`` call.
    """
    return RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(text)

# Create vector store from PDF data
def create_vectorstore(openai_api_key, pdf_docs):
    """Build a FAISS vector store from the text of the given PDFs.

    Args:
        openai_api_key: Key passed to ``OpenAIEmbeddings``.
        pdf_docs: PDFs to read; passed to ``read_pdf_data``.

    Returns:
        The store returned by ``FAISS.from_texts`` over the text chunks.

    Raises:
        ValueError: If no text chunks could be produced from the PDFs.
    """
    text_chunks = split_data(read_pdf_data(pdf_docs))
    if not text_chunks:
        # With nothing to embed there is no index to build. Fail here with an
        # actionable message instead of an opaque error from deeper in the
        # embedding/indexing stack.
        # NOTE(review): image-only (scanned) PDFs are the presumed cause; confirm.
        raise ValueError("No extractable text was found in the uploaded PDF(s).")
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
73 changes: 73 additions & 0 deletions Generative Models/ChatDocs-AI/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# --- Core Libraries ---
numpy==1.26.4 # Core numerical computations
pandas==2.2.2 # Data manipulation and analysis
python-dateutil==2.9.0.post0 # Date and time handling
pytz==2024.1 # Timezone handling

# --- Web Frameworks & Streamlit ---
streamlit==1.35.0 # Web app framework for interactive data applications
streamlit-camera-input-live==0.2.0 # Camera input for Streamlit apps
streamlit-option-menu==0.3.12 # Navigation menu component (used for the horizontal section menu)
streamlit-extras==0.4.2 # Extra Streamlit widgets
streamlit-card==1.0.2 # Streamlit widget to create card components
streamlit-toggle-switch==1.0.2 # Toggle switch widget
streamlit-faker==0.0.3 # Streamlit widget to generate fake data for testing
st-annotated-text==4.0.1 # Streamlit annotated text widget for UI
streamlit-vertical-slider==2.5.5 # Custom slider for Streamlit
streamlit-image-coordinates==0.1.6 # Capture image coordinates in Streamlit

# --- PDF and Document Processing ---
pypdf==4.2.0 # PDF file reading and manipulation
lxml==5.2.2 # XML and HTML processing
faiss-cpu==1.8.0 # FAISS for similarity search and vector stores

# --- LangChain & Language Models ---
langchain==0.2.1 # LangChain framework for LLMs
langchain-community==0.2.1 # LangChain extensions by the community
langchain-core==0.2.1 # Core LangChain package
langchain-openai==0.1.7 # OpenAI embeddings for LangChain
langchain-groq==0.1.4 # Groq AI integration for LangChain

# --- Large Language Model Embeddings & Similarity ---
openai==1.30.2 # OpenAI's API wrapper
tiktoken==0.7.0 # Tokenization library for LLMs
Faker==25.2.0 # Fake data generation for testing

# --- Asynchronous & HTTP Requests ---
aiohttp==3.9.5 # Asynchronous HTTP client/server framework
httpx==0.27.0 # HTTP client for Python, async-friendly
certifi==2024.2.2 # Certificate handling for secure HTTPS connections

# --- API Utilities & Environment ---
python-dotenv==1.0.1 # Load environment variables from .env files
requests==2.32.2 # Simplified HTTP request library
validators==0.28.2 # Data validation utilities

# --- Visualization & UI ---
altair==5.3.0 # Declarative statistical visualization library
matplotlib==3.9.0 # Plotting library for static, animated, and interactive visualizations
pydeck==0.9.1 # 3D map visualization
pillow==10.3.0 # Image processing capabilities

# --- Schema Validation & Dataclasses ---
dataclasses-json==0.6.6 # Serialization and deserialization for dataclasses
pydantic==2.7.1 # Data validation and settings management using Python type annotations
jsonschema==4.22.0 # JSON schema validation

# --- Utility Libraries ---
tqdm==4.66.4 # Progress bars for loops
tenacity==8.3.0 # Retry library for dealing with unreliable APIs
toolz==0.12.1 # Functional programming utilities
more-itertools==10.2.0 # Additional tools for Python's `itertools`

# --- Others (Logging, Markup, Miscellaneous) ---
blinker==1.8.2 # Event-driven architecture support
Markdown==3.6 # Support for Markdown in Python
rich==13.7.1 # Rich text formatting for the terminal
GitPython==3.1.43 # Git version control via Python
watchdog==4.0.1 # File system monitoring

# --- Transitive Dependencies (pinned for reproducibility) ---
protobuf==4.25.3 # Protocol buffers (required by Streamlit)
orjson==3.10.3 # Fast JSON parsing
pyarrow==16.1.0 # Columnar data processing (required by Streamlit)
36 changes: 36 additions & 0 deletions Generative Models/ChatDocs-AI/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Utility function for vector store creation
def create_vectorstore(text_chunks, openai_api_key):
    """Build and return a FAISS vector store over ``text_chunks``.

    Args:
        text_chunks: Texts to index.
        openai_api_key: Key passed to ``OpenAIEmbeddings``.

    Returns:
        The store produced by ``FAISS.from_texts``.
    """
    return FAISS.from_texts(
        texts=text_chunks,
        embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
    )

# API key validation utility
def valid_api_keys(openai_key, groq_key):
    """Check the key prefixes: ``'sk-'`` for OpenAI and ``'gsk_'`` for Groq.

    Only the prefixes are inspected. The Groq key is looked at only when the
    OpenAI key already has the expected prefix.
    """
    return groq_key.startswith('gsk_') if openai_key.startswith('sk-') else False

# Load API keys from environment variables
def load_api_keys():
    """Read the OpenAI and Groq API keys from the process environment.

    Returns:
        A ``(openai_api_key, groq_api_key)`` tuple taken from the
        ``OPENAI_API_KEY`` and ``GROQ_API_KEY`` environment variables.

    Raises:
        ValueError: If either variable is unset or empty.
    """
    keys = tuple(os.getenv(name) for name in ('OPENAI_API_KEY', 'GROQ_API_KEY'))
    if not all(keys):
        raise ValueError("API keys are missing! Please provide valid OpenAI and Groq API keys.")
    return keys

0 comments on commit 1598368

Please sign in to comment.