diff --git a/Generative Models/ChatDocs-AI/README.md b/Generative Models/ChatDocs-AI/README.md
new file mode 100644
index 00000000..19b614a8
--- /dev/null
+++ b/Generative Models/ChatDocs-AI/README.md
@@ -0,0 +1,41 @@
+# ChatDocs AI
+RAG-based generative AI application enabling interactive communication with PDF documents. Developed using Streamlit and Groq AI Inference technology.
+
+# Supported Models
+This application makes use of the following LLMs:
+ - Chat Models — Groq AI:
+   - Llama3-8b-8192
+   - Llama3-70b-8192
+   - Mixtral-8x7b-32768
+   - Gemma-7b-it
+ - Embeddings — OpenAI
+   - Text-embedding-ada-002-v2
+
+# System Requirements
+- Python 3.9 or later (earlier versions are not compatible).
+
+# Installation
+1. Fork the repository and then follow the steps given below!
+
+2. Clone the repository
+```bash
+git clone https://github.com/<your-username>/ML-Nexus.git
+cd ML-Nexus/Generative Models/ChatDocs-AI
+```
+3. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+```
+4. Install required Python packages:
+```bash
+pip install -r requirements.txt
+```
+
+5. 
Run the application:
+```bash
+streamlit run main.py
+```
+
+# Snapshots
+![image](https://github.com/user-attachments/assets/7e518fc4-c70c-44fd-9719-24b78bf0e5c9)
diff --git a/Generative Models/ChatDocs-AI/chat_handler.py b/Generative Models/ChatDocs-AI/chat_handler.py
new file mode 100644
index 00000000..7e29fa1b
--- /dev/null
+++ b/Generative Models/ChatDocs-AI/chat_handler.py
@@ -0,0 +1,10 @@
+import streamlit as st  # FIX: st.session_state is read below but streamlit was never imported
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+
+# Get LLM response to user query via a retrieval (RAG) chain over the session vector store
+def get_llm_response(llm, prompt_template, question):
+    doc_chain = create_stuff_documents_chain(llm, prompt_template)
+    retrieval_chain = create_retrieval_chain(st.session_state.vector_store.as_retriever(), doc_chain)
+    response = retrieval_chain.invoke({'input': question})
+    return response
diff --git a/Generative Models/ChatDocs-AI/config.py b/Generative Models/ChatDocs-AI/config.py
new file mode 100644
index 00000000..958bc95e
--- /dev/null
+++ b/Generative Models/ChatDocs-AI/config.py
@@ -0,0 +1,31 @@
+import streamlit as st
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Function for API configuration
+def sidebar_api_key_configuration():
+    st.sidebar.subheader("API Keys")
+    openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key 🗝️", type="password", help='Get API Key here')
+    groq_api_key = st.sidebar.text_input("Enter your Groq API Key 🗝️", type="password", help='Get API Key here')
+
+    if not all([openai_api_key, groq_api_key]):
+        st.sidebar.warning('Enter both API Keys')
+        st.session_state.prompt_activation = False
+    elif valid_keys(openai_api_key, groq_api_key):
+        st.sidebar.success('Keys valid. 
Ready to proceed!')
+        st.session_state.prompt_activation = True
+    else:
+        st.sidebar.warning('Invalid API keys')
+        st.session_state.prompt_activation = False
+
+    return openai_api_key, groq_api_key
+
+
+def valid_keys(openai_key, groq_key):
+    return openai_key.startswith('sk-') and groq_key.startswith('gsk_')
+
+# Model selection in sidebar — FIX: use full Groq model IDs; the value is passed straight to ChatGroq(model_name=...) and short names like 'Llama3-8b' are not valid Groq model IDs
+def sidebar_groq_model_selection():
+    st.sidebar.subheader("Model Selection")
+    return st.sidebar.selectbox('Select Model', ('llama3-8b-8192', 'llama3-70b-8192', 'mixtral-8x7b-32768', 'gemma-7b-it'))
diff --git a/Generative Models/ChatDocs-AI/main.py b/Generative Models/ChatDocs-AI/main.py
new file mode 100644
index 00000000..71303345
--- /dev/null
+++ b/Generative Models/ChatDocs-AI/main.py
@@ -0,0 +1,50 @@
+import streamlit as st
+from config import sidebar_api_key_configuration, sidebar_groq_model_selection
+from pdf_processing import create_vectorstore
+from chat_handler import get_llm_response
+from streamlit_option_menu import option_menu
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_groq import ChatGroq
+
+# Page Configuration
+st.set_page_config(page_title="ChatDocs AI", page_icon=":robot_face:", layout="centered")
+
+# Session State Variables
+for key in ["vector_store", "response", "prompt_activation", "conversation", "chat_history", "prompt"]:
+    if key not in st.session_state:
+        st.session_state[key] = None
+
+openai_api_key, groq_api_key = sidebar_api_key_configuration()
+model = sidebar_groq_model_selection()
+
+# Main App Interface
+st.title("ChatDocs AI :robot_face:")
+st.write("*Interrogate Documents, Ignite Insights*")
+
+selected = option_menu(menu_title=None, options=["ChatDocs AI", "Reference", "About"], icons=["robot", "bi-file-text-fill", "app"], orientation="horizontal")
+llm = ChatGroq(groq_api_key=groq_api_key, model_name=model)
+prompt_template = ChatPromptTemplate.from_template("Answer based on provided context only: {context} Questions: {input}")
+
+# ChatDocs AI Section — FIX: compare against the actual option_menu label "ChatDocs AI"; the original "PDF ChatDocs AI" never matched, so this branch was dead
+if selected == "
ChatDocs AI":
+    st.subheader("Upload PDF(s)")
+    pdf_docs = st.file_uploader("Upload PDFs", type=['pdf'], accept_multiple_files=True, disabled=not st.session_state.prompt_activation)
+    process = st.button("Process", type="primary", disabled=not pdf_docs)
+
+    if process:
+        st.session_state.vector_store = create_vectorstore(openai_api_key, pdf_docs)
+        st.session_state.prompt = True
+        st.success('Database is ready')
+
+    if "messages" not in st.session_state:
+        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]
+
+    for msg in st.session_state.messages:
+        st.chat_message(msg["role"]).write(msg["content"])
+
+    if question := st.chat_input(placeholder="Ask a document-related question", disabled=not st.session_state.prompt):
+        st.session_state.messages.append({"role": "user", "content": question})
+        with st.spinner('Processing...'):
+            st.session_state.response = get_llm_response(llm, prompt_template, question)
+            st.session_state.messages.append({"role": "assistant", "content": st.session_state.response['answer']})
+            st.chat_message("assistant").write(st.session_state.response['answer'])
diff --git a/Generative Models/ChatDocs-AI/pdf_processing.py b/Generative Models/ChatDocs-AI/pdf_processing.py
new file mode 100644
index 00000000..d5f90d83
--- /dev/null
+++ b/Generative Models/ChatDocs-AI/pdf_processing.py
@@ -0,0 +1,20 @@
+from pypdf import PdfReader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import FAISS
+
+# Read PDF data — FIX: extract_text() returns None for image-only pages; coalesce to "" so join() cannot raise TypeError
+def read_pdf_data(pdf_docs):
+    text = "".join([(page.extract_text() or "") for pdf in pdf_docs for page in PdfReader(pdf).pages])
+    return text
+
+# Split data into chunks
+def split_data(text):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    return splitter.split_text(text)
+
+# Create vector store from PDF data
+def create_vectorstore(openai_api_key, pdf_docs):
+    text_chunks = 
split_data(read_pdf_data(pdf_docs)) + embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) + return FAISS.from_texts(texts=text_chunks, embedding=embeddings) diff --git a/Generative Models/ChatDocs-AI/requirements.txt b/Generative Models/ChatDocs-AI/requirements.txt new file mode 100644 index 00000000..91e3d477 --- /dev/null +++ b/Generative Models/ChatDocs-AI/requirements.txt @@ -0,0 +1,73 @@ +# --- Core Libraries --- +numpy==1.26.4 # Core numerical computations +pandas==2.2.2 # Data manipulation and analysis +python-dateutil==2.9.0.post0 # Date and time handling +pytz==2024.1 # Timezone handling + +# --- Web Frameworks & Streamlit --- +streamlit==1.35.0 # Web app framework for interactive data applications +streamlit-camera-input-live==0.2.0 # Camera input for Streamlit apps +streamlit-option-menu==0.3.12 # Sidebar navigation menu +streamlit-extras==0.4.2 # Extra Streamlit widgets +streamlit-card==1.0.2 # Streamlit widget to create card components +streamlit-toggle-switch==1.0.2 # Toggle switch widget +streamlit-faker==0.0.3 # Streamlit widget to generate fake data for testing +st-annotated-text==4.0.1 # Streamlit annotated text widget for UI +streamlit-vertical-slider==2.5.5 # Custom slider for Streamlit +streamlit-image-coordinates==0.1.6 # Capture image coordinates in Streamlit + +# --- PDF and Document Processing --- +pypdf==4.2.0 # PDF file reading and manipulation +lxml==5.2.2 # XML and HTML processing +faiss-cpu==1.8.0 # FAISS for similarity search and vector stores + +# --- LangChain & Language Models --- +langchain==0.2.1 # LangChain framework for LLMs +langchain-community==0.2.1 # LangChain extensions by the community +langchain-core==0.2.1 # Core LangChain package +langchain-openai==0.1.7 # OpenAI embeddings for LangChain +langchain-groq==0.1.4 # Groq AI integration for LangChain + +# --- Large Language Model Embeddings & Similarity --- +openai==1.30.2 # OpenAI's API wrapper +tiktoken==0.7.0 # Tokenization library for LLMs +Faker==25.2.0 # Fake 
data generation for testing + +# --- Asynchronous & HTTP Requests --- +aiohttp==3.9.5 # Asynchronous HTTP client/server framework +httpx==0.27.0 # HTTP client for Python, async-friendly +certifi==2024.2.2 # Certificate handling for secure HTTPS connections + +# --- API Utilities & Environment --- +python-dotenv==1.0.1 # Load environment variables from .env files +requests==2.32.2 # Simplified HTTP request library +validators==0.28.2 # Data validation utilities + +# --- Visualization & UI --- +altair==5.3.0 # Declarative statistical visualization library +matplotlib==3.9.0 # Plotting library for static, animated, and interactive visualizations +pydeck==0.9.1 # 3D map visualization +pillow==10.3.0 # Image processing capabilities + +# --- Schema Validation & Dataclasses --- +dataclasses-json==0.6.6 # Serialization and deserialization for dataclasses +pydantic==2.7.1 # Data validation and settings management using Python type annotations +jsonschema==4.22.0 # JSON schema validation + +# --- Utility Libraries --- +tqdm==4.66.4 # Progress bars for loops +tenacity==8.3.0 # Retry library for dealing with unreliable APIs +toolz==0.12.1 # Functional programming utilities +more-itertools==10.2.0 # Additional tools for Python's `itertools` + +# --- Others (Logging, Markup, Miscellaneous) --- +blinker==1.8.2 # Event-driven architecture support +Markdown==3.6 # Support for Markdown in Python +rich==13.7.1 # Rich text formatting for the terminal +GitPython==3.1.43 # Git version control via Python +watchdog==4.0.1 # File system monitoring + +# --- FAISS Dependencies (for AI similarity search) --- +protobuf==4.25.3 # Protocol buffers for FAISS +orjson==3.10.3 # Fast JSON parsing +pyarrow==16.1.0 # High-performance data processing with FAISS diff --git a/Generative Models/ChatDocs-AI/util.py b/Generative Models/ChatDocs-AI/util.py new file mode 100644 index 00000000..595d5b1b --- /dev/null +++ b/Generative Models/ChatDocs-AI/util.py @@ -0,0 +1,36 @@ +import os +from langchain_openai 
import OpenAIEmbeddings
+from langchain_community.vectorstores import FAISS
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Utility function for vector store creation — NOTE(review): duplicates pdf_processing.create_vectorstore (different signature); consider consolidating on one implementation
+def create_vectorstore(text_chunks, openai_api_key):
+    """
+    Create a FAISS vector store from text chunks using OpenAI embeddings.
+    """
+    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
+    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    return vectorstore
+
+# API key validation utility — NOTE(review): duplicates config.valid_keys; this is only a prefix check ('sk-'/'gsk_'), not a real validity test against the providers
+def valid_api_keys(openai_key, groq_key):
+    """
+    Validate OpenAI and Groq API keys.
+    """
+    return openai_key.startswith('sk-') and groq_key.startswith('gsk_')
+
+# Load API keys from environment variables; raises ValueError when either key is unset
+def load_api_keys():
+    """
+    Load API keys from environment variables or a .env file.
+    """
+    openai_api_key = os.getenv('OPENAI_API_KEY')
+    groq_api_key = os.getenv('GROQ_API_KEY')
+
+    if not openai_api_key or not groq_api_key:
+        raise ValueError("API keys are missing! Please provide valid OpenAI and Groq API keys.")
+
+    return openai_api_key, groq_api_key