-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRAG_1.py
108 lines (86 loc) · 4.31 KB
/
RAG_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import requests
import deeplake
import streamlit as st
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
# Streamlit app: retrieval-augmented question answering over a Deep Lake
# dataset, using OpenAI for both the query embedding and the final answer.

# Plain-text document that is downloaded and chunked on each run.
SOURCE_TEXT_URL = "https://sherlock-holm.es/stories/plain-text/stud.txt"
LOCAL_TEXT_PATH = 'downloaded_example.txt'
# Upper bound (seconds) on the download so a stalled server cannot hang the app.
DOWNLOAD_TIMEOUT_SECONDS = 30

# Placeholders: replace with a real Activeloop organization and dataset name.
ACTIVELOOP_ORG_ID = "YOUR_ORGANIZATION_NAME"
ACTIVELOOP_DATASET_NAME = "YOUR_DATASET_NAME"

# Prompt sent to the LLM. {chunks_formatted} receives the retrieved context and
# {query} the user's question. Kept at column 0 so the text that reaches the
# model does not depend on how the surrounding code is indented.
PROMPT_TEMPLATE = """
You are a helpful chatbot assisting readers with their questions based on the provided article.
Use only the information below to answer the question. Do not invent or hallucinate any information.
Context:
{chunks_formatted}
Question: {query}
Answer:
"""


def _download_text(url: str, destination: str) -> bool:
    """Download *url* and save its body to *destination* as UTF-8.

    Outcomes are reported in the Streamlit UI. Returns True when the file was
    written, False when the request failed or returned a non-200 status.
    """
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        # Connection errors and timeouts raise instead of returning a status.
        st.error(f"Failed to download the file: {e}")
        return False
    if response.status_code != 200:
        st.error(f"Failed to download the file. Status code: {response.status_code}")
        return False
    with open(destination, 'w', encoding='utf-8') as f:
        f.write(response.text)
    st.success(f"File downloaded and saved to {destination}")
    return True


def _retrieve_context(db, embedded_query):
    """Return the stored text most similar to *embedded_query*.

    Compares the query vector against every row of the dataset's "embedding"
    tensor with cosine similarity and reads the "text" tensor at the best index.
    On any failure an error is shown in the UI and None is returned.
    """
    try:
        similarities = cosine_similarity([embedded_query], db["embedding"].numpy())
        best_index = int(similarities.argmax())
        text = db["text"][best_index].numpy()
        if isinstance(text, np.ndarray):
            # Unwrap a single-element array into a plain Python value.
            text = text.item()
        # A non-string result is treated as a collection of text pieces.
        return text if isinstance(text, str) else "\n\n".join(text)
    except Exception as e:
        # UI boundary: dataset/tensor errors are surfaced to the user rather
        # than crashing the app with a traceback.
        st.error(f"Error retrieving the most relevant result: {str(e)}")
        return None


def _answer_query(query: str, embeddings, db, openai_api_key: str) -> None:
    """Embed *query*, retrieve the best-matching context and display the answer."""
    try:
        embedded_query = embeddings.embed_query(query)
    except Exception as e:
        st.error(f"Error embedding the query: {str(e)}")
        return

    chunks_formatted = _retrieve_context(db, embedded_query)
    if chunks_formatted is None:
        # Retrieval failed and was already reported; do not spend an LLM call
        # on a prompt with no context.
        return

    prompt = PromptTemplate(
        input_variables=["chunks_formatted", "query"],
        template=PROMPT_TEMPLATE,
    )
    prompt_formatted = prompt.format(chunks_formatted=chunks_formatted, query=query)

    llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0, openai_api_key=openai_api_key)
    try:
        # invoke() replaces the deprecated llm(prompt) call style.
        answer = llm.invoke(prompt_formatted)
        st.write("Answer:", answer)
    except Exception as e:
        st.error(f"Error generating the answer: {str(e)}")


def main() -> None:
    """Render the app: collect credentials, prepare data, and answer a query."""
    st.title("RAG-based Question Answering with Deep Lake & OpenAI")

    # Step 1: Ask for API keys from the user
    openai_api_key = st.text_input("Enter your OpenAI API Key", type="password")
    deeplake_token = st.text_input("Enter your Deep Lake Token", type="password")
    if not (openai_api_key and deeplake_token):
        st.warning("Please enter both OpenAI API Key and Deep Lake Token.")
        return

    os.environ["OPENAI_API_KEY"] = openai_api_key

    # Step 2: Download and save the text file
    if not _download_text(SOURCE_TEXT_URL, LOCAL_TEXT_PATH):
        return

    # Step 3: Load the text file; read with the same encoding it was written in.
    documents = TextLoader(LOCAL_TEXT_PATH, encoding="utf-8").load()

    # Step 4: Split the document into manageable chunks.
    # NOTE: the chunks are only counted here; retrieval below relies on the
    # embeddings already stored in the Deep Lake dataset.
    text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    st.write(f"Document split into {len(docs)} chunks.")

    # Step 5: Initialize the OpenAI embeddings
    embeddings = OpenAIEmbeddings(api_key=openai_api_key, model="text-embedding-ada-002")

    # Step 6: Load Deep Lake dataset
    dataset_path = f"hub://{ACTIVELOOP_ORG_ID}/{ACTIVELOOP_DATASET_NAME}"
    try:
        db = deeplake.load(dataset_path, token=deeplake_token)
    except Exception as e:
        # An invalid token or a missing dataset (e.g. the placeholders above
        # left unchanged) lands here.
        st.error(f"Failed to connect to Deep Lake dataset: {str(e)}")
        return
    st.success("Connected to Deep Lake dataset.")

    # Step 7: Query input from the user
    query = st.text_input("Enter your query:")
    if query:
        _answer_query(query, embeddings, db, openai_api_key)


# Streamlit executes the script top to bottom on every rerun, so the app is
# started unconditionally, as the original flat script did.
main()