-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelp.py
66 lines (41 loc) · 2 KB
/
help.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import SKLearnVectorStore
from langchain import HuggingFaceHub
import os
# SECURITY: an API token is embedded in source here. Anything committed to a
# repository should be considered leaked: revoke this token and supply a fresh
# one through the environment (or a secrets manager) instead.
# setdefault() is used so that a token already exported by the caller or the
# deployment environment is respected rather than silently overwritten; the
# literal below is only a fallback kept for backward compatibility.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "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")
def _extract_answer(raw_answer: str) -> str:
    """Pick the answer line out of the raw text produced by the QA chain."""
    response_lines = raw_answer.split("\n")
    if len(response_lines) > 1:
        # Established behaviour: the second line (index 1) is the answer.
        # NOTE(review): presumably the first line is empty or boilerplate in
        # the model's output -- confirm against real responses.
        return response_lines[1]
    # Single-line response: there is no index 1, so return the only line
    # instead of raising IndexError.
    return response_lines[0].strip()


def qadocument(file_path: str, query: str) -> str:
    """Answer ``query`` using the contents of the PDF at ``file_path``.

    The PDF is loaded and split into chunks, the chunks are embedded into an
    ``SKLearnVectorStore``, and a "stuff" ``RetrievalQA`` chain backed by the
    ``tiiuae/falcon-7b-instruct`` model on the Hugging Face Hub is asked the
    question with the 3 best-matching chunks as context.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.
        query: The question to ask about the document.

    Returns:
        The second line of the chain's ``"result"`` text, or the whole
        (stripped) text when the result consists of a single line.
    """
    # Load the PDF file and split it into pages
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()

    # Define chunk size, overlap and separators.
    # The third separator is a raw string so that ``\.`` is not an invalid
    # escape sequence; its value is identical to the previous literal.
    # NOTE(review): '(?=>\. )' looks like a typo for the lookbehind
    # '(?<=\. )' -- left unchanged here; confirm the intended separator.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=64,
        separators=['\n\n', '\n', r'(?=>\. )', ' ', ''],
    )
    # Split the pages into texts as defined above
    texts = text_splitter.split_documents(pages)

    # Load embeddings from HuggingFace
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )

    # Create the vector store.
    # NOTE(review): a persist path is configured but persist() is never
    # called in this function, so nothing appears to be written there --
    # confirm whether persistence is actually wanted.
    vector_db_path = "./document_vector_db.parquet"
    vector_db = SKLearnVectorStore.from_documents(
        texts,
        embedding=embedding,
        persist_path=vector_db_path,
        serializer="parquet",
    )

    llm = HuggingFaceHub(
        repo_id="tiiuae/falcon-7b-instruct",
        model_kwargs={"temperature": 0.1, "max_length": 512},
    )
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_db.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        verbose=False,
    )

    # Send question as a query to qa chain; only the answer text is used,
    # the returned source documents are discarded.
    result = qa({"query": query})
    return _extract_answer(result["result"])