# docsGpt.py - Contains the docsGpt functions and classes for document parsing
# Author: Armin Norouzi, Farhad Davaripour
# Contact: https://github.com/Farhad-Davaripour/DocsGPT
# Date created: April 14, 2023
# Last modified: May 3, 2023
# License: MIT License
# Import required modules
import sys
import subprocess
from google.colab import files
import os
import shutil
import time
import tempfile
# Libraries to ensure are installed, mapping import name -> pip package name.
# (The import name often differs from the pip name, e.g. 'docx' vs
# 'python-docx'; 'textwrap' is part of the standard library and never
# needs installing.)
library_names = {
    'langchain': 'langchain',
    'openai': 'openai',
    'PyPDF2': 'PyPDF2',
    'tiktoken': 'tiktoken',
    'faiss': 'faiss-cpu',
    'docx': 'python-docx',
    'pptx': 'python-pptx',
    'langchain_community': 'langchain-community',
}

# Dynamically import libraries, installing any that are missing
for import_name, pip_name in library_names.items():
    try:
        __import__(import_name)
    except ImportError:
        print(f"{import_name} not found. Installing {pip_name}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_name])

# Import required modules
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from getpass import getpass
import textwrap
import docx
import pptx

# Prompt for the OpenAI API key. Keys can be created here:
# https://platform.openai.com/account/billing/overview
token = getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = str(token)

# Set up the OpenAI embeddings client and the question-answering chain
embeddings = OpenAIEmbeddings()
chain = load_qa_chain(OpenAI(), chain_type="stuff")
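# The "stuff" chain type concatenates all retrieved chunks into a single
# prompt. This keeps the call simple, but it can exceed the model's context
# window if the similarity search returns too many (or too large) chunks.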

def extract_texts(root_files):
    """
    Extracts text from the uploaded files and builds a searchable index.
    Supported file types: .pdf, .docx, .pptx
    If multiple files are provided, their contents are concatenated.

    Args:
    - root_files: A list of file paths to be processed.

    Returns:
    - A FAISS index object containing the embeddings of the
      text chunks.
    """
    raw_text = ''
    for root_file in root_files:
        _, ext = os.path.splitext(root_file)
        if ext == '.pdf':
            with open(root_file, 'rb') as f:
                reader = PdfReader(f)
                for page in reader.pages:
                    raw_text += page.extract_text()
        elif ext == '.docx':
            doc = docx.Document(root_file)
            for paragraph in doc.paragraphs:
                raw_text += paragraph.text
        elif ext == '.pptx':
            ppt = pptx.Presentation(root_file)
            for slide in ppt.slides:
                for shape in slide.shapes:
                    if hasattr(shape, 'text'):
                        raw_text += shape.text

    # Split the text into overlapping chunks so that during
    # retrieval we don't hit the token size limits.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)
    docsearch = FAISS.from_texts(texts, embeddings)
    return docsearch
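
# A minimal usage sketch (the file paths below are hypothetical):
#     docsearch = extract_texts(["/content/report.pdf", "/content/slides.pptx"])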

def run_query(query, docsearch):
    """
    Runs a query against the indexed documents using similarity
    search and the QA chain.

    Args:
    - query: A string representing the query to be run.
    - docsearch: A FAISS index object containing the embeddings
      of the document text chunks.

    Returns:
    - A string containing the output of the QA chain run on the
      documents returned by the docsearch similarity search.
    """
    docs = docsearch.similarity_search(query)
    return chain.run(input_documents=docs, question=query)
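
# A minimal usage sketch (the question is hypothetical):
#     answer = run_query("What is the main conclusion of the document?", docsearch)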

def upload_file(folder_path):
    """
    Uploads files from the local file system and saves them to
    a folder path.

    Args:
    - folder_path: A string representing the folder path where
      the files will be saved.

    Returns:
    - A list of strings, each the path of one uploaded file.
    """
    uploaded = files.upload()
    root_file = []
    for filename, data in uploaded.items():
        with open(filename, 'wb') as f:
            f.write(data)
        shutil.copy(filename, folder_path + "/")
        root_file.append(folder_path + "/" + filename)
        os.remove(filename)
    return root_file
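
# Note: files.upload() opens Google Colab's interactive file picker, so
# upload_file only works inside a Colab notebook runtime.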

def run_conversation(folder_path):
    """
    Initiates a conversation with the user by repeatedly asking for
    input queries and running them against the uploaded documents.

    Args:
    - folder_path: A string representing the folder path where the
      uploaded files will be saved.

    Returns:
    - None. Loops until the user types "stop".
    """
    root_files = upload_file(folder_path)
    # Build the searchable index from the uploaded file(s).
    docsearch = extract_texts(root_files)
    count = 0
    while True:
        print("Question ", count + 1)
        query = input(" Ask your question, or type stop if you have no further questions:\n ")
        if query.lower() == "stop":
            print("### Thanks for using the app! ###")
            break
        elif query == "":
            print("### Your input is empty! Try again! ###")
            continue
        else:
            wrapped_text = textwrap.wrap(run_query(query, docsearch), width=100)
            print("Answer:")
            for line in wrapped_text:
                print(line)
            count += 1
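
# A minimal entry-point sketch, assuming a Colab runtime where "/content"
# is the default working directory (the folder path is an assumption, not
# part of the original script):
if __name__ == "__main__":
    run_conversation("/content")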