# index_content.py
import os
import pandas as pd
import csv
import html2text
import sys
import requests
import uuid
from atlassian import Confluence
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextLineHorizontal, LTTextBoxHorizontal, LTChar
from io import StringIO
from pprint import pprint
from bs4 import BeautifulSoup
import argparse
from transformers import GPT2TokenizerFast
from typing import Tuple
from nltk.tokenize import sent_tokenize
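# Reopen stdout as line-buffered UTF-8 so printed page titles containing non-ASCII characters are written cleanly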
sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
# Create an ArgumentParser object
parser = argparse.ArgumentParser()
# Add an argument with a flag and a name
parser.add_argument("--spaces", nargs="*", default=["STRM"], help="Specify the Confluence Space you want to index")
parser.add_argument("--zendesk", nargs="*", default=["learningpool"], help="Specify the Zendesk domains you want to index")
parser.add_argument("--max_pages", default=1000, help="The maximum amount of Space pages to index")
parser.add_argument("--out", default="./output/default/contents.csv", help="Specify the filename to save the content")
parser.add_argument("--min_tokens", default=20, help="Remove content with less than this number of tokens")
parser.add_argument("--input", default=False, help="Folder to ingest CSVs from. Rows should be in the format 'heading,answers,answers,...'")
parser.add_argument("--use_dirs", action=argparse.BooleanOptionalAction, help="Use the folder structure (./product/area.csv)")
parser.add_argument("--pdf_content_fontsize", default=12, help="Content greater than this fontsize will be considered as a header")
args = parser.parse_args()
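# Example invocation (the paths, space keys and domain names here are illustrative):
#   python index_content.py --spaces STRM --zendesk learningpool --input ./data --use_dirs --out ./output/default/contents.csv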
max_pages = int(args.max_pages)
pdf_content_fontsize = int(args.pdf_content_fontsize)
min_tokens = int(args.min_tokens)
# Connect to Confluence
confluence = Confluence(url='https://learninglocker.atlassian.net', username=os.environ.get('CONFLUENCE_USERNAME'), password=os.environ.get('CONFLUENCE_API_KEY'))
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
def count_tokens(text: str) -> int:
"""count the number of tokens in a string"""
return len(tokenizer.encode(text))
def reduce_long(
long_text: str, long_text_tokens: bool = False, max_len: int = 590
) -> str:
"""
Reduce a long text to a maximum of `max_len` tokens by potentially cutting at a sentence end
"""
if not long_text_tokens:
long_text_tokens = count_tokens(long_text)
if long_text_tokens > max_len:
sentences = sent_tokenize(long_text.replace("\n", " "))
ntokens = 0
for i, sentence in enumerate(sentences):
ntokens += 1 + count_tokens(sentence)
if ntokens > max_len:
return ". ".join(sentences[:i][:-1]) + "."
return long_text
def extract_html_content(
title_prefix: str,
page_title: str,
html: str,
url: str
):
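    """
    Split a page's HTML into one row per heading: returns parallel lists of
    row uuids, "title - heading - content" strings, and source urls.
    """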
nuuids, ncontents, nurls = [], [], []
soup = BeautifulSoup(html, 'html.parser')
headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
prev_heading = []
# Iterate through all headings and subheadings
for h in headings:
# Extract the heading text and remove HTML
heading = html2text.html2text(str(h)).strip()
# Initialize the content list
content = []
# Find the next heading or subheading
next_h = h.find_next(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
actual_heading = heading.lstrip('#').lstrip(' ')
# Iterate through all siblings until the next heading or subheading is reached
for sibling in h.next_siblings:
if sibling == next_h:
break
# If the sibling is a tag, extract the text and remove HTML
if sibling.name:
para = html2text.html2text(str(sibling)).strip()
if len(para) > 0:
content.append(para)
# If there are content entries, join them all together, clean up for utf-8 and write the row
if len(content) > 0:
content = "".join(content).replace("\n", "").encode('utf-8').decode('utf-8')
# If there are headings above this one without content, we concat them here
if len(prev_heading) > 0:
full_heading = " - ".join(prev_heading) + " - " + actual_heading
else:
full_heading = actual_heading
title = f"{title_prefix} - {page_title}"
# Store the extracted title, heading, content
row_uuid = str(uuid.uuid4())
nuuids.append(row_uuid)
ncontents.append(f"{title} - {full_heading} - {content}")
nurls.append(url)
prev_heading = []
else:
# Otherwise, we store this heading to append to the next sibling with content
prev_heading.append(actual_heading)
# Return the 3 arrays of titles, headings and content
return (nuuids, ncontents, nurls)
def count_content_tokens(
nuuids:list,
ncontents: list,
nurls: list
):
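    """Attach a token count to each row and shorten any row whose content exceeds `max_len` tokens."""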
    # Count the tokens of each section
    ncontent_ntokens = [
        count_tokens(c)  # Tokens from the content
        + 4  # Small fixed overhead per row
        + count_tokens(" ".join(id.split(" ")[1:-1]))  # UUIDs contain no spaces, so this term is always 0
        - (1 if len(c) == 0 else 0)
        for id, c in zip(nuuids, ncontents)
    ]
    # Create a tuple of (id, url, content, number of tokens), shortening any over-long content
    outputs = []
    outputs += [(id, u, c, tk) if tk < max_len
                else (id, u, reduce_long(c, max_len=max_len), count_tokens(reduce_long(c, max_len=max_len)))
                for id, u, c, tk in zip(nuuids, nurls, ncontents, ncontent_ntokens)]
return outputs
def extract_sections(
space: str,
limit: int = max_pages
):
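    """Fetch up to `limit` pages from a Confluence space and return (id, url, content, tokens) rows."""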
nuuids, ncontents, nurls = [], [], []
confluence_space = confluence.get_space(space_key=space)
space_title = confluence_space['name']
print(f"Fetching up to {limit} pages from '{space_title}'...")
# Search for all pages in a given space
results = confluence.get_all_pages_from_space(space=space, start=0, limit=limit)
page_ids = []
for result in results:
page_ids.append(result["id"])
# Iterate through the list of Confluence pages
for page_id in page_ids:
# Fetch the Confluence page
page = confluence.get_page_by_id(page_id=page_id, expand="body.storage")
# Extract the page title and content
page_title = page['title']
page_html = page['body']['storage']['value']
        page_url = page['_links']['base'] + page['_links']['webui']
pageIds, pageContent, pageUrls = extract_html_content(space_title, page_title, page_html, page_url)
nuuids += pageIds
ncontents += pageContent
nurls += pageUrls
return count_content_tokens(nuuids, ncontents, nurls)
def extract_zendesk_domain(
zendesk_domain: str,
limit: int = max_pages
):
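    """Walk a Zendesk Help Center (categories -> sections -> articles) and return (id, url, content, tokens) rows."""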
nuuids, ncontents, nurls = [], [], []
    total_pages = 0
URL = f"https://{zendesk_domain}.zendesk.com/api/v2/help_center/en-us"
print(f"Fetching up to {limit} pages from 'https://{zendesk_domain}.zendesk.com'...")
# Fetch the Categories from Zendesk
cat_response = requests.get(URL + '/categories.json')
cat_data = cat_response.json()
for category in cat_data['categories']:
category_title = category['name']
# Fetch the sections within the categories
sections_response = requests.get(URL + '/categories/' + str(category['id']) + '/sections.json')
sections_data = sections_response.json()
for section in sections_data['sections']:
page_title = section['name']
# Fetch the articles within the section
articles_response = requests.get(URL + '/sections/' + str(section['id']) + '/articles.json')
articles_data = articles_response.json()
for article in articles_data["articles"]:
page_title += " - " + article['title']
page_html = article['body']
page_url = article['html_url']
if (page_html is not None and total_pages < limit ):
pageIds, pageContent, pageUrls = extract_html_content(category_title, page_title, page_html, page_url)
nuuids += pageIds
ncontents += pageContent
nurls += pageUrls
total_pages += 1
            if articles_data['next_page'] is not None:
                pprint('TODO: paginated article lists are not handled yet (not seen in practice when fetching per section)')
return count_content_tokens(nuuids, ncontents, nurls)
def extract_csvfile(subdir, file):
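    """Read a CSV of 'heading,answer,answer,...' rows and return (id, url, content, tokens) rows (the file name is used as the url)."""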
nuuids, ncontents, nurls = [], [], []
csv_filepath = os.path.join(subdir, file)
subdir_name = os.path.basename(subdir)
file_name = os.path.splitext(file)[0]
print(f"Loading data from {csv_filepath}, subdir: {subdir_name}")
    if not args.use_dirs:
product = input(f"Please enter the product NAME for this file (default: {subdir_name}): ")
if not product:
product = subdir_name
product_area = input(f"Please enter the product AREA for this file (default: {file_name}): ")
if not product_area:
product_area = file_name
else:
product = subdir_name
product_area = file_name
title = f"{product} - {product_area}"
with open(csv_filepath, 'r', encoding='utf-8') as csv_file:
csv_reader = csv.reader(csv_file)
for row in csv_reader:
if row:
row_uuid = str(uuid.uuid4())
content = ""
if row[0]:
content += f"{row[0]} -"
if len(row) > 1 and row[1]:
content += row[1]
for i in range(2, len(row)):
if row[i]:
content += ' ' + row[i]
# If the content is empty, move on
if (len(content) > 0):
content = f"{title} - {content}"
else:
continue
nuuids.append(row_uuid)
ncontents.append(content)
nurls.append(file)
return count_content_tokens(nuuids, ncontents, nurls)
def index_pdf_content(subdir, file):
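    """Extract PDF text with pdfminer, grouping lines under headings detected by font size, and return (id, url, content, tokens) rows."""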
filepath = os.path.join(subdir, file)
nuuids, ncontents, nurls = [], [], []
print(f"Loading data from {filepath}")
subdir_name = os.path.basename(subdir)
file_name = os.path.splitext(file)[0]
    if not args.use_dirs:
product = input(f"Please enter the product NAME for this file (default: {subdir_name}): ")
if not product:
product = subdir_name
product_area = input(f"Please enter the product AREA for this file (default: {file_name}): ")
if not product_area:
product_area = file_name
else:
product = subdir_name
product_area = file_name
title = f"{product} - {product_area}"
# open the pdf file
with open(filepath, 'rb') as pdf_file:
laparams = LAParams()
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(pdf_file)
content = {}
current_headings = []
prev_heading_size = 0
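        # current_headings is a breadcrumb trail: a heading with a larger font size than the previous one
        # resets the trail, an equal size replaces the last heading, and a smaller size is appended as a sub-heading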
for page in pages:
interpreter.process_page(page)
layout = device.get_result()
for element in layout:
if isinstance(element, LTTextBoxHorizontal):
for text_line in element:
if isinstance(text_line, LTTextLineHorizontal):
is_heading = False
for char in text_line:
if isinstance(char, LTChar):
fontsize = char.size
if fontsize > pdf_content_fontsize:
is_heading = True
break
if is_heading:
heading = text_line.get_text().replace('\n', '')
if fontsize == prev_heading_size and len(current_headings) > 0:
current_headings.pop()
elif fontsize > prev_heading_size:
current_headings = []
current_headings.append(heading)
prev_heading_size = fontsize
break
line_text = element.get_text().replace('\n', '')
key = ' - '.join(current_headings)
if key not in content:
content[key] = [line_text]
else:
content[key].append(line_text)
for heading in content:
row_uuid = str(uuid.uuid4())
nuuids.append(row_uuid)
content_text = " ".join(content[heading])
ncontents.append(f"{heading} - {content_text}")
nurls.append(f"{file_name} - {heading}")
return count_content_tokens(nuuids, ncontents, nurls)
def index_txt_content(dir_path, file, max_chars=500):
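    """Split a plain-text file into blocks of at most `max_chars` characters and return (id, url, content, tokens) rows."""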
nuuids, ncontents, nurls = [], [], []
filepath = os.path.join(dir_path, file)
with open(filepath, 'r', encoding='utf-8') as txt_file:
content = txt_file.read()
blocks = [content[i:i+max_chars] for i in range(0, len(content), max_chars)]
for block in blocks:
nuuids.append(str(uuid.uuid4()))
ncontents.append(block)
nurls.append(file)
return count_content_tokens(nuuids, ncontents, nurls)
# Define the maximum number of tokens we allow per row
max_len = 1500
# For each Space, fetch the content and add to a list(title, heading, content, tokens)
res = []
for space in args.spaces:
print(f"INDEXING CONTENT FROM CONFLUENCE: {space}")
res += extract_sections(space)
for domain in args.zendesk:
print(f"INDEXING CONTENT FROM ZENDESK: {domain}.zendesk.com")
res += extract_zendesk_domain(domain)
if args.input and os.path.isdir(args.input):
for subdir, dirs, files in os.walk(args.input):
for file in files:
if file.endswith(".csv"):
res += extract_csvfile(subdir, file)
elif file.endswith(".pdf"):
res += index_pdf_content(subdir, file)
elif file.endswith(".txt"):
res += index_txt_content(subdir, file)
# Remove rows with fewer than min_tokens tokens and drop duplicate ids
df = pd.DataFrame(res, columns=["id", "url", "content", "tokens"])
df = df[df.tokens > min_tokens]
df = df.drop_duplicates(['id'])
df = df.reset_index(drop=True) # reset index
print(df.head())
# Store the content to a CSV
df.to_csv(args.out, index=False, escapechar='\\')
print(f"Done! File saved to {args.out}")