# clean.py — massages Binder HTML output (155 lines / 124 loc, 4.84 KB)
import argparse
import os
from bs4 import BeautifulSoup
def dir_path(path):
    """Validate that *path* names an existing directory.

    Intended as an argparse ``type=`` callable: returns the path unchanged
    when it is a directory, otherwise raises so argparse can report a
    usable error message.

    Args:
        path: Filesystem path to check.

    Returns:
        The path, unchanged, when it is an existing directory.

    Raises:
        NotADirectoryError: If *path* is not an existing directory.
    """
    # Parameter renamed from ``dir`` so it no longer shadows the builtin.
    if not os.path.isdir(path):
        raise NotADirectoryError(path)
    return path
def parse_args():
    """Build the CLI parser and return the parsed arguments.

    Positional ``in_dir``/``out_dir`` are validated as existing directories
    via ``dir_path``; ``--allow_file`` optionally names a file listing the
    components to keep, one per line.
    """
    parser = argparse.ArgumentParser(description='Massages Binder Output')
    parser.add_argument(
        'in_dir',
        type=dir_path,
        help='The directory containing files to clean',
    )
    parser.add_argument(
        'out_dir',
        type=dir_path,
        help='The directory to output to',
    )
    parser.add_argument(
        '--allow_file',
        help='A file containing components to include, one per line',
    )
    return parser.parse_args()
def read_allow_list(args):
    """Load the optional allow list named by ``args.allow_file``.

    Args:
        args: Parsed arguments; only ``args.allow_file`` is read.

    Returns:
        A list of the file's lines with surrounding whitespace stripped,
        or an empty list when no ``--allow_file`` was given.
    """
    if not args.allow_file:
        return []
    with open(args.allow_file) as f:
        # One entry per line; strip the trailing newline (and any spaces).
        return [line.strip() for line in f]
def clean(dir_entry, args, allow_list):
    """Clean one exported Binder HTML file and write it to ``args.out_dir``.

    Removes the search UI, edit links, Binder analytics/search/feedback
    scripts, external and disallowed links, empty TOC entries, and
    Binder-internal anchor attributes; rewrites static asset URLs to local
    ``static/`` paths; then injects a nav-toggle helper and Google
    Analytics snippets before writing the result.

    Args:
        dir_entry: ``os.DirEntry`` for the HTML file to clean.
        args: Parsed arguments; ``args.out_dir`` is the output directory.
        allow_list: Allowed link targets. When non-empty, links whose href
            is not listed are deleted (nav links) or unwrapped (content).
    """
    with open(dir_entry.path) as fp:
        # Name the parser explicitly: the bare BeautifulSoup(fp) form warns
        # and picks whichever parser (lxml/html5lib/html.parser) happens to
        # be installed, making output environment-dependent.
        soup = BeautifulSoup(fp, 'html.parser')
    # Remove search
    for el in soup.find_all(id='toc-search'):
        el.extract()
    # remove edit links
    for el in soup.find_all('a', 'edit'):
        el.extract()
    # remove unnecessary scripts, fingerprinted by their contents/src
    for el in soup.find_all('script'):
        # Google analytics
        if el.string and 'GoogleAnalyticsObject' in el.string:
            el.extract()
        # Search
        if el.string and 'function search' in el.string:
            el.extract()
        # Feedback prompt
        if el.string and 'feedback-button' in el.string:
            el.extract()
        # Revision check
        if el.string and 'checkRevisions' in el.string:
            el.extract()
        # binder JS
        if 'src' in el.attrs and 'binder.js' in el['src']:
            el.extract()
        if el.string and 'enable_safeWindowOpen' in el.string:
            el.extract()
        if el.string and 'loadLocalLink' in el.string:
            el.extract()
    for el in soup.find_all('a'):
        # Anchors without an href (e.g. named anchors) used to raise
        # KeyError here; skip them instead.
        href = el.get('href')
        if href is None:
            continue
        # remove external links
        if href.startswith('http'):
            el.unwrap()
        # remove footer nav links
        elif el.attrs.get('id') in ('next-link', 'prev-link', 'footer-next-link'):
            el.extract()
        # handle binder-internal links
        elif allow_list and href not in allow_list:
            # nav links get deleted
            if 'original-href' in el.attrs:
                el.extract()
            # links in content get converted to plain text
            else:
                el.unwrap()
    # Remove empty TOC entries
    for el in soup.find_all('div', 'ace-line selectable'):
        if not el.contents:
            el.extract()
    for el in soup.find_all('div', 'child-container'):
        if not el.contents:
            el.parent.extract()
    for el in soup.find_all('div', 'ace-line hidden'):
        el.extract()
    # remove Binder-internal navigation attributes from anchors
    # (bs4's Tag.__delitem__ is a no-op for missing attributes)
    for el in soup.find_all('a'):
        for attr in ('data-doc-id', 'original-href',
                     'prev-name', 'prev-href', 'next-name', 'next-href'):
            del el[attr]
    # Rewrite static link/script urls to the local static/ copies;
    # .get() guards <link> tags that carry no href.
    for el in soup.find_all('link'):
        if 'dropbox-appbox-static' in el.get('href', ''):
            el['href'] = 'static/' + el['href'].split('/')[-1]
    for el in soup.find_all('script'):
        if 'src' in el.attrs and 'dropbox-appbox-static' in el['src']:
            el['src'] = 'static/' + el['src'].split('/')[-1]
    # Insert needed JS
    s = soup.new_tag('script')
    s.string = 'function toggleLeftnav() { document.body.classList.toggle("nav-visible") }'
    s['type'] = 'text/javascript'
    soup.append(s)
    # Insert analytics
    s = soup.new_tag('script')
    s['type'] = 'text/javascript'
    s['async'] = None
    s['src'] = 'https://www.googletagmanager.com/gtag/js?id=G-H85723BFKM'
    soup.append(s)
    s = soup.new_tag('script')
    s['type'] = 'text/javascript'
    s.string = '''
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-H85723BFKM');
'''
    soup.append(s)
    # Output under the same file name in the output directory.
    with open(os.path.join(args.out_dir, dir_entry.name), mode='w') as wp:
        print(str(soup), file=wp)
def main():
    """Entry point: clean every eligible file in the input directory."""
    args = parse_args()
    allow_list = read_allow_list(args)
    for entry in os.scandir(args.in_dir):
        # An empty allow list means "process everything".
        if allow_list and entry.name not in allow_list:
            continue
        clean(entry, args, allow_list)


if __name__ == "__main__":
    main()