-
-
Notifications
You must be signed in to change notification settings - Fork 71
/
tomd.py
96 lines (75 loc) · 2.44 KB
/
tomd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from pyquery import PyQuery as pq
# Public API: the names exported by a star-import of this module.
__all__ = ['Tomd', 'convert']
# Block-level tags mapped to the Markdown prefix emitted before their text.
# Headings h1..h6 map to one '#' per level; the remaining tags are listed
# explicitly.  Key order matches the original literal.
MARKDOWN = {
    **{f"h{level}": "#" * level for level in range(1, 7)},
    "blockquote": ">",
    "li": "-",
    "hr": "---",
    "p": "\n",
}
# Inline tags mapped to the (opening, closing) Markdown markers wrapped around
# their text.  Every tag here uses the same marker on both sides.
INLINE = {
    tag: (marker, marker)
    for tag, marker in (
        ("em", "*"),
        ("strong", "**"),
        ("b", "**"),
        ("i", "*"),
        ("del", "~~"),
        ("code", "`"),
    )
}
# Placeholder written wherever a line break belongs in the output.
# Tomd.convert replaces every occurrence with "\n" after the final text
# extraction, so the breaks survive that extraction step.
split_str = "++++++++++++++++++"
class Tomd:
    """Convert an HTML string to Markdown using pyquery-driven substitution.

    Each supported tag is located with a CSS selector and the serialized form
    of every match is replaced, inside the working HTML string, by its
    Markdown equivalent.  Line breaks travel through the pipeline as
    ``split_str`` placeholders and become real newlines only after the final
    text extraction.
    """

    def __init__(self, html=''):
        """Store *html* so it can be converted later via ``markdown``."""
        self.html = html
        self._markdown = ""

    @staticmethod
    def _link_markdown(text, href):
        """Return ``[text](href)`` when *href* contains "http", else ``None``.

        ``None`` tells the caller to leave the anchor untouched.  A missing
        ``href`` (pyquery reports it as ``None``) is treated the same way
        instead of raising ``TypeError`` on the substring test.
        """
        if not href or "http" not in href:
            return None
        return f"[{text}]({href})"

    @staticmethod
    def _image_markdown(alt, src):
        """Return ``![alt](src)``; a missing attribute becomes an empty string.

        Without the fallback a missing ``alt`` would be rendered as the
        literal text ``None``.
        """
        return f"![{alt or ''}]({src or ''})"

    @staticmethod
    def _substitute(html, selector, render):
        """Replace every element matching *selector* in the *html* string.

        *render* receives the matched element wrapped in a PyQuery object and
        returns the replacement text, or ``None`` to leave that element as is.
        The document is parsed once per call; replacements are applied to the
        string form, so identical serializations are all replaced together.
        """
        for element in pq(html)(selector):
            node = pq(element)
            replacement = render(node)
            if replacement is not None:
                html = html.replace(str(node), replacement)
        return html

    def convert(self, html=""):
        """Convert *html* to Markdown, cache it on the instance and return it.

        Args:
            html: HTML source to convert.  ``None`` and empty or
                whitespace-only strings yield an empty result.

        Returns:
            The Markdown text (also stored in ``self._markdown``).
        """
        # Nothing to parse: answer directly rather than feeding the parser an
        # empty document.
        if html is None or (isinstance(html, str) and not html.strip()):
            self._markdown = ""
            return self._markdown

        document = pq(html)
        document('head').remove()
        # From here on we work with whatever pyquery's .html() returns for the
        # parsed document (the markup nested inside the first root element).
        html = document.html()
        if not html:
            # .html() produced nothing, so there is nothing left to convert.
            self._markdown = ""
            return self._markdown

        def render_inline(node):
            opening, closing = INLINE[node[0].tag]
            return opening + node.text() + closing

        def render_block(node):
            return split_str + MARKDOWN[node[0].tag] + " " + node.text() + split_str

        # Order matters: each pass rewrites the string the next pass parses.
        html = self._substitute(html, 'span', lambda node: node.text())
        html = self._substitute(
            html, 'a',
            lambda node: self._link_markdown(node.text(), node.attr('href')),
        )
        html = self._substitute(
            html, 'img',
            lambda node: self._image_markdown(node.attr('alt'), node.attr('src')),
        )
        # NOTE(review): this emits one separator cell fewer than there are
        # <th> cells -- kept as in the original; confirm against the intended
        # table output.
        html = self._substitute(
            html, 'thead',
            lambda node: node.outer_html() + '|------' * (len(node('th')) - 1),
        )
        html = self._substitute(html, 'th,td', lambda node: "|" + node.text())
        html = self._substitute(
            html, 'pre',
            # `or ''` guards against .html() returning None for an empty <pre>.
            lambda node: "```" + split_str + (node.html() or '') + split_str + "```" + split_str,
        )
        html = self._substitute(html, ','.join(INLINE), render_inline)
        html = self._substitute(html, ','.join(MARKDOWN), render_block)

        # Strip the remaining markup, then turn the placeholders into newlines.
        self._markdown = pq(html).text().replace(split_str, '\n')
        return self._markdown

    @property
    def markdown(self):
        """Markdown for the HTML given to the constructor (converted on access)."""
        self.convert(self.html)
        return self._markdown
# Shared module-level converter.  `convert` is its bound method, so callers
# can run a conversion without creating a Tomd instance themselves.  Every
# call made through it writes its result to this one instance's `_markdown`.
_inst = Tomd()
convert = _inst.convert