From c6afb218a59df0db583799438cbd2c93a53970e0 Mon Sep 17 00:00:00 2001 From: Yolk Date: Tue, 10 Jan 2017 10:31:05 +0200 Subject: [PATCH 01/21] Html processing refactoring --- html_telegraph_poster/html_to_telegraph.py | 110 +++++++++++++-------- tests/test.py | 27 +++++ 2 files changed, 96 insertions(+), 41 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 9f39414..5176628 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -49,13 +49,40 @@ def clean_article_html(html_string): return html_string.strip(' \t') -def _wrap_tag(element, wrapper): +def _create_element(element): + # creates lxml element without document tree (no body, no parents) new_element = html.HtmlElement() - new_element.tag = wrapper + new_element.tag = element + return new_element + + +def _wrap_tag(element, wrapper): + new_element = _create_element(wrapper) new_element.append(element) return new_element +def _fragments_from_string(html_string): + fragments = html.fragments_fromstring(html_string) + if not len(fragments): + return [] + # convert and append text node before starting tag + if not isinstance(fragments[0], html.HtmlElement): + if len(fragments[0].strip()) > 0: + if len(fragments) == 1: + return html.fragments_fromstring('

%s

' % fragments[0]) + else: + paragraph = _create_element('p') + paragraph.text = fragments[0] + fragments[1].addprevious(paragraph) + fragments.insert(1, paragraph) + + fragments.pop(0) + if not len(fragments): + return [] + return fragments + + def preprocess_media_tags(element): if isinstance(element, html.HtmlElement): if element.tag == 'figcaption': @@ -78,7 +105,8 @@ def preprocess_media_tags(element): elif vimeo: element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2))) - element = _wrap_tag(element, 'figure') + element.addprevious(_create_element('figure')) + element.getprevious().append(element) elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet': twitter_links = element.xpath('.//a') for tw_link in twitter_links: @@ -86,34 +114,31 @@ def preprocess_media_tags(element): twitter_frame = html.HtmlElement() twitter_frame.tag = 'iframe' twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href'))) - element = _wrap_tag(twitter_frame, 'figure') - - return element + element.addprevious(_wrap_tag(twitter_frame, 'figure')) + element.drop_tree() def preprocess_fragments(fragments): - processed_fragments = [] bad_tags = [] if not len(fragments): - return processed_fragments + return None - # convert and append text node before starting tag - if not isinstance(fragments[0], html.HtmlElement): - if len(fragments[0].strip()) > 0: - processed_fragments.append(html.fromstring('

%s

' % fragments[0])) - fragments.pop(0) - if not len(fragments): - return processed_fragments + body = fragments[0].getparent() for fragment in fragments: # figure should be on the top level if fragment.find('figure') is not None: f = fragment.find('figure') - processed_fragments.append(f) - fragment.remove(f) + body.append(f) + + images_to_wrap = fragment.xpath('.//self::p[not(normalize-space(string()))]//img') + if len(images_to_wrap): + for image in images_to_wrap: + image.tail = '' + body.append(_wrap_tag(image, 'figure')) - processed_fragments.append(fragment) + body.append(fragment) # bad iframes ns = {'re': "http://exslt.org/regular-expressions"} bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns)) @@ -125,15 +150,25 @@ def preprocess_fragments(fragments): bad_tag.drop_tag() if bad_tag in fragments: fragments.remove(bad_tag) - if bad_tag in processed_fragments: - processed_fragments.remove(bad_tag) - return processed_fragments + for fragment in fragments: + if fragment.tag not in allowed_top_level_tags: + paragraph = _create_element('p') + fragment.addprevious(paragraph) + paragraph.append(fragment) + else: + # convert and append text nodes after closing tag + if fragment.tail and len(fragment.tail.strip()) != 0: + paragraph = _create_element('p') + paragraph.text = fragment.tail + fragment.addnext(paragraph) + fragment.tail = '' + + return len(body.getchildren()) and body def _recursive_convert(element): - element = preprocess_media_tags(element) fragment_root_element = { 'tag': element.tag } @@ -165,26 +200,19 @@ def convert_html_to_telegraph_format(html_string, clean_html=True): if clean_html: html_string = clean_article_html(html_string) - fragments = preprocess_fragments( - html.fragments_fromstring(html_string) - ) - content = [] - - for fragment in fragments: - - if fragment.tag not in allowed_top_level_tags: - paragraph = html.HtmlElement() - paragraph.tag = 'p' - 
paragraph.append(fragment) - content.append(_recursive_convert(paragraph)) - else: - content.append(_recursive_convert(fragment)) + body = preprocess_fragments( + _fragments_from_string(html_string) + ) + if body: + for x in body.iterdescendants(): + preprocess_media_tags(x) + else: + fragments = _fragments_from_string(html_string) + body = fragments[0].getparent() if len(fragments) else None - # convert and append text nodes after closing tag - if fragment.tail and len(fragment.tail.strip()) != 0: - content.append( - _recursive_convert(html.fromstring('

%s

' % fragment.tail)) - ) + content = [] + if body: + content = [_recursive_convert(x) for x in body.iterchildren()] return json.dumps(content, ensure_ascii=False) diff --git a/tests/test.py b/tests/test.py index 25c1d8a..6d3fd40 100644 --- a/tests/test.py +++ b/tests/test.py @@ -13,6 +13,18 @@ def assertJson(self, first, second): json.loads(second) ) + def test_text_only(self): + html = 'only plain text' + html_empty_string = ' ' + self.assertJson( + [{'children': ['only plain text'], 'tag': 'p'}], + convert_html_to_telegraph_format(html, clean_html=True) + ) + self.assertJson( + [], + convert_html_to_telegraph_format(html_empty_string, clean_html=True) + ) + def test_text_on_top(self): html = '''
@@ -324,6 +336,21 @@ def test_lists(self): convert_html_to_telegraph_format(empty_list, clean_html=True) ) + def test_convert_without_clean(self): + # multiple br tags should be replaced with one line break + html = 'Text first line' \ + '


' \ + '

text


' \ + '' + self.assertJson( + [{'tag': 'p', 'children': ['Text first line']}, {'tag': 'br'}, {'tag': 'br'}, + {'tag': 'br', 'attrs': {'class': 'somebrclass'}}, {'tag': 'div'}, + {'tag': 'br', 'attrs': {'id': 'somebrid'}}, {'tag': 'p', 'children': ['text']}, {'tag': 'br'}, + {'tag': 'span', + 'children': [{'tag': 'em', 'children': [{'tag': 'strong', 'children': [{'tag': 'i'}, {'tag': 'u'}]}]}]}], + convert_html_to_telegraph_format(html, clean_html=False) + ) + class UploadImageTest(unittest.TestCase): From 23e4a923d1f07540299fa25970864f8ef2688691 Mon Sep 17 00:00:00 2001 From: Yolk Date: Tue, 10 Jan 2017 10:52:34 +0200 Subject: [PATCH 02/21] Html processing refactoring (stage 2) --- html_telegraph_poster/html_to_telegraph.py | 24 +++++++++--- tests/test.py | 45 ++++++++++++++-------- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 5176628..9e8c57d 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -49,10 +49,12 @@ def clean_article_html(html_string): return html_string.strip(' \t') -def _create_element(element): +def _create_element(element, text=None): # creates lxml element without document tree (no body, no parents) new_element = html.HtmlElement() new_element.tag = element + if text: + new_element.text = text return new_element @@ -132,18 +134,19 @@ def preprocess_fragments(fragments): f = fragment.find('figure') body.append(f) - images_to_wrap = fragment.xpath('.//self::p[not(normalize-space(string()))]//img') + images_to_wrap = fragment.xpath('.//self::img[not(ancestor::figure)]') if len(images_to_wrap): for image in images_to_wrap: - image.tail = '' body.append(_wrap_tag(image, 'figure')) + if image.tail: + body.append(_create_element('p', text=image.tail)) + image.tail = '' - body.append(fragment) # bad iframes ns = {'re': "http://exslt.org/regular-expressions"} 
bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns)) # bad lists (remove lists/list items if empty) - nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li|//p') + nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li') bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0]) for bad_tag in bad_tags: @@ -167,6 +170,15 @@ def preprocess_fragments(fragments): return len(body.getchildren()) and body +def post_process(body): + + bad_tags = body.xpath('//p|//a') + + for x in bad_tags: + if len(x.text_content().strip()) == 0: + x.drop_tag() + + def _recursive_convert(element): fragment_root_element = { @@ -206,6 +218,8 @@ def convert_html_to_telegraph_format(html_string, clean_html=True): if body: for x in body.iterdescendants(): preprocess_media_tags(x) + + post_process(body) else: fragments = _fragments_from_string(html_string) body = fragments[0].getparent() if len(fragments) else None diff --git a/tests/test.py b/tests/test.py index 6d3fd40..3aa7d21 100644 --- a/tests/test.py +++ b/tests/test.py @@ -131,7 +131,7 @@ def test_image_inside_paragraph(self): html = '

' \ '

' - para_with_text = '

abc

' + para_with_text = '

abc xyz

' para_with_figure = '

test

' self.assertJson( @@ -142,9 +142,12 @@ def test_image_inside_paragraph(self): ], convert_html_to_telegraph_format(html, clean_html=True) ) + self.assertJson( [ - {'tag': 'p', 'children': [' ', {'tag': 'img', 'attrs': {'src': 'image1.jpg'}}, 'abc ']} + {"tag": "p", "children": [" abc "]}, + {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image1.jpg"}}]}, + {"tag": "p", "children": ["xyz "]} ], convert_html_to_telegraph_format(para_with_text, clean_html=True) ) @@ -158,19 +161,20 @@ def test_image_inside_paragraph(self): def test_image_tag_at_the_top(self): html = '' - html_with_text_after = ' Text after' - html_with_text_before = 'Text before ' + html_with_text_after = ' Text after' + html_with_text_before = 'Text before ' html_joined = html_with_text_before + html_with_text_after self.assertJson( [ - {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "p"} + {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "figure"} ], convert_html_to_telegraph_format(html, clean_html=True) ) self.assertJson( [ - {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}, ' Text after'], "tag": "p"} + {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image1.jpg"}}]}, + {"tag": "p", "children": [" Text after"]} ], convert_html_to_telegraph_format(html_with_text_after, clean_html=True) ) @@ -178,15 +182,17 @@ def test_image_tag_at_the_top(self): self.assertJson( [ {"children": ["Text before "], "tag": "p"}, - {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "p"} + {"children": [{"attrs": {"src": "image0.jpg"}, "tag": "img"}], "tag": "figure"} ], convert_html_to_telegraph_format(html_with_text_before, clean_html=True) ) + self.assertJson( [ - {"children": ["Text before "], "tag": "p"}, - {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "p"}, - {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}, " Text after"], "tag": "p"} + {"tag": "p", "children": ["Text before "]}, + 
{"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image0.jpg"}}]}, + {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image1.jpg"}}]}, + {"tag": "p", "children": [" Text after"]} ], convert_html_to_telegraph_format(html_joined, clean_html=True) ) @@ -238,8 +244,8 @@ def test_iframe(self): mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src self.assertJson( [ - {'tag': 'p', 'children': [{'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': { - 'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]}]} + {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': { + 'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]} ], convert_html_to_telegraph_format(html, clean_html=True) ) @@ -258,8 +264,8 @@ def test_iframe(self): self.assertJson( [ - {'tag': 'p', 'children': [{'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': { - 'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]}]} + {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': { + 'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]} ], convert_html_to_telegraph_format(mix, clean_html=True) ) @@ -276,7 +282,7 @@ def test_iframe(self): ) self.assertJson( [ - {u'tag': u'p', u'children': [{u'tag': u'figure', u'children': [{u'tag': u'iframe', u'attrs': {u'src': u'/embed/vimeo?url=https%3A%2F%2Fvimeo.com%2F1185346'}}]}]} + {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {'src': '/embed/vimeo?url=https%3A%2F%2Fvimeo.com%2F1185346'}}]} ], convert_html_to_telegraph_format(iframe_vimeo, clean_html=True) ) @@ -351,6 +357,15 @@ def test_convert_without_clean(self): convert_html_to_telegraph_format(html, clean_html=False) ) + def test_empty_links(self): + html = ' ' + + self.assertJson( + [ + {'tag': 'figure', 'children': [{'tag': 'img', 'attrs': {'src': 'http://httpbin.org/image/jpeg'}}]} + ], + convert_html_to_telegraph_format(html, 
clean_html=True) + ) class UploadImageTest(unittest.TestCase): From ec498535250d92da46da792084924e8ece8a7143 Mon Sep 17 00:00:00 2001 From: Yolk Date: Tue, 10 Jan 2017 19:28:46 +0200 Subject: [PATCH 03/21] Html processing refactoring (stage 3) --- html_telegraph_poster/html_to_telegraph.py | 34 ++++++++++++++-------- tests/test.py | 25 +++++++++++++++- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 9e8c57d..4530d07 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -58,6 +58,12 @@ def _create_element(element, text=None): return new_element +def _insert_after(element, ref): + parent = ref.getparent() + parent.insert(parent.index(ref) + 1, element) + return element + + def _wrap_tag(element, wrapper): new_element = _create_element(wrapper) new_element.append(element) @@ -129,18 +135,21 @@ def preprocess_fragments(fragments): body = fragments[0].getparent() for fragment in fragments: + last_element = fragment # figure should be on the top level if fragment.find('figure') is not None: f = fragment.find('figure') - body.append(f) + last_element = _insert_after(f, last_element) images_to_wrap = fragment.xpath('.//self::img[not(ancestor::figure)]') - if len(images_to_wrap): - for image in images_to_wrap: - body.append(_wrap_tag(image, 'figure')) - if image.tail: - body.append(_create_element('p', text=image.tail)) - image.tail = '' + for image in images_to_wrap: + figure = _create_element('figure') + last_element = _insert_after(figure, last_element) + figure.append(image) + + if image.tail: + _insert_after(_create_element('p', text=image.tail), last_element) + image.tail = '' # bad iframes ns = {'re': "http://exslt.org/regular-expressions"} @@ -167,7 +176,7 @@ def preprocess_fragments(fragments): fragment.addnext(paragraph) fragment.tail = '' - return len(body.getchildren()) and body + return 
len(body.getchildren()) and body or None def post_process(body): @@ -215,9 +224,10 @@ def convert_html_to_telegraph_format(html_string, clean_html=True): body = preprocess_fragments( _fragments_from_string(html_string) ) - if body: - for x in body.iterdescendants(): - preprocess_media_tags(x) + if body is not None: + desc = [x for x in body.iterdescendants()] + for tag in desc: + preprocess_media_tags(tag) post_process(body) else: @@ -225,7 +235,7 @@ def convert_html_to_telegraph_format(html_string, clean_html=True): body = fragments[0].getparent() if len(fragments) else None content = [] - if body: + if body is not None: content = [_recursive_convert(x) for x in body.iterchildren()] return json.dumps(content, ensure_ascii=False) diff --git a/tests/test.py b/tests/test.py index 3aa7d21..31ca714 100644 --- a/tests/test.py +++ b/tests/test.py @@ -133,7 +133,8 @@ def test_image_inside_paragraph(self): para_with_text = '

abc xyz

' para_with_figure = '

test

' - + para_img1 = '

Text 1

test

Text 2

' + para_img2 = '

Text 1 Text after image

Text 2

' self.assertJson( [ {"children": [{"attrs": {"src": "image0.jpg"}, "tag": "img"}], "tag": "figure"}, @@ -159,6 +160,28 @@ def test_image_inside_paragraph(self): convert_html_to_telegraph_format(para_with_figure, clean_html=True) ) + self.assertJson( + [ + {"tag": "p", "children": ["Text 1 "]}, + { + "tag": "figure", "children": + [" ", {"tag": "img", "attrs": {"src": "image0.jpg"}}, " ", + {"tag": "figcaption", "children": ["test"]}] + }, + {"tag": "p", "children": ["Text 2"]} + ], + convert_html_to_telegraph_format(para_img1, clean_html=True) + ) + + self.assertJson( + [ + {"tag": "p", "children": [" Text 1 "]}, + {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image0.jpg"}}]}, + {"tag": "p", "children": ["Text after image "]}, {"tag": "p", "children": ["Text 2 "]} + ], + convert_html_to_telegraph_format(para_img2, clean_html=True) + ) + def test_image_tag_at_the_top(self): html = '' html_with_text_after = ' Text after' From 42c8ea6d032ea6a6fd61392d390140e9a1f01fae Mon Sep 17 00:00:00 2001 From: Yolk Date: Tue, 10 Jan 2017 22:20:06 +0200 Subject: [PATCH 04/21] Version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 76f9aa9..ccd7fe9 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.0.22', + version='0.1.0', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', From 5dc84b8433bf079aaecb495631d272a1f28b368c Mon Sep 17 00:00:00 2001 From: Yolk Date: Wed, 11 Jan 2017 11:48:24 +0200 Subject: [PATCH 05/21] Option to disable html cleaning and processing --- html_telegraph_poster/html_to_telegraph.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 4530d07..c414275 100644 --- 
a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -242,14 +242,15 @@ def convert_html_to_telegraph_format(html_string, clean_html=True): def _upload(title, author, text, - author_url='', tph_uuid=None, page_id=None, user_agent=default_user_agent, convert_html=True): + author_url='', tph_uuid=None, page_id=None, user_agent=default_user_agent, convert_html=True, + clean_html=True): if not title: raise TitleRequiredError('Title is required') if not text: raise TextRequiredError('Text is required') - content = convert_html_to_telegraph_format(text) if convert_html else text + content = convert_html_to_telegraph_format(text, clean_html) if convert_html else text cookies = dict(tph_uuid=tph_uuid) if tph_uuid and page_id else None fields = { @@ -286,7 +287,7 @@ def upload_to_telegraph(title, author, text, author_url='', tph_uuid=None, page_ class TelegraphPoster(object): - def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent): + def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, clean_html=True): self.title = None self.author = None self.author_url = None @@ -294,6 +295,7 @@ def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent): self.tph_uuid = tph_uuid self.page_id = page_id self.user_agent = user_agent + self.clean_html = clean_html def post(self, title, author, text, author_url=''): result = self.edit( @@ -310,12 +312,13 @@ def post(self, title, author, text, author_url=''): return result def edit(self, title=None, author=None, text=None): - return upload_to_telegraph( + return _upload( title=title or self.title, author=author or self.author, text=text or self.text, author_url=self.author_url, tph_uuid=self.tph_uuid, page_id=self.page_id, - user_agent=self.user_agent + user_agent=self.user_agent, + clean_html=self.clean_html ) From e46503b3248c15f4962dcb0ca9b3c8c7bc32756e Mon Sep 17 00:00:00 2001 From: Yolk Date: Wed, 11 Jan 2017 12:48:14 +0200 
Subject: [PATCH 06/21] Optimizing bad tags processing --- html_telegraph_poster/html_to_telegraph.py | 7 +++---- tests/test.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index c414275..385f165 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -93,10 +93,7 @@ def _fragments_from_string(html_string): def preprocess_media_tags(element): if isinstance(element, html.HtmlElement): - if element.tag == 'figcaption': - # figcaption may have only text content - [e.drop_tag() for e in element.findall('*')] - elif element.tag in ['ol', 'ul']: + if element.tag in ['ol', 'ul']: # ignore any spaces between
    and
  • element.text = '' elif element.tag == 'li': @@ -154,6 +151,8 @@ def preprocess_fragments(fragments): # bad iframes ns = {'re': "http://exslt.org/regular-expressions"} bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns)) + # figcaption may have only text content + bad_tags.extend(fragments[-1].xpath("//figcaption//*")) # bad lists (remove lists/list items if empty) nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li') bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0]) diff --git a/tests/test.py b/tests/test.py index 31ca714..b9e5f74 100644 --- a/tests/test.py +++ b/tests/test.py @@ -133,7 +133,7 @@ def test_image_inside_paragraph(self): para_with_text = '

    abc xyz

    ' para_with_figure = '

    test

    ' - para_img1 = '

    Text 1

    test

    Text 2

    ' + para_img1 = '

    Text 1

    test

    Text 2

    ' para_img2 = '

    Text 1 Text after image

    Text 2

    ' self.assertJson( [ From 2ea0df4217b3720d4a662e7499868e21415c6c49 Mon Sep 17 00:00:00 2001 From: Yolk Date: Sun, 15 Jan 2017 13:53:56 +0200 Subject: [PATCH 07/21] Added pre tag support --- html_telegraph_poster/html_to_telegraph.py | 49 +++++++++++++++-- tests/test.py | 63 ++++++++++++++++++++++ 2 files changed, 107 insertions(+), 5 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 385f165..5066a64 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -11,13 +11,15 @@ base_url = 'https://telegra.ph' save_url = 'https://edit.telegra.ph/save' default_user_agent = 'Python_telegraph_poster/0.1' -allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'em', 'figcaption', 'figure', 'h3', 'h4', 'hr', 'i', 'iframe', - 'img', 'li', 'ol', 'p', 's', 'strong', 'u', 'ul', 'video'] -allowed_top_level_tags = ['aside', 'blockquote', 'figure', 'h3', 'h4', 'hr', 'ol', 'p', 'ul'] +allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'em', 'figcaption', 'figure', 'h3', 'h4', 'hr', 'i', + 'iframe', 'img', 'li', 'ol', 'p', 'pre', 's', 'strong', 'u', 'ul', 'video'] +allowed_top_level_tags = ['aside', 'blockquote', 'pre', 'figure', 'h3', 'h4', 'hr', 'ol', 'p', 'ul'] youtube_re = r'(https?:)?//(www\.)?youtube(-nocookie)?\.com/embed/' vimeo_re = r'(https?:)?//player\.vimeo\.com/video/(\d+)' twitter_re = re.compile(r'(https?:)?//(www\.)?twitter\.com/[A-Za-z0-9_]{1,15}/status/\d+') +pre_content_re = re.compile(r']*>[^<]*') +line_breaks_and_empty_strings = re.compile('(^[\s\t]*)?\r?\n', flags=re.MULTILINE) def clean_article_html(html_string): @@ -41,14 +43,36 @@ def clean_article_html(html_string): cleaned = c.clean_html(html_string) # remove wrapped div cleaned = cleaned[5:-6] - # remove all line breaks and empty strings (in html it means nothing) - html_string = re.sub('(^[\s\t]*)?\r?\n', '', cleaned, flags=re.MULTILINE) + # remove all line breaks and empty 
strings + html_string = replace_line_breaks_except_pre(cleaned) # but replace multiple br tags with one line break, telegraph will convert it to
    html_string = re.sub(r'(|\s[^<>]*>)\s*)+', '\n', html_string) return html_string.strip(' \t') +def replace_line_breaks_except_pre(html_string): + # Remove all line breaks and empty strings, except pre tag + # how to make it in one string? :\ + pre_ranges = [0] + out = '' + + # get
     start/end position
    +    for x in pre_content_re.finditer(html_string):
    +        start, end = x.start(), x.end()
    +        pre_ranges.extend((start, end))
    +    pre_ranges.append(len(html_string))
    +
    +    # all odd elements are 
    , leave them untouched
    +    for k in range(1, len(pre_ranges)):
    +        part = html_string[pre_ranges[k-1]:pre_ranges[k]]
    +        if k % 2 == 0:
    +            out += part
    +        else:
    +            out += line_breaks_and_empty_strings.sub('', part)
    +    return out
    +
    +
     def _create_element(element, text=None):
         # creates lxml element without document tree (no body, no parents)
         new_element = html.HtmlElement()
    @@ -153,6 +177,10 @@ def preprocess_fragments(fragments):
         bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns))
         # figcaption may have only text content
         bad_tags.extend(fragments[-1].xpath("//figcaption//*"))
    +
    +    # drop all tags inside pre
    +    bad_tags.extend(fragments[-1].xpath("//pre//*"))
    +
         # bad lists (remove lists/list items if empty)
         nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li')
         bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0])
    @@ -186,6 +214,17 @@ def post_process(body):
             if len(x.text_content().strip()) == 0:
                 x.drop_tag()
     
    +    # group following pre elements into single one (telegraph is buggy)
    +    pres = body.xpath('//pre')
    +    for pre in pres:
    +        next_pre = pre.getnext()
    +        while next_pre is not None and next_pre.tag == 'pre':
    +            pre.text += "\n" + next_pre.text
    +            current_pre = next_pre
    +            next_pre = next_pre.getnext()
    +            pres.remove(current_pre)
    +            current_pre.drop_tree()
    +
     
     def _recursive_convert(element):
     
    diff --git a/tests/test.py b/tests/test.py
    index b9e5f74..f1ef916 100644
    --- a/tests/test.py
    +++ b/tests/test.py
    @@ -390,6 +390,69 @@ def test_empty_links(self):
                 convert_html_to_telegraph_format(html, clean_html=True)
             )
     
    +    def test_code_block(self):
    +        html = '''
    +        def test_code_block(self):
    +            html = ''
    +            print("hello world")
    +        
    ''' + html2 = ''' +

    +                def hello_world():
    +                    print("hello")
    +            
    +
    print("second pre")
    +

    +

    Text after pre

    + ''' + html3 = ''' +
    my_list = [1, 2, 3, 4, 5, 6, 7]
    +EVEN = slice(1, None, 2)
    +print(my_list[EVEN])     # [2, 4, 6]
    +
    +

    paragraph splitter

    +
     String anotherCodeBlock = "separated code block"
    +
      String anotherCodeBlock2 = "separated code block2"
    +
      String anotherCodeBlock3 = "separated code block3"
    +

    paragraph splitter

    +
      String anotherCodeBlock4 = "separated code block4"
    +
      String anotherCodeBlock5 = "separated code block5"
    +

    paragraph splitter

    +
      String anotherCodeBlock6 = "separated code block6"
    + ''' + self.assertJson( + [ + {"tag": "pre", "children": [ + "\n def test_code_block(self):\n html = ''\n print(\"hello world\")\n "]} + ], + convert_html_to_telegraph_format(html, clean_html=True) + ) + self.assertJson( + [ + {"tag": "pre", "attrs": {"class": "code"}, "children": [ + "\n def hello_world():\n print(\"hello\")\n \nprint(\"second pre\")"]}, + {"tag": "p", "children": [" Text after pre "]} + ], + convert_html_to_telegraph_format(html2, clean_html=True) + ) + self.assertJson( + [ + {"tag": "pre", "children": [ + "my_list = [1, 2, 3, 4, 5, 6, 7]\nEVEN = slice(1, None, 2)\nprint(my_list[EVEN]) # [2, 4, 6]\n"]}, + {"tag": "p", "children": [" paragraph splitter"]}, + {"tag": "pre", "children": [ + " String anotherCodeBlock = \"separated code block\"\n String anotherCodeBlock2 = \"separated code block2\"\n String anotherCodeBlock3 = \"separated code block3\""]}, + {"tag": "p", "children": [" paragraph splitter"]}, + {"tag": "pre", "children": [ + " String anotherCodeBlock4 = \"separated code block4\"\n String anotherCodeBlock5 = \"separated code block5\""]}, + {"tag": "p", "children": [" paragraph splitter"]}, + {"tag": "pre", "children": [" String anotherCodeBlock6 = \"separated code block6\""]} + ], + convert_html_to_telegraph_format(html3, clean_html=True) + ) + + class UploadImageTest(unittest.TestCase): def test_upload(self): From 8a02792ffd4ff5cd21724e3ddff9b96a486eb2e8 Mon Sep 17 00:00:00 2001 From: Yolk Date: Sun, 15 Jan 2017 13:58:01 +0200 Subject: [PATCH 08/21] Added pre tag support version bump --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ccd7fe9..d3d7841 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.0', + version='0.1.1', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', From 
5f242a3134531cc9edae3cee38f2b7f0fb03b528 Mon Sep 17 00:00:00 2001 From: Yolk Date: Thu, 19 Jan 2017 20:08:37 +0200 Subject: [PATCH 09/21] Minor improvements --- html_telegraph_poster/html_to_telegraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 5066a64..1edf689 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -218,7 +218,7 @@ def post_process(body): pres = body.xpath('//pre') for pre in pres: next_pre = pre.getnext() - while next_pre is not None and next_pre.tag == 'pre': + while next_pre is not None and next_pre in pres: pre.text += "\n" + next_pre.text current_pre = next_pre next_pre = next_pre.getnext() From 68f629fc2ec614b40817c809fab2f90aaa1f0586 Mon Sep 17 00:00:00 2001 From: Yolk Date: Thu, 19 Jan 2017 20:15:30 +0200 Subject: [PATCH 10/21] Create textnodes from strings --- html_telegraph_poster/html_to_telegraph.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 1edf689..f490dd0 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -94,6 +94,17 @@ def _wrap_tag(element, wrapper): return new_element +def join_following_elements(elements, join_string): + for element in elements: + next_element = element.getnext() + while next_element is not None and next_element in elements: + element.text += join_string + next_element.text + current = next_element + next_element = next_element.getnext() + elements.remove(current) + current.drop_tree() + + def _fragments_from_string(html_string): fragments = html.fragments_fromstring(html_string) if not len(fragments): @@ -215,15 +226,7 @@ def post_process(body): x.drop_tag() # group following pre elements into single one (telegraph is buggy) - pres = 
body.xpath('//pre') - for pre in pres: - next_pre = pre.getnext() - while next_pre is not None and next_pre in pres: - pre.text += "\n" + next_pre.text - current_pre = next_pre - next_pre = next_pre.getnext() - pres.remove(current_pre) - current_pre.drop_tree() + join_following_elements(body.xpath('//pre'), join_string="\n") def _recursive_convert(element): From 70537df713a5979d7281211e8da469d17548f855 Mon Sep 17 00:00:00 2001 From: Yolk Date: Thu, 19 Jan 2017 22:02:21 +0200 Subject: [PATCH 11/21] Refactoring --- html_telegraph_poster/html_to_telegraph.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index f490dd0..ac7abbf 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -94,15 +94,19 @@ def _wrap_tag(element, wrapper): return new_element -def join_following_elements(elements, join_string): +def join_following_elements(elements, join_string=''): for element in elements: next_element = element.getnext() while next_element is not None and next_element in elements: - element.text += join_string + next_element.text current = next_element next_element = next_element.getnext() + if current.text: + current.text = join_string + current.text + if current.tail: + current.tail = current.tail.strip() + element.append(current) elements.remove(current) - current.drop_tree() + current.drop_tag() def _fragments_from_string(html_string): From ab6c5a727792057f948dd5e7329b809bae366b2c Mon Sep 17 00:00:00 2001 From: Yolk Date: Fri, 20 Jan 2017 13:50:11 +0200 Subject: [PATCH 12/21] Plain text should not be inside figure --- html_telegraph_poster/html_to_telegraph.py | 16 +++++++++++++--- setup.py | 2 +- tests/test.py | 8 ++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index ac7abbf..4f3caf2 100644 --- 
a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -94,6 +94,15 @@ def _wrap_tag(element, wrapper): return new_element +def _wrap_figure(element): + figure = _create_element('figure') + element.addprevious(figure) + element.drop_tag() + element.tail = '' + figure.append(element) + return figure + + def join_following_elements(elements, join_string=''): for element in elements: next_element = element.getnext() @@ -149,8 +158,8 @@ def preprocess_media_tags(element): elif vimeo: element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2))) - element.addprevious(_create_element('figure')) - element.getprevious().append(element) + _wrap_figure(element) + elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet': twitter_links = element.xpath('.//a') for tw_link in twitter_links: @@ -158,7 +167,8 @@ def preprocess_media_tags(element): twitter_frame = html.HtmlElement() twitter_frame.tag = 'iframe' twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href'))) - element.addprevious(_wrap_tag(twitter_frame, 'figure')) + element.addprevious(twitter_frame) + _wrap_figure(twitter_frame) element.drop_tree() diff --git a/setup.py b/setup.py index d3d7841..fe8178f 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.1', + version='0.1.2', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', diff --git a/tests/test.py b/tests/test.py index f1ef916..45146e6 100644 --- a/tests/test.py +++ b/tests/test.py @@ -262,6 +262,7 @@ def test_iframe(self): iframe_no_src = '' iframe_child_no_src = '

    ' iframe_text_before = 'text before ' + iframe_text_after = '

    Text after

    ' iframe_not_allowed_src = '
    ' iframe_vimeo = '' mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src @@ -309,6 +310,13 @@ def test_iframe(self): ], convert_html_to_telegraph_format(iframe_vimeo, clean_html=True) ) + self.assertJson( + [ + {"tag": "p", "children": [{"tag": "figure", "children": [{"tag": "iframe", "attrs": { + "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]}, "Text after "]} + ], + convert_html_to_telegraph_format(iframe_text_after, clean_html=True) + ) def test_twitter_links(self): html = ''' From 177668d642fd617a73a6a6e93a53cd45c5423c61 Mon Sep 17 00:00:00 2001 From: Yolk Date: Fri, 20 Jan 2017 17:45:47 +0200 Subject: [PATCH 13/21] Avoid nested figures --- html_telegraph_poster/html_to_telegraph.py | 3 ++- setup.py | 2 +- tests/test.py | 11 ++++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 4f3caf2..1766070 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -158,7 +158,8 @@ def preprocess_media_tags(element): elif vimeo: element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2))) - _wrap_figure(element) + if not len(element.xpath('./ancestor::figure')): + _wrap_figure(element) elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet': twitter_links = element.xpath('.//a') diff --git a/setup.py b/setup.py index fe8178f..72c2895 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.2', + version='0.1.3', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', diff --git a/tests/test.py b/tests/test.py index 45146e6..156e21d 100644 --- a/tests/test.py +++ b/tests/test.py @@ -266,11 +266,12 @@ def test_iframe(self): 
iframe_not_allowed_src = '
    ' iframe_vimeo = '' mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src + iframe_with_figure = '
    Text after
    ' self.assertJson( [ {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': { 'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]} - ], + ], convert_html_to_telegraph_format(html, clean_html=True) ) self.assertJson( @@ -317,6 +318,14 @@ def test_iframe(self): ], convert_html_to_telegraph_format(iframe_text_after, clean_html=True) ) + self.assertJson( + [ + {u'tag': u'figure', u'children': [{u'tag': u'iframe', u'attrs': { + u'src': u'/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}, u'Text after ']} + ], + convert_html_to_telegraph_format(iframe_with_figure, clean_html=True) + ) + def test_twitter_links(self): html = ''' From a43f485b1b8f8bdcc8156d38d36e0a911c77524e Mon Sep 17 00:00:00 2001 From: Yolk Date: Sat, 21 Jan 2017 15:54:10 +0200 Subject: [PATCH 14/21] More correct formatting (fixed image order inside paragraph) --- html_telegraph_poster/html_to_telegraph.py | 50 ++++++++++++++-------- setup.py | 2 +- tests/test.py | 5 ++- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 1766070..0c1d24f 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -173,6 +173,30 @@ def preprocess_media_tags(element): element.drop_tree() +def move_to_top(body): + # this should be improved to include nested elements (like lists) + # still buggy + elements = body.xpath('./*/figure') + for element in elements: + preceding_elements = element.xpath('./preceding-sibling::*') + parent = element.getparent() + if len(preceding_elements) > 0 or parent.text and len(parent.text) > 0: + + new_container = _create_element(parent.tag) + new_container.text = parent.text + parent.text = '' + parent.addprevious(new_container) + + for preceding in preceding_elements: + new_container.append(preceding) + + parent_for_figure = element.xpath('./ancestor::*[parent::body]')[0] + # tail 
leaves inside parent + element.drop_tree() + element.tail = '' + parent_for_figure.addprevious(element) + + def preprocess_fragments(fragments): bad_tags = [] @@ -181,23 +205,6 @@ def preprocess_fragments(fragments): body = fragments[0].getparent() - for fragment in fragments: - last_element = fragment - # figure should be on the top level - if fragment.find('figure') is not None: - f = fragment.find('figure') - last_element = _insert_after(f, last_element) - - images_to_wrap = fragment.xpath('.//self::img[not(ancestor::figure)]') - for image in images_to_wrap: - figure = _create_element('figure') - last_element = _insert_after(figure, last_element) - figure.append(image) - - if image.tail: - _insert_after(_create_element('p', text=image.tail), last_element) - image.tail = '' - # bad iframes ns = {'re': "http://exslt.org/regular-expressions"} bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns)) @@ -210,7 +217,8 @@ def preprocess_fragments(fragments): # bad lists (remove lists/list items if empty) nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li') bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0]) - + # remove links with images inside + bad_tags.extend(body.xpath('.//a[descendant::img]')) for bad_tag in bad_tags: bad_tag.drop_tag() if bad_tag in fragments: @@ -229,6 +237,10 @@ def preprocess_fragments(fragments): fragment.addnext(paragraph) fragment.tail = '' + images_to_wrap = body.xpath('.//img[not(ancestor::figure)]') + for image in images_to_wrap: + _wrap_figure(image) + return len(body.getchildren()) and body or None @@ -284,7 +296,7 @@ def convert_html_to_telegraph_format(html_string, clean_html=True): desc = [x for x in body.iterdescendants()] for tag in desc: preprocess_media_tags(tag) - + move_to_top(body) post_process(body) else: fragments = _fragments_from_string(html_string) diff --git a/setup.py b/setup.py index 72c2895..5f897b0 100644 
--- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.3', + version='0.1.4', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', diff --git a/tests/test.py b/tests/test.py index 156e21d..35b0f11 100644 --- a/tests/test.py +++ b/tests/test.py @@ -313,8 +313,9 @@ def test_iframe(self): ) self.assertJson( [ - {"tag": "p", "children": [{"tag": "figure", "children": [{"tag": "iframe", "attrs": { - "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]}, "Text after "]} + {"tag": "figure", "children": [{"tag": "iframe", "attrs": { + "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]}, + {"tag": "p", "children": ["Text after "]} ], convert_html_to_telegraph_format(iframe_text_after, clean_html=True) ) From f0d43d8991640ae9f1835cdc06ae9a1f6264adf6 Mon Sep 17 00:00:00 2001 From: Yolk Date: Sat, 21 Jan 2017 16:03:31 +0200 Subject: [PATCH 15/21] More tests --- tests/test.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/test.py b/tests/test.py index 35b0f11..5572657 100644 --- a/tests/test.py +++ b/tests/test.py @@ -267,6 +267,14 @@ def test_iframe(self): iframe_vimeo = '' mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src iframe_with_figure = '
    Text after
    ' + + multiple_iframes = '

    '\ + 'Text before'\ + 'link text'\ + 'link2 Text after link'\ + ''\ + '

    ' + self.assertJson( [ {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': { @@ -326,7 +334,16 @@ def test_iframe(self): ], convert_html_to_telegraph_format(iframe_with_figure, clean_html=True) ) - + self.assertJson( + [ + {"tag": "p", "children": ["Text before", {"tag": "a", "attrs": {"href": "/123"}, "children": ["link"]}]}, + {"tag": "figure", "children": [{"tag": "iframe", "attrs": { + "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]}, + {"tag": "p", "children": [" text", {"tag": "a", "attrs": {"href": "/246"}, "children": ["link2"]}, " Text after link"]}, + {"tag": "figure", "children": [{"tag": "iframe", "attrs": { + "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dxyzxyzxyz"}}]}], + convert_html_to_telegraph_format(multiple_iframes, clean_html=True) + ) def test_twitter_links(self): html = ''' From 2dfc835d64463090e7516d8f7abbf82a8d19b71c Mon Sep 17 00:00:00 2001 From: Yolk Date: Sat, 21 Jan 2017 23:56:48 +0200 Subject: [PATCH 16/21] Remove class attribute --- html_telegraph_poster/html_to_telegraph.py | 5 +++++ setup.py | 2 +- tests/test.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 0c1d24f..a1d393b 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -255,6 +255,11 @@ def post_process(body): # group following pre elements into single one (telegraph is buggy) join_following_elements(body.xpath('//pre'), join_string="\n") + # remove class attributes for all + elements_with_class = body.xpath('.//*[@class]') + for element in elements_with_class: + element.attrib.pop('class') + def _recursive_convert(element): diff --git a/setup.py b/setup.py index 5f897b0..78c49fc 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.4', + version='0.1.5', 
description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', diff --git a/tests/test.py b/tests/test.py index 5572657..8a1b27f 100644 --- a/tests/test.py +++ b/tests/test.py @@ -465,7 +465,7 @@ def hello_world(): ) self.assertJson( [ - {"tag": "pre", "attrs": {"class": "code"}, "children": [ + {"tag": "pre", "children": [ "\n def hello_world():\n print(\"hello\")\n \nprint(\"second pre\")"]}, {"tag": "p", "children": [" Text after pre "]} ], From 75a1a1f8971c09536f1daae1cbf7b7572eba7c4e Mon Sep 17 00:00:00 2001 From: Yolk Date: Sun, 22 Jan 2017 16:18:53 +0200 Subject: [PATCH 17/21] Fixed case, where some elements skipped convertation --- html_telegraph_poster/html_to_telegraph.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index a1d393b..8c1aed1 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -224,7 +224,7 @@ def preprocess_fragments(fragments): if bad_tag in fragments: fragments.remove(bad_tag) - for fragment in fragments: + for fragment in body.getchildren(): if fragment.tag not in allowed_top_level_tags: paragraph = _create_element('p') fragment.addprevious(paragraph) diff --git a/setup.py b/setup.py index 78c49fc..b51191d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.5', + version='0.1.6', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', From 39b980b3a632f5af9d169c468412747cd6e26fa1 Mon Sep 17 00:00:00 2001 From: Yolk Date: Fri, 27 Jan 2017 17:32:49 +0200 Subject: [PATCH 18/21] Fixed regular expression for pre tags --- html_telegraph_poster/html_to_telegraph.py | 2 +- setup.py | 2 
+- tests/test.py | 25 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 8c1aed1..854249d 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -18,7 +18,7 @@ youtube_re = r'(https?:)?//(www\.)?youtube(-nocookie)?\.com/embed/' vimeo_re = r'(https?:)?//player\.vimeo\.com/video/(\d+)' twitter_re = re.compile(r'(https?:)?//(www\.)?twitter\.com/[A-Za-z0-9_]{1,15}/status/\d+') -pre_content_re = re.compile(r']*>[^<]*
    ') +pre_content_re = re.compile(r']*>[\s\S]*?
    ') line_breaks_and_empty_strings = re.compile('(^[\s\t]*)?\r?\n', flags=re.MULTILINE) diff --git a/setup.py b/setup.py index b51191d..7ae1b48 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.6', + version='0.1.7', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', diff --git a/tests/test.py b/tests/test.py index 8a1b27f..43c1ac1 100644 --- a/tests/test.py +++ b/tests/test.py @@ -455,6 +455,17 @@ def hello_world():
      String anotherCodeBlock5 = "separated code block5"

    paragraph splitter

      String anotherCodeBlock6 = "separated code block6"
    + ''' + html4 = ''' +

    $

    mkvirtualenv myvirtualenv --python

    =

    /usr/bin/python3.4 + + +Running virtualenv with interpreter /usr/bin/python3.4 +Using base prefix

    '/usr'

    +New python executable in myvirtualenv/bin/python3.4 +Also creating executable in myvirtualenv/bin/python +Installing setuptools, pip...done. +
    ''' self.assertJson( [ @@ -487,6 +498,20 @@ def hello_world(): convert_html_to_telegraph_format(html3, clean_html=True) ) + print(convert_html_to_telegraph_format(html4, clean_html=True)) + self.assertJson( + [ + {"tag": "pre", "children": [ + "$ mkvirtualenv myvirtualenv --python=/usr/bin/python3.4\n\n\n" + "Running virtualenv with interpreter /usr/bin/python3.4\n" + "Using base prefix '/usr'\n" + "New python executable in myvirtualenv/bin/python3.4\n" + "Also creating executable in myvirtualenv/bin/python\n" + "Installing setuptools, pip...done.\n"]} + ], + convert_html_to_telegraph_format(html4, clean_html=True) + ) + class UploadImageTest(unittest.TestCase): From 87c823dd80f64c2a1519afc2c7e9d92bb159ac3d Mon Sep 17 00:00:00 2001 From: Yolk Date: Fri, 27 Jan 2017 18:03:18 +0200 Subject: [PATCH 19/21] Removed print --- tests/test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test.py b/tests/test.py index 43c1ac1..9a7c375 100644 --- a/tests/test.py +++ b/tests/test.py @@ -497,8 +497,6 @@ def hello_world(): ], convert_html_to_telegraph_format(html3, clean_html=True) ) - - print(convert_html_to_telegraph_format(html4, clean_html=True)) self.assertJson( [ {"tag": "pre", "children": [ From ee24b6dbc23e874bebd5c4ebf2e9e5f0b355825a Mon Sep 17 00:00:00 2001 From: Yolk Date: Tue, 31 Jan 2017 17:27:49 +0200 Subject: [PATCH 20/21] Added code tag support --- html_telegraph_poster/html_to_telegraph.py | 11 +++++++++-- setup.py | 2 +- tests/test.py | 14 ++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 854249d..235ac36 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -11,14 +11,14 @@ base_url = 'https://telegra.ph' save_url = 'https://edit.telegra.ph/save' default_user_agent = 'Python_telegraph_poster/0.1' -allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'em', 'figcaption', 
'figure', 'h3', 'h4', 'hr', 'i', +allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'code', 'em', 'figcaption', 'figure', 'h3', 'h4', 'hr', 'i', 'iframe', 'img', 'li', 'ol', 'p', 'pre', 's', 'strong', 'u', 'ul', 'video'] allowed_top_level_tags = ['aside', 'blockquote', 'pre', 'figure', 'h3', 'h4', 'hr', 'ol', 'p', 'ul'] youtube_re = r'(https?:)?//(www\.)?youtube(-nocookie)?\.com/embed/' vimeo_re = r'(https?:)?//player\.vimeo\.com/video/(\d+)' twitter_re = re.compile(r'(https?:)?//(www\.)?twitter\.com/[A-Za-z0-9_]{1,15}/status/\d+') -pre_content_re = re.compile(r']*>[\s\S]*?') +pre_content_re = re.compile(r'<(pre|code)(>|\s[^>]*>)[\s\S]*?') line_breaks_and_empty_strings = re.compile('(^[\s\t]*)?\r?\n', flags=re.MULTILINE) @@ -224,6 +224,13 @@ def preprocess_fragments(fragments): if bad_tag in fragments: fragments.remove(bad_tag) + # code - > pre + # convert multiline code into pre + code_elements = body.xpath('.//code') + for code_element in code_elements: + if '\n' in code_element.text: + code_element.tag = 'pre' + for fragment in body.getchildren(): if fragment.tag not in allowed_top_level_tags: paragraph = _create_element('p') diff --git a/setup.py b/setup.py index 7ae1b48..f1f3709 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.7', + version='0.1.8', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster', diff --git a/tests/test.py b/tests/test.py index 9a7c375..13256c8 100644 --- a/tests/test.py +++ b/tests/test.py @@ -466,6 +466,12 @@ def hello_world(): Also creating executable in myvirtualenv/bin/python Installing setuptools, pip...done. + ''' + html5 = ''' +

    Text before inline_code = True Text after

    + multiline_code = True + next_line = True + ''' self.assertJson( [ @@ -509,6 +515,14 @@ def hello_world(): ], convert_html_to_telegraph_format(html4, clean_html=True) ) + self.assertJson( + [ + {"tag": "p", + "children": ["Text before ", {"tag": "code", "children": [" inline_code = True"]}, " Text after"]}, + {"tag": "pre", "children": [" multiline_code = True\n next_line = True\n "]} + ], + convert_html_to_telegraph_format(html5, clean_html=True) + ) class UploadImageTest(unittest.TestCase): From dce5625e0db69cb65340152a83b05a8ee5f67914 Mon Sep 17 00:00:00 2001 From: Yolk Date: Tue, 31 Jan 2017 21:36:26 +0200 Subject: [PATCH 21/21] slightly changed Class interface --- html_telegraph_poster/html_to_telegraph.py | 13 ++++++------- setup.py | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py index 235ac36..4e4d441 100644 --- a/html_telegraph_poster/html_to_telegraph.py +++ b/html_telegraph_poster/html_to_telegraph.py @@ -367,7 +367,7 @@ def upload_to_telegraph(title, author, text, author_url='', tph_uuid=None, page_ class TelegraphPoster(object): - def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, clean_html=True): + def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, clean_html=True, convert_html=True): self.title = None self.author = None self.author_url = None @@ -376,17 +376,15 @@ def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, c self.page_id = page_id self.user_agent = user_agent self.clean_html = clean_html + self.convert_html = convert_html def post(self, title, author, text, author_url=''): - result = self.edit( - title, - author, - text - ) self.title = title self.author = author self.author_url = author_url self.text = text + result = self.edit() + self.tph_uuid = result['tph_uuid'] self.page_id = result['page_id'] return result @@ -400,5 +398,6 @@ def edit(self, 
title=None, author=None, text=None): tph_uuid=self.tph_uuid, page_id=self.page_id, user_agent=self.user_agent, - clean_html=self.clean_html + clean_html=self.clean_html, + convert_html=self.convert_html ) diff --git a/setup.py b/setup.py index f1f3709..fef7b6d 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='html_telegraph_poster', - version='0.1.8', + version='0.1.9', description='Posts your html to telegra.ph blogging service', keywords='telegra.ph post html telegram', url='https://github.com/mercuree/html-telegraph-poster',