From c6afb218a59df0db583799438cbd2c93a53970e0 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Tue, 10 Jan 2017 10:31:05 +0200
Subject: [PATCH 01/21] Html processing refactoring

---
 html_telegraph_poster/html_to_telegraph.py | 110 +++++++++++++--------
 tests/test.py                              |  27 +++++
 2 files changed, 96 insertions(+), 41 deletions(-)
diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 9f39414..5176628 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -49,13 +49,40 @@ def clean_article_html(html_string):
     return html_string.strip(' \t')
 
 
-def _wrap_tag(element, wrapper):
+def _create_element(element):
+    # creates lxml element without document tree (no body, no parents)
     new_element = html.HtmlElement()
-    new_element.tag = wrapper
+    new_element.tag = element
+    return new_element
+
+
+def _wrap_tag(element, wrapper):
+    new_element = _create_element(wrapper)
     new_element.append(element)
     return new_element
 
 
+def _fragments_from_string(html_string):
+    fragments = html.fragments_fromstring(html_string)
+    if not len(fragments):
+        return []
+    # convert and append text node before starting tag
+    if not isinstance(fragments[0], html.HtmlElement):
+        if len(fragments[0].strip()) > 0:
+            if len(fragments) == 1:
+                return html.fragments_fromstring('<p>%s</p>' % fragments[0])
+            else:
+                paragraph = _create_element('p')
+                paragraph.text = fragments[0]
+                fragments[1].addprevious(paragraph)
+                fragments.insert(1, paragraph)
+
+        fragments.pop(0)
+        if not len(fragments):
+            return []
+    return fragments
+
+
 def preprocess_media_tags(element):
     if isinstance(element, html.HtmlElement):
         if element.tag == 'figcaption':
@@ -78,7 +105,8 @@ def preprocess_media_tags(element):
                 elif vimeo:
                     element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2)))
 
-                element = _wrap_tag(element, 'figure')
+                element.addprevious(_create_element('figure'))
+                element.getprevious().append(element)
         elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet':
             twitter_links = element.xpath('.//a')
             for tw_link in twitter_links:
@@ -86,34 +114,31 @@ def preprocess_media_tags(element):
                     twitter_frame = html.HtmlElement()
                     twitter_frame.tag = 'iframe'
                     twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href')))
-                    element = _wrap_tag(twitter_frame, 'figure')
-
-    return element
+                    element.addprevious(_wrap_tag(twitter_frame, 'figure'))
+                    element.drop_tree()
 
 
 def preprocess_fragments(fragments):
-    processed_fragments = []
     bad_tags = []
 
     if not len(fragments):
-        return processed_fragments
+        return None
 
-    # convert and append text node before starting tag
-    if not isinstance(fragments[0], html.HtmlElement):
-        if len(fragments[0].strip()) > 0:
-            processed_fragments.append(html.fromstring('<p>%s</p>' % fragments[0]))
-        fragments.pop(0)
-        if not len(fragments):
-            return processed_fragments
+    body = fragments[0].getparent()
 
     for fragment in fragments:
         # figure should be on the top level
         if fragment.find('figure') is not None:
             f = fragment.find('figure')
-            processed_fragments.append(f)
-            fragment.remove(f)
+            body.append(f)
+
+        images_to_wrap = fragment.xpath('.//self::p[not(normalize-space(string()))]//img')
+        if len(images_to_wrap):
+            for image in images_to_wrap:
+                image.tail = ''
+                body.append(_wrap_tag(image, 'figure'))
 
-        processed_fragments.append(fragment)
+        body.append(fragment)
     # bad iframes
     ns = {'re': "http://exslt.org/regular-expressions"}
     bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns))
@@ -125,15 +150,25 @@ def preprocess_fragments(fragments):
         bad_tag.drop_tag()
         if bad_tag in fragments:
             fragments.remove(bad_tag)
-        if bad_tag in processed_fragments:
-            processed_fragments.remove(bad_tag)
 
-    return processed_fragments
+    for fragment in fragments:
+        if fragment.tag not in allowed_top_level_tags:
+            paragraph = _create_element('p')
+            fragment.addprevious(paragraph)
+            paragraph.append(fragment)
+        else:
+            # convert and append text nodes after closing tag
+            if fragment.tail and len(fragment.tail.strip()) != 0:
+                paragraph = _create_element('p')
+                paragraph.text = fragment.tail
+                fragment.addnext(paragraph)
+                fragment.tail = ''
+
+    return len(body.getchildren()) and body
 
 
 def _recursive_convert(element):
 
-    element = preprocess_media_tags(element)
     fragment_root_element = {
         'tag': element.tag
     }
@@ -165,26 +200,19 @@ def convert_html_to_telegraph_format(html_string, clean_html=True):
     if clean_html:
         html_string = clean_article_html(html_string)
 
-    fragments = preprocess_fragments(
-        html.fragments_fromstring(html_string)
-    )
-    content = []
-
-    for fragment in fragments:
-
-        if fragment.tag not in allowed_top_level_tags:
-            paragraph = html.HtmlElement()
-            paragraph.tag = 'p'
-            paragraph.append(fragment)
-            content.append(_recursive_convert(paragraph))
-        else:
-            content.append(_recursive_convert(fragment))
+        body = preprocess_fragments(
+            _fragments_from_string(html_string)
+        )
+        if body:
+            for x in body.iterdescendants():
+                preprocess_media_tags(x)
+    else:
+        fragments = _fragments_from_string(html_string)
+        body = fragments[0].getparent() if len(fragments) else None
 
-            # convert and append text nodes after closing tag
-            if fragment.tail and len(fragment.tail.strip()) != 0:
-                content.append(
-                    _recursive_convert(html.fromstring('<p>%s</p>' % fragment.tail))
-                )
+    content = []
+    if body:
+        content = [_recursive_convert(x) for x in body.iterchildren()]
 
     return json.dumps(content, ensure_ascii=False)
 
diff --git a/tests/test.py b/tests/test.py
index 25c1d8a..6d3fd40 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -13,6 +13,18 @@ def assertJson(self, first, second):
             json.loads(second)
         )
 
+    def test_text_only(self):
+        html = 'only plain text'
+        html_empty_string = '               '
+        self.assertJson(
+            [{'children': ['only plain text'], 'tag': 'p'}],
+            convert_html_to_telegraph_format(html, clean_html=True)
+        )
+        self.assertJson(
+            [],
+            convert_html_to_telegraph_format(html_empty_string, clean_html=True)
+        )
+
     def test_text_on_top(self):
         html = '''
 <div>
@@ -324,6 +336,21 @@ def test_lists(self):
             convert_html_to_telegraph_format(empty_list, clean_html=True)
         )
 
+    def test_convert_without_clean(self):
+        # without cleaning, br tags are kept as separate elements (not collapsed into one line break)
+        html = 'Text first line' \
+               '<br><br /> <br class="somebrclass">  <div>' \
+               '</div> <br id="somebrid"/> <p>text</p> <br>' \
+               '<span><em><strong><i></i><u></u></strong></em></span>'
+        self.assertJson(
+            [{'tag': 'p', 'children': ['Text first line']}, {'tag': 'br'}, {'tag': 'br'},
+             {'tag': 'br', 'attrs': {'class': 'somebrclass'}}, {'tag': 'div'},
+             {'tag': 'br', 'attrs': {'id': 'somebrid'}}, {'tag': 'p', 'children': ['text']}, {'tag': 'br'},
+             {'tag': 'span',
+              'children': [{'tag': 'em', 'children': [{'tag': 'strong', 'children': [{'tag': 'i'}, {'tag': 'u'}]}]}]}],
+            convert_html_to_telegraph_format(html, clean_html=False)
+        )
+
 
 class UploadImageTest(unittest.TestCase):
 

From 23e4a923d1f07540299fa25970864f8ef2688691 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Tue, 10 Jan 2017 10:52:34 +0200
Subject: [PATCH 02/21] Html processing refactoring (stage 2)

---
 html_telegraph_poster/html_to_telegraph.py | 24 +++++++++---
 tests/test.py                              | 45 ++++++++++++++--------
 2 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 5176628..9e8c57d 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -49,10 +49,12 @@ def clean_article_html(html_string):
     return html_string.strip(' \t')
 
 
-def _create_element(element):
+def _create_element(element, text=None):
     # creates lxml element without document tree (no body, no parents)
     new_element = html.HtmlElement()
     new_element.tag = element
+    if text:
+        new_element.text = text
     return new_element
 
 
@@ -132,18 +134,19 @@ def preprocess_fragments(fragments):
             f = fragment.find('figure')
             body.append(f)
 
-        images_to_wrap = fragment.xpath('.//self::p[not(normalize-space(string()))]//img')
+        images_to_wrap = fragment.xpath('.//self::img[not(ancestor::figure)]')
         if len(images_to_wrap):
             for image in images_to_wrap:
-                image.tail = ''
                 body.append(_wrap_tag(image, 'figure'))
+                if image.tail:
+                    body.append(_create_element('p', text=image.tail))
+                    image.tail = ''
 
-        body.append(fragment)
     # bad iframes
     ns = {'re': "http://exslt.org/regular-expressions"}
     bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns))
     # bad lists (remove lists/list items if empty)
-    nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li|//p')
+    nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li')
     bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0])
 
     for bad_tag in bad_tags:
@@ -167,6 +170,15 @@ def preprocess_fragments(fragments):
     return len(body.getchildren()) and body
 
 
+def post_process(body):
+
+    bad_tags = body.xpath('//p|//a')
+
+    for x in bad_tags:
+        if len(x.text_content().strip()) == 0:
+            x.drop_tag()
+
+
 def _recursive_convert(element):
 
     fragment_root_element = {
@@ -206,6 +218,8 @@ def convert_html_to_telegraph_format(html_string, clean_html=True):
         if body:
             for x in body.iterdescendants():
                 preprocess_media_tags(x)
+
+            post_process(body)
     else:
         fragments = _fragments_from_string(html_string)
         body = fragments[0].getparent() if len(fragments) else None
diff --git a/tests/test.py b/tests/test.py
index 6d3fd40..3aa7d21 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -131,7 +131,7 @@ def test_image_inside_paragraph(self):
         html = '<p> <img src="image0.jpg"/></p>' \
                '<p>  <span> <img src="image1.jpg"/>   </span> <img src="image2.jpg"/> </p>'
 
-        para_with_text = '<p>  <span> <img src="image1.jpg"/>abc </span> </p>'
+        para_with_text = '<p> abc <span> <img src="image1.jpg"/>xyz </span> </p>'
         para_with_figure = '<p> <figure> <img src="image0.jpg"/> <figcaption>test</figcaption></figure> </p>'
 
         self.assertJson(
@@ -142,9 +142,12 @@ def test_image_inside_paragraph(self):
             ],
             convert_html_to_telegraph_format(html, clean_html=True)
         )
+
         self.assertJson(
             [
-                {'tag': 'p', 'children': ['   ', {'tag': 'img', 'attrs': {'src': 'image1.jpg'}}, 'abc  ']}
+                {"tag": "p", "children": [" abc  "]},
+                {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image1.jpg"}}]},
+                {"tag": "p", "children": ["xyz  "]}
             ],
             convert_html_to_telegraph_format(para_with_text, clean_html=True)
         )
@@ -158,19 +161,20 @@ def test_image_inside_paragraph(self):
 
     def test_image_tag_at_the_top(self):
         html = '<img src="image.jpg" title="image"/>'
-        html_with_text_after = '<img src="image.jpg" title="image"/> Text after'
-        html_with_text_before = 'Text before <img src="image.jpg" title="image"/>'
+        html_with_text_after = '<img src="image1.jpg" title="image"/> Text after'
+        html_with_text_before = 'Text before <img src="image0.jpg" title="image"/>'
         html_joined = html_with_text_before + html_with_text_after
         self.assertJson(
             [
-                {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "p"}
+                {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "figure"}
             ],
             convert_html_to_telegraph_format(html, clean_html=True)
         )
 
         self.assertJson(
             [
-                {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}, ' Text after'], "tag": "p"}
+                {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image1.jpg"}}]},
+                {"tag": "p", "children": [" Text after"]}
             ],
             convert_html_to_telegraph_format(html_with_text_after, clean_html=True)
         )
@@ -178,15 +182,17 @@ def test_image_tag_at_the_top(self):
         self.assertJson(
             [
                 {"children": ["Text before "], "tag": "p"},
-                {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "p"}
+                {"children": [{"attrs": {"src": "image0.jpg"}, "tag": "img"}], "tag": "figure"}
             ],
             convert_html_to_telegraph_format(html_with_text_before, clean_html=True)
         )
+
         self.assertJson(
             [
-                {"children": ["Text before "], "tag": "p"},
-                {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}], "tag": "p"},
-                {"children": [{"attrs": {"src": "image.jpg"}, "tag": "img"}, " Text after"], "tag": "p"}
+                {"tag": "p", "children": ["Text before "]},
+                {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image0.jpg"}}]},
+                {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image1.jpg"}}]},
+                {"tag": "p", "children": [" Text after"]}
             ],
             convert_html_to_telegraph_format(html_joined, clean_html=True)
         )
@@ -238,8 +244,8 @@ def test_iframe(self):
         mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src
         self.assertJson(
             [
-                {'tag': 'p', 'children': [{'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {
-                'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]}]}
+                {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {
+                'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]}
              ],
             convert_html_to_telegraph_format(html, clean_html=True)
         )
@@ -258,8 +264,8 @@ def test_iframe(self):
 
         self.assertJson(
             [
-                {'tag': 'p', 'children': [{'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {
-                    'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]}]}
+                {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {
+                'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]}
             ],
             convert_html_to_telegraph_format(mix, clean_html=True)
         )
@@ -276,7 +282,7 @@ def test_iframe(self):
         )
         self.assertJson(
             [
-                {u'tag': u'p', u'children': [{u'tag': u'figure', u'children': [{u'tag': u'iframe', u'attrs': {u'src': u'/embed/vimeo?url=https%3A%2F%2Fvimeo.com%2F1185346'}}]}]}
+                {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {'src': '/embed/vimeo?url=https%3A%2F%2Fvimeo.com%2F1185346'}}]}
             ],
             convert_html_to_telegraph_format(iframe_vimeo, clean_html=True)
         )
@@ -351,6 +357,15 @@ def test_convert_without_clean(self):
             convert_html_to_telegraph_format(html, clean_html=False)
         )
 
+    def test_empty_links(self):
+        html = '<a href="http://example.com/">   <img src="http://httpbin.org/image/jpeg"/>   </a>'
+
+        self.assertJson(
+            [
+                {'tag': 'figure', 'children': [{'tag': 'img', 'attrs': {'src': 'http://httpbin.org/image/jpeg'}}]}
+            ],
+            convert_html_to_telegraph_format(html, clean_html=True)
+        )
 
 class UploadImageTest(unittest.TestCase):
 

From ec498535250d92da46da792084924e8ece8a7143 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Tue, 10 Jan 2017 19:28:46 +0200
Subject: [PATCH 03/21] Html processing refactoring (stage 3)

---
 html_telegraph_poster/html_to_telegraph.py | 34 ++++++++++++++--------
 tests/test.py                              | 25 +++++++++++++++-
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 9e8c57d..4530d07 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -58,6 +58,12 @@ def _create_element(element, text=None):
     return new_element
 
 
+def _insert_after(element, ref):
+    parent = ref.getparent()
+    parent.insert(parent.index(ref) + 1, element)
+    return element
+
+
 def _wrap_tag(element, wrapper):
     new_element = _create_element(wrapper)
     new_element.append(element)
@@ -129,18 +135,21 @@ def preprocess_fragments(fragments):
     body = fragments[0].getparent()
 
     for fragment in fragments:
+        last_element = fragment
         # figure should be on the top level
         if fragment.find('figure') is not None:
             f = fragment.find('figure')
-            body.append(f)
+            last_element = _insert_after(f, last_element)
 
         images_to_wrap = fragment.xpath('.//self::img[not(ancestor::figure)]')
-        if len(images_to_wrap):
-            for image in images_to_wrap:
-                body.append(_wrap_tag(image, 'figure'))
-                if image.tail:
-                    body.append(_create_element('p', text=image.tail))
-                    image.tail = ''
+        for image in images_to_wrap:
+            figure = _create_element('figure')
+            last_element = _insert_after(figure, last_element)
+            figure.append(image)
+
+            if image.tail:
+                _insert_after(_create_element('p', text=image.tail), last_element)
+                image.tail = ''
 
     # bad iframes
     ns = {'re': "http://exslt.org/regular-expressions"}
@@ -167,7 +176,7 @@ def preprocess_fragments(fragments):
                 fragment.addnext(paragraph)
                 fragment.tail = ''
 
-    return len(body.getchildren()) and body
+    return len(body.getchildren()) and body or None
 
 
 def post_process(body):
@@ -215,9 +224,10 @@ def convert_html_to_telegraph_format(html_string, clean_html=True):
         body = preprocess_fragments(
             _fragments_from_string(html_string)
         )
-        if body:
-            for x in body.iterdescendants():
-                preprocess_media_tags(x)
+        if body is not None:
+            desc = [x for x in body.iterdescendants()]
+            for tag in desc:
+                preprocess_media_tags(tag)
 
             post_process(body)
     else:
@@ -225,7 +235,7 @@ def convert_html_to_telegraph_format(html_string, clean_html=True):
         body = fragments[0].getparent() if len(fragments) else None
 
     content = []
-    if body:
+    if body is not None:
         content = [_recursive_convert(x) for x in body.iterchildren()]
 
     return json.dumps(content, ensure_ascii=False)
diff --git a/tests/test.py b/tests/test.py
index 3aa7d21..31ca714 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -133,7 +133,8 @@ def test_image_inside_paragraph(self):
 
         para_with_text = '<p> abc <span> <img src="image1.jpg"/>xyz </span> </p>'
         para_with_figure = '<p> <figure> <img src="image0.jpg"/> <figcaption>test</figcaption></figure> </p>'
-
+        para_img1 = '<p>Text 1 <figure> <img src="image0.jpg"/> <figcaption>test</figcaption></figure> </p><p>Text 2<p>'
+        para_img2 = '<p> Text 1 <img src="image0.jpg"/>Text after image </p><p>Text 2 </p>'
         self.assertJson(
             [
                 {"children": [{"attrs": {"src": "image0.jpg"}, "tag": "img"}], "tag": "figure"},
@@ -159,6 +160,28 @@ def test_image_inside_paragraph(self):
             convert_html_to_telegraph_format(para_with_figure, clean_html=True)
         )
 
+        self.assertJson(
+            [
+                {"tag": "p", "children": ["Text 1 "]},
+                {
+                    "tag": "figure", "children":
+                    [" ", {"tag": "img", "attrs": {"src": "image0.jpg"}}, " ",
+                        {"tag": "figcaption", "children": ["test"]}]
+                },
+                {"tag": "p", "children": ["Text 2"]}
+            ],
+            convert_html_to_telegraph_format(para_img1, clean_html=True)
+        )
+
+        self.assertJson(
+            [
+                {"tag": "p", "children": [" Text 1 "]},
+                {"tag": "figure", "children": [{"tag": "img", "attrs": {"src": "image0.jpg"}}]},
+                {"tag": "p", "children": ["Text after image "]}, {"tag": "p", "children": ["Text 2 "]}
+             ],
+            convert_html_to_telegraph_format(para_img2, clean_html=True)
+        )
+
     def test_image_tag_at_the_top(self):
         html = '<img src="image.jpg" title="image"/>'
         html_with_text_after = '<img src="image1.jpg" title="image"/> Text after'

From 42c8ea6d032ea6a6fd61392d390140e9a1f01fae Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Tue, 10 Jan 2017 22:20:06 +0200
Subject: [PATCH 04/21] Version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 76f9aa9..ccd7fe9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.0.22',
+      version='0.1.0',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',

From 5dc84b8433bf079aaecb495631d272a1f28b368c Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Wed, 11 Jan 2017 11:48:24 +0200
Subject: [PATCH 05/21] Option to disable html cleaning and processing

---
 html_telegraph_poster/html_to_telegraph.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 4530d07..c414275 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -242,14 +242,15 @@ def convert_html_to_telegraph_format(html_string, clean_html=True):
 
 
 def _upload(title, author, text,
-            author_url='', tph_uuid=None, page_id=None, user_agent=default_user_agent, convert_html=True):
+            author_url='', tph_uuid=None, page_id=None, user_agent=default_user_agent, convert_html=True,
+            clean_html=True):
 
     if not title:
         raise TitleRequiredError('Title is required')
     if not text:
         raise TextRequiredError('Text is required')
 
-    content = convert_html_to_telegraph_format(text) if convert_html else text
+    content = convert_html_to_telegraph_format(text, clean_html) if convert_html else text
     cookies = dict(tph_uuid=tph_uuid) if tph_uuid and page_id else None
 
     fields = {
@@ -286,7 +287,7 @@ def upload_to_telegraph(title, author, text, author_url='', tph_uuid=None, page_
 
 
 class TelegraphPoster(object):
-    def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent):
+    def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, clean_html=True):
         self.title = None
         self.author = None
         self.author_url = None
@@ -294,6 +295,7 @@ def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent):
         self.tph_uuid = tph_uuid
         self.page_id = page_id
         self.user_agent = user_agent
+        self.clean_html = clean_html
 
     def post(self, title, author, text, author_url=''):
         result = self.edit(
@@ -310,12 +312,13 @@ def post(self, title, author, text, author_url=''):
         return result
 
     def edit(self, title=None, author=None, text=None):
-        return upload_to_telegraph(
+        return _upload(
             title=title or self.title,
             author=author or self.author,
             text=text or self.text,
             author_url=self.author_url,
             tph_uuid=self.tph_uuid,
             page_id=self.page_id,
-            user_agent=self.user_agent
+            user_agent=self.user_agent,
+            clean_html=self.clean_html
         )

From e46503b3248c15f4962dcb0ca9b3c8c7bc32756e Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Wed, 11 Jan 2017 12:48:14 +0200
Subject: [PATCH 06/21] Optimizing bad tags processing

---
 html_telegraph_poster/html_to_telegraph.py | 7 +++----
 tests/test.py                              | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index c414275..385f165 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -93,10 +93,7 @@ def _fragments_from_string(html_string):
 
 def preprocess_media_tags(element):
     if isinstance(element, html.HtmlElement):
-        if element.tag == 'figcaption':
-            # figcaption may have only text content
-            [e.drop_tag() for e in element.findall('*')]
-        elif element.tag in ['ol', 'ul']:
+        if element.tag in ['ol', 'ul']:
             # ignore any spaces between <ul> and <li>
             element.text = ''
         elif element.tag == 'li':
@@ -154,6 +151,8 @@ def preprocess_fragments(fragments):
     # bad iframes
     ns = {'re': "http://exslt.org/regular-expressions"}
     bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns))
+    # figcaption may have only text content
+    bad_tags.extend(fragments[-1].xpath("//figcaption//*"))
     # bad lists (remove lists/list items if empty)
     nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li')
     bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0])
diff --git a/tests/test.py b/tests/test.py
index 31ca714..b9e5f74 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -133,7 +133,7 @@ def test_image_inside_paragraph(self):
 
         para_with_text = '<p> abc <span> <img src="image1.jpg"/>xyz </span> </p>'
         para_with_figure = '<p> <figure> <img src="image0.jpg"/> <figcaption>test</figcaption></figure> </p>'
-        para_img1 = '<p>Text 1 <figure> <img src="image0.jpg"/> <figcaption>test</figcaption></figure> </p><p>Text 2<p>'
+        para_img1 = '<p>Text 1 <figure> <img src="image0.jpg"/> <figcaption><em>test</em></figcaption></figure> </p><p>Text 2<p>'
         para_img2 = '<p> Text 1 <img src="image0.jpg"/>Text after image </p><p>Text 2 </p>'
         self.assertJson(
             [

From 2ea0df4217b3720d4a662e7499868e21415c6c49 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Sun, 15 Jan 2017 13:53:56 +0200
Subject: [PATCH 07/21] Added pre tag support

---
 html_telegraph_poster/html_to_telegraph.py | 49 +++++++++++++++--
 tests/test.py                              | 63 ++++++++++++++++++++++
 2 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 385f165..5066a64 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -11,13 +11,15 @@
 base_url = 'https://telegra.ph'
 save_url = 'https://edit.telegra.ph/save'
 default_user_agent = 'Python_telegraph_poster/0.1'
-allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'em', 'figcaption', 'figure', 'h3', 'h4', 'hr', 'i', 'iframe',
-                'img', 'li', 'ol', 'p', 's', 'strong', 'u', 'ul', 'video']
-allowed_top_level_tags = ['aside', 'blockquote', 'figure', 'h3', 'h4', 'hr', 'ol', 'p', 'ul']
+allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'em', 'figcaption', 'figure', 'h3', 'h4', 'hr', 'i',
+                'iframe', 'img', 'li', 'ol', 'p', 'pre', 's', 'strong', 'u', 'ul', 'video']
+allowed_top_level_tags = ['aside', 'blockquote', 'pre', 'figure', 'h3', 'h4', 'hr', 'ol', 'p', 'ul']
 
 youtube_re = r'(https?:)?//(www\.)?youtube(-nocookie)?\.com/embed/'
 vimeo_re = r'(https?:)?//player\.vimeo\.com/video/(\d+)'
 twitter_re = re.compile(r'(https?:)?//(www\.)?twitter\.com/[A-Za-z0-9_]{1,15}/status/\d+')
+pre_content_re = re.compile(r'<pre[^>]*>[^<]*</pre>')
+line_breaks_and_empty_strings = re.compile('(^[\s\t]*)?\r?\n', flags=re.MULTILINE)
 
 
 def clean_article_html(html_string):
@@ -41,14 +43,36 @@ def clean_article_html(html_string):
     cleaned = c.clean_html(html_string)
     # remove wrapped div
     cleaned = cleaned[5:-6]
-    # remove all line breaks and empty strings (in html it means nothing)
-    html_string = re.sub('(^[\s\t]*)?\r?\n', '', cleaned, flags=re.MULTILINE)
+    # remove all line breaks and empty strings
+    html_string = replace_line_breaks_except_pre(cleaned)
     # but replace multiple br tags with one line break, telegraph will convert it to <br class="inline">
     html_string = re.sub(r'(<br(/?>|\s[^<>]*>)\s*)+', '\n', html_string)
 
     return html_string.strip(' \t')
 
 
+def replace_line_breaks_except_pre(html_string):
+    # Remove all line breaks and empty strings, except pre tag
+    # how to make it in one string? :\
+    pre_ranges = [0]
+    out = ''
+
+    # get <pre> start/end position
+    for x in pre_content_re.finditer(html_string):
+        start, end = x.start(), x.end()
+        pre_ranges.extend((start, end))
+    pre_ranges.append(len(html_string))
+
+    # slices with even k are <pre> blocks, leave them untouched (odd k is text outside <pre>)
+    for k in range(1, len(pre_ranges)):
+        part = html_string[pre_ranges[k-1]:pre_ranges[k]]
+        if k % 2 == 0:
+            out += part
+        else:
+            out += line_breaks_and_empty_strings.sub('', part)
+    return out
+
+
 def _create_element(element, text=None):
     # creates lxml element without document tree (no body, no parents)
     new_element = html.HtmlElement()
@@ -153,6 +177,10 @@ def preprocess_fragments(fragments):
     bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns))
     # figcaption may have only text content
     bad_tags.extend(fragments[-1].xpath("//figcaption//*"))
+
+    # drop all tags inside pre
+    bad_tags.extend(fragments[-1].xpath("//pre//*"))
+
     # bad lists (remove lists/list items if empty)
     nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li')
     bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0])
@@ -186,6 +214,17 @@ def post_process(body):
         if len(x.text_content().strip()) == 0:
             x.drop_tag()
 
+    # group following pre elements into single one (telegraph is buggy)
+    pres = body.xpath('//pre')
+    for pre in pres:
+        next_pre = pre.getnext()
+        while next_pre is not None and next_pre.tag == 'pre':
+            pre.text += "\n" + next_pre.text
+            current_pre = next_pre
+            next_pre = next_pre.getnext()
+            pres.remove(current_pre)
+            current_pre.drop_tree()
+
 
 def _recursive_convert(element):
 
diff --git a/tests/test.py b/tests/test.py
index b9e5f74..f1ef916 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -390,6 +390,69 @@ def test_empty_links(self):
             convert_html_to_telegraph_format(html, clean_html=True)
         )
 
+    def test_code_block(self):
+        html = '''<pre>
+        def test_code_block(self):
+            html = ''
+            print("hello world")
+        </pre>'''
+        html2 = '''
+            <p><pre
+            class="code">
+                def hello_world():
+                    print("hello")
+            </pre>
+            <pre>print("second pre")</pre>
+            </p>
+            <p> Text after pre </p>
+        '''
+        html3 = '''
+<pre><code class="python hljs">my_list = [<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>]
+EVEN = slice(<span class="hljs-number">1</span>, <span class="hljs-keyword">None</span>, <span class="hljs-number">2</span>)
+print(my_list[EVEN])     <span class="hljs-comment"># [2, 4, 6]</span>
+</code></pre>
+<p> paragraph splitter</p>
+<pre> String anotherCodeBlock = "separated code block"</pre>
+<pre>  String anotherCodeBlock2 = "separated code block2"</pre>
+<pre>  String anotherCodeBlock3 = "separated code block3"</pre>
+<p> paragraph splitter</p>
+<pre>  String anotherCodeBlock4 = "separated code block4"</pre>
+<pre>  String anotherCodeBlock5 = "separated code block5"</pre>
+<p> paragraph splitter</p>
+<pre>  String anotherCodeBlock6 = "separated code block6"</pre>
+        '''
+        self.assertJson(
+            [
+                {"tag": "pre", "children": [
+                    "\n        def test_code_block(self):\n            html = ''\n            print(\"hello world\")\n        "]}
+            ],
+            convert_html_to_telegraph_format(html, clean_html=True)
+        )
+        self.assertJson(
+            [
+                {"tag": "pre", "attrs": {"class": "code"}, "children": [
+                    "\n                def hello_world():\n                    print(\"hello\")\n            \nprint(\"second pre\")"]},
+                 {"tag": "p", "children": [" Text after pre "]}
+            ],
+            convert_html_to_telegraph_format(html2, clean_html=True)
+        )
+        self.assertJson(
+            [
+                {"tag": "pre", "children": [
+                    "my_list = [1, 2, 3, 4, 5, 6, 7]\nEVEN = slice(1, None, 2)\nprint(my_list[EVEN])     # [2, 4, 6]\n"]},
+                {"tag": "p", "children": [" paragraph splitter"]},
+                {"tag": "pre", "children": [
+                    " String anotherCodeBlock = \"separated code block\"\n  String anotherCodeBlock2 = \"separated code block2\"\n  String anotherCodeBlock3 = \"separated code block3\""]},
+                {"tag": "p", "children": [" paragraph splitter"]},
+                {"tag": "pre", "children": [
+                    "  String anotherCodeBlock4 = \"separated code block4\"\n  String anotherCodeBlock5 = \"separated code block5\""]},
+                {"tag": "p", "children": [" paragraph splitter"]},
+                {"tag": "pre", "children": ["  String anotherCodeBlock6 = \"separated code block6\""]}
+            ],
+            convert_html_to_telegraph_format(html3, clean_html=True)
+        )
+
+
 class UploadImageTest(unittest.TestCase):
 
     def test_upload(self):

From 8a02792ffd4ff5cd21724e3ddff9b96a486eb2e8 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Sun, 15 Jan 2017 13:58:01 +0200
Subject: [PATCH 08/21] Added pre tag support version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ccd7fe9..d3d7841 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.0',
+      version='0.1.1',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',

From 5f242a3134531cc9edae3cee38f2b7f0fb03b528 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Thu, 19 Jan 2017 20:08:37 +0200
Subject: [PATCH 09/21] Minor improvements

---
 html_telegraph_poster/html_to_telegraph.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 5066a64..1edf689 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -218,7 +218,7 @@ def post_process(body):
     pres = body.xpath('//pre')
     for pre in pres:
         next_pre = pre.getnext()
-        while next_pre is not None and next_pre.tag == 'pre':
+        while next_pre is not None and next_pre in pres:
             pre.text += "\n" + next_pre.text
             current_pre = next_pre
             next_pre = next_pre.getnext()

From 68f629fc2ec614b40817c809fab2f90aaa1f0586 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Thu, 19 Jan 2017 20:15:30 +0200
Subject: [PATCH 10/21] Create textnodes from strings

---
 html_telegraph_poster/html_to_telegraph.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 1edf689..f490dd0 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -94,6 +94,17 @@ def _wrap_tag(element, wrapper):
     return new_element
 
 
+def join_following_elements(elements, join_string):
+    for element in elements:
+        next_element = element.getnext()
+        while next_element is not None and next_element in elements:
+            element.text += join_string + next_element.text
+            current = next_element
+            next_element = next_element.getnext()
+            elements.remove(current)
+            current.drop_tree()
+
+
 def _fragments_from_string(html_string):
     fragments = html.fragments_fromstring(html_string)
     if not len(fragments):
@@ -215,15 +226,7 @@ def post_process(body):
             x.drop_tag()
 
     # group following pre elements into single one (telegraph is buggy)
-    pres = body.xpath('//pre')
-    for pre in pres:
-        next_pre = pre.getnext()
-        while next_pre is not None and next_pre in pres:
-            pre.text += "\n" + next_pre.text
-            current_pre = next_pre
-            next_pre = next_pre.getnext()
-            pres.remove(current_pre)
-            current_pre.drop_tree()
+    join_following_elements(body.xpath('//pre'), join_string="\n")
 
 
 def _recursive_convert(element):

From 70537df713a5979d7281211e8da469d17548f855 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Thu, 19 Jan 2017 22:02:21 +0200
Subject: [PATCH 11/21] Refactoring

---
 html_telegraph_poster/html_to_telegraph.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index f490dd0..ac7abbf 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -94,15 +94,19 @@ def _wrap_tag(element, wrapper):
     return new_element
 
 
-def join_following_elements(elements, join_string):
+def join_following_elements(elements, join_string=''):
     for element in elements:
         next_element = element.getnext()
         while next_element is not None and next_element in elements:
-            element.text += join_string + next_element.text
             current = next_element
             next_element = next_element.getnext()
+            if current.text:
+                current.text = join_string + current.text
+            if current.tail:
+                current.tail = current.tail.strip()
+            element.append(current)
             elements.remove(current)
-            current.drop_tree()
+            current.drop_tag()
 
 
 def _fragments_from_string(html_string):

From ab6c5a727792057f948dd5e7329b809bae366b2c Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Fri, 20 Jan 2017 13:50:11 +0200
Subject: [PATCH 12/21] Plain text should not be inside figure

---
 html_telegraph_poster/html_to_telegraph.py | 16 +++++++++++++---
 setup.py                                   |  2 +-
 tests/test.py                              |  8 ++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index ac7abbf..4f3caf2 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -94,6 +94,15 @@ def _wrap_tag(element, wrapper):
     return new_element
 
 
+def _wrap_figure(element):
+    figure = _create_element('figure')
+    element.addprevious(figure)
+    element.drop_tag()
+    element.tail = ''
+    figure.append(element)
+    return figure
+
+
 def join_following_elements(elements, join_string=''):
     for element in elements:
         next_element = element.getnext()
@@ -149,8 +158,8 @@ def preprocess_media_tags(element):
                 elif vimeo:
                     element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2)))
 
-                element.addprevious(_create_element('figure'))
-                element.getprevious().append(element)
+                _wrap_figure(element)
+
         elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet':
             twitter_links = element.xpath('.//a')
             for tw_link in twitter_links:
@@ -158,7 +167,8 @@ def preprocess_media_tags(element):
                     twitter_frame = html.HtmlElement()
                     twitter_frame.tag = 'iframe'
                     twitter_frame.set('src', '/embed/twitter?url=' + quote_plus(tw_link.get('href')))
-                    element.addprevious(_wrap_tag(twitter_frame, 'figure'))
+                    element.addprevious(twitter_frame)
+                    _wrap_figure(twitter_frame)
                     element.drop_tree()
 
 
diff --git a/setup.py b/setup.py
index d3d7841..fe8178f 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.1',
+      version='0.1.2',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',
diff --git a/tests/test.py b/tests/test.py
index f1ef916..45146e6 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -262,6 +262,7 @@ def test_iframe(self):
         iframe_no_src = '<iframe></iframe>'
         iframe_child_no_src = '<p><iframe></iframe></p>'
         iframe_text_before = 'text before <iframe></iframe>'
+        iframe_text_after = '<p><iframe src="//www.youtube.com/embed/abcdef"></iframe>Text after </p>'
         iframe_not_allowed_src = '<div><iframe src="http://example.com"></iframe></div>'
         iframe_vimeo = '<iframe src="https://player.vimeo.com/video/1185346"></iframe>'
         mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src
@@ -309,6 +310,13 @@ def test_iframe(self):
             ],
             convert_html_to_telegraph_format(iframe_vimeo, clean_html=True)
         )
+        self.assertJson(
+            [
+                {"tag": "p", "children": [{"tag": "figure", "children": [{"tag": "iframe", "attrs": {
+                "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]}, "Text after "]}
+             ],
+            convert_html_to_telegraph_format(iframe_text_after, clean_html=True)
+        )
 
     def test_twitter_links(self):
         html = '''

From 177668d642fd617a73a6a6e93a53cd45c5423c61 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Fri, 20 Jan 2017 17:45:47 +0200
Subject: [PATCH 13/21] Avoid nested figures

---
 html_telegraph_poster/html_to_telegraph.py |  3 ++-
 setup.py                                   |  2 +-
 tests/test.py                              | 11 ++++++++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 4f3caf2..1766070 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -158,7 +158,8 @@ def preprocess_media_tags(element):
                 elif vimeo:
                     element.set('src', '/embed/vimeo?url=' + quote_plus('https://vimeo.com/' + vimeo.group(2)))
 
-                _wrap_figure(element)
+                if not len(element.xpath('./ancestor::figure')):
+                    _wrap_figure(element)
 
         elif element.tag == 'blockquote' and element.get('class') == 'twitter-tweet':
             twitter_links = element.xpath('.//a')
diff --git a/setup.py b/setup.py
index fe8178f..72c2895 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.2',
+      version='0.1.3',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',
diff --git a/tests/test.py b/tests/test.py
index 45146e6..156e21d 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -266,11 +266,12 @@ def test_iframe(self):
         iframe_not_allowed_src = '<div><iframe src="http://example.com"></iframe></div>'
         iframe_vimeo = '<iframe src="https://player.vimeo.com/video/1185346"></iframe>'
         mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src
+        iframe_with_figure = '<figure><iframe src="//www.youtube.com/embed/abcdef"></iframe>Text after </figure>'
         self.assertJson(
             [
                 {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {
                 'src': '/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}]}
-             ],
+            ],
             convert_html_to_telegraph_format(html, clean_html=True)
         )
         self.assertJson(
@@ -317,6 +318,14 @@ def test_iframe(self):
              ],
             convert_html_to_telegraph_format(iframe_text_after, clean_html=True)
         )
+        self.assertJson(
+            [
+                {u'tag': u'figure', u'children': [{u'tag': u'iframe', u'attrs': {
+                    u'src': u'/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef'}}, u'Text after ']}
+            ],
+            convert_html_to_telegraph_format(iframe_with_figure, clean_html=True)
+        )
+
 
     def test_twitter_links(self):
         html = '''

From a43f485b1b8f8bdcc8156d38d36e0a911c77524e Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Sat, 21 Jan 2017 15:54:10 +0200
Subject: [PATCH 14/21] More correct formatting (fixed image order inside
 paragraph)

---
 html_telegraph_poster/html_to_telegraph.py | 50 ++++++++++++++--------
 setup.py                                   |  2 +-
 tests/test.py                              |  5 ++-
 3 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 1766070..0c1d24f 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -173,6 +173,30 @@ def preprocess_media_tags(element):
                     element.drop_tree()
 
 
+def move_to_top(body):
+    # this should be improved to include nested elements (like lists)
+    # still buggy
+    elements = body.xpath('./*/figure')
+    for element in elements:
+        preceding_elements = element.xpath('./preceding-sibling::*')
+        parent = element.getparent()
+        if len(preceding_elements) > 0 or parent.text and len(parent.text) > 0:
+
+            new_container = _create_element(parent.tag)
+            new_container.text = parent.text
+            parent.text = ''
+            parent.addprevious(new_container)
+
+            for preceding in preceding_elements:
+                new_container.append(preceding)
+
+        parent_for_figure = element.xpath('./ancestor::*[parent::body]')[0]
+        # tail leaves inside parent
+        element.drop_tree()
+        element.tail = ''
+        parent_for_figure.addprevious(element)
+
+
 def preprocess_fragments(fragments):
     bad_tags = []
 
@@ -181,23 +205,6 @@ def preprocess_fragments(fragments):
 
     body = fragments[0].getparent()
 
-    for fragment in fragments:
-        last_element = fragment
-        # figure should be on the top level
-        if fragment.find('figure') is not None:
-            f = fragment.find('figure')
-            last_element = _insert_after(f, last_element)
-
-        images_to_wrap = fragment.xpath('.//self::img[not(ancestor::figure)]')
-        for image in images_to_wrap:
-            figure = _create_element('figure')
-            last_element = _insert_after(figure, last_element)
-            figure.append(image)
-
-            if image.tail:
-                _insert_after(_create_element('p', text=image.tail), last_element)
-                image.tail = ''
-
     # bad iframes
     ns = {'re': "http://exslt.org/regular-expressions"}
     bad_tags.extend(fragments[-1].xpath("//iframe[not(re:test(@src, '%s|%s', 'i'))]" % (youtube_re, vimeo_re), namespaces=ns))
@@ -210,7 +217,8 @@ def preprocess_fragments(fragments):
     # bad lists (remove lists/list items if empty)
     nodes_not_to_be_empty = fragments[-1].xpath('//ul|//ol|//li')
     bad_tags.extend([x for x in nodes_not_to_be_empty if len(x.text_content().strip()) == 0])
-
+    # remove links with images inside
+    bad_tags.extend(body.xpath('.//a[descendant::img]'))
     for bad_tag in bad_tags:
         bad_tag.drop_tag()
         if bad_tag in fragments:
@@ -229,6 +237,10 @@ def preprocess_fragments(fragments):
                 fragment.addnext(paragraph)
                 fragment.tail = ''
 
+    images_to_wrap = body.xpath('.//img[not(ancestor::figure)]')
+    for image in images_to_wrap:
+        _wrap_figure(image)
+
     return len(body.getchildren()) and body or None
 
 
@@ -284,7 +296,7 @@ def convert_html_to_telegraph_format(html_string, clean_html=True):
             desc = [x for x in body.iterdescendants()]
             for tag in desc:
                 preprocess_media_tags(tag)
-
+            move_to_top(body)
             post_process(body)
     else:
         fragments = _fragments_from_string(html_string)
diff --git a/setup.py b/setup.py
index 72c2895..5f897b0 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.3',
+      version='0.1.4',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',
diff --git a/tests/test.py b/tests/test.py
index 156e21d..35b0f11 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -313,8 +313,9 @@ def test_iframe(self):
         )
         self.assertJson(
             [
-                {"tag": "p", "children": [{"tag": "figure", "children": [{"tag": "iframe", "attrs": {
-                "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]}, "Text after "]}
+                {"tag": "figure", "children": [{"tag": "iframe", "attrs": {
+                    "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]},
+                {"tag": "p", "children": ["Text after "]}
              ],
             convert_html_to_telegraph_format(iframe_text_after, clean_html=True)
         )

From f0d43d8991640ae9f1835cdc06ae9a1f6264adf6 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Sat, 21 Jan 2017 16:03:31 +0200
Subject: [PATCH 15/21] More tests

---
 tests/test.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index 35b0f11..5572657 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -267,6 +267,14 @@ def test_iframe(self):
         iframe_vimeo = '<iframe src="https://player.vimeo.com/video/1185346"></iframe>'
         mix = iframe_child_no_src + html + iframe_empty_src + iframe_no_src
         iframe_with_figure = '<figure><iframe src="//www.youtube.com/embed/abcdef"></iframe>Text after </figure>'
+
+        multiple_iframes = '<p>'\
+            'Text before'\
+            '<a href="/123">link</a><iframe src="//www.youtube.com/embed/abcdef"></iframe> text'\
+            '<a href="/246">link2</a> Text after link'\
+            '<iframe src="//www.youtube.com/embed/xyzxyzxyz"></iframe>'\
+            '</p>'
+
         self.assertJson(
             [
                 {'tag': 'figure', 'children': [{'tag': 'iframe', 'attrs': {
@@ -326,7 +334,16 @@ def test_iframe(self):
             ],
             convert_html_to_telegraph_format(iframe_with_figure, clean_html=True)
         )
-
+        self.assertJson(
+            [
+                {"tag": "p", "children": ["Text before", {"tag": "a", "attrs": {"href": "/123"}, "children": ["link"]}]},
+                {"tag": "figure", "children": [{"tag": "iframe", "attrs": {
+                    "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dabcdef"}}]},
+                {"tag": "p", "children": [" text", {"tag": "a", "attrs": {"href": "/246"}, "children": ["link2"]}, " Text after link"]},
+                {"tag": "figure", "children": [{"tag": "iframe", "attrs": {
+                 "src": "/embed/youtube?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3Dxyzxyzxyz"}}]}],
+            convert_html_to_telegraph_format(multiple_iframes, clean_html=True)
+        )
 
     def test_twitter_links(self):
         html = '''

From 2dfc835d64463090e7516d8f7abbf82a8d19b71c Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Sat, 21 Jan 2017 23:56:48 +0200
Subject: [PATCH 16/21] Remove class attribute

---
 html_telegraph_poster/html_to_telegraph.py | 5 +++++
 setup.py                                   | 2 +-
 tests/test.py                              | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 0c1d24f..a1d393b 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -255,6 +255,11 @@ def post_process(body):
     # group following pre elements into single one (telegraph is buggy)
     join_following_elements(body.xpath('//pre'), join_string="\n")
 
+    # remove class attributes for all
+    elements_with_class = body.xpath('.//*[@class]')
+    for element in elements_with_class:
+        element.attrib.pop('class')
+
 
 def _recursive_convert(element):
 
diff --git a/setup.py b/setup.py
index 5f897b0..78c49fc 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.4',
+      version='0.1.5',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',
diff --git a/tests/test.py b/tests/test.py
index 5572657..8a1b27f 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -465,7 +465,7 @@ def hello_world():
         )
         self.assertJson(
             [
-                {"tag": "pre", "attrs": {"class": "code"}, "children": [
+                {"tag": "pre", "children": [
                     "\n                def hello_world():\n                    print(\"hello\")\n            \nprint(\"second pre\")"]},
                  {"tag": "p", "children": [" Text after pre "]}
             ],

From 75a1a1f8971c09536f1daae1cbf7b7572eba7c4e Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Sun, 22 Jan 2017 16:18:53 +0200
Subject: [PATCH 17/21] Fixed case where some elements skipped conversion

---
 html_telegraph_poster/html_to_telegraph.py | 2 +-
 setup.py                                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index a1d393b..8c1aed1 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -224,7 +224,7 @@ def preprocess_fragments(fragments):
         if bad_tag in fragments:
             fragments.remove(bad_tag)
 
-    for fragment in fragments:
+    for fragment in body.getchildren():
         if fragment.tag not in allowed_top_level_tags:
             paragraph = _create_element('p')
             fragment.addprevious(paragraph)
diff --git a/setup.py b/setup.py
index 78c49fc..b51191d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.5',
+      version='0.1.6',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',

From 39b980b3a632f5af9d169c468412747cd6e26fa1 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Fri, 27 Jan 2017 17:32:49 +0200
Subject: [PATCH 18/21] Fixed regular expression for pre tags

---
 html_telegraph_poster/html_to_telegraph.py |  2 +-
 setup.py                                   |  2 +-
 tests/test.py                              | 25 ++++++++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 8c1aed1..854249d 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -18,7 +18,7 @@
 youtube_re = r'(https?:)?//(www\.)?youtube(-nocookie)?\.com/embed/'
 vimeo_re = r'(https?:)?//player\.vimeo\.com/video/(\d+)'
 twitter_re = re.compile(r'(https?:)?//(www\.)?twitter\.com/[A-Za-z0-9_]{1,15}/status/\d+')
-pre_content_re = re.compile(r'<pre[^>]*>[^<]*</pre>')
+pre_content_re = re.compile(r'<pre[^>]*>[\s\S]*?</pre>')
 line_breaks_and_empty_strings = re.compile('(^[\s\t]*)?\r?\n', flags=re.MULTILINE)
 
 
diff --git a/setup.py b/setup.py
index b51191d..7ae1b48 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.6',
+      version='0.1.7',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',
diff --git a/tests/test.py b/tests/test.py
index 8a1b27f..43c1ac1 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -455,6 +455,17 @@ def hello_world():
 <pre>  String anotherCodeBlock5 = "separated code block5"</pre>
 <p> paragraph splitter</p>
 <pre>  String anotherCodeBlock6 = "separated code block6"</pre>
+        '''
+        html4 = '''
+<pre class="code literal-block"><p class="nv">$ </p>mkvirtualenv myvirtualenv --python<p class="o">=</p>/usr/bin/python3.4
+
+
+Running virtualenv with interpreter /usr/bin/python3.4
+Using base prefix <p class="s1">'/usr'</p>
+New python executable in myvirtualenv/bin/python3.4
+Also creating executable in myvirtualenv/bin/python
+Installing setuptools, pip...done.
+</pre>
         '''
         self.assertJson(
             [
@@ -487,6 +498,20 @@ def hello_world():
             convert_html_to_telegraph_format(html3, clean_html=True)
         )
 
+        print(convert_html_to_telegraph_format(html4, clean_html=True))
+        self.assertJson(
+            [
+                {"tag": "pre", "children": [
+                    "$ mkvirtualenv myvirtualenv --python=/usr/bin/python3.4\n\n\n"
+                    "Running virtualenv with interpreter /usr/bin/python3.4\n"
+                    "Using base prefix '/usr'\n"
+                    "New python executable in myvirtualenv/bin/python3.4\n"
+                    "Also creating executable in myvirtualenv/bin/python\n"
+                    "Installing setuptools, pip...done.\n"]}
+            ],
+            convert_html_to_telegraph_format(html4, clean_html=True)
+        )
+
 
 class UploadImageTest(unittest.TestCase):
 

From 87c823dd80f64c2a1519afc2c7e9d92bb159ac3d Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Fri, 27 Jan 2017 18:03:18 +0200
Subject: [PATCH 19/21] Removed print

---
 tests/test.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 43c1ac1..9a7c375 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -497,8 +497,6 @@ def hello_world():
             ],
             convert_html_to_telegraph_format(html3, clean_html=True)
         )
-
-        print(convert_html_to_telegraph_format(html4, clean_html=True))
         self.assertJson(
             [
                 {"tag": "pre", "children": [

From ee24b6dbc23e874bebd5c4ebf2e9e5f0b355825a Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Tue, 31 Jan 2017 17:27:49 +0200
Subject: [PATCH 20/21] Added code tag support

---
 html_telegraph_poster/html_to_telegraph.py | 11 +++++++++--
 setup.py                                   |  2 +-
 tests/test.py                              | 14 ++++++++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 854249d..235ac36 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -11,14 +11,14 @@
 base_url = 'https://telegra.ph'
 save_url = 'https://edit.telegra.ph/save'
 default_user_agent = 'Python_telegraph_poster/0.1'
-allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'em', 'figcaption', 'figure', 'h3', 'h4', 'hr', 'i',
+allowed_tags = ['a', 'aside', 'b', 'blockquote', 'br', 'code', 'em', 'figcaption', 'figure', 'h3', 'h4', 'hr', 'i',
                 'iframe', 'img', 'li', 'ol', 'p', 'pre', 's', 'strong', 'u', 'ul', 'video']
 allowed_top_level_tags = ['aside', 'blockquote', 'pre', 'figure', 'h3', 'h4', 'hr', 'ol', 'p', 'ul']
 
 youtube_re = r'(https?:)?//(www\.)?youtube(-nocookie)?\.com/embed/'
 vimeo_re = r'(https?:)?//player\.vimeo\.com/video/(\d+)'
 twitter_re = re.compile(r'(https?:)?//(www\.)?twitter\.com/[A-Za-z0-9_]{1,15}/status/\d+')
-pre_content_re = re.compile(r'<pre[^>]*>[\s\S]*?</pre>')
+pre_content_re = re.compile(r'<(pre|code)(>|\s[^>]*>)[\s\S]*?</\1>')
 line_breaks_and_empty_strings = re.compile('(^[\s\t]*)?\r?\n', flags=re.MULTILINE)
 
 
@@ -224,6 +224,13 @@ def preprocess_fragments(fragments):
         if bad_tag in fragments:
             fragments.remove(bad_tag)
 
+    # code - > pre
+    # convert multiline code into pre
+    code_elements = body.xpath('.//code')
+    for code_element in code_elements:
+        if '\n' in code_element.text:
+            code_element.tag = 'pre'
+
     for fragment in body.getchildren():
         if fragment.tag not in allowed_top_level_tags:
             paragraph = _create_element('p')
diff --git a/setup.py b/setup.py
index 7ae1b48..f1f3709 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.7',
+      version='0.1.8',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',
diff --git a/tests/test.py b/tests/test.py
index 9a7c375..13256c8 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -466,6 +466,12 @@ def hello_world():
 Also creating executable in myvirtualenv/bin/python
 Installing setuptools, pip...done.
 </pre>
+        '''
+        html5 = '''
+        <p>Text before <code> inline_code = True</code> Text after</p>
+        <code> multiline_code = True
+        next_line = True
+        </code>
         '''
         self.assertJson(
             [
@@ -509,6 +515,14 @@ def hello_world():
             ],
             convert_html_to_telegraph_format(html4, clean_html=True)
         )
+        self.assertJson(
+            [
+                {"tag": "p",
+                    "children": ["Text before ", {"tag": "code", "children": [" inline_code = True"]}, " Text after"]},
+                {"tag": "pre", "children": [" multiline_code = True\n        next_line = True\n        "]}
+            ],
+            convert_html_to_telegraph_format(html5, clean_html=True)
+        )
 
 
 class UploadImageTest(unittest.TestCase):

From dce5625e0db69cb65340152a83b05a8ee5f67914 Mon Sep 17 00:00:00 2001
From: Yolk <yolkmail@gmail.com>
Date: Tue, 31 Jan 2017 21:36:26 +0200
Subject: [PATCH 21/21] slightly changed Class interface

---
 html_telegraph_poster/html_to_telegraph.py | 13 ++++++-------
 setup.py                                   |  2 +-
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/html_telegraph_poster/html_to_telegraph.py b/html_telegraph_poster/html_to_telegraph.py
index 235ac36..4e4d441 100644
--- a/html_telegraph_poster/html_to_telegraph.py
+++ b/html_telegraph_poster/html_to_telegraph.py
@@ -367,7 +367,7 @@ def upload_to_telegraph(title, author, text, author_url='', tph_uuid=None, page_
 
 
 class TelegraphPoster(object):
-    def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, clean_html=True):
+    def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, clean_html=True, convert_html=True):
         self.title = None
         self.author = None
         self.author_url = None
@@ -376,17 +376,15 @@ def __init__(self, tph_uuid=None, page_id=None, user_agent=default_user_agent, c
         self.page_id = page_id
         self.user_agent = user_agent
         self.clean_html = clean_html
+        self.convert_html = convert_html
 
     def post(self, title, author, text, author_url=''):
-        result = self.edit(
-            title,
-            author,
-            text
-        )
         self.title = title
         self.author = author
         self.author_url = author_url
         self.text = text
+        result = self.edit()
+
         self.tph_uuid = result['tph_uuid']
         self.page_id = result['page_id']
         return result
@@ -400,5 +398,6 @@ def edit(self, title=None, author=None, text=None):
             tph_uuid=self.tph_uuid,
             page_id=self.page_id,
             user_agent=self.user_agent,
-            clean_html=self.clean_html
+            clean_html=self.clean_html,
+            convert_html=self.convert_html
         )
diff --git a/setup.py b/setup.py
index f1f3709..fef7b6d 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='html_telegraph_poster',
-      version='0.1.8',
+      version='0.1.9',
       description='Posts your html to telegra.ph blogging service',
       keywords='telegra.ph post html telegram',
       url='https://github.com/mercuree/html-telegraph-poster',