Merge pull request #7 from claromes/display

Display Update
claromes · Aug 16, 2023 · 5dd7e00
2 parents 59cd087 + e156a5e
commit 5dd7e00
Show file tree

Hide file tree

Showing 3 changed files with 197 additions and 94 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ Tool that displays multiple archived tweets on Wayback Machine to avoid opening
 
 ## Features
 
-- 30 embedded tweets per page
+- 30 tweets per page
 - Filtering by only deleted tweets
 
 ## Development
@@ -42,6 +42,8 @@ Streamlit will be served at http://localhost:8501
 - [x] `only_deleted` checkbox selected for handles without deleted tweets
 - [x] Pagination: set session variable on first click
 - [x] Pagination: scroll to top
+- [ ] `IndexError`
+- [ ] Timeout error
 
 ## Roadmap
 
@@ -54,7 +56,11 @@ Streamlit will be served at http://localhost:8501
 - [x] Changelog
 - [ ] Prevent duplicate URLs
 - [ ] Range size defined by user
-- [ ] Hide Twitter header banner (iframe)
 - [ ] `parse_links` exception
+- [ ] Add current page to page title
+- [ ] Parse MIME type `warc/revisit`
+- [ ] Filter by period/datetime
+- [ ] Apply filters by API endpoints
+- [ ] Add contributing guidelines
 
 ## [Changelog](/CHANGELOG.md)
diff --git a/app.py b/app.py
@@ -2,8 +2,8 @@
 import datetime
 import streamlit as st
 import streamlit.components.v1 as components
-
-__version__ = '0.1.4'
+import json
+import re
 
 year = datetime.datetime.now().year
 
@@ -20,7 +20,7 @@
 
         Tool that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
 
-        - 30 embedded tweets per page
+        - 30 tweets per page
         - Filtering by only deleted tweets
 
         This tool is experimental, please feel free to send your [feedbacks](https://github.com/claromes/waybacktweets/issues).
@@ -37,6 +37,20 @@
     header[data-testid="stHeader"] {
         opacity: 0.5;
     }
+    div[data-testid="stDecoration"] {
+        visibility: hidden;
+        height: 0%;
+        position: fixed;
+    }
+    div[data-testid="stStatusWidget"] {
+        visibility: hidden;
+        height: 0%;
+        position: fixed;
+    }
+    iframe {
+        background-color: #dddddd;
+        border-radius: 0.5rem;
+    }
 </style>
 '''
 
@@ -76,29 +90,70 @@ def scroll_into_view():
 
     components.html(js, width=0, height=0)
 
-@st.cache_data(ttl=1800, show_spinner=False)
 def embed(tweet):
-    api = 'https://publish.twitter.com/oembed?url={}'.format(tweet)
-    response = requests.get(api)
+    try:
+        url = 'https://publish.twitter.com/oembed?url={}'.format(tweet)
+        response = requests.get(url)
+
+        regex = r'<blockquote class="twitter-tweet"><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'
+        regex_author = r'^(.*?)\s*\('
+
+        if response.status_code == 200 or response.status_code == 302:
+            status_code = response.status_code
+            html = response.json()['html']
+            author_name = response.json()['author_name']
+
+            matches_html = re.findall(regex, html, re.DOTALL)
+
+            tweet_content = []
+            user_info = []
+            is_RT = []
+
+            for match in matches_html:
+                tweet_content_match = re.sub(r'<a[^>]*>|<\/a>', '', match[0].strip())
+                tweet_content_match = tweet_content_match.replace('<br>', '\n')
+
+                user_info_match = re.sub(r'<a[^>]*>|<\/a>', '', match[1].strip())
+                user_info_match = user_info_match.replace(')', '), ')
+
+                match_author = re.search(regex_author, user_info_match)
+                author_tweet = match_author.group(1)
+
+                if tweet_content_match:
+                    tweet_content.append(tweet_content_match)
+                if user_info_match:
+                    user_info.append(user_info_match)
+
+                    is_RT_match = False
+                    if author_name != author_tweet:
+                        is_RT_match = True
+
+                    is_RT.append(is_RT_match)
+
+            return status_code, tweet_content, user_info, is_RT
+        else:
+            return False
+    except requests.exceptions.Timeout:
+        st.error('Connection to publish.twitter.com timed out.')
+
 
-    if response.status_code == 200 or response.status_code == 304:
-        return response.json()['html']
-    else:
-        return None
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def tweets_count(handle):
     url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json'.format(handle)
-    response = requests.get(url)
-    if response.status_code == 200:
-        data = response.json()
-        if data and len(data) > 1:
-            total_tweets = len(data) - 1
-            return total_tweets
-        else:
-            return 0
-    else:
-        return None
+    try:
+        response = requests.get(url)
+
+        if response.status_code == 200:
+            data = response.json()
+            if data and len(data) > 1:
+                total_tweets = len(data) - 1
+                return total_tweets
+            else:
+                return 0
+    except requests.exceptions.Timeout:
+        st.error('Connection to web.archive.org timed out.')
+
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def query_api(handle, limit, offset):
@@ -107,11 +162,13 @@ def query_api(handle, limit, offset):
         st.stop()
 
     url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json&limit={}&offset={}'.format(handle, limit, offset)
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        return None
+    try:
+        response = requests.get(url)
+
+        if response.status_code == 200 or response.status_code == 304:
+            return response.json()
+    except requests.exceptions.Timeout:
+        st.error('Connection to web.archive.org timed out.')
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def parse_links(links):
@@ -148,6 +205,8 @@ def attr(i):
 handle = st.text_input('username', placeholder='username', label_visibility='collapsed')
 query = st.button('Query', type='primary', use_container_width=True)
 
+bar = st.empty()
+
 if query or handle:
     if handle != st.session_state.current_handle:
         st.session_state.offset = 0
@@ -164,91 +223,129 @@ def attr(i):
     only_deleted = st.checkbox('Only deleted tweets')
 
     try:
-        with st.spinner(''):
-            progress = st.empty()
-            links = query_api(handle, tweets_per_page, st.session_state.offset)
-            parsed_links = parse_links(links)[0]
-            tweet_links = parse_links(links)[1]
-            mimetype = parse_links(links)[2]
-            timestamp = parse_links(links)[3]
+        bar.progress(0)
+        progress = st.empty()
+        links = query_api(handle, tweets_per_page, st.session_state.offset)
+
+        parse = parse_links(links)
+        parsed_links = parse[0]
+        tweet_links = parse[1]
+        mimetype = parse[2]
+        timestamp = parse[3]
+
+        if links:
+            st.divider()
+
+            st.session_state.current_handle = handle
+            st.session_state.current_query = query
+
+            return_none_count = 0
+
+            def prev_page():
+                st.session_state.offset -= tweets_per_page
+
+                #scroll to top config
+                st.session_state.update_component += 1
+                scroll_into_view()
 
+            def next_page():
+                st.session_state.offset += tweets_per_page
+
+                #scroll to top config
+                st.session_state.update_component += 1
+                scroll_into_view()
+
+            def display_tweet():
+                if is_RT[0] == True:
+                    st.info('*Retweet*')
+                st.write(tweet_content[0])
+                st.write(user_info[0])
 
-            if links:
                 st.divider()
 
-                st.session_state.current_handle = handle
-                st.session_state.current_query = query
+            def display_not_tweet():
+                if mimetype[i] == 'application/json':
+                    st.error('Tweet has been deleted.')
+                    response = requests.get(link)
+                    json_data = response.json()
+                    json_text = response.json()['text']
+
+                    st.code(json_text)
+                    st.json(json_data, expanded=False)
+
+                    st.divider()
+                if mimetype[i] == 'text/html':
+                    st.error('Tweet has been deleted.')
+                    components.iframe(link, height=500, scrolling=True)
+
+                    st.divider()
 
-                return_none_count = 0
+                if mimetype[i] == 'warc/revisit':
+                    st.warning('''MIME Type was not parsed.''')
 
-                def prev_page():
-                    st.session_state.offset -= tweets_per_page
+                    st.divider()
 
-                    #scroll to top config
-                    st.session_state.update_component += 1
-                    scroll_into_view()
+            start_index = st.session_state.offset
+            end_index = min(count, start_index + tweets_per_page)
 
-                def next_page():
-                    st.session_state.offset += tweets_per_page
+            for i in range(tweets_per_page):
+                try:
+                    bar.progress((i*3) + 13)
 
-                    #scroll to top config
-                    st.session_state.update_component += 1
-                    scroll_into_view()
+                    link = parsed_links[i]
+                    tweet = embed(tweet_links[i])
 
-                start_index = st.session_state.offset
-                end_index = min(count, start_index + tweets_per_page)
+                    if not only_deleted:
+                        attr(i)
 
-                for i in range(tweets_per_page):
-                    try:
-                        link = parsed_links[i]
-                        tweet = embed(tweet_links[i])
+                        if tweet:
+                            status_code = tweet[0]
+                            tweet_content = tweet[1]
+                            user_info = tweet[2]
+                            is_RT = tweet[3]
 
-                        if not only_deleted:
+                            if mimetype[i] == 'application/json':
+                                display_tweet()
+
+                            if mimetype[i] == 'text/html':
+                                display_tweet()
+                        elif not tweet:
+                            display_not_tweet()
+
+                    if only_deleted:
+                        if not tweet:
+                            return_none_count += 1
                             attr(i)
 
-                            if tweet == None:
-                                st.error('Tweet has been deleted.')
-                                components.iframe(src=link, width=700, height=1000, scrolling=True)
-                                st.divider()
-                            else:
-                                components.html(tweet, width=700, height=1000, scrolling=True)
-                                st.divider()
-
-                        if only_deleted:
-                            if tweet == None:
-                                return_none_count += 1
-                                attr(i)
-
-                                st.error('Tweet has been deleted.')
-                                components.iframe(src=link, width=700, height=1000, scrolling=True)
-                                st.divider()
-
-                            progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))
-
-                        if start_index <= 0:
-                            st.session_state.prev_disabled = True
-                        else:
-                            st.session_state.prev_disabled = False
-
-                        if i + 1 == count:
-                            st.session_state.next_disabled = True
-                        else:
-                            st.session_state.next_disabled = False
-                    except IndexError:
-                        if start_index <= 0:
-                            st.session_state.prev_disabled = True
-                        else:
-                            st.session_state.prev_disabled = False
+                            display_not_tweet()
+
+                        progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))
 
+                    if start_index <= 0:
+                        st.session_state.prev_disabled = True
+                    else:
+                        st.session_state.prev_disabled = False
+
+                    if i + 1 == count:
                         st.session_state.next_disabled = True
+                    else:
+                        st.session_state.next_disabled = False
+                # TODO
+                except IndexError:
+                    if start_index <= 0:
+                        st.session_state.prev_disabled = True
+                    else:
+                        st.session_state.prev_disabled = False
+
+                    st.session_state.next_disabled = True
 
-                prev, _ , next = st.columns([3, 4, 3])
+            prev, _ , next = st.columns([3, 4, 3])
 
-                prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
-                next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
+            prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
+            next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
 
-            if not links:
-                st.error('Unable to query the Wayback Machine API.')
+        if not links:
+            st.error('Unable to query the Wayback Machine API.')
     except TypeError as e:
         st.error('''
         {}. Refresh this page and try again.

diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,2 @@
 requests==2.30.0
-streamlit==1.23.1
+streamlit==1.25.0