From 9beff69b1704726246ff5ba7f0772d63328c5502 Mon Sep 17 00:00:00 2001
From: Claromes
Date: Sat, 5 Aug 2023 20:07:07 -0300
Subject: [PATCH 1/8] delete iframe from tweets and display as text

---
 README.md |   4 +-
 app.py    | 152 +++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 119 insertions(+), 37 deletions(-)

diff --git a/README.md b/README.md
index c99102f..8205dfe 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@
 Tool that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
 
 ## Features
 
-- 30 embedded tweets per page
+- 30 tweets per page
 - Filtering by only deleted tweets
 
 ## Development
@@ -54,7 +54,7 @@ Streamlit will be served at http://localhost:8501
 - [x] Changelog
 - [ ] Prevent duplicate URLs
 - [ ] Range size defined by user
-- [ ] Hide Twitter header banner (iframe)
 - [ ] `parse_links` exception
+- [ ] Add current page to page title
 
 ## [Changelog](/CHANGELOG.md)
diff --git a/app.py b/app.py
index 3fa25ff..d02569e 100644
--- a/app.py
+++ b/app.py
@@ -2,8 +2,10 @@
 import datetime
 import streamlit as st
 import streamlit.components.v1 as components
+import json
+import re
 
-__version__ = '0.1.4'
+__version__ = '0.2'
 
 year = datetime.datetime.now().year
 
@@ -20,7 +22,7 @@
     Tool that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
 
-    - 30 embedded tweets per page
+    - 30 tweets per page
     - Filtering by only deleted tweets
 
     This tool is experimental, please feel free to send your [feedbacks](https://github.com/claromes/waybacktweets/issues).
@@ -76,29 +78,68 @@ def scroll_into_view():
 
     components.html(js, width=0, height=0)
 
-@st.cache_data(ttl=1800, show_spinner=False)
 def embed(tweet):
-    api = 'https://publish.twitter.com/oembed?url={}'.format(tweet)
-    response = requests.get(api)
+    try:
+        url = 'https://publish.twitter.com/oembed?url={}'.format(tweet)
+        response = requests.get(url, timeout=1)
+
+        regex = r'<p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'
+        regex_author = r'^(.*?)\s*\('
+
+        if response.status_code == 200 or response.status_code == 302:
+            status_code = response.status_code
+            html = response.json()['html']
+            author_name = response.json()['author_name']
+
+            matches_html = re.findall(regex, html, re.DOTALL)
+
+            tweet_content = []
+            user_info = []
+            is_RT = []
+
+            for match in matches_html:
+                tweet_content_match = re.sub(r'<a[^>]*>|<\/a>', '', match[0].strip())
+                user_info_match = re.sub(r'<a[^>]*>|<\/a>', '', match[1].strip())
+                user_info_match = user_info_match.replace(')', '), ')
+
+                match_author = re.search(regex_author, user_info_match)
+                author_tweet = match_author.group(1)
+
+                if tweet_content_match:
+                    tweet_content.append(tweet_content_match)
+                if user_info_match:
+                    user_info.append(user_info_match)
+
+                is_RT_match = False
+                if author_name != author_tweet:
+                    is_RT_match = True
+
+                is_RT.append(is_RT_match)
+
+            return status_code, tweet_content, user_info, is_RT
+        else:
+            return False
+    except requests.exceptions.Timeout:
+        st.error('Connection to web.archive.org timed out.')
+
 
-    if response.status_code == 200 or response.status_code == 304:
-        return response.json()['html']
-    else:
-        return None
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def tweets_count(handle):
     url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json'.format(handle)
-    response = requests.get(url)
-    if response.status_code == 200:
-        data = response.json()
-        if data and len(data) > 1:
-            total_tweets = len(data) - 1
-            return total_tweets
-        else:
-            return 0
-    else:
-        return None
+    try:
+        response = requests.get(url, timeout=5)
+
+        if response.status_code == 200:
+            data = response.json()
+            if data and len(data) > 1:
+                total_tweets = len(data) - 1
+                return total_tweets
+            else:
+                return 0
+    except requests.exceptions.Timeout:
+        st.error('Connection to web.archive.org timed out.')
+
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def query_api(handle, limit, offset):
@@ -107,11 +148,13 @@ def query_api(handle, limit, offset):
         st.stop()
 
     url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json&limit={}&offset={}'.format(handle, limit, offset)
-    response = requests.get(url)
-    if response.status_code == 200:
-        return response.json()
-    else:
-        return None
+    try:
+        response = requests.get(url, timeout=1)
+
+        if response.status_code == 200 or response.status_code == 304:
+            return response.json()
+    except requests.exceptions.Timeout:
+        st.error('Connection to web.archive.org timed out.')
 
 @st.cache_data(ttl=1800, show_spinner=False)
 def parse_links(links):
@@ -206,22 +249,61 @@ def next_page():
                         if not only_deleted:
                             attr(i)
 
-                            if tweet == None:
-                                st.error('Tweet has been deleted.')
-                                components.iframe(src=link, width=700, height=1000, scrolling=True)
-                                st.divider()
-                            else:
-                                components.html(tweet, width=700, height=1000, scrolling=True)
-                                st.divider()
+                            if tweet:
+                                status_code = tweet[0]
+                                tweet_content = tweet[1]
+                                user_info = tweet[2]
+                                is_RT = tweet[3]
+
+                                if mimetype[i] == 'application/json':
+                                    if is_RT[0] == True:
+                                        st.info('*Retweet*')
+                                    st.write(tweet_content[0])
+                                    st.write(user_info[0])
+
+                                    st.divider()
+                                if mimetype[i] == 'text/html':
+                                    if is_RT[0] == True:
+                                        st.info('*Retweet*')
+                                    st.write(tweet_content[0])
+                                    st.write(user_info[0])
+
+                                    st.divider()
+                            elif not tweet:
+                                if mimetype[i] == 'application/json':
+                                    st.error('Tweet has been deleted.')
+                                    response = requests.get(link, timeout=5)
+                                    json_data = response.json()
+
+                                    st.json(json_data, expanded=False)
+
+                                    st.divider()
+                                if mimetype[i] == 'text/html':
+                                    st.error('Tweet has been deleted.')
+                                    st.info('IFRAME')
+                                    st.write(link)
+
+                                    st.divider()
 
                         if only_deleted:
-                            if tweet == None:
+                            if not tweet:
                                 return_none_count += 1
                                 attr(i)
 
-                                st.error('Tweet has been deleted.')
-                                components.iframe(src=link, width=700, height=1000, scrolling=True)
-                                st.divider()
+                                if mimetype[i] == 'application/json':
+                                    st.error('Tweet has been deleted.')
+                                    response = requests.get(link, timeout=5)
+                                    json_data = response.json()
+
+                                    st.json(json_data, expanded=False)
+
+                                    st.divider()
+                                if mimetype[i] == 'text/html':
+                                    st.error('Tweet has been deleted.')
+                                    st.info('IFRAME')
+                                    st.write(link)
+
+                                    st.divider()
 
                         progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))

From 09581f0df68aff027ee870d940b2f773ba891a1e Mon Sep 17 00:00:00 2001
From: Claromes
Date: Sat, 5 Aug 2023 22:01:31 -0300
Subject: [PATCH 2/8] add progress bar, refactoring

---
 README.md |   4 ++
 app.py    | 218 +++++++++++++++++++++++++++---------------------------
 2 files changed, 115 insertions(+), 107 deletions(-)

diff --git a/README.md b/README.md
index 8205dfe..c7cf677 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,8 @@ Streamlit will be served at http://localhost:8501
 - [x] `only_deleted` checkbox selected for handles without deleted tweets
 - [x] Pagination: set session variable on first click
 - [x] Pagination: scroll to top
+- [ ] `IndexError`
+- [ ] Timeout error
 
 ## Roadmap
 
@@ -56,5 +58,7 @@ Streamlit will be served at http://localhost:8501
 - [ ] Range size defined by user
 - [ ] `parse_links` exception
 - [ ] Add current page to page title
+- [ ] Parse MIME type `warc/revisit`
+- [ ] Filter by period/datetime
 
 ## [Changelog](/CHANGELOG.md)
diff --git a/app.py b/app.py
index d02569e..299dddc 100644
--- a/app.py
+++ b/app.py
@@ -4,6 +4,7 @@
 import streamlit.components.v1 as components
 import json
 import re
+from bs4 import BeautifulSoup
 
 __version__ = '0.2'
 
@@ -39,6 +40,16 @@
     header[data-testid="stHeader"] {
         opacity: 0.5;
     }
+    div[data-testid="stDecoration"] {
+        visibility: hidden;
+        height: 0%;
+        position: fixed;
+    }
+    div[data-testid="stStatusWidget"] {
+        visibility: hidden;
+        height: 0%;
+        position: fixed;
+    }
 '''
 
@@ -99,6 +110,8 @@ def embed(tweet):
 
             for match in matches_html:
                 tweet_content_match = re.sub(r'<a[^>]*>|<\/a>', '', match[0].strip())
+                tweet_content_match = tweet_content_match.replace('<br>', '\n')
+
                 user_info_match = re.sub(r'<a[^>]*>|<\/a>', '', match[1].strip())
                 user_info_match = user_info_match.replace(')', '), ')
 
@@ -120,7 +133,7 @@ def embed(tweet):
         else:
             return False
     except requests.exceptions.Timeout:
-        st.error('Connection to web.archive.org timed out.')
+        st.error('Connection to publish.twitter.com timed out.')
 
 
@@ -128,7 +141,7 @@ def embed(tweet):
 def tweets_count(handle):
     url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json'.format(handle)
     try:
-        response = requests.get(url, timeout=5)
+        response = requests.get(url, timeout=10)
 
         if response.status_code == 200:
             data = response.json()
@@ -191,6 +204,8 @@ def attr(i):
 handle = st.text_input('username', placeholder='username', label_visibility='collapsed')
 query = st.button('Query', type='primary', use_container_width=True)
 
+bar = st.progress(0)
+
 if query or handle:
     if handle != st.session_state.current_handle:
         st.session_state.offset = 0
 
     only_deleted = st.checkbox('Only deleted tweets')
 
     try:
-        with st.spinner(''):
-            progress = st.empty()
-            links = query_api(handle, tweets_per_page, st.session_state.offset)
-            parsed_links = parse_links(links)[0]
-            tweet_links = parse_links(links)[1]
-            mimetype = parse_links(links)[2]
-            timestamp = parse_links(links)[3]
+        progress = st.empty()
+        links = query_api(handle, tweets_per_page, st.session_state.offset)
+        parsed_links = parse_links(links)[0]
+        tweet_links = parse_links(links)[1]
+        mimetype = parse_links(links)[2]
+        timestamp = parse_links(links)[3]
+
+
+        if links:
+            st.divider()
+
+            st.session_state.current_handle = handle
+            st.session_state.current_query = query
+
+            return_none_count = 0
+
+            def prev_page():
+                st.session_state.offset -= tweets_per_page
+                #scroll to top config
+                st.session_state.update_component += 1
+                scroll_into_view()
+
+            def next_page():
+                st.session_state.offset += tweets_per_page
+
+                #scroll to top config
+                st.session_state.update_component += 1
+                scroll_into_view()
+
+            def display_tweet():
+                if is_RT[0] == True:
+                    st.info('*Retweet*')
+                st.write(tweet_content[0])
+                st.write(user_info[0])
 
-            if links:
                 st.divider()
 
-                st.session_state.current_handle = handle
-                st.session_state.current_query = query
+            def display_not_tweet():
+                if mimetype[i] == 'application/json':
+                    st.error('Tweet has been deleted.')
+                    response = requests.get(link, timeout=5)
+                    json_data = response.json()
+
+                    st.json(json_data, expanded=False)
+
+                    st.divider()
+                if mimetype[i] == 'text/html':
+                    st.error('Tweet has been deleted.')
+                    components.iframe(link, height=500)
 
-                return_none_count = 0
+                    st.divider()
 
-                def prev_page():
-                    st.session_state.offset -= tweets_per_page
+            start_index = st.session_state.offset
+            end_index = min(count, start_index + tweets_per_page)
 
-                    #scroll to top config
-                    st.session_state.update_component += 1
-                    scroll_into_view()
+            for i in range(tweets_per_page):
+                try:
+                    bar.progress((i*3) + 13)
 
-                def next_page():
-                    st.session_state.offset += tweets_per_page
+                    link = parsed_links[i]
+                    tweet = embed(tweet_links[i])
 
-                    #scroll to top config
-                    st.session_state.update_component += 1
-                    scroll_into_view()
+                    if not only_deleted:
+                        attr(i)
 
-                start_index = st.session_state.offset
-                end_index = min(count, start_index + tweets_per_page)
+                        if tweet:
+                            status_code = tweet[0]
+                            tweet_content = tweet[1]
+                            user_info = tweet[2]
+                            is_RT = tweet[3]
 
-                for i in range(tweets_per_page):
-                    try:
-                        link = parsed_links[i]
-                        tweet = embed(tweet_links[i])
+                            if mimetype[i] == 'application/json':
+                                display_tweet()
 
-                        if not only_deleted:
+                            if mimetype[i] == 'text/html':
+                                display_tweet()
+                        elif not tweet:
+                            display_not_tweet()
+
+                    if only_deleted:
+                        if not tweet:
+                            return_none_count += 1
                             attr(i)
 
-                            if tweet:
-                                status_code = tweet[0]
-                                tweet_content = tweet[1]
-                                user_info = tweet[2]
-                                is_RT = tweet[3]
-
-                                if mimetype[i] == 'application/json':
-                                    if is_RT[0] == True:
-                                        st.info('*Retweet*')
-                                    st.write(tweet_content[0])
-                                    st.write(user_info[0])
-
-                                    st.divider()
-                                if mimetype[i] == 'text/html':
-                                    if is_RT[0] == True:
-                                        st.info('*Retweet*')
-                                    st.write(tweet_content[0])
-                                    st.write(user_info[0])
-
-                                    st.divider()
-                            elif not tweet:
-                                if mimetype[i] == 'application/json':
-                                    st.error('Tweet has been deleted.')
-                                    response = requests.get(link, timeout=5)
-                                    json_data = response.json()
-
-                                    st.json(json_data, expanded=False)
-
-                                    st.divider()
-                                if mimetype[i] == 'text/html':
-                                    st.error('Tweet has been deleted.')
-                                    st.info('IFRAME')
-                                    st.write(link)
-
-                                    st.divider()
-
-                        if only_deleted:
-                            if not tweet:
-                                return_none_count += 1
-                                attr(i)
-
-                                if mimetype[i] == 'application/json':
-                                    st.error('Tweet has been deleted.')
-                                    response = requests.get(link, timeout=5)
-                                    json_data = response.json()
-
-                                    st.json(json_data, expanded=False)
-
-                                    st.divider()
-                                if mimetype[i] == 'text/html':
-                                    st.error('Tweet has been deleted.')
-                                    st.info('IFRAME')
-                                    st.write(link)
-
-                                    st.divider()
-
-                        progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))
-
-                        if start_index <= 0:
-                            st.session_state.prev_disabled = True
-                        else:
-                            st.session_state.prev_disabled = False
-
-                        if i + 1 == count:
-                            st.session_state.next_disabled = True
-                        else:
-                            st.session_state.next_disabled = False
-                    except IndexError:
-                        if start_index <= 0:
-                            st.session_state.prev_disabled = True
-                        else:
-                            st.session_state.prev_disabled = False
+                            display_not_tweet()
+
+                    progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))
 
+                    if start_index <= 0:
+                        st.session_state.prev_disabled = True
+                    else:
+                        st.session_state.prev_disabled = False
+
+                    if i + 1 == count:
+                        st.session_state.next_disabled = True
+                    else:
+                        st.session_state.next_disabled = False
+                except IndexError:
+                    if start_index <= 0:
+                        st.session_state.prev_disabled = True
+                    else:
+                        st.session_state.prev_disabled = False
+
+                    st.session_state.next_disabled = True
 
-                prev, _ , next = st.columns([3, 4, 3])
+            prev, _ , next = st.columns([3, 4, 3])
 
-                prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
-                next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
+            prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
+            next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
 
-            if not links:
-                st.error('Unable to query the Wayback Machine API.')
+        if not links:
+            st.error('Unable to query the Wayback Machine API.')
 
     except TypeError as e:
         st.error('''
        {}.
 
        Refresh this page and try again.
        '''.format(e))

From 2a4ba380b3f6f368eb7e4b1e5bcacb0976277886 Mon Sep 17 00:00:00 2001
From: Claromes
Date: Mon, 14 Aug 2023 17:48:52 -0300
Subject: [PATCH 3/8] test screenshot

---
 README.md        |  2 ++
 app.py           | 39 ++++++++++++++++++++++++++++++-------
 requirements.txt |  2 +-
 3 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index c7cf677..ea8eaf3 100644
--- a/README.md
+++ b/README.md
@@ -60,5 +60,7 @@ Streamlit will be served at http://localhost:8501
 - [ ] Add current page to page title
 - [ ] Parse MIME type `warc/revisit`
 - [ ] Filter by period/datetime
+- [ ] Apply filters by API endpoints
+- [ ] Add contributing guidelines
 
 ## [Changelog](/CHANGELOG.md)
diff --git a/app.py b/app.py
index 299dddc..41225d0 100644
--- a/app.py
+++ b/app.py
@@ -4,7 +4,8 @@
 import streamlit.components.v1 as components
 import json
 import re
-from bs4 import BeautifulSoup
+import os
+from selenium import webdriver
 
 __version__ = '0.2'
 
@@ -92,7 +93,7 @@ def scroll_into_view():
 def embed(tweet):
     try:
         url = 'https://publish.twitter.com/oembed?url={}'.format(tweet)
-        response = requests.get(url, timeout=1)
+        response = requests.get(url)
 
         regex = r'<p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'