Skip to content

Commit

Permalink
Merge pull request #7 from claromes/display
Browse files Browse the repository at this point in the history
Display Update
  • Loading branch information
claromes authored Aug 16, 2023
2 parents 59cd087 + e156a5e commit 5dd7e00
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 94 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Tool that displays multiple archived tweets on Wayback Machine to avoid opening

## Features

- 30 embedded tweets per page
- 30 tweets per page
- Filtering by only deleted tweets

## Development
Expand All @@ -42,6 +42,8 @@ Streamlit will be served at http://localhost:8501
- [x] `only_deleted` checkbox selected for handles without deleted tweets
- [x] Pagination: set session variable on first click
- [x] Pagination: scroll to top
- [ ] `IndexError`
- [ ] Timeout error

## Roadmap

Expand All @@ -54,7 +56,11 @@ Streamlit will be served at http://localhost:8501
- [x] Changelog
- [ ] Prevent duplicate URLs
- [ ] Range size defined by user
- [ ] Hide Twitter header banner (iframe)
- [ ] `parse_links` exception
- [ ] Add current page to page title
- [ ] Parse MIME type `warc/revisit`
- [ ] Filter by period/datetime
- [ ] Apply filters by API endpoints
- [ ] Add contributing guidelines

## [Changelog](/CHANGELOG.md)
279 changes: 188 additions & 91 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import datetime
import streamlit as st
import streamlit.components.v1 as components

__version__ = '0.1.4'
import json
import re

year = datetime.datetime.now().year

Expand All @@ -20,7 +20,7 @@
Tool that displays multiple archived tweets on Wayback Machine to avoid opening each link manually.
- 30 embedded tweets per page
- 30 tweets per page
- Filtering by only deleted tweets
This tool is experimental, please feel free to send your [feedbacks](https://github.com/claromes/waybacktweets/issues).
Expand All @@ -37,6 +37,20 @@
header[data-testid="stHeader"] {
opacity: 0.5;
}
div[data-testid="stDecoration"] {
visibility: hidden;
height: 0%;
position: fixed;
}
div[data-testid="stStatusWidget"] {
visibility: hidden;
height: 0%;
position: fixed;
}
iframe {
background-color: #dddddd;
border-radius: 0.5rem;
}
</style>
'''

Expand Down Expand Up @@ -76,29 +90,70 @@ def scroll_into_view():

components.html(js, width=0, height=0)

@st.cache_data(ttl=1800, show_spinner=False)
def embed(tweet):
    """Look up a tweet through Twitter's oEmbed endpoint and extract its text.

    Args:
        tweet: URL of the tweet, interpolated as the ``url`` query parameter
            of ``https://publish.twitter.com/oembed``.

    Returns:
        On HTTP 200 or 302, a tuple
        ``(status_code, tweet_content, user_info, is_RT)`` where the last
        three items are lists built from the blockquotes found in the
        returned ``html`` field: the tweet text, the attribution text and a
        retweet flag.
        ``False`` for any other status code.
        ``None`` after a timeout has been reported with ``st.error``.
    """
    # First group: the <p> body of the blockquote; second group: the
    # attribution that follows "&mdash; " up to the closing </a>.
    regex = r'<blockquote class="twitter-tweet"><p[^>]*>(.*?)<\/p>.*?&mdash; (.*?)<\/a>'
    # Everything in the attribution that precedes the first "(".
    regex_author = r'^(.*?)\s*\('
    anchor_tags = r'<a[^>]*>|<\/a>'

    url = 'https://publish.twitter.com/oembed?url={}'.format(tweet)

    try:
        # requests never times out on its own; without an explicit timeout
        # the handler below is unreachable and a stalled connection would
        # block the app indefinitely.
        response = requests.get(url, timeout=30)
    except requests.exceptions.Timeout:
        st.error('Connection to publish.twitter.com timed out.')
        return None

    status_code = response.status_code
    if status_code not in (200, 302):
        return False

    # Decode the body once instead of once per field.
    payload = response.json()
    html = payload['html']
    author_name = payload['author_name']

    tweet_content = []
    user_info = []
    is_RT = []

    for content_html, attribution_html in re.findall(regex, html, re.DOTALL):
        # Drop anchor tags but keep their text; turn <br> into newlines.
        content = re.sub(anchor_tags, '', content_html.strip())
        content = content.replace('<br>', '\n')

        attribution = re.sub(anchor_tags, '', attribution_html.strip())
        attribution = attribution.replace(')', '), ')

        # The attribution may lack a "(...)" part; in that case the author
        # cannot be compared, so fall back to the oEmbed author (i.e. do not
        # flag a retweet) rather than crash on `None.group`.
        author_match = re.search(regex_author, attribution)
        author_tweet = author_match.group(1) if author_match else author_name

        if content:
            tweet_content.append(content)
        if attribution:
            user_info.append(attribution)

        # A blockquote attributed to someone other than the oEmbed author is
        # treated as a retweet.
        is_RT.append(author_name != author_tweet)

    return status_code, tweet_content, user_info, is_RT

@st.cache_data(ttl=1800, show_spinner=False)
def tweets_count(handle):
    """Count the Wayback Machine CDX rows for a handle's tweet URLs.

    Args:
        handle: Twitter username, interpolated into the CDX query
            ``https://twitter.com/<handle>/status/*``.

    Returns:
        The number of rows in the JSON response minus one, or ``0`` when the
        response holds at most one row.
        ``None`` when the status code is not 200, or after a timeout has been
        reported with ``st.error``.
    """
    url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json'.format(handle)

    try:
        # requests never times out on its own; without an explicit timeout
        # the handler below is unreachable and a stalled connection would
        # block the app indefinitely.
        response = requests.get(url, timeout=60)
    except requests.exceptions.Timeout:
        st.error('Connection to web.archive.org timed out.')
        return None

    if response.status_code != 200:
        return None

    data = response.json()
    if data and len(data) > 1:
        # One row is excluded from the total -- presumably the CDX
        # field-name header row; confirm against the CDX API docs.
        return len(data) - 1

    return 0


@st.cache_data(ttl=1800, show_spinner=False)
def query_api(handle, limit, offset):
Expand All @@ -107,11 +162,13 @@ def query_api(handle, limit, offset):
st.stop()

url = 'https://web.archive.org/cdx/search/cdx?url=https://twitter.com/{}/status/*&output=json&limit={}&offset={}'.format(handle, limit, offset)
response = requests.get(url)
if response.status_code == 200:
return response.json()
else:
return None
try:
response = requests.get(url)

if response.status_code == 200 or response.status_code == 304:
return response.json()
except requests.exceptions.Timeout:
st.error('Connection to web.archive.org timed out.')

@st.cache_data(ttl=1800, show_spinner=False)
def parse_links(links):
Expand Down Expand Up @@ -148,6 +205,8 @@ def attr(i):
handle = st.text_input('username', placeholder='username', label_visibility='collapsed')
query = st.button('Query', type='primary', use_container_width=True)

bar = st.empty()

if query or handle:
if handle != st.session_state.current_handle:
st.session_state.offset = 0
Expand All @@ -164,91 +223,129 @@ def attr(i):
only_deleted = st.checkbox('Only deleted tweets')

try:
with st.spinner(''):
progress = st.empty()
links = query_api(handle, tweets_per_page, st.session_state.offset)
parsed_links = parse_links(links)[0]
tweet_links = parse_links(links)[1]
mimetype = parse_links(links)[2]
timestamp = parse_links(links)[3]
bar.progress(0)
progress = st.empty()
links = query_api(handle, tweets_per_page, st.session_state.offset)

parse = parse_links(links)
parsed_links = parse[0]
tweet_links = parse[1]
mimetype = parse[2]
timestamp = parse[3]

if links:
st.divider()

st.session_state.current_handle = handle
st.session_state.current_query = query

return_none_count = 0

def prev_page():
st.session_state.offset -= tweets_per_page

#scroll to top config
st.session_state.update_component += 1
scroll_into_view()

def next_page():
st.session_state.offset += tweets_per_page

#scroll to top config
st.session_state.update_component += 1
scroll_into_view()

def display_tweet():
if is_RT[0] == True:
st.info('*Retweet*')
st.write(tweet_content[0])
st.write(user_info[0])

if links:
st.divider()

st.session_state.current_handle = handle
st.session_state.current_query = query
def display_not_tweet():
if mimetype[i] == 'application/json':
st.error('Tweet has been deleted.')
response = requests.get(link)
json_data = response.json()
json_text = response.json()['text']

st.code(json_text)
st.json(json_data, expanded=False)

st.divider()
if mimetype[i] == 'text/html':
st.error('Tweet has been deleted.')
components.iframe(link, height=500, scrolling=True)

st.divider()

return_none_count = 0
if mimetype[i] == 'warc/revisit':
st.warning('''MIME Type was not parsed.''')

def prev_page():
st.session_state.offset -= tweets_per_page
st.divider()

#scroll to top config
st.session_state.update_component += 1
scroll_into_view()
start_index = st.session_state.offset
end_index = min(count, start_index + tweets_per_page)

def next_page():
st.session_state.offset += tweets_per_page
for i in range(tweets_per_page):
try:
bar.progress((i*3) + 13)

#scroll to top config
st.session_state.update_component += 1
scroll_into_view()
link = parsed_links[i]
tweet = embed(tweet_links[i])

start_index = st.session_state.offset
end_index = min(count, start_index + tweets_per_page)
if not only_deleted:
attr(i)

for i in range(tweets_per_page):
try:
link = parsed_links[i]
tweet = embed(tweet_links[i])
if tweet:
status_code = tweet[0]
tweet_content = tweet[1]
user_info = tweet[2]
is_RT = tweet[3]

if not only_deleted:
if mimetype[i] == 'application/json':
display_tweet()

if mimetype[i] == 'text/html':
display_tweet()
elif not tweet:
display_not_tweet()

if only_deleted:
if not tweet:
return_none_count += 1
attr(i)

if tweet == None:
st.error('Tweet has been deleted.')
components.iframe(src=link, width=700, height=1000, scrolling=True)
st.divider()
else:
components.html(tweet, width=700, height=1000, scrolling=True)
st.divider()

if only_deleted:
if tweet == None:
return_none_count += 1
attr(i)

st.error('Tweet has been deleted.')
components.iframe(src=link, width=700, height=1000, scrolling=True)
st.divider()

progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))

if start_index <= 0:
st.session_state.prev_disabled = True
else:
st.session_state.prev_disabled = False

if i + 1 == count:
st.session_state.next_disabled = True
else:
st.session_state.next_disabled = False
except IndexError:
if start_index <= 0:
st.session_state.prev_disabled = True
else:
st.session_state.prev_disabled = False
display_not_tweet()

progress.write('{} URLs have been captured in the range {}-{}'.format(return_none_count, start_index, end_index))

if start_index <= 0:
st.session_state.prev_disabled = True
else:
st.session_state.prev_disabled = False

if i + 1 == count:
st.session_state.next_disabled = True
else:
st.session_state.next_disabled = False
# TODO
except IndexError:
if start_index <= 0:
st.session_state.prev_disabled = True
else:
st.session_state.prev_disabled = False

st.session_state.next_disabled = True

prev, _ , next = st.columns([3, 4, 3])
prev, _ , next = st.columns([3, 4, 3])

prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)
prev.button('Previous', disabled=st.session_state.prev_disabled, key='prev_button_key', on_click=prev_page, type='primary', use_container_width=True)
next.button('Next', disabled=st.session_state.next_disabled, key='next_button_key', on_click=next_page, type='primary', use_container_width=True)

if not links:
st.error('Unable to query the Wayback Machine API.')
if not links:
st.error('Unable to query the Wayback Machine API.')
except TypeError as e:
st.error('''
{}. Refresh this page and try again.
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
requests==2.30.0
streamlit==1.23.1
streamlit==1.25.0

0 comments on commit 5dd7e00

Please sign in to comment.