Dev #225

Merged: 5 commits, Oct 20, 2024
20 changes: 15 additions & 5 deletions .env.example
@@ -1,13 +1,23 @@
+# protest data
 ACLED_EMAIL=
 ACLED_KEY=
+# media data
 MEDIACLOUD_API_TOKEN=
 ZENROWS_API_KEY=
+# google trends data
+DATAFORSEO_EMAIL=
+DATAFORSEO_PASSWORD=
+# tiktok data
+RAPIDAPI_KEY=
+# bundestag data
+BUNDESTAG_API_KEY=
+# ai
 AZURE_API_BASE=
 AZURE_API_VERSION=
 AZURE_API_KEY=
-BUNDESTAG_API_KEY=
-DATAFORSEO_EMAIL=
-DATAFORSEO_PASSWORD=
-PORT=
+# logging
 SENTRY_DSN=
-AI_TREND_RESOLUTION=0.01 # fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+# port where the server should run
+PORT=
+# fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+AI_TREND_RESOLUTION=0.01
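
For reference, a minimal sketch of how these variables reach the backend, assuming python-dotenv populates the process environment (the repository's actual loading lives in backend-python/media_impact_monitor/util/env.py, shown at the bottom of this diff):

```python
from os import environ

from dotenv import load_dotenv

load_dotenv()  # copies .env entries into os.environ (already-set variables win)
RAPIDAPI_KEY = environ["RAPIDAPI_KEY"]  # hard failure if the new key is missing
AI_TREND_RESOLUTION = float(environ.get("AI_TREND_RESOLUTION", 0.01))  # optional, with default
```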
2 changes: 2 additions & 0 deletions .github/workflows/deploy.yml
@@ -66,3 +66,5 @@ jobs:
           DATAFORSEO_PASSWORD: ${{ secrets.DATAFORSEO_PASSWORD }}
           BUNDESTAG_API_KEY: ${{ secrets.BUNDESTAG_API_KEY }}
           SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
+          RAPIDAPI_KEY: ${{ secrets.RAPIDAPI_KEY }}
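
For the new env line above to resolve, RAPIDAPI_KEY must also be configured as a repository secret, e.g. via the GitHub CLI: `gh secret set RAPIDAPI_KEY`.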

2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/cron.py
@@ -51,7 +51,7 @@ def fill_cache():
             )
         except Exception as e:
             errors.append(f"events {data_source}: {e}")
-    for media_source in ["news_online", "news_print", "web_google"]:
+    for media_source in ["news_online", "news_print", "social_tiktok", "web_google"]:
         for trend_type in ["keywords", "sentiment"]:
             for aggregation in ["daily", "weekly"]:
                 if aggregation == "daily" and media_source == "web_google":
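
To make the effect of this one-line change concrete, here is a standalone sketch (not the real fill_cache) of the trend-job matrix that the cron now iterates over, including the new social_tiktok source:

```python
from itertools import product

# Sketch only: fill_cache also fetches each trend and writes it to the cache.
media_sources = ["news_online", "news_print", "social_tiktok", "web_google"]
for media_source, trend_type, aggregation in product(
    media_sources, ["keywords", "sentiment"], ["daily", "weekly"]
):
    if aggregation == "daily" and media_source == "web_google":
        continue  # same guard as in fill_cache
    print(media_source, trend_type, aggregation)
```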
171 changes: 171 additions & 0 deletions backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py (new file)
@@ -0,0 +1,171 @@
import re
from collections import Counter
from datetime import datetime
from typing import Any

import pandas as pd
from tqdm.auto import tqdm

from media_impact_monitor.util.cache import get
from media_impact_monitor.util.env import RAPIDAPI_KEY

headers = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
}


def get_videos_for_keywords(
    keywords: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get videos for a given set of keywords.
    Problem: This returns max ~150 videos, even for very popular keywords.
    Use hashtag query to get more videos.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
    query = {
        "keywords": keywords,
        "region": "us",  # location of the proxy server
        "count": 30,  # max: 30
        "cursor": cursor,
        "publish_time": "0",  # 0 - all; 1 - past 24 hours; 7 - this week; 30 - this month; 90 - last 3 months; 180 - last 6 months
        "sort_type": "0",  # 0 - relevance; 1 - like count; 3 - date posted
    }
    response = get(url, headers=headers, params=query)
    # print(response.json())
    data = response.json()["data"]
    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        videos.extend(get_videos_for_keywords(keywords=keywords, n=n, cursor=cursor))
    return videos


def get_hashtag_suggestions(keywords: str) -> Counter:
    videos = get_videos_for_keywords(keywords, n=100)
    titles = [video["title"] for video in videos]
    hashtags = [re.findall(r"#(\w+)", title) for title in titles]
    hashtags = [item for sublist in hashtags for item in sublist]
    hashtag_counts = Counter(hashtags)
    return hashtag_counts


def get_hashtag_id(hashtag: str) -> str:
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
    querystring = {
        "challenge_name": hashtag,
    }
    response = get(url, headers=headers, params=querystring)
    return response.json()["data"]["id"]


def get_videos_for_hashtag_id(
    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
    query = {
        "challenge_id": hashtag_id,
        "count": 20,  # max: 20
        "cursor": cursor,
    }
    response = get(url, headers=headers, params=query)
    data = response.json()["data"]
    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        if verbose:
            print(cursor)
        videos.extend(
            get_videos_for_hashtag_id(
                hashtag_id=hashtag_id, n=n, cursor=cursor, verbose=verbose
            )
        )
    return videos


def get_videos_for_hashtag(
    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    hashtag_id = get_hashtag_id(hashtag)
    return get_videos_for_hashtag_id(hashtag_id, n=n, cursor=cursor, verbose=verbose)


def get_video_history_for_hashtag(
    hashtag: str, n: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get video history for a hashtag.
    Returns a time series of views and posts.
    Views are computed by summing the views of all videos that were posted on a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`).
    """
    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
    df = pd.DataFrame(
        {
            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
            "id": [video["video_id"] for video in videos],
            "title": [video["title"] for video in videos],
            "views": [video["play_count"] for video in videos],
        }
    )
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    ts = (
        df.resample("1D", on="date")
        .agg(
            {
                "views": "sum",
                "id": "count",
            }
        )
        .rename(columns={"id": "posts"})
    )
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts


def get_comments_for_video(
    video_id: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
    query = {
        "url": video_id,
        "count": 50,  # max: 50 (?)
        "cursor": cursor,
    }
    response = get(url, headers=headers, params=query)
    data = response.json()["data"]
    comments, cursor, has_more = data["comments"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        comments.extend(get_comments_for_video(video_id, n=n, cursor=cursor))
    return comments


def get_comment_history_for_hashtag(
    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
) -> pd.DataFrame:
    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
    comments = [
        get_comments_for_video(video["video_id"], n=n_comments)
        for video in tqdm(videos)
        if video["comment_count"] > 0
    ]
    comments = [comment for video_comments in comments for comment in video_comments]
    comments_df = pd.DataFrame(
        {
            "date": [
                datetime.fromtimestamp(comment["create_time"]) for comment in comments
            ],
            "text": [comment["text"] for comment in comments],
            "video_id": [comment["video_id"] for comment in comments],
        }
    )
    ts = (
        comments_df.resample("1W", on="date")
        .agg(
            {
                "text": "count",
            }
        )
        .rename(columns={"text": "comments"})
    )
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts
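
A minimal usage sketch for the new loader, assuming a valid RAPIDAPI_KEY in the environment (the n values are illustrative):

```python
from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_hashtag_suggestions,
    get_video_history_for_hashtag,
)

# Find candidate hashtags for a topic...
print(get_hashtag_suggestions("climate change").most_common(10))

# ...then pull a daily time series. Per the docstring above, prefer "posts"
# over "views", since views are attributed to the posting date.
ts = get_video_history_for_hashtag("climatechange", n=500, verbose=False)
print(ts["posts"].tail())
```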
111 changes: 111 additions & 0 deletions (new test module for the TikTok data loader)
@@ -0,0 +1,111 @@
import pytest
import pandas as pd
from datetime import datetime, timedelta
from collections import Counter
from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_videos_for_keywords,
    get_hashtag_suggestions,
    get_hashtag_id,
    get_videos_for_hashtag_id,
    get_videos_for_hashtag,
    get_video_history_for_hashtag,
    get_comments_for_video,
    get_comment_history_for_hashtag,
)


@pytest.mark.slow
def test_get_videos_for_keywords():
    videos = get_videos_for_keywords("climate change", n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_hashtag_suggestions():
    suggestions = get_hashtag_suggestions("climate change")
    assert len(suggestions) > 0
    assert isinstance(suggestions, Counter)


@pytest.mark.slow
def test_get_hashtag_id():
    hashtag_id = get_hashtag_id("climatechange")
    assert isinstance(hashtag_id, str)
    assert len(hashtag_id) > 0


@pytest.mark.slow
def test_get_videos_for_hashtag_id():
    hashtag_id = get_hashtag_id("climatechange")
    videos = get_videos_for_hashtag_id(hashtag_id, n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_videos_for_hashtag():
    videos = get_videos_for_hashtag("climatechange", n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_video_history_for_hashtag():
    history = get_video_history_for_hashtag("climatechange", n=100)
    assert isinstance(history, pd.DataFrame)
    assert len(history) > 0
    assert "views" in history.columns
    assert "posts" in history.columns


@pytest.mark.slow
def test_get_comments_for_video():
    videos = get_videos_for_hashtag("climatechange", n=1)
    video_id = videos[0]["video_id"]
    comments = get_comments_for_video(video_id, n=50)
    assert len(comments) > 0
    assert isinstance(comments[0], dict)
    assert "text" in comments[0]


@pytest.mark.slow
def test_get_comment_history_for_hashtag():
    history = get_comment_history_for_hashtag(
        "climatechange", n_posts=10, n_comments=10
    )
    assert isinstance(history, pd.DataFrame)
    assert len(history) > 0
    assert "comments" in history.columns


@pytest.mark.slow
def test_data_freshness():
    videos = get_videos_for_hashtag("climatechange", n=50)
    latest_video_date = max(
        datetime.fromtimestamp(video["create_time"]) for video in videos
    )
    assert latest_video_date >= datetime.now() - timedelta(
        days=7
    ), "No recent videos found"


@pytest.mark.slow
def test_video_content():
    videos = get_videos_for_keywords("climate change", n=50)
    climate_related_words = [
        "climate",
        "environment",
        "global warming",
        "sustainability",
    ]
    assert any(
        any(word in video["title"].lower() for word in climate_related_words)
        for video in videos
    ), "No climate-related content found in video titles"
14 changes: 13 additions & 1 deletion backend-python/media_impact_monitor/trends/keyword_trend.py
@@ -5,6 +5,9 @@
     get_mediacloud_counts,
 )
 from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts
+from media_impact_monitor.data_loaders.social_media.tiktok import (
+    get_video_history_for_hashtag,
+)
 from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts
 from media_impact_monitor.types_ import TrendSearch
 from media_impact_monitor.util.paths import src
@@ -31,6 +34,8 @@ def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
                 )
             case "web_google":
                 ds = get_google_trends_counts(query=query, end_date=q.end_date)
+            case "social_tiktok":
+                ds = get_video_history_for_hashtag(query, n=1000, verbose=True)["posts"]
             case _:
                 raise ValueError(f"Unsupported media source: {q.media_source}")
         dss[topic] = ds
@@ -65,7 +70,14 @@ def topic_queries(media_source: str) -> dict[str, str]:
         # media_source,
         # ),
     }
-    if media_source != "web_google":
+    if media_source == "social_tiktok":
+        keyword_queries = {
+            "climate activism": "climateprotest",  # TODO: improve
+            "climate policy": "climateaction",  # TODO: improve
+            "climate science": "climatechange",  # TODO: improve
+            "climate crisis framing": "climatecrisis",  # TODO: improve
+        }
+    elif media_source != "web_google":
         keyword_queries["climate activism"] = xs_with_ys(
             keywords["climate_science"]
             + keywords["climate_policy"]
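
A sketch of what the new social_tiktok branch produces, as hypothetical driver code (the real flow goes through get_keyword_trend and topic_queries):

```python
import pandas as pd

from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_video_history_for_hashtag,
)

# Mirrors the hardcoded topic-to-hashtag mapping above (marked TODO in the PR).
hashtag_by_topic = {
    "climate activism": "climateprotest",
    "climate science": "climatechange",
}
dss = {
    topic: get_video_history_for_hashtag(tag, n=1000, verbose=False)["posts"]
    for topic, tag in hashtag_by_topic.items()
}
df = pd.DataFrame(dss)  # one column of daily post counts per topic
```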
2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/types_.py
@@ -9,7 +9,7 @@
 # FIXME: consider renaming "Topic" to "Issue" to avoid confusion with topics within the issue (like science or policy)
 Topic = Literal["climate_change"]
 Query = str  # for now, just a single keyword
-MediaSource = Literal["news_online", "news_print", "web_google"]
+MediaSource = Literal["news_online", "news_print", "web_google", "social_tiktok"]

 StartDateField = Field(
     default=date(2020, 1, 1),
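
A small illustration of what widening the MediaSource literal buys, using a hypothetical DemoSearch stand-in for the real TrendSearch (assuming pydantic, which the Field usage above suggests):

```python
from typing import Literal

from pydantic import BaseModel, ValidationError

MediaSource = Literal["news_online", "news_print", "web_google", "social_tiktok"]


class DemoSearch(BaseModel):  # hypothetical stand-in for TrendSearch
    media_source: MediaSource


DemoSearch(media_source="social_tiktok")  # now accepted
try:
    DemoSearch(media_source="social_twitter")
except ValidationError as e:
    print(e)  # rejected: not one of the four allowed literals
```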
2 changes: 2 additions & 0 deletions backend-python/media_impact_monitor/util/env.py
@@ -18,6 +18,7 @@
 DATAFORSEO_PASSWORD = environ["DATAFORSEO_PASSWORD"]
 BUNDESTAG_API_KEY = environ["BUNDESTAG_API_KEY"]
 SENTRY_DSN = environ["SENTRY_DSN"]
+RAPIDAPI_KEY = environ["RAPIDAPI_KEY"]
 AI_TREND_RESOLUTION = float(environ.get("AI_TREND_RESOLUTION", 0.01))

 assert ACLED_EMAIL
@@ -31,3 +32,4 @@
 assert DATAFORSEO_PASSWORD
 assert BUNDESTAG_API_KEY
 assert SENTRY_DSN
+assert RAPIDAPI_KEY