From 1ded0c6a12d6521af8dd64d39d4d26deb270267b Mon Sep 17 00:00:00 2001
From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com>
Date: Sun, 20 Oct 2024 23:35:08 +0200
Subject: [PATCH 1/5] feat(data_loaders): add TikTok

---
 .env.example                                  |  20 +-
 .../data_loaders/social_media/tiktok.py       | 226 ++++++++++++++++++
 .../data_loaders/social_media/tiktok_test.py  | 111 +++++++++
 .../media_impact_monitor/util/env.py          |   2 +
 4 files changed, 354 insertions(+), 5 deletions(-)
 create mode 100644 backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py
 create mode 100644 backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py

diff --git a/.env.example b/.env.example
index 3676f42..367ede4 100644
--- a/.env.example
+++ b/.env.example
@@ -1,13 +1,23 @@
+# protest data
 ACLED_EMAIL=
 ACLED_KEY=
+# media data
 MEDIACLOUD_API_TOKEN=
 ZENROWS_API_KEY=
+# google trends data
+DATAFORSEO_EMAIL=
+DATAFORSEO_PASSWORD=
+# tiktok data
+RAPIDAPI_KEY=
+# bundestag data
+BUNDESTAG_API_KEY=
+# ai
 AZURE_API_BASE=
 AZURE_API_VERSION=
 AZURE_API_KEY=
-BUNDESTAG_API_KEY=
-DATAFORSEO_EMAIL=
-DATAFORSEO_PASSWORD=
-PORT=
+# logging
 SENTRY_DSN=
-AI_TREND_RESOLUTION=0.01 # fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+# port where the server should run
+PORT=
+# fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+AI_TREND_RESOLUTION=0.01
diff --git a/backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py
new file mode 100644
index 0000000..9935a18
--- /dev/null
+++ b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py
@@ -0,0 +1,226 @@
+import re
+from collections import Counter
+from datetime import datetime
+from typing import Any
+
+import pandas as pd
+from tqdm.auto import tqdm
+
+from media_impact_monitor.util.cache import get
+from media_impact_monitor.util.env import RAPIDAPI_KEY
+
+# Headers sent with every request to the RapidAPI "tiktok-scraper7" service.
+headers = {
+    "x-rapidapi-key": RAPIDAPI_KEY,
+    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
+}
+
+
+def get_videos_for_keywords(
+    keywords: str, n: int, cursor: int = 0
+) -> list[dict[str, Any]]:
+    """
+    Get videos for a given set of keywords.
+    Problem: This returns max ~150 videos, even for very popular keywords.
+    Use hashtag query to get more videos.
+
+    Result pages are fetched recursively, starting at `cursor`, until the API
+    reports no further results or the cursor it returns reaches `n`. Pages are
+    never truncated, so more than `n` videos may be returned.
+    """
+    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
+    query = {
+        "keywords": keywords,
+        "region": "us",  # location of the proxy server
+        "count": 30,  # max: 30
+        "cursor": cursor,
+        "publish_time": "0",  # 0 - ALL 1 - Past 24 hours 7 - This week 30 - This month 90 - Last 3 months 180 - Last 6 months
+        "sort_type": "0",  # 0 - Relevance 1 - Like count 3 - Date posted
+    }
+    response = get(url, headers=headers, params=query)
+    data = response.json()["data"]
+    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
+    if has_more and cursor < n:
+        videos.extend(get_videos_for_keywords(keywords=keywords, n=n, cursor=cursor))
+    return videos
+
+
+def get_hashtag_suggestions(keywords: str) -> Counter:
+    """
+    Count the hashtags (`#word`) in the titles of videos found for `keywords`.
+    The videos are those returned by `get_videos_for_keywords(keywords, n=100)`.
+    """
+    videos = get_videos_for_keywords(keywords, n=100)
+    titles = [video["title"] for video in videos]
+    hashtags = [re.findall(r"#(\w+)", title) for title in titles]
+    hashtags = [item for sublist in hashtags for item in sublist]
+    hashtag_counts = Counter(hashtags)
+    return hashtag_counts
+
+
+def get_hashtag_id(hashtag: str) -> str:
+    """
+    Look up the ID of a hashtag (a "challenge" in the API's terms) by its name.
+    The ID is read from the `data.id` field of the response.
+    """
+    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
+    querystring = {
+        "challenge_name": hashtag,
+    }
+    response = get(url, headers=headers, params=querystring)
+    return response.json()["data"]["id"]
+
+
+def get_videos_for_hashtag_id(
+    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
+) -> list[dict[str, Any]]:
+    """
+    Get videos posted under the hashtag with the given ID.
+
+    Pagination works as in `get_videos_for_keywords`: pages are fetched until
+    the API reports no further results or the returned cursor reaches `n`, and
+    more than `n` videos may be returned. If `verbose`, the cursor is printed
+    before each additional page is requested.
+    """
+    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
+    query = {
+        "challenge_id": hashtag_id,
+        "count": 20,  # max: 20
+        "cursor": cursor,
+    }
+    response = get(url, headers=headers, params=query)
+    data = response.json()["data"]
+    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
+    if has_more and cursor < n:
+        if verbose:
+            print(cursor)
+        videos.extend(
+            get_videos_for_hashtag_id(
+                hashtag_id=hashtag_id, n=n, cursor=cursor, verbose=verbose
+            )
+        )
+    return videos
+
+
+def get_videos_for_hashtag(
+    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
+) -> list[dict[str, Any]]:
+    """
+    Get videos posted under a hashtag given by name.
+    Resolves the name with `get_hashtag_id`, then calls `get_videos_for_hashtag_id`.
+    """
+    hashtag_id = get_hashtag_id(hashtag)
+    return get_videos_for_hashtag_id(hashtag_id, n=n, cursor=cursor, verbose=verbose)
+
+
+def get_video_history_for_hashtag(
+    hashtag: str, n: int, verbose: bool = True
+) -> pd.DataFrame:
+    """
+    Get video history for a hashtag.
+    Returns a time series of views and posts.
+    Views are computed by summing the views of all videos that were posted in a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`).
+
+    The result has one row per day between the first and the last posting date.
+    Days are determined with `datetime.fromtimestamp`, i.e. in local time.
+    If no videos are found, an empty frame with the same columns is returned.
+    """
+    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
+    if not videos:
+        # No dates to build a range from (the bounds below would be NaT).
+        return pd.DataFrame(0, index=pd.DatetimeIndex([]), columns=["views", "posts"])
+    df = pd.DataFrame(
+        {
+            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
+            "id": [video["video_id"] for video in videos],
+            "title": [video["title"] for video in videos],
+            "views": [video["play_count"] for video in videos],
+        }
+    )
+    df["date"] = pd.to_datetime(df["date"])
+    df = df.sort_values("date")
+    ts = (
+        df.resample("1D", on="date")
+        .agg(
+            {
+                "views": "sum",
+                "id": "count",
+            }
+        )
+        .rename(columns={"id": "posts"})
+    )
+    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
+    return ts
+
+
+def get_comments_for_video(
+    video_id: str, n: int, cursor: int = 0
+) -> list[dict[str, Any]]:
+    """
+    Get comments on a video.
+
+    The video ID is sent as the `url` parameter of the request. Pages are fetched
+    until the API reports no further results or the returned cursor reaches `n`;
+    more than `n` comments may be returned.
+    """
+    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
+    query = {
+        "url": video_id,
+        "count": 50,  # max: 50 (?)
+        "cursor": cursor,
+    }
+    response = get(url, headers=headers, params=query)
+    data = response.json()["data"]
+    comments, cursor, has_more = data["comments"], data["cursor"], data["hasMore"]
+    if has_more and cursor < n:
+        comments.extend(get_comments_for_video(video_id, n=n, cursor=cursor))
+    return comments
+
+
+def get_comment_history_for_hashtag(
+    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
+) -> pd.DataFrame:
+    """
+    Get comment history for a hashtag.
+    Returns a weekly time series with the number of comments written on videos
+    posted under the hashtag. Videos without comments are not queried.
+
+    `n_posts` limits the videos and `n_comments` the comments per video (both
+    approximately, since whole pages are fetched). Weeks are determined with
+    `datetime.fromtimestamp`, i.e. in local time. If no comments are found, an
+    empty frame with a `comments` column is returned.
+    """
+    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
+    comments = [
+        get_comments_for_video(video["video_id"], n=n_comments)
+        for video in tqdm(videos)
+        if video["comment_count"] > 0
+    ]
+    comments = [comment for video_comments in comments for comment in video_comments]
+    if not comments:
+        # No dates to resample (an empty `date` column is not datetime-typed).
+        return pd.DataFrame(0, index=pd.DatetimeIndex([]), columns=["comments"])
+    comments_df = pd.DataFrame(
+        {
+            "date": [
+                datetime.fromtimestamp(comment["create_time"]) for comment in comments
+            ],
+            "text": [comment["text"] for comment in comments],
+            "video_id": [comment["video_id"] for comment in comments],
+        }
+    )
+    ts = (
+        comments_df.resample("1W", on="date")
+        .agg(
+            {
+                "text": "count",
+            }
+        )
+        .rename(columns={"text": "comments"})
+    )
+    # Reindex on the weekly grid produced by the resampling. The default (daily)
+    # frequency would pad every week with six rows of zero comments.
+    ts = ts.reindex(
+        pd.date_range(start=ts.index.min(), end=ts.index.max(), freq="W")
+    ).fillna(0)
+    return ts
diff --git a/backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py
new file mode 100644
index 0000000..25ac282
--- /dev/null
+++ b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py
@@ -0,0 +1,111 @@
+import pytest
+import pandas as pd
+from datetime import datetime, timedelta
+from collections import Counter
+from media_impact_monitor.data_loaders.social_media.tiktok import (
+    get_videos_for_keywords,
+    get_hashtag_suggestions,
+    get_hashtag_id,
+    get_videos_for_hashtag_id,
+    get_videos_for_hashtag,
+    get_video_history_for_hashtag,
+    get_comments_for_video,
+    get_comment_history_for_hashtag,
+)
+
+
+@pytest.mark.slow
+def test_get_videos_for_keywords():
+ videos = get_videos_for_keywords("climate change", n=50) + assert len(videos) > 0 + assert isinstance(videos[0], dict) + assert "title" in videos[0] + assert "video_id" in videos[0] + + +@pytest.mark.slow +def test_get_hashtag_suggestions(): + suggestions = get_hashtag_suggestions("climate change") + assert len(suggestions) > 0 + assert isinstance(suggestions, Counter) + + +@pytest.mark.slow +def test_get_hashtag_id(): + hashtag_id = get_hashtag_id("climatechange") + assert isinstance(hashtag_id, str) + assert len(hashtag_id) > 0 + + +@pytest.mark.slow +def test_get_videos_for_hashtag_id(): + hashtag_id = get_hashtag_id("climatechange") + videos = get_videos_for_hashtag_id(hashtag_id, n=50) + assert len(videos) > 0 + assert isinstance(videos[0], dict) + assert "title" in videos[0] + assert "video_id" in videos[0] + + +@pytest.mark.slow +def test_get_videos_for_hashtag(): + videos = get_videos_for_hashtag("climatechange", n=50) + assert len(videos) > 0 + assert isinstance(videos[0], dict) + assert "title" in videos[0] + assert "video_id" in videos[0] + + +@pytest.mark.slow +def test_get_video_history_for_hashtag(): + history = get_video_history_for_hashtag("climatechange", n=100) + assert isinstance(history, pd.DataFrame) + assert len(history) > 0 + assert "views" in history.columns + assert "posts" in history.columns + + +@pytest.mark.slow +def test_get_comments_for_video(): + videos = get_videos_for_hashtag("climatechange", n=1) + video_id = videos[0]["video_id"] + comments = get_comments_for_video(video_id, n=50) + assert len(comments) > 0 + assert isinstance(comments[0], dict) + assert "text" in comments[0] + + +@pytest.mark.slow +def test_get_comment_history_for_hashtag(): + history = get_comment_history_for_hashtag( + "climatechange", n_posts=10, n_comments=10 + ) + assert isinstance(history, pd.DataFrame) + assert len(history) > 0 + assert "comments" in history.columns + + +@pytest.mark.slow +def test_data_freshness(): + videos = 
get_videos_for_hashtag("climatechange", n=50) + latest_video_date = max( + datetime.fromtimestamp(video["create_time"]) for video in videos + ) + assert latest_video_date >= datetime.now() - timedelta( + days=7 + ), "No recent videos found" + + +@pytest.mark.slow +def test_video_content(): + videos = get_videos_for_keywords("climate change", n=50) + climate_related_words = [ + "climate", + "environment", + "global warming", + "sustainability", + ] + assert any( + any(word in video["title"].lower() for word in climate_related_words) + for video in videos + ), "No climate-related content found in video titles" diff --git a/backend-python/media_impact_monitor/util/env.py b/backend-python/media_impact_monitor/util/env.py index 3363b48..588c216 100644 --- a/backend-python/media_impact_monitor/util/env.py +++ b/backend-python/media_impact_monitor/util/env.py @@ -18,6 +18,7 @@ DATAFORSEO_PASSWORD = environ["DATAFORSEO_PASSWORD"] BUNDESTAG_API_KEY = environ["BUNDESTAG_API_KEY"] SENTRY_DSN = environ["SENTRY_DSN"] +RAPIDAPI_KEY = environ["RAPIDAPI_KEY"] AI_TREND_RESOLUTION = float(environ.get("AI_TREND_RESOLUTION", 0.01)) assert ACLED_EMAIL @@ -31,3 +32,4 @@ assert DATAFORSEO_PASSWORD assert BUNDESTAG_API_KEY assert SENTRY_DSN +assert RAPIDAPI_KEY From 06af85447ea1549ff4ecf2d9e8e6022819877e1b Mon Sep 17 00:00:00 2001 From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com> Date: Mon, 21 Oct 2024 00:15:06 +0200 Subject: [PATCH 2/5] feat: integrate TikTok into API and frontend --- .../trends/keyword_trend.py | 14 +++++++++- backend-python/media_impact_monitor/types_.py | 2 +- .../src/components/DataSourceSelect.tsx | 8 ++++++ frontend-nextjs/src/stores/filtersStore.ts | 4 +-- frontend-nextjs/src/utility/textUtil.tsx | 26 ++++++++++++++++--- 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/backend-python/media_impact_monitor/trends/keyword_trend.py b/backend-python/media_impact_monitor/trends/keyword_trend.py index 227defa..acdae99 100644 --- 
a/backend-python/media_impact_monitor/trends/keyword_trend.py +++ b/backend-python/media_impact_monitor/trends/keyword_trend.py @@ -5,6 +5,9 @@ get_mediacloud_counts, ) from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts +from media_impact_monitor.data_loaders.social_media.tiktok import ( + get_video_history_for_hashtag, +) from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts from media_impact_monitor.types_ import TrendSearch from media_impact_monitor.util.paths import src @@ -31,6 +34,8 @@ def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]: ) case "web_google": ds = get_google_trends_counts(query=query, end_date=q.end_date) + case "social_tiktok": + ds = get_video_history_for_hashtag(query, n=1000, verbose=True)["posts"] case _: raise ValueError(f"Unsupported media source: {q.media_source}") dss[topic] = ds @@ -65,7 +70,14 @@ def topic_queries(media_source: str) -> dict[str, str]: # media_source, # ), } - if media_source != "web_google": + if media_source == "social_tiktok": + keyword_queries = { + "climate activism": "climateprotest", # TODO: improve + "climate policy": "climateaction", # TODO: improve + "climate science": "climatechange", # TODO: improve + "climate crisis framing": "climatecrisis", # TODO: improve + } + elif media_source != "web_google": keyword_queries["climate activism"] = xs_with_ys( keywords["climate_science"] + keywords["climate_policy"] diff --git a/backend-python/media_impact_monitor/types_.py b/backend-python/media_impact_monitor/types_.py index 353d762..39f65d8 100644 --- a/backend-python/media_impact_monitor/types_.py +++ b/backend-python/media_impact_monitor/types_.py @@ -9,7 +9,7 @@ # FIXME: consider renaming "Topic" to "Issue" to avoid confusion with topics within the issue (like science or policy) Topic = Literal["climate_change"] Query = str # for now, just a single keyword -MediaSource = Literal["news_online", "news_print", 
"web_google"] +MediaSource = Literal["news_online", "news_print", "web_google", "social_tiktok"] StartDateField = Field( default=date(2020, 1, 1), diff --git a/frontend-nextjs/src/components/DataSourceSelect.tsx b/frontend-nextjs/src/components/DataSourceSelect.tsx index 0fced52..509700c 100644 --- a/frontend-nextjs/src/components/DataSourceSelect.tsx +++ b/frontend-nextjs/src/components/DataSourceSelect.tsx @@ -23,6 +23,7 @@ import { type LucideIcon, NewspaperIcon, SearchIcon, + Music2Icon } from 'lucide-react' import { useMemo, useState } from 'react' @@ -52,6 +53,13 @@ const options: OptionType[] = [ description: texts.filters.mediaSource.values.printNews.description, links: texts.filters.mediaSource.values.printNews.links, }, + { + name: texts.filters.mediaSource.values.tiktok.name, + value: 'social_tiktok', + Icon: Music2Icon, + description: texts.filters.mediaSource.values.tiktok.description, + links: texts.filters.mediaSource.values.tiktok.links, + }, { name: texts.filters.mediaSource.values.googleTrends.name, value: 'web_google', diff --git a/frontend-nextjs/src/stores/filtersStore.ts b/frontend-nextjs/src/stores/filtersStore.ts index fc096f6..60f1101 100644 --- a/frontend-nextjs/src/stores/filtersStore.ts +++ b/frontend-nextjs/src/stores/filtersStore.ts @@ -20,7 +20,7 @@ import { } from "zustand/middleware"; -export type MediaSourceType = "news_online" | "news_print" | "web_google"; +export type MediaSourceType = "news_online" | "news_print" | "social_tiktok" | "web_google"; export type FiltersState = { from: Date; @@ -80,7 +80,7 @@ const getFiltersZodSchema = (today: Date) => { .boolean() .default(defaultInitState.isDefaultTimeRange), organizers: z.array(z.string()).default(defaultInitState.organizers), - mediaSource: z.enum(["news_online", "news_print", "web_google"]), + mediaSource: z.enum(["news_online", "news_print", "social_tiktok", "web_google"]), }) .default(defaultInitState); } diff --git a/frontend-nextjs/src/utility/textUtil.tsx 
b/frontend-nextjs/src/utility/textUtil.tsx index c6cbacb..cb1881b 100644 --- a/frontend-nextjs/src/utility/textUtil.tsx +++ b/frontend-nextjs/src/utility/textUtil.tsx @@ -141,7 +141,7 @@ const textsEnGB = { values: { onlineNews: { name: 'Online News', - description: 'Articles from online news pages.', + description: 'Articles in online news pages.', links: [ { label: 'Official Website', @@ -151,7 +151,7 @@ const textsEnGB = { }, printNews: { name: 'Print News', - description: 'Articles from print newspapers.', + description: 'Articles in print newspapers.', links: [ { label: 'Official Website', @@ -163,9 +163,19 @@ const textsEnGB = { }, ], }, + tiktok: { + name: 'TikTok', + description: 'Video posts on TikTok.', + links: [ + { + label: 'Official Website', + href: 'https://www.tiktok.com/', + }, + ], + }, googleTrends: { name: 'Google Trends', - description: 'Search trends from Google.', + description: 'Search trends on Google.', links: [ { label: 'Official Website', @@ -776,6 +786,16 @@ const textsXXX = { }, ], }, + tiktok: { + name: 'XXX', + description: 'XXX', + links: [ + { + label: 'XXX', + href: 'XXX', + }, + ], + }, googleTrends: { name: 'XXX', description: 'XXX', From d95ad4185eb5b7e09bd3389332fe7af5603496bd Mon Sep 17 00:00:00 2001 From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com> Date: Mon, 21 Oct 2024 00:18:42 +0200 Subject: [PATCH 3/5] fix(tiktok): add RAPIDAPI key env var to CI --- .github/workflows/deploy.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 42a61c9..971a217 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -66,3 +66,5 @@ jobs: DATAFORSEO_PASSWORD: ${{ secrets.DATAFORSEO_PASSWORD }} BUNDESTAG_API_KEY: ${{ secrets.BUNDESTAG_API_KEY }} SENTRY_DSN: ${{ secrets.SENTRY_DSN }} + RAPIDAPI_KEY: ${{ secrets.RAPIDAPI_KEY }} + From 27928f909ae120dc4216a6c6d5df0a13e6e1f0c4 Mon Sep 17 00:00:00 2001 From: David Pomerenke 
<46022183+davidpomerenke@users.noreply.github.com> Date: Mon, 21 Oct 2024 00:22:23 +0200 Subject: [PATCH 4/5] fix(cron.py): add TikTok to cronjob --- backend-python/media_impact_monitor/cron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend-python/media_impact_monitor/cron.py b/backend-python/media_impact_monitor/cron.py index a1786c0..f69e4a2 100644 --- a/backend-python/media_impact_monitor/cron.py +++ b/backend-python/media_impact_monitor/cron.py @@ -51,7 +51,7 @@ def fill_cache(): ) except Exception as e: errors.append(f"events {data_source}: {e}") - for media_source in ["news_online", "news_print", "web_google"]: + for media_source in ["news_online", "news_print", "social_tiktok", "web_google"]: for trend_type in ["keywords", "sentiment"]: for aggregation in ["daily", "weekly"]: if aggregation == "daily" and media_source == "web_google": From 53f2000720cc41b7a178f96f32e6bb56b49baa42 Mon Sep 17 00:00:00 2001 From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com> Date: Mon, 21 Oct 2024 00:49:50 +0200 Subject: [PATCH 5/5] fix(frontend): modify media source type in more places --- frontend-nextjs/src/utility/eventMediaUtil.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend-nextjs/src/utility/eventMediaUtil.ts b/frontend-nextjs/src/utility/eventMediaUtil.ts index 017ea9e..1cde023 100644 --- a/frontend-nextjs/src/utility/eventMediaUtil.ts +++ b/frontend-nextjs/src/utility/eventMediaUtil.ts @@ -9,7 +9,7 @@ import { today } from "./today"; import { formatZodError } from "./zodUtil"; export const eventMediaInputQueryZodSchema = z.object({ - mediaSource: z.enum(["news_online", "news_print", "web_google"]), + mediaSource: z.enum(["news_online", "news_print", "social_tiktok", "web_google"]), from: z.date().optional(), to: z.date().optional(), eventId: z.string(), @@ -22,7 +22,7 @@ export type EventMediaInputQueryType = z.infer< >; const eventMediaOutputQueryZodSchema = z.object({ - media_source: 
z.enum(["news_online", "news_print", "web_google"]), + media_source: z.enum(["news_online", "news_print", "social_tiktok", "web_google"]), start_date: z.string().optional(), end_date: z.string().optional(), event_id: z.string(),