From 1ded0c6a12d6521af8dd64d39d4d26deb270267b Mon Sep 17 00:00:00 2001
From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com>
Date: Sun, 20 Oct 2024 23:35:08 +0200
Subject: [PATCH 1/5] feat(data_loaders): add TikTok

---
 .env.example                                  |  20 +-
 .../data_loaders/social_media/tiktok.py       | 171 ++++++++++++++++++
 .../data_loaders/social_media/tiktok_test.py  | 111 ++++++++++++
 .../media_impact_monitor/util/env.py          |   2 +
 4 files changed, 299 insertions(+), 5 deletions(-)
 create mode 100644 backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py
 create mode 100644 backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py

diff --git a/.env.example b/.env.example
index 3676f42..367ede4 100644
--- a/.env.example
+++ b/.env.example
@@ -1,13 +1,23 @@
+# protest data
 ACLED_EMAIL=
 ACLED_KEY=
+# media data
 MEDIACLOUD_API_TOKEN=
 ZENROWS_API_KEY=
+# google trends data
+DATAFORSEO_EMAIL=
+DATAFORSEO_PASSWORD=
+# tiktok data
+RAPIDAPI_KEY=
+# bundestag data
+BUNDESTAG_API_KEY=
+# ai
 AZURE_API_BASE=
 AZURE_API_VERSION=
 AZURE_API_KEY=
-BUNDESTAG_API_KEY=
-DATAFORSEO_EMAIL=
-DATAFORSEO_PASSWORD=
-PORT=
+# logging
 SENTRY_DSN=
-AI_TREND_RESOLUTION=0.01 # fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+# port where the server should run
+PORT=
+# fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+AI_TREND_RESOLUTION=0.01
diff --git a/backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py
new file mode 100644
index 0000000..9935a18
--- /dev/null
+++ b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py
@@ -0,0 +1,171 @@
+import re
+from collections import Counter
+from datetime import datetime
+from typing import Any
+
+import pandas as pd
+from tqdm.auto import tqdm
+
+from media_impact_monitor.util.cache import get
+from media_impact_monitor.util.env import RAPIDAPI_KEY
+
+headers = {
+    "x-rapidapi-key": RAPIDAPI_KEY,
+    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
+}
+
+
+def get_videos_for_keywords(
+    keywords: str, n: int, cursor: int = 0
+) -> list[dict[str, Any]]:
+    """
+    Get videos for keywords; pages recursively until `hasMore` is false or cursor >= n.
+    Problem: This returns max ~150 videos, even for very popular keywords.
+    Use hashtag query to get more videos.
+    """
+    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
+    query = {
+        "keywords": keywords,
+        "region": "us",  # location of the proxy server
+        "count": 30,  # max: 30
+        "cursor": cursor,
+        "publish_time": "0",  # 0 = all, 1 = past 24 hours, 7 = this week, 30 = this month, 90 = last 3 months, 180 = last 6 months
+        "sort_type": "0",  # 0 = relevance, 1 = like count, 3 = date posted
+    }
+    response = get(url, headers=headers, params=query)
+    # The cursor returned with this page is passed on to the request for the next page.
+    data = response.json()["data"]
+    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
+    if has_more and cursor < n:
+        videos.extend(get_videos_for_keywords(keywords=keywords, n=n, cursor=cursor))
+    return videos
+
+
+def get_hashtag_suggestions(keywords: str) -> Counter:
+    """Count hashtags found in the titles of videos returned for `keywords`."""
+    videos = get_videos_for_keywords(keywords, n=100)
+    # Every `#word` occurrence counts, including repeats within a single title.
+    tags_per_title = (re.findall(r"#(\w+)", video["title"]) for video in videos)
+    tag_counts = Counter(tag for tags in tags_per_title for tag in tags)
+    return tag_counts
+
+
+def get_hashtag_id(hashtag: str) -> str:
+    """Look up the challenge id the API reports for a hashtag name."""
+    endpoint = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
+    params = {"challenge_name": hashtag}
+    payload = get(endpoint, headers=headers, params=params).json()
+    # This id is what the `/challenge/posts` request takes as `challenge_id`.
+    return payload["data"]["id"]
+
+
+def get_videos_for_hashtag_id(
+    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
+) -> list[dict[str, Any]]:
+    """
+    Fetch the videos of a challenge (hashtag) id, one page per recursive call.
+    Paging continues while the API reports `hasMore` and the cursor is below `n`.
+    With `verbose`, the cursor is printed before each follow-up request.
+    """
+    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
+    query = {"challenge_id": hashtag_id, "count": 20, "cursor": cursor}  # max: 20
+    data = get(url, headers=headers, params=query).json()["data"]
+    videos, next_cursor = data["videos"], data["cursor"]
+    if not (data["hasMore"] and next_cursor < n):
+        return videos
+    if verbose:
+        print(next_cursor)
+    more = get_videos_for_hashtag_id(
+        hashtag_id=hashtag_id, n=n, cursor=next_cursor, verbose=verbose
+    )
+    videos.extend(more)
+    return videos
+
+
+def get_videos_for_hashtag(
+    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
+) -> list[dict[str, Any]]:
+    """Resolve `hashtag` to its challenge id and fetch that id's videos."""
+    return get_videos_for_hashtag_id(get_hashtag_id(hashtag), n=n, cursor=cursor, verbose=verbose)
+
+
+def get_video_history_for_hashtag(
+    hashtag: str, n: int, verbose: bool = True
+) -> pd.DataFrame:
+    """
+    Get video history for a hashtag.
+    Returns a daily time series of views and posts (empty if no videos are found).
+    Views are computed by summing the views of all videos that were posted in a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`).
+    """
+    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
+    if not videos:
+        # Without rows the index bounds below are NaT and `pd.date_range` raises.
+        return pd.DataFrame(columns=["views", "posts"], index=pd.DatetimeIndex([]))
+    df = pd.DataFrame(
+        {
+            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
+            "id": [video["video_id"] for video in videos],
+            "title": [video["title"] for video in videos],
+            "views": [video["play_count"] for video in videos],
+        }
+    )
+    df["date"] = pd.to_datetime(df["date"])
+    df = df.sort_values("date")
+    # NOTE: `fromtimestamp` yields naive local time, so day buckets follow the server timezone.
+    ts = (
+        df.resample("1D", on="date")
+        .agg({"views": "sum", "id": "count"})
+        .rename(columns={"id": "posts"})
+    )
+    # Make sure every calendar day between the first and the last post has a row.
+    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
+    return ts
+
+
+def get_comments_for_video(
+    video_id: str, n: int, cursor: int = 0
+) -> list[dict[str, Any]]:
+    """
+    Fetch comments of a video, one page per recursive call.
+    Paging continues while the API reports `hasMore` and the cursor is below `n`.
+    """
+    endpoint = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
+    # The video id is sent in the `url` parameter; a page size of 50 is the assumed max (?).
+    params = {"url": video_id, "count": 50, "cursor": cursor}
+    data = get(endpoint, headers=headers, params=params).json()["data"]
+    comments, next_cursor, has_more = data["comments"], data["cursor"], data["hasMore"]
+    if has_more and next_cursor < n:
+        comments += get_comments_for_video(video_id, n=n, cursor=next_cursor)
+    return comments
+
+
+def get_comment_history_for_hashtag(
+    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
+) -> pd.DataFrame:
+    """
+    Get comment history for a hashtag.
+    Returns a weekly time series with the number of comments written per week on the
+    hashtag's videos (empty if no comments are found).
+    """
+    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
+    comments = [
+        comment
+        for video in tqdm(videos)
+        if video["comment_count"] > 0
+        for comment in get_comments_for_video(video["video_id"], n=n_comments)
+    ]
+    if not comments:
+        # Without rows the index bounds below are NaT and `pd.date_range` raises.
+        return pd.DataFrame(columns=["comments"], index=pd.DatetimeIndex([]))
+    comments_df = pd.DataFrame(
+        {
+            "date": [datetime.fromtimestamp(c["create_time"]) for c in comments],
+            "text": [c["text"] for c in comments],
+            "video_id": [c["video_id"] for c in comments],
+        }
+    )
+    ts = comments_df.resample("1W", on="date").agg({"text": "count"})
+    ts = ts.rename(columns={"text": "comments"})
+    # Weekly range: the default daily frequency would add six zero rows per week.
+    weeks = pd.date_range(start=ts.index.min(), end=ts.index.max(), freq="W")
+    return ts.reindex(weeks).fillna(0)
diff --git a/backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py
new file mode 100644
index 0000000..25ac282
--- /dev/null
+++ b/backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py
@@ -0,0 +1,111 @@
+import pytest
+import pandas as pd
+from datetime import datetime, timedelta
+from collections import Counter
+from media_impact_monitor.data_loaders.social_media.tiktok import (
+    get_videos_for_keywords,
+    get_hashtag_suggestions,
+    get_hashtag_id,
+    get_videos_for_hashtag_id,
+    get_videos_for_hashtag,
+    get_video_history_for_hashtag,
+    get_comments_for_video,
+    get_comment_history_for_hashtag,
+)
+
+
+@pytest.mark.slow
+def test_get_videos_for_keywords():
+    videos = get_videos_for_keywords("climate change", n=50)
+    assert len(videos) > 0
+    assert isinstance(videos[0], dict)
+    assert "title" in videos[0]
+    assert "video_id" in videos[0]
+
+
+@pytest.mark.slow
+def test_get_hashtag_suggestions():
+    suggestions = get_hashtag_suggestions("climate change")
+    assert len(suggestions) > 0
+    assert isinstance(suggestions, Counter)
+
+
+@pytest.mark.slow
+def test_get_hashtag_id():
+    hashtag_id = get_hashtag_id("climatechange")
+    assert isinstance(hashtag_id, str)
+    assert len(hashtag_id) > 0
+
+
+@pytest.mark.slow
+def test_get_videos_for_hashtag_id():
+    hashtag_id = get_hashtag_id("climatechange")
+    videos = get_videos_for_hashtag_id(hashtag_id, n=50)
+    assert len(videos) > 0
+    assert isinstance(videos[0], dict)
+    assert "title" in videos[0]
+    assert "video_id" in videos[0]
+
+
+@pytest.mark.slow
+def test_get_videos_for_hashtag():
+    videos = get_videos_for_hashtag("climatechange", n=50)
+    assert len(videos) > 0
+    assert isinstance(videos[0], dict)
+    assert "title" in videos[0]
+    assert "video_id" in videos[0]
+
+
+@pytest.mark.slow
+def test_get_video_history_for_hashtag():
+    history = get_video_history_for_hashtag("climatechange", n=100)
+    assert isinstance(history, pd.DataFrame)
+    assert len(history) > 0
+    assert "views" in history.columns
+    assert "posts" in history.columns
+
+
+@pytest.mark.slow
+def test_get_comments_for_video():
+    videos = get_videos_for_hashtag("climatechange", n=1)
+    # The first video may have no comments at all, so pick one that has some.
+    video = next(v for v in videos if v["comment_count"] > 0)
+    comments = get_comments_for_video(video["video_id"], n=50)
+    assert len(comments) > 0 and isinstance(comments[0], dict)
+    assert "text" in comments[0]
+
+
+@pytest.mark.slow
+def test_get_comment_history_for_hashtag():
+    history = get_comment_history_for_hashtag(
+        "climatechange", n_posts=10, n_comments=10
+    )
+    assert isinstance(history, pd.DataFrame)
+    assert len(history) > 0
+    assert "comments" in history.columns
+
+
+@pytest.mark.slow
+def test_data_freshness():
+    videos = get_videos_for_hashtag("climatechange", n=50)
+    latest_video_date = max(
+        datetime.fromtimestamp(video["create_time"]) for video in videos
+    )
+    assert latest_video_date >= datetime.now() - timedelta(
+        days=7
+    ), "No recent videos found"
+
+
+@pytest.mark.slow
+def test_video_content():
+    videos = get_videos_for_keywords("climate change", n=50)
+    climate_related_words = [
+        "climate",
+        "environment",
+        "global warming",
+        "sustainability",
+    ]
+    assert any(
+        any(word in video["title"].lower() for word in climate_related_words)
+        for video in videos
+    ), "No climate-related content found in video titles"
diff --git a/backend-python/media_impact_monitor/util/env.py b/backend-python/media_impact_monitor/util/env.py
index 3363b48..588c216 100644
--- a/backend-python/media_impact_monitor/util/env.py
+++ b/backend-python/media_impact_monitor/util/env.py
@@ -18,6 +18,7 @@
 DATAFORSEO_PASSWORD = environ["DATAFORSEO_PASSWORD"]
 BUNDESTAG_API_KEY = environ["BUNDESTAG_API_KEY"]
 SENTRY_DSN = environ["SENTRY_DSN"]
+RAPIDAPI_KEY = environ["RAPIDAPI_KEY"]
 AI_TREND_RESOLUTION = float(environ.get("AI_TREND_RESOLUTION", 0.01))
 
 assert ACLED_EMAIL
@@ -31,3 +32,4 @@
 assert DATAFORSEO_PASSWORD
 assert BUNDESTAG_API_KEY
 assert SENTRY_DSN
+assert RAPIDAPI_KEY

From 06af85447ea1549ff4ecf2d9e8e6022819877e1b Mon Sep 17 00:00:00 2001
From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com>
Date: Mon, 21 Oct 2024 00:15:06 +0200
Subject: [PATCH 2/5] feat: integrate TikTok into API and frontend

---
 .../trends/keyword_trend.py                   | 14 +++++++++-
 backend-python/media_impact_monitor/types_.py |  2 +-
 .../src/components/DataSourceSelect.tsx       |  8 ++++++
 frontend-nextjs/src/stores/filtersStore.ts    |  4 +--
 frontend-nextjs/src/utility/textUtil.tsx      | 26 ++++++++++++++++---
 5 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/backend-python/media_impact_monitor/trends/keyword_trend.py b/backend-python/media_impact_monitor/trends/keyword_trend.py
index 227defa..acdae99 100644
--- a/backend-python/media_impact_monitor/trends/keyword_trend.py
+++ b/backend-python/media_impact_monitor/trends/keyword_trend.py
@@ -5,6 +5,9 @@
     get_mediacloud_counts,
 )
 from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts
+from media_impact_monitor.data_loaders.social_media.tiktok import (
+    get_video_history_for_hashtag,
+)
 from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts
 from media_impact_monitor.types_ import TrendSearch
 from media_impact_monitor.util.paths import src
@@ -31,6 +34,8 @@ def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
                 )
             case "web_google":
                 ds = get_google_trends_counts(query=query, end_date=q.end_date)
+            case "social_tiktok":
+                ds = get_video_history_for_hashtag(query, n=1000, verbose=True)["posts"]
             case _:
                 raise ValueError(f"Unsupported media source: {q.media_source}")
         dss[topic] = ds
@@ -65,7 +70,14 @@ def topic_queries(media_source: str) -> dict[str, str]:
         #     media_source,
         # ),
     }
-    if media_source != "web_google":
+    if media_source == "social_tiktok":
+        keyword_queries = {
+            "climate activism": "climateprotest",  # TODO: improve
+            "climate policy": "climateaction",  # TODO: improve
+            "climate science": "climatechange",  # TODO: improve
+            "climate crisis framing": "climatecrisis",  # TODO: improve
+        }
+    elif media_source != "web_google":
         keyword_queries["climate activism"] = xs_with_ys(
             keywords["climate_science"]
             + keywords["climate_policy"]
diff --git a/backend-python/media_impact_monitor/types_.py b/backend-python/media_impact_monitor/types_.py
index 353d762..39f65d8 100644
--- a/backend-python/media_impact_monitor/types_.py
+++ b/backend-python/media_impact_monitor/types_.py
@@ -9,7 +9,7 @@
 # FIXME: consider renaming "Topic" to "Issue" to avoid confusion with topics within the issue (like science or policy)
 Topic = Literal["climate_change"]
 Query = str  # for now, just a single keyword
-MediaSource = Literal["news_online", "news_print", "web_google"]
+MediaSource = Literal["news_online", "news_print", "web_google", "social_tiktok"]
 
 StartDateField = Field(
     default=date(2020, 1, 1),
diff --git a/frontend-nextjs/src/components/DataSourceSelect.tsx b/frontend-nextjs/src/components/DataSourceSelect.tsx
index 0fced52..509700c 100644
--- a/frontend-nextjs/src/components/DataSourceSelect.tsx
+++ b/frontend-nextjs/src/components/DataSourceSelect.tsx
@@ -23,6 +23,7 @@ import {
 	type LucideIcon,
 	NewspaperIcon,
 	SearchIcon,
+	Music2Icon
 } from 'lucide-react'
 import { useMemo, useState } from 'react'
 
@@ -52,6 +53,13 @@ const options: OptionType[] = [
 		description: texts.filters.mediaSource.values.printNews.description,
 		links: texts.filters.mediaSource.values.printNews.links,
 	},
+	{
+		name: texts.filters.mediaSource.values.tiktok.name,
+		value: 'social_tiktok',
+		Icon: Music2Icon,
+		description: texts.filters.mediaSource.values.tiktok.description,
+		links: texts.filters.mediaSource.values.tiktok.links,
+	},
 	{
 		name: texts.filters.mediaSource.values.googleTrends.name,
 		value: 'web_google',
diff --git a/frontend-nextjs/src/stores/filtersStore.ts b/frontend-nextjs/src/stores/filtersStore.ts
index fc096f6..60f1101 100644
--- a/frontend-nextjs/src/stores/filtersStore.ts
+++ b/frontend-nextjs/src/stores/filtersStore.ts
@@ -20,7 +20,7 @@ import {
 } from "zustand/middleware";
 
 
-export type MediaSourceType = "news_online" | "news_print" | "web_google";
+export type MediaSourceType = "news_online" | "news_print" | "social_tiktok" | "web_google";
 
 export type FiltersState = {
 	from: Date;
@@ -80,7 +80,7 @@ const getFiltersZodSchema = (today: Date) => {
 			.boolean()
 			.default(defaultInitState.isDefaultTimeRange),
 		organizers: z.array(z.string()).default(defaultInitState.organizers),
-		mediaSource: z.enum(["news_online", "news_print", "web_google"]),
+		mediaSource: z.enum(["news_online", "news_print", "social_tiktok", "web_google"]),
 	})
 	.default(defaultInitState);
 }
diff --git a/frontend-nextjs/src/utility/textUtil.tsx b/frontend-nextjs/src/utility/textUtil.tsx
index c6cbacb..cb1881b 100644
--- a/frontend-nextjs/src/utility/textUtil.tsx
+++ b/frontend-nextjs/src/utility/textUtil.tsx
@@ -141,7 +141,7 @@ const textsEnGB = {
 			values: {
 				onlineNews: {
 					name: 'Online News',
-					description: 'Articles from online news pages.',
+					description: 'Articles in online news pages.',
 					links: [
 						{
 							label: 'Official Website',
@@ -151,7 +151,7 @@ const textsEnGB = {
 				},
 				printNews: {
 					name: 'Print News',
-					description: 'Articles from print newspapers.',
+					description: 'Articles in print newspapers.',
 					links: [
 						{
 							label: 'Official Website',
@@ -163,9 +163,19 @@ const textsEnGB = {
 						},
 					],
 				},
+				tiktok: {
+					name: 'TikTok',
+					description: 'Video posts on TikTok.',
+					links: [
+						{
+							label: 'Official Website',
+							href: 'https://www.tiktok.com/',
+						},
+					],
+				},
 				googleTrends: {
 					name: 'Google Trends',
-					description: 'Search trends from Google.',
+					description: 'Search trends on Google.',
 					links: [
 						{
 							label: 'Official Website',
@@ -776,6 +786,16 @@ const textsXXX = {
 						},
 					],
 				},
+				tiktok: {
+					name: 'XXX',
+					description: 'XXX',
+					links: [
+						{
+							label: 'XXX',
+							href: 'XXX',
+						},
+					],
+				},
 				googleTrends: {
 					name: 'XXX',
 					description: 'XXX',

From d95ad4185eb5b7e09bd3389332fe7af5603496bd Mon Sep 17 00:00:00 2001
From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com>
Date: Mon, 21 Oct 2024 00:18:42 +0200
Subject: [PATCH 3/5] fix(tiktok): add RAPIDAPI key env var to CI

---
 .github/workflows/deploy.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 42a61c9..971a217 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -66,3 +66,5 @@ jobs:
         DATAFORSEO_PASSWORD: ${{ secrets.DATAFORSEO_PASSWORD }}
         BUNDESTAG_API_KEY: ${{ secrets.BUNDESTAG_API_KEY }}
         SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
+        RAPIDAPI_KEY: ${{ secrets.RAPIDAPI_KEY }}
+

From 27928f909ae120dc4216a6c6d5df0a13e6e1f0c4 Mon Sep 17 00:00:00 2001
From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com>
Date: Mon, 21 Oct 2024 00:22:23 +0200
Subject: [PATCH 4/5] fix(cron.py): add TikTok to cronjob

---
 backend-python/media_impact_monitor/cron.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend-python/media_impact_monitor/cron.py b/backend-python/media_impact_monitor/cron.py
index a1786c0..f69e4a2 100644
--- a/backend-python/media_impact_monitor/cron.py
+++ b/backend-python/media_impact_monitor/cron.py
@@ -51,7 +51,7 @@ def fill_cache():
             )
         except Exception as e:
             errors.append(f"events {data_source}: {e}")
-    for media_source in ["news_online", "news_print", "web_google"]:
+    for media_source in ["news_online", "news_print", "social_tiktok", "web_google"]:
         for trend_type in ["keywords", "sentiment"]:
             for aggregation in ["daily", "weekly"]:
                 if aggregation == "daily" and media_source == "web_google":

From 53f2000720cc41b7a178f96f32e6bb56b49baa42 Mon Sep 17 00:00:00 2001
From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com>
Date: Mon, 21 Oct 2024 00:49:50 +0200
Subject: [PATCH 5/5] fix(frontend): modify media source type in more places

---
 frontend-nextjs/src/utility/eventMediaUtil.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/frontend-nextjs/src/utility/eventMediaUtil.ts b/frontend-nextjs/src/utility/eventMediaUtil.ts
index 017ea9e..1cde023 100644
--- a/frontend-nextjs/src/utility/eventMediaUtil.ts
+++ b/frontend-nextjs/src/utility/eventMediaUtil.ts
@@ -9,7 +9,7 @@ import { today } from "./today";
 import { formatZodError } from "./zodUtil";
 
 export const eventMediaInputQueryZodSchema = z.object({
-	mediaSource: z.enum(["news_online", "news_print", "web_google"]),
+	mediaSource: z.enum(["news_online", "news_print", "social_tiktok", "web_google"]),
 	from: z.date().optional(),
 	to: z.date().optional(),
 	eventId: z.string(),
@@ -22,7 +22,7 @@ export type EventMediaInputQueryType = z.infer<
 >;
 
 const eventMediaOutputQueryZodSchema = z.object({
-	media_source: z.enum(["news_online", "news_print", "web_google"]),
+	media_source: z.enum(["news_online", "news_print", "social_tiktok", "web_google"]),
 	start_date: z.string().optional(),
 	end_date: z.string().optional(),
 	event_id: z.string(),