diff --git a/src/dataimport/.dockerignore b/src/dataimport/.dockerignore index b8d18da..9b8ce24 100644 --- a/src/dataimport/.dockerignore +++ b/src/dataimport/.dockerignore @@ -3,3 +3,5 @@ geckodriver.exe .env .env.* pietsmietfullvideoimport.py +*.sql +*.zip diff --git a/src/dataimport/.gitignore b/src/dataimport/.gitignore index 1b1d690..6c31534 100644 --- a/src/dataimport/.gitignore +++ b/src/dataimport/.gitignore @@ -2,6 +2,8 @@ geckodriver.exe *.json openai_test.py .env -videos.sql +*.sql __pycache__/ threads.py +test.py +*.zip diff --git a/src/dataimport/hello-cron b/src/dataimport/hello-cron index 52268d7..5228a29 100644 --- a/src/dataimport/hello-cron +++ b/src/dataimport/hello-cron @@ -3,7 +3,8 @@ 35 * * * * . /root/project_env.sh; /usr/local/bin/python /app/pietsmietdevideoimporter.py >> /var/log/cron.log 2>&1 */15 * * * * . /root/project_env.sh; /usr/local/bin/python /app/pietsmietdevideoimporter.py >> /var/log/cron.log 2>&1 5 * * * * . /root/project_env.sh; /usr/local/bin/python /app/youtube.py >> /var/log/cron.log 2>&1 -5 * * * * . /root/project_env.sh; /usr/local/bin/python /app/instagram.py >> /var/log/cron.log 2>&1 +45 21 * * * . /root/project_env.sh; /usr/local/bin/python /app/instagram.py >> /var/log/cron.log 2>&1 +45 10 * * * . /root/project_env.sh; /usr/local/bin/python /app/instagram.py >> /var/log/cron.log 2>&1 */15 * * * * . /root/project_env.sh; /usr/local/bin/python /app/reddit.py >> /var/log/cron.log 2>&1 */1 * * * * . /root/project_env.sh; /usr/local/bin/python /app/twitch.py >> /var/log/cron.log 2>&1 10 1 * * * . /root/project_env.sh; /usr/local/bin/python /app/informationopenaianalyze.py >> /var/log/cron.log 2>&1 diff --git a/src/dataimport/instagram.py b/src/dataimport/instagram.py index 15d2efe..f124f67 100644 --- a/src/dataimport/instagram.py +++ b/src/dataimport/instagram.py @@ -12,6 +12,11 @@ console = Console() +# create cdn directory if not exists +if not os.path.exists("/app/cdn/instagram"): + console.log("Creating /app/cdn/instagram directory...", style="bold green") + os.makedirs("/app/cdn/instagram") + USERNAME = os.getenv("INSTAGRAM_USERNAME") PASSWORD = os.getenv("INSTAGRAM_PASSWORD") KEY_2FA = os.getenv("INSTAGRAM_2FA_SECRET") @@ -123,8 +128,8 @@ async def instagram(): db = Database(url=os.getenv("DATABASE_URL")) await db.connect() for user, user_id in user_dict.items(): - console.log(f"Fetching last 50 media items for {user}") - last_media = cl.user_medias(user_id, amount=50) + console.log(f"Fetching last 3 media items for {user}") + last_media = cl.user_medias(user_id, amount=3) console.log(f"Found {len(last_media)} media items for {user}") for media in last_media: remote_id = f"{user}_{str(media.id)}" @@ -156,10 +161,10 @@ async def instagram(): console.log(f"Downloading thumbnail for {remote_id}") try: thumbnail = requests.get(thumbnail_url).content - filename = f"instagram_{uuid4()}.jpg" - with open(f"/app/cdn/{filename}", "wb") as f: + filename = f"{uuid4()}.jpg" + with open(f"/app/cdn/instagram/{filename}", "wb") as f: f.write(thumbnail) - thumbnail_url = f"/cdn/{filename}" + thumbnail_url = f"/cdn/instagram/{filename}" except Exception as e: console.log(f"Error downloading thumbnail: {e}", style="bold red") continue @@ -182,10 +187,10 @@ async def instagram(): console.log(f"Downloading thumbnail for resource {resource.pk}") try: thumbnail = requests.get(thumbnail_url).content - filename = f"instagramr_{uuid4()}.jpg" - with open(f"/app/cdn/{filename}", "wb") as f: + filename = f"r_{uuid4()}.jpg" + with open(f"/app/cdn/instagram/{filename}", "wb") as f: f.write(thumbnail) - thumbnail_url = f"/cdn/{filename}" + thumbnail_url = f"/cdn/instagram/{filename}" except Exception as e: console.log( f"Error downloading thumbnail: {e}", style="bold red" diff --git a/src/dataimport/pietsmietdefullthumbnailimport.py b/src/dataimport/pietsmietdefullthumbnailimport.py new file mode 100644 index 0000000..2038346 --- /dev/null +++ b/src/dataimport/pietsmietdefullthumbnailimport.py @@ -0,0 +1,58 @@ +# This file is not actively used in psaggregator. +# It is a script that was used to import all thumbnails from PietSmiet once. +# Other imports only import recent thumbnails. +# This script generates a sql file that can be used to sync all thumbnails in combination with the pietsmietfullvideoimport.py script. + +import os +import asyncio +from uuid import uuid4 +import requests + +from rich.console import Console +from databases import Database + + +console = Console() + + +async def stuff() -> asyncio.coroutine: + console.log("Connecting to database...", style="bold green") + db = Database(url=os.getenv("DATABASE_URL")) + await db.connect() + + handled = dict() + + query = "SELECT * FROM ContentPiece WHERE importedFrom='PietSmietDE' AND type='PSVideo' AND remoteId IS NOT NULL AND imageUri IS NOT NULL" + console.log("Fetching all videos...", style="bold green") + videos = await db.fetch_all(query=query) + + for index, video in enumerate(videos): + if video.remoteId in handled: + continue + handled[video.remoteId] = uuid4() + + console.log( + f"Fetching thumbnail for {video.remoteId} ({index})...", style="bold green" + ) + + thumbnail = requests.get(video.imageUri).content + filename = f"/app/cdn/psde/{handled[video.remoteId]}.jpg" + with open(filename, "wb") as f: + f.write(thumbnail) + + console.log("Write mapping to file...", style="bold green") + + update_statements = list() + + for handledId, uuid in handled.items(): + update_statements.append( + f"UPDATE ContentPiece SET imageUri='/cdn/psde/{uuid}.jpg' WHERE remoteId='{handledId}'" + ) + + with open("psde.sql", "w", encoding="utf-8") as f: + f.writelines(update_statements) + + console.log("Done!", style="bold green") + + +asyncio.run(stuff()) diff --git a/src/dataimport/pietsmietdevideoimporter.py b/src/dataimport/pietsmietdevideoimporter.py index b3e555b..bb8da2d 100644 --- a/src/dataimport/pietsmietdevideoimporter.py +++ b/src/dataimport/pietsmietdevideoimporter.py @@ -1,3 +1,4 @@ +import requests import os import json import asyncio @@ -12,6 +13,11 @@ console = Console() +# create cdn directory if not exists +if not os.path.exists("/app/cdn/psde"): + console.log("Creating /app/cdn/psde directory...", style="bold green") + os.makedirs("/app/cdn/psde") + async def stuff() -> asyncio.coroutine: console.log("Starting...", style="bold green") @@ -72,7 +78,7 @@ async def stuff() -> asyncio.coroutine: uri = f"'{video['short_url']}'" if video.get("thumbnail"): try: - imageUri = f"'{video['thumbnail']['variations'][0]['url']}'" + imageUri = f"{video['thumbnail']['variations'][0]['url']}" except KeyError: pass except IndexError: @@ -99,7 +105,7 @@ async def stuff() -> asyncio.coroutine: INSERT INTO ContentPiece (id , remoteId, title, description, additionalInfo, startDate, imageUri, href, duration, importedAt, importedFrom , type) VALUES ('{}', '{}' , '{}' , NULL , NULL , {} , {} , {} , {} , now() , 'PietSmietDE', 'PSVideo');""" UPDATE_STATEMENT = """ - UPDATE ContentPiece SET href={}, imageUri={}, title='{}', duration={} WHERE id='{}';""" + UPDATE ContentPiece SET href={}, title='{}', duration={} WHERE id='{}';""" console.log("Checking for existing entries...", style="bold green") for content in data: @@ -115,7 +121,6 @@ async def stuff() -> asyncio.coroutine: query = UPDATE_STATEMENT.format( content["uri"], - content["imageUri"], content["title"], content["duration"], result[0]["id"], @@ -128,6 +133,17 @@ async def stuff() -> asyncio.coroutine: style="bold yellow", ) + if content["imageUri"] != "NULL": + try: + thumbnail = requests.get(content["imageUri"]).content + filename = f"{uuid4()}.jpg" + with open(f"/app/cdn/psde/{filename}", "wb") as f: + f.write(thumbnail) + content["imageUri"] = f"'/cdn/psde/{filename}'" + except Exception as e: + console.log(f"Error downloading thumbnail: {e}", style="bold red") + content["imageUri"] = f"'{content['imageUri']}'" + query = INSERT_STATEMENT.format( uuid4(), content["remoteId"], diff --git a/src/dataimport/reddit.py b/src/dataimport/reddit.py index 3b59519..4634de8 100644 --- a/src/dataimport/reddit.py +++ b/src/dataimport/reddit.py @@ -1,6 +1,8 @@ +import requests import os import asyncio from datetime import datetime +from uuid import uuid4 from rich.console import Console from databases import Database @@ -10,6 +12,11 @@ console = Console() +# create cdn directory if not exists +if not os.path.exists("/app/cdn/reddit"): + console.log("Creating /app/cdn/reddit directory...", style="bold green") + os.makedirs("/app/cdn/reddit") + async def stuff() -> asyncio.coroutine: client_id = os.getenv("REDDIT_CLIENT_ID") @@ -52,6 +59,13 @@ async def stuff() -> asyncio.coroutine: delete_query = "DELETE FROM RedditPost WHERE 1=1;" await db.execute(delete_query) + console.log("Deleting old thumbnails...", style="bold green") + try: + for file in os.listdir("/app/cdn/reddit"): + os.remove(f"/app/cdn/reddit/{file}") + except Exception as e: + console.log(f"Error deleting old thumbnails: {e}", style="bold red") + INSERT_STATEMENT = """INSERT INTO RedditPost (id , title, description, username, upvotes, comments, sticky, publishedAt, imageUri, href, importedAt) VALUES ('{}', '{}' , NULL , '{}' , {} , {} , {} , '{}' , {} , '{}', now());""" @@ -63,7 +77,15 @@ async def stuff() -> asyncio.coroutine: thumbnail = "NULL" if submission.thumbnail.startswith("http"): - thumbnail = f"'{submission.thumbnail}'" + try: + thubmnail_content = requests.get(submission.thumbnail).content + thumbnail = f"{uuid4()}.jpg" + with open(f"/app/cdn/reddit/{thumbnail}", "wb") as f: + f.write(thubmnail_content) + thumbnail = f"'/cdn/reddit/{thumbnail}'" + except Exception as e: + console.log(f"Error downloading thumbnail: {e}") + thumbnail = f"'{submission.thumbnail}'" query = INSERT_STATEMENT.format( submission.id, diff --git a/src/dataimport/youtube.py b/src/dataimport/youtube.py index 8e8e43e..1951220 100644 --- a/src/dataimport/youtube.py +++ b/src/dataimport/youtube.py @@ -11,6 +11,11 @@ console = Console() +# create cdn directory if not exists +if not os.path.exists("/app/cdn/yt"): + console.log("Creating /app/cdn/yt directory...", style="bold green") + os.makedirs("/app/cdn/yt") + server_base_url = ( os.getenv("YT_SERVER_BASE_URL") if os.getenv("YT_SERVER_BASE_URL") @@ -79,14 +84,14 @@ async def youtube(): pass if thumbnailUri != "NULL": - # download thumbnail and store it in /app/cdn/ + # download thumbnail and store it in /app/cdn/yt/ console.log(f"Downloading thumbnail for {yt['id']}") try: thumbnail = requests.get(thumbnailUri).content - filename = f"youtube_{uuid4()}.jpg" - with open(f"/app/cdn/{filename}", "wb") as f: + filename = f"{uuid4()}.jpg" + with open(f"/app/cdn/yt/{filename}", "wb") as f: f.write(thumbnail) - thumbnailUri = f"'/cdn/{filename}'" + thumbnailUri = f"'/cdn/yt/{filename}'" except Exception as e: console.log(f"Error downloading thumbnail: {e}", style="bold red") thumbnailUri = "NULL" diff --git a/src/nginx/nginx.conf b/src/nginx/nginx.conf index 03308e4..3551f29 100644 --- a/src/nginx/nginx.conf +++ b/src/nginx/nginx.conf @@ -30,6 +30,8 @@ http { location ^~ /cdn/ { alias /app/cdn/; try_files $uri =404; + + expires 1M; } location ^~ /api { diff --git a/src/psaggregator/package-lock.json b/src/psaggregator/package-lock.json index 5d1a4e1..0ff627c 100644 --- a/src/psaggregator/package-lock.json +++ b/src/psaggregator/package-lock.json @@ -1,12 +1,12 @@ { "name": "psaggregator", - "version": "1.4.1", + "version": "1.5.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "psaggregator", - "version": "1.4.1", + "version": "1.5.0", "devDependencies": { "@fontsource/fira-mono": "^4.5.10", "@neoconfetti/svelte": "^1.0.0", diff --git a/src/psaggregator/package.json b/src/psaggregator/package.json index 53edce0..a3fa484 100644 --- a/src/psaggregator/package.json +++ b/src/psaggregator/package.json @@ -1,6 +1,6 @@ { "name": "psaggregator", - "version": "1.4.1", + "version": "1.5.0", "scripts": { "dev": "vite dev", "build": "vite build",