-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
199 lines (167 loc) · 7.35 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import os
import json
import logging
from datetime import datetime
from slugify import slugify
import time
from utils.api import post_exists, upload_post_to_strapi
from utils.image import download_image, upload_image_to_strapi
# Set up logging: INFO and above is written to upload_log.txt.
logging.basicConfig(
    filename='upload_log.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Load the JSON data.
# The encoding is pinned to UTF-8 (the encoding JSON is specified to use)
# instead of relying on the platform default, which is not UTF-8 everywhere
# and would garble or reject non-ASCII text in the posts.
with open('parsed_wordpress_posts.json', 'r', encoding='utf-8') as f:
    posts = json.load(f)

# Categories mapping: a post's 'category' value -> the value sent as
# "contentType". Lookups use .get(), so a category missing from this table
# yields None rather than raising.
category_mapping = {
    "Entertainment": "afroculture",
    "Tech": "tech",
    "Lifestyle": "style",
    "News": "afroculture",
    "Art": "arts & design",
    "Film": "arts & design",
    "Loudr+": "style",
    "Music": "music",
    "LoudrQandA": "afroculture",
}
def format_content_for_blocks(content):
    """Wrap *content* in a single-paragraph block list.

    The result is a list with exactly one ``paragraph`` node whose only
    child is a ``text`` node carrying ``content`` unchanged.
    """
    text_node = {"type": "text", "text": content}
    paragraph = {"type": "paragraph", "children": [text_node]}
    return [paragraph]
def load_progress():
    """Return the set of post ids recorded in ``upload_progress.txt``.

    Every non-blank line of the file is one id, with surrounding whitespace
    removed. Ids are returned as strings, exactly as they were written.

    Returns:
        A set of id strings; empty when the progress file does not exist.
    """
    # Open directly and handle the missing file, rather than checking
    # os.path.exists() first: no window between check and open, and the same
    # style as load_failed_posts().
    try:
        with open('upload_progress.txt', 'r') as f:
            stripped_lines = (line.strip() for line in f)
            # Skip blank lines so a stray newline never becomes the id "".
            return {entry for entry in stripped_lines if entry}
    except FileNotFoundError:
        return set()
def save_progress(post_id):
    """Append *post_id* on its own line to ``upload_progress.txt``.

    The file is opened in append mode, so it is created on first use and
    earlier entries are preserved.
    """
    with open('upload_progress.txt', 'a') as progress_file:
        entry = "{}\n".format(post_id)
        progress_file.write(entry)
def clear_progress():
    """Delete ``upload_progress.txt`` if it exists.

    An INFO line is logged when a file was actually removed. A missing file
    is not an error: there is simply nothing to clear.
    """
    # Remove directly instead of testing os.path.exists() first, so the file
    # vanishing between the check and the removal cannot raise.
    try:
        os.remove('upload_progress.txt')
    except FileNotFoundError:
        return
    logging.info("Cleared upload progress cache.")
def load_failed_posts():
    """Return the failure records stored in ``failed_posts.json``.

    Returns:
        The decoded JSON content of the file. An empty list is returned when
        the file does not exist, or when it does not hold valid JSON (for
        example an empty or truncated file left by an interrupted write).
    """
    try:
        with open('failed_posts.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return []
    except json.JSONDecodeError as exc:
        # This function is called from error-handling paths; an unreadable
        # bookkeeping file must not abort the whole run.
        logging.warning("Ignoring unreadable failed_posts.json: %s", exc)
        return []
def save_failed_post(post):
    """Record *post* in ``failed_posts.json`` as a failed upload.

    The stored record holds the post's ``post_id`` and ``title`` plus the
    current local time in ISO 8601 form. An existing record with the same
    ``post_id`` is replaced, so a post that fails on several runs is listed
    once, with the time of its latest failure.

    Args:
        post: Mapping with at least the keys ``'post_id'`` and ``'title'``.
    """
    record = {
        'post_id': post['post_id'],
        'title': post['title'],
        'error_time': datetime.now().isoformat(),
    }
    # Drop any earlier record for this post before appending the fresh one.
    failed_posts = [
        entry for entry in load_failed_posts()
        if entry.get('post_id') != record['post_id']
    ]
    failed_posts.append(record)
    with open('failed_posts.json', 'w') as f:
        json.dump(failed_posts, f, indent=2)
def retry_failed_posts():
    """Re-attempt the uploads recorded in ``failed_posts.json``.

    Each record is matched by ``post_id`` against the loaded ``posts``.
    A record is dropped once its post has been uploaded, or when
    ``post_exists`` reports the post's slug as already present. Records whose
    retry raises, or that match no loaded post, are kept. The remaining
    records are written back to ``failed_posts.json`` when the function
    exits, including when the run is interrupted part-way.
    """
    failed_posts = load_failed_posts()
    try:
        for failed_post in failed_posts[:]:  # Copy: the list is edited below
            post = next((p for p in posts if p['post_id'] == failed_post['post_id']), None)
            if not post:
                continue
            try:
                # Process and upload the post (same steps as process_posts)
                slug = slugify(post['title'])
                if post_exists(slug):
                    logging.info(f"Post already exists: {post['title']}")
                    # Nothing left to retry: record it as done (as
                    # process_posts does) and drop it from the failed list so
                    # it is not re-checked on every run.
                    save_progress(post['post_id'])
                    failed_posts.remove(failed_post)
                    continue
                cover_photo_path = download_image(post['cover_photo_url'])
                try:
                    cover_photo_id = upload_image_to_strapi(cover_photo_path) if cover_photo_path else None
                    date_obj = datetime.strptime(post['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
                    strapi_post = {
                        "data": {
                            "title": post['title'],
                            "slug": slug,
                            "postContent": format_content_for_blocks(post['content']),
                            "description": post.get('excerpt', ""),
                            # NOTE(review): date_obj is a datetime instance;
                            # confirm upload_post_to_strapi serialises it
                            # (json.dumps rejects datetime by default).
                            "date": date_obj,
                            "tags": post['tags'],
                            "contentType": category_mapping.get(post['category']),
                            "headerImage": cover_photo_id,
                        }
                    }
                    upload_post_to_strapi(strapi_post)
                    save_progress(post['post_id'])
                    logging.info(f"Successfully uploaded previously failed post: {post['title']}")
                    # Remove from failed posts list
                    failed_posts.remove(failed_post)
                finally:
                    # Delete the downloaded image whether or not the upload
                    # succeeded, so failed retries do not leave files behind.
                    if cover_photo_path and os.path.exists(cover_photo_path):
                        os.remove(cover_photo_path)
            except Exception as e:
                logging.error(f"Error retrying post {post['title']}: {str(e)}")
    finally:
        # Save updated failed posts list, even on interruption, so records
        # already resolved are not retried again next time.
        with open('failed_posts.json', 'w') as f:
            json.dump(failed_posts, f, indent=2)
def process_posts(batch_size=10):
    """Upload every post not yet recorded as done, ``batch_size`` at a time.

    Posts whose id is in the progress file are skipped. A post whose slug
    ``post_exists`` reports as present is recorded as done without being
    uploaded. Any exception raised while handling a post is logged, the post
    is stored via ``save_failed_post``, and processing continues with the
    next post. The function sleeps 5 seconds between consecutive batches.

    Args:
        batch_size: Number of posts per batch; must be at least 1.

    Returns:
        True if no post raised an error during this run, False otherwise.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and report
        # success without looking at a single post.
        raise ValueError("batch_size must be at least 1")

    uploaded_posts = load_progress()
    all_posts_processed = True
    # Ceiling division: an exact multiple of batch_size must not count an
    # extra, empty batch.
    total_batches = (len(posts) + batch_size - 1) // batch_size

    for batch_number, i in enumerate(range(0, len(posts), batch_size), start=1):
        batch = posts[i:i+batch_size]
        for post in batch:
            # Progress ids are read back from a text file as strings, so
            # compare as text; otherwise integer ids would never match.
            if str(post['post_id']) in uploaded_posts:
                logging.info(f"Skipping already uploaded post: {post['title']}")
                continue
            try:
                # Create slug from title
                slug = slugify(post['title'])
                # Check if post already exists
                if post_exists(slug):
                    logging.info(f"Post already exists: {post['title']}")
                    save_progress(post['post_id'])
                    continue
                # Only download and process image if the post doesn't exist
                cover_photo_path = download_image(post['cover_photo_url'])
                try:
                    cover_photo_id = upload_image_to_strapi(cover_photo_path) if cover_photo_path else None
                    # Convert date string to DateTime object
                    date_obj = datetime.strptime(post['pubDate'], "%a, %d %b %Y %H:%M:%S %z")
                    # Prepare post data for Strapi
                    strapi_post = {
                        "data": {
                            "title": post['title'],
                            "slug": slug,
                            "postContent": format_content_for_blocks(post['content']),
                            "description": post.get('excerpt', ""),
                            # NOTE(review): date_obj is a datetime instance;
                            # confirm upload_post_to_strapi serialises it
                            # (json.dumps rejects datetime by default).
                            "date": date_obj,
                            "tags": post['tags'],
                            "contentType": category_mapping.get(post['category']),
                            "headerImage": cover_photo_id,
                        }
                    }
                    # Upload post to Strapi
                    upload_post_to_strapi(strapi_post)
                    save_progress(post['post_id'])
                    logging.info(f"Successfully uploaded post: {post['title']}")
                finally:
                    # Clean up downloaded image file, also when the upload
                    # failed, so errors do not leave files behind.
                    if cover_photo_path and os.path.exists(cover_photo_path):
                        os.remove(cover_photo_path)
                        logging.info(f"Cleaned up image file: {cover_photo_path}")
            except Exception as e:
                logging.error(f"Error processing post {post['title']}: {str(e)}")
                save_failed_post(post)
                all_posts_processed = False
        logging.info(f"Completed batch {batch_number} of {total_batches}")
        if batch_number < total_batches:
            time.sleep(5)  # Delay between batches to reduce load; none after the last
    return all_posts_processed
def main():
    """Run the upload, then log how the run ended.

    When every post was processed without error the progress file is
    cleared; otherwise the failed posts are retried once. A keyboard
    interrupt or any other exception is logged rather than propagated, and a
    closing line is always logged.
    """
    try:
        if process_posts():
            # Clear progress only if all posts were processed successfully
            clear_progress()
            logging.info("All posts processed successfully. Progress cache cleared.")
        else:
            logging.info("Some posts were not processed. Progress saved for resuming later.")
            # Attempt to retry failed posts
            retry_failed_posts()
    except KeyboardInterrupt:
        logging.info("Process interrupted by user. Progress saved for resuming later.")
    except Exception as e:
        logging.error(f"An error occurred during the upload process: {str(e)}")
        logging.info("Progress saved for resuming later.")
    finally:
        logging.info("Upload process finished.")


if __name__ == "__main__":
    main()