-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmlParseScript.py
118 lines (97 loc) · 5.48 KB
/
xmlParseScript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import xml.etree.ElementTree as ET
import json
import re
# Matches the first <img ...> tag in post HTML and captures its src value.
_IMG_SRC_PATTERN = re.compile(r'<img.+?src=[\'"](.+?)[\'"].*?>')


def _child_text(element, path, namespaces=None):
    """Return the text of the first child of *element* matching *path*.

    Returns '' both when the child is missing and when it has no text, so
    callers never have to deal with ``None``.
    """
    return element.findtext(path, default='', namespaces=namespaces) or ''


def _pick_cover_photo_url(item, content, attachment_urls, url_by_guid,
                          url_by_post_id, namespaces, image_url_prefix):
    """Choose a cover photo URL for one post ``<item>``, or '' if none exists.

    Sources are tried in this order; the first non-empty result wins:
      1. the post's guid, if it starts with *image_url_prefix*;
      2. an attachment item that has the same guid as the post;
      3. the attachment referenced by the first ``_thumbnail_id`` post meta;
      4. the ``src`` of the first ``<img>`` tag in *content*;
      5. the first entry of *attachment_urls*.
    """
    guid_text = _child_text(item, 'guid')
    if guid_text:
        if guid_text.startswith(image_url_prefix):
            return guid_text
        # Dictionary lookup instead of an XPath predicate: the guid is never
        # interpolated into a query, so quotes inside it cannot break parsing.
        guid_url = url_by_guid.get(guid_text, '')
        if guid_url:
            return guid_url

    for meta in item.findall('wp:postmeta', namespaces):
        if _child_text(meta, 'wp:meta_key', namespaces) == '_thumbnail_id':
            thumbnail_id = _child_text(meta, 'wp:meta_value', namespaces)
            thumbnail_url = url_by_post_id.get(thumbnail_id, '')
            if thumbnail_url:
                return thumbnail_url
            # Only the first _thumbnail_id entry is considered.
            break

    if content:
        match = _IMG_SRC_PATTERN.search(content)
        if match:
            return match.group(1)

    if attachment_urls:
        return attachment_urls[0]
    return ''


def parse_xml_to_json(xml_file_path, json_file_path, *,
                      image_url_prefix='https://loudrr.files.wordpress.com/'):
    """Convert the posts of a WordPress export XML file into a JSON file.

    Every ``<item>`` whose ``wp:post_type`` is ``post`` becomes one JSON
    object with the keys ``title``, ``link``, ``pubDate``, ``creator``,
    ``content``, ``excerpt``, ``post_id``, ``attachments``, ``category``,
    ``tags`` and ``cover_photo_url``. Missing or empty XML elements are
    written as empty strings.

    Args:
        xml_file_path: Path of the export XML file to read.
        json_file_path: Path of the JSON file to write (UTF-8, indent 2).
        image_url_prefix: A post guid starting with this prefix is treated
            as a direct image URL and used as the cover photo.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    # Only the 1.2 export namespaces are recognised; items from a file using
    # other namespace URIs will not match the prefixed lookups below.
    namespaces = {
        'content': 'http://purl.org/rss/1.0/modules/content/',
        'wp': 'http://wordpress.org/export/1.2/',
        'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
        'dc': 'http://purl.org/dc/elements/1.1/',
    }

    root = ET.parse(xml_file_path).getroot()
    items = root.findall('.//item')

    # Index all attachments once so that each post needs only dictionary
    # lookups instead of rescanning the whole document.
    attachments = {}      # parent post id -> attachment URLs in document order
    url_by_post_id = {}   # attachment post id -> URL (first occurrence wins)
    url_by_guid = {}      # attachment guid -> URL (first occurrence wins)
    for item in items:
        if _child_text(item, 'wp:post_type', namespaces) != 'attachment':
            continue
        url = _child_text(item, 'wp:attachment_url', namespaces)
        if not url:
            continue
        parent_id = _child_text(item, 'wp:post_parent', namespaces)
        if parent_id:
            attachments.setdefault(parent_id, []).append(url)
        attachment_id = _child_text(item, 'wp:post_id', namespaces)
        if attachment_id:
            url_by_post_id.setdefault(attachment_id, url)
        attachment_guid = _child_text(item, 'guid')
        if attachment_guid:
            url_by_guid.setdefault(attachment_guid, url)

    posts = []
    for item in items:
        if _child_text(item, 'wp:post_type', namespaces) != 'post':
            continue

        # <category> elements carry both categories and tags; the "domain"
        # attribute tells them apart. Entries without text are skipped.
        categories = []
        tags = []
        for category in item.findall('category'):
            label = category.text
            if not label:
                continue
            domain = category.get('domain')
            if domain == 'category':
                categories.append(label)
            elif domain == 'post_tag':
                tags.append(label)

        post_id = _child_text(item, 'wp:post_id', namespaces)
        content = _child_text(item, 'content:encoded', namespaces)
        post_attachments = list(attachments.get(post_id, []))

        post_data = {
            'title': _child_text(item, 'title'),
            'link': _child_text(item, 'link'),
            'pubDate': _child_text(item, 'pubDate'),
            'creator': _child_text(item, 'dc:creator', namespaces),
            'content': content,
            'excerpt': _child_text(item, 'excerpt:encoded', namespaces),
            'post_id': post_id,
            'attachments': post_attachments,
            # Only the first category is kept.
            'category': categories[0] if categories else '',
            # Tags are rendered as a space-separated list of hashtags.
            'tags': ' '.join(f'#{tag}' for tag in tags),
        }
        post_data['cover_photo_url'] = _pick_cover_photo_url(
            item, content, post_attachments, url_by_guid, url_by_post_id,
            namespaces, image_url_prefix,
        )
        posts.append(post_data)

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(posts, json_file, ensure_ascii=False, indent=2)

    print(f"Parsing complete. JSON file with {len(posts)} posts saved to {json_file_path}")
# Default input and output locations used when this file is run as a script.
xml_file_path = '/Users/hbo/Downloads/loudrr.wordpress.com-2024-08-12-20_01_55/site.wordpress.2024-08-12.000.xml'
json_file_path = '/Users/hbo/Downloads/parsed_wordpress_posts.json'

# Run the parser only when executed directly, so that importing this module
# (for example to reuse parse_xml_to_json) does not trigger a conversion.
if __name__ == "__main__":
    parse_xml_to_json(xml_file_path, json_file_path)