# xmlParseScript.py

import xml.etree.ElementTree as ET
import json
import re

# Namespace prefixes used by WordPress WXR 1.2 exports.
_WP_NAMESPACES = {
    'content': 'http://purl.org/rss/1.0/modules/content/',
    'wp': 'http://wordpress.org/export/1.2/',
    'excerpt': 'http://wordpress.org/export/1.2/excerpt/',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'rss': 'http://purl.org/rss/1.0/',
}

# Captures the src attribute of the first <img> tag in a post body.
_IMG_SRC_RE = re.compile(r'<img.+?src=[\'"](.+?)[\'"].*?>')


def _child_text(element, path, default=''):
    """Return the text of the first child matching *path*.

    *default* is returned both when the child is missing and when it is
    present but empty, so callers never have to deal with ``None`` text.
    """
    child = element.find(path, _WP_NAMESPACES)
    if child is None or child.text is None:
        return default
    return child.text


def _index_items(items):
    """Build the lookup tables needed for cover-photo resolution in one pass.

    Returns a 3-tuple of dicts:
      * parent post id -> list of attachment URLs (document order)
      * attachment post id -> attachment URL (first occurrence wins)
      * guid text -> first <item> carrying that guid
    """
    attachments_by_parent = {}
    attachment_url_by_id = {}
    first_item_by_guid = {}

    for item in items:
        for guid in item.findall('guid'):
            first_item_by_guid.setdefault(''.join(guid.itertext()), item)

        if _child_text(item, 'wp:post_type') != 'attachment':
            continue

        url = _child_text(item, 'wp:attachment_url')
        attachment_id = _child_text(item, 'wp:post_id')
        if attachment_id:
            attachment_url_by_id.setdefault(attachment_id, url)

        parent_id = _child_text(item, 'wp:post_parent', default=None)
        if parent_id is not None and url:
            attachments_by_parent.setdefault(parent_id, []).append(url)

    return attachments_by_parent, attachment_url_by_id, first_item_by_guid


def _collect_terms(item):
    """Return ``(categories, tags)`` text lists from the item's <category> children."""
    categories = []
    tags = []
    for term in item.findall('category'):
        if not term.text:
            # An empty term would otherwise be rendered as "#None".
            continue
        domain = term.get('domain')
        if domain == 'category':
            categories.append(term.text)
        elif domain == 'post_tag':
            tags.append(term.text)
    return categories, tags


def _find_cover_photo(item, content, post_attachments, attachment_url_by_id,
                      first_item_by_guid, media_url_prefix):
    """Resolve a post's cover photo URL, or '' when none can be found.

    Sources are tried in order: the guid (direct media URL, or the guid of
    an attachment item), the ``_thumbnail_id`` post meta, the first <img>
    in the content, and finally the first attachment of the post.
    """
    guid_text = _child_text(item, 'guid')
    if guid_text:
        if guid_text.startswith(media_url_prefix):
            return guid_text
        # Dictionary lookup instead of an interpolated XPath predicate:
        # a guid containing a quote character would make the XPath invalid.
        guid_item = first_item_by_guid.get(guid_text)
        if guid_item is not None and _child_text(guid_item, 'wp:post_type') == 'attachment':
            url = _child_text(guid_item, 'wp:attachment_url')
            if url:
                return url

    for meta in item.findall('wp:postmeta', _WP_NAMESPACES):
        if _child_text(meta, 'wp:meta_key') == '_thumbnail_id':
            thumbnail_id = _child_text(meta, 'wp:meta_value')
            url = attachment_url_by_id.get(thumbnail_id, '') if thumbnail_id else ''
            if url:
                return url
            # Only the first _thumbnail_id entry is considered.
            break

    if content:
        match = _IMG_SRC_RE.search(content)
        if match:
            return match.group(1)

    if post_attachments:
        return post_attachments[0]

    return ''


def parse_xml_to_json(xml_file_path, json_file_path, *,
                      media_url_prefix='https://loudrr.files.wordpress.com/'):
    """Convert the posts of a WordPress WXR export into a JSON file.

    Every ``<item>`` whose ``wp:post_type`` is ``post`` becomes one JSON
    object with the keys ``title``, ``link``, ``pubDate``, ``creator``,
    ``content``, ``excerpt``, ``post_id``, ``attachments``, ``category``
    (first category only), ``tags`` (space-separated, ``#``-prefixed) and
    ``cover_photo_url``. Missing or empty elements are written as ``''``.

    Args:
        xml_file_path: Path of the WXR/XML export to read.
        json_file_path: Path of the JSON file to write (UTF-8, indent 2).
        media_url_prefix: A guid starting with this prefix is taken to be
            the cover photo URL itself.

    Raises:
        xml.etree.ElementTree.ParseError: If the XML is not well formed.
        OSError: If either file cannot be opened.
    """
    root = ET.parse(xml_file_path).getroot()
    items = root.findall('.//item')

    attachments_by_parent, attachment_url_by_id, first_item_by_guid = _index_items(items)

    posts = []
    for item in items:
        if _child_text(item, 'wp:post_type') != 'post':
            continue

        post_id = _child_text(item, 'wp:post_id')
        categories, tags = _collect_terms(item)
        content = _child_text(item, 'content:encoded')
        post_attachments = attachments_by_parent.get(post_id, [])

        posts.append({
            'title': _child_text(item, 'title'),
            'link': _child_text(item, 'link'),
            'pubDate': _child_text(item, 'pubDate'),
            'creator': _child_text(item, 'dc:creator'),
            'content': content,
            'excerpt': _child_text(item, 'excerpt:encoded'),
            'post_id': post_id,
            'attachments': post_attachments,
            'category': categories[0] if categories else '',
            'tags': ' '.join(f'#{tag}' for tag in tags),
            'cover_photo_url': _find_cover_photo(
                item, content, post_attachments, attachment_url_by_id,
                first_item_by_guid, media_url_prefix,
            ),
        })

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(posts, json_file, ensure_ascii=False, indent=2)

    print(f"Parsing complete. JSON file with {len(posts)} posts saved to {json_file_path}")

# File paths (kept at module level so they stay available by name)
xml_file_path = '/Users/hbo/Downloads/loudrr.wordpress.com-2024-08-12-20_01_55/site.wordpress.2024-08-12.000.xml'
json_file_path = '/Users/hbo/Downloads/parsed_wordpress_posts.json'

# Run the parser only when executed as a script, so importing this module
# does not read or write any files.
if __name__ == '__main__':
    parse_xml_to_json(xml_file_path, json_file_path)