
error on line 26 at column 534: Sequence ']]>' not allowed in content #1

Open · apple050620312 opened this issue Nov 8, 2023 · 5 comments



apple050620312 commented Nov 8, 2023

I made a copy that uses FetchRSS to scrape Facebook, but it doesn't quite work. Could you take a look?
https://github.com/apple050620312/RSS

@RavelloH (Owner)

> I made a copy that uses FetchRSS to scrape Facebook, but it doesn't quite work. Could you take a look? https://github.com/apple050620312/RSS

@apple050620312 It looks like FetchRSS automatically added summaries when generating the feed. Because the summaries and titles contain newlines, items span different numbers of lines, so the generated entries vary in length and the file cannot be parsed correctly.

To fix this, format the whole file first; an improved implementation follows.

# -*- coding: utf-8 -*-
## If you run into problems, open an issue at github.com/ravelloh/RSS
### Author: RavelloH
#### LICENSE: MIT
##### RSS Maker

import xml.etree.ElementTree as ET
from urllib.request import urlopen
from wget import download
from bs4 import BeautifulSoup as bs
import xml.dom.minidom
import re, os
import linecache

def format_xml(xml):
    # Escape raw newlines inside <title> and <description> so each
    # element occupies exactly one line in the output file
    pattern1 = re.compile(r'<description>(.*?)</description>', re.DOTALL)
    pattern2 = re.compile(r'<title>(.*?)</title>', re.DOTALL)

    def replace_newline(match):
        return match.group(0).replace('\n', '\\n')
    formatted_xml = re.sub(pattern1, replace_newline, xml)
    formatted_xml = re.sub(pattern2, replace_newline, formatted_xml)
    return formatted_xml
    
def process_xml_file(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        xml_content = ET.tostring(root, encoding='unicode')
        formatted_xml_content = format_xml(xml_content)
        new_root = ET.fromstring(formatted_xml_content)
        tree._setroot(new_root)
        tree.write(file_path, encoding='utf-8', xml_declaration=True)
    except Exception as e:
        print(f"处理XML文件 {file_path} 失败:{str(e)}")
        exit()

def remove_rss_line(text):
    lines = text.splitlines()
    new_lines = [line for line in lines if not line.startswith("<rss>")]
    return "\n".join(new_lines)

# Replacement helper: replace the first occurrence of old_str with new_str in file
def alter(file, old_str, new_str):
    lines = ''
    with open(file, "r", encoding="utf-8") as f1, open("%s.bak" % file, "w", encoding="utf-8") as f2:
        for line in f1:
            lines = lines + line
        if old_str in lines:
            lines = lines.replace(old_str, new_str , 1)
        f2.write(lines)
    os.remove(file)
    os.rename("%s.bak" % file, file)

# Initialization
print('[Step 0/6] Initializing...')
rsslink = 'https://fetchrss.com/rss/64d97f73c0c2b813d8126e73654ac8295887f3316723e202.xml' # source RSS feed
if 'originRss.xml' in os.listdir('.'):
    os.remove('originRss.xml')
    filename = download(rsslink,'./originRss.xml')
    process_xml_file('./originRss.xml')
    print('\n[Step 0/6] Source RSS updated')
else:
    filename = download(rsslink,'./originRss.xml')
    process_xml_file('./originRss.xml')
    download(rsslink,'./rss.xml')
    process_xml_file('./rss.xml')
    l1 = []
    with open(r"./rss.xml", 'r') as fp:
        l1 = fp.readlines()
    with open(r"./rss.xml", 'w') as fp:
        for number, line in enumerate(l1):
            if number not in [15,16,17,18,19,20,21,22,23]:
                fp.write(line)
    print('\n[Step 0/6] Initialization complete; files built')

# Fetch the source RSS
try:
    rsscontext = urlopen(rsslink)
    print('[Step 1/6] RSS fetched successfully')
except Exception:
    print('[Error] Failed to fetch RSS')
    exit()

# Open the source and local RSS
try:
    OriginRss = xml.dom.minidom.parse(rsscontext)
    getresult = OriginRss.documentElement
    LocalRss = xml.dom.minidom.parse('./rss.xml')
    Localresult = LocalRss.documentElement
    print('[Step 2/6] RSS opened successfully')
except Exception:
    print('[Error] Failed to open RSS')
    exit()
    
# Check whether the RSS has updates
items = getresult.getElementsByTagName("item")
local = Localresult.getElementsByTagName("item")
newpost = items[0]
newtitle = newpost.getElementsByTagName('title')[0]
# Parse the local copy
localpost = local[0]
localtitle = localpost.getElementsByTagName('title')[0]
newtitlestring = newtitle.childNodes[0].data
localtitlestring = localtitle.childNodes[0].data
if newtitlestring == localtitlestring:
    print('[Step 3/6] RSS comparison done: already in sync, no merge needed')
    newtimes = getresult.getElementsByTagName('pubDate')[0]
    oldtimes = Localresult.getElementsByTagName('pubDate')[0]
    newtime = newtimes.childNodes[0].data
    oldtime = oldtimes.childNodes[0].data
    alter('rss.xml', str(oldtime), str(newtime))
    print('[Step 4 skipped]')
    print('[Step 5 skipped]')
    print('[Step 6/6] RSS timestamp updated: %s => %s' % (oldtime, newtime))
    print('[RSS update complete]')
    exit()
else:
    print('[Step 3/6] RSS comparison done: new items to merge')
    
# Merge new items (currently designed to merge only one)
print('[Step 4/6] Merging new items...')
filltext = '''
</image>

<item>
'''
# Find the line containing the new title
with open(filename, 'r') as file:
    line = file.readline()
    counts = 1
    while line:
        if newtitlestring in line:
            break
        line = file.readline()
        counts += 1
# Build the block of lines to insert (the title line plus its surrounding item lines)
tofiletext = ''.join(linecache.getline(filename, counts + i) for i in range(-1, 6))
file_name = "./rss.xml"
with open(file_name, 'r') as f:
    lines = f.readlines()
    lines.insert(15,tofiletext)
    s = ''.join(lines)
with open(file_name, 'w') as f:
    f.write(remove_rss_line(s))
print('\n[Step 4/6] New items merged')

# Scrape each post's description from the blog and strip the FetchRSS ad
keyword = '<![CDATA[<br/><br/><span style="font-size:12px; color: gray;">(Feed generated with <a href="https://fetchrss.com" target="_blank">FetchRSS</a>)</span>]]>'
with open(file_name, 'r') as f:
    totalline = len(f.readlines())
needs = []
with open(file_name, 'r') as file:
    line = file.readline()
    counts = 1
    while line:
        if keyword in line:
            needs.append(counts)
        if counts == totalline:
            break
        line = file.readline()
        counts += 1

for j in needs:
    originurl = linecache.getline(file_name, j-1)
    pattern = re.compile(r'[a-zA-Z]+://[^\s<]*')  # note: A-Z, not the A-z range
    obj = bs(urlopen(pattern.search(originurl).group()).read(), 'html.parser')
    description_info = obj.find_all('p')
    totallen = len(description_info)
    nowlen = 0
    # Advance past short paragraphs; check the bounds before indexing
    # so the loop cannot raise an IndexError
    while nowlen < totallen - 1 and len(str(description_info[nowlen])) <= 70:
        nowlen += 1
    summary = str(description_info[nowlen].get_text(strip=True))
    print('[Step 5/6] Fetched a summary: %s...' % summary[:10])
    alter(file_name, keyword, summary)

# Update the timestamp
newtimes = getresult.getElementsByTagName('pubDate')[0]
oldtimes = Localresult.getElementsByTagName('pubDate')[0]
newtime = newtimes.childNodes[0].data
oldtime = oldtimes.childNodes[0].data
alter('rss.xml',str(oldtime),str(newtime))
print('[Step 6/6] RSS timestamp updated: %s => %s' % (oldtime, newtime))
print('[RSS update complete]')
print('Generated %s and %s' % (file_name, filename))
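
For reference, here is a minimal self-contained sketch of what the normalization step does (the function name escape_newlines and the sample XML string are made up for illustration):

import re

def escape_newlines(xml):
    # Same idea as format_xml above: escape raw newlines inside
    # <title> and <description> so each element stays on one line
    for pat in (r'<description>(.*?)</description>', r'<title>(.*?)</title>'):
        xml = re.sub(pat, lambda m: m.group(0).replace('\n', '\\n'), xml, flags=re.DOTALL)
    return xml

print(escape_newlines('<title>First line\nsecond line</title>'))
# prints <title>First line\nsecond line</title> on a single line,
# with a literal backslash-n instead of a raw newline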

If the code above runs successfully, the file can be parsed normally.
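
A quick way to confirm this (a minimal sketch, assuming the generated file is at ./rss.xml):

import xml.etree.ElementTree as ET

# If this raises no ParseError, the ']]>' error is gone
try:
    ET.parse('./rss.xml')
    print('rss.xml parses cleanly')
except ET.ParseError as e:
    print(f'still broken: {e}')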

If the error still appears, try the following two approaches:

1. Change the FetchRSS settings so it generates only each item's title and link, with no extra information, making the structure match https://github.com/RavelloH/RSS/blob/main/originRss.xml
2. Modify the source code. The part below handles the line-based replacement; you can adjust the relative line numbers yourself. The line numbers live in a list; make sure the first item's lines fall entirely within it (see the sketch after this snippet).

    with open(r"./rss.xml", 'w') as fp:
        for number, line in enumerate(l1):
            if number not in [15,16,17,18,19,20,21,22,23]:
                fp.write(line)
    print('\n[Step 0/6] Initialization complete; files built')
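
If your feed's first item spans a different set of lines, a more robust variant (a sketch, not part of the original script; it assumes <item> and </item> each appear on their own line) computes the range instead of hard-coding it:

def first_item_lines(path):
    # Return the 0-based indexes of the lines covering the first <item>...</item>
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
    start = next(i for i, line in enumerate(lines) if '<item>' in line)
    end = next(i for i, line in enumerate(lines) if '</item>' in line and i >= start)
    return list(range(start, end + 1))

# Use the computed range in place of the fixed list [15, ..., 23]:
# skip = first_item_lines('./rss.xml')
# ...
#     if number not in skip:
#         fp.write(line)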

@apple050620312 (Author)

@RavelloH Could you share the FetchRSS settings you'd recommend?

@RavelloH (Owner)

> @RavelloH Could you share the FetchRSS settings you'd recommend?

@apple050620312 It seems I built mine by selecting the relevant elements myself with the element selector; I couldn't find anywhere to export the configuration.

That said, I've also written other related RSS-processing programs.

Since your source RSS already has rich content, this program probably won't add much. You might instead try RavelloH/rss-aggregator to aggregate your feeds; like this program, it can keep historical RSS records, sync the RSS and update dates automatically, and build your GitHub Pages site.

@apple050620312 (Author)

@RavelloH I only want to use this because it can remove the ads 😂


RavelloH commented Nov 12, 2023

> @RavelloH I only want to use this because it can remove the ads 😂

@apple050620312 You could try this change; only format_xml differs from the script above:

def format_xml(xml):
    pattern1 = re.compile(r'<description>(.*?)</description>', re.DOTALL)
    pattern2 = re.compile(r'<title>(.*?)</title>', re.DOTALL)

    def replace_newline(match):
        return match.group(0).replace('\n', '\\n')
    formatted_xml = re.sub(pattern1, replace_newline, xml)
    formatted_xml = re.sub(pattern2, replace_newline, formatted_xml)
    # Strip the escaped FetchRSS ad markup embedded in each description
    ad = ('&lt;span style="font-size:12px; color: gray;"&gt;(Feed generated with '
          '&lt;a href="https://fetchrss.com" target="_blank"&gt;FetchRSS&lt;/a&gt;)'
          '&lt;/span&gt;')
    return formatted_xml.replace(ad, '')
(The remainder of the script is identical to the version above.)

Although this still has issues when building the merged rss.xml, originRss.xml itself ends up ad-free.
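
As a quick check that the replacement matches (a minimal sketch; the sample string mimics the escaped ad markup as it appears after ET.tostring):

ad = ('&lt;span style="font-size:12px; color: gray;"&gt;(Feed generated with '
      '&lt;a href="https://fetchrss.com" target="_blank"&gt;FetchRSS&lt;/a&gt;)'
      '&lt;/span&gt;')
sample = '<description>Post text&lt;br/&gt;&lt;br/&gt;' + ad + '</description>'
print(sample.replace(ad, ''))
# -> <description>Post text&lt;br/&gt;&lt;br/&gt;</description>
# the ad span is removed; the leading <br/><br/> entities remain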
