
error on line 26 at column 534: Sequence ']]>' not allowed in content #1

Open · apple050620312 opened this issue Nov 8, 2023 · 5 comments



apple050620312 commented Nov 8, 2023

I made a copy that uses FetchRSS to scrape Facebook, but it doesn't quite work. Could you take a look?
https://github.com/apple050620312/RSS

@RavelloH (Owner)

> I made a copy that uses FetchRSS to scrape Facebook, but it doesn't quite work. Could you take a look? https://github.com/apple050620312/RSS

@apple050620312 It looks like FetchRSS automatically added summaries when generating the feed. Because the summaries and titles contain newlines, items span different numbers of lines, so the generated entries vary in length and the file cannot be parsed correctly.

To fix this, format the whole file first; an improved implementation follows.

# -*- coding: utf-8 -*-
## If you run into problems, open an issue at github.com/ravelloh/RSS
### Author: RavelloH
#### LICENSE: MIT
##### RSS Maker

import xml.etree.ElementTree as ET
from urllib.request import urlopen
from wget import download
from bs4 import BeautifulSoup as bs
import xml.dom.minidom
import re, os
import linecache

def format_xml(xml):
    # Escape raw newlines inside <title> and <description> so each
    # element occupies exactly one line in the output file
    pattern1 = re.compile(r'<description>(.*?)</description>', re.DOTALL)
    pattern2 = re.compile(r'<title>(.*?)</title>', re.DOTALL)

    def replace_newline(match):
        return match.group(0).replace('\n', '\\n')
    formatted_xml = re.sub(pattern1, replace_newline, xml)
    formatted_xml = re.sub(pattern2, replace_newline, formatted_xml)
    return formatted_xml
    
def process_xml_file(file_path):
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        xml_content = ET.tostring(root, encoding='unicode')
        formatted_xml_content = format_xml(xml_content)
        new_root = ET.fromstring(formatted_xml_content)
        tree._setroot(new_root)
        tree.write(file_path, encoding='utf-8', xml_declaration=True)
    except Exception as e:
        print(f"处理XML文件 {file_path} 失败:{str(e)}")
        exit()

def remove_rss_line(text):
    lines = text.splitlines()
    new_lines = [line for line in lines if not line.startswith("<rss>")]
    return "\n".join(new_lines)

# Replacement helper: replace the first occurrence of old_str with new_str in file
def alter(file, old_str, new_str):
    lines = ''
    with open(file, "r", encoding="utf-8") as f1, open("%s.bak" % file, "w", encoding="utf-8") as f2:
        for line in f1:
            lines = lines + line
        if old_str in lines:
            lines = lines.replace(old_str, new_str , 1)
        f2.write(lines)
    os.remove(file)
    os.rename("%s.bak" % file, file)

# Initialization
print('[Step 0/6] Initializing...')
rsslink = 'https://fetchrss.com/rss/64d97f73c0c2b813d8126e73654ac8295887f3316723e202.xml' # source RSS feed
if 'originRss.xml' in os.listdir('.'):
    os.remove('originRss.xml')
    filename = download(rsslink,'./originRss.xml')
    process_xml_file('./originRss.xml')
    print('\n[Step 0/6] Source RSS updated')
else:
    filename = download(rsslink,'./originRss.xml')
    process_xml_file('./originRss.xml')
    download(rsslink,'./rss.xml')
    process_xml_file('./rss.xml')
    l1 = []
    with open(r"./rss.xml", 'r') as fp:
        l1 = fp.readlines()
    with open(r"./rss.xml", 'w') as fp:
        for number, line in enumerate(l1):
            if number not in [15,16,17,18,19,20,21,22,23]:
                fp.write(line)
    print('\n[Step 0/6] Initialization complete; files built')

# Fetch the source RSS
try:
    rsscontext = urlopen(rsslink)
    print('[Step 1/6] RSS fetched successfully')
except Exception:
    print('[Error] Failed to fetch RSS')
    exit()

# Open the source and local RSS
try:
    OriginRss = xml.dom.minidom.parse(rsscontext)
    getresult = OriginRss.documentElement
    LocalRss = xml.dom.minidom.parse('./rss.xml')
    Localresult = LocalRss.documentElement
    print('[Step 2/6] RSS opened successfully')
except Exception:
    print('[Error] Failed to open RSS')
    exit()
    
# Check whether the RSS has updates
items = getresult.getElementsByTagName("item")
local = Localresult.getElementsByTagName("item")
newpost = items[0]
newtitle = newpost.getElementsByTagName('title')[0]
# Parse the local copy
localpost = local[0]
localtitle = localpost.getElementsByTagName('title')[0]
newtitlestring = newtitle.childNodes[0].data
localtitlestring = localtitle.childNodes[0].data
if newtitlestring == localtitlestring:
    print('[Step 3/6] RSS comparison done: already in sync, no merge needed')
    newtimes = getresult.getElementsByTagName('pubDate')[0]
    oldtimes = Localresult.getElementsByTagName('pubDate')[0]
    newtime = newtimes.childNodes[0].data
    oldtime = oldtimes.childNodes[0].data
    alter('rss.xml', str(oldtime), str(newtime))
    print('[Step 4 skipped]')
    print('[Step 5 skipped]')
    print('[Step 6/6] RSS timestamp updated: %s => %s' % (oldtime, newtime))
    print('[RSS update complete]')
    exit()
else:
    print('[Step 3/6] RSS comparison done: new items to merge')
    
# Merge new items (currently designed to merge only one)
print('[Step 4/6] Merging new items...')
filltext = '''
</image>

<item>
'''
# Find the line containing the new title
with open(filename, 'r') as file:
    line = file.readline()
    counts = 1
    while line:
        if newtitlestring in line:
            break
        line = file.readline()
        counts += 1
# Build the block of lines to insert (the title line plus its surrounding item lines)
tofiletext = ''.join(linecache.getline(filename, counts + i) for i in range(-1, 6))
file_name = "./rss.xml"
with open(file_name, 'r') as f:
    lines = f.readlines()
    lines.insert(15,tofiletext)
    s = ''.join(lines)
with open(file_name, 'w') as f:
    f.write(remove_rss_line(s))
print('\n[Step 4/6] New items merged')

# Scrape each post's description from the blog and strip the FetchRSS ad
keyword = '<![CDATA[<br/><br/><span style="font-size:12px; color: gray;">(Feed generated with <a href="https://fetchrss.com" target="_blank">FetchRSS</a>)</span>]]>'
with open(file_name, 'r') as f:
    totalline = len(f.readlines())
needs = []
with open(file_name, 'r') as file:
    line = file.readline()
    counts = 1
    while line:
        if keyword in line:
            needs.append(counts)
        if counts == totalline:
            break
        line = file.readline()
        counts += 1

for j in needs:
    originurl = linecache.getline(file_name, j-1)
    pattern = re.compile(r'[a-zA-Z]+://[^\s<]*')  # note: A-Z, not the A-z range
    obj = bs(urlopen(pattern.search(originurl).group()).read(), 'html.parser')
    description_info = obj.find_all('p')
    totallen = len(description_info)
    nowlen = 0
    # Advance past short paragraphs; check the bounds before indexing
    # so the loop cannot raise an IndexError
    while nowlen < totallen - 1 and len(str(description_info[nowlen])) <= 70:
        nowlen += 1
    summary = str(description_info[nowlen].get_text(strip=True))
    print('[Step 5/6] Fetched a summary: %s...' % summary[:10])
    alter(file_name, keyword, summary)

# Update the timestamp
newtimes = getresult.getElementsByTagName('pubDate')[0]
oldtimes = Localresult.getElementsByTagName('pubDate')[0]
newtime = newtimes.childNodes[0].data
oldtime = oldtimes.childNodes[0].data
alter('rss.xml',str(oldtime),str(newtime))
print('[Step 6/6] RSS timestamp updated: %s => %s' % (oldtime, newtime))
print('[RSS update complete]')
print('Generated %s and %s' % (file_name, filename))
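
For reference, here is a minimal self-contained sketch of what the normalization step does (the function name escape_newlines and the sample XML string are made up for illustration):

import re

def escape_newlines(xml):
    # Same idea as format_xml above: escape raw newlines inside
    # <title> and <description> so each element stays on one line
    for pat in (r'<description>(.*?)</description>', r'<title>(.*?)</title>'):
        xml = re.sub(pat, lambda m: m.group(0).replace('\n', '\\n'), xml, flags=re.DOTALL)
    return xml

print(escape_newlines('<title>First line\nsecond line</title>'))
# prints <title>First line\nsecond line</title> on a single line,
# with a literal backslash-n instead of a raw newline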

If the code above runs successfully, the file can be parsed normally.
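
A quick way to confirm this (a minimal sketch, assuming the generated file is at ./rss.xml):

import xml.etree.ElementTree as ET

# If this raises no ParseError, the ']]>' error is gone
try:
    ET.parse('./rss.xml')
    print('rss.xml parses cleanly')
except ET.ParseError as e:
    print(f'still broken: {e}')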

If the error still appears, try the following two approaches:

1. Change the FetchRSS settings so it generates only each item's title and link, with no extra information, making the structure match https://github.com/RavelloH/RSS/blob/main/originRss.xml
2. Modify the source code. The part below handles the line-based replacement; you can adjust the relative line numbers yourself. The line numbers live in a list; make sure the first item's lines fall entirely within it (see the sketch after this snippet).

    with open(r"./rss.xml", 'w') as fp:
        for number, line in enumerate(l1):
            if number not in [15,16,17,18,19,20,21,22,23]:
                fp.write(line)
    print('\n[Step 0/6] Initialization complete; files built')
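
If your feed's first item spans a different set of lines, a more robust variant (a sketch, not part of the original script; it assumes <item> and </item> each appear on their own line) computes the range instead of hard-coding it:

def first_item_lines(path):
    # Return the 0-based indexes of the lines covering the first <item>...</item>
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
    start = next(i for i, line in enumerate(lines) if '<item>' in line)
    end = next(i for i, line in enumerate(lines) if '</item>' in line and i >= start)
    return list(range(start, end + 1))

# Use the computed range in place of the fixed list [15, ..., 23]:
# skip = first_item_lines('./rss.xml')
# ...
#     if number not in skip:
#         fp.write(line)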

@apple050620312 (Author)

@RavelloH Could you share the FetchRSS settings you'd recommend?

@RavelloH (Owner)

> @RavelloH Could you share the FetchRSS settings you'd recommend?

@apple050620312 It seems I built mine by selecting the relevant elements myself with the element selector; I couldn't find anywhere to export the configuration.

That said, I've also written other related RSS-processing programs.

Since your source RSS already has rich content, this program probably won't add much. You might instead try RavelloH/rss-aggregator to aggregate your feeds; like this program, it can keep historical RSS records, sync the RSS and update dates automatically, and build your GitHub Pages site.

@apple050620312 (Author)

@RavelloH I only want to use this because it can remove the ads 😂


RavelloH commented Nov 12, 2023

> @RavelloH I only want to use this because it can remove the ads 😂

@apple050620312 You could try this change; only format_xml differs from the script above:

def format_xml(xml):
    pattern1 = re.compile(r'<description>(.*?)</description>', re.DOTALL)
    pattern2 = re.compile(r'<title>(.*?)</title>', re.DOTALL)

    def replace_newline(match):
        return match.group(0).replace('\n', '\\n')
    formatted_xml = re.sub(pattern1, replace_newline, xml)
    formatted_xml = re.sub(pattern2, replace_newline, formatted_xml)
    # Strip the escaped FetchRSS ad markup embedded in each description
    ad = ('&lt;span style="font-size:12px; color: gray;"&gt;(Feed generated with '
          '&lt;a href="https://fetchrss.com" target="_blank"&gt;FetchRSS&lt;/a&gt;)'
          '&lt;/span&gt;')
    return formatted_xml.replace(ad, '')
(The remainder of the script is identical to the version above.)

Although this still has issues when building the merged rss.xml, originRss.xml itself ends up ad-free.
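
As a quick check that the replacement matches (a minimal sketch; the sample string mimics the escaped ad markup as it appears after ET.tostring):

ad = ('&lt;span style="font-size:12px; color: gray;"&gt;(Feed generated with '
      '&lt;a href="https://fetchrss.com" target="_blank"&gt;FetchRSS&lt;/a&gt;)'
      '&lt;/span&gt;')
sample = '<description>Post text&lt;br/&gt;&lt;br/&gt;' + ad + '</description>'
print(sample.replace(ad, ''))
# -> <description>Post text&lt;br/&gt;&lt;br/&gt;</description>
# the ad span is removed; the leading <br/><br/> entities remain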
