过滤CDATA

This commit is contained in:
qcminecraft 2021-02-04 21:11:08 +08:00
parent 03bd04c4f9
commit 83a0aeea34
2 changed files with 16 additions and 3 deletions

View File

@@ -13,5 +13,15 @@
"rss_source": "https://rsshub.app/solidot/www", "rss_source": "https://rsshub.app/solidot/www",
"identity": "misskey.dev", "identity": "misskey.dev",
"extra_content": "" "extra_content": ""
},
"ZaoBao": {
"rss_source": "https://rsshub.app/zaobao/realtime/china",
"identity": "ZaobaoBot@x61.uk",
"extra_content": "#News"
},
"NIKKEI": {
"rss_source": "https://rsshub.app/nikkei/index",
"identity": "NIKKEI@x61.uk",
"extra_content": "#News"
} }
} }

View File

@@ -2,6 +2,7 @@
# -*- coding: UTF-8 -*- # -*- coding: UTF-8 -*-
import json import json
import re
import sqlite3 import sqlite3
import requests import requests
import xmltodict import xmltodict
@@ -42,13 +43,15 @@ def spider(rule_name, rss_url):
item_list = result['rss']['channel']['item'] item_list = result['rss']['channel']['item']
for i in item_list: for i in item_list:
unique = c.execute('SELECT * FROM "main"."result" WHERE "title" = ? LIMIT 0,1', (i['title'],)).fetchone() unique = c.execute('SELECT * FROM "main"."result" WHERE "title" = ? LIMIT 0,1', (i['title'],)).fetchone()
re_cdata = re.compile('//<![CDATA[[^>]*//]]>', re.I)
title = re_cdata.sub('', i['title'])
if not (unique is None): if not (unique is None):
print("Skip: ", i['title']) print("Skip: ", title)
continue continue
print("Got: ", i['title']) print("Got: ", title)
desc = i['description'].replace("<blockquote>", "").replace("</blockquote>", "") desc = i['description'].replace("<blockquote>", "").replace("</blockquote>", "")
c.execute('INSERT INTO "main"."result" ("rule_name", "url", "title", "description", "timestamp")' c.execute('INSERT INTO "main"."result" ("rule_name", "url", "title", "description", "timestamp")'
' VALUES (?, ?, ?, ?, ?)', (rule_name, i['link'], i['title'], desc, time.time())) ' VALUES (?, ?, ?, ?, ?)', (rule_name, i['link'], title, desc, time.time()))
c.close() c.close()
end = time.time() end = time.time()