过滤CDATA
This commit is contained in:
parent
03bd04c4f9
commit
83a0aeea34
10
rules.json
10
rules.json
|
@ -13,5 +13,15 @@
|
||||||
"rss_source": "https://rsshub.app/solidot/www",
|
"rss_source": "https://rsshub.app/solidot/www",
|
||||||
"identity": "misskey.dev",
|
"identity": "misskey.dev",
|
||||||
"extra_content": ""
|
"extra_content": ""
|
||||||
|
},
|
||||||
|
"ZaoBao": {
|
||||||
|
"rss_source": "https://rsshub.app/zaobao/realtime/china",
|
||||||
|
"identity": "ZaobaoBot@x61.uk",
|
||||||
|
"extra_content": "#News"
|
||||||
|
},
|
||||||
|
"NIKKEI": {
|
||||||
|
"rss_source": "https://rsshub.app/nikkei/index",
|
||||||
|
"identity": "NIKKEI@x61.uk",
|
||||||
|
"extra_content": "#News"
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -2,6 +2,7 @@
|
||||||
# -*- coding: UTF-8 -*-
|
# -*- coding: UTF-8 -*-
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import requests
|
import requests
|
||||||
import xmltodict
|
import xmltodict
|
||||||
|
@ -42,13 +43,15 @@ def spider(rule_name, rss_url):
|
||||||
item_list = result['rss']['channel']['item']
|
item_list = result['rss']['channel']['item']
|
||||||
for i in item_list:
|
for i in item_list:
|
||||||
unique = c.execute('SELECT * FROM "main"."result" WHERE "title" = ? LIMIT 0,1', (i['title'],)).fetchone()
|
unique = c.execute('SELECT * FROM "main"."result" WHERE "title" = ? LIMIT 0,1', (i['title'],)).fetchone()
|
||||||
|
re_cdata = re.compile('//<![CDATA[[^>]*//]]>', re.I)
|
||||||
|
title = re_cdata.sub('', i['title'])
|
||||||
if not (unique is None):
|
if not (unique is None):
|
||||||
print("Skip: ", i['title'])
|
print("Skip: ", title)
|
||||||
continue
|
continue
|
||||||
print("Got: ", i['title'])
|
print("Got: ", title)
|
||||||
desc = i['description'].replace("<blockquote>", "“").replace("</blockquote>", "”")
|
desc = i['description'].replace("<blockquote>", "“").replace("</blockquote>", "”")
|
||||||
c.execute('INSERT INTO "main"."result" ("rule_name", "url", "title", "description", "timestamp")'
|
c.execute('INSERT INTO "main"."result" ("rule_name", "url", "title", "description", "timestamp")'
|
||||||
' VALUES (?, ?, ?, ?, ?)', (rule_name, i['link'], i['title'], desc, time.time()))
|
' VALUES (?, ?, ?, ?, ?)', (rule_name, i['link'], title, desc, time.time()))
|
||||||
|
|
||||||
c.close()
|
c.close()
|
||||||
end = time.time()
|
end = time.time()
|
||||||
|
|
Loading…
Reference in New Issue