Added simple URL validation for the Python fetching script #171

This commit is contained in:
ManeraKai 2022-04-08 05:36:19 +03:00
parent 4faf8f0d67
commit ec3089c8bc
No known key found for this signature in database
GPG Key ID: 5ABC31FFD562E337
2 changed files with 55 additions and 26 deletions

View File

@ -17,9 +17,9 @@
"https://invidious.lunar.icu", "https://invidious.lunar.icu",
"https://invidious.mutahar.rocks", "https://invidious.mutahar.rocks",
"https://invidious.weblibre.org", "https://invidious.weblibre.org",
"https://invidious.esmailelbob.xyz",
"https://invidious.privacy.gd", "https://invidious.privacy.gd",
"https://youtube.076.ne.jp", "https://youtube.076.ne.jp",
"https://invidious.esmailelbob.xyz",
"https://invidious.namazso.eu" "https://invidious.namazso.eu"
], ],
"tor": [ "tor": [
@ -175,7 +175,6 @@
"https://libreddit.winscloud.net", "https://libreddit.winscloud.net",
"https://libreddit.tiekoetter.com", "https://libreddit.tiekoetter.com",
"https://reddit.rtrace.io", "https://reddit.rtrace.io",
"http://libreddit.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion/",
"https://libreddit.lunar.icu" "https://libreddit.lunar.icu"
], ],
"tor": [ "tor": [
@ -190,7 +189,8 @@
"http://libredoxhxwnmsb6dvzzd35hmgzmawsq5i764es7witwhddvpc2razid.onion", "http://libredoxhxwnmsb6dvzzd35hmgzmawsq5i764es7witwhddvpc2razid.onion",
"http://libreddit.2syis2nnyytz6jnusnjurva4swlaizlnleiks5mjp46phuwjbdjqwgqd.onion", "http://libreddit.2syis2nnyytz6jnusnjurva4swlaizlnleiks5mjp46phuwjbdjqwgqd.onion",
"http://ol5begilptoou34emq2sshf3may3hlblvipdjtybbovpb7c7zodxmtqd.onion", "http://ol5begilptoou34emq2sshf3may3hlblvipdjtybbovpb7c7zodxmtqd.onion",
"http://lbrdtjaj7567ptdd4rv74lv27qhxfkraabnyphgcvptl64ijx2tijwid.onion" "http://lbrdtjaj7567ptdd4rv74lv27qhxfkraabnyphgcvptl64ijx2tijwid.onion",
"http://libreddit.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion"
] ]
}, },
"teddit": { "teddit": {
@ -230,12 +230,14 @@
"https://wikiless.sethforprivacy.com", "https://wikiless.sethforprivacy.com",
"https://wiki.604kph.xyz", "https://wiki.604kph.xyz",
"https://wikiless.lunar.icu", "https://wikiless.lunar.icu",
"https://https://wiki.froth.zone", "https://https://wiki.froth.zone"
"https://hflqp2ejxygpj6cdwo3ogfieqmxw3b56w7dblt7bor2ltwk6kcfa.b32.i2p"
], ],
"tor": [ "tor": [
"http://dj2tbh2nqfxyfmvq33cjmhuw7nb6am7thzd3zsjvizeqf374fixbrxyd.onion", "http://dj2tbh2nqfxyfmvq33cjmhuw7nb6am7thzd3zsjvizeqf374fixbrxyd.onion",
"http://c2pesewpalbi6lbfc5hf53q4g3ovnxe4s7tfa6k2aqkf7jd7a7dlz5ad.onion" "http://c2pesewpalbi6lbfc5hf53q4g3ovnxe4s7tfa6k2aqkf7jd7a7dlz5ad.onion"
],
"i2p": [
"http://hflqp2ejxygpj6cdwo3ogfieqmxw3b56w7dblt7bor2ltwk6kcfa.b32.i2p"
] ]
}, },
"scribe": { "scribe": {

View File

@ -5,13 +5,19 @@ import json
from urllib.parse import urlparse from urllib.parse import urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
from colorama import Fore, Back, Style
mightyList = {} mightyList = {}
def filterLastSlash(urlList):
    """Return a new list of the given URLs with one trailing slash removed.

    Entries that do not end with ``/`` are copied unchanged. Only a single
    trailing ``/`` is dropped per entry. Each entry that gets trimmed is
    printed (in its original, untrimmed form) after a yellow ``filtered``
    tag so the change shows up in the script output.

    The input list itself is not modified.
    """
    cleaned = []
    for url in urlList:
        if not url.endswith('/'):
            cleaned.append(url)
            continue
        # Drop exactly the last character, i.e. the trailing slash.
        cleaned.append(url[:-1])
        print(Fore.YELLOW + "filtered " + Style.RESET_ALL + url)
    return cleaned
# Invidious # Invidious
@ -26,7 +32,7 @@ for instance in rJson:
elif instance[1]['type'] == 'onion': elif instance[1]['type'] == 'onion':
invidiousList['tor'].append(instance[1]['uri']) invidiousList['tor'].append(instance[1]['uri'])
mightyList['invidious'] = invidiousList mightyList['invidious'] = invidiousList
print('fetched Invidious') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Invidious')
# Nitter # Nitter
@ -53,10 +59,10 @@ for table in tables:
url = 'https://' + url url = 'https://' + url
nitterList['normal'].append(url) nitterList['normal'].append(url)
mightyList['nitter'] = nitterList mightyList['nitter'] = nitterList
print('fetched Nitter') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Nitter')
# Bibliogram # Bibliogram
r = requests.get('https://bibliogram.1d4.us/api/instances') r = requests.get('https://bibliogram.pussthecat.org/api/instances')
rJson = json.loads(r.text) rJson = json.loads(r.text)
bibliogramList = {} bibliogramList = {}
bibliogramList['normal'] = [] bibliogramList['normal'] = []
@ -64,7 +70,7 @@ bibliogramList['tor'] = []
for item in rJson['data']: for item in rJson['data']:
bibliogramList['normal'].append(item['address']) bibliogramList['normal'].append(item['address'])
mightyList['bibliogram'] = bibliogramList mightyList['bibliogram'] = bibliogramList
print('fetched Bibliogram') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Bibliogram')
# LibReddit # LibReddit
r = requests.get( r = requests.get(
@ -72,15 +78,19 @@ r = requests.get(
libredditList = {} libredditList = {}
libredditList['normal'] = [] libredditList['normal'] = []
libredditList['tor'] = [] libredditList['tor'] = []
tmp = re.findall( tmp = re.findall(
r"\| \[.*\]\(([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)\)*\|*[A-Z]{0,}.*\|.*\|", r.text) r"\| \[.*\]\(([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)\)*\|*[A-Z]{0,}.*\|.*\|", r.text)
tmp = filterLastSlash(tmp)
for item in tmp: for item in tmp:
if item.endswith('.onion'): if item.endswith('.onion'):
libredditList['tor'].append(item) libredditList['tor'].append(item)
else: else:
libredditList['normal'].append(item) libredditList['normal'].append(item)
mightyList['libreddit'] = libredditList mightyList['libreddit'] = libredditList
print('fetched LibReddit') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'LibReddit')
# Teddit # Teddit
r = requests.get( r = requests.get(
@ -99,7 +109,7 @@ for item in rJson:
tedditList['tor'].append(onion) tedditList['tor'].append(onion)
mightyList['teddit'] = tedditList mightyList['teddit'] = tedditList
print('fetched Teddit') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Teddit')
# Wikiless # Wikiless
@ -117,7 +127,7 @@ for item in rJson:
else: else:
wikilessList['normal'].append('https://' + item) wikilessList['normal'].append('https://' + item)
mightyList['wikiless'] = wikilessList mightyList['wikiless'] = wikilessList
print('fetched Wikiless') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Wikiless')
# Scribe # Scribe
r = requests.get( r = requests.get(
@ -129,8 +139,7 @@ scribeList['tor'] = []
for item in rJson: for item in rJson:
scribeList['normal'].append(item) scribeList['normal'].append(item)
mightyList['scribe'] = scribeList mightyList['scribe'] = scribeList
print('fetched Scribe') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Scribe')
# SimplyTranslate # SimplyTranslate
r = requests.get('https://simple-web.org/instances/simplytranslate') r = requests.get('https://simple-web.org/instances/simplytranslate')
@ -145,7 +154,7 @@ for item in r.text.strip().split('\n'):
simplyTranslateList['tor'].append('http://' + item) simplyTranslateList['tor'].append('http://' + item)
mightyList['simplyTranslate'] = simplyTranslateList mightyList['simplyTranslate'] = simplyTranslateList
print('fetched SimplyTranslate') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'SimplyTranslate')
# LingvaTranslate # LingvaTranslate
r = requests.get( r = requests.get(
@ -157,7 +166,7 @@ lingvaList['tor'] = []
for item in rJson: for item in rJson:
lingvaList['normal'].append(item) lingvaList['normal'].append(item)
mightyList['lingva'] = lingvaList mightyList['lingva'] = lingvaList
print('fetched LinvgaTranslate') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'LinvgaTranslate')
# SearX, SearXNG # SearX, SearXNG
@ -190,7 +199,7 @@ for item in rJson['instances']:
mightyList['searx'] = searxList mightyList['searx'] = searxList
mightyList['searxng'] = searxngList mightyList['searxng'] = searxngList
print('fetched SearX, SearXNG') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'SearX, SearXNG')
# Whoogle # Whoogle
r = requests.get( r = requests.get(
@ -202,8 +211,7 @@ whoogleList['tor'] = []
for item in tmpList: for item in tmpList:
whoogleList['normal'].append(item) whoogleList['normal'].append(item)
mightyList['whoogle'] = whoogleList mightyList['whoogle'] = whoogleList
print('fetched Whoogle') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Whoogle')
# Rimgo # Rimgo
r = requests.get( r = requests.get(
@ -219,7 +227,7 @@ for item in rJson:
else: else:
rimgoList['normal'].append('https://' + item) rimgoList['normal'].append('https://' + item)
mightyList['rimgo'] = rimgoList mightyList['rimgo'] = rimgoList
print('fetched Rimgo') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Rimgo')
# Peertube # Peertube
r = requests.get( r = requests.get(
@ -231,12 +239,31 @@ for k in rJson['data']:
myList.append('https://'+k['host']) myList.append('https://'+k['host'])
mightyList['peertube'] = myList mightyList['peertube'] = myList
print('fetched Peertube') print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Peertube')
# Absolute http(s) URL: scheme, dotted hostname, optional port, optional path.
# (The earlier, looser pattern was credited to avanitrachhadiya2155.)
_URL_PATTERN = re.compile(
    r"https?://"
    # One or more hostname labels, each followed by a dot.
    r"(?:[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?\.)+"
    # Final label (TLD, "onion", "i2p", ...): at least two characters.
    r"[A-Za-z0-9][-A-Za-z0-9]*[A-Za-z0-9]"
    # Optional port.
    r"(?::[0-9]{1,5})?"
    # Optional path / query / fragment characters.
    r"(?:/[-A-Za-z0-9@:%_+.~#?&/=]*)?"
)


def isValid(url):
    """Check whether ``url`` is a well-formed absolute http(s) instance URL.

    The whole string must match, so values that merely *contain* something
    URL-like (for example ``https://https://wiki.froth.zone``) are rejected,
    as are strings without a scheme or without a dot in the hostname.

    Returns the ``re.Match`` object on success and ``None`` otherwise, so the
    result can be used directly in a truth test. Non-string input yields
    ``None`` instead of raising.
    """
    if not isinstance(url, str):
        return None
    return _URL_PATTERN.fullmatch(url)
def _drop_invalid(instances):
    """Remove, in place, every entry of ``instances`` that fails ``isValid``.

    The surviving entries are collected first and then written back with a
    slice assignment. Calling ``list.remove()`` while iterating the same list
    shifts the remaining items left, so the entry right after each removed
    one would never be checked.
    """
    kept = []
    for instance in instances:
        if isValid(instance):
            kept.append(instance)
        else:
            print(f"removed {instance}")
    # Slice assignment keeps the same list object that mightyList refers to.
    instances[:] = kept


# Validate every collected instance URL. A frontend entry is either a dict of
# lists (one list per network type) or a plain list of URLs.
for frontend in mightyList.values():
    if isinstance(frontend, dict):
        for instances in frontend.values():
            _drop_invalid(instances)
    elif isinstance(frontend, list):
        _drop_invalid(frontend)
# Writing to file
# ensure_ascii=False leaves non-ASCII characters unescaped in the output, so
# the file is opened explicitly as UTF-8 rather than with the platform's
# default encoding, which may not be able to represent them.
json_object = json.dumps(mightyList, ensure_ascii=False, indent=2)
with open('./src/instances/data.json', 'w', encoding='utf-8') as outfile:
    outfile.write(json_object)
print(Fore.BLUE + 'wrote ' + Style.RESET_ALL + 'instances/data.json')