From ec3089c8bcb58fe0bd2a6e71c928c711bac967ec Mon Sep 17 00:00:00 2001 From: ManeraKai Date: Fri, 8 Apr 2022 05:36:19 +0300 Subject: [PATCH] Added simple url validation for the python fetching script #171 --- src/instances/data.json | 12 +++--- src/instances/get_instances.py | 69 +++++++++++++++++++++++----------- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/src/instances/data.json b/src/instances/data.json index 4f34bbc..079bc7a 100644 --- a/src/instances/data.json +++ b/src/instances/data.json @@ -17,9 +17,9 @@ "https://invidious.lunar.icu", "https://invidious.mutahar.rocks", "https://invidious.weblibre.org", + "https://invidious.esmailelbob.xyz", "https://invidious.privacy.gd", "https://youtube.076.ne.jp", - "https://invidious.esmailelbob.xyz", "https://invidious.namazso.eu" ], "tor": [ @@ -175,7 +175,6 @@ "https://libreddit.winscloud.net", "https://libreddit.tiekoetter.com", "https://reddit.rtrace.io", - "http://libreddit.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion/", "https://libreddit.lunar.icu" ], "tor": [ @@ -190,7 +189,8 @@ "http://libredoxhxwnmsb6dvzzd35hmgzmawsq5i764es7witwhddvpc2razid.onion", "http://libreddit.2syis2nnyytz6jnusnjurva4swlaizlnleiks5mjp46phuwjbdjqwgqd.onion", "http://ol5begilptoou34emq2sshf3may3hlblvipdjtybbovpb7c7zodxmtqd.onion", - "http://lbrdtjaj7567ptdd4rv74lv27qhxfkraabnyphgcvptl64ijx2tijwid.onion" + "http://lbrdtjaj7567ptdd4rv74lv27qhxfkraabnyphgcvptl64ijx2tijwid.onion", + "http://libreddit.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion" ] }, "teddit": { @@ -230,12 +230,14 @@ "https://wikiless.sethforprivacy.com", "https://wiki.604kph.xyz", "https://wikiless.lunar.icu", - "https://https://wiki.froth.zone", - "https://hflqp2ejxygpj6cdwo3ogfieqmxw3b56w7dblt7bor2ltwk6kcfa.b32.i2p" + "https://wiki.froth.zone" ], "tor": [ "http://dj2tbh2nqfxyfmvq33cjmhuw7nb6am7thzd3zsjvizeqf374fixbrxyd.onion", "http://c2pesewpalbi6lbfc5hf53q4g3ovnxe4s7tfa6k2aqkf7jd7a7dlz5ad.onion" ], + 
"i2p": [ + "http://hflqp2ejxygpj6cdwo3ogfieqmxw3b56w7dblt7bor2ltwk6kcfa.b32.i2p" ] }, "scribe": { diff --git a/src/instances/get_instances.py b/src/instances/get_instances.py index 5f54fcf..4fe3bc9 100644 --- a/src/instances/get_instances.py +++ b/src/instances/get_instances.py @@ -5,13 +5,19 @@ import json from urllib.parse import urlparse from bs4 import BeautifulSoup import re +from colorama import Fore, Back, Style mightyList = {} - -def get_host_name(link): - url = urlparse(link) - return url.netloc +def filterLastSlash(urlList): + tmp = [] + for i in urlList: + if i.endswith('/'): + tmp.append(i[:-1]) + print(Fore.YELLOW + "filtered " + Style.RESET_ALL + i) + else: + tmp.append(i) + return tmp # Invidious @@ -26,7 +32,7 @@ for instance in rJson: elif instance[1]['type'] == 'onion': invidiousList['tor'].append(instance[1]['uri']) mightyList['invidious'] = invidiousList -print('fetched Invidious') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Invidious') # Nitter @@ -53,10 +59,10 @@ for table in tables: url = 'https://' + url nitterList['normal'].append(url) mightyList['nitter'] = nitterList -print('fetched Nitter') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Nitter') # Bibliogram -r = requests.get('https://bibliogram.1d4.us/api/instances') +r = requests.get('https://bibliogram.pussthecat.org/api/instances') rJson = json.loads(r.text) bibliogramList = {} bibliogramList['normal'] = [] @@ -64,7 +70,7 @@ bibliogramList['tor'] = [] for item in rJson['data']: bibliogramList['normal'].append(item['address']) mightyList['bibliogram'] = bibliogramList -print('fetched Bibliogram') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Bibliogram') # LibReddit r = requests.get( @@ -72,15 +78,19 @@ r = requests.get( libredditList = {} libredditList['normal'] = [] libredditList['tor'] = [] + tmp = re.findall( r"\| \[.*\]\(([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)\)*\|*[A-Z]{0,}.*\|.*\|", r.text) + +tmp = filterLastSlash(tmp) 
+ for item in tmp: if item.endswith('.onion'): libredditList['tor'].append(item) else: libredditList['normal'].append(item) mightyList['libreddit'] = libredditList -print('fetched LibReddit') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'LibReddit') # Teddit r = requests.get( @@ -99,7 +109,7 @@ for item in rJson: tedditList['tor'].append(onion) mightyList['teddit'] = tedditList -print('fetched Teddit') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Teddit') # Wikiless @@ -117,7 +127,7 @@ for item in rJson: else: wikilessList['normal'].append('https://' + item) mightyList['wikiless'] = wikilessList -print('fetched Wikiless') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Wikiless') # Scribe r = requests.get( @@ -129,8 +139,7 @@ scribeList['tor'] = [] for item in rJson: scribeList['normal'].append(item) mightyList['scribe'] = scribeList -print('fetched Scribe') - +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Scribe') # SimplyTranslate r = requests.get('https://simple-web.org/instances/simplytranslate') @@ -145,7 +154,7 @@ for item in r.text.strip().split('\n'): simplyTranslateList['tor'].append('http://' + item) mightyList['simplyTranslate'] = simplyTranslateList -print('fetched SimplyTranslate') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'SimplyTranslate') # LinvgaTranslate r = requests.get( @@ -157,7 +166,7 @@ lingvaList['tor'] = [] for item in rJson: lingvaList['normal'].append(item) mightyList['lingva'] = lingvaList -print('fetched LinvgaTranslate') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'LinvgaTranslate') # SearX, SearXNG @@ -190,7 +199,7 @@ for item in rJson['instances']: mightyList['searx'] = searxList mightyList['searxng'] = searxngList -print('fetched SearX, SearXNG') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'SearX, SearXNG') # Whoogle r = requests.get( @@ -202,8 +211,7 @@ whoogleList['tor'] = [] for item in tmpList: whoogleList['normal'].append(item) mightyList['whoogle'] = whoogleList -print('fetched 
Whoogle') - +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Whoogle') # Rimgo r = requests.get( @@ -219,7 +227,7 @@ for item in rJson: else: rimgoList['normal'].append('https://' + item) mightyList['rimgo'] = rimgoList -print('fetched Rimgo') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Rimgo') # Peertube r = requests.get( @@ -231,12 +239,31 @@ for k in rJson['data']: myList.append('https://'+k['host']) mightyList['peertube'] = myList -print('fetched Peertube') +print(Fore.GREEN + 'fetched ' + Style.RESET_ALL + 'Peertube') +def isValid(url): # True only for well-formed absolute http(s) URLs; rejects missing or doubled schemes + return re.search(r"^https?:\/\/(?!https?:\/\/)[-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z0-9]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?$", url) + + +for k1, v1 in mightyList.items(): + if type(mightyList[k1]) is dict: + for k2, v2 in mightyList[k1].items(): + for instance in list(mightyList[k1][k2]): + if (not isValid(instance)): + mightyList[k1][k2].remove(instance) + print("removed " + instance) + + elif type(mightyList[k1]) is list: + for instance in list(mightyList[k1]): + if (not isValid(instance)): + mightyList[k1].remove(instance) + print("removed " + instance) + # Writing to file json_object = json.dumps(mightyList, ensure_ascii=False, indent=2) with open('./src/instances/data.json', 'w') as outfile: outfile.write(json_object) # print(json_object) -print('wrote instances/data.json') + +print(Fore.BLUE + 'wrote ' + Style.RESET_ALL + 'instances/data.json')