libredirect/src/instances/get_instances.py

# Note: Run this script from the root of the repo

import requests
import json
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from colorama import Fore, Back, Style
from urllib.parse import urlparse
import socket
import subprocess

mightyList = {}

startRegex = "https?:\/{2}(?:[^\s\/]+\.)+"
endRegex = "(?:\/[^\s\/]+)*\/?"
torRegex = startRegex + "onion" + endRegex
i2pRegex = startRegex + "i2p" + endRegex
lokiRegex = startRegex + "loki" + endRegex
authRegex = "https?:\/{2}\S+:\S+@(?:[^\s\/]+\.)+[a-zA-Z0-9]+" + endRegex

def filterLastSlash(urlList):
    tmp = {}
    for x in urlList:
        tmp[x] = {}
        for y in urlList[x]:
            tmp[x][y] = []
            for z in urlList[x][y]:
                if z.endswith('/'):
                    tmp[x][y].append(z[:-1])
                    print(Fore.YELLOW + "Fixed " + Style.RESET_ALL + z)
                else:
                    tmp[x][y].append(z)
    return tmp


def ip2bin(ip): return "".join(
    map(
        str,
        [
            "{0:08b}".format(int(x)) for x in ip.split(".")
        ]
    )
)


def get_cloudflare_ips():
    r = requests.get('https://www.cloudflare.com/ips-v4')
    return r.text.split('\n')


cloudflare_ips = get_cloudflare_ips()


def is_cloudflare(url):
    instance_ip = None
    try:
        instance_ip = socket.gethostbyname(urlparse(url).hostname)
        if instance_ip is None:
            return False
    except:
        return False
    instance_bin = ip2bin(instance_ip)

    for cloudflare_ip_mask in cloudflare_ips:
        cloudflare_ip = cloudflare_ip_mask.split('/')[0]
        cloudflare_bin = ip2bin(cloudflare_ip)

        mask = int(cloudflare_ip_mask.split('/')[1])
        cloudflare_bin_masked = cloudflare_bin[:mask]
        instance_bin_masked = instance_bin[:mask]

        if cloudflare_bin_masked == instance_bin_masked:
            print(url + ' is behind ' + Fore.RED + 'cloudflare' + Style.RESET_ALL)
            return True
    return False


def is_authenticate(url):
    try:
        if re.match(authRegex, url):
            print(url + ' requires ' + Fore.RED + 'authentication' + Style.RESET_ALL)
            return True
        r = requests.get(url, timeout=5)
        if 'www-authenticate' in r.headers:
            print(url + ' requires ' + Fore.RED + 'authentication' + Style.RESET_ALL)
            return True
    except:
        return False
    return False


def invidious():
    r = requests.get('https://api.invidious.io/instances.json')
    rJson = json.loads(r.text)
    invidiousList = {}
    invidiousList['normal'] = []
    invidiousList['tor'] = []
    invidiousList['i2p'] = []
    invidiousList['loki'] = []
    for instance in rJson:
        if instance[1]['type'] == 'https':
            invidiousList['normal'].append(instance[1]['uri'])
        elif instance[1]['type'] == 'onion':
            invidiousList['tor'].append(instance[1]['uri'])
    mightyList['invidious'] = invidiousList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Invidious')


def piped():
    r = requests.get(
        'https://raw.githubusercontent.com/wiki/TeamPiped/Piped/Instances.md')

    tmp = re.findall(
        '(?:[^\s\/]+\.)+[a-zA-Z]+ (?:\(Official\) )?\| (https:\/{2}(?:[^\s\/]+\.)+[a-zA-Z]+) \| ', r.text)
    _list = {}
    _list['normal'] = []
    _list['tor'] = []
    _list['i2p'] = []
    _list['loki'] = []
    for item in tmp:
        try:
            url = requests.get(item, timeout=5).url
            if url.strip("/") == item:
                continue
            else:
                _list['normal'].append(url)
        except:
            continue
    mightyList['piped'] = _list
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Piped')


def pipedMaterial():
    r = requests.get(
        'https://raw.githubusercontent.com/mmjee/Piped-Material/master/README.md')

    tmp = re.findall(
            r"\| (https?:\/{2}(?:\S+\.)+[a-zA-Z0-9]*) +\|", r.text)
    pipedMaterialList = {}
    pipedMaterialList['normal'] = []
    pipedMaterialList['tor'] = []
    pipedMaterialList['i2p'] = []
    pipedMaterialList['loki'] = []
    for item in tmp:
        pipedMaterialList['normal'].append(item)
    mightyList['pipedMaterial'] = pipedMaterialList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'pipedMaterial')


def cloudtube():
    json_object = json.dumps(mightyList, ensure_ascii=False, indent=2)
    with open('./src/instances/cloudtube.json') as file:
        mightyList['cloudtube'] = json.load(file)
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'CloudTube')


def proxitok():
    r = requests.get(
        'https://raw.githubusercontent.com/wiki/pablouser1/ProxiTok/Public-instances.md')

    tmp = re.findall(
        r"\| \[.*\]\(([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)\)*\|*[A-Z]{0,}.*\|.*\|", r.text)
    proxiTokList = {}
    proxiTokList['normal'] = []
    proxiTokList['tor'] = []
    proxiTokList['i2p'] = []
    proxiTokList['loki'] = []
    for item in tmp:
        proxiTokList['normal'].append(re.sub(r'/$', '', item))
    mightyList['proxiTok'] = proxiTokList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'ProxiTok')


def send():
    r = requests.get(
        'https://gitlab.com/timvisee/send-instances/-/raw/master/README.md')
    tmp = re.findall(
        r"- ([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z0-9]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)\)*\|*[A-Z]{0,}", r.text)
    sendList = {}
    sendList['normal'] = []
    sendList['tor'] = []
    sendList['i2p'] = []
    sendList['loki'] = []
    for item in tmp:
        sendList['normal'].append(item)
    mightyList['send'] = sendList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Send')


def nitter():
    r = requests.get('https://raw.githubusercontent.com/wiki/zedeus/nitter/Instances.md')
    tmp = re.findall(
        r"(?:(?:\| \[(?:\S+\.)+[a-zA-Z]+\]\((https?:\/{2}(?:\S+\.)+[a-zA-Z]+)\/?\) (?:\((?:\S+ ?\S*)\) )? *\| (?:✅|🇩🇪) +\|(?:(?:\n)|(?: (?:❌)|(?: ✅)|(?: ❓)|(?: \[))))|(?:-   \[(?:\S+\.)+(?:(?:i2p)|(?:loki))\]\((https?:\/{2}(?:\S+\.)(?:(?:i2p)|(?:loki)))\/?\)))", r.text)

    nitterList = {}
    nitterList['normal'] = []
    nitterList['tor'] = []
    nitterList['i2p'] = []
    nitterList['loki'] = []
    for item in tmp:
        for i in item:
            if i == '':
                continue
            else:
                item = i
        if re.search(torRegex, item):
            nitterList['tor'].append(item)
        elif re.search(i2pRegex, item):
            nitterList['i2p'].append(item)
        elif re.search(lokiRegex, item):
            nitterList['loki'].append(item)
        else:
            nitterList['normal'].append(item)
    mightyList['nitter'] = nitterList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Nitter')


def bibliogram():
    r = requests.get('https://bibliogram.art/api/instances')
    rJson = json.loads(r.text)
    bibliogramList = {}
    bibliogramList['normal'] = []
    bibliogramList['tor'] = []
    bibliogramList['i2p'] = []
    bibliogramList['loki'] = []
    for item in rJson['data']:
        bibliogramList['normal'].append(item['address'])
    mightyList['bibliogram'] = bibliogramList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Bibliogram')


def libreddit():
    r = requests.get(
        'https://raw.githubusercontent.com/spikecodes/libreddit/master/README.md')
    libredditList = {}
    libredditList['normal'] = []
    libredditList['tor'] = []
    libredditList['i2p'] = []
    libredditList['loki'] = []

    tmp = re.findall(
        r"\| \[.*\]\(([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)\)*\|*[A-Z]{0,}.*\|.*\|", r.text)

    for item in tmp:
        if re.search(torRegex, item):
            libredditList['tor'].append(item)
        else:
            libredditList['normal'].append(item)
    mightyList['libreddit'] = libredditList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'LibReddit')


def teddit():
    r = requests.get(
        'https://codeberg.org/teddit/teddit/raw/branch/main/instances.json')
    rJson = json.loads(r.text)
    tedditList = {}
    tedditList['normal'] = []
    tedditList['tor'] = []
    tedditList['i2p'] = []
    tedditList['loki'] = []
    for item in rJson:
        url = item['url']
        if url != '':
            tedditList['normal'].append(url)
        if 'onion' in item:
            onion = item['onion']
            if onion != '':
                tedditList['tor'].append(onion)

    mightyList['teddit'] = tedditList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Teddit')


def wikiless():
    r = requests.get('https://wikiless.org/instances.json')
    rJson = json.loads(r.text)
    wikilessList = {}
    wikilessList['normal'] = []
    wikilessList['tor'] = []
    wikilessList['i2p'] = []
    wikilessList['loki'] = []
    for item in rJson:
        if 'url' in item:
            if item['url'].strip() != "":
                wikilessList['normal'].append(item['url'])
        if 'onion' in item:
            if item['onion'].strip() != "":
                wikilessList['tor'].append(item['onion'])
        if 'i2p' in item:
            if item['i2p'].strip() != "":
                wikilessList['i2p'].append(item['i2p'])
    mightyList['wikiless'] = wikilessList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Wikiless')


def scribe():
    r = requests.get(
        'https://git.sr.ht/~edwardloveall/scribe/blob/main/docs/instances.json')
    rJson = json.loads(r.text)
    scribeList = {}
    scribeList['normal'] = []
    scribeList['tor'] = []
    scribeList['i2p'] = []
    scribeList['loki'] = []
    for item in rJson:
        scribeList['normal'].append(item)
    mightyList['scribe'] = scribeList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Scribe')


def quetre():
    r = requests.get(
        'https://raw.githubusercontent.com/zyachel/quetre/main/README.md')
    _list = {}
    _list['normal'] = []
    _list['tor'] = []
    _list['i2p'] = []
    _list['loki'] = []

    tmp = re.findall(
        r"\| \[.*\]\(([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)\)*\|*[A-Z]{0,}.*\|.*\|", r.text)


    for item in tmp:
        if re.search(torRegex, item):
            _list['tor'].append(item)
        else:
            _list['normal'].append(item)
    mightyList['quetre'] = _list
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Quetre')


def libremdb():
    r = requests.get(
        'https://raw.githubusercontent.com/zyachel/libremdb/main/README.md')
    _list = {}
    _list['normal'] = []
    _list['tor'] = []
    _list['i2p'] = []
    _list['loki'] = []

    tmp = re.findall(
        r"\| ([-a-zA-Z0-9@:%_\+.~#?&//=]{2,}\.[a-z]{2,}\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?)*\|*[A-Z]{0,}.*\|.*\|", r.text)


    for item in tmp:
        if item.strip() == "":
            continue
        if re.search(torRegex, item):
            _list['tor'].append(item)
        else:
            _list['normal'].append(item)

    mightyList['libremdb'] = _list
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Libremdb')

def simpleertube():
    r = requests.get('https://simple-web.org/instances/simpleertube')
    _list = {}
    _list['normal'] = []
    _list['tor'] = []
    _list['i2p'] = []
    _list['loki'] = []
    for item in r.text.strip().split('\n'):
        _list['normal'].append('https://' + item)

    mightyList['simpleertube'] = _list
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'SimpleerTube')


def simplytranslate():
    r = requests.get('https://simple-web.org/instances/simplytranslate')
    simplyTranslateList = {}
    simplyTranslateList['normal'] = []
    for item in r.text.strip().split('\n'):
        simplyTranslateList['normal'].append('https://' + item)

    r = requests.get('https://simple-web.org/instances/simplytranslate_onion')
    simplyTranslateList['tor'] = []
    for item in r.text.strip().split('\n'):
        simplyTranslateList['tor'].append('http://' + item)

    r = requests.get('https://simple-web.org/instances/simplytranslate_i2p')
    simplyTranslateList['i2p'] = []
    for item in r.text.strip().split('\n'):
        simplyTranslateList['i2p'].append('http://' + item)

    r = requests.get('https://simple-web.org/instances/simplytranslate_loki')
    simplyTranslateList['loki'] = []
    for item in r.text.strip().split('\n'):
        simplyTranslateList['loki'].append('http://' + item)

    mightyList['simplyTranslate'] = simplyTranslateList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'SimplyTranslate')


def linvgatranslate():
    r = requests.get(
        'https://raw.githubusercontent.com/TheDavidDelta/lingva-translate/main/instances.json')
    rJson = json.loads(r.text)
    lingvaList = {}
    lingvaList['normal'] = []
    lingvaList['tor'] = []
    lingvaList['i2p'] = []
    lingvaList['loki'] = []
    for item in rJson:
        lingvaList['normal'].append(item)

    mightyList['lingva'] = lingvaList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'LinvgaTranslate')


def searx_searxng():
    r = requests.get('https://searx.space/data/instances.json')
    rJson = json.loads(r.text)
    searxList = {}
    searxList['normal'] = []
    searxList['tor'] = []
    searxList['i2p'] = []
    searxList['loki'] = []
    searxngList = {}
    searxngList['normal'] = []
    searxngList['tor'] = []
    searxngList['i2p'] = []
    searxngList['loki'] = []
    for item in rJson['instances']:
        if re.search(torRegex, item[:-1]):
            if (rJson['instances'][item].get('generator') == 'searxng'):
                searxngList['tor'].append(item[:-1])
            else:
                searxList['tor'].append(item[:-1])
        elif re.search(torRegex, item[:-1]):
            if (rJson['instances'][item].get('generator') == 'searxng'):
                searxngList['i2p'].append(item[:-1])
            else:
                searxList['i2p'].append(item[:-1])
        else:
            if (rJson['instances'][item].get('generator') == 'searxng'):
                searxngList['normal'].append(item[:-1])
            else:
                searxList['normal'].append(item[:-1])

    mightyList['searx'] = searxList
    mightyList['searxng'] = searxngList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'SearX, SearXNG')


def whoogle():
    r = requests.get(
        'https://raw.githubusercontent.com/benbusby/whoogle-search/main/misc/instances.txt')
    tmpList = r.text.strip().split('\n')
    whoogleList = {}
    whoogleList['normal'] = []
    whoogleList['tor'] = []
    whoogleList['i2p'] = []
    whoogleList['loki'] = []
    for item in tmpList:
        if re.search(torRegex, item):
            whoogleList['tor'].append(item)
        elif re.search(torRegex, item):
            whoogleList['i2p'].append(item)
        else:
            whoogleList['normal'].append(item)
    mightyList['whoogle'] = whoogleList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Whoogle')


def librex():
    r = requests.get(
        'https://raw.githubusercontent.com/hnhx/librex/main/README.md')
    _list = {}
    _list['normal'] = []
    _list['tor'] = []
    _list['i2p'] = []
    _list['loki'] = []

    tmp = re.findall(
            r"\| {1,2}\[(?:(?:[a-zA-Z0-9]+\.)+[a-zA-Z]{2,}|✅)\]\((https?:\/{2}(?:[a-zA-Z0-9]+\.)+[a-zA-Z0-9]{2,})", r.text)

    for item in tmp:
        if item.strip() == "":
            continue
        elif re.search(torRegex, item):
            _list['tor'].append(item)
        elif re.search(i2pRegex, item):
            _list['i2p'].append(item)
        else:
            _list['normal'].append(item)
    mightyList['librex'] = _list
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Librex')


def rimgo():
    r = requests.get(
        'https://codeberg.org/video-prize-ranch/rimgo/raw/branch/main/instances.json')
    rJson = json.loads(r.text)
    rimgoList = {}
    rimgoList['normal'] = []
    rimgoList['tor'] = []
    rimgoList['i2p'] = []
    rimgoList['loki'] = []
    for item in rJson:
        if 'url' in item:
            rimgoList['normal'].append(item['url'])
        if 'onion' in item:
            rimgoList['tor'].append(item['onion'])
        if 'i2p' in item:
            rimgoList['i2p'].append(item['i2p'])
    mightyList['rimgo'] = rimgoList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Rimgo')


def librarian():
    r = requests.get(
        'https://codeberg.org/librarian/librarian/raw/branch/main/instances.json')
    rJson = json.loads(r.text)
    librarianList = {}
    librarianList['normal'] = []
    librarianList['tor'] = []
    librarianList['i2p'] = []
    librarianList['loki'] = []
    instances = rJson['instances']
    for item in instances:
        url = item['url']
        if url.strip() == "":
            continue
        elif re.search(torRegex, url):
            librarianList['tor'].append(url)
        elif re.search(i2pRegex, url):
            librarianList['i2p'].append(url)
        elif re.search(lokiRegex, url):
            librarianList['loki'].append(url)
        else:
            librarianList['normal'].append(url)
    mightyList['librarian'] = librarianList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Librarian')


def neuters():
    json_object = json.dumps(mightyList, ensure_ascii=False, indent=2)
    with open('./src/instances/neuters.json') as file:
        mightyList['neuters'] = json.load(file)
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Neuters')


def beatbump():
    json_object = json.dumps(mightyList, ensure_ascii=False, indent=2)
    with open('./src/instances/beatbump.json') as file:
        mightyList['beatbump'] = json.load(file)
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Beatbump')


def hyperpipe():
    r = requests.get(
        'https://codeberg.org/Hyperpipe/pages/raw/branch/main/api/frontend.json')
    rJson = json.loads(r.text)
    hyperpipeList = {}
    hyperpipeList['normal'] = []
    hyperpipeList['tor'] = []
    hyperpipeList['i2p'] = []
    hyperpipeList['loki'] = []
    for item in rJson:
        url = item['url']
        if url.strip() == "":
            continue
        elif re.search(torRegex, url):
            hyperpipeList['tor'].append(url)
        elif re.search(i2pRegex, url):
            hyperpipeList['i2p'].append(url)
        elif re.search(lokiRegex, url):
            hyperpipeList['loki'].append(url)
        else:
            hyperpipeList['normal'].append(url)
    mightyList['hyperpipe'] = hyperpipeList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'Hyperpipe')


def facil():
    json_object = json.dumps(mightyList, ensure_ascii=False, indent=2)
    with open('./src/instances/facil.json') as file:
        mightyList['facil'] = json.load(file)
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'FacilMap')


def peertube():
    r = requests.get(
        'https://instances.joinpeertube.org/api/v1/instances?start=0&count=1045&sort=-createdAt')
    rJson = json.loads(r.text)

    myList = []
    for k in rJson['data']:
        myList.append('https://'+k['host'])

    mightyList['peertube'] = myList
    print(Fore.GREEN + 'Fetched ' + Style.RESET_ALL + 'PeerTube')


def isValid(url):  # This code is contributed by avanitrachhadiya2155
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except:
        return False


invidious()
piped()
pipedMaterial()
cloudtube()
proxitok()
send()
nitter()
bibliogram()
libreddit()
teddit()
wikiless()
scribe()
quetre()
libremdb()
simplytranslate()
linvgatranslate()
searx_searxng()
whoogle()
librex()
rimgo()
librarian()
neuters()
beatbump()
hyperpipe()
facil()
simpleertube()
mightyList = filterLastSlash(mightyList)

cloudflare = []
authenticate = []
for k1, v1 in mightyList.items():
    if type(mightyList[k1]) is dict:
        for k2, v2 in mightyList[k1].items():
            for instance in mightyList[k1][k2]:
                if (not isValid(instance)):
                    mightyList[k1][k2].remove(instance)
                    print("removed " + instance)
                else:
                    if not instance.endswith('.onion') and not instance.endswith('.i2p') and not instance.endswith('.loki') and is_cloudflare(instance):
                        cloudflare.append(instance)
                    if not instance.endswith('.onion') and not instance.endswith('.i2p') and not instance.endswith('.loki') and is_authenticate(instance):
                        authenticate.append(instance)

peertube()

blacklist = {
    'cloudflare': cloudflare,
    'authenticate': authenticate
}

# Writing to file
json_object = json.dumps(mightyList, ensure_ascii=False, indent=2)
with open('./src/instances/data.json', 'w') as outfile:
    outfile.write(json_object)
print(Fore.BLUE + 'wrote ' + Style.RESET_ALL + 'instances/data.json')

json_object = json.dumps(blacklist, ensure_ascii=False, indent=2)
with open('./src/instances/blacklist.json', 'w') as outfile:
    outfile.write(json_object)
print(Fore.BLUE + 'wrote ' + Style.RESET_ALL + 'instances/blacklist.json')

# print(json_object)