hudebnibazar
This commit is contained in:
parent db3c18ce8f
commit 8bfe156467
124 resources/scripts/scrapers/hudebnibazar.py Normal file
@@ -0,0 +1,124 @@
# Obtains listings from hudebnibazar.cz and outputs them as a JSON feed.
#
# How to call: python3 hudebnibazar.py <category> <number-of-pages>
# For example: python3 hudebnibazar.py "elektricke-kytary/110100" 4
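#
# The category argument is the path segment of a hudebnibazar.cz listing page;
# with the example above, each page is requested from a URL of the form below
# (the query string is built further down, with the page number appended):
#   https://hudebnibazar.cz/elektricke-kytary/110100/?is=1&f=&n=vse&r=&i=50&o=datum&ign=on&p=1
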
import urllib.request
import requests
import re
import json
import sys
import ssl
import http.client
import http.cookies
import dateparser
import bs4
import datetime

# ssl._DEFAULT_CIPHERS = "TLS_RSA_WITH_AES_256_GCM_SHA384"
category = sys.argv[1]
number_of_pages = int(sys.argv[2])

url_base = "https://hudebnibazar.cz"
url = "{}/{}/?is=1&f=&n=vse&r=&i=50&o=datum&ign=on".format(url_base, category)
json_feed = '{{"title": "HudebniBazar - {cat}", "items": [{items}]}}'
items = list()

# Force TLSv1.2 with a lowered OpenSSL security level to avoid TLS handshake errors.
ct = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
ct.set_ciphers("DEFAULT:@SECLEVEL=0")


def processListingDate(string_date: str):
    # mins = re.search("^před (\\d+) min\\.", string_date)

    # if mins is not None:
    #     return datetime.datetime.now() - datetime.timedelta(minutes=int(mins.group(1)))

    # tday = re.search("^dnes v (\\d{1,2}):(\\d{2})$", string_date)

    # if tday is not None:
    #     return datetime.datetime.today().replace(
    #         hour=int(tday.group(1)), minute=int(tday.group(2))
    #     )

    # yday = re.search("^včera v (\\d{1,2}):(\\d{2})$", string_date)

    # if yday is not None:
    #     return (datetime.datetime.today() - datetime.timedelta(days=1)).replace(
    #         hour=int(yday.group(1)), minute=int(yday.group(2))
    #     )

    dy = dateparser.parse(string_date, languages=["cs"])
    return dy

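# Rough sketch of what processListingDate is expected to handle, assuming
# dateparser's Czech locale resolves the site's relative forms (the exact
# return values depend on the current time; examples are illustrative):
#   processListingDate("dnes v 14:30")  -> today at 14:30
#   processListingDate("včera v 9:05")  -> yesterday at 09:05
#   processListingDate("před 5 min.")   -> roughly five minutes ago

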
def processListingImgs(listing: bs4.Tag):
    pics = list()
    gallery = listing.find("div", class_="InzeratObr")

    # Listings without a photo gallery simply yield an empty list.
    if gallery is None:
        return pics

    for pic in gallery.find_all("a"):
        pics.append(url_base + pic.get("href"))

    return pics

def generateListingJson(listing):
    article_price = listing.find(class_="InzeratCena").contents[0].get_text(strip=True)
    article_title = listing.find(class_="InzeratNadpis").b.get_text(strip=True)
    article_date = listing.find(class_="InzeratZarazeno").get_text(strip=True)

    article_parsed_date = processListingDate(article_date)
    article_imgs = processListingImgs(listing)

    if len(article_imgs) == 0:
        article_attachments = ""
    else:
        article_attachments = ", ".join(
            ['{{"url": {}, "mime_type": "image/jpeg"}}'.format(json.dumps(i)) for i in article_imgs]
        )

    article_url = json.dumps(url_base + listing.a["href"])
    article_fulltitle = json.dumps("[{}] {}".format(article_price, article_title))
    article_html = json.dumps(listing.find(class_="InzeratText").get_text(strip=True))
    article_author = json.dumps(
        listing.find(lambda tag: tag.name == "div" and not tag.attrs).get_text(
            strip=True
        )
    )
    article_fulldate = json.dumps(article_parsed_date.isoformat())

    return '{{"title": {title}, "attachments": [{att}], "authors": [{{"name": {author}}}], "date_published": {publ}, "content_html": {html}, "url": {url}}}'.format(
        title=article_fulltitle,
        html=article_html,
        url=article_url,
        author=article_author,
        publ=article_fulldate,
        att=article_attachments,
    )

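# Illustrative shape of one generated item (all values below are made up, not
# scraped from the site; field names match the format string above):
#   {"title": "[12 000 Kč] Fender Stratocaster",
#    "attachments": [{"url": "https://hudebnibazar.cz/...", "mime_type": "image/jpeg"}],
#    "authors": [{"name": "Jan Novák"}],
#    "date_published": "2024-01-05T14:30:00",
#    "content_html": "Prodám kytaru, stav viz foto.",
#    "url": "https://hudebnibazar.cz/..."}

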
php_ssid = None

for page_number in range(1, number_of_pages + 1):
    page_url = url + "&p={}".format(page_number)
    page_request = urllib.request.Request(page_url)
    page_request.add_header("User-Agent", "curl/8.4.0")

    if php_ssid is not None:
        page_request.add_header("Cookie", "PHPSESSID={};".format(php_ssid))

    page_response = urllib.request.urlopen(page_request, context=ct)
    page_html = page_response.read().decode("utf-8")

    # The first response sets a PHPSESSID cookie; remember it and send it back
    # with the requests for the remaining pages.
    if php_ssid is None:
        cook = http.cookies.SimpleCookie()
        cook.load(page_response.getheader("Set-Cookie"))
        php_ssid = cook.get("PHPSESSID").value

    soup = bs4.BeautifulSoup(page_html, "html.parser")
    listings = soup.find_all("div", class_="InzeratBody")

    for listing in listings:
        items.append(generateListingJson(listing))

json_feed = json_feed.format(cat=category, items=", ".join(items))
print(json_feed)
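
# A quick way to eyeball the output locally (assumes jq is installed; the
# category is the one from the usage example at the top):
#   python3 hudebnibazar.py "elektricke-kytary/110100" 2 | jq .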