From 51650fd8994d7ad9722153cf38a2004372237d49 Mon Sep 17 00:00:00 2001 From: oogm Date: Tue, 16 Mar 2021 16:23:06 +0100 Subject: [PATCH] Add script to pull emojis from Unicode.org as a file --- tools/import_emojis.py | 77 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 tools/import_emojis.py diff --git a/tools/import_emojis.py b/tools/import_emojis.py new file mode 100644 index 0000000000..bdbade0e2d --- /dev/null +++ b/tools/import_emojis.py @@ -0,0 +1,77 @@ +import requests +import json +import re +from bs4 import BeautifulSoup + +# Create skeleton of the final json file as a python dictionary: +emoji_picker_datasource = { + "compressed": True, + "categories": [], + "emojis": {}, + "aliases": {} +} +emoji_picker_datasource_categories = emoji_picker_datasource["categories"] +emoji_picker_datasource_emojis = emoji_picker_datasource["emojis"] + + +# Get official emoji list from unicode.org (Emoji List, v13.1 at time of writing) +req = requests.get("https://unicode.org/emoji/charts/emoji-list.html") +soup = BeautifulSoup(req.content, 'html.parser') + +# Navigate to table +table = soup.body.table + +# Go over all rows +for row in table.find_all('tr'): + # Add "bigheads" rows to categories + if 'bighead' in next(row.children)['class']: + relevant_element = row.find('a') + category_id = relevant_element['name'] + category_name = relevant_element.text + emoji_picker_datasource_categories.append({ + "id": category_id, + "name": category_name, + "emojis": [] + }) + + # Add information in "rchars" rows to the last encountered category and emojis + if row.find('td', class_='code'): + # Get columns + cols = row.find_all('td') + no_element = cols[0] + code_element = cols[1] + sample_element = cols[2] + cldr_element = cols[3] + keywords_element = cols[4] + + # Extract information from columns + # Extract name and id + # => Remove spaces, colons and unicode-characters + emoji_name = cldr_element.text + emoji_id = cldr_element.text.lower() + emoji_id = re.sub(r'[^A-Za-z0-9 ]+', '', emoji_id, flags=re.UNICODE) # Only keep alphanumeric, space characters + emoji_id = emoji_id.strip() # Remove leading/trailing whitespaces + emoji_id = emoji_id.replace(' ', '-') + + # Extract emoji unicode-codepoint + emoji_code_raw = code_element.text + emoji_code_list = emoji_code_raw.split(" ") + emoji_code_list = [e[2:] for e in emoji_code_list] + emoji_code = "-".join(emoji_code_list) + + # Extract keywords + emoji_keywords = keywords_element.text.split(" | ") + + # Add the emoji-id to the last entry in "categories" + emoji_picker_datasource_categories[-1]["emojis"].append(emoji_id) + + # Add the emoji itself to the "emojis" dict + emoji_picker_datasource_emojis[emoji_id] = { + "a": emoji_name, + "b": emoji_code, + "j": emoji_keywords + } + +# Print result to file (overwrite previous), without escaping unicode characters +with open("../vector/src/main/res/raw/emoji_picker_datasource.json", "w") as outfile: + json.dump(emoji_picker_datasource, outfile, ensure_ascii=False)