Update emoji import script to ease execution, output smaller .json files and capitalize emoji names

This commit is contained in:
oogm 2021-03-26 17:21:02 +01:00
parent 7453509df4
commit a40adb903a
2 changed files with 22 additions and 5 deletions

25
tools/import_emojis.py Normal file → Executable file
View File

@ -1,10 +1,15 @@
#!/usr/bin/env python3
from collections import OrderedDict from collections import OrderedDict
import requests import requests
import json import json
import re import re
import os
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# A list of words to not capitalize in emoji-names
capitalization_exclude = {'with', 'a', 'at', 'of', 'for', 'and', 'over', 'the', 'off', 'on', 'out', 'in', 'but', 'or'}
# Create skeleton of the final json file as a python dictionary: # Create skeleton of the final json file as a python dictionary:
emoji_picker_datasource = { emoji_picker_datasource = {
"compressed": True, "compressed": True,
@ -17,6 +22,7 @@ emoji_picker_datasource_emojis = emoji_picker_datasource["emojis"]
# Get official emoji list from unicode.org (Emoji List, v13.1 at time of writing) # Get official emoji list from unicode.org (Emoji List, v13.1 at time of writing)
print("Fetching emoji list from Unicode.org...",)
req = requests.get("https://unicode.org/emoji/charts/emoji-list.html") req = requests.get("https://unicode.org/emoji/charts/emoji-list.html")
soup = BeautifulSoup(req.content, 'html.parser') soup = BeautifulSoup(req.content, 'html.parser')
@ -24,6 +30,7 @@ soup = BeautifulSoup(req.content, 'html.parser')
table = soup.body.table table = soup.body.table
# Go over all rows # Go over all rows
print("Extracting emojis...")
for row in table.find_all('tr'): for row in table.find_all('tr'):
# Add "bigheads" rows to categories # Add "bigheads" rows to categories
if 'bighead' in next(row.children)['class']: if 'bighead' in next(row.children)['class']:
@ -55,6 +62,11 @@ for row in table.find_all('tr'):
emoji_id = emoji_id.strip() # Remove leading/trailing whitespaces emoji_id = emoji_id.strip() # Remove leading/trailing whitespaces
emoji_id = emoji_id.replace(' ', '-') emoji_id = emoji_id.replace(' ', '-')
# Capitalize name according to the same rules as the previous emoji_picker_datasource.json
# - Words are separated by any non-word character (\W), e.g. space, comma, parentheses, dots, etc.
# - Words are capitalized if they are either at the beginning of the name OR not in capitalization_exclude (extracted from the previous datasource, too)
emoji_name_cap = "".join([w.capitalize() if i == 0 or w not in capitalization_exclude else w for i, w in enumerate(re.split('(\W)', emoji_name))])
# Extract emoji unicode-codepoint # Extract emoji unicode-codepoint
emoji_code_raw = code_element.text emoji_code_raw = code_element.text
emoji_code_list = emoji_code_raw.split(" ") emoji_code_list = emoji_code_raw.split(" ")
@ -69,7 +81,7 @@ for row in table.find_all('tr'):
# Add the emoji itself to the "emojis" dict # Add the emoji itself to the "emojis" dict
emoji_picker_datasource_emojis[emoji_id] = { emoji_picker_datasource_emojis[emoji_id] = {
"a": emoji_name, "a": emoji_name_cap,
"b": emoji_code, "b": emoji_code,
"j": emoji_keywords "j": emoji_keywords
} }
@ -78,10 +90,12 @@ for row in table.find_all('tr'):
# There is no official specification of keywords beyond that, but muan/emojilib maintains a well maintained and # There is no official specification of keywords beyond that, but muan/emojilib maintains a well maintained and
# established repository with additional keywords. We extend our list with the keywords from there. # established repository with additional keywords. We extend our list with the keywords from there.
# At the time of writing it had additional keyword information for all emojis except a few from the newest unicode 13.1. # At the time of writing it had additional keyword information for all emojis except a few from the newest unicode 13.1.
print("Fetching additional keywords from Emojilib...")
req = requests.get("https://raw.githubusercontent.com/muan/emojilib/main/dist/emoji-en-US.json") req = requests.get("https://raw.githubusercontent.com/muan/emojilib/main/dist/emoji-en-US.json")
emojilib_data = json.loads(req.content) emojilib_data = json.loads(req.content)
# We just go over all the official emojis from unicode, and add the keywords there # We just go over all the official emojis from unicode, and add the keywords there
print("Adding keywords to emojis...")
for emoji in emoji_picker_datasource_emojis: for emoji in emoji_picker_datasource_emojis:
emoji_name = emoji_picker_datasource_emojis[emoji]["a"] emoji_name = emoji_picker_datasource_emojis[emoji]["a"]
emoji_code = emoji_picker_datasource_emojis[emoji]["b"] emoji_code = emoji_picker_datasource_emojis[emoji]["b"]
@ -95,7 +109,7 @@ for emoji in emoji_picker_datasource_emojis:
elif emoji_unicode+chr(0xfe0f) in emojilib_data: elif emoji_unicode+chr(0xfe0f) in emojilib_data:
emoji_additional_keywords = emojilib_data[emoji_unicode+chr(0xfe0f)] emoji_additional_keywords = emojilib_data[emoji_unicode+chr(0xfe0f)]
else: else:
print("No additional keywords for", emoji_unicode, emoji_picker_datasource_emojis[emoji]) print("* No additional keywords for", emoji_unicode, emoji_picker_datasource_emojis[emoji])
continue continue
# If additional keywords exist, add them to emoji_picker_datasource_emojis # If additional keywords exist, add them to emoji_picker_datasource_emojis
@ -110,5 +124,8 @@ for emoji in emoji_picker_datasource_emojis:
emoji_picker_datasource['categories'] = [x for x in emoji_picker_datasource['categories'] if x['id'] != 'component'] emoji_picker_datasource['categories'] = [x for x in emoji_picker_datasource['categories'] if x['id'] != 'component']
# Write result to file (overwrite previous), without escaping unicode characters # Write result to file (overwrite previous), without escaping unicode characters
with open("../vector/src/main/res/raw/emoji_picker_datasource.json", "w") as outfile: print("Writing emoji_picker_datasource.json...")
json.dump(emoji_picker_datasource, outfile, ensure_ascii=False) scripts_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(scripts_dir, "../vector/src/main/res/raw/emoji_picker_datasource.json"), "w") as outfile:
json.dump(emoji_picker_datasource, outfile, ensure_ascii=False, separators=(',', ':'))
print("Done.")

File diff suppressed because one or more lines are too long