mirror of
https://github.com/vector-im/element-android.git
synced 2024-11-15 01:35:07 +08:00
19500a8cd9
Use fully qualified R classes Fix or ignore deprecation Update github actions and ensure JDK 17 is used Add group for paparazzi Fixes Lint issues Fix Jacoco configuration
148 lines
6.7 KiB
Python
Executable File
148 lines
6.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import json
|
|
import os
|
|
import re
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from collections import OrderedDict
|
|
|
|
# Words that are NOT capitalized in emoji names (unless they are the first word
# of the name). Kept as a set because it is only used for membership tests.
capitalization_exclude = {
    'with', 'a', 'at', 'of', 'for', 'and', 'over',
    'the', 'off', 'on', 'out', 'in', 'but', 'or',
}
|
|
|
|
# Create skeleton of the final json file as a python dictionary.
# The key order here is the key order of the serialized JSON.
emoji_picker_datasource = {
    "compressed": True,
    "categories": [],
    "emojis": {},
    "aliases": {},
}

# Shortcuts to the two containers that get filled in below. They are the very
# same objects as the ones stored in the skeleton, so mutating them mutates
# emoji_picker_datasource as well.
emoji_picker_datasource_categories = emoji_picker_datasource["categories"]
emoji_picker_datasource_emojis = emoji_picker_datasource["emojis"]
|
|
|
|
|
|
# Get official emoji list from unicode.org (Emoji List, v13.1 at time of writing)
print("Fetching emoji list from Unicode.org...")
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were the
# emoji chart.
req = requests.get("https://unicode.org/emoji/charts/emoji-list.html", timeout=60)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')
|
|
|
|
# Fetch the list of emoji variation sequences. Every line mentioning
# "emoji style" starts with the hex code point of a character; for each of
# those we record that the code point should be emitted with a trailing
# "-FE0F" (variation selector 16).
variation_sequence_response = requests.get(
    "https://www.unicode.org/Public/15.0.0/ucd/emoji/emoji-variation-sequences.txt",
    timeout=60,  # do not hang forever on a stalled connection
)
variation_sequence_response.raise_for_status()  # never parse an HTTP error page
variation_sequence_data = variation_sequence_response.text

# Maps "<HEX>" -> "<HEX>-FE0F", keyed by the first space-separated field of the line
variation_sequence_overrides = {
    line.split(" ", 1)[0]: line.split(" ", 1)[0] + "-FE0F"
    for line in variation_sequence_data.split("\n")
    if "emoji style" in line
}
|
|
|
|
# Navigate to table: the first <table> element found inside the document body
table = soup.body.table
|
|
|
|
# Go over all rows
print("Extracting emojis...")

# Patterns are compiled once, outside the loop.
# - everything that is not an ASCII letter, digit or space is dropped from ids
# - names are split on single non-word characters; the capturing group keeps
#   the separators so the name can be re-joined unchanged
#   (raw string: '\W' in a normal string literal is an invalid escape sequence)
emoji_id_strip_pattern = re.compile(r'[^A-Za-z0-9 ]+', flags=re.UNICODE)
emoji_name_split_pattern = re.compile(r'(\W)')

for row in table.find_all('tr'):
    # Add "bigheads" rows to categories: such a row starts a new category whose
    # id/name are taken from the first <a> element of the row
    if 'bighead' in next(row.children)['class']:
        relevant_element = row.find('a')
        emoji_picker_datasource_categories.append({
            "id": relevant_element['name'],
            "name": relevant_element.text,
            "emojis": []
        })

    # Add information in "rchars" rows to the last encountered category and emojis
    if row.find('td', class_='code'):
        # Get columns (only the ones that are actually used)
        cols = row.find_all('td')
        code_element = cols[1]
        cldr_element = cols[3]
        keywords_element = cols[4]

        # Extract information from columns
        # Extract name and id
        # => The id is the lower-cased name with everything but ASCII
        #    alphanumerics and spaces removed, and spaces turned into dashes
        # NOTE(review): names that differ only in punctuation/symbols end up
        # with the same id and overwrite each other below - confirm this is OK.
        emoji_name = cldr_element.text
        emoji_id = emoji_id_strip_pattern.sub('', emoji_name.lower())
        emoji_id = emoji_id.strip()  # Remove leading/trailing whitespaces
        emoji_id = emoji_id.replace(' ', '-')

        # Capitalize name according to the same rules as the previous emoji_picker_datasource.json
        # - Words are separated by any non-word character (\W), e.g. space, comma, parentheses, dots, etc.
        # - Words are capitalized if they are either at the beginning of the name OR not in capitalization_exclude (extracted from the previous datasource, too)
        emoji_name_cap = "".join(
            word.capitalize() if index == 0 or word not in capitalization_exclude else word
            for index, word in enumerate(emoji_name_split_pattern.split(emoji_name))
        )

        # Extract emoji unicode-codepoint:
        # space separated entries, each stripped of its 2-character prefix and joined by "-"
        emoji_code = "-".join(entry[2:] for entry in code_element.text.split(" "))

        # Extract keywords
        emoji_keywords = keywords_element.text.split(" | ")

        # Add the emoji-id to the last entry in "categories"
        emoji_picker_datasource_categories[-1]["emojis"].append(emoji_id)

        # Add the emoji itself to the "emojis" dict
        emoji_picker_datasource_emojis[emoji_id] = {
            "a": emoji_name_cap,
            "b": emoji_code,
            "j": emoji_keywords
        }
|
|
|
|
# The keywords of unicode.org are usually quite sparse.
# There is no official specification of keywords beyond that, but muan/emojilib is a well maintained and
# established repository with additional keywords. We extend our list with the keywords from there.
# At the time of writing it had additional keyword information for all emojis except a few from the newest unicode 13.1.
print("Fetching additional keywords from Emojilib...")
# Timeout + status check: neither hang forever nor try to JSON-decode an error page
req = requests.get("https://raw.githubusercontent.com/muan/emojilib/main/dist/emoji-en-US.json", timeout=60)
req.raise_for_status()
# Looked up below by emoji string; each value is concatenated to a keyword list
emojilib_data = json.loads(req.content)
|
|
|
|
# We just go over all the official emojis from unicode, and add the keywords there
print("Adding keywords to emojis...")
for emoji, emoji_data in emoji_picker_datasource_emojis.items():
    emoji_name = emoji_data["a"]
    emoji_code = emoji_data["b"]

    # Convert back to actual unicode emoji ("-"-separated hex code points -> characters)
    emoji_unicode = ''.join(chr(int(code_point, 16)) for code_point in emoji_code.split("-"))

    # Search for emoji in emojilib, with and without a trailing variation selector (U+FE0F)
    if emoji_unicode in emojilib_data:
        emoji_additional_keywords = emojilib_data[emoji_unicode]
    elif emoji_unicode + chr(0xfe0f) in emojilib_data:
        emoji_additional_keywords = emojilib_data[emoji_unicode + chr(0xfe0f)]
    else:
        print("* No additional keywords for", emoji_unicode, emoji_data)
        emoji_additional_keywords = None

    if emoji_additional_keywords is not None:
        # If additional keywords exist, add them to emoji_picker_datasource_emojis
        # Avoid duplicates and keep order. Put official unicode.org keywords first and extend with emojilib ones.
        new_keywords = OrderedDict.fromkeys(emoji_data["j"] + emoji_additional_keywords)
        # Remove the ones derived from the unicode name
        for keyword in (emoji.replace("-", "_"), emoji.replace("-", " "), emoji_name):
            new_keywords.pop(keyword, None)
        # Write new keywords back
        emoji_data["j"] = list(new_keywords)

    # Apply the variation-sequence override regardless of whether emojilib knew
    # the emoji: previously a missing emojilib entry skipped this step as well
    # (via `continue`), although it is unrelated to keywords.
    if emoji_code in variation_sequence_overrides:
        emoji_data["b"] = variation_sequence_overrides[emoji_code]
|
|
|
|
# Filter out components from unicode 13.1 (as they are not suitable for single-emoji reactions)
# Only the category with id 'component' is dropped from "categories"; the
# entries in the "emojis" dict are left untouched.
emoji_picker_datasource['categories'] = [x for x in emoji_picker_datasource['categories'] if x['id'] != 'component']
|
|
|
|
# Write result to file (overwrite previous), without escaping unicode characters
print("Writing emoji_picker_datasource.json...")
scripts_dir = os.path.dirname(os.path.abspath(__file__))
# ensure_ascii=False writes the emoji characters verbatim, so the file encoding
# must be set explicitly: the platform default (e.g. cp1252 on Windows) cannot
# encode them and would raise UnicodeEncodeError.
with open(os.path.join(scripts_dir, "../vector/src/main/res/raw/emoji_picker_datasource.json"), "w", encoding="utf-8") as outfile:
    # Compact separators: no whitespace between tokens
    json.dump(emoji_picker_datasource, outfile, ensure_ascii=False, separators=(',', ':'))
|
|
|
|
# Also export a formatted version
print("Writing emoji_picker_datasource_formatted.json...")
# Explicit UTF-8: with ensure_ascii=False the emoji characters are written
# verbatim, which the platform default encoding may not be able to encode.
with open(os.path.join(scripts_dir, "../tools/emojis/emoji_picker_datasource_formatted.json"), "w", encoding="utf-8") as outfile:
    json.dump(emoji_picker_datasource, outfile, ensure_ascii=False, indent=4)

print("Done.")
|