2021-03-27 00:21:02 +08:00
#!/usr/bin/env python3
2021-03-16 23:23:06 +08:00
import json
2021-03-27 00:21:02 +08:00
import os
2024-07-16 00:12:08 +08:00
import re
import requests
2021-03-16 23:23:06 +08:00
from bs4 import BeautifulSoup
2024-07-16 00:12:08 +08:00
from collections import OrderedDict
2021-03-16 23:23:06 +08:00
2021-03-27 00:21:02 +08:00
# A list of words to not capitalize in emoji-names
capitalization_exclude = {'with', 'a', 'at', 'of', 'for', 'and', 'over', 'the', 'off', 'on', 'out', 'in', 'but', 'or'}

# Create skeleton of the final json file as a python dictionary:
emoji_picker_datasource = {
    "compressed": True,
    "categories": [],
    "emojis": {},
    "aliases": {}
}
emoji_picker_datasource_categories = emoji_picker_datasource["categories"]
emoji_picker_datasource_emojis = emoji_picker_datasource["emojis"]

# Remote data sources
# Official emoji list from unicode.org (Emoji List, v13.1 at time of writing)
EMOJI_LIST_URL = "https://unicode.org/emoji/charts/emoji-list.html"
VARIATION_SEQUENCES_URL = "https://www.unicode.org/Public/15.0.0/ucd/emoji/emoji-variation-sequences.txt"
EMOJILIB_URL = "https://raw.githubusercontent.com/muan/emojilib/main/dist/emoji-en-US.json"

# Seconds to wait for a remote server before giving up instead of hanging forever
REQUEST_TIMEOUT = 60

# Words are separated by any non-word character (\W); the capturing group keeps
# the separators in the split result so the name can be re-joined unchanged.
WORD_SEPARATOR = re.compile(r'(\W)')


def fetch(url):
    """Download ``url`` and return the response.

    Raises the ``requests`` HTTP error for a 4xx/5xx answer so that an error
    page is never parsed as if it were emoji data.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def parse_variation_sequences(data):
    """Build the code-point overrides from the text of emoji-variation-sequences.txt.

    Every line containing "emoji style" contributes its first space-separated
    field (the base code point in hex) mapped to ``"<hex>-FE0F"``.
    """
    overrides = {}
    for line in data.split("\n"):
        if "emoji style" in line:
            emoji_hex = line.split(" ", 1)[0]
            overrides[emoji_hex] = emoji_hex + "-FE0F"
    return overrides


def make_emoji_id(name):
    """Derive the emoji id from its CLDR name.

    The name is lower-cased, everything except ASCII letters, digits and spaces
    is dropped, surrounding whitespace is stripped and spaces become dashes.
    """
    emoji_id = re.sub(r'[^A-Za-z0-9 ]+', '', name.lower(), flags=re.UNICODE)
    return emoji_id.strip().replace(' ', '-')


def capitalize_emoji_name(name):
    """Capitalize a name with the same rules as the previous emoji_picker_datasource.json.

    A word is capitalized if it is at the beginning of the name OR it is not in
    ``capitalization_exclude``.
    """
    return "".join(
        word.capitalize() if index == 0 or word not in capitalization_exclude else word
        for index, word in enumerate(WORD_SEPARATOR.split(name))
    )


def format_emoji_code(raw_code):
    """Turn ``"U+1F468 U+200D U+1F469"`` into ``"1F468-200D-1F469"``."""
    # Each space-separated part loses its two-character "U+" prefix
    return "-".join(part[2:] for part in raw_code.split(" "))


def code_to_unicode(emoji_code):
    """Convert a dash-separated hex code such as ``"1F468-200D-1F469"`` back to the emoji string."""
    return ''.join(chr(int(part, 16)) for part in emoji_code.split("-"))


def merge_keywords(emoji_id, emoji_name, official_keywords, additional_keywords):
    """Combine both keyword lists without duplicates, official keywords first.

    Keywords that are merely derived from the emoji name are removed: the id
    with dashes replaced by underscores, the id with dashes replaced by
    spaces, and the (capitalized) name itself.
    """
    merged = OrderedDict.fromkeys(official_keywords + additional_keywords)
    for derived in (emoji_id.replace("-", "_"), emoji_id.replace("-", " "), emoji_name):
        merged.pop(derived, None)
    return list(merged)


def extract_emojis(table, categories, emojis):
    """Fill ``categories`` and ``emojis`` from the rows of the emoji-list table.

    ``table`` is the BeautifulSoup element of the chart table. An emoji row
    is attached to the most recently encountered category row.
    """
    for row in table.find_all('tr'):
        # Add "bigheads" rows to categories
        if 'bighead' in next(row.children)['class']:
            relevant_element = row.find('a')
            categories.append({
                "id": relevant_element['name'],
                "name": relevant_element.text,
                "emojis": []
            })

        # Only rows with a "code" cell describe an emoji
        if not row.find('td', class_='code'):
            continue

        # Columns used: 1 = code points, 3 = CLDR name, 4 = keywords
        cols = row.find_all('td')
        emoji_name = cols[3].text
        emoji_id = make_emoji_id(emoji_name)

        # Add the emoji-id to the last entry in "categories"
        categories[-1]["emojis"].append(emoji_id)

        # Add the emoji itself to the "emojis" dict
        emojis[emoji_id] = {
            "a": capitalize_emoji_name(emoji_name),
            "b": format_emoji_code(cols[1].text),
            "j": cols[4].text.split(" | ")
        }


def add_emojilib_keywords(emojis, emojilib_data, variation_sequence_overrides):
    """Extend the keywords of every emoji with the Emojilib ones and apply code overrides.

    ``emojilib_data`` maps an emoji string to its list of keywords. The emoji
    is looked up as-is first and with a trailing U+FE0F second. Afterwards the
    code of every emoji listed in ``variation_sequence_overrides`` is replaced
    by its override, whether or not Emojilib knew the emoji.
    """
    for emoji_id, entry in emojis.items():
        emoji_code = entry["b"]
        # Convert back to actual unicode emoji
        emoji_unicode = code_to_unicode(emoji_code)

        # Search for emoji in emojilib
        for candidate in (emoji_unicode, emoji_unicode + chr(0xfe0f)):
            if candidate in emojilib_data:
                # Put official unicode.org keywords first and extend with emojilib ones
                entry["j"] = merge_keywords(emoji_id, entry["a"], entry["j"], emojilib_data[candidate])
                break
        else:
            print("* No additional keywords for", emoji_unicode, entry)

        # Applied outside the keyword branch so that a missing Emojilib entry
        # cannot skip the emoji-style variation selector.
        if emoji_code in variation_sequence_overrides:
            entry["b"] = variation_sequence_overrides[emoji_code]


def write_datasource(datasource, path, **dump_options):
    """Write ``datasource`` as JSON to ``path`` (overwriting it), without escaping unicode characters."""
    # The file contains raw emoji, so the encoding must not depend on the locale
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(datasource, outfile, ensure_ascii=False, **dump_options)


def main():
    """Download all sources, build the emoji datasource and write both json files."""
    print("Fetching emoji list from Unicode.org...")
    soup = BeautifulSoup(fetch(EMOJI_LIST_URL).content, 'html.parser')
    variation_sequence_overrides = parse_variation_sequences(fetch(VARIATION_SEQUENCES_URL).text)

    # Navigate to table and go over all rows
    print("Extracting emojis...")
    extract_emojis(soup.body.table, emoji_picker_datasource_categories, emoji_picker_datasource_emojis)

    # The keywords of unicode.org are usually quite sparse.
    # There is no official specification of keywords beyond that, but muan/emojilib maintains a well maintained and
    # established repository with additional keywords. We extend our list with the keywords from there.
    # At the time of writing it had additional keyword information for all emojis except a few from the newest unicode 13.1.
    print("Fetching additional keywords from Emojilib...")
    emojilib_data = json.loads(fetch(EMOJILIB_URL).content)

    # We just go over all the official emojis from unicode, and add the keywords there
    print("Adding keywords to emojis...")
    add_emojilib_keywords(emoji_picker_datasource_emojis, emojilib_data, variation_sequence_overrides)

    # Filter out components from unicode 13.1 (as they are not suitable for single-emoji reactions)
    emoji_picker_datasource['categories'] = [x for x in emoji_picker_datasource['categories'] if x['id'] != 'component']

    # Write result to file (overwrite previous), without escaping unicode characters
    print("Writing emoji_picker_datasource.json...")
    scripts_dir = os.path.dirname(os.path.abspath(__file__))
    write_datasource(
        emoji_picker_datasource,
        os.path.join(scripts_dir, "../vector/src/main/res/raw/emoji_picker_datasource.json"),
        separators=(',', ':'),
    )

    # Also export a formatted version
    print("Writing emoji_picker_datasource_formatted.json...")
    write_datasource(
        emoji_picker_datasource,
        os.path.join(scripts_dir, "../tools/emojis/emoji_picker_datasource_formatted.json"),
        indent=4,
    )

    print("Done.")


if __name__ == "__main__":
    main()