-
Notifications
You must be signed in to change notification settings - Fork 5
Closed
Description
Hi @magiclen, would you be interested in re-running your code generator to handle the new characters from Unicode 15.1.0, which was released in September 2023? I'm asking since I was generating a list of code blocks and noticed that the new CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I
block was missing.
(Here's the hacky script I used to generate a list of all code blocks, in case you're interested in using it for #3.)
#!/usr/bin/env bash
# Prints a Rust `ALL_UNICODE_BLOCKS` constant to stdout, with one identifier per
# block listed in the Unicode Character Database file Blocks.txt, in the same
# order as that file.
#
# Blocks.txt is downloaded once and cached at $target; later runs reuse the
# cached copy. Delete that file to pick up a newer Unicode release.

target="/tmp/unicode_blocks.txt"
url="https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"

if ! [ -f "$target" ]; then
    # Download to a temporary file and move it into place only on success, so a
    # failed or interrupted transfer never leaves a truncated file (or an HTTP
    # error page) behind to be mistaken for a valid cache on the next run.
    download="$(mktemp "${target}.XXXXXX")" || exit 1
    if curl --fail --silent --show-error --location -o "$download" "$url"; then
        mv "$download" "$target"
    else
        rm -f "$download"
        echo "error: could not download $url" >&2
        exit 1
    fi
fi

echo "/// all the unicode blocks ordered by their range of code points"
echo "pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &["
grep -E '^[0-9A-F]{4,}' "$target" | # keep only the lines that start with a code point range
    tr '[:lower:]' '[:upper:]' |    # convert block names to uppercase
    awk -F '; ' '
        {
            range_name = $2
            # turn the block name into an upper-case identifier: "-" and " " become "_"
            gsub(/[- ]/, "_", range_name)
            # you could also print `"// " $1` if you want the range itself
            print range_name ","
        }
    ' |
    sed 's/^/ /' |                  # indent the output
    grep -v "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I" # drop the block the crate does not define yet
echo "];"
Output of the script:
/// all the unicode blocks ordered by their range of code points
pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &[
BASIC_LATIN,
LATIN_1_SUPPLEMENT,
LATIN_EXTENDED_A,
LATIN_EXTENDED_B,
IPA_EXTENSIONS,
SPACING_MODIFIER_LETTERS,
COMBINING_DIACRITICAL_MARKS,
GREEK_AND_COPTIC,
CYRILLIC,
CYRILLIC_SUPPLEMENT,
ARMENIAN,
HEBREW,
ARABIC,
SYRIAC,
ARABIC_SUPPLEMENT,
THAANA,
NKO,
SAMARITAN,
MANDAIC,
SYRIAC_SUPPLEMENT,
ARABIC_EXTENDED_B,
ARABIC_EXTENDED_A,
DEVANAGARI,
BENGALI,
GURMUKHI,
GUJARATI,
ORIYA,
TAMIL,
TELUGU,
KANNADA,
MALAYALAM,
SINHALA,
THAI,
LAO,
TIBETAN,
MYANMAR,
GEORGIAN,
HANGUL_JAMO,
ETHIOPIC,
ETHIOPIC_SUPPLEMENT,
CHEROKEE,
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
OGHAM,
RUNIC,
TAGALOG,
HANUNOO,
BUHID,
TAGBANWA,
KHMER,
MONGOLIAN,
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED,
LIMBU,
TAI_LE,
NEW_TAI_LUE,
KHMER_SYMBOLS,
BUGINESE,
TAI_THAM,
COMBINING_DIACRITICAL_MARKS_EXTENDED,
BALINESE,
SUNDANESE,
BATAK,
LEPCHA,
OL_CHIKI,
CYRILLIC_EXTENDED_C,
GEORGIAN_EXTENDED,
SUNDANESE_SUPPLEMENT,
VEDIC_EXTENSIONS,
PHONETIC_EXTENSIONS,
PHONETIC_EXTENSIONS_SUPPLEMENT,
COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
LATIN_EXTENDED_ADDITIONAL,
GREEK_EXTENDED,
GENERAL_PUNCTUATION,
SUPERSCRIPTS_AND_SUBSCRIPTS,
CURRENCY_SYMBOLS,
COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS,
LETTERLIKE_SYMBOLS,
NUMBER_FORMS,
ARROWS,
MATHEMATICAL_OPERATORS,
MISCELLANEOUS_TECHNICAL,
CONTROL_PICTURES,
OPTICAL_CHARACTER_RECOGNITION,
ENCLOSED_ALPHANUMERICS,
BOX_DRAWING,
BLOCK_ELEMENTS,
GEOMETRIC_SHAPES,
MISCELLANEOUS_SYMBOLS,
DINGBATS,
MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
SUPPLEMENTAL_ARROWS_A,
BRAILLE_PATTERNS,
SUPPLEMENTAL_ARROWS_B,
MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
MISCELLANEOUS_SYMBOLS_AND_ARROWS,
GLAGOLITIC,
LATIN_EXTENDED_C,
COPTIC,
GEORGIAN_SUPPLEMENT,
TIFINAGH,
ETHIOPIC_EXTENDED,
CYRILLIC_EXTENDED_A,
SUPPLEMENTAL_PUNCTUATION,
CJK_RADICALS_SUPPLEMENT,
KANGXI_RADICALS,
IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
CJK_SYMBOLS_AND_PUNCTUATION,
HIRAGANA,
KATAKANA,
BOPOMOFO,
HANGUL_COMPATIBILITY_JAMO,
KANBUN,
BOPOMOFO_EXTENDED,
CJK_STROKES,
KATAKANA_PHONETIC_EXTENSIONS,
ENCLOSED_CJK_LETTERS_AND_MONTHS,
CJK_COMPATIBILITY,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
YIJING_HEXAGRAM_SYMBOLS,
CJK_UNIFIED_IDEOGRAPHS,
YI_SYLLABLES,
YI_RADICALS,
LISU,
VAI,
CYRILLIC_EXTENDED_B,
BAMUM,
MODIFIER_TONE_LETTERS,
LATIN_EXTENDED_D,
SYLOTI_NAGRI,
COMMON_INDIC_NUMBER_FORMS,
PHAGS_PA,
SAURASHTRA,
DEVANAGARI_EXTENDED,
KAYAH_LI,
REJANG,
HANGUL_JAMO_EXTENDED_A,
JAVANESE,
MYANMAR_EXTENDED_B,
CHAM,
MYANMAR_EXTENDED_A,
TAI_VIET,
MEETEI_MAYEK_EXTENSIONS,
ETHIOPIC_EXTENDED_A,
LATIN_EXTENDED_E,
CHEROKEE_SUPPLEMENT,
MEETEI_MAYEK,
HANGUL_SYLLABLES,
HANGUL_JAMO_EXTENDED_B,
HIGH_SURROGATES,
HIGH_PRIVATE_USE_SURROGATES,
LOW_SURROGATES,
PRIVATE_USE_AREA,
CJK_COMPATIBILITY_IDEOGRAPHS,
ALPHABETIC_PRESENTATION_FORMS,
ARABIC_PRESENTATION_FORMS_A,
VARIATION_SELECTORS,
VERTICAL_FORMS,
COMBINING_HALF_MARKS,
CJK_COMPATIBILITY_FORMS,
SMALL_FORM_VARIANTS,
ARABIC_PRESENTATION_FORMS_B,
HALFWIDTH_AND_FULLWIDTH_FORMS,
SPECIALS,
LINEAR_B_SYLLABARY,
LINEAR_B_IDEOGRAMS,
AEGEAN_NUMBERS,
ANCIENT_GREEK_NUMBERS,
ANCIENT_SYMBOLS,
PHAISTOS_DISC,
LYCIAN,
CARIAN,
COPTIC_EPACT_NUMBERS,
OLD_ITALIC,
GOTHIC,
OLD_PERMIC,
UGARITIC,
OLD_PERSIAN,
DESERET,
SHAVIAN,
OSMANYA,
OSAGE,
ELBASAN,
CAUCASIAN_ALBANIAN,
VITHKUQI,
LINEAR_A,
LATIN_EXTENDED_F,
CYPRIOT_SYLLABARY,
IMPERIAL_ARAMAIC,
PALMYRENE,
NABATAEAN,
HATRAN,
PHOENICIAN,
LYDIAN,
MEROITIC_HIEROGLYPHS,
MEROITIC_CURSIVE,
KHAROSHTHI,
OLD_SOUTH_ARABIAN,
OLD_NORTH_ARABIAN,
MANICHAEAN,
AVESTAN,
INSCRIPTIONAL_PARTHIAN,
INSCRIPTIONAL_PAHLAVI,
PSALTER_PAHLAVI,
OLD_TURKIC,
OLD_HUNGARIAN,
HANIFI_ROHINGYA,
RUMI_NUMERAL_SYMBOLS,
YEZIDI,
ARABIC_EXTENDED_C,
OLD_SOGDIAN,
SOGDIAN,
OLD_UYGHUR,
CHORASMIAN,
ELYMAIC,
BRAHMI,
KAITHI,
SORA_SOMPENG,
CHAKMA,
MAHAJANI,
SHARADA,
SINHALA_ARCHAIC_NUMBERS,
KHOJKI,
MULTANI,
KHUDAWADI,
GRANTHA,
NEWA,
TIRHUTA,
SIDDHAM,
MODI,
MONGOLIAN_SUPPLEMENT,
TAKRI,
AHOM,
DOGRA,
WARANG_CITI,
DIVES_AKURU,
NANDINAGARI,
ZANABAZAR_SQUARE,
SOYOMBO,
UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A,
PAU_CIN_HAU,
DEVANAGARI_EXTENDED_A,
BHAIKSUKI,
MARCHEN,
MASARAM_GONDI,
GUNJALA_GONDI,
MAKASAR,
KAWI,
LISU_SUPPLEMENT,
TAMIL_SUPPLEMENT,
CUNEIFORM,
CUNEIFORM_NUMBERS_AND_PUNCTUATION,
EARLY_DYNASTIC_CUNEIFORM,
CYPRO_MINOAN,
EGYPTIAN_HIEROGLYPHS,
EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS,
ANATOLIAN_HIEROGLYPHS,
BAMUM_SUPPLEMENT,
MRO,
TANGSA,
BASSA_VAH,
PAHAWH_HMONG,
MEDEFAIDRIN,
MIAO,
IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION,
TANGUT,
TANGUT_COMPONENTS,
KHITAN_SMALL_SCRIPT,
TANGUT_SUPPLEMENT,
KANA_EXTENDED_B,
KANA_SUPPLEMENT,
KANA_EXTENDED_A,
SMALL_KANA_EXTENSION,
NUSHU,
DUPLOYAN,
SHORTHAND_FORMAT_CONTROLS,
ZNAMENNY_MUSICAL_NOTATION,
BYZANTINE_MUSICAL_SYMBOLS,
MUSICAL_SYMBOLS,
ANCIENT_GREEK_MUSICAL_NOTATION,
KAKTOVIK_NUMERALS,
MAYAN_NUMERALS,
TAI_XUAN_JING_SYMBOLS,
COUNTING_ROD_NUMERALS,
MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
SUTTON_SIGNWRITING,
LATIN_EXTENDED_G,
GLAGOLITIC_SUPPLEMENT,
CYRILLIC_EXTENDED_D,
NYIAKENG_PUACHUE_HMONG,
TOTO,
WANCHO,
NAG_MUNDARI,
ETHIOPIC_EXTENDED_B,
MENDE_KIKAKUI,
ADLAM,
INDIC_SIYAQ_NUMBERS,
OTTOMAN_SIYAQ_NUMBERS,
ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS,
MAHJONG_TILES,
DOMINO_TILES,
PLAYING_CARDS,
ENCLOSED_ALPHANUMERIC_SUPPLEMENT,
ENCLOSED_IDEOGRAPHIC_SUPPLEMENT,
MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS,
EMOTICONS,
ORNAMENTAL_DINGBATS,
TRANSPORT_AND_MAP_SYMBOLS,
ALCHEMICAL_SYMBOLS,
GEOMETRIC_SHAPES_EXTENDED,
SUPPLEMENTAL_ARROWS_C,
SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS,
CHESS_SYMBOLS,
SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A,
SYMBOLS_FOR_LEGACY_COMPUTING,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H,
TAGS,
VARIATION_SELECTORS_SUPPLEMENT,
SUPPLEMENTARY_PRIVATE_USE_AREA_A,
SUPPLEMENTARY_PRIVATE_USE_AREA_B,
];
Metadata
Metadata
Assignees
Labels
No labels