Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Feature request: update to unicode 15.1.0 #5

@SKalt

Description

@SKalt

Hi @magiclen, would you be interested in re-running your code generator to handle the new characters from Unicode 15.1.0, which was released in September 2023? I'm asking since I was generating a list of code blocks and noticed that the new CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I block was missing.

(Here's the hacky script I used to generate a list of all code blocks, in case it's useful for #3.)
#!/usr/bin/env bash
# Print a Rust `ALL_UNICODE_BLOCKS` slice naming every block listed in the
# Unicode Character Database file Blocks.txt, in the order the file lists them.
#
# Blocks.txt is downloaded once and cached at $target; delete that file to
# force a refresh. The Rust source is written to stdout, errors to stderr.
set -euo pipefail

target="/tmp/unicode_blocks.txt"
url="https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"
# Space-separated identifiers to leave out of the list (blocks that appear in
# Blocks.txt but are missing from the crate).
excluded="CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I"

# Treat a missing OR empty cache file as "not downloaded yet".
if ! [ -s "$target" ]; then
  # Download to a temporary file and move it into place only on success, so a
  # failed or interrupted transfer never leaves a bad cache behind.
  tmp="$(mktemp "${target}.XXXXXX")"
  trap 'rm -f "$tmp"' EXIT
  # --fail: exit non-zero on an HTTP error instead of saving the error page.
  curl --fail --silent --show-error --location -o "$tmp" "$url"
  mv "$tmp" "$target"
fi

# Refuse to emit an empty array if the cached file has no block ranges at all.
if ! grep -Eq '^[0-9A-F]{4,}' "$target"; then
  echo "error: no block ranges found in $target; delete it and re-run" >&2
  exit 1
fi

echo "/// all the unicode blocks ordered by their range of code points"
echo "pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &["
grep -E '^[0-9A-F]{4,}' "$target" | # keep only the lines that start with a code point range
  awk -F '; ' -v excluded="$excluded" '
    BEGIN {
      # Build a lookup set so exclusions are exact name matches, not substrings.
      count = split(excluded, names, " ")
      for (i = 1; i <= count; i++) {
        skip[names[i]] = 1
      }
    }
    {
      block = toupper($2)              # block names become uppercase
      sub(/[ \t\r]+$/, "", block)      # drop trailing whitespace or a stray CR
      gsub(/[- ]/, "_", block)         # spaces and hyphens become underscores
      if (!(block in skip)) {
        print "    " block ","         # you could also print "// " $1 to show the range
      }
    }
  '
echo "];"
Output:
/// all the unicode blocks ordered by their range of code points
pub const ALL_UNICODE_BLOCKS: &[UnicodeBlock] = &[
    BASIC_LATIN,
    LATIN_1_SUPPLEMENT,
    LATIN_EXTENDED_A,
    LATIN_EXTENDED_B,
    IPA_EXTENSIONS,
    SPACING_MODIFIER_LETTERS,
    COMBINING_DIACRITICAL_MARKS,
    GREEK_AND_COPTIC,
    CYRILLIC,
    CYRILLIC_SUPPLEMENT,
    ARMENIAN,
    HEBREW,
    ARABIC,
    SYRIAC,
    ARABIC_SUPPLEMENT,
    THAANA,
    NKO,
    SAMARITAN,
    MANDAIC,
    SYRIAC_SUPPLEMENT,
    ARABIC_EXTENDED_B,
    ARABIC_EXTENDED_A,
    DEVANAGARI,
    BENGALI,
    GURMUKHI,
    GUJARATI,
    ORIYA,
    TAMIL,
    TELUGU,
    KANNADA,
    MALAYALAM,
    SINHALA,
    THAI,
    LAO,
    TIBETAN,
    MYANMAR,
    GEORGIAN,
    HANGUL_JAMO,
    ETHIOPIC,
    ETHIOPIC_SUPPLEMENT,
    CHEROKEE,
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS,
    OGHAM,
    RUNIC,
    TAGALOG,
    HANUNOO,
    BUHID,
    TAGBANWA,
    KHMER,
    MONGOLIAN,
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED,
    LIMBU,
    TAI_LE,
    NEW_TAI_LUE,
    KHMER_SYMBOLS,
    BUGINESE,
    TAI_THAM,
    COMBINING_DIACRITICAL_MARKS_EXTENDED,
    BALINESE,
    SUNDANESE,
    BATAK,
    LEPCHA,
    OL_CHIKI,
    CYRILLIC_EXTENDED_C,
    GEORGIAN_EXTENDED,
    SUNDANESE_SUPPLEMENT,
    VEDIC_EXTENSIONS,
    PHONETIC_EXTENSIONS,
    PHONETIC_EXTENSIONS_SUPPLEMENT,
    COMBINING_DIACRITICAL_MARKS_SUPPLEMENT,
    LATIN_EXTENDED_ADDITIONAL,
    GREEK_EXTENDED,
    GENERAL_PUNCTUATION,
    SUPERSCRIPTS_AND_SUBSCRIPTS,
    CURRENCY_SYMBOLS,
    COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS,
    LETTERLIKE_SYMBOLS,
    NUMBER_FORMS,
    ARROWS,
    MATHEMATICAL_OPERATORS,
    MISCELLANEOUS_TECHNICAL,
    CONTROL_PICTURES,
    OPTICAL_CHARACTER_RECOGNITION,
    ENCLOSED_ALPHANUMERICS,
    BOX_DRAWING,
    BLOCK_ELEMENTS,
    GEOMETRIC_SHAPES,
    MISCELLANEOUS_SYMBOLS,
    DINGBATS,
    MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
    SUPPLEMENTAL_ARROWS_A,
    BRAILLE_PATTERNS,
    SUPPLEMENTAL_ARROWS_B,
    MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
    SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
    MISCELLANEOUS_SYMBOLS_AND_ARROWS,
    GLAGOLITIC,
    LATIN_EXTENDED_C,
    COPTIC,
    GEORGIAN_SUPPLEMENT,
    TIFINAGH,
    ETHIOPIC_EXTENDED,
    CYRILLIC_EXTENDED_A,
    SUPPLEMENTAL_PUNCTUATION,
    CJK_RADICALS_SUPPLEMENT,
    KANGXI_RADICALS,
    IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
    CJK_SYMBOLS_AND_PUNCTUATION,
    HIRAGANA,
    KATAKANA,
    BOPOMOFO,
    HANGUL_COMPATIBILITY_JAMO,
    KANBUN,
    BOPOMOFO_EXTENDED,
    CJK_STROKES,
    KATAKANA_PHONETIC_EXTENSIONS,
    ENCLOSED_CJK_LETTERS_AND_MONTHS,
    CJK_COMPATIBILITY,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
    YIJING_HEXAGRAM_SYMBOLS,
    CJK_UNIFIED_IDEOGRAPHS,
    YI_SYLLABLES,
    YI_RADICALS,
    LISU,
    VAI,
    CYRILLIC_EXTENDED_B,
    BAMUM,
    MODIFIER_TONE_LETTERS,
    LATIN_EXTENDED_D,
    SYLOTI_NAGRI,
    COMMON_INDIC_NUMBER_FORMS,
    PHAGS_PA,
    SAURASHTRA,
    DEVANAGARI_EXTENDED,
    KAYAH_LI,
    REJANG,
    HANGUL_JAMO_EXTENDED_A,
    JAVANESE,
    MYANMAR_EXTENDED_B,
    CHAM,
    MYANMAR_EXTENDED_A,
    TAI_VIET,
    MEETEI_MAYEK_EXTENSIONS,
    ETHIOPIC_EXTENDED_A,
    LATIN_EXTENDED_E,
    CHEROKEE_SUPPLEMENT,
    MEETEI_MAYEK,
    HANGUL_SYLLABLES,
    HANGUL_JAMO_EXTENDED_B,
    HIGH_SURROGATES,
    HIGH_PRIVATE_USE_SURROGATES,
    LOW_SURROGATES,
    PRIVATE_USE_AREA,
    CJK_COMPATIBILITY_IDEOGRAPHS,
    ALPHABETIC_PRESENTATION_FORMS,
    ARABIC_PRESENTATION_FORMS_A,
    VARIATION_SELECTORS,
    VERTICAL_FORMS,
    COMBINING_HALF_MARKS,
    CJK_COMPATIBILITY_FORMS,
    SMALL_FORM_VARIANTS,
    ARABIC_PRESENTATION_FORMS_B,
    HALFWIDTH_AND_FULLWIDTH_FORMS,
    SPECIALS,
    LINEAR_B_SYLLABARY,
    LINEAR_B_IDEOGRAMS,
    AEGEAN_NUMBERS,
    ANCIENT_GREEK_NUMBERS,
    ANCIENT_SYMBOLS,
    PHAISTOS_DISC,
    LYCIAN,
    CARIAN,
    COPTIC_EPACT_NUMBERS,
    OLD_ITALIC,
    GOTHIC,
    OLD_PERMIC,
    UGARITIC,
    OLD_PERSIAN,
    DESERET,
    SHAVIAN,
    OSMANYA,
    OSAGE,
    ELBASAN,
    CAUCASIAN_ALBANIAN,
    VITHKUQI,
    LINEAR_A,
    LATIN_EXTENDED_F,
    CYPRIOT_SYLLABARY,
    IMPERIAL_ARAMAIC,
    PALMYRENE,
    NABATAEAN,
    HATRAN,
    PHOENICIAN,
    LYDIAN,
    MEROITIC_HIEROGLYPHS,
    MEROITIC_CURSIVE,
    KHAROSHTHI,
    OLD_SOUTH_ARABIAN,
    OLD_NORTH_ARABIAN,
    MANICHAEAN,
    AVESTAN,
    INSCRIPTIONAL_PARTHIAN,
    INSCRIPTIONAL_PAHLAVI,
    PSALTER_PAHLAVI,
    OLD_TURKIC,
    OLD_HUNGARIAN,
    HANIFI_ROHINGYA,
    RUMI_NUMERAL_SYMBOLS,
    YEZIDI,
    ARABIC_EXTENDED_C,
    OLD_SOGDIAN,
    SOGDIAN,
    OLD_UYGHUR,
    CHORASMIAN,
    ELYMAIC,
    BRAHMI,
    KAITHI,
    SORA_SOMPENG,
    CHAKMA,
    MAHAJANI,
    SHARADA,
    SINHALA_ARCHAIC_NUMBERS,
    KHOJKI,
    MULTANI,
    KHUDAWADI,
    GRANTHA,
    NEWA,
    TIRHUTA,
    SIDDHAM,
    MODI,
    MONGOLIAN_SUPPLEMENT,
    TAKRI,
    AHOM,
    DOGRA,
    WARANG_CITI,
    DIVES_AKURU,
    NANDINAGARI,
    ZANABAZAR_SQUARE,
    SOYOMBO,
    UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS_EXTENDED_A,
    PAU_CIN_HAU,
    DEVANAGARI_EXTENDED_A,
    BHAIKSUKI,
    MARCHEN,
    MASARAM_GONDI,
    GUNJALA_GONDI,
    MAKASAR,
    KAWI,
    LISU_SUPPLEMENT,
    TAMIL_SUPPLEMENT,
    CUNEIFORM,
    CUNEIFORM_NUMBERS_AND_PUNCTUATION,
    EARLY_DYNASTIC_CUNEIFORM,
    CYPRO_MINOAN,
    EGYPTIAN_HIEROGLYPHS,
    EGYPTIAN_HIEROGLYPH_FORMAT_CONTROLS,
    ANATOLIAN_HIEROGLYPHS,
    BAMUM_SUPPLEMENT,
    MRO,
    TANGSA,
    BASSA_VAH,
    PAHAWH_HMONG,
    MEDEFAIDRIN,
    MIAO,
    IDEOGRAPHIC_SYMBOLS_AND_PUNCTUATION,
    TANGUT,
    TANGUT_COMPONENTS,
    KHITAN_SMALL_SCRIPT,
    TANGUT_SUPPLEMENT,
    KANA_EXTENDED_B,
    KANA_SUPPLEMENT,
    KANA_EXTENDED_A,
    SMALL_KANA_EXTENSION,
    NUSHU,
    DUPLOYAN,
    SHORTHAND_FORMAT_CONTROLS,
    ZNAMENNY_MUSICAL_NOTATION,
    BYZANTINE_MUSICAL_SYMBOLS,
    MUSICAL_SYMBOLS,
    ANCIENT_GREEK_MUSICAL_NOTATION,
    KAKTOVIK_NUMERALS,
    MAYAN_NUMERALS,
    TAI_XUAN_JING_SYMBOLS,
    COUNTING_ROD_NUMERALS,
    MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
    SUTTON_SIGNWRITING,
    LATIN_EXTENDED_G,
    GLAGOLITIC_SUPPLEMENT,
    CYRILLIC_EXTENDED_D,
    NYIAKENG_PUACHUE_HMONG,
    TOTO,
    WANCHO,
    NAG_MUNDARI,
    ETHIOPIC_EXTENDED_B,
    MENDE_KIKAKUI,
    ADLAM,
    INDIC_SIYAQ_NUMBERS,
    OTTOMAN_SIYAQ_NUMBERS,
    ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS,
    MAHJONG_TILES,
    DOMINO_TILES,
    PLAYING_CARDS,
    ENCLOSED_ALPHANUMERIC_SUPPLEMENT,
    ENCLOSED_IDEOGRAPHIC_SUPPLEMENT,
    MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS,
    EMOTICONS,
    ORNAMENTAL_DINGBATS,
    TRANSPORT_AND_MAP_SYMBOLS,
    ALCHEMICAL_SYMBOLS,
    GEOMETRIC_SHAPES_EXTENDED,
    SUPPLEMENTAL_ARROWS_C,
    SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS,
    CHESS_SYMBOLS,
    SYMBOLS_AND_PICTOGRAPHS_EXTENDED_A,
    SYMBOLS_FOR_LEGACY_COMPUTING,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
    CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G,
    CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H,
    TAGS,
    VARIATION_SELECTORS_SUPPLEMENT,
    SUPPLEMENTARY_PRIVATE_USE_AREA_A,
    SUPPLEMENTARY_PRIVATE_USE_AREA_B,
];

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions