From 7257dece326375582875cca756639f85780e3393 Mon Sep 17 00:00:00 2001 From: Christopher Dryden Date: Mon, 2 Feb 2026 22:50:29 +0000 Subject: [PATCH 1/4] sort: add locale-aware month parsing using ICU --- .../cspell.dictionaries/jargon.wordlist.txt | 1 + Cargo.lock | 50 ++++++------ src/uu/sort/Cargo.toml | 2 + src/uu/sort/src/sort.rs | 46 +---------- src/uucore/Cargo.toml | 3 +- src/uucore/src/lib/features/i18n/mod.rs | 9 +++ src/uucore/src/lib/features/i18n/month.rs | 78 +++++++++++++++++++ tests/by-util/test_sort.rs | 47 +++++++++++ 8 files changed, 167 insertions(+), 69 deletions(-) create mode 100644 src/uucore/src/lib/features/i18n/month.rs diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index 0eb8b360673..f1bda1861a6 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -1,3 +1,4 @@ +janv AFAICT asimd ASIMD diff --git a/Cargo.lock b/Cargo.lock index 78e401b03c7..2db7fed087f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -284,9 +284,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.52" +version = "1.2.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932" dependencies = [ "find-msvc-tools", "shlex", @@ -337,18 +337,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.56" +version = "4.5.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75ca66430e33a14957acc24c5077b503e7d374151b2b4b3a10c83b4ceb4be0e" +checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.56" +version = "4.5.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793207c7fa6300a0608d1080b858e5fdbe713cdc1c8db9fb17777d8a13e63df0" +checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" dependencies = [ "anstream", "anstyle", @@ -384,9 +384,9 @@ dependencies = [ [[package]] name = "codspeed" -version = "4.3.0" +version = "4.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c2eb3388ebe26b5a0ab6bf4969d9c4840143d7f6df07caa3cc851b0606cef6" +checksum = "5f0d98d97fd75ca4489a1a0997820a6521531085e7c8a98941bd0e1264d567dd" dependencies = [ "anyhow", "cc", @@ -402,9 +402,9 @@ dependencies = [ [[package]] name = "codspeed-divan-compat" -version = "4.3.0" +version = "4.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2de65b7489a59709724d489070c6d05b7744039e4bf751d0a2006b90bb5593d" +checksum = "4179ec5518e79efcd02ed50aa483ff807902e43c85146e87fff58b9cffc06078" dependencies = [ "clap", "codspeed", @@ -415,9 +415,9 @@ dependencies = [ [[package]] name = "codspeed-divan-compat-macros" -version = "4.3.0" +version = "4.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56ca01ce4fd22b8dcc6c770dcd6b74343642e842482b94e8920d14e10c57638d" +checksum = "15eaee97aa5bceb32cc683fe25cd6373b7fc48baee5c12471996b58b6ddf0d7c" dependencies = [ "divan-macros", "itertools 0.14.0", @@ -429,9 +429,9 @@ dependencies = [ [[package]] name = "codspeed-divan-compat-walltime" -version = "4.3.0" +version = "4.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "720ab9d0714718afe5f5832be6e5f5eb5ce97836e24ca7bf7042eea4308b9fb8" +checksum = "c38671153aa73be075d6019cab5ab1e6b31d36644067c1ac4cef73bf9723ce33" dependencies = [ "cfg-if", "clap", @@ -1059,9 +1059,9 @@ dependencies = [ [[package]] name = "find-msvc-tools" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" [[package]] name = "fixed_decimal" @@ -2342,9 +2342,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.106" +version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7" dependencies = [ "unicode-ident", ] @@ -2375,9 +2375,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.44" +version = "1.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a" dependencies = [ "proc-macro2", ] @@ -2767,9 +2767,9 @@ dependencies = [ [[package]] name = "signal-hook" -version = "0.4.3" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b57709da74f9ff9f4a27dce9526eec25ca8407c45a7887243b031a58935fb8e" +checksum = "2a37d01603c37b5466f808de79f845c7116049b0579adb70a6b7d47c1fa3a952" dependencies = [ "libc", "signal-hook-registry", @@ -3414,7 +3414,7 @@ dependencies = [ "gcd", "libc", "nix", - "signal-hook 0.4.3", + "signal-hook 0.4.1", "tempfile", "thiserror 2.0.18", "uucore", @@ -5111,9 +5111,9 @@ checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" [[package]] name = "zopfli" diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index e487a1bfe49..ec0496a6e80 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -42,6 +42,7 @@ uucore = { workspace = true, features = [ "version-cmp", "i18n-decimal", "i18n-collator", + "i18n-month", ] } fluent = { workspace = true } @@ -60,6 +61,7 @@ uucore = { workspace = true, features = [ "parser-size", "version-cmp", "i18n-collator", + "i18n-month", ] } [[bin]] diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index a16198d3332..cbace2dfed0 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -50,6 +50,7 @@ use uucore::extendedbigdecimal::ExtendedBigDecimal; #[cfg(feature = "i18n-collator")] use uucore::i18n::collator::locale_cmp; use uucore::i18n::decimal::locale_decimal_separator; +use uucore::i18n::month::month_parse as locale_month_parse; use uucore::line_ending::LineEnding; use uucore::parser::num_parser::{ExtendedParser, ExtendedParserError}; use uucore::parser::parse_size::{ParseSizeError, Parser}; @@ -779,7 +780,7 @@ impl<'a> Line<'a> { .enumerate() .skip_while(|(_, c)| c.is_ascii_whitespace()); - let month = if month_parse(initial_selection) == Month::Unknown { + let month = if locale_month_parse(initial_selection) == 0 { // We failed to parse a month, which is equivalent to matching nothing. // Add the "no match for key" marker to the first non-whitespace character. let first_non_whitespace = month_chars.next(); @@ -2967,49 +2968,8 @@ fn random_shuffle(a: &[u8], b: &[u8], salt: &[u8]) -> Ordering { da.cmp(&db) } -#[derive(Eq, Ord, PartialEq, PartialOrd, Clone, Copy)] -enum Month { - Unknown, - January, - February, - March, - April, - May, - June, - July, - August, - September, - October, - November, - December, -} - -/// Parse the beginning string into a Month, returning [`Month::Unknown`] on errors. -fn month_parse(line: &[u8]) -> Month { - let line = line.trim_ascii_start(); - - match line.get(..3).map(|x| x.to_ascii_uppercase()).as_deref() { - Some(b"JAN") => Month::January, - Some(b"FEB") => Month::February, - Some(b"MAR") => Month::March, - Some(b"APR") => Month::April, - Some(b"MAY") => Month::May, - Some(b"JUN") => Month::June, - Some(b"JUL") => Month::July, - Some(b"AUG") => Month::August, - Some(b"SEP") => Month::September, - Some(b"OCT") => Month::October, - Some(b"NOV") => Month::November, - Some(b"DEC") => Month::December, - _ => Month::Unknown, - } -} - fn month_compare(a: &[u8], b: &[u8]) -> Ordering { - let ma = month_parse(a); - let mb = month_parse(b); - - ma.cmp(&mb) + locale_month_parse(a).cmp(&locale_month_parse(b)) } fn print_sorted<'a, T: Iterator>>( diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml index d18d0630ed5..2a8cb2616b9 100644 --- a/src/uucore/Cargo.toml +++ b/src/uucore/Cargo.toml @@ -150,7 +150,7 @@ format = [ "quoting-style", "unit-prefix", ] -i18n-all = ["i18n-collator", "i18n-decimal", "i18n-datetime"] +i18n-all = ["i18n-collator", "i18n-decimal", "i18n-datetime", "i18n-month"] i18n-common = ["icu_locale"] i18n-collator = ["i18n-common", "icu_collator"] i18n-decimal = ["i18n-common", "icu_decimal", "icu_provider"] @@ -161,6 +161,7 @@ i18n-datetime = [ "jiff-icu", "jiff", ] +i18n-month = ["i18n-common", "icu_datetime", "icu_provider", "libc"] mode = ["libc"] perms = ["entries", "libc", "walkdir"] buf-copy = [] diff --git a/src/uucore/src/lib/features/i18n/mod.rs b/src/uucore/src/lib/features/i18n/mod.rs index e8e0f3f3c5d..5f0dfec0050 100644 --- a/src/uucore/src/lib/features/i18n/mod.rs +++ b/src/uucore/src/lib/features/i18n/mod.rs @@ -13,6 +13,8 @@ pub mod collator; pub mod datetime; #[cfg(feature = "i18n-decimal")] pub mod decimal; +#[cfg(feature = "i18n-month")] +pub mod month; /// The encoding specified by the locale, if specified /// Currently only supports ASCII and UTF-8 for the sake of simplicity. @@ -86,6 +88,13 @@ pub fn get_numeric_locale() -> &'static (Locale, UEncoding) { NUMERIC_LOCALE.get_or_init(|| get_locale_from_env("LC_NUMERIC")) } +/// Get the time locale from the environment (used for month names, etc.) +pub fn get_time_locale() -> &'static (Locale, UEncoding) { + static TIME_LOCALE: OnceLock<(Locale, UEncoding)> = OnceLock::new(); + + TIME_LOCALE.get_or_init(|| get_locale_from_env("LC_TIME")) +} + /// Return the encoding deduced from the locale environment variable. pub fn get_locale_encoding() -> UEncoding { get_collating_locale().1 diff --git a/src/uucore/src/lib/features/i18n/month.rs b/src/uucore/src/lib/features/i18n/month.rs new file mode 100644 index 00000000000..68471246409 --- /dev/null +++ b/src/uucore/src/lib/features/i18n/month.rs @@ -0,0 +1,78 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use std::sync::OnceLock; + +use icu_datetime::provider::neo::{DatetimeNamesMonthGregorianV1, MonthNames}; +use icu_locale::{Locale, locale}; +use icu_provider::prelude::*; + +use crate::i18n::get_time_locale; + +fn load_month_names(loc: &Locale) -> Option> { + let data_locale = DataLocale::from(loc.clone()); + let abbr_attr = DataMarkerAttributes::from_str_or_panic("3"); + let request = DataRequest { + id: DataIdentifierBorrowed::for_marker_attributes_and_locale(abbr_attr, &data_locale), + metadata: DataRequestMetadata::default(), + }; + + let response: DataResponse = + icu_datetime::provider::Baked.load(request).ok()?; + + if let MonthNames::Linear(names) = response.payload.get() { + let mut result = Vec::new(); + for (i, name) in names.iter().take(12).enumerate() { + let month = (i + 1) as u8; + let upper = name.to_uppercase(); + // Some locales use trailing periods in abbreviated months (e.g., "janv." in French). + // Store both with and without the period so we can match either format. + let stripped = upper.trim_end_matches('.'); + if stripped != upper { + result.push((stripped.to_string(), month)); + } + result.push((upper, month)); + } + return Some(result); + } + None +} + +fn get_month_names() -> &'static Vec<(String, u8)> { + static MONTH_NAMES: OnceLock> = OnceLock::new(); + MONTH_NAMES.get_or_init(|| { + let loc = get_time_locale().0.clone(); + load_month_names(&loc) + .or_else(|| load_month_names(&locale!("en"))) + .expect("ICU should always have English month data") + }) +} + +/// Parse a month name from the beginning of the input bytes. +/// Returns month number (1-12) or 0 if not recognized. +pub fn month_parse(input: &[u8]) -> u8 { + let input = input.trim_ascii_start(); + + // Convert bytes to string for comparison. For valid UTF-8, use it directly. + // For non-UTF-8 (e.g., Latin-1 locales), treat each byte as a Unicode codepoint. + // This handles legacy encodings like ISO-8859-1 where byte 0xE9 = 'é'. + let input_upper = std::str::from_utf8(input).map_or_else( + |_| { + input + .iter() + .map(|&b| b as char) + .collect::() + .to_uppercase() + }, + |s| s.to_uppercase(), + ); + + for (name, month) in get_month_names() { + if input_upper.starts_with(name) { + return *month; + } + } + 0 +} diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index 42cbcff5ad4..e08e16fa320 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -2704,4 +2704,51 @@ fn test_locale_complex_utf8_sorting() { .stdout_is("apple\nApple\nbanana\nBanana\nzebra\nZebra\n"); } +#[test] +fn test_month_sort_english() { + new_ucmd!() + .arg("-M") + .pipe_in("Dec\nJan\nMar\nFeb\n") + .succeeds() + .stdout_only("Jan\nFeb\nMar\nDec\n"); +} + +#[test] +fn test_month_sort_case_insensitive() { + new_ucmd!() + .arg("-M") + .pipe_in("dec\nJAN\nmar\nFEB\n") + .succeeds() + .stdout_only("JAN\nFEB\nmar\ndec\n"); +} + +#[test] +fn test_month_sort_with_prefix() { + new_ucmd!() + .arg("-M") + .pipe_in("December 25\nJanuary 1\nMarch 15\n") + .succeeds() + .stdout_only("January 1\nMarch 15\nDecember 25\n"); +} + +#[test] +fn test_month_sort_unknown_sorted_first() { + // Unknown month names sort before known months + new_ucmd!() + .arg("-M") + .pipe_in("Jan\nxyz\nFeb\nabc\n") + .succeeds() + .stdout_only("abc\nxyz\nJan\nFeb\n"); +} + +#[test] +fn test_month_sort_french_locale() { + new_ucmd!() + .arg("-M") + .env("LC_ALL", "fr_FR.UTF-8") + .pipe_in("déc.\njanv.\nmars\nfévr.\n") + .succeeds() + .stdout_only("janv.\nfévr.\nmars\ndéc.\n"); +} + /* spell-checker: enable */ From 539e8a4dffbdfd5c9cfcd7781dc21e3a6c5235fe Mon Sep 17 00:00:00 2001 From: Christopher Dryden Date: Fri, 19 Dec 2025 23:36:18 +0000 Subject: [PATCH 2/4] CI: add ja_JP.UTF-8 locale for sort-month test --- .github/workflows/GnuTests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/GnuTests.yml b/.github/workflows/GnuTests.yml index 8bc78bf19a0..2229d807b55 100644 --- a/.github/workflows/GnuTests.yml +++ b/.github/workflows/GnuTests.yml @@ -89,6 +89,7 @@ jobs: sudo locale-gen --keep-existing am_ET.UTF-8 # Ethiopia sudo locale-gen --keep-existing th_TH.UTF-8 # Thailand sudo locale-gen --keep-existing zh_CN.GB18030 # China + sudo locale-gen --keep-existing ja_JP.UTF-8 # Japan sudo update-locale echo "After:" From 66e7f8c9acb47076c9bb62dc133db8471686e264 Mon Sep 17 00:00:00 2001 From: Christopher Dryden Date: Wed, 14 Jan 2026 00:43:19 +0000 Subject: [PATCH 3/4] deny.toml: skip hashbrown 0.15.5 duplicate --- deny.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deny.toml b/deny.toml index 24222750941..227be9d6f8b 100644 --- a/deny.toml +++ b/deny.toml @@ -88,7 +88,7 @@ skip = [ { name = "itertools", version = "0.13.0" }, # ordered-multimap { name = "hashbrown", version = "0.14.5" }, - # lru (via num-prime) + # lru (via num-prime), icu4x { name = "hashbrown", version = "0.15.5" }, # cexpr (via bindgen) { name = "nom", version = "7.1.3" }, From c556ee0958cc2064a62b7c160a9cf27b2ca18198 Mon Sep 17 00:00:00 2001 From: Christopher Dryden Date: Mon, 26 Jan 2026 17:15:54 +0000 Subject: [PATCH 4/4] sort: fix month parsing for C/POSIX locale --- src/uucore/src/lib/features/i18n/month.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/uucore/src/lib/features/i18n/month.rs b/src/uucore/src/lib/features/i18n/month.rs index 68471246409..87d75c6f0dd 100644 --- a/src/uucore/src/lib/features/i18n/month.rs +++ b/src/uucore/src/lib/features/i18n/month.rs @@ -44,7 +44,14 @@ fn get_month_names() -> &'static Vec<(String, u8)> { static MONTH_NAMES: OnceLock> = OnceLock::new(); MONTH_NAMES.get_or_init(|| { let loc = get_time_locale().0.clone(); - load_month_names(&loc) + // For undefined locale (C/POSIX), ICU returns generic month names like "M01", "M02" + // which aren't useful for matching. Skip directly to English fallback. + let result = if loc == locale!("und") { + None + } else { + load_month_names(&loc) + }; + result .or_else(|| load_month_names(&locale!("en"))) .expect("ICU should always have English month data") })