From 6d8a7814d57a3ba8dac61980bee0160c91d1567e Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 20 Oct 2024 11:37:18 -0500 Subject: [PATCH 01/10] echo: handle multibyte escape sequences Bug was reported, with root cause analysis, by kkew3 Added tests were derived from test cases provided by kkew3 See https://github.com/uutils/coreutils/issues/6741 --- src/uu/echo/src/echo.rs | 53 ++++++++++++++++++++++---------------- tests/by-util/test_echo.rs | 35 +++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index c94443822e0..243c0d816b8 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -39,7 +39,7 @@ impl Base { } /// Parse the numeric part of the `\xHHH` and `\0NNN` escape sequences -fn parse_code(input: &mut Peekable, base: Base) -> Option { +fn parse_code(input: &mut Peekable, base: Base) -> Option { // All arithmetic on `ret` needs to be wrapping, because octal input can // take 3 digits, which is 9 bits, and therefore more than what fits in a // `u8`. GNU just seems to wrap these values. @@ -60,14 +60,16 @@ fn parse_code(input: &mut Peekable, base: Base) -> Option { let _ = input.next(); } - Some(ret.into()) + Some(ret) } fn print_escaped(input: &str, mut output: impl Write) -> io::Result> { let mut iter = input.chars().peekable(); + while let Some(c) = iter.next() { if c != '\\' { write!(output, "{c}")?; + continue; } @@ -76,40 +78,47 @@ fn print_escaped(input: &str, mut output: impl Write) -> io::Result '\\', - 'a' => '\x07', - 'b' => '\x08', + // For extending lifetime + let sl: [u8; 1_usize]; + + let unescaped: &[u8] = match next { + '\\' => br"\", + 'a' => b"\x07", + 'b' => b"\x08", 'c' => return Ok(ControlFlow::Break(())), - 'e' => '\x1b', - 'f' => '\x0c', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'v' => '\x0b', + 'e' => b"\x1b", + 'f' => b"\x0c", + 'n' => b"\n", + 'r' => b"\r", + 't' => b"\t", + 'v' => b"\x0b", 'x' => { - if let Some(c) = parse_code(&mut iter, Base::Hex) { - c + if let Some(ue) = parse_code(&mut iter, Base::Hex) { + sl = [ue]; + + &sl } else { - write!(output, "\\")?; - 'x' + br"\x" } } - '0' => parse_code(&mut iter, Base::Oct).unwrap_or('\0'), + '0' => &[parse_code(&mut iter, Base::Oct).unwrap_or(b'\0')], c => { - write!(output, "\\")?; - c + write!(output, "\\{c}")?; + + continue; } }; - write!(output, "{unescaped}")?; + + output.write_all(unescaped)?; } else { - write!(output, "\\")?; + output.write_all(br"\")?; } } diff --git a/tests/by-util/test_echo.rs b/tests/by-util/test_echo.rs index 4ae623f2f6f..6475d7f6c58 100644 --- a/tests/by-util/test_echo.rs +++ b/tests/by-util/test_echo.rs @@ -303,3 +303,38 @@ fn partial_version_argument() { fn partial_help_argument() { new_ucmd!().arg("--he").succeeds().stdout_is("--he\n"); } + +#[test] +fn multibyte_escape_unicode() { + // spell-checker:disable-next-line + // Tests suggested by kkew3 + // https://github.com/uutils/coreutils/issues/6741 + + // \u{1F602} is: + // + // "Face with Tears of Joy" + // U+1F602 + // "😂" + + new_ucmd!() + .args(&["-e", r"\xf0\x9f\x98\x82"]) + .succeeds() + .stdout_only("\u{1F602}\n"); + + new_ucmd!() + .args(&["-e", r"\x41\xf0\x9f\x98\x82\x42"]) + .succeeds() + .stdout_only("A\u{1F602}B\n"); + + new_ucmd!() + .args(&["-e", r"\xf0\x41\x9f\x98\x82"]) + .succeeds() + .stdout_is_bytes(b"\xF0A\x9F\x98\x82\n") + .no_stderr(); + + new_ucmd!() + .args(&["-e", r"\x41\xf0\c\x9f\x98\x82"]) + .succeeds() + .stdout_is_bytes(b"A\xF0") + .no_stderr(); +} From 5d9fa3840aebcf8549395a0c7594dd03626501b0 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 20 Oct 2024 11:40:19 -0500 Subject: [PATCH 02/10] Use concrete type --- src/uu/echo/src/echo.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 243c0d816b8..b779b418e93 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -4,7 +4,7 @@ // file that was distributed with this source code. use clap::{crate_version, Arg, ArgAction, Command}; -use std::io::{self, Write}; +use std::io::{self, StdoutLock, Write}; use std::iter::Peekable; use std::ops::ControlFlow; use std::str::Chars; @@ -63,7 +63,7 @@ fn parse_code(input: &mut Peekable, base: Base) -> Option { Some(ret) } -fn print_escaped(input: &str, mut output: impl Write) -> io::Result> { +fn print_escaped(input: &str, output: &mut StdoutLock) -> io::Result> { let mut iter = input.chars().peekable(); while let Some(c) = iter.next() { From 7ab6c017a7179c488e51063c9f3226ea57797252 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 20 Oct 2024 11:44:09 -0500 Subject: [PATCH 03/10] Fix MSRV issue --- src/uu/echo/src/echo.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index b779b418e93..ff84048a170 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -87,6 +87,7 @@ fn print_escaped(input: &str, output: &mut StdoutLock) -> io::Result br"\", @@ -108,7 +109,11 @@ fn print_escaped(input: &str, output: &mut StdoutLock) -> io::Result &[parse_code(&mut iter, Base::Oct).unwrap_or(b'\0')], + '0' => { + sli = [parse_code(&mut iter, Base::Oct).unwrap_or(b'\0')]; + + &sli + } c => { write!(output, "\\{c}")?; From 199c94700dfff38e64b8e3aaf031082db63def41 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 20 Oct 2024 15:49:03 -0500 Subject: [PATCH 04/10] Fix non-UTF-8 argument handling --- src/uu/echo/src/echo.rs | 189 ++++++++++++++++++++++++++----------- tests/by-util/test_echo.rs | 48 ++++++++++ 2 files changed, 181 insertions(+), 56 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index ff84048a170..e5c59972917 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -3,11 +3,14 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use clap::builder::ValueParser; +use clap::parser::ValuesRef; use clap::{crate_version, Arg, ArgAction, Command}; +use std::ffi::{OsStr, OsString}; use std::io::{self, StdoutLock, Write}; use std::iter::Peekable; use std::ops::ControlFlow; -use std::str::Chars; +use std::slice::Iter; use uucore::error::{FromIo, UResult}; use uucore::{format_usage, help_about, help_section, help_usage}; @@ -22,14 +25,19 @@ mod options { pub const DISABLE_BACKSLASH_ESCAPE: &str = "disable_backslash_escape"; } -#[repr(u8)] -#[derive(Clone, Copy)] enum Base { - Oct = 8, - Hex = 16, + Oct, + Hex, } impl Base { + fn radix(&self) -> u8 { + match self { + Self::Oct => 8, + Self::Hex => 16, + } + } + fn max_digits(&self) -> u8 { match self { Self::Oct => 3, @@ -39,36 +47,58 @@ impl Base { } /// Parse the numeric part of the `\xHHH` and `\0NNN` escape sequences -fn parse_code(input: &mut Peekable, base: Base) -> Option { - // All arithmetic on `ret` needs to be wrapping, because octal input can +fn parse_code(input: &mut Peekable>, base: Base) -> Option { + // All arithmetic on `sum` needs to be wrapping, because octal input can // take 3 digits, which is 9 bits, and therefore more than what fits in a // `u8`. GNU just seems to wrap these values. - // Note that if we instead make `ret` a `u32` and use `char::from_u32` will - // yield incorrect results because it will interpret values larger than - // `u8::MAX` as unicode. - let mut ret = input.peek().and_then(|c| c.to_digit(base as u32))? as u8; + let radix = base.radix(); + let radix_u_three_two = u32::from(radix); + + let mut sum = match input.peek() { + Some(&&ue) => match char::from(ue).to_digit(radix_u_three_two) { + // A u8 interpreted as a hexadecimal or octal digit is never more than 16 + Some(ut) => u8::try_from(ut).unwrap(), + None => { + return None; + } + }, + None => { + return None; + } + }; // We can safely ignore the None case because we just peeked it. let _ = input.next(); for _ in 1..base.max_digits() { - match input.peek().and_then(|c| c.to_digit(base as u32)) { - Some(n) => ret = ret.wrapping_mul(base as u8).wrapping_add(n as u8), - None => break, + match input + .peek() + .and_then(|&&ue| char::from(ue).to_digit(radix_u_three_two)) + { + Some(ut) => { + // A u8 interpreted as a hexadecimal or octal digit is never more than 16 + let ue = u8::try_from(ut).unwrap(); + + sum = sum.wrapping_mul(radix).wrapping_add(ue) + } + None => { + break; + } } + // We can safely ignore the None case because we just peeked it. let _ = input.next(); } - Some(ret) + Some(sum) } -fn print_escaped(input: &str, output: &mut StdoutLock) -> io::Result> { - let mut iter = input.chars().peekable(); +fn print_escaped(input: &[u8], output: &mut StdoutLock) -> io::Result> { + let mut iter = input.iter().peekable(); - while let Some(c) = iter.next() { - if c != '\\' { - write!(output, "{c}")?; + while let Some(¤t_byte) = iter.next() { + if current_byte != b'\\' { + output.write_all(&[current_byte])?; continue; } @@ -76,7 +106,7 @@ fn print_escaped(input: &str, output: &mut StdoutLock) -> io::Result io::Result br"\", - 'a' => b"\x07", - 'b' => b"\x08", - 'c' => return Ok(ControlFlow::Break(())), - 'e' => b"\x1b", - 'f' => b"\x0c", - 'n' => b"\n", - 'r' => b"\r", - 't' => b"\t", - 'v' => b"\x0b", - 'x' => { + let sli: [u8; 2_usize]; + + let unescaped: &[u8] = match *next { + b'\\' => br"\", + b'a' => b"\x07", + b'b' => b"\x08", + b'c' => return Ok(ControlFlow::Break(())), + b'e' => b"\x1b", + b'f' => b"\x0c", + b'n' => b"\n", + b'r' => b"\r", + b't' => b"\t", + b'v' => b"\x0b", + b'x' => { if let Some(ue) = parse_code(&mut iter, Base::Hex) { sl = [ue]; @@ -109,15 +139,19 @@ fn print_escaped(input: &str, output: &mut StdoutLock) -> io::Result { - sli = [parse_code(&mut iter, Base::Oct).unwrap_or(b'\0')]; + b'0' => { + // \0 with any non-octal digit after it is 0 + let parsed_octal_number_or_zero = + parse_code(&mut iter, Base::Oct).unwrap_or(b'\0'); - &sli + sl = [parsed_octal_number_or_zero]; + + &sl } - c => { - write!(output, "\\{c}")?; + ue => { + sli = [b'\\', ue]; - continue; + &sli } }; @@ -134,15 +168,29 @@ fn print_escaped(input: &str, output: &mut StdoutLock) -> io::Result UResult<()> { let matches = uu_app().get_matches_from(args); + // TODO + // "If the POSIXLY_CORRECT environment variable is set, then when echo’s first argument is not -n it outputs option-like arguments instead of treating them as options." + // https://www.gnu.org/software/coreutils/manual/html_node/echo-invocation.html + let no_newline = matches.get_flag(options::NO_NEWLINE); let escaped = matches.get_flag(options::ENABLE_BACKSLASH_ESCAPE); - let values: Vec = match matches.get_many::(options::STRING) { - Some(s) => s.map(|s| s.to_string()).collect(), - None => vec![String::new()], - }; - execute(no_newline, escaped, &values) - .map_err_context(|| "could not write to stdout".to_string()) + let mut stdout_lock = io::stdout().lock(); + + match matches.get_many::(options::STRING) { + Some(va) => { + execute(&mut stdout_lock, no_newline, escaped, va) + .map_err_context(|| "could not write to stdout".to_string())?; + } + None => { + // No strings to print, so just handle newline setting + if !no_newline { + stdout_lock.write_all(b"\n")?; + } + } + } + + Ok(()) } pub fn uu_app() -> Command { @@ -179,29 +227,58 @@ pub fn uu_app() -> Command { .action(ArgAction::SetTrue) .overrides_with(options::ENABLE_BACKSLASH_ESCAPE), ) - .arg(Arg::new(options::STRING).action(ArgAction::Append)) + .arg( + Arg::new(options::STRING) + .action(ArgAction::Append) + .value_parser(ValueParser::os_string()), + ) } -fn execute(no_newline: bool, escaped: bool, free: &[String]) -> io::Result<()> { - let stdout = io::stdout(); - let mut output = stdout.lock(); +fn execute( + stdout_lock: &mut StdoutLock, + no_newline: bool, + escaped: bool, + non_option_arguments: ValuesRef<'_, OsString>, +) -> io::Result<()> { + for (i, input) in non_option_arguments.into_iter().enumerate() { + let bytes = bytes_from_os_string(input.as_os_str()); - for (i, input) in free.iter().enumerate() { if i > 0 { - write!(output, " ")?; + stdout_lock.write_all(b" ")?; } + if escaped { - if print_escaped(input, &mut output)?.is_break() { + if print_escaped(bytes, stdout_lock)?.is_break() { return Ok(()); } } else { - write!(output, "{input}")?; + stdout_lock.write_all(bytes)?; } } if !no_newline { - writeln!(output)?; + stdout_lock.write_all(b"\n")?; } Ok(()) } + +fn bytes_from_os_string(input: &OsStr) -> &[u8] { + let bytes = { + #[cfg(target_family = "unix")] + { + use std::os::unix::ffi::OsStrExt; + + input.as_bytes() + } + + #[cfg(not(target_family = "unix"))] + { + // TODO + // Verify + input.as_encoded_bytes() + } + }; + + bytes +} diff --git a/tests/by-util/test_echo.rs b/tests/by-util/test_echo.rs index 6475d7f6c58..620f1990eb1 100644 --- a/tests/by-util/test_echo.rs +++ b/tests/by-util/test_echo.rs @@ -5,6 +5,7 @@ // spell-checker:ignore (words) araba merci use crate::common::util::TestScenario; +use std::ffi::OsStr; #[test] fn test_default() { @@ -338,3 +339,50 @@ fn multibyte_escape_unicode() { .stdout_is_bytes(b"A\xF0") .no_stderr(); } + +#[test] +fn non_utf_8_hex_round_trip() { + new_ucmd!() + .args(&["-e", r"\xFF"]) + .succeeds() + .stdout_is_bytes(b"\xFF\n") + .no_stderr(); +} + +#[test] +fn nine_bit_octal() { + const RESULT: &[u8] = b"\xFF\n"; + + new_ucmd!() + .args(&["-e", r"\0777"]) + .succeeds() + .stdout_is_bytes(RESULT) + .no_stderr(); + + new_ucmd!() + .args(&["-e", r"\777"]) + .succeeds() + .stdout_is_bytes(RESULT) + .no_stderr(); +} + +#[test] +#[cfg(target_family = "unix")] +fn non_utf_8() { + use std::os::unix::ffi::OsStrExt; + + // ISO-8859-1 encoded text + // spell-checker:disable + const INPUT_AND_OUTPUT: &[u8] = + b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre."; + // spell-checker:enable + + let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("-n") + .arg(os_str) + .succeeds() + .stdout_is_bytes(INPUT_AND_OUTPUT) + .no_stderr(); +} From 711b17af5b6c6869922644dd5dada094ef97d98c Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 20 Oct 2024 16:05:03 -0500 Subject: [PATCH 05/10] Fix MSRV issue --- src/uu/echo/src/echo.rs | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index e5c59972917..ac49ef14d3b 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -11,7 +11,7 @@ use std::io::{self, StdoutLock, Write}; use std::iter::Peekable; use std::ops::ControlFlow; use std::slice::Iter; -use uucore::error::{FromIo, UResult}; +use uucore::error::{UResult, USimpleError}; use uucore::{format_usage, help_about, help_section, help_usage}; const ABOUT: &str = help_about!("echo.md"); @@ -179,8 +179,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { match matches.get_many::(options::STRING) { Some(va) => { - execute(&mut stdout_lock, no_newline, escaped, va) - .map_err_context(|| "could not write to stdout".to_string())?; + execute(&mut stdout_lock, no_newline, escaped, va)?; } None => { // No strings to print, so just handle newline setting @@ -239,9 +238,14 @@ fn execute( no_newline: bool, escaped: bool, non_option_arguments: ValuesRef<'_, OsString>, -) -> io::Result<()> { +) -> UResult<()> { for (i, input) in non_option_arguments.into_iter().enumerate() { - let bytes = bytes_from_os_string(input.as_os_str()); + let Some(bytes) = bytes_from_os_string(input.as_os_str()) else { + return Err(USimpleError::new( + 1, + "Non-UTF-8 arguments provided, but this platform does not support them", + )); + }; if i > 0 { stdout_lock.write_all(b" ")?; @@ -263,22 +267,24 @@ fn execute( Ok(()) } -fn bytes_from_os_string(input: &OsStr) -> &[u8] { - let bytes = { +fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> { + let option = { #[cfg(target_family = "unix")] { use std::os::unix::ffi::OsStrExt; - input.as_bytes() + Some(input.as_bytes()) } #[cfg(not(target_family = "unix"))] { // TODO - // Verify - input.as_encoded_bytes() + match input.to_str() { + Some(st) => Some(st.as_bytes()), + None => None, + } } }; - bytes + option } From c46d8af183662f577e75d2eeb9605de169088332 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 20 Oct 2024 16:32:12 -0500 Subject: [PATCH 06/10] Fix Clippy violation --- src/uu/echo/src/echo.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index ac49ef14d3b..73da2c17b0f 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -279,10 +279,8 @@ fn bytes_from_os_string(input: &OsStr) -> Option<&[u8]> { #[cfg(not(target_family = "unix"))] { // TODO - match input.to_str() { - Some(st) => Some(st.as_bytes()), - None => None, - } + // Verify that this works correctly on these platforms + input.to_str().map(|st| st.as_bytes()) } }; From d1e6d0a2a5e2ad7956421e988b7c9c4f587ce5c3 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Sun, 20 Oct 2024 22:39:27 -0500 Subject: [PATCH 07/10] Fix compiler warning --- tests/by-util/test_echo.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/by-util/test_echo.rs b/tests/by-util/test_echo.rs index 620f1990eb1..d8f56fdcaef 100644 --- a/tests/by-util/test_echo.rs +++ b/tests/by-util/test_echo.rs @@ -5,7 +5,6 @@ // spell-checker:ignore (words) araba merci use crate::common::util::TestScenario; -use std::ffi::OsStr; #[test] fn test_default() { @@ -369,6 +368,7 @@ fn nine_bit_octal() { #[test] #[cfg(target_family = "unix")] fn non_utf_8() { + use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; // ISO-8859-1 encoded text From 4be524a3e1dd201c524b79f97b9a8610eaa2a742 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 21 Oct 2024 09:55:57 -0500 Subject: [PATCH 08/10] Address PR comments --- src/uu/echo/src/echo.rs | 236 +++++++++++++++++++++++++++---------- tests/by-util/test_echo.rs | 11 +- 2 files changed, 181 insertions(+), 66 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 73da2c17b0f..60d52a9fb12 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -25,69 +25,158 @@ mod options { pub const DISABLE_BACKSLASH_ESCAPE: &str = "disable_backslash_escape"; } +enum BackslashNumberType { + OctalStartingWithNonZero(u8), + OctalStartingWithZero, + Hexadecimal, +} + +impl BackslashNumberType { + fn base(&self) -> Base { + match self { + BackslashNumberType::OctalStartingWithZero + | BackslashNumberType::OctalStartingWithNonZero(_) => Base::Octal, + BackslashNumberType::Hexadecimal => Base::Hexadecimal, + } + } +} + enum Base { - Oct, - Hex, + Octal, + Hexadecimal, } impl Base { - fn radix(&self) -> u8 { + fn ascii_to_number(&self, digit: u8) -> Option { + fn octal_ascii_digit_to_number(digit: u8) -> Option { + let number = match digit { + b'0' => 0, + b'1' => 1, + b'2' => 2, + b'3' => 3, + b'4' => 4, + b'5' => 5, + b'6' => 6, + b'7' => 7, + _ => { + return None; + } + }; + + Some(number) + } + + fn hexadecimal_ascii_digit_to_number(digit: u8) -> Option { + let number = match digit { + b'0' => 0, + b'1' => 1, + b'2' => 2, + b'3' => 3, + b'4' => 4, + b'5' => 5, + b'6' => 6, + b'7' => 7, + b'8' => 8, + b'9' => 9, + b'A' | b'a' => 10, + b'B' | b'b' => 11, + b'C' | b'c' => 12, + b'D' | b'd' => 13, + b'E' | b'e' => 14, + b'F' | b'f' => 15, + _ => { + return None; + } + }; + + Some(number) + } + match self { - Self::Oct => 8, - Self::Hex => 16, + Self::Octal => octal_ascii_digit_to_number(digit), + Self::Hexadecimal => hexadecimal_ascii_digit_to_number(digit), } } - fn max_digits(&self) -> u8 { + fn maximum_number_of_digits(&self) -> u8 { match self { - Self::Oct => 3, - Self::Hex => 2, + Self::Octal => 3, + Self::Hexadecimal => 2, } } -} -/// Parse the numeric part of the `\xHHH` and `\0NNN` escape sequences -fn parse_code(input: &mut Peekable>, base: Base) -> Option { - // All arithmetic on `sum` needs to be wrapping, because octal input can - // take 3 digits, which is 9 bits, and therefore more than what fits in a - // `u8`. GNU just seems to wrap these values. - let radix = base.radix(); - let radix_u_three_two = u32::from(radix); + fn radix(&self) -> u8 { + match self { + Self::Octal => 8, + Self::Hexadecimal => 16, + } + } +} - let mut sum = match input.peek() { - Some(&&ue) => match char::from(ue).to_digit(radix_u_three_two) { - // A u8 interpreted as a hexadecimal or octal digit is never more than 16 - Some(ut) => u8::try_from(ut).unwrap(), - None => { - return None; +/// Parse the numeric part of `\xHHH`, `\0NNN`, and `\NNN` escape sequences +fn parse_backslash_number( + input: &mut Peekable>, + backslash_number_type: BackslashNumberType, +) -> Option { + let first_digit_ascii = match backslash_number_type { + BackslashNumberType::OctalStartingWithZero | BackslashNumberType::Hexadecimal => { + match input.peek() { + Some(&&digit_ascii) => digit_ascii, + None => { + // One of the following cases: argument ends with "\0" or "\x" + // If "\0" (octal): caller will print not ASCII '0', 0x30, but ASCII '\0' (NUL), 0x00 + // If "\x" (hexadecimal): caller will print literal "\x" + return None; + } } - }, + } + // Never returns early when backslash number starts with "\1" through "\7", because caller provides the + // first digit + BackslashNumberType::OctalStartingWithNonZero(digit_ascii) => digit_ascii, + }; + + let base = backslash_number_type.base(); + + let first_digit_number = match base.ascii_to_number(first_digit_ascii) { + Some(digit_number) => { + // Move past byte, since it was successfully parsed + let _ = input.next(); + + digit_number + } None => { + // The first digit was not a valid octal or hexadecimal digit + // This should never be the case when the backslash number starts with "\1" through "\7" + // (caller unwraps to verify this) return None; } }; - // We can safely ignore the None case because we just peeked it. - let _ = input.next(); + let radix = base.radix(); + + let mut sum = first_digit_number; - for _ in 1..base.max_digits() { + for _ in 1..(base.maximum_number_of_digits()) { match input .peek() - .and_then(|&&ue| char::from(ue).to_digit(radix_u_three_two)) + .and_then(|&&digit_ascii| base.ascii_to_number(digit_ascii)) { - Some(ut) => { - // A u8 interpreted as a hexadecimal or octal digit is never more than 16 - let ue = u8::try_from(ut).unwrap(); - - sum = sum.wrapping_mul(radix).wrapping_add(ue) + Some(digit_number) => { + // Move past byte, since it was successfully parsed + let _ = input.next(); + + // All arithmetic on `sum` needs to be wrapping, because octal input can + // take 3 digits, which is 9 bits, and therefore more than what fits in a + // `u8`. + // + // GNU Core Utilities: "if nnn is a nine-bit value, the ninth bit is ignored" + // https://www.gnu.org/software/coreutils/manual/html_node/echo-invocation.html + sum = sum.wrapping_mul(radix).wrapping_add(digit_number); } None => { break; } } - - // We can safely ignore the None case because we just peeked it. - let _ = input.next(); } Some(sum) @@ -103,55 +192,69 @@ fn print_escaped(input: &[u8], output: &mut StdoutLock) -> io::Result= 1.79.0 + // https://github.com/rust-lang/rust/pull/121346 + let hold_one_byte_outside_of_match: [u8; 1_usize]; + let hold_two_bytes_outside_of_match: [u8; 2_usize]; let unescaped: &[u8] = match *next { b'\\' => br"\", b'a' => b"\x07", b'b' => b"\x08", b'c' => return Ok(ControlFlow::Break(())), - b'e' => b"\x1b", - b'f' => b"\x0c", + b'e' => b"\x1B", + b'f' => b"\x0C", b'n' => b"\n", b'r' => b"\r", b't' => b"\t", - b'v' => b"\x0b", + b'v' => b"\x0B", b'x' => { - if let Some(ue) = parse_code(&mut iter, Base::Hex) { - sl = [ue]; + if let Some(parsed_hexadecimal_number) = + parse_backslash_number(&mut iter, BackslashNumberType::Hexadecimal) + { + hold_one_byte_outside_of_match = [parsed_hexadecimal_number]; - &sl + &hold_one_byte_outside_of_match } else { + // "\x" with any non-hexadecimal digit after means "\x" is treated literally br"\x" } } b'0' => { - // \0 with any non-octal digit after it is 0 - let parsed_octal_number_or_zero = - parse_code(&mut iter, Base::Oct).unwrap_or(b'\0'); - - sl = [parsed_octal_number_or_zero]; + if let Some(parsed_octal_number) = parse_backslash_number( + &mut iter, + BackslashNumberType::OctalStartingWithZero, + ) { + hold_one_byte_outside_of_match = [parsed_octal_number]; - &sl + &hold_one_byte_outside_of_match + } else { + // "\0" with any non-octal digit after it means "\0" is treated as ASCII '\0' (NUL), 0x00 + b"\0" + } } - ue => { - sli = [b'\\', ue]; + other_byte => { + // Backslash and the following byte are treated literally + hold_two_bytes_outside_of_match = [b'\\', other_byte]; - &sli + &hold_two_bytes_outside_of_match } }; @@ -178,8 +281,13 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let mut stdout_lock = io::stdout().lock(); match matches.get_many::(options::STRING) { - Some(va) => { - execute(&mut stdout_lock, no_newline, escaped, va)?; + Some(arguments_after_options) => { + execute( + &mut stdout_lock, + no_newline, + escaped, + arguments_after_options, + )?; } None => { // No strings to print, so just handle newline setting @@ -237,9 +345,9 @@ fn execute( stdout_lock: &mut StdoutLock, no_newline: bool, escaped: bool, - non_option_arguments: ValuesRef<'_, OsString>, + arguments_after_options: ValuesRef<'_, OsString>, ) -> UResult<()> { - for (i, input) in non_option_arguments.into_iter().enumerate() { + for (i, input) in arguments_after_options.enumerate() { let Some(bytes) = bytes_from_os_string(input.as_os_str()) else { return Err(USimpleError::new( 1, diff --git a/tests/by-util/test_echo.rs b/tests/by-util/test_echo.rs index d8f56fdcaef..3892bb52098 100644 --- a/tests/by-util/test_echo.rs +++ b/tests/by-util/test_echo.rs @@ -383,6 +383,13 @@ fn non_utf_8() { .arg("-n") .arg(os_str) .succeeds() - .stdout_is_bytes(INPUT_AND_OUTPUT) - .no_stderr(); + .stdout_only_bytes(INPUT_AND_OUTPUT); +} + +#[test] +fn slash_eight_off_by_one() { + new_ucmd!() + .args(&["-e", "-n", r"\8"]) + .succeeds() + .stdout_only(r"\8"); } From 635f360f21de0fe8fc9f4755724623c5a5eb9026 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 21 Oct 2024 12:41:25 -0500 Subject: [PATCH 09/10] Add MSRV TODO comments --- src/uu/echo/src/echo.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 60d52a9fb12..6d8c6242111 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -211,6 +211,7 @@ fn print_escaped(input: &[u8], output: &mut StdoutLock) -> io::Result= 1.79.0 // https://github.com/rust-lang/rust/pull/121346 + // TODO: when we have a MSRV >= 1.79.0, delete these "hold" bindings let hold_one_byte_outside_of_match: [u8; 1_usize]; let hold_two_bytes_outside_of_match: [u8; 2_usize]; @@ -229,8 +230,11 @@ fn print_escaped(input: &[u8], output: &mut StdoutLock) -> io::Result= 1.79.0 hold_one_byte_outside_of_match = [parsed_hexadecimal_number]; + // TODO: when we have a MSRV >= 1.79.0, return reference to a temporary array: + // &[parsed_hexadecimal_number] &hold_one_byte_outside_of_match } else { // "\x" with any non-hexadecimal digit after means "\x" is treated literally @@ -242,8 +246,11 @@ fn print_escaped(input: &[u8], output: &mut StdoutLock) -> io::Result= 1.79.0 hold_one_byte_outside_of_match = [parsed_octal_number]; + // TODO: when we have a MSRV >= 1.79.0, return reference to a temporary array: + // &[parsed_octal_number] &hold_one_byte_outside_of_match } else { // "\0" with any non-octal digit after it means "\0" is treated as ASCII '\0' (NUL), 0x00 From b4c057e5ab7d66da566e0a3e86447c7bdd6079fc Mon Sep 17 00:00:00 2001 From: Daniel Hofstetter Date: Tue, 22 Oct 2024 10:33:12 +0200 Subject: [PATCH 10/10] echo: use stdout_only_bytes instead of stdout_is_bytes --- tests/by-util/test_echo.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/by-util/test_echo.rs b/tests/by-util/test_echo.rs index 3892bb52098..47240c7c056 100644 --- a/tests/by-util/test_echo.rs +++ b/tests/by-util/test_echo.rs @@ -329,14 +329,12 @@ fn multibyte_escape_unicode() { new_ucmd!() .args(&["-e", r"\xf0\x41\x9f\x98\x82"]) .succeeds() - .stdout_is_bytes(b"\xF0A\x9F\x98\x82\n") - .no_stderr(); + .stdout_only_bytes(b"\xF0A\x9F\x98\x82\n"); new_ucmd!() .args(&["-e", r"\x41\xf0\c\x9f\x98\x82"]) .succeeds() - .stdout_is_bytes(b"A\xF0") - .no_stderr(); + .stdout_only_bytes(b"A\xF0"); } #[test] @@ -344,8 +342,7 @@ fn non_utf_8_hex_round_trip() { new_ucmd!() .args(&["-e", r"\xFF"]) .succeeds() - .stdout_is_bytes(b"\xFF\n") - .no_stderr(); + .stdout_only_bytes(b"\xFF\n"); } #[test] @@ -355,14 +352,12 @@ fn nine_bit_octal() { new_ucmd!() .args(&["-e", r"\0777"]) .succeeds() - .stdout_is_bytes(RESULT) - .no_stderr(); + .stdout_only_bytes(RESULT); new_ucmd!() .args(&["-e", r"\777"]) .succeeds() - .stdout_is_bytes(RESULT) - .no_stderr(); + .stdout_only_bytes(RESULT); } #[test]