diff --git a/src/uu/cut/Cargo.toml b/src/uu/cut/Cargo.toml
index 66165a48e83..16b6e03d567 100644
--- a/src/uu/cut/Cargo.toml
+++ b/src/uu/cut/Cargo.toml
@@ -20,7 +20,7 @@ doctest = false
 
 [dependencies]
 clap = { workspace = true }
-uucore = { workspace = true, features = ["ranges"] }
+uucore = { workspace = true, features = ["ranges", "i18n-charmap"] }
 memchr = { workspace = true }
 bstr = { workspace = true }
 fluent = { workspace = true }
diff --git a/src/uu/cut/locales/en-US.ftl b/src/uu/cut/locales/en-US.ftl
index d320fc86d11..e2a9bce2a1a 100644
--- a/src/uu/cut/locales/en-US.ftl
+++ b/src/uu/cut/locales/en-US.ftl
@@ -101,6 +101,7 @@ cut-help-complement = invert the filter - instead of displaying only the filtere
 cut-help-only-delimited = in field mode, only print lines which contain the delimiter
 cut-help-zero-terminated = instead of filtering columns based on line, filter columns based on \\0 (NULL character)
 cut-help-output-delimiter = in field mode, replace the delimiter in output lines with this option's argument
+cut-help-no-partial = with -b, don't output partial multi-byte characters
 
 # Error messages
 cut-error-is-directory = Is a directory
diff --git a/src/uu/cut/locales/fr-FR.ftl b/src/uu/cut/locales/fr-FR.ftl
index a95773099d6..be73c47b0ac 100644
--- a/src/uu/cut/locales/fr-FR.ftl
+++ b/src/uu/cut/locales/fr-FR.ftl
@@ -101,6 +101,7 @@ cut-help-complement = inverser le filtre - au lieu d'afficher seulement les colo
 cut-help-only-delimited = en mode champ, afficher seulement les lignes qui contiennent le délimiteur
 cut-help-zero-terminated = au lieu de filtrer les colonnes basées sur la ligne, filtrer les colonnes basées sur \\0 (caractère NULL)
 cut-help-output-delimiter = en mode champ, remplacer le délimiteur dans les lignes de sortie avec l'argument de cette option
+cut-help-no-partial = avec -b, ne pas afficher les caractères multi-octets partiels
 
 # Messages d'erreur
 cut-error-is-directory = Est un répertoire
diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs
index 1e0cf90bcd8..616fb404baf 100644
--- a/src/uu/cut/src/cut.rs
+++ b/src/uu/cut/src/cut.rs
@@ -13,6 +13,7 @@ use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Read, Write, stdin, std
 use std::path::Path;
 use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError, set_exit_code};
+use uucore::i18n::charmap::{is_multibyte_locale, mb_char_len, mb_chars};
 use uucore::line_ending::LineEnding;
 use uucore::os_str_as_bytes;
 
@@ -29,6 +30,8 @@ struct Options<'a> {
     out_delimiter: Option<&'a [u8]>,
     line_ending: LineEnding,
     field_opts: Option<FieldOptions<'a>>,
+    /// `-n`: with `-b`, do not split multi-byte characters across the selection.
+    suppress_split: bool,
 }
 
 enum Delimiter<'a> {
@@ -104,6 +107,61 @@ fn cut_bytes<R: Read, W: Write>(
     Ok(())
 }
 
+/// Cut `-c` (whole characters) or `-b -n` (bytes, keeping whole characters).
+///
+/// In a single-byte locale, or for `-b` without `-n`, this falls back to the
+/// plain byte path. Otherwise each character is emitted whole when its 1-based
+/// position falls in a range: the character index for `-c`, or the offset of
+/// its last byte for `-b -n` (matching GNU).
+fn cut_chars<R: Read, W: Write>(
+    reader: R,
+    out: &mut W,
+    ranges: &[Range],
+    opts: &Options,
+    by_char: bool,
+) -> UResult<()> {
+    if !is_multibyte_locale() || !(by_char || opts.suppress_split) {
+        return cut_bytes(reader, out, ranges, opts);
+    }
+
+    let newline_char = opts.line_ending.into();
+    let mut buf_in = BufReader::new(reader);
+    let out_delim = opts.out_delimiter.unwrap_or(b"\t");
+
+    let result = buf_in.for_byte_record(newline_char, |line| {
+        let mut print_delim = false;
+        for &Range { low, high } in ranges {
+            let mut pos = 0;
+            let mut wrote = false;
+            for ch in mb_chars(line) {
+                pos += if by_char { 1 } else { ch.len() };
+                if pos > high {
+                    break;
+                }
+                if pos >= low {
+                    if !wrote {
+                        if print_delim {
+                            out.write_all(out_delim)?;
+                        } else if opts.out_delimiter.is_some() {
+                            print_delim = true;
+                        }
+                        wrote = true;
+                    }
+                    out.write_all(ch)?;
+                }
+            }
+        }
+        out.write_all(&[newline_char])?;
+        Ok(true)
+    });
+
+    if let Err(e) = result {
+        return Err(USimpleError::new(1, e.to_string()));
+    }
+
+    Ok(())
+}
+
 /// Output delimiter is explicitly specified
 fn cut_fields_explicit_out_delim<R: Read, W: Write>(
     reader: R,
@@ -458,8 +516,8 @@ where
             }
 
             show_if_err!(match mode {
-                Mode::Bytes(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
-                Mode::Characters(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
+                Mode::Bytes(ranges, opts) => cut_chars(stdin(), &mut out, ranges, opts, false),
+                Mode::Characters(ranges, opts) => cut_chars(stdin(), &mut out, ranges, opts, true),
                 Mode::Fields(ranges, opts) => cut_fields(stdin(), &mut out, ranges, opts),
             });
 
@@ -482,8 +540,11 @@ where
                     .map_err_context(|| filename.maybe_quote().to_string())
                     .and_then(|file| {
                         match &mode {
-                            Mode::Bytes(ranges, opts) | Mode::Characters(ranges, opts) => {
-                                cut_bytes(file, &mut out, ranges, opts)
+                            Mode::Bytes(ranges, opts) => {
+                                cut_chars(file, &mut out, ranges, opts, false)
+                            }
+                            Mode::Characters(ranges, opts) => {
+                                cut_chars(file, &mut out, ranges, opts, true)
                             }
                             Mode::Fields(ranges, opts) => cut_fields(file, &mut out, ranges, opts),
                         }
@@ -514,12 +575,14 @@ fn get_delimiters(matches: &ArgMatches) -> UResult<(Delimiter<'_>, Option<&[u8]>
             if os_string.is_empty() {
                 Delimiter::Slice(b"\0")
             } else {
-                // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters
-                // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior
+                // The delimiter must be a single character. We accept a single
+                // UTF-8 character (e.g. an emoji), a single byte (including a
+                // non-UTF-8 byte like `b"\xFF"`), or a single character of the
+                // current locale's encoding (e.g. a 2-byte GB18030 character).
                 let bytes = os_str_as_bytes(os_string)?;
-                if os_string.to_str().is_some_and(|s| s.chars().count() > 1)
-                    || os_string.to_str().is_none() && bytes.len() > 1
-                {
+                let single_utf8_char = os_string.to_str().is_some_and(|s| s.chars().count() == 1);
+                let single_locale_char = mb_char_len(bytes) == bytes.len();
+                if !single_utf8_char && !single_locale_char {
                     return Err(USimpleError::new(
                         1,
                         translate!("cut-error-delimiter-must-be-single-character"),
@@ -583,6 +646,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
 
     let (delimiter, out_delimiter) = get_delimiters(&matches)?;
     let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
+    let suppress_split = matches.get_flag(options::NOTHING);
 
     // Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`,
     // is expected. The number of those arguments is used for parsing a cutting
@@ -610,6 +674,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                     out_delimiter,
                     line_ending,
                     field_opts: None,
+                    suppress_split,
                 },
             )
         })
@@ -623,6 +688,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                     out_delimiter,
                     line_ending,
                     field_opts: None,
+                    suppress_split,
                 },
             )
         })
@@ -639,6 +705,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                             delimiter,
                             only_delimited,
                         }),
+                        suppress_split,
                     },
                 )
             })
@@ -775,7 +842,8 @@ pub fn uu_app() -> Command {
         .arg(
             Arg::new(options::NOTHING)
                 .short('n')
-                .help("(ignored)")
+                .long("no-partial")
+                .help(translate!("cut-help-no-partial"))
                 .action(ArgAction::SetTrue),
         )
 }
diff --git a/src/uucore/src/lib/features/i18n/charmap.rs b/src/uucore/src/lib/features/i18n/charmap.rs
index 2ec99229bc8..f1d60e3055b 100644
--- a/src/uucore/src/lib/features/i18n/charmap.rs
+++ b/src/uucore/src/lib/features/i18n/charmap.rs
@@ -10,6 +10,7 @@
 use std::sync::OnceLock;
 
 enum MbEncoding {
+    SingleByte,
     Utf8,
     Gb18030,
     EucJp,
@@ -19,11 +20,12 @@ enum MbEncoding {
 
 fn encoding_from_name(enc: &str) -> MbEncoding {
     match enc {
+        "utf-8" | "utf8" => MbEncoding::Utf8,
         "gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030,
         "euc-jp" | "eucjp" => MbEncoding::EucJp,
         "euc-kr" | "euckr" => MbEncoding::EucKr,
         "big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw" => MbEncoding::Big5,
-        _ => MbEncoding::Utf8,
+        _ => MbEncoding::SingleByte,
     }
 }
 
@@ -35,7 +37,7 @@ fn get_encoding() -> &'static MbEncoding {
             .find_map(|&k| std::env::var(k).ok().filter(|v| !v.is_empty()));
         let s = match val.as_deref() {
             Some(s) if s != "C" && s != "POSIX" => s,
-            _ => return MbEncoding::Utf8,
+            _ => return MbEncoding::SingleByte,
         };
         if let Some(enc) = s.split('.').nth(1) {
             let enc = enc.split('@').next().unwrap_or(enc);
@@ -51,6 +53,12 @@ fn get_encoding() -> &'static MbEncoding {
     })
 }
 
+/// Whether the current locale uses a multi-byte encoding (i.e. `MB_CUR_MAX > 1`).
+/// `C`/`POSIX` and single-byte encodings return `false`.
+pub fn is_multibyte_locale() -> bool {
+    !matches!(get_encoding(), MbEncoding::SingleByte)
+}
+
 /// Byte length of the first character in `bytes` under the current locale encoding.
 pub fn mb_char_len(bytes: &[u8]) -> usize {
     debug_assert!(!bytes.is_empty());
@@ -59,7 +67,9 @@ pub fn mb_char_len(bytes: &[u8]) -> usize {
         return 1;
     }
     match get_encoding() {
-        MbEncoding::Utf8 => utf8_len(bytes, b0),
+        // `C`/`POSIX` and unknown encodings have `MB_CUR_MAX == 1`, but we still
+        // decode UTF-8 there as a sensible default for byte-length detection.
+        MbEncoding::SingleByte | MbEncoding::Utf8 => utf8_len(bytes, b0),
         MbEncoding::Gb18030 => gb18030_len(bytes, b0),
         MbEncoding::EucJp => eucjp_len(bytes, b0),
         MbEncoding::EucKr => euckr_len(bytes, b0),
@@ -67,6 +77,19 @@ pub fn mb_char_len(bytes: &[u8]) -> usize {
     }
 }
 
+/// Iterate over the characters of `bytes` under the current locale encoding,
+/// yielding each character as a byte slice. Invalid bytes are yielded one at a
+/// time, so the concatenation of all items is always `bytes`.
+pub fn mb_chars(bytes: &[u8]) -> impl Iterator<Item = &[u8]> {
+    let mut idx = 0;
+    std::iter::from_fn(move || {
+        let rest = bytes.get(idx..).filter(|r| !r.is_empty())?;
+        let len = mb_char_len(rest).clamp(1, rest.len()); // always advance, never slice past the end
+        idx += len;
+        Some(&bytes[idx - len..idx])
+    })
+}
+
 // All helpers below assume b0 > 0x7F (ASCII already handled by caller).
 
 fn utf8_len(b: &[u8], b0: u8) -> usize {
diff --git a/tests/by-util/test_cut.rs b/tests/by-util/test_cut.rs
index 4b565ea6f43..8bbe8890e96 100644
--- a/tests/by-util/test_cut.rs
+++ b/tests/by-util/test_cut.rs
@@ -3,7 +3,7 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 
-// spell-checker:ignore defg
+// spell-checker:ignore defg héllo hllo
 
 use uutests::at_and_ucmd;
 use uutests::new_ucmd;
@@ -661,3 +661,174 @@ fn test_cut_non_utf8_paths() {
         .succeeds()
         .stdout_only("a\tc\n1\t3\n");
 }
+
+// `0xA2 0xE3` is a valid 2-byte GB18030 character (and not valid UTF-8).
+// The encoding is detected from `LC_ALL` directly, so these tests do not
+// require the `zh_CN.gb18030` locale to be installed on the system.
+#[cfg(target_os = "linux")]
+const GB18030_LOCALE: &str = "zh_CN.gb18030";
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_multibyte_delimiter() {
+    use std::ffi::OsString;
+    use std::os::unix::ffi::OsStringExt;
+
+    let delim = OsString::from_vec(vec![0xA2, 0xE3]);
+    // 1<delim>2<delim>3 -> fields 2,3 joined with `_`
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .arg("-d")
+        .arg(&delim)
+        .args(&["-f2,3", "--output-delimiter=_"])
+        .pipe_in(b"1\xA2\xE32\xA2\xE33\n".to_vec())
+        .succeeds()
+        .stdout_only("2_3\n");
+
+    // -f1,3 keeps the multibyte delimiter as the output delimiter
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .arg("-d")
+        .arg(&delim)
+        .arg("-f1,3")
+        .pipe_in(b"1\xA2\xE32\xA2\xE33\n".to_vec())
+        .succeeds()
+        .stdout_only_bytes(b"1\xA2\xE33\n");
+
+    // --complement -f1
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .arg("--complement")
+        .arg("-d")
+        .arg(&delim)
+        .arg("-f1")
+        .pipe_in(b"1\xA2\xE32\xA2\xE33\n".to_vec())
+        .succeeds()
+        .stdout_only_bytes(b"2\xA2\xE33\n");
+
+    // empty fields with a multibyte delimiter
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .arg("-d")
+        .arg(&delim)
+        .args(&["-f1-3", "--output-delimiter=:"])
+        .pipe_in(b"\xA2\xE3\xA2\xE33\n".to_vec())
+        .succeeds()
+        .stdout_only("::3\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_single_byte_delimiter() {
+    use std::ffi::OsString;
+    use std::os::unix::ffi::OsStringExt;
+
+    // 0xFF is invalid in GB18030, but any single byte is a valid delimiter.
+    let delim = OsString::from_vec(vec![0xFF]);
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .arg("-d")
+        .arg(&delim)
+        .args(&["-f2,3", "--output-delimiter=_"])
+        .pipe_in(b"1\xFF2\xFF3\n".to_vec())
+        .succeeds()
+        .stdout_only("2_3\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_characters() {
+    // -c1 selects the whole first (multibyte) character, -c2 the next one.
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .arg("-c1")
+        .pipe_in(b"\xA2\xE3x\n".to_vec())
+        .succeeds()
+        .stdout_only_bytes(b"\xA2\xE3\n");
+
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .arg("-c2")
+        .pipe_in(b"\xA2\xE3x\n".to_vec())
+        .succeeds()
+        .stdout_only("x\n");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_bytes_no_split() {
+    // With -n, a multibyte character is emitted only when its last byte is
+    // selected, and it is then emitted whole.
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .args(&["-b1", "-n"])
+        .pipe_in(b"\xA2\xE3x\n".to_vec())
+        .succeeds()
+        .stdout_only("\n");
+
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .args(&["-b2", "-n"])
+        .pipe_in(b"\xA2\xE3x\n".to_vec())
+        .succeeds()
+        .stdout_only_bytes(b"\xA2\xE3\n");
+}
+
+// In a UTF-8 locale `-c` cuts whole characters too (this is the common path,
+// but the test harness runs under `LC_ALL=C` so it must be set explicitly).
+// `h\xc3\xa9llo` is "héllo": 'h', 'é' (2 bytes), 'l', 'l', 'o'.
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_utf8_characters() {
+    // -c2 selects the whole 2-byte 'é', not a single byte of it.
+    new_ucmd!()
+        .env("LC_ALL", "en_US.UTF-8")
+        .arg("-c2")
+        .pipe_in("héllo\n".as_bytes().to_vec())
+        .succeeds()
+        .stdout_only("é\n");
+
+    // Two ranges joined with an output delimiter; the 2-byte 'é' between them is skipped whole.
+    new_ucmd!()
+        .env("LC_ALL", "en_US.UTF-8")
+        .args(&["-c1,3", "--output-delimiter=_"])
+        .pipe_in("héllo\n".as_bytes().to_vec())
+        .succeeds()
+        .stdout_only("h_l\n");
+
+    // --complement drops the whole 'é', keeping the rest.
+    new_ucmd!()
+        .env("LC_ALL", "en_US.UTF-8")
+        .args(&["--complement", "-c2"])
+        .pipe_in("héllo\n".as_bytes().to_vec())
+        .succeeds()
+        .stdout_only("hllo\n");
+}
+
+// Multiple ranges and --complement in char mode for a non-UTF-8 multibyte
+// locale, exercising the per-range output-delimiter logic in `cut_chars`.
+#[test]
+#[cfg(target_os = "linux")]
+#[cfg_attr(wasi_runner, ignore = "WASI: argv must be valid UTF-8")]
+fn test_gb18030_characters_ranges() {
+    // -c1,3 keeps the first and third characters joined with `_`.
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .args(&["-c1,3", "--output-delimiter=_"])
+        .pipe_in(b"\xA2\xE3x\xA2\xE3\n".to_vec())
+        .succeeds()
+        .stdout_only_bytes(b"\xA2\xE3_\xA2\xE3\n");
+
+    // --complement -c1 drops the leading multibyte character.
+    new_ucmd!()
+        .env("LC_ALL", GB18030_LOCALE)
+        .args(&["--complement", "-c1"])
+        .pipe_in(b"\xA2\xE3x\xA2\xE3\n".to_vec())
+        .succeeds()
+        .stdout_only_bytes(b"x\xA2\xE3\n");
+}