uutils · sylvestre · Jun 4, 2026
diff --git a/src/uu/cut/Cargo.toml b/src/uu/cut/Cargo.toml
@@ -20,7 +20,7 @@ doctest = false
 
 [dependencies]
 clap = { workspace = true }
-uucore = { workspace = true, features = ["ranges"] }
+uucore = { workspace = true, features = ["ranges", "i18n-charmap"] }
 memchr = { workspace = true }
 bstr = { workspace = true }
 fluent = { workspace = true }

diff --git a/src/uu/cut/locales/en-US.ftl b/src/uu/cut/locales/en-US.ftl
@@ -101,6 +101,7 @@ cut-help-complement = invert the filter - instead of displaying only the filtere
 cut-help-only-delimited = in field mode, only print lines which contain the delimiter
 cut-help-zero-terminated = instead of filtering columns based on line, filter columns based on \\0 (NULL character)
 cut-help-output-delimiter = in field mode, replace the delimiter in output lines with this option's argument
+cut-help-no-partial = with -b, don't output partial multi-byte characters
 
 # Error messages
 cut-error-is-directory = Is a directory

diff --git a/src/uu/cut/locales/fr-FR.ftl b/src/uu/cut/locales/fr-FR.ftl
@@ -101,6 +101,7 @@ cut-help-complement = inverser le filtre - au lieu d'afficher seulement les colo
 cut-help-only-delimited = en mode champ, afficher seulement les lignes qui contiennent le délimiteur
 cut-help-zero-terminated = au lieu de filtrer les colonnes basées sur la ligne, filtrer les colonnes basées sur \\0 (caractère NULL)
 cut-help-output-delimiter = en mode champ, remplacer le délimiteur dans les lignes de sortie avec l'argument de cette option
+cut-help-no-partial = avec -b, ne pas afficher les caractères multi-octets partiels
 
 # Messages d'erreur
 cut-error-is-directory = Est un répertoire

diff --git a/src/uu/cut/src/cut.rs b/src/uu/cut/src/cut.rs
@@ -13,6 +13,7 @@ use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Read, Write, stdin, std
 use std::path::Path;
 use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError, set_exit_code};
+use uucore::i18n::charmap::{is_multibyte_locale, mb_char_len, mb_chars};
 use uucore::line_ending::LineEnding;
 use uucore::os_str_as_bytes;
 
@@ -29,6 +30,8 @@ struct Options<'a> {
     out_delimiter: Option<&'a [u8]>,
     line_ending: LineEnding,
     field_opts: Option<FieldOptions<'a>>,
+    /// `-n`: with `-b`, do not split multi-byte characters across the selection.
+    suppress_split: bool,
 }
 
 enum Delimiter<'a> {
@@ -104,6 +107,61 @@ fn cut_bytes<R: Read, W: Write>(
     Ok(())
 }
 
+/// Cut by `-c` (whole characters) or by `-b -n` (bytes, keeping characters whole).
+///
+/// Delegates to `cut_bytes` in a single-byte locale or for `-b` without `-n`.
+/// Otherwise a character is written whole when its 1-based position is in a range:
+/// the character index for `-c`, the last byte's offset for `-b -n` (as GNU does).
+fn cut_chars<R: Read, W: Write>(
+    reader: R,
+    out: &mut W,
+    ranges: &[Range],
+    opts: &Options,
+    by_char: bool,
+) -> UResult<()> {
+    let whole_chars = by_char || opts.suppress_split;
+    if !(is_multibyte_locale() && whole_chars) {
+        return cut_bytes(reader, out, ranges, opts);
+    }
+
+    let terminator = opts.line_ending.into();
+    let separator = opts.out_delimiter.unwrap_or(b"\t");
+    let mut input = BufReader::new(reader);
+
+    let outcome = input.for_byte_record(terminator, |record| {
+        let mut need_separator = false; // armed only if an output delimiter was given
+        for range in ranges {
+            let mut position = 0;
+            let mut range_started = false;
+            for ch in mb_chars(record) {
+                position += if by_char { 1 } else { ch.len() };
+                if position > range.high {
+                    break;
+                }
+                if position < range.low {
+                    continue;
+                }
+                if !range_started {
+                    range_started = true;
+                    if need_separator {
+                        out.write_all(separator)?;
+                    } else {
+                        need_separator = opts.out_delimiter.is_some();
+                    }
+                }
+                out.write_all(ch)?;
+            }
+        }
+        out.write_all(&[terminator])?;
+        Ok(true)
+    });
+
+    match outcome {
+        Ok(_) => Ok(()),
+        Err(e) => Err(USimpleError::new(1, e.to_string())),
+    }
+}
+
 /// Output delimiter is explicitly specified
 fn cut_fields_explicit_out_delim<R: Read, W: Write, M: Matcher>(
     reader: R,
@@ -458,8 +516,8 @@ where
             }
 
             show_if_err!(match mode {
-                Mode::Bytes(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
-                Mode::Characters(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
+                Mode::Bytes(ranges, opts) => cut_chars(stdin(), &mut out, ranges, opts, false),
+                Mode::Characters(ranges, opts) => cut_chars(stdin(), &mut out, ranges, opts, true),
                 Mode::Fields(ranges, opts) => cut_fields(stdin(), &mut out, ranges, opts),
             });
 
@@ -482,8 +540,11 @@ where
                     .map_err_context(|| filename.maybe_quote().to_string())
                     .and_then(|file| {
                         match &mode {
-                            Mode::Bytes(ranges, opts) | Mode::Characters(ranges, opts) => {
-                                cut_bytes(file, &mut out, ranges, opts)
+                            Mode::Bytes(ranges, opts) => {
+                                cut_chars(file, &mut out, ranges, opts, false)
+                            }
+                            Mode::Characters(ranges, opts) => {
+                                cut_chars(file, &mut out, ranges, opts, true)
                             }
                             Mode::Fields(ranges, opts) => cut_fields(file, &mut out, ranges, opts),
                         }
@@ -514,12 +575,14 @@ fn get_delimiters(matches: &ArgMatches) -> UResult<(Delimiter<'_>, Option<&[u8]>
             if os_string.is_empty() {
                 Delimiter::Slice(b"\0")
             } else {
-                // For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters
-                // and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior
+                // The delimiter must be a single character. We accept a single
+                // UTF-8 character (e.g. an emoji), a single byte (including a
+                // non-UTF-8 byte like `b"\xFF"`), or a single character of the
+                // current locale's encoding (e.g. a 2-byte GB18030 character).
                 let bytes = os_str_as_bytes(os_string)?;
-                if os_string.to_str().is_some_and(|s| s.chars().count() > 1)
-                    || os_string.to_str().is_none() && bytes.len() > 1
-                {
+                let single_utf8_char = os_string.to_str().is_some_and(|s| s.chars().count() == 1);
+                let single_locale_char = mb_char_len(bytes) == bytes.len();
+                if !single_utf8_char && !single_locale_char {
                     return Err(USimpleError::new(
                         1,
                         translate!("cut-error-delimiter-must-be-single-character"),
@@ -583,6 +646,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
 
     let (delimiter, out_delimiter) = get_delimiters(&matches)?;
     let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
+    let suppress_split = matches.get_flag(options::NOTHING);
 
     // Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`,
     // is expected. The number of those arguments is used for parsing a cutting
@@ -610,6 +674,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                         out_delimiter,
                         line_ending,
                         field_opts: None,
+                        suppress_split,
                     },
                 )
             })
@@ -623,6 +688,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                         out_delimiter,
                         line_ending,
                         field_opts: None,
+                        suppress_split,
                     },
                 )
             })
@@ -639,6 +705,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
                             delimiter,
                             only_delimited,
                         }),
+                        suppress_split,
                     },
                 )
             })
@@ -775,7 +842,8 @@ pub fn uu_app() -> Command {
         .arg(
             Arg::new(options::NOTHING)
                 .short('n')
-                .help("(ignored)")
+                .long("no-partial")
+                .help(translate!("cut-help-no-partial"))
                 .action(ArgAction::SetTrue),
         )
 }
diff --git a/src/uucore/src/lib/features/i18n/charmap.rs b/src/uucore/src/lib/features/i18n/charmap.rs
@@ -10,6 +10,7 @@
 use std::sync::OnceLock;
 
 enum MbEncoding {
+    SingleByte,
     Utf8,
     Gb18030,
     EucJp,
@@ -19,11 +20,12 @@ enum MbEncoding {
 
 fn encoding_from_name(enc: &str) -> MbEncoding {
     match enc {
+        "utf-8" | "utf8" => MbEncoding::Utf8, // assumes lowercased `enc` — TODO confirm
         "gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030,
         "euc-jp" | "eucjp" => MbEncoding::EucJp,
         "euc-kr" | "euckr" => MbEncoding::EucKr,
         "big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw" => MbEncoding::Big5,
-        _ => MbEncoding::Utf8,
+        _ => MbEncoding::SingleByte, // unrecognised names are treated as single-byte
     }
 }
 
@@ -35,7 +37,7 @@ fn get_encoding() -> &'static MbEncoding {
             .find_map(|&k| std::env::var(k).ok().filter(|v| !v.is_empty()));
         let s = match val.as_deref() {
             Some(s) if s != "C" && s != "POSIX" => s,
-            _ => return MbEncoding::Utf8,
+            _ => return MbEncoding::SingleByte,
         };
         if let Some(enc) = s.split('.').nth(1) {
             let enc = enc.split('@').next().unwrap_or(enc);
@@ -51,6 +53,12 @@ fn get_encoding() -> &'static MbEncoding {
     })
 }
 
+/// Whether the locale is multi-byte (`MB_CUR_MAX > 1`): `false` for `C`/`POSIX`/single-byte.
+pub fn is_multibyte_locale() -> bool {
+    let single_byte = matches!(get_encoding(), MbEncoding::SingleByte);
+    !single_byte
+}
+
 /// Byte length of the first character in `bytes` under the current locale encoding.
 pub fn mb_char_len(bytes: &[u8]) -> usize {
     debug_assert!(!bytes.is_empty());
@@ -59,14 +67,29 @@ pub fn mb_char_len(bytes: &[u8]) -> usize {
         return 1;
     }
     match get_encoding() {
-        MbEncoding::Utf8 => utf8_len(bytes, b0),
+        // `C`/`POSIX` and unknown encodings have `MB_CUR_MAX == 1`, but we still
+        // decode UTF-8 there as a sensible default for byte-length detection.
+        MbEncoding::SingleByte | MbEncoding::Utf8 => utf8_len(bytes, b0),
         MbEncoding::Gb18030 => gb18030_len(bytes, b0),
         MbEncoding::EucJp => eucjp_len(bytes, b0),
         MbEncoding::EucKr => euckr_len(bytes, b0),
         MbEncoding::Big5 => big5_len(bytes, b0),
     }
 }
 
+/// Iterate over the characters of `bytes` under the current locale encoding,
+/// yielding each as a byte slice. Invalid bytes are yielded one at a time and
+/// lengths are clamped to the input, so the items always concatenate to `bytes`.
+pub fn mb_chars(bytes: &[u8]) -> impl Iterator<Item = &[u8]> {
+    let mut idx = 0;
+    std::iter::from_fn(move || {
+        let rest = bytes.get(idx..).filter(|r| !r.is_empty())?;
+        let len = mb_char_len(rest).clamp(1, rest.len()); // advance, but stay in bounds
+        idx += len;
+        Some(&rest[..len])
+    })
+}
+
 // All helpers below assume b0 > 0x7F (ASCII already handled by caller).
 
 fn utf8_len(b: &[u8], b0: u8) -> usize {