Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/uu/cut/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ doctest = false

[dependencies]
clap = { workspace = true }
uucore = { workspace = true, features = ["ranges"] }
uucore = { workspace = true, features = ["ranges", "i18n-charmap"] }
memchr = { workspace = true }
bstr = { workspace = true }
fluent = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions src/uu/cut/locales/en-US.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ cut-help-complement = invert the filter - instead of displaying only the filtere
cut-help-only-delimited = in field mode, only print lines which contain the delimiter
cut-help-zero-terminated = instead of filtering columns based on line, filter columns based on \\0 (NULL character)
cut-help-output-delimiter = in field mode, replace the delimiter in output lines with this option's argument
cut-help-no-partial = with -b, don't output partial multi-byte characters

# Error messages
cut-error-is-directory = Is a directory
Expand Down
1 change: 1 addition & 0 deletions src/uu/cut/locales/fr-FR.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ cut-help-complement = inverser le filtre - au lieu d'afficher seulement les colo
cut-help-only-delimited = en mode champ, afficher seulement les lignes qui contiennent le délimiteur
cut-help-zero-terminated = au lieu de filtrer les colonnes basées sur la ligne, filtrer les colonnes basées sur \\0 (caractère NULL)
cut-help-output-delimiter = en mode champ, remplacer le délimiteur dans les lignes de sortie avec l'argument de cette option
cut-help-no-partial = avec -b, ne pas afficher les caractères multi-octets partiels

# Messages d'erreur
cut-error-is-directory = Est un répertoire
Expand Down
88 changes: 78 additions & 10 deletions src/uu/cut/src/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use std::io::{BufRead, BufReader, BufWriter, IsTerminal, Read, Write, stdin, std
use std::path::Path;
use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError, set_exit_code};
use uucore::i18n::charmap::{is_multibyte_locale, mb_char_len, mb_chars};
use uucore::line_ending::LineEnding;
use uucore::os_str_as_bytes;

Expand All @@ -29,6 +30,8 @@ struct Options<'a> {
out_delimiter: Option<&'a [u8]>,
line_ending: LineEnding,
field_opts: Option<FieldOptions<'a>>,
/// `-n`: with `-b`, do not split multi-byte characters across the selection.
suppress_split: bool,
}

enum Delimiter<'a> {
Expand Down Expand Up @@ -104,6 +107,61 @@ fn cut_bytes<R: Read, W: Write>(
Ok(())
}

/// Character-aware cutting for `-c` and for `-b` combined with `-n`.
///
/// Delegates to [`cut_bytes`] whenever plain byte semantics are sufficient:
/// the locale is not multi-byte, or the call is `-b` without `-n`. Otherwise
/// every record is walked character by character and a character is written
/// whole when its 1-based position lies inside a range. The position is the
/// character index when `by_char` is true (`-c`), and the offset of the
/// character's last byte when it is false (`-b -n`, matching GNU).
///
/// An explicitly requested output delimiter is written between the output of
/// successive ranges that produced at least one character; each record is
/// terminated with the configured line ending.
///
/// # Errors
///
/// Any I/O error raised while reading or writing is reported as a
/// `USimpleError` with exit code 1.
fn cut_chars<R: Read, W: Write>(
    reader: R,
    out: &mut W,
    ranges: &[Range],
    opts: &Options,
    by_char: bool,
) -> UResult<()> {
    // Character handling is only needed for `-c` or `-b -n`, and only when the
    // locale can actually contain multi-byte characters.
    let char_aware = by_char || opts.suppress_split;
    if !(is_multibyte_locale() && char_aware) {
        return cut_bytes(reader, out, ranges, opts);
    }

    let terminator: u8 = opts.line_ending.into();
    let separator = opts.out_delimiter.unwrap_or(b"\t");
    let mut input = BufReader::new(reader);

    input
        .for_byte_record(terminator, |record| {
            // Becomes true once a range has produced output, but only when an
            // output delimiter was explicitly requested.
            let mut need_separator = false;
            for range in ranges {
                let (low, high) = (range.low, range.high);
                let mut pos = 0;
                let mut emitted = false;
                for ch in mb_chars(record) {
                    // Advance by one character (`-c`) or to the offset of this
                    // character's last byte (`-b -n`).
                    pos += if by_char { 1 } else { ch.len() };
                    if pos > high {
                        break;
                    }
                    if pos < low {
                        continue;
                    }
                    if !emitted {
                        if need_separator {
                            out.write_all(separator)?;
                        } else {
                            need_separator = opts.out_delimiter.is_some();
                        }
                        emitted = true;
                    }
                    out.write_all(ch)?;
                }
            }
            out.write_all(&[terminator])?;
            Ok(true)
        })
        .map_err(|e| USimpleError::new(1, e.to_string()))
}

/// Output delimiter is explicitly specified
fn cut_fields_explicit_out_delim<R: Read, W: Write, M: Matcher>(
reader: R,
Expand Down Expand Up @@ -458,8 +516,8 @@ where
}

show_if_err!(match mode {
Mode::Bytes(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
Mode::Characters(ranges, opts) => cut_bytes(stdin(), &mut out, ranges, opts),
Mode::Bytes(ranges, opts) => cut_chars(stdin(), &mut out, ranges, opts, false),
Mode::Characters(ranges, opts) => cut_chars(stdin(), &mut out, ranges, opts, true),
Mode::Fields(ranges, opts) => cut_fields(stdin(), &mut out, ranges, opts),
});

Expand All @@ -482,8 +540,11 @@ where
.map_err_context(|| filename.maybe_quote().to_string())
.and_then(|file| {
match &mode {
Mode::Bytes(ranges, opts) | Mode::Characters(ranges, opts) => {
cut_bytes(file, &mut out, ranges, opts)
Mode::Bytes(ranges, opts) => {
cut_chars(file, &mut out, ranges, opts, false)
}
Mode::Characters(ranges, opts) => {
cut_chars(file, &mut out, ranges, opts, true)
}
Mode::Fields(ranges, opts) => cut_fields(file, &mut out, ranges, opts),
}
Expand Down Expand Up @@ -514,12 +575,14 @@ fn get_delimiters(matches: &ArgMatches) -> UResult<(Delimiter<'_>, Option<&[u8]>
if os_string.is_empty() {
Delimiter::Slice(b"\0")
} else {
// For delimiter `-d` option value - allow both UTF-8 (possibly multi-byte) characters
// and Non UTF-8 (and not ASCII) single byte "characters", like `b"\xAD"` to align with GNU behavior
// The delimiter must be a single character. We accept a single
// UTF-8 character (e.g. an emoji), a single byte (including a
// non-UTF-8 byte like `b"\xFF"`), or a single character of the
// current locale's encoding (e.g. a 2-byte GB18030 character).
let bytes = os_str_as_bytes(os_string)?;
if os_string.to_str().is_some_and(|s| s.chars().count() > 1)
|| os_string.to_str().is_none() && bytes.len() > 1
{
let single_utf8_char = os_string.to_str().is_some_and(|s| s.chars().count() == 1);
let single_locale_char = mb_char_len(bytes) == bytes.len();
if !single_utf8_char && !single_locale_char {
return Err(USimpleError::new(
1,
translate!("cut-error-delimiter-must-be-single-character"),
Expand Down Expand Up @@ -583,6 +646,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {

let (delimiter, out_delimiter) = get_delimiters(&matches)?;
let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO_TERMINATED));
let suppress_split = matches.get_flag(options::NOTHING);

// Only one, and only one of cutting mode arguments, i.e. `-b`, `-c`, `-f`,
// is expected. The number of those arguments is used for parsing a cutting
Expand Down Expand Up @@ -610,6 +674,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
out_delimiter,
line_ending,
field_opts: None,
suppress_split,
},
)
})
Expand All @@ -623,6 +688,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
out_delimiter,
line_ending,
field_opts: None,
suppress_split,
},
)
})
Expand All @@ -639,6 +705,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
delimiter,
only_delimited,
}),
suppress_split,
},
)
})
Expand Down Expand Up @@ -775,7 +842,8 @@ pub fn uu_app() -> Command {
.arg(
Arg::new(options::NOTHING)
.short('n')
.help("(ignored)")
.long("no-partial")
.help(translate!("cut-help-no-partial"))
.action(ArgAction::SetTrue),
)
}
29 changes: 26 additions & 3 deletions src/uucore/src/lib/features/i18n/charmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
use std::sync::OnceLock;

enum MbEncoding {
SingleByte,
Utf8,
Gb18030,
EucJp,
Expand All @@ -19,11 +20,12 @@ enum MbEncoding {

fn encoding_from_name(enc: &str) -> MbEncoding {
match enc {
"utf-8" | "utf8" => MbEncoding::Utf8,
"gb18030" | "gbk" | "gb2312" => MbEncoding::Gb18030,
"euc-jp" | "eucjp" => MbEncoding::EucJp,
"euc-kr" | "euckr" => MbEncoding::EucKr,
"big5" | "big5-hkscs" | "big5hkscs" | "euc-tw" | "euctw" => MbEncoding::Big5,
_ => MbEncoding::Utf8,
_ => MbEncoding::SingleByte,
}
}

Expand All @@ -35,7 +37,7 @@ fn get_encoding() -> &'static MbEncoding {
.find_map(|&k| std::env::var(k).ok().filter(|v| !v.is_empty()));
let s = match val.as_deref() {
Some(s) if s != "C" && s != "POSIX" => s,
_ => return MbEncoding::Utf8,
_ => return MbEncoding::SingleByte,
};
if let Some(enc) = s.split('.').nth(1) {
let enc = enc.split('@').next().unwrap_or(enc);
Expand All @@ -51,6 +53,12 @@ fn get_encoding() -> &'static MbEncoding {
})
}

/// Reports whether the current locale's encoding is multi-byte
/// (i.e. `MB_CUR_MAX > 1`).
///
/// Returns `false` for `C`/`POSIX` and single-byte encodings, all of which
/// resolve to `MbEncoding::SingleByte`, and `true` for every other encoding.
pub fn is_multibyte_locale() -> bool {
    // Spelled out exhaustively so that adding a new encoding forces a decision here.
    match get_encoding() {
        MbEncoding::SingleByte => false,
        MbEncoding::Utf8
        | MbEncoding::Gb18030
        | MbEncoding::EucJp
        | MbEncoding::EucKr
        | MbEncoding::Big5 => true,
    }
}

/// Byte length of the first character in `bytes` under the current locale encoding.
pub fn mb_char_len(bytes: &[u8]) -> usize {
debug_assert!(!bytes.is_empty());
Expand All @@ -59,14 +67,29 @@ pub fn mb_char_len(bytes: &[u8]) -> usize {
return 1;
}
match get_encoding() {
MbEncoding::Utf8 => utf8_len(bytes, b0),
// `C`/`POSIX` and unknown encodings have `MB_CUR_MAX == 1`, but we still
// decode UTF-8 there as a sensible default for byte-length detection.
MbEncoding::SingleByte | MbEncoding::Utf8 => utf8_len(bytes, b0),
MbEncoding::Gb18030 => gb18030_len(bytes, b0),
MbEncoding::EucJp => eucjp_len(bytes, b0),
MbEncoding::EucKr => euckr_len(bytes, b0),
MbEncoding::Big5 => big5_len(bytes, b0),
}
}

/// Iterate over the characters of `bytes` under the current locale encoding,
/// yielding each character as a byte slice borrowed from `bytes`.
///
/// Character boundaries come from [`mb_char_len`]. Invalid bytes are yielded
/// one at a time, so the concatenation of all items is always `bytes`. An
/// empty input yields no items.
///
/// The reported length is clamped to `1..=remaining`, so the iterator always
/// makes progress and never slices past the end of the input, even if the
/// length helper reports more bytes than are left (for example on a truncated
/// trailing sequence). In that case the remaining bytes are yielded as one
/// final item.
pub fn mb_chars(bytes: &[u8]) -> impl Iterator<Item = &[u8]> {
    let mut rest = bytes;
    std::iter::from_fn(move || {
        if rest.is_empty() {
            return None;
        }
        // `rest` is non-empty here, so `1 <= rest.len()` and the clamp is well-formed.
        let len = mb_char_len(rest).clamp(1, rest.len());
        let (ch, tail) = rest.split_at(len);
        rest = tail;
        Some(ch)
    })
}

// All helpers below assume b0 > 0x7F (ASCII already handled by caller).

fn utf8_len(b: &[u8], b0: u8) -> usize {
Expand Down
Loading
Loading