Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

cut: refactor #4255

Merged
merged 2 commits into from
Jan 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 65 additions & 128 deletions src/uu/cut/src/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError};

use self::searcher::Searcher;
use self::whitespace_searcher::WhitespaceSearcher;
use matcher::{ExactMatcher, Matcher, WhitespaceMatcher};
use uucore::ranges::Range;
use uucore::{format_usage, show, show_error, show_if_err};

mod matcher;
mod searcher;
mod whitespace_searcher;

static USAGE: &str =
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
Expand Down Expand Up @@ -188,23 +188,22 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()
Ok(())
}

#[allow(clippy::cognitive_complexity)]
fn cut_fields_delimiter<R: Read>(
// Output delimiter is explicitly specified
fn cut_fields_explicit_out_delim<R: Read, M: Matcher>(
reader: R,
matcher: &M,
ranges: &[Range],
delim: &str,
only_delimited: bool,
newline_char: u8,
out_delim: &str,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let input_delim_len = delim.len();

let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, delim.as_bytes()).peekable();
let mut delim_search = Searcher::new(matcher, line).peekable();
let mut print_delim = false;

if delim_search.peek().is_none() {
Expand All @@ -220,29 +219,35 @@ fn cut_fields_delimiter<R: Read>(

for &Range { low, high } in ranges {
if low - fields_pos > 0 {
// current field is not in the range, so jump to the field corresponding to the
// beginning of the range if any
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some(index) => index + input_delim_len,
Some((_, last)) => last,
None => break,
};
}

// at this point, current field is the first in the range
for _ in 0..=high - low {
// skip printing delimiter if this is the first matching field for this line
if print_delim {
out.write_all(out_delim.as_bytes())?;
} else {
print_delim = true;
}

match delim_search.next() {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];
// print the current field up to the next field delim
Some((first, last)) => {
let segment = &line[low_idx..first];

out.write_all(segment)?;

low_idx = high_idx + input_delim_len;
low_idx = last;
fields_pos = high + 1;
}
None => {
// this is the last field in the line, so print the rest
let segment = &line[low_idx..];

out.write_all(segment)?;
Expand All @@ -267,20 +272,21 @@ fn cut_fields_delimiter<R: Read>(
Ok(())
}

fn cut_fields_whitespace<R: Read>(
// Output delimiter is the same as input delimiter
fn cut_fields_implicit_out_delim<R: Read, M: Matcher>(
reader: R,
matcher: &M,
ranges: &[Range],
only_delimited: bool,
newline_char: u8,
out_delim: &str,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();

let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = WhitespaceSearcher::new(line).peekable();
let mut delim_search = Searcher::new(matcher, line).peekable();
let mut print_delim = false;

if delim_search.peek().is_none() {
Expand All @@ -293,54 +299,38 @@ fn cut_fields_whitespace<R: Read>(

return Ok(true);
}
// The logic is identical to `cut_fields_delimiter` function above, which uses
// `Searcher` that iterates over and returns the first position of the delimiter character.
// The main difference is that `WhitespaceSearcher` returns a pair of the first and last
// delimiter character positions, since each delimiter sequence length can vary.

for &Range { low, high } in ranges {
if low - fields_pos > 0 {
// current field is not in the range, so jump to the field corresponding to the
// beginning of the range if any
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some((_, last)) => last,
None => break,
};
}

// at this point, current field is the first in the range
for _ in 0..=high - low {
// skip printing delimiter if this is the first matching field for this line
if print_delim {
out.write_all(out_delim.as_bytes())?;
if let Some((first, last)) = delim_search.nth(low - fields_pos - 1) {
low_idx = if print_delim { first } else { last }
} else {
print_delim = true;
break;
}
}

match delim_search.next() {
// print the current field up to the next whitespace
Some((first, last)) => {
let segment = &line[low_idx..first];
match delim_search.nth(high - low) {
Some((first, _)) => {
let segment = &line[low_idx..first];

out.write_all(segment)?;
out.write_all(segment)?;

low_idx = last;
fields_pos = high + 1;
}
None => {
// this is the last field in the line, so print the rest
let segment = &line[low_idx..];
print_delim = true;
low_idx = first;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..line.len()];

out.write_all(segment)?;
out.write_all(segment)?;

if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
}
}

out.write_all(&[newline_char])?;
Ok(true)
});
Expand All @@ -355,90 +345,37 @@ fn cut_fields_whitespace<R: Read>(
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
match opts.delimiter {
Delimiter::Whitespace => cut_fields_whitespace(
reader,
ranges,
opts.only_delimited,
newline_char,
opts.out_delimiter.as_deref().unwrap_or("\t"),
),
Delimiter::String(ref delimiter) => {
if let Some(ref o_delim) = opts.out_delimiter {
return cut_fields_delimiter(
Delimiter::String(ref delim) => {
let matcher = ExactMatcher::new(delim.as_bytes());
match opts.out_delimiter {
Some(ref out_delim) => cut_fields_explicit_out_delim(
reader,
&matcher,
ranges,
delimiter,
opts.only_delimited,
newline_char,
o_delim,
);
}

let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let delim_len = delimiter.len();

let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable();
let mut print_delim = false;

if delim_search.peek().is_none() {
if !opts.only_delimited {
out.write_all(line)?;
if line[line.len() - 1] != newline_char {
out.write_all(&[newline_char])?;
}
}

return Ok(true);
}

for &Range { low, high } in ranges {
if low - fields_pos > 0 {
if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
low_idx = if print_delim {
delim_pos
} else {
delim_pos + delim_len
}
} else {
break;
}
}

match delim_search.nth(high - low) {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];

out.write_all(segment)?;

print_delim = true;
low_idx = high_idx;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..line.len()];

out.write_all(segment)?;

if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
}
}
out.write_all(&[newline_char])?;
Ok(true)
});

if let Err(e) = result {
return Err(USimpleError::new(1, e.to_string()));
out_delim,
),
None => cut_fields_implicit_out_delim(
reader,
&matcher,
ranges,
opts.only_delimited,
newline_char,
),
}

Ok(())
}
Delimiter::Whitespace => {
let matcher = WhitespaceMatcher {};
let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t");
cut_fields_explicit_out_delim(
reader,
&matcher,
ranges,
opts.only_delimited,
newline_char,
out_delim,
)
}
}
}
Expand Down
Loading