Skip to content

Commit

Permalink
cut: refactor (#4255)
Browse files Browse the repository at this point in the history
refactors `cut field` logic to reduce code duplication by factoring out the common `Searcher`, which is _templatized_ on a specific `Matcher` -- `ExactMatcher` for an explicit delimiter and `WhitespaceMatcher` for a whitespace delimiter.

before
- code duplication in `Searcher` and `WhitespaceSearcher`
- code duplication in `cut_fields` and `cut_fields_whitespace`

after
- two versions of `Matcher`s
- one `Searcher`
- simplify `cut_fields` by delegating actual work to specific functions
  • Loading branch information
TechHara committed Jan 27, 2023
1 parent 8c6d0e7 commit 3ad36a4
Show file tree
Hide file tree
Showing 4 changed files with 316 additions and 270 deletions.
193 changes: 65 additions & 128 deletions src/uu/cut/src/cut.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError};

use self::searcher::Searcher;
use self::whitespace_searcher::WhitespaceSearcher;
use matcher::{ExactMatcher, Matcher, WhitespaceMatcher};
use uucore::ranges::Range;
use uucore::{format_usage, show, show_error, show_if_err};

mod matcher;
mod searcher;
mod whitespace_searcher;

static USAGE: &str =
"{} [-d|-w] [-s] [-z] [--output-delimiter] ((-f|-b|-c) {{sequence}}) {{sourcefile}}+";
Expand Down Expand Up @@ -188,23 +188,22 @@ fn cut_bytes<R: Read>(reader: R, ranges: &[Range], opts: &Options) -> UResult<()
Ok(())
}

#[allow(clippy::cognitive_complexity)]
fn cut_fields_delimiter<R: Read>(
// Output delimiter is explicitly specified
fn cut_fields_explicit_out_delim<R: Read, M: Matcher>(
reader: R,
matcher: &M,
ranges: &[Range],
delim: &str,
only_delimited: bool,
newline_char: u8,
out_delim: &str,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let input_delim_len = delim.len();

let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, delim.as_bytes()).peekable();
let mut delim_search = Searcher::new(matcher, line).peekable();
let mut print_delim = false;

if delim_search.peek().is_none() {
Expand All @@ -220,29 +219,35 @@ fn cut_fields_delimiter<R: Read>(

for &Range { low, high } in ranges {
if low - fields_pos > 0 {
// current field is not in the range, so jump to the field corresponding to the
// beginning of the range if any
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some(index) => index + input_delim_len,
Some((_, last)) => last,
None => break,
};
}

// at this point, current field is the first in the range
for _ in 0..=high - low {
// skip printing delimiter if this is the first matching field for this line
if print_delim {
out.write_all(out_delim.as_bytes())?;
} else {
print_delim = true;
}

match delim_search.next() {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];
// print the current field up to the next field delim
Some((first, last)) => {
let segment = &line[low_idx..first];

out.write_all(segment)?;

low_idx = high_idx + input_delim_len;
low_idx = last;
fields_pos = high + 1;
}
None => {
// this is the last field in the line, so print the rest
let segment = &line[low_idx..];

out.write_all(segment)?;
Expand All @@ -267,20 +272,21 @@ fn cut_fields_delimiter<R: Read>(
Ok(())
}

fn cut_fields_whitespace<R: Read>(
// Output delimiter is the same as input delimiter
fn cut_fields_implicit_out_delim<R: Read, M: Matcher>(
reader: R,
matcher: &M,
ranges: &[Range],
only_delimited: bool,
newline_char: u8,
out_delim: &str,
) -> UResult<()> {
let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();

let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = WhitespaceSearcher::new(line).peekable();
let mut delim_search = Searcher::new(matcher, line).peekable();
let mut print_delim = false;

if delim_search.peek().is_none() {
Expand All @@ -293,54 +299,38 @@ fn cut_fields_whitespace<R: Read>(

return Ok(true);
}
// The logic is identical to `cut_fields_delimiter` function above, which uses
// `Searcher` that iterates over and returns the first position of the delimiter character.
// The main difference is that `WhitespaceSearcher` returns a pair of the first and last
// delimiter character positions, since each delimiter sequence length can vary.

for &Range { low, high } in ranges {
if low - fields_pos > 0 {
// current field is not in the range, so jump to the field corresponding to the
// beginning of the range if any
low_idx = match delim_search.nth(low - fields_pos - 1) {
Some((_, last)) => last,
None => break,
};
}

// at this point, current field is the first in the range
for _ in 0..=high - low {
// skip printing delimiter if this is the first matching field for this line
if print_delim {
out.write_all(out_delim.as_bytes())?;
if let Some((first, last)) = delim_search.nth(low - fields_pos - 1) {
low_idx = if print_delim { first } else { last }
} else {
print_delim = true;
break;
}
}

match delim_search.next() {
// print the current field up to the next whitespace
Some((first, last)) => {
let segment = &line[low_idx..first];
match delim_search.nth(high - low) {
Some((first, _)) => {
let segment = &line[low_idx..first];

out.write_all(segment)?;
out.write_all(segment)?;

low_idx = last;
fields_pos = high + 1;
}
None => {
// this is the last field in the line, so print the rest
let segment = &line[low_idx..];
print_delim = true;
low_idx = first;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..line.len()];

out.write_all(segment)?;
out.write_all(segment)?;

if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
}
}

out.write_all(&[newline_char])?;
Ok(true)
});
Expand All @@ -355,90 +345,37 @@ fn cut_fields_whitespace<R: Read>(
fn cut_fields<R: Read>(reader: R, ranges: &[Range], opts: &FieldOptions) -> UResult<()> {
let newline_char = if opts.zero_terminated { b'\0' } else { b'\n' };
match opts.delimiter {
Delimiter::Whitespace => cut_fields_whitespace(
reader,
ranges,
opts.only_delimited,
newline_char,
opts.out_delimiter.as_deref().unwrap_or("\t"),
),
Delimiter::String(ref delimiter) => {
if let Some(ref o_delim) = opts.out_delimiter {
return cut_fields_delimiter(
Delimiter::String(ref delim) => {
let matcher = ExactMatcher::new(delim.as_bytes());
match opts.out_delimiter {
Some(ref out_delim) => cut_fields_explicit_out_delim(
reader,
&matcher,
ranges,
delimiter,
opts.only_delimited,
newline_char,
o_delim,
);
}

let mut buf_in = BufReader::new(reader);
let mut out = stdout_writer();
let delim_len = delimiter.len();

let result = buf_in.for_byte_record_with_terminator(newline_char, |line| {
let mut fields_pos = 1;
let mut low_idx = 0;
let mut delim_search = Searcher::new(line, delimiter.as_bytes()).peekable();
let mut print_delim = false;

if delim_search.peek().is_none() {
if !opts.only_delimited {
out.write_all(line)?;
if line[line.len() - 1] != newline_char {
out.write_all(&[newline_char])?;
}
}

return Ok(true);
}

for &Range { low, high } in ranges {
if low - fields_pos > 0 {
if let Some(delim_pos) = delim_search.nth(low - fields_pos - 1) {
low_idx = if print_delim {
delim_pos
} else {
delim_pos + delim_len
}
} else {
break;
}
}

match delim_search.nth(high - low) {
Some(high_idx) => {
let segment = &line[low_idx..high_idx];

out.write_all(segment)?;

print_delim = true;
low_idx = high_idx;
fields_pos = high + 1;
}
None => {
let segment = &line[low_idx..line.len()];

out.write_all(segment)?;

if line[line.len() - 1] == newline_char {
return Ok(true);
}
break;
}
}
}
out.write_all(&[newline_char])?;
Ok(true)
});

if let Err(e) = result {
return Err(USimpleError::new(1, e.to_string()));
out_delim,
),
None => cut_fields_implicit_out_delim(
reader,
&matcher,
ranges,
opts.only_delimited,
newline_char,
),
}

Ok(())
}
Delimiter::Whitespace => {
let matcher = WhitespaceMatcher {};
let out_delim = opts.out_delimiter.as_deref().unwrap_or("\t");
cut_fields_explicit_out_delim(
reader,
&matcher,
ranges,
opts.only_delimited,
newline_char,
out_delim,
)
}
}
}
Expand Down
Loading

0 comments on commit 3ad36a4

Please sign in to comment.