Skip to content

Commit

Permalink
wip: initial version of pandas-vet constant series check
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman committed Jul 16, 2023
1 parent c87faca commit 37e7ed5
Show file tree
Hide file tree
Showing 8 changed files with 273 additions and 0 deletions.
27 changes: 27 additions & 0 deletions crates/ruff/resources/test/fixtures/pandas_vet/PD801.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd


data = pd.Series(range(1000))

# PD801
data.nunique() <= 1
data.nunique(dropna=True) <= 1
data.nunique(dropna=False) <= 1
data.nunique() == 1
data.nunique(dropna=True) == 1
data.nunique(dropna=False) == 1
data.nunique() != 1
data.nunique(dropna=True) != 1
data.nunique(dropna=False) != 1
data.nunique() > 1
data.dropna().nunique() == 1
data[data.notnull()].nunique() == 1

# No violation of this rule
data.nunique() == 0 # empty
data.nunique() >= 1 # not-empty
data.nunique() < 1 # empty
data.nunique() == 2 # not constant
data.unique() == 1 # not `nunique`

{"hello": "world"}.nunique() == 1 # no pd.Series
9 changes: 9 additions & 0 deletions crates/ruff/src/checkers/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3290,6 +3290,15 @@ where
if self.enabled(Rule::YodaConditions) {
flake8_simplify::rules::yoda_conditions(self, expr, left, ops, comparators);
}
if self.enabled(Rule::PandasNuniqueConstantSeriesCheck) {
pandas_vet::rules::pandas_nunique_constant_series_check(
self,
expr,
left,
ops,
comparators,
);
}
}
Expr::Constant(ast::ExprConstant {
value: Constant::Int(_) | Constant::Float(_) | Constant::Complex { .. },
Expand Down
1 change: 1 addition & 0 deletions crates/ruff/src/codes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
(PandasVet, "012") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotReadTable),
(PandasVet, "013") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotStack),
(PandasVet, "015") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfPdMerge),
(PandasVet, "801") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasNuniqueConstantSeriesCheck),
(PandasVet, "901") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasDfVariableName),

// flake8-errmsg
Expand Down
1 change: 1 addition & 0 deletions crates/ruff/src/rules/pandas_vet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ mod tests {
}

#[test_case(Rule::PandasUseOfInplaceArgument, Path::new("PD002.py"))]
#[test_case(Rule::PandasNuniqueConstantSeriesCheck, Path::new("PD801.py"))]
fn paths(rule_code: Rule, path: &Path) -> Result<()> {
let snapshot = format!("{}_{}", rule_code.noqa_code(), path.to_string_lossy());
let diagnostics = test_path(
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff/src/rules/pandas_vet/rules/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ pub(crate) use assignment_to_df::*;
pub(crate) use attr::*;
pub(crate) use call::*;
pub(crate) use inplace_argument::*;
pub(crate) use pandas_nunique_constant_series_check::*;
pub(crate) use pd_merge::*;
pub(crate) use subscript::*;

pub(crate) mod assignment_to_df;
pub(crate) mod attr;
pub(crate) mod call;
pub(crate) mod inplace_argument;
pub(crate) mod pandas_nunique_constant_series_check;
pub(crate) mod pd_merge;
pub(crate) mod subscript;
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
use num_traits::One;
use ruff_diagnostics::Violation;
use ruff_macros::{derive_message_formats, violation};
use rustpython_parser::ast::{self, CmpOp, Constant, Expr, Ranged};

use crate::checkers::ast::Checker;
use crate::rules::pandas_vet::helpers::{test_expression, Resolution};
use ruff_diagnostics::Diagnostic;

/// ## What it does
/// Check for the use of `.nunique()` for determining if a Pandas Series is constant.
///
/// ## Why is this bad?
/// Let's take the example of a series of increasing integers (1, 2, 3, 4) of length `n`.
/// While walking through the series, we already know at observing the second value that
/// the series is not unique. However, using `.nunique()`, we will count till the end of
/// the series before returning the result. This is computationally inefficient.
///
/// ## Example
/// ```python
/// import pandas as pd
///
/// data = pd.Series(range(1000))
/// if data.nunique() <= 1:
/// print("Series is constant")
/// ```
///
/// Use instead:
/// ```python
/// import pandas as pd
///
/// data = pd.Series(range(1000))
/// v = s.to_numpy()
/// if v.shape[0] == 0 or (s[0] == s).all():
/// print("Series is constant")
/// ```
///
/// The [Pandas Cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#constant-series) provides additional examples in case that the Series contain missing values.
///
/// ## References
/// - [Pandas documentation: `nunique`](https://pandas.pydata.org/docs/reference/api/pandas.Series.nunique.html)
#[violation]
pub struct PandasNuniqueConstantSeriesCheck;

impl Violation for PandasNuniqueConstantSeriesCheck {
#[derive_message_formats]
fn message(&self) -> String {
format!("Using `series.nunique()` for checking that a series is constant is inefficient")
}
}

/// Return `true` if an [`Expr`] is a constant `1`.
fn is_constant_one(expr: &Expr) -> bool {
match expr {
Expr::Constant(constant) => match &constant.value {
Constant::Int(int) => int.is_one(),
_ => false,
},
_ => false,
}
}

/// PD801
pub(crate) fn pandas_nunique_constant_series_check(
checker: &mut Checker,
expr: &Expr,
left: &Expr,
ops: &[CmpOp],
comparators: &[Expr],
) {
let ([op], [right]) = (ops, comparators) else {
return;
};

// Operators may be ==, !=, <=, >
if !matches!(op, CmpOp::Eq | CmpOp::NotEq | CmpOp::LtE | CmpOp::Gt,) {
return;
}

// Right should be the integer 1
if !is_constant_one(right) {
return;
}

// Check if call is .nuniuqe()
let Expr::Call(ast::ExprCall {func, .. }) = left else {
return;
};

let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func.as_ref() else {
return;
};

if attr.as_str() != "nunique" {
return;
}

// Avoid flagging on non-Series (e.g., `{"a": 1}.at[0]`).
if !matches!(
test_expression(value, checker.semantic()),
Resolution::RelevantLocal
) {
return;
}

checker.diagnostics.push(Diagnostic::new(
PandasNuniqueConstantSeriesCheck,
expr.range(),
));
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
---
source: crates/ruff/src/rules/pandas_vet/mod.rs
---
PD801.py:7:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
6 | # PD801
7 | data.nunique() <= 1
| ^^^^^^^^^^^^^^^^^^^ PD801
8 | data.nunique(dropna=True) <= 1
9 | data.nunique(dropna=False) <= 1
|

PD801.py:8:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
6 | # PD801
7 | data.nunique() <= 1
8 | data.nunique(dropna=True) <= 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
9 | data.nunique(dropna=False) <= 1
10 | data.nunique() == 1
|

PD801.py:9:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
7 | data.nunique() <= 1
8 | data.nunique(dropna=True) <= 1
9 | data.nunique(dropna=False) <= 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
10 | data.nunique() == 1
11 | data.nunique(dropna=True) == 1
|

PD801.py:10:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
8 | data.nunique(dropna=True) <= 1
9 | data.nunique(dropna=False) <= 1
10 | data.nunique() == 1
| ^^^^^^^^^^^^^^^^^^^ PD801
11 | data.nunique(dropna=True) == 1
12 | data.nunique(dropna=False) == 1
|

PD801.py:11:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
9 | data.nunique(dropna=False) <= 1
10 | data.nunique() == 1
11 | data.nunique(dropna=True) == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
12 | data.nunique(dropna=False) == 1
13 | data.nunique() != 1
|

PD801.py:12:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
10 | data.nunique() == 1
11 | data.nunique(dropna=True) == 1
12 | data.nunique(dropna=False) == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
13 | data.nunique() != 1
14 | data.nunique(dropna=True) != 1
|

PD801.py:13:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
11 | data.nunique(dropna=True) == 1
12 | data.nunique(dropna=False) == 1
13 | data.nunique() != 1
| ^^^^^^^^^^^^^^^^^^^ PD801
14 | data.nunique(dropna=True) != 1
15 | data.nunique(dropna=False) != 1
|

PD801.py:14:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
12 | data.nunique(dropna=False) == 1
13 | data.nunique() != 1
14 | data.nunique(dropna=True) != 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
15 | data.nunique(dropna=False) != 1
16 | data.nunique() > 1
|

PD801.py:15:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
13 | data.nunique() != 1
14 | data.nunique(dropna=True) != 1
15 | data.nunique(dropna=False) != 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
16 | data.nunique() > 1
17 | data.dropna().nunique() == 1
|

PD801.py:16:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
14 | data.nunique(dropna=True) != 1
15 | data.nunique(dropna=False) != 1
16 | data.nunique() > 1
| ^^^^^^^^^^^^^^^^^^ PD801
17 | data.dropna().nunique() == 1
18 | data[data.notnull()].nunique() == 1
|

PD801.py:17:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
15 | data.nunique(dropna=False) != 1
16 | data.nunique() > 1
17 | data.dropna().nunique() == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
18 | data[data.notnull()].nunique() == 1
|

PD801.py:18:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient
|
16 | data.nunique() > 1
17 | data.dropna().nunique() == 1
18 | data[data.notnull()].nunique() == 1
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801
19 |
20 | # No violation of this rule
|
3 changes: 3 additions & 0 deletions ruff.schema.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 37e7ed5

Please sign in to comment.