From 37e7ed57a8e0b78c9561e92eb7a187fb162c9c82 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Thu, 13 Jul 2023 01:06:40 +0200 Subject: [PATCH] wip: initial version of pandas-vet constant series check --- .../test/fixtures/pandas_vet/PD801.py | 27 ++++ crates/ruff/src/checkers/ast/mod.rs | 9 ++ crates/ruff/src/codes.rs | 1 + crates/ruff/src/rules/pandas_vet/mod.rs | 1 + crates/ruff/src/rules/pandas_vet/rules/mod.rs | 2 + .../pandas_nunique_constant_series_check.rs | 110 ++++++++++++++++ ...es__pandas_vet__tests__PD801_PD801.py.snap | 120 ++++++++++++++++++ ruff.schema.json | 3 + 8 files changed, 273 insertions(+) create mode 100644 crates/ruff/resources/test/fixtures/pandas_vet/PD801.py create mode 100644 crates/ruff/src/rules/pandas_vet/rules/pandas_nunique_constant_series_check.rs create mode 100644 crates/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD801_PD801.py.snap diff --git a/crates/ruff/resources/test/fixtures/pandas_vet/PD801.py b/crates/ruff/resources/test/fixtures/pandas_vet/PD801.py new file mode 100644 index 00000000000000..edaabde31956ff --- /dev/null +++ b/crates/ruff/resources/test/fixtures/pandas_vet/PD801.py @@ -0,0 +1,27 @@ +import pandas as pd + + +data = pd.Series(range(1000)) + +# PD801 +data.nunique() <= 1 +data.nunique(dropna=True) <= 1 +data.nunique(dropna=False) <= 1 +data.nunique() == 1 +data.nunique(dropna=True) == 1 +data.nunique(dropna=False) == 1 +data.nunique() != 1 +data.nunique(dropna=True) != 1 +data.nunique(dropna=False) != 1 +data.nunique() > 1 +data.dropna().nunique() == 1 +data[data.notnull()].nunique() == 1 + +# No violation of this rule +data.nunique() == 0 # empty +data.nunique() >= 1 # not-empty +data.nunique() < 1 # empty +data.nunique() == 2 # not constant +data.unique() == 1 # not `nunique` + +{"hello": "world"}.nunique() == 1 # no pd.Series diff --git a/crates/ruff/src/checkers/ast/mod.rs b/crates/ruff/src/checkers/ast/mod.rs index 9fac7dde22eb79..0c2b472050b227 100644 --- a/crates/ruff/src/checkers/ast/mod.rs +++ b/crates/ruff/src/checkers/ast/mod.rs @@ -3290,6 +3290,15 @@ where if self.enabled(Rule::YodaConditions) { flake8_simplify::rules::yoda_conditions(self, expr, left, ops, comparators); } + if self.enabled(Rule::PandasNuniqueConstantSeriesCheck) { + pandas_vet::rules::pandas_nunique_constant_series_check( + self, + expr, + left, + ops, + comparators, + ); + } } Expr::Constant(ast::ExprConstant { value: Constant::Int(_) | Constant::Float(_) | Constant::Complex { .. }, diff --git a/crates/ruff/src/codes.rs b/crates/ruff/src/codes.rs index 704deb14377686..b2e0535436e2b6 100644 --- a/crates/ruff/src/codes.rs +++ b/crates/ruff/src/codes.rs @@ -603,6 +603,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> { (PandasVet, "012") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotReadTable), (PandasVet, "013") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfDotStack), (PandasVet, "015") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasUseOfPdMerge), + (PandasVet, "801") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasNuniqueConstantSeriesCheck), (PandasVet, "901") => (RuleGroup::Unspecified, rules::pandas_vet::rules::PandasDfVariableName), // flake8-errmsg diff --git a/crates/ruff/src/rules/pandas_vet/mod.rs b/crates/ruff/src/rules/pandas_vet/mod.rs index 295a83cd240868..4bf4797dab3b43 100644 --- a/crates/ruff/src/rules/pandas_vet/mod.rs +++ b/crates/ruff/src/rules/pandas_vet/mod.rs @@ -361,6 +361,7 @@ mod tests { } #[test_case(Rule::PandasUseOfInplaceArgument, Path::new("PD002.py"))] + #[test_case(Rule::PandasNuniqueConstantSeriesCheck, Path::new("PD801.py"))] fn paths(rule_code: Rule, path: &Path) -> Result<()> { let snapshot = format!("{}_{}", rule_code.noqa_code(), path.to_string_lossy()); let diagnostics = test_path( diff --git a/crates/ruff/src/rules/pandas_vet/rules/mod.rs b/crates/ruff/src/rules/pandas_vet/rules/mod.rs index a0c1ef09083072..4923684df57953 100644 --- a/crates/ruff/src/rules/pandas_vet/rules/mod.rs +++ b/crates/ruff/src/rules/pandas_vet/rules/mod.rs @@ -2,6 +2,7 @@ pub(crate) use assignment_to_df::*; pub(crate) use attr::*; pub(crate) use call::*; pub(crate) use inplace_argument::*; +pub(crate) use pandas_nunique_constant_series_check::*; pub(crate) use pd_merge::*; pub(crate) use subscript::*; @@ -9,5 +10,6 @@ pub(crate) mod assignment_to_df; pub(crate) mod attr; pub(crate) mod call; pub(crate) mod inplace_argument; +pub(crate) mod pandas_nunique_constant_series_check; pub(crate) mod pd_merge; pub(crate) mod subscript; diff --git a/crates/ruff/src/rules/pandas_vet/rules/pandas_nunique_constant_series_check.rs b/crates/ruff/src/rules/pandas_vet/rules/pandas_nunique_constant_series_check.rs new file mode 100644 index 00000000000000..9e825e247a61cb --- /dev/null +++ b/crates/ruff/src/rules/pandas_vet/rules/pandas_nunique_constant_series_check.rs @@ -0,0 +1,110 @@ +use num_traits::One; +use ruff_diagnostics::Violation; +use ruff_macros::{derive_message_formats, violation}; +use rustpython_parser::ast::{self, CmpOp, Constant, Expr, Ranged}; + +use crate::checkers::ast::Checker; +use crate::rules::pandas_vet::helpers::{test_expression, Resolution}; +use ruff_diagnostics::Diagnostic; + +/// ## What it does +/// Check for the use of `.nunique()` for determining if a Pandas Series is constant. +/// +/// ## Why is this bad? +/// Let's take the example of a series of increasing integers (1, 2, 3, 4) of length `n`. +/// While walking through the series, we already know at observing the second value that +/// the series is not unique. However, using `.nunique()`, we will count till the end of +/// the series before returning the result. This is computationally inefficient. +/// +/// ## Example +/// ```python +/// import pandas as pd +/// +/// data = pd.Series(range(1000)) +/// if data.nunique() <= 1: +/// print("Series is constant") +/// ``` +/// +/// Use instead: +/// ```python +/// import pandas as pd +/// +/// data = pd.Series(range(1000)) +/// v = s.to_numpy() +/// if v.shape[0] == 0 or (s[0] == s).all(): +/// print("Series is constant") +/// ``` +/// +/// The [Pandas Cookbook](https://pandas.pydata.org/docs/user_guide/cookbook.html#constant-series) provides additional examples in case that the Series contain missing values. +/// +/// ## References +/// - [Pandas documentation: `nunique`](https://pandas.pydata.org/docs/reference/api/pandas.Series.nunique.html) +#[violation] +pub struct PandasNuniqueConstantSeriesCheck; + +impl Violation for PandasNuniqueConstantSeriesCheck { + #[derive_message_formats] + fn message(&self) -> String { + format!("Using `series.nunique()` for checking that a series is constant is inefficient") + } +} + +/// Return `true` if an [`Expr`] is a constant `1`. +fn is_constant_one(expr: &Expr) -> bool { + match expr { + Expr::Constant(constant) => match &constant.value { + Constant::Int(int) => int.is_one(), + _ => false, + }, + _ => false, + } +} + +/// PD801 +pub(crate) fn pandas_nunique_constant_series_check( + checker: &mut Checker, + expr: &Expr, + left: &Expr, + ops: &[CmpOp], + comparators: &[Expr], +) { + let ([op], [right]) = (ops, comparators) else { + return; + }; + + // Operators may be ==, !=, <=, > + if !matches!(op, CmpOp::Eq | CmpOp::NotEq | CmpOp::LtE | CmpOp::Gt,) { + return; + } + + // Right should be the integer 1 + if !is_constant_one(right) { + return; + } + + // Check if call is .nuniuqe() + let Expr::Call(ast::ExprCall {func, .. }) = left else { + return; + }; + + let Expr::Attribute(ast::ExprAttribute { value, attr, .. }) = func.as_ref() else { + return; + }; + + if attr.as_str() != "nunique" { + return; + } + + // Avoid flagging on non-Series (e.g., `{"a": 1}.at[0]`). + if !matches!( + test_expression(value, checker.semantic()), + Resolution::RelevantLocal + ) { + return; + } + + checker.diagnostics.push(Diagnostic::new( + PandasNuniqueConstantSeriesCheck, + expr.range(), + )); +} diff --git a/crates/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD801_PD801.py.snap b/crates/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD801_PD801.py.snap new file mode 100644 index 00000000000000..3c4afbc021bbe1 --- /dev/null +++ b/crates/ruff/src/rules/pandas_vet/snapshots/ruff__rules__pandas_vet__tests__PD801_PD801.py.snap @@ -0,0 +1,120 @@ +--- +source: crates/ruff/src/rules/pandas_vet/mod.rs +--- +PD801.py:7:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +6 | # PD801 +7 | data.nunique() <= 1 + | ^^^^^^^^^^^^^^^^^^^ PD801 +8 | data.nunique(dropna=True) <= 1 +9 | data.nunique(dropna=False) <= 1 + | + +PD801.py:8:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | + 6 | # PD801 + 7 | data.nunique() <= 1 + 8 | data.nunique(dropna=True) <= 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 + 9 | data.nunique(dropna=False) <= 1 +10 | data.nunique() == 1 + | + +PD801.py:9:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | + 7 | data.nunique() <= 1 + 8 | data.nunique(dropna=True) <= 1 + 9 | data.nunique(dropna=False) <= 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 +10 | data.nunique() == 1 +11 | data.nunique(dropna=True) == 1 + | + +PD801.py:10:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | + 8 | data.nunique(dropna=True) <= 1 + 9 | data.nunique(dropna=False) <= 1 +10 | data.nunique() == 1 + | ^^^^^^^^^^^^^^^^^^^ PD801 +11 | data.nunique(dropna=True) == 1 +12 | data.nunique(dropna=False) == 1 + | + +PD801.py:11:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | + 9 | data.nunique(dropna=False) <= 1 +10 | data.nunique() == 1 +11 | data.nunique(dropna=True) == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 +12 | data.nunique(dropna=False) == 1 +13 | data.nunique() != 1 + | + +PD801.py:12:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +10 | data.nunique() == 1 +11 | data.nunique(dropna=True) == 1 +12 | data.nunique(dropna=False) == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 +13 | data.nunique() != 1 +14 | data.nunique(dropna=True) != 1 + | + +PD801.py:13:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +11 | data.nunique(dropna=True) == 1 +12 | data.nunique(dropna=False) == 1 +13 | data.nunique() != 1 + | ^^^^^^^^^^^^^^^^^^^ PD801 +14 | data.nunique(dropna=True) != 1 +15 | data.nunique(dropna=False) != 1 + | + +PD801.py:14:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +12 | data.nunique(dropna=False) == 1 +13 | data.nunique() != 1 +14 | data.nunique(dropna=True) != 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 +15 | data.nunique(dropna=False) != 1 +16 | data.nunique() > 1 + | + +PD801.py:15:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +13 | data.nunique() != 1 +14 | data.nunique(dropna=True) != 1 +15 | data.nunique(dropna=False) != 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 +16 | data.nunique() > 1 +17 | data.dropna().nunique() == 1 + | + +PD801.py:16:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +14 | data.nunique(dropna=True) != 1 +15 | data.nunique(dropna=False) != 1 +16 | data.nunique() > 1 + | ^^^^^^^^^^^^^^^^^^ PD801 +17 | data.dropna().nunique() == 1 +18 | data[data.notnull()].nunique() == 1 + | + +PD801.py:17:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +15 | data.nunique(dropna=False) != 1 +16 | data.nunique() > 1 +17 | data.dropna().nunique() == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 +18 | data[data.notnull()].nunique() == 1 + | + +PD801.py:18:1: PD801 Using `series.nunique()` for checking that a series is constant is inefficient + | +16 | data.nunique() > 1 +17 | data.dropna().nunique() == 1 +18 | data[data.notnull()].nunique() == 1 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ PD801 +19 | +20 | # No violation of this rule + | diff --git a/ruff.schema.json b/ruff.schema.json index 2fc94daff77182..87d39b7a383ce3 100644 --- a/ruff.schema.json +++ b/ruff.schema.json @@ -2090,6 +2090,9 @@ "PD012", "PD013", "PD015", + "PD8", + "PD80", + "PD801", "PD9", "PD90", "PD901",