From c57415861833533db716172f6143b983169027a0 Mon Sep 17 00:00:00 2001
From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com>
Date: Tue, 18 Mar 2025 00:09:44 +0000
Subject: [PATCH 01/12] chore(actions): update dependency python to 3.13

---
 .github/workflows/pull-request.yml |  8 +--
 .github/workflows/release.yml      |  5 +-
 MANIFEST.in                        |  4 +-
 pyproject.toml                     | 81 ++++++++++++++++++------------
 setup.py                           |  5 --
 5 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml
index d51d14932..c66e0317c 100644
--- a/.github/workflows/pull-request.yml
+++ b/.github/workflows/pull-request.yml
@@ -40,7 +40,7 @@ jobs:
       - name: Set up Python 3.11
         uses: actions/setup-python@v5
         with:
-          python-version: "3.13"
+          python-version: '3.11'
 
       - uses: actions/cache@v4
         name: Cache pip dependencies
@@ -52,7 +52,7 @@
       - name: Install pip dependencies
         run: |
-          python -m pip install --upgrade pip setuptools
+          python -m pip install --upgrade pip
           python -m pip install ".[dev,test]"
 
       - name: Install pre-commit hooks
@@ -93,7 +93,7 @@
       - name: Setup Python 3.11
         uses: actions/setup-python@v5
         with:
-          python-version: "3.13"
+          python-version: '3.11'
 
       - name: Cache pip dependencies
         id: cache
@@ -105,7 +105,7 @@
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install ".[dev,test,docs]"
+          python -m pip install ".[docs]"
 
       - name: Install the package
         run: make install
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index b732a56cb..019d1cc14 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -80,7 +80,6 @@ jobs:
           name: built-artifacts
           path: dist/
 
-      - uses: pypa/gh-action-pypi-publish@v1.12.4
+      - uses: pypa/gh-action-pypi-publish@release/v1
         with:
-          user: __token__
-          password: ${{ secrets.PYPI_API_TOKEN }}
+          packages-dir: dist/
diff --git a/MANIFEST.in b/MANIFEST.in
index 06909e0a6..36a019687 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,8 +4,6 @@ include requirements*.txt
 
 # Include license, Readme, etc.
 include LICENSE
 include *.md
-include mypy.ini
-include src/ydata_profiling/py.typed
 
 # Templates and static resources
 recursive-include src/ydata_profiling/report/presentation/flavours/html/templates *.html *.js *.css
@@ -20,7 +18,7 @@ recursive-include venv *.yml
 exclude .pre-commit-config.yaml
 exclude commitlint.config.js
 exclude .releaserc.json
-include Makefile make.bat
+exclude Makefile make.bat
 exclude docs examples tests docsrc .devcontainer
 recursive-exclude docs *
 recursive-exclude docsrc *
diff --git a/pyproject.toml b/pyproject.toml
index 72f019a1f..50e75eb96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,18 +1,25 @@
 [build-system]
-requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
+requires = [
+    "setuptools>=72.0.0,<80.0.0",
+    "setuptools-scm>=8.0.0,<9.0.0",
+    "wheel>=0.38.4,<1.0.0"
+]
+
+[packaging]
+package_name = "ydata-profiling"
 
 [project]
 name = "ydata-profiling"
+requires-python = ">=3.7,<3.13"
 authors = [
-    {name = "YData Labs Inc", email = "opensource@ydata.ai"},
+    {name = "YData Labs Inc", email = "opensource@ydata.ai"}
 ]
-description="Generate profile report for pandas DataFrame"
+description = "Generate profile report for pandas DataFrame"
+keywords = ["pandas", "data-science", "data-analysis", "python", "jupyter", "ipython"]
 readme = "README.md"
-requires-python=">=3.7, <3.13"
-keywords=["pandas", "data-science", "data-analysis", "python", "jupyter", "ipython"]
-license = {text = "MIT"}
-classifiers=[
+license = {file = "LICENSE.md"}
+classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Topic :: Software Development :: Build Tools",
     "License :: OSI Approved :: MIT License",
@@ -63,10 +70,11 @@ dependencies = [
     "numba>=0.56.0, <1",
 ]
 
-dynamic = ["version"]
+dynamic = [
+    "version",
+]
 
 [project.optional-dependencies]
-# dependencies for development and testing
 dev = [
     "black>=20.8b1",
     "isort>=5.0.7",
@@ -80,6 +88,22 @@ dev = [
     "sphinx-multiversion>=0.2.3",
     "autodoc_pydantic",
 ]
+
+docs = [
+    "mkdocs>=1.6.0,<1.7.0",
+    "mkdocs-material>=9.0.12,<10.0.0",
+    "mkdocs-material-extensions>=1.1.1,<2.0.0",
+    "mkdocs-table-reader-plugin<=2.2.0",
+    "mike>=2.1.1,<2.2.0",
+    "mkdocstrings[python]>=0.20.0,<1.0.0",
+    "mkdocs-badges",
+]
+
+notebook = [
+    "jupyter>=1.0.0",
+    "ipywidgets>=7.5.1",
+]
+
 # this provides the recommended pyspark and pyarrow versions for spark to work on pandas-profiling
 # note that if you are using pyspark 2.3 or 2.4 and pyarrow >= 0.15, you might need to
 # set ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas functions to work properly
@@ -90,6 +114,7 @@ spark = [
     "numpy>=1.16.0,<1.24",
     "visions[type_image_path]>=0.7.5, <0.7.7",
 ]
+
 test = [
     "pytest",
     "coverage>=6.5, <8",
@@ -100,35 +125,29 @@ test = [
     "twine>=3.1.1",
     "kaggle",
 ]
-notebook = [
-    "jupyter>=1.0.0",
-    "ipywidgets>=7.5.1",
-]
-docs = [
-    "mkdocs>=1.6.0,<1.7.0",
-    "mkdocs-material>=9.0.12,<10.0.0",
-    "mkdocs-material-extensions>=1.1.1,<2.0.0",
-    "mkdocs-table-reader-plugin<=2.2.0",
-    "mike>=2.1.1,<2.2.0",
-    "mkdocstrings[python]>=0.20.0,<1.0.0",
-    "mkdocs-badges",
-]
+
 unicode= [
     "tangled-up-in-unicode==0.2.0",
 ]
 
-[tool.setuptools.packages.find]
-where = ["src"]
+[project.urls]
+Homepage = "https://ydata.ai"
+Repository = "https://github.com/ydataai/ydata-profiling"
 
-[tool.setuptools.package-data]
-ydata_profiling = ["py.typed"]
+[project.scripts]
+ydata_profiling = "ydata_profiling.controller.console:main"
+pandas_profiling = "ydata_profiling.controller.console:main"
+
+# setuptools relative
 [tool.setuptools]
 include-package-data = true
 
-[project.scripts]
-ydata_profiling = "ydata_profiling.controller.console:main"
-pandas_profiling = "ydata_profiling.controller.console:main"
+[tool.setuptools.package-data]
+ydata_profiling = ["py.typed"]
 
-[project.urls]
-homepage = "https://github.com/ydataai/ydata-profiling"
\ No newline at end of file
+[tool.distutils.bdist_wheel]
+universal = true
+
+[tool.setuptools.package-dir]
+"" = "src"
diff --git a/setup.py b/setup.py
index d64be44d8..4824966a1 100644
--- a/setup.py
+++ b/setup.py
@@ -4,8 +4,6 @@
 # Read the contents of README file
 source_root = Path(".")
-with (source_root / "README.md").open(encoding="utf-8") as f:
-    long_description = f.read()
 
 try:
     version = (source_root / "VERSION").read_text().rstrip("\n")
@@ -17,7 +15,4 @@
 
 setup(
     version=version,
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    options={"bdist_wheel": {"universal": True}},
 )

From 872bc270354d2632d8de9ce8421569dc6dd5e44a Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Tue, 18 Mar 2025 08:32:28 -0700
Subject: [PATCH 02/12] chore: fix linting errors

---
 src/ydata_profiling/model/dataframe.py               | 5 ++---
 src/ydata_profiling/model/describe.py                | 2 +-
 src/ydata_profiling/model/handler.py                 | 2 +-
 src/ydata_profiling/model/pandas/dataframe_pandas.py | 1 -
 src/ydata_profiling/model/pandas/summary_pandas.py   | 4 ++--
 src/ydata_profiling/model/spark/summary_spark.py     | 1 -
 tests/issues/test_issue537.py                        | 1 -
 7 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/ydata_profiling/model/dataframe.py b/src/ydata_profiling/model/dataframe.py
index d182f9ff3..9b4311352 100644
--- a/src/ydata_profiling/model/dataframe.py
+++ b/src/ydata_profiling/model/dataframe.py
@@ -2,6 +2,8 @@
 from typing import Any
 
 import pandas as pd
+from ydata_profiling.config import Settings
+from ydata_profiling.model.pandas.dataframe_pandas import pandas_preprocess
 
 spec = importlib.util.find_spec("pyspark")
 if spec is None:
@@ -12,9 +14,6 @@ from pyspark.sql import DataFrame as sparkDataFrame  # type: ignore
 
     from ydata_profiling.model.spark.dataframe_spark import spark_preprocess
 
-from ydata_profiling.config import Settings
-from ydata_profiling.model.pandas.dataframe_pandas import pandas_preprocess
-
 
 def preprocess(config: Settings, df: Any) -> Any:
     """
diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py
index 5b162f9fb..91386c51e 100644
--- a/src/ydata_profiling/model/describe.py
+++ b/src/ydata_profiling/model/describe.py
@@ -29,7 +29,7 @@
 
 def describe(
     config: Settings,
-    df: Union[pd.DataFrame, "pyspark.sql.DataFrame"],  # type: ignore
+    df: Union[pd.DataFrame, "pyspark.sql.DataFrame"],  # noqa: F821
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     sample: Optional[dict] = None,
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index ebbb285cf..072e87708 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -1,7 +1,7 @@
 """
 Auxiliary handler methods for data summary extraction
 """
-from typing import Any, Callable, Dict, List, Sequence
+from typing import Callable, Dict, List, Sequence
 
 import networkx as nx
 from visions import VisionsTypeset
diff --git a/src/ydata_profiling/model/pandas/dataframe_pandas.py b/src/ydata_profiling/model/pandas/dataframe_pandas.py
index d4b07d1b2..3c6acbdfd 100644
--- a/src/ydata_profiling/model/pandas/dataframe_pandas.py
+++ b/src/ydata_profiling/model/pandas/dataframe_pandas.py
@@ -1,4 +1,3 @@
-import warnings
 
 import pandas as pd
 
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index 3e48267b1..bfec2f4f6 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -20,7 +20,7 @@ def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
 def pandas_describe_1d(
     config: Settings,
     series: pd.Series,
-    summarizer: "BaseSummarizer",  # type:ignore
+    summarizer: "BaseSummarizer",  # noqa: F821
     typeset: VisionsTypeset,
 ) -> dict:
     """Describe a series (infer the variable type, then calculate type-specific values).
@@ -67,7 +67,7 @@ def pandas_describe_1d(
 def pandas_get_series_descriptions(
     config: Settings,
     df: pd.DataFrame,
-    summarizer: "BaseSummarizer",  # type:ignore
+    summarizer: "BaseSummarizer",  # noqa: F821
     typeset: VisionsTypeset,
     pbar: tqdm,
 ) -> dict:
diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py
index 31a6c3f97..5a033b1d5 100644
--- a/src/ydata_profiling/model/spark/summary_spark.py
+++ b/src/ydata_profiling/model/spark/summary_spark.py
@@ -1,5 +1,4 @@
 """Compute statistical description of datasets."""
-import multiprocessing
 from typing import Tuple
 
 import numpy as np
diff --git a/tests/issues/test_issue537.py b/tests/issues/test_issue537.py
index 16bee3efe..7248a3ac8 100644
--- a/tests/issues/test_issue537.py
+++ b/tests/issues/test_issue537.py
@@ -86,7 +86,6 @@ def download_and_process_data():
         ("labels", "S16"),
     ]
 
-    dtype = np.dtype(dtype_mapping)
     split_text = np.array(split_text, dtype=object)
 
     # Convert each column to its appropriate type

From 5246f8c64ed6b3ee227ff09117739468ff8ff94c Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Tue, 18 Mar 2025 09:06:03 -0700
Subject: [PATCH 03/12] fix: typing annotations

---
 src/ydata_profiling/model/correlations.py | 3 +--
 src/ydata_profiling/model/handler.py      | 2 +-
 src/ydata_profiling/model/summarizer.py   | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py
index 13f3f165a..78b8bc31f 100644
--- a/src/ydata_profiling/model/correlations.py
+++ b/src/ydata_profiling/model/correlations.py
@@ -12,7 +12,6 @@
 except ImportError:
     from pandas.errors import DataError
 
-
 class CorrelationBackend:
     """Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""
 
     def __init__(self, df: Sized):
         """Determine backend once and store it for all correlation computations."""
         if isinstance(df, pd.DataFrame):
@@ -29,7 +28,7 @@ def __init__(self, df: Sized):
 
         self.backend = correlation_backend
 
-    def get_method(self, method_name: str):
+    def get_method(self, method_name: str):  # noqa: ANN201
         """Retrieve the appropriate correlation method class from the backend."""
         if hasattr(self.backend, method_name):
             return getattr(self.backend, method_name)
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index 072e87708..c43a6eff0 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -15,7 +15,7 @@ def compose(functions: Sequence[Callable]) -> Callable:
     :return: combined function applying all functions in order.
""" - def composed_function(*args): + def composed_function(*args) -> List[Callable]: result = args # Start with the input arguments for func in functions: result = func(*result) if isinstance(result, tuple) else func(result) diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index e54ad9a83..4ddb155c3 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -54,7 +54,7 @@ def summarize( class ProfilingSummarizer(BaseSummarizer): """A summarizer for Pandas DataFrames.""" - def __init__(self, typeset: VisionsTypeset, use_spark=False): + def __init__(self, typeset: VisionsTypeset, use_spark: bool=False): self.use_spark = use_spark and is_pyspark_installed() self._summary_map = self._create_summary_map() super().__init__(self._summary_map, typeset) From 286924303abba059f4d78e3ab83ada54d271f20c Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Tue, 18 Mar 2025 09:14:12 -0700 Subject: [PATCH 04/12] fix: fix import linting error --- src/ydata_profiling/model/describe.py | 2 +- src/ydata_profiling/model/handler.py | 4 ++-- src/ydata_profiling/model/summary.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index 91386c51e..3c81021e5 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -29,7 +29,7 @@ def describe( config: Settings, - df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # noqa: F821 + df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # type: ignore[name-defined] summarizer: BaseSummarizer, typeset: VisionsTypeset, sample: Optional[dict] = None, diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index c43a6eff0..b34982100 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,7 +1,7 @@ """ Auxiliary handler methods for data summary extraction """ -from typing import Callable, Dict, List, Sequence +from typing import Any, Callable, Dict, List, Sequence import networkx as nx from visions import VisionsTypeset @@ -15,7 +15,7 @@ def compose(functions: Sequence[Callable]) -> Callable: :return: combined function applying all functions in order. 
""" - def composed_function(*args) -> List[Callable]: + def composed_function(*args) -> List: result = args # Start with the input arguments for func in functions: result = func(*result) if isinstance(result, tuple) else func(result) diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py index a997a8933..0d90ce466 100644 --- a/src/ydata_profiling/model/summary.py +++ b/src/ydata_profiling/model/summary.py @@ -20,7 +20,7 @@ sparkDataFrame = TypeVar("sparkDataFrame") # type: ignore sparkSeries = TypeVar("sparkSeries") # type: ignore else: - from pyspark.sql import DataFrame as sparkDataFrame # noqa: E402 + from pyspark.sql import DataFrame as sparkDataFrame # type: ignore[name-defined] from ydata_profiling.model.spark.summary_spark import ( # noqa: E402 get_series_descriptions_spark, From 572c7b922045bafffa7e15c396129a5b34d0a2d9 Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Tue, 18 Mar 2025 09:21:43 -0700 Subject: [PATCH 05/12] fix: ignore linter --- src/ydata_profiling/model/correlations.py | 8 ++++---- src/ydata_profiling/model/handler.py | 2 +- src/ydata_profiling/model/pandas/summary_pandas.py | 4 ++-- src/ydata_profiling/model/summary.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py index 78b8bc31f..bb43e9e6d 100644 --- a/src/ydata_profiling/model/correlations.py +++ b/src/ydata_profiling/model/correlations.py @@ -18,12 +18,12 @@ class CorrelationBackend: def __init__(self, df: Sized): """Determine backend once and store it for all correlation computations.""" if isinstance(df, pd.DataFrame): - from ydata_profiling.model.pandas import ( - correlations_pandas as correlation_backend, #type: ignore + from ydata_profiling.model.pandas import ( # type: ignore + correlations_pandas as correlation_backend, ) else: - from ydata_profiling.model.spark import ( - correlations_spark as correlation_backend, # type: ignore + from ydata_profiling.model.spark import ( # type: ignore + correlations_spark as correlation_backend, ) self.backend = correlation_backend diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index b34982100..7d946d669 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -15,7 +15,7 @@ def compose(functions: Sequence[Callable]) -> Callable: :return: combined function applying all functions in order. """ - def composed_function(*args) -> List: + def composed_function(*args) -> List[Any]: result = args # Start with the input arguments for func in functions: result = func(*result) if isinstance(result, tuple) else func(result) diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index bfec2f4f6..b790db79c 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -20,7 +20,7 @@ def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool: def pandas_describe_1d( config: Settings, series: pd.Series, - summarizer: "BaseSummarizer", # noqa: F821 + summarizer: "BaseSummarizer", # type: ignore # type: ignore typeset: VisionsTypeset, ) -> dict: """Describe a series (infer the variable type, then calculate type-specific values). 
@@ -67,7 +67,7 @@ def pandas_describe_1d(
 def pandas_get_series_descriptions(
     config: Settings,
     df: pd.DataFrame,
-    summarizer: "BaseSummarizer",  # noqa: F821
+    summarizer: "BaseSummarizer",  # noqa: F821 # type: ignore
     typeset: VisionsTypeset,
     pbar: tqdm,
 ) -> dict:
diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py
index 0d90ce466..87a2acc8a 100644
--- a/src/ydata_profiling/model/summary.py
+++ b/src/ydata_profiling/model/summary.py
@@ -20,7 +20,7 @@
     sparkDataFrame = TypeVar("sparkDataFrame")  # type: ignore
     sparkSeries = TypeVar("sparkSeries")  # type: ignore
 else:
-    from pyspark.sql import DataFrame as sparkDataFrame  # type: ignore[name-defined]
+    from pyspark.sql import DataFrame as sparkDataFrame  # type: ignore
 
     from ydata_profiling.model.spark.summary_spark import (  # noqa: E402
         get_series_descriptions_spark,

From 4e711642ac6098dc27e17b7a91b97192124d62ab Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Tue, 18 Mar 2025 09:51:01 -0700
Subject: [PATCH 06/12] chore: ignore mypy linter for some cases

---
 src/ydata_profiling/model/correlations.py          | 3 +++
 src/ydata_profiling/model/handler.py               | 4 ++--
 src/ydata_profiling/model/pandas/summary_pandas.py | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py
index bb43e9e6d..82e37cee7 100644
--- a/src/ydata_profiling/model/correlations.py
+++ b/src/ydata_profiling/model/correlations.py
@@ -1,4 +1,7 @@
+# mypy: ignore-errors
+
 """Correlations between variables."""
+
 import warnings
 from typing import Dict, List, Optional, Sized
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index 7d946d669..ab0eaf8e5 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -19,9 +19,9 @@ def composed_function(*args) -> List[Any]:
         result = args  # Start with the input arguments
         for func in functions:
             result = func(*result) if isinstance(result, tuple) else func(result)
-        return result
+        return result  # type: ignore
 
-    return composed_function
+    return composed_function  # type: ignore
 
 
 class Handler:
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index b790db79c..5878755f0 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -1,3 +1,5 @@
+# mypy: ignore-errors
+
 """Compute statistical description of datasets."""
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor

From 97899506a3d21c29de80f1de252ecfacbe64fc81 Mon Sep 17 00:00:00 2001
From: Azory YData Bot
Date: Tue, 18 Mar 2025 16:54:18 +0000
Subject: [PATCH 07/12] fix(linting): code formatting

---
 src/ydata_profiling/model/correlations.py          | 11 +++----
 src/ydata_profiling/model/dataframe.py             |  2 ++
 src/ydata_profiling/model/handler.py               |  4 +--
 .../model/pandas/dataframe_pandas.py               |  1 -
 src/ydata_profiling/model/spark/__init__.py        |  2 +-
 .../model/spark/describe_counts_spark.py           | 29 +++++++++++--------
 .../model/spark/duplicates_spark.py                |  1 +
 .../model/spark/sample_spark.py                    |  1 +
 .../model/spark/table_spark.py                     |  1 +
 src/ydata_profiling/model/summarizer.py            |  2 +-
 src/ydata_profiling/model/summary.py               |  9 +++---
 .../model/summary_algorithms.py                    |  1 +
 src/ydata_profiling/model/table.py                 |  1 +
 src/ydata_profiling/profile_report.py              |  8 ++---
 .../spark_backend/test_correlations_spark.py       |  5 ++--
 .../spark_backend/test_descriptions_spark.py       |  2 +-
 tests/conftest.py                                  |  9 ++++--
 17 files changed, 51 insertions(+), 38 deletions(-)

diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py
index 82e37cee7..2138d88c8 100644
--- a/src/ydata_profiling/model/correlations.py
+++ b/src/ydata_profiling/model/correlations.py
@@ -15,23 +15,24 @@
 except ImportError:
     from pandas.errors import DataError
 
+
 class CorrelationBackend:
     """Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""
 
     def __init__(self, df: Sized):
         """Determine backend once and store it for all correlation computations."""
         if isinstance(df, pd.DataFrame):
-            from ydata_profiling.model.pandas import (  # type: ignore
-                correlations_pandas as correlation_backend,
+            from ydata_profiling.model.pandas import (
+                correlations_pandas as correlation_backend,  # type: ignore
             )
         else:
-            from ydata_profiling.model.spark import (  # type: ignore
-                correlations_spark as correlation_backend,
+            from ydata_profiling.model.spark import (
+                correlations_spark as correlation_backend,  # type: ignore
             )
 
         self.backend = correlation_backend
 
-    def get_method(self, method_name: str): # noqa: ANN201
+    def get_method(self, method_name: str):  # noqa: ANN201
         """Retrieve the appropriate correlation method class from the backend."""
         if hasattr(self.backend, method_name):
             return getattr(self.backend, method_name)
diff --git a/src/ydata_profiling/model/dataframe.py b/src/ydata_profiling/model/dataframe.py
index 9b4311352..dd01f8ffe 100644
--- a/src/ydata_profiling/model/dataframe.py
+++ b/src/ydata_profiling/model/dataframe.py
@@ -2,6 +2,7 @@
 from typing import Any
 
 import pandas as pd
+
 from ydata_profiling.config import Settings
 from ydata_profiling.model.pandas.dataframe_pandas import pandas_preprocess
@@ -12,6 +13,7 @@
     sparkDataFrame = TypeVar("sparkDataFrame")
 else:
     from pyspark.sql import DataFrame as sparkDataFrame  # type: ignore
+
     from ydata_profiling.model.spark.dataframe_spark import spark_preprocess
 
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index ab0eaf8e5..992c1840c 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -19,9 +19,9 @@ def composed_function(*args) -> List[Any]:
         result = args  # Start with the input arguments
         for func in functions:
             result = func(*result) if isinstance(result, tuple) else func(result)
-        return result # type: ignore
+        return result  # type: ignore
 
-    return composed_function # type: ignore
+    return composed_function  # type: ignore
 
 
 class Handler:
diff --git a/src/ydata_profiling/model/pandas/dataframe_pandas.py b/src/ydata_profiling/model/pandas/dataframe_pandas.py
index 3c6acbdfd..e98dc7d24 100644
--- a/src/ydata_profiling/model/pandas/dataframe_pandas.py
+++ b/src/ydata_profiling/model/pandas/dataframe_pandas.py
@@ -1,4 +1,3 @@
-
 import pandas as pd
 
 from ydata_profiling.config import Settings
diff --git a/src/ydata_profiling/model/spark/__init__.py b/src/ydata_profiling/model/spark/__init__.py
index d77bd0a46..b71241218 100644
--- a/src/ydata_profiling/model/spark/__init__.py
+++ b/src/ydata_profiling/model/spark/__init__.py
@@ -29,7 +29,7 @@
         for name in dir(module)
         if not name.startswith("_")
     }
-) # type: ignore
+)  # type: ignore
 
 # Explicitly list all available functions
 __all__ = [
diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/describe_counts_spark.py
index 5c764c23d..d7a091e7f 100644
--- a/src/ydata_profiling/model/spark/describe_counts_spark.py
+++ b/src/ydata_profiling/model/spark/describe_counts_spark.py
@@ -2,13 +2,15 @@
 Pyspark counts
 """
 from typing import Tuple
+
 import pandas as pd
+from pyspark.sql import DataFrame
+from pyspark.sql import functions as F
 
 from ydata_profiling.config import Settings
-
-from pyspark.sql import DataFrame, functions as F
 from ydata_profiling.model.summary_algorithms import describe_counts
 
+
 @describe_counts.register
 def describe_counts_spark(
     config: Settings, series: DataFrame, summary: dict
@@ -34,7 +36,9 @@ def describe_counts_spark(
     value_counts_index_sorted = value_counts.orderBy(F.asc(series.columns[0]))
 
     # Count missing values
-    n_missing = value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
+    n_missing = (
+        value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
+    )
     n_missing = n_missing["count"] if n_missing else 0
 
     # Convert top 200 values to Pandas for frequency table display
@@ -51,11 +55,9 @@ def describe_counts_spark(
     column = series.columns[0]
 
-
-    if series.dtypes[0][1] in ('int', 'float', 'bigint', 'double'):
+    if series.dtypes[0][1] in ("int", "float", "bigint", "double"):
         value_counts_no_nan = (
-            value_counts
-            .filter(F.col(column).isNotNull()) # Exclude NaNs
+            value_counts.filter(F.col(column).isNotNull())  # Exclude NaNs
             .filter(~F.isnan(F.col(column)))  # Remove implicit NaNs (if numeric column)
             .groupBy(column)  # Group by unique values
             .count()  # Count occurrences
@@ -64,8 +66,7 @@
     else:
         value_counts_no_nan = (
-            value_counts
-            .filter(F.col(column).isNotNull()) # Exclude NULLs
+            value_counts.filter(F.col(column).isNotNull())  # Exclude NULLs
             .groupBy(column)  # Group by unique timestamp values
             .count()  # Count occurrences
             .orderBy(F.desc("count"))  # Sort by most frequent timestamps
@@ -75,8 +76,12 @@
     # Convert to Pandas Series, forcing proper structure
     if value_counts_no_nan.count() > 0:
         pdf = value_counts_no_nan.toPandas().set_index(column)["count"]
-        summary["value_counts_without_nan"] = pd.Series(pdf) # Ensures it's always a Series
+        summary["value_counts_without_nan"] = pd.Series(
+            pdf
+        )  # Ensures it's always a Series
     else:
-        summary["value_counts_without_nan"] = pd.Series(dtype=int) # Ensures an empty Series
+        summary["value_counts_without_nan"] = pd.Series(
+            dtype=int
+        )  # Ensures an empty Series
 
-    return config, series, summary
\ No newline at end of file
+    return config, series, summary
diff --git a/src/ydata_profiling/model/spark/duplicates_spark.py b/src/ydata_profiling/model/spark/duplicates_spark.py
index 77f0379cf..95e32cc30 100644
--- a/src/ydata_profiling/model/spark/duplicates_spark.py
+++ b/src/ydata_profiling/model/spark/duplicates_spark.py
@@ -6,6 +6,7 @@
 from ydata_profiling.config import Settings
 from ydata_profiling.model.duplicates import get_duplicates
 
+
 @get_duplicates.register
 def get_duplicates_spark(
     config: Settings, df: DataFrame, supported_columns: Sequence
diff --git a/src/ydata_profiling/model/spark/sample_spark.py b/src/ydata_profiling/model/spark/sample_spark.py
index 0a608c6a7..05636b354 100644
--- a/src/ydata_profiling/model/spark/sample_spark.py
+++ b/src/ydata_profiling/model/spark/sample_spark.py
@@ -6,6 +6,7 @@
 from ydata_profiling.config import Settings
 from ydata_profiling.model.sample import Sample, get_sample
 
+
 @get_sample.register
 def get_sample_spark(config: Settings, df: DataFrame) -> List[Sample]:
     """Obtains a sample from head and tail of the DataFrame
diff --git a/src/ydata_profiling/model/spark/table_spark.py b/src/ydata_profiling/model/spark/table_spark.py
index 234c6531c..33e862e61 100644
--- a/src/ydata_profiling/model/spark/table_spark.py
+++ b/src/ydata_profiling/model/spark/table_spark.py
@@ -5,6 +5,7 @@
 from ydata_profiling.config import Settings
 from ydata_profiling.model.table import get_table_stats
 
+
 @get_table_stats.register
 def get_table_stats_spark(
     config: Settings, df: DataFrame, variable_stats: dict
diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py
index 4ddb155c3..d733a7d36 100644
--- a/src/ydata_profiling/model/summarizer.py
+++ b/src/ydata_profiling/model/summarizer.py
@@ -54,7 +54,7 @@ def summarize(
 class ProfilingSummarizer(BaseSummarizer):
     """A summarizer for Pandas DataFrames."""
 
-    def __init__(self, typeset: VisionsTypeset, use_spark: bool=False):
+    def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
         self.use_spark = use_spark and is_pyspark_installed()
         self._summary_map = self._create_summary_map()
         super().__init__(self._summary_map, typeset)
diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py
index 87a2acc8a..4fa9831a7 100644
--- a/src/ydata_profiling/model/summary.py
+++ b/src/ydata_profiling/model/summary.py
@@ -15,18 +15,19 @@
 spec = importlib.util.find_spec("pyspark")
 if spec is None:
-    from typing import TypeVar # noqa: E402
+    from typing import TypeVar  # noqa: E402
 
     sparkDataFrame = TypeVar("sparkDataFrame")  # type: ignore
     sparkSeries = TypeVar("sparkSeries")  # type: ignore
 else:
     from pyspark.sql import DataFrame as sparkDataFrame  # type: ignore
 
-    from ydata_profiling.model.spark.summary_spark import ( # noqa: E402
+    from ydata_profiling.model.spark.summary_spark import (  # noqa: E402
         get_series_descriptions_spark,
         spark_describe_1d,
     )
 
+
 def describe_1d(
     config: Settings,
     series: Any,
@@ -44,7 +45,7 @@ def describe_1d(
     """
     if isinstance(series, pd.Series):
         return pandas_describe_1d(config, series, summarizer, typeset)
-    elif isinstance(series, sparkDataFrame): # type: ignore
+    elif isinstance(series, sparkDataFrame):  # type: ignore
         return spark_describe_1d(config, series, summarizer, typeset)
     else:
         raise TypeError(f"Unsupported series type: {type(series)}")
@@ -59,7 +60,7 @@ def get_series_descriptions(
 ) -> dict:
     if isinstance(df, pd.DataFrame):
         return pandas_get_series_descriptions(config, df, summarizer, typeset, pbar)
-    elif isinstance(df, sparkDataFrame): # type: ignore
+    elif isinstance(df, sparkDataFrame):  # type: ignore
         return get_series_descriptions_spark(config, df, summarizer, typeset, pbar)
     else:
         raise TypeError(f"Unsupported dataframe type: {type(df)}")
diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py
index cd98fc741..e70c467d2 100644
--- a/src/ydata_profiling/model/summary_algorithms.py
+++ b/src/ydata_profiling/model/summary_algorithms.py
@@ -101,6 +101,7 @@ def named_aggregate_summary(series: pd.Series, key: str) -> dict:
     return summary
 
+
 @multimethod
 def describe_counts(
     config: Settings, series: Any, summary: dict
diff --git a/src/ydata_profiling/model/table.py b/src/ydata_profiling/model/table.py
index c0ebadb4e..e5eb6fdc2 100644
--- a/src/ydata_profiling/model/table.py
+++ b/src/ydata_profiling/model/table.py
@@ -4,6 +4,7 @@
 from ydata_profiling.config import Settings
 
+
 @multimethod
 def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict:
     raise NotImplementedError()
diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py
index b99f0ade5..acf18c7e0 100644
--- a/src/ydata_profiling/profile_report.py
+++ b/src/ydata_profiling/profile_report.py
@@ -190,7 +190,7 @@ def __validate_inputs(
             )
 
         if (
-            df is not None and df.rdd.isEmpty() # type: ignore
+            df is not None and df.rdd.isEmpty()  # type: ignore
         ):  # df.isEmpty is only support by 3.3.0 pyspark version
             raise ValueError(
                 "DataFrame is empty. Please" "provide a non-empty DataFrame."
             )
@@ -259,11 +259,9 @@ def summarizer(self) -> BaseSummarizer:
         if self._summarizer is None:
             use_spark = False
             if self._df_type is not pd.DataFrame:
-                use_spark=True
+                use_spark = True
 
-            self._summarizer = ProfilingSummarizer(
-                self.typeset, use_spark=use_spark
-            )
+            self._summarizer = ProfilingSummarizer(self.typeset, use_spark=use_spark)
         return self._summarizer
 
     @property
diff --git a/tests/backends/spark_backend/test_correlations_spark.py b/tests/backends/spark_backend/test_correlations_spark.py
index 6674ca2a3..ef16224a1 100644
--- a/tests/backends/spark_backend/test_correlations_spark.py
+++ b/tests/backends/spark_backend/test_correlations_spark.py
@@ -15,6 +15,7 @@
     spearman_compute as spark_spearman_compute,
 )
 
+
 @pytest.fixture
 def correlation_data_num(spark_session):
     correlation_testdata = pd.DataFrame(
@@ -79,6 +80,4 @@ def test_kendall_spark(correlation_data_cat):
     cfg = Settings()
 
     with pytest.raises(NotImplementedError):
-        kendall_compute(config=cfg,
-                        df=correlation_data_cat,
-                        summary={})
+        kendall_compute(config=cfg, df=correlation_data_cat, summary={})
diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py
index c11330017..5c608f2f0 100644
--- a/tests/backends/spark_backend/test_descriptions_spark.py
+++ b/tests/backends/spark_backend/test_descriptions_spark.py
@@ -374,7 +374,7 @@ def test_describe_spark_df(
         describe_data[column] = [
             True if i else False for i in describe_data[column]  # noqa: SIM210
         ]
-    pdf= pd.DataFrame({column: describe_data[column]})# Convert to Pandas DataFrame
+    pdf = pd.DataFrame({column: describe_data[column]})  # Convert to Pandas DataFrame
     # Ensure NaNs are replaced with None (Spark does not support NaN in non-float columns)
     pdf = pdf.where(pd.notna(pdf), None)
     sdf = spark_session.createDataFrame(pdf)
diff --git a/tests/conftest.py b/tests/conftest.py
index 0bda369d0..4a5864d84 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,11 +1,13 @@
 import shutil
 import sys
 from pathlib import Path
+
 import pytest
 
 try:
+    from pyspark import SparkConf, SparkContext
     from pyspark.sql import SparkSession
-    from pyspark import SparkContext, SparkConf
+
     has_spark = True
 except ImportError:
     has_spark = False
@@ -45,6 +47,7 @@ def test_output_dir(tmpdir_factory):
 def summarizer(typeset):
     return ProfilingSummarizer(typeset)
 
+
 @pytest.fixture(scope="function")
 def summarizer_spark(typeset):
     return ProfilingSummarizer(typeset, use_spark=True)
@@ -70,7 +73,7 @@ def pytest_runtest_setup(item):
             pytest.skip(f"cannot run on platform {plat}")
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def spark_context():
     """Fixture for SparkContext initialization.
@@ -94,7 +97,7 @@ def spark_context():
     sc.stop()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def spark_session(spark_context):
     """Fixture for SparkSession initialization.

From 6edf2d6d11e4e66982010337c4974d854a6a4501 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Tue, 18 Mar 2025 09:59:57 -0700
Subject: [PATCH 08/12] chore: create basesummarizer as a variable

---
 src/ydata_profiling/model/pandas/summary_pandas.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index 5878755f0..e869a456b 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -1,5 +1,3 @@
-# mypy: ignore-errors
-
 """Compute statistical description of datasets."""
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
@@ -14,6 +12,7 @@
 from ydata_profiling.model.typeset import ProfilingTypeSet
 from ydata_profiling.utils.dataframe import sort_column_names
 
+BaseSummarizer = "BaseSummarizer"
 
 def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
     return isinstance(typeset, ProfilingTypeSet) and series in typeset.type_schema
@@ -22,7 +21,7 @@ def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
 def pandas_describe_1d(
     config: Settings,
     series: pd.Series,
-    summarizer: "BaseSummarizer",  # type: ignore # type: ignore
+    summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
 ) -> dict:
     """Describe a series (infer the variable type, then calculate type-specific values).
@@ -69,7 +68,7 @@ def pandas_describe_1d(
 def pandas_get_series_descriptions(
     config: Settings,
     df: pd.DataFrame,
-    summarizer: "BaseSummarizer",  # noqa: F821 # type: ignore
+    summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     pbar: tqdm,
 ) -> dict:

From a75f96f42df5dc62d9a3ee0ec7ab4ad6a3ec069b Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Tue, 18 Mar 2025 10:15:22 -0700
Subject: [PATCH 09/12] chore: mypy type aliases

---
 src/ydata_profiling/model/pandas/summary_pandas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index e869a456b..c06ff0b43 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -1,7 +1,7 @@
 """Compute statistical description of datasets."""
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
-from typing import Tuple
+from typing import Tuple, TypeAlias
 
 import numpy as np
 import pandas as pd
@@ -12,7 +12,7 @@
 from ydata_profiling.model.typeset import ProfilingTypeSet
 from ydata_profiling.utils.dataframe import sort_column_names
 
-BaseSummarizer = "BaseSummarizer"
+BaseSummarizer: TypeAlias = "BaseSummarizer"
 
 def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
     return isinstance(typeset, ProfilingTypeSet) and series in typeset.type_schema

From 3a9c7a572d760c7a681ecc0f4932c82368fa374b Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Tue, 18 Mar 2025 10:23:39 -0700
Subject: [PATCH 10/12] chore: fix typealias for older python versions

---
 src/ydata_profiling/model/pandas/summary_pandas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index c06ff0b43..5ebd4f3a4 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -1,7 +1,7 @@
 """Compute statistical description of datasets."""
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
-from typing import Tuple, TypeAlias
+from typing import Tuple, Any
 
 import numpy as np
 import pandas as pd
@@ -12,7 +12,7 @@
 from ydata_profiling.model.typeset import ProfilingTypeSet
 from ydata_profiling.utils.dataframe import sort_column_names
 
-BaseSummarizer: TypeAlias = "BaseSummarizer"
+BaseSummarizer: Any = "BaseSummarizer"  # type: ignore
 
 def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
     return isinstance(typeset, ProfilingTypeSet) and series in typeset.type_schema

From a0612fc1e7446784375061caace45b189a450427 Mon Sep 17 00:00:00 2001
From: Azory YData Bot
Date: Tue, 18 Mar 2025 17:26:57 +0000
Subject: [PATCH 11/12] fix(linting): code formatting

---
 src/ydata_profiling/model/pandas/summary_pandas.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index 5ebd4f3a4..fb15bf98b 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -1,7 +1,7 @@
 """Compute statistical description of datasets."""
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
-from typing import Tuple, Any
+from typing import Any, Tuple
 
 import numpy as np
 import pandas as pd
@@ -14,6 +14,7 @@
 
 BaseSummarizer: Any = "BaseSummarizer"  # type: ignore
 
+
 def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
     return isinstance(typeset, ProfilingTypeSet) and series in typeset.type_schema

From 430bcde238a0930114aec17a4281a4a0fefc4964 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Tue, 18 Mar 2025 10:32:15 -0700
Subject: [PATCH 12/12] chore: ignore F821

---
 src/ydata_profiling/model/describe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py
index 3c81021e5..74bdf924a 100644
--- a/src/ydata_profiling/model/describe.py
+++ b/src/ydata_profiling/model/describe.py
@@ -29,7 +29,7 @@
 
 def describe(
     config: Settings,
-    df: Union[pd.DataFrame, "pyspark.sql.DataFrame"],  # type: ignore[name-defined]
+    df: Union[pd.DataFrame, "pyspark.sql.DataFrame"],  # type: ignore[name-defined] # noqa: F821
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     sample: Optional[dict] = None,