diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..7bb15bf5d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"] + + diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..e9ba6a39a 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,81 +1,90 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. 
- """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary - - -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence, Tuple + +import networkx as nx +from visions import VisionsTypeset + + +def 
compose(functions: Sequence[Callable[..., Any]]) -> Callable[..., Tuple[Any, ...]]: + """ + Compose a sequence of functions. + + The first function receives the composed function's arguments; each later + function receives the previous result unpacked (non-tuples are wrapped first). + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args: Any) -> Tuple[Any, ...]: + result: Tuple[Any, ...] = args + for func in functions: + result = func(*result) + # Ensure result is always a tuple for consistent unpacking + if not isinstance(result, tuple): + result = (result,) + return result + + return composed_function + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[Callable[..., Any]]], + typeset: VisionsTypeset, + *args: Any, + **kwargs: Any + ): + self.mapping = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] + ) + + def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Any: + """ + Execute the handler chain for the given dtype.
+ + :param dtype: The data type to handle + :param args: Arguments to pass to the handler chain + :return: The last element of the result tuple from the handler chain + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + summary = op(*args)[-1] + return summary + + +def get_render_map() -> Dict[str, Callable[..., Any]]: + import ydata_profiling.report.structure.variables as render_algorithms + + render_map: Dict[str, Callable[..., Any]] = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/model/spark/missing_spark.py b/src/ydata_profiling/model/spark/missing_spark.py index deacf1b89..384670232 100644 --- a/src/ydata_profiling/model/spark/missing_spark.py +++ b/src/ydata_profiling/model/spark/missing_spark.py @@ -13,50 +13,49 @@ class MissingnoBarSparkPatch: """ - Technical Debt : - This is a monkey patching object that allows usage of the library missingno as is for spark dataframes. - This is because missingno library's bar function always applies a isnull().sum() on dataframes in the visualisation - function, instead of allowing just values counts as an entry point. Thus, in order to calculate the - missing values dataframe in spark, we compute it first, then wrap it in this MissingnoBarSparkPatch object which - will be unwrapped by missingno and return the pre-computed value counts. 
- The best fix to this currently terrible patch is to submit a PR to missingno to separate preprocessing function - (compute value counts from df) and visualisation functions such that we can call the visualisation directly. - Unfortunately, the missingno library people have not really responded to our issues on gitlab. - See https://github.com/ResidentMario/missingno/issues/119. - We could also fork the missingno library and implement some of the code in our database, but that feels - like bad practice as well. + Adapter class to enable missingno library compatibility with Spark DataFrames. + + The missingno library's visualization functions internally call isnull().sum() + on dataframes. For Spark DataFrames, we pre-compute the null counts and wrap + them in this adapter to provide the expected interface. + + Note: This is a workaround for missingno's lack of separation between + data preprocessing and visualization. See: + https://github.com/ResidentMario/missingno/issues/119 """ def __init__( - self, df: DataFrame, columns: List[str] = None, original_df_size: int = None + self, + df: DataFrame, + columns: Optional[List[str]] = None, + original_df_size: Optional[int] = None ): self.df = df self.columns = columns self.original_df_size = original_df_size - def isnull(self) -> Any: - """ - This patches the .isnull().sum() function called by missingno library - """ - return self # return self to patch .sum() function + def isnull(self) -> "MissingnoBarSparkPatch": + """Returns self to enable chained .isnull().sum() calls.""" + return self def sum(self) -> DataFrame: - """ - This patches the .sum() function called by missingno library - """ - return self.df # return unwrapped dataframe + """Returns the pre-computed null counts dataframe.""" + return self.df def __len__(self) -> Optional[int]: - """ - This patches the len(df) function called by missingno library - """ + """Returns the original dataframe size.""" return self.original_df_size def missing_bar(config: 
Settings, df: DataFrame) -> str: + """Generate a missing values bar chart for Spark DataFrame. + + :param config: Report settings + :param df: Spark DataFrame + :return: HTML string of the bar chart + """ import pyspark.sql.functions as F - # FIXME: move to univariate data_nan_counts = ( df.agg( *[F.count(F.when(F.isnull(c) | F.isnan(c), c)).alias(c) for c in df.columns] @@ -71,6 +70,12 @@ def missing_bar(config: Settings, df: DataFrame) -> str: def missing_matrix(config: Settings, df: DataFrame) -> str: + """Generate a missing values matrix visualization for Spark DataFrame. + + :param config: Report settings + :param df: Spark DataFrame + :return: HTML string of the matrix visualization + """ df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count()) return plot_missing_matrix( config, @@ -81,6 +86,12 @@ def missing_matrix(config: Settings, df: DataFrame) -> str: def missing_heatmap(config: Settings, df: DataFrame) -> str: + """Generate a missing values heatmap for Spark DataFrame. + + :param config: Report settings + :param df: Spark DataFrame + :return: HTML string of the heatmap + """ df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count()) # Remove completely filled or completely empty variables. 
diff --git a/src/ydata_profiling/report/presentation/core/renderable.py b/src/ydata_profiling/report/presentation/core/renderable.py index 3f7f09f6c..1040c9656 100644 --- a/src/ydata_profiling/report/presentation/core/renderable.py +++ b/src/ydata_profiling/report/presentation/core/renderable.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, Optional +from typing import Any, Callable, Dict, Optional class Renderable(ABC): @@ -34,9 +34,14 @@ def classes(self) -> str: def render(self) -> Any: pass - def __str__(self): + def __str__(self) -> str: return self.__class__.__name__ @classmethod - def convert_to_class(cls, obj: "Renderable", flavour_func) -> None: # noqa: ANN001 + def convert_to_class(cls, obj: "Renderable", flavour_func: Callable[["Renderable"], None]) -> None: + """Rebind the object's class to this class in place. + + :param obj: The renderable object to convert + :param flavour_func: Flavour function for nested items; unused by this base implementation + """ obj.__class__ = cls diff --git a/src/ydata_profiling/report/presentation/flavours/flavours.py b/src/ydata_profiling/report/presentation/flavours/flavours.py index 10a5fa522..547a7a758 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavours.py +++ b/src/ydata_profiling/report/presentation/flavours/flavours.py @@ -1,27 +1,46 @@ """ Flavours registry information """ +from typing import Callable, Dict, Type + from ydata_profiling.report.presentation.core import Root from ydata_profiling.report.presentation.core.renderable import Renderable -_FLAVOUR_REGISTRY: dict = {} +_FLAVOUR_REGISTRY: Dict[str, Dict[Type[Renderable], Type[Renderable]]] = {} -def register_flavour(name: str, mapping: dict) -> None: +def register_flavour(name: str, mapping: Dict[Type[Renderable], Type[Renderable]]) -> None: + """Register a flavour mapping.
+ + :param name: The flavour name + :param mapping: Dictionary mapping core renderable types to flavour-specific types + """ _FLAVOUR_REGISTRY[name] = mapping -def get_flavour_mapping(name: str) -> dict: +def get_flavour_mapping(name: str) -> Dict[Type[Renderable], Type[Renderable]]: + """Get a registered flavour mapping. + + :param name: The flavour name + :return: The flavour mapping dictionary + :raises ValueError: If the flavour is not registered + """ if name not in _FLAVOUR_REGISTRY: raise ValueError(f"Flavour '{name}' is not registered.") return _FLAVOUR_REGISTRY[name] def apply_renderable_mapping( - mapping: dict, + mapping: Dict[Type[Renderable], Type[Renderable]], structure: Renderable, - flavour_func, # noqa: ANN001 + flavour_func: Callable[[Renderable], None], ) -> None: + """Apply flavour mapping to a renderable structure. + + :param mapping: The flavour mapping dictionary + :param structure: The renderable structure to transform + :param flavour_func: The flavour application function for recursive calls + """ mapping[type(structure)].convert_to_class(structure, flavour_func) diff --git a/src/ydata_profiling/report/presentation/flavours/html/table.py b/src/ydata_profiling/report/presentation/flavours/html/table.py index c5d71412b..59aa0eccf 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/table.py +++ b/src/ydata_profiling/report/presentation/flavours/html/table.py @@ -1,4 +1,4 @@ -from ydata_profiling.report.presentation.core.table import Table +from ydata_profiling.report.presentation.core import Table from ydata_profiling.report.presentation.flavours.html import templates diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates.py b/src/ydata_profiling/report/presentation/flavours/html/templates.py index 85e24a46a..30fcecda7 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates.py +++ b/src/ydata_profiling/report/presentation/flavours/html/templates.py @@ -1,6 +1,7 @@ """Contains all 
templates used for generating the HTML profile report""" import shutil from pathlib import Path +from typing import Any import jinja2 diff --git a/src/ydata_profiling/report/presentation/frequency_table_utils.py b/src/ydata_profiling/report/presentation/frequency_table_utils.py index f194bc514..6517cf621 100644 --- a/src/ydata_profiling/report/presentation/frequency_table_utils.py +++ b/src/ydata_profiling/report/presentation/frequency_table_utils.py @@ -7,8 +7,6 @@ def _frequency_table( freqtable: pd.Series, n: int, max_number_to_print: int ) -> List[Dict[str, Any]]: - # TODO: replace '' by '(Empty)' ? - if max_number_to_print > n: max_number_to_print = n @@ -26,7 +24,6 @@ def _frequency_table( max_freq = max(freqtable.values[0], freq_other, freq_missing) - # TODO: Correctly sort missing and other # No values if max_freq == 0: return [] @@ -77,7 +74,7 @@ def freq_table( freqtable: Union[pd.Series, List[pd.Series]], n: Union[int, List[int]], max_number_to_print: int, -) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: +) -> List[List[Dict[str, Any]]]: """Render the rows for a frequency table (value, count). Args: @@ -94,7 +91,7 @@ def freq_table( _frequency_table(v, n2, max_number_to_print) for v, n2 in zip(freqtable, n) ] else: - return [_frequency_table(freqtable, n, max_number_to_print)] # type: ignore + return [_frequency_table(freqtable, n, max_number_to_print)] def _extreme_obs_table( @@ -138,4 +135,4 @@ def extreme_obs_table( _extreme_obs_table(v, number_to_print, n1) for v, n1 in zip(freqtable, n) ] - return [_extreme_obs_table(freqtable, number_to_print, n)] # type: ignore + return [_extreme_obs_table(freqtable, number_to_print, n)]