diff --git a/superset/commands/dashboard/export_example.py b/superset/commands/dashboard/export_example.py index 7924fe0ad4d1..477e6a70ebf0 100644 --- a/superset/commands/dashboard/export_example.py +++ b/superset/commands/dashboard/export_example.py @@ -31,12 +31,15 @@ from superset.commands.base import BaseCommand from superset.commands.dashboard.exceptions import DashboardNotFoundError +from superset.common.db_query_status import QueryStatus from superset.daos.dashboard import DashboardDAO +from superset.exceptions import SupersetSecurityException if TYPE_CHECKING: from superset.connectors.sqla.models import SqlaTable from superset.models.dashboard import Dashboard from superset.models.slice import Slice + from superset.superset_typing import QueryObjectDict from superset.sql.parse import SQLStatement, Table @@ -235,8 +238,6 @@ def export_dataset_data( sample_rows: int | None = None, ) -> bytes | None: """Export dataset data to Parquet format. Returns bytes or None on failure.""" - import pandas as pd # pylint: disable=import-outside-toplevel - from superset import db # pylint: disable=import-outside-toplevel # Ensure dataset is attached to session and relationships are loaded @@ -251,35 +252,50 @@ def export_dataset_data( logger.warning("Dataset %s has no database", dataset.table_name) return None + # Only export rows the requester is entitled to read. The dataset's own + # access check is applied here, and the rows below are fetched through the + # dataset's query builder (the same path the chart-data API uses) so that + # per-row filters are applied consistently. A requester without access to + # the dataset yields no data file rather than the raw underlying table. try: - logger.info("Exporting data for %s to Parquet...", dataset.table_name) + dataset.raise_for_access() + except SupersetSecurityException: + logger.info( + "Skipping data export for dataset %s: requester not entitled", + dataset.table_name, + ) + return None - # Check if this is a virtual dataset (SQL-based) - if dataset.sql: - sql = dataset.sql - else: - # For physical tables, build SELECT query from columns - columns = [col.column_name for col in dataset.columns if not col.expression] - - if not columns: - logger.warning("No columns to export for %s", dataset.table_name) - return None - - # Build simple SELECT query (quote identifiers to handle spaces/keywords) - column_list = ", ".join(f'"{c}"' for c in columns) - quoted_table = f'"{dataset.table_name}"' - if dataset.schema: - table_ref = f'"{dataset.schema}".{quoted_table}' - else: - table_ref = quoted_table - sql = f"SELECT {column_list} FROM {table_ref}" # noqa: S608 + columns = [col.column_name for col in dataset.columns if not col.expression] + if not columns: + logger.warning("No columns to export for %s", dataset.table_name) + return None - with dataset.database.get_sqla_engine() as engine: - df = pd.read_sql(sql, engine) + try: + logger.info("Exporting data for %s to Parquet...", dataset.table_name) - if sample_rows and len(df) > sample_rows: - df = df.head(sample_rows) - logger.info("Sampled to %d rows", sample_rows) + # Fetch through the dataset's query builder so that the projection, + # per-row filters, and (for virtual datasets) the wrapping query are + # produced the same way as the chart-data path. The row cap is applied + # at the SQL level via row_limit rather than after a full table read. + query_obj: QueryObjectDict = { + "columns": columns, + "metrics": [], + "orderby": [], + "is_timeseries": False, + "filter": [], + "extras": {}, + "row_limit": sample_rows, + } + result = dataset.query(query_obj) + if result.status == QueryStatus.FAILED: + # The query path reports failures via status rather than raising; + # omit the data file instead of writing an empty/partial Parquet. + logger.warning( + "Query failed while exporting data for %s", dataset.table_name + ) + return None + df = result.df # Write to bytes buffer buf = BytesIO() diff --git a/tests/integration_tests/dashboards/api_tests.py b/tests/integration_tests/dashboards/api_tests.py index bdc42ed9287c..ef5104b91841 100644 --- a/tests/integration_tests/dashboards/api_tests.py +++ b/tests/integration_tests/dashboards/api_tests.py @@ -2961,6 +2961,70 @@ def test_export_as_example_content_disposition(self): assert "Content-Disposition" in rv.headers assert "_example.zip" in rv.headers["Content-Disposition"] + @pytest.mark.usefixtures("load_birth_names_dashboard_with_slices") + def test_export_as_example_data_respects_row_level_filters(self) -> None: + """ + Dashboard API: export_as_example with export_data must apply the + dataset's row-level filters, so a restricted user only receives the + rows they are entitled to (not the full underlying table). + """ + import pandas as pd + + from superset.connectors.sqla.models import RowLevelSecurityFilter + + table = self.get_table(name="birth_names") + gamma = security_manager.find_role("Gamma") + # birth_names access so the row-level filter (not the access check) is + # what scopes the result, and permission to call the endpoint. Track + # what we grant so the shared test role is restored afterwards. + granted = [] + for perm, view in ( + ("datasource_access", table.perm), + ("can_export_as_example", "Dashboard"), + ("can_export", "Dashboard"), + ): + pvm = security_manager.find_permission_view_menu(perm, view) + if pvm and pvm not in gamma.permissions: + gamma.permissions.append(pvm) + granted.append(pvm) + rls = RowLevelSecurityFilter( + name="export_example_girls_only", + filter_type="Regular", + clause="gender = 'girl'", + group_key="gender", + ) + rls.tables.append(table) + rls.roles.append(gamma) + db.session.add(rls) + db.session.commit() + try: + self.login(GAMMA_USERNAME) + dashboard_id = get_dashboards_ids(["births"])[0] + uri = ( + f"api/v1/dashboard/{dashboard_id}/export_as_example/" + "?export_data=true&sample_rows=500" + ) + rv = self.client.get(uri) + assert rv.status_code == 200 + + with ZipFile(BytesIO(rv.data)) as zf: + parquet_files = [n for n in zf.namelist() if n.endswith(".parquet")] + assert parquet_files, f"expected a data file: {zf.namelist()}" + for name in parquet_files: + df = pd.read_parquet(BytesIO(zf.read(name))) + if "gender" in df.columns: + unexpected = set(df["gender"].unique()) - {"girl"} + assert not unexpected, ( + f"export returned rows outside the configured " + f"row-level filter: {unexpected}" + ) + finally: + db.session.delete(rls) + for pvm in granted: + if pvm in gamma.permissions: + gamma.permissions.remove(pvm) + db.session.commit() + @patch("superset.commands.database.importers.v1.utils.add_permissions") def test_import_dashboard(self, mock_add_permissions): """ diff --git a/tests/unit_tests/commands/dashboard/export_example_test.py b/tests/unit_tests/commands/dashboard/export_example_test.py index 8aeab21de022..edc5c172ef83 100644 --- a/tests/unit_tests/commands/dashboard/export_example_test.py +++ b/tests/unit_tests/commands/dashboard/export_example_test.py @@ -16,6 +16,9 @@ # under the License. from __future__ import annotations +from collections.abc import Iterator +from io import BytesIO +from typing import Any from unittest.mock import MagicMock, patch from uuid import uuid4 @@ -27,10 +30,14 @@ _make_bytes_generator, _make_yaml_generator, export_chart, + export_dataset_data, export_dataset_yaml, ExportExampleCommand, sanitize_filename, ) +from superset.common.db_query_status import QueryStatus +from superset.errors import ErrorLevel, SupersetError, SupersetErrorType +from superset.exceptions import SupersetSecurityException def test_sanitize_filename_basic(): @@ -321,3 +328,98 @@ def test_export_example_command_no_data(): assert "dataset.yaml" in files assert "data.parquet" not in files assert "dashboard.yaml" in files + + +def _make_data_export_dataset(rows: list[dict[str, Any]]) -> MagicMock: + """A dataset mock whose query() returns ``rows`` as a DataFrame.""" + import pandas as pd + + dataset = MagicMock() + dataset.table_name = "private_table" + dataset.database = MagicMock() # truthy + dataset.columns = [ + MagicMock(column_name=name, expression=None) for name in ("uid", "data") + ] + result = MagicMock() + result.status = QueryStatus.SUCCESS + result.df = pd.DataFrame(rows) + dataset.query.return_value = result + return dataset + + +@pytest.fixture +def patched_db() -> Iterator[MagicMock]: + """Patch ``superset.db`` so merge() returns the dataset unchanged.""" + with patch("superset.db") as mock_db: + mock_db.session.merge.side_effect = lambda d: d + yield mock_db + + +def test_export_dataset_data_skips_when_access_denied(patched_db: MagicMock) -> None: + """ + A requester without access to the dataset must get no data file, and the + underlying rows must never be fetched (no fallback raw read). + """ + dataset = _make_data_export_dataset([{"uid": 1, "data": "secret"}]) + dataset.raise_for_access.side_effect = SupersetSecurityException( + SupersetError( + message="denied", + error_type=SupersetErrorType.DATASOURCE_SECURITY_ACCESS_ERROR, + level=ErrorLevel.ERROR, + ) + ) + + assert export_dataset_data(dataset) is None + + dataset.raise_for_access.assert_called_once() + dataset.query.assert_not_called() + + +def test_export_dataset_data_fetches_through_query_path(patched_db: MagicMock) -> None: + """ + Rows are fetched through the dataset's own query builder (which applies the + per-row filters), and exactly those rows are what gets written to Parquet. + """ + import pandas as pd + + own_rows = [{"uid": 6, "data": "row-for-user-6"}] + dataset = _make_data_export_dataset(own_rows) + + payload = export_dataset_data(dataset) + + assert payload is not None + dataset.raise_for_access.assert_called_once() + dataset.query.assert_called_once() + + # The fetch goes through query() with a column projection, not a raw read. + query_obj = dataset.query.call_args.args[0] + assert query_obj["columns"] == ["uid", "data"] + assert query_obj["is_timeseries"] is False + assert "row_limit" in query_obj + + # Only the rows query() returned are exported. + exported = pd.read_parquet(BytesIO(payload)) + assert exported.to_dict("records") == own_rows + + +def test_export_dataset_data_applies_row_limit_at_query_level( + patched_db: MagicMock, +) -> None: + """sample_rows is passed as a SQL-level row_limit, not applied post-fetch.""" + dataset = _make_data_export_dataset([{"uid": 1, "data": "a"}]) + + export_dataset_data(dataset, sample_rows=5) + + query_obj = dataset.query.call_args.args[0] + assert query_obj["row_limit"] == 5 + + +def test_export_dataset_data_skips_on_failed_query(patched_db: MagicMock) -> None: + """ + The query path signals failure via status rather than raising, so a failed + query must yield no data file instead of an empty/partial Parquet. + """ + dataset = _make_data_export_dataset([{"uid": 1, "data": "a"}]) + dataset.query.return_value.status = QueryStatus.FAILED + + assert export_dataset_data(dataset) is None