Skip to content
70 changes: 43 additions & 27 deletions superset/commands/dashboard/export_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,15 @@

from superset.commands.base import BaseCommand
from superset.commands.dashboard.exceptions import DashboardNotFoundError
from superset.common.db_query_status import QueryStatus
from superset.daos.dashboard import DashboardDAO
from superset.exceptions import SupersetSecurityException

if TYPE_CHECKING:
from superset.connectors.sqla.models import SqlaTable
from superset.models.dashboard import Dashboard
from superset.models.slice import Slice
from superset.superset_typing import QueryObjectDict

from superset.sql.parse import SQLStatement, Table

Expand Down Expand Up @@ -235,8 +238,6 @@ def export_dataset_data(
sample_rows: int | None = None,
) -> bytes | None:
"""Export dataset data to Parquet format. Returns bytes or None on failure."""
import pandas as pd # pylint: disable=import-outside-toplevel

from superset import db # pylint: disable=import-outside-toplevel

# Ensure dataset is attached to session and relationships are loaded
Expand All @@ -251,35 +252,50 @@ def export_dataset_data(
logger.warning("Dataset %s has no database", dataset.table_name)
return None

# Only export rows the requester is entitled to read. The dataset's own
# access check is applied here, and the rows below are fetched through the
# dataset's query builder (the same path the chart-data API uses) so that
# per-row filters are applied consistently. A requester without access to
# the dataset yields no data file rather than the raw underlying table.
try:
logger.info("Exporting data for %s to Parquet...", dataset.table_name)
dataset.raise_for_access()
except SupersetSecurityException:
logger.info(
"Skipping data export for dataset %s: requester not entitled",
dataset.table_name,
)
return None

# Check if this is a virtual dataset (SQL-based)
if dataset.sql:
sql = dataset.sql
else:
# For physical tables, build SELECT query from columns
columns = [col.column_name for col in dataset.columns if not col.expression]

if not columns:
logger.warning("No columns to export for %s", dataset.table_name)
return None

# Build simple SELECT query (quote identifiers to handle spaces/keywords)
column_list = ", ".join(f'"{c}"' for c in columns)
quoted_table = f'"{dataset.table_name}"'
if dataset.schema:
table_ref = f'"{dataset.schema}".{quoted_table}'
else:
table_ref = quoted_table
sql = f"SELECT {column_list} FROM {table_ref}" # noqa: S608
columns = [col.column_name for col in dataset.columns if not col.expression]
if not columns:
logger.warning("No columns to export for %s", dataset.table_name)
return None

with dataset.database.get_sqla_engine() as engine:
df = pd.read_sql(sql, engine)
try:
logger.info("Exporting data for %s to Parquet...", dataset.table_name)

if sample_rows and len(df) > sample_rows:
df = df.head(sample_rows)
logger.info("Sampled to %d rows", sample_rows)
# Fetch through the dataset's query builder so that the projection,
# per-row filters, and (for virtual datasets) the wrapping query are
# produced the same way as the chart-data path. The row cap is applied
# at the SQL level via row_limit rather than after a full table read.
query_obj: QueryObjectDict = {
"columns": columns,
"metrics": [],
"orderby": [],
"is_timeseries": False,
"filter": [],
"extras": {},
"row_limit": sample_rows,
}
result = dataset.query(query_obj)
if result.status == QueryStatus.FAILED:
# The query path reports failures via status rather than raising;
# omit the data file instead of writing an empty/partial Parquet.
logger.warning(
"Query failed while exporting data for %s", dataset.table_name
)
return None
df = result.df

# Write to bytes buffer
buf = BytesIO()
Expand Down
64 changes: 64 additions & 0 deletions tests/integration_tests/dashboards/api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2961,6 +2961,70 @@ def test_export_as_example_content_disposition(self):
assert "Content-Disposition" in rv.headers
assert "_example.zip" in rv.headers["Content-Disposition"]

@pytest.mark.usefixtures("load_birth_names_dashboard_with_slices")
def test_export_as_example_data_respects_row_level_filters(self) -> None:
"""
Dashboard API: export_as_example with export_data must apply the
dataset's row-level filters, so a restricted user only receives the
rows they are entitled to (not the full underlying table).
"""
import pandas as pd

from superset.connectors.sqla.models import RowLevelSecurityFilter

table = self.get_table(name="birth_names")
gamma = security_manager.find_role("Gamma")
# birth_names access so the row-level filter (not the access check) is
# what scopes the result, and permission to call the endpoint. Track
# what we grant so the shared test role is restored afterwards.
granted = []
for perm, view in (
("datasource_access", table.perm),
("can_export_as_example", "Dashboard"),
("can_export", "Dashboard"),
):
pvm = security_manager.find_permission_view_menu(perm, view)
if pvm and pvm not in gamma.permissions:
gamma.permissions.append(pvm)
granted.append(pvm)
rls = RowLevelSecurityFilter(
name="export_example_girls_only",
filter_type="Regular",
clause="gender = 'girl'",
group_key="gender",
)
rls.tables.append(table)
rls.roles.append(gamma)
db.session.add(rls)
db.session.commit()
try:
self.login(GAMMA_USERNAME)
dashboard_id = get_dashboards_ids(["births"])[0]
uri = (
f"api/v1/dashboard/{dashboard_id}/export_as_example/"
"?export_data=true&sample_rows=500"
)
rv = self.client.get(uri)
assert rv.status_code == 200

with ZipFile(BytesIO(rv.data)) as zf:
parquet_files = [n for n in zf.namelist() if n.endswith(".parquet")]
assert parquet_files, f"expected a data file: {zf.namelist()}"
for name in parquet_files:
df = pd.read_parquet(BytesIO(zf.read(name)))
if "gender" in df.columns:
unexpected = set(df["gender"].unique()) - {"girl"}
assert not unexpected, (
f"export returned rows outside the configured "
f"row-level filter: {unexpected}"
)
finally:
db.session.delete(rls)
for pvm in granted:
if pvm in gamma.permissions:
gamma.permissions.remove(pvm)
db.session.commit()

@patch("superset.commands.database.importers.v1.utils.add_permissions")
def test_import_dashboard(self, mock_add_permissions):
"""
Expand Down
102 changes: 102 additions & 0 deletions tests/unit_tests/commands/dashboard/export_example_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
# under the License.
from __future__ import annotations

from collections.abc import Iterator
from io import BytesIO
from typing import Any
from unittest.mock import MagicMock, patch
from uuid import uuid4

Expand All @@ -27,10 +30,14 @@
_make_bytes_generator,
_make_yaml_generator,
export_chart,
export_dataset_data,
export_dataset_yaml,
ExportExampleCommand,
sanitize_filename,
)
from superset.common.db_query_status import QueryStatus
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.exceptions import SupersetSecurityException


def test_sanitize_filename_basic():
Expand Down Expand Up @@ -321,3 +328,98 @@ def test_export_example_command_no_data():
assert "dataset.yaml" in files
assert "data.parquet" not in files
assert "dashboard.yaml" in files


def _make_data_export_dataset(rows: list[dict[str, Any]]) -> MagicMock:
"""A dataset mock whose query() returns ``rows`` as a DataFrame."""
import pandas as pd

dataset = MagicMock()
dataset.table_name = "private_table"
dataset.database = MagicMock() # truthy
dataset.columns = [
MagicMock(column_name=name, expression=None) for name in ("uid", "data")
]
result = MagicMock()
result.status = QueryStatus.SUCCESS
result.df = pd.DataFrame(rows)
dataset.query.return_value = result
return dataset


@pytest.fixture
Comment thread
sha174n marked this conversation as resolved.
def patched_db() -> Iterator[MagicMock]:
"""Patch ``superset.db`` so merge() returns the dataset unchanged."""
with patch("superset.db") as mock_db:
mock_db.session.merge.side_effect = lambda d: d
yield mock_db


def test_export_dataset_data_skips_when_access_denied(patched_db: MagicMock) -> None:
"""
A requester without access to the dataset must get no data file, and the
underlying rows must never be fetched (no fallback raw read).
"""
dataset = _make_data_export_dataset([{"uid": 1, "data": "secret"}])
dataset.raise_for_access.side_effect = SupersetSecurityException(
SupersetError(
message="denied",
error_type=SupersetErrorType.DATASOURCE_SECURITY_ACCESS_ERROR,
level=ErrorLevel.ERROR,
)
)

assert export_dataset_data(dataset) is None

dataset.raise_for_access.assert_called_once()
dataset.query.assert_not_called()


def test_export_dataset_data_fetches_through_query_path(patched_db: MagicMock) -> None:
"""
Rows are fetched through the dataset's own query builder (which applies the
per-row filters), and exactly those rows are what gets written to Parquet.
"""
import pandas as pd

own_rows = [{"uid": 6, "data": "row-for-user-6"}]
dataset = _make_data_export_dataset(own_rows)

payload = export_dataset_data(dataset)

assert payload is not None
dataset.raise_for_access.assert_called_once()
dataset.query.assert_called_once()

# The fetch goes through query() with a column projection, not a raw read.
query_obj = dataset.query.call_args.args[0]
assert query_obj["columns"] == ["uid", "data"]
assert query_obj["is_timeseries"] is False
assert "row_limit" in query_obj

# Only the rows query() returned are exported.
exported = pd.read_parquet(BytesIO(payload))
assert exported.to_dict("records") == own_rows


def test_export_dataset_data_applies_row_limit_at_query_level(
patched_db: MagicMock,
) -> None:
"""sample_rows is passed as a SQL-level row_limit, not applied post-fetch."""
dataset = _make_data_export_dataset([{"uid": 1, "data": "a"}])

export_dataset_data(dataset, sample_rows=5)

query_obj = dataset.query.call_args.args[0]
assert query_obj["row_limit"] == 5


def test_export_dataset_data_skips_on_failed_query(patched_db: MagicMock) -> None:
"""
The query path signals failure via status rather than raising, so a failed
query must yield no data file instead of an empty/partial Parquet.
"""
dataset = _make_data_export_dataset([{"uid": 1, "data": "a"}])
dataset.query.return_value.status = QueryStatus.FAILED

assert export_dataset_data(dataset) is None
Loading