Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 46 additions & 4 deletions superset/models/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1154,6 +1154,20 @@ def get_extra_cache_keys(self, query_obj: QueryObjectDict) -> list[Hashable]:
def get_template_processor(self, **kwargs: Any) -> BaseTemplateProcessor:
raise NotImplementedError()

def get_dataset_timezone(self) -> str | None:
"""
Get the timezone configured for this dataset from the extra JSON field.

Returns an IANA timezone name (e.g., "Europe/Berlin", "America/New_York")
or None if not configured.

``extra_dict`` is provided by concrete datasources (e.g. ``SqlaTable``)
rather than this mixin, so read it defensively: subclasses without it
simply have no configured timezone.
"""
extra = getattr(self, "extra_dict", None) or {}
return extra.get("timezone")

def get_fetch_values_predicate(
self,
template_processor: Optional[ # pylint: disable=unused-argument
Expand Down Expand Up @@ -1598,11 +1612,13 @@ def _get_timestamp_format(column: str | None) -> str | None:
and (col.get("is_dttm") if isinstance(col, dict) else col.is_dttm)
)

dataset_timezone = self.get_dataset_timezone()
dttm_cols = [
DateColumn(
timestamp_format=_get_timestamp_format(label),
offset=self.offset,
time_shift=query_object.time_shift,
timezone=dataset_timezone,
col_label=label,
)
for label in labels
Expand All @@ -1615,6 +1631,7 @@ def _get_timestamp_format(column: str | None) -> str | None:
timestamp_format=_get_timestamp_format(query_object.granularity),
offset=self.offset,
time_shift=query_object.time_shift,
timezone=dataset_timezone,
)
)

Expand Down Expand Up @@ -2762,19 +2779,44 @@ def get_time_filter( # pylint: disable=too-many-arguments
)
)

# Apply timezone conversion for time filter boundaries
# This converts user's local time boundaries to UTC for querying UTC-stored data
adjusted_start, adjusted_end = start_dttm, end_dttm
dataset_timezone = self.get_dataset_timezone()

if dataset_timezone and (start_dttm or end_dttm):
try:
tz = pytz.timezone(dataset_timezone)
utc = pytz.UTC

# The datetimes from the UI are naive (no timezone info)
# We interpret them as being in the dataset's configured timezone
# and convert them to UTC for comparison with UTC-stored data
if start_dttm:
local_start = tz.localize(start_dttm)
adjusted_start = local_start.astimezone(utc).replace(tzinfo=None)
if end_dttm:
local_end = tz.localize(end_dttm)
adjusted_end = local_end.astimezone(utc).replace(tzinfo=None)
except pytz.UnknownTimeZoneError:
logger.warning(
"Invalid timezone '%s' in dataset extra",
dataset_timezone,
)

l = [] # noqa: E741
if start_dttm:
if adjusted_start:
l.append(
col
>= self.db_engine_spec.get_text_clause(
self.dttm_sql_literal(start_dttm, time_col)
self.dttm_sql_literal(adjusted_start, time_col)
)
)
if end_dttm:
if adjusted_end:
l.append(
col
< self.db_engine_spec.get_text_clause(
self.dttm_sql_literal(end_dttm, time_col)
self.dttm_sql_literal(adjusted_end, time_col)
)
)
if not l:
Expand Down
26 changes: 25 additions & 1 deletion superset/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
import markdown as md
import nh3
import pandas as pd
import pytz

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use zoneinfo instead of pytz

Since Superset targets Python ≥3.10, replace the third-party pytz import with from zoneinfo import ZoneInfo, ZoneInfoNotFoundError, use ZoneInfo(_col.timezone) instead of pytz.timezone, and catch ZoneInfoNotFoundError rather than pytz.UnknownTimeZoneError.

Code Review Run #5607e8


Should Bito avoid suggestions like this for future reviews? (Manage Rules)

  • Yes, avoid them

import sqlalchemy as sa
from cryptography.hazmat.backends import default_backend
from cryptography.x509 import Certificate, load_pem_x509_certificate
Expand Down Expand Up @@ -1935,6 +1936,7 @@ class DateColumn:
timestamp_format: str | None = None
offset: int | None = None
time_shift: str | None = None
timezone: str | None = None # IANA timezone name

def __hash__(self) -> int:
return hash(self.col_label)
Expand All @@ -1948,11 +1950,13 @@ def get_legacy_time_column(
timestamp_format: str | None,
offset: int | None,
time_shift: str | None,
timezone: str | None = None,
) -> DateColumn:
return cls(
timestamp_format=timestamp_format,
offset=offset,
time_shift=time_shift,
timezone=timezone,
col_label=DTTM_ALIAS,
)

Expand Down Expand Up @@ -2037,8 +2041,28 @@ def normalize_dttm_col(

_process_datetime_column(df, _col)

if _col.offset:
if _col.timezone:
try:
tz = pytz.timezone(_col.timezone)
# Data is stored in UTC, convert to the dataset's configured timezone
# First make the datetime UTC-aware, then convert to target timezone
series = df[_col.col_label]
if not series.empty and series.notna().any():
# Convert UTC to target timezone
df[_col.col_label] = (
series.dt.tz_localize("UTC")
.dt.tz_convert(tz)
.dt.tz_localize(None) # Remove timezone info for display
)
except pytz.UnknownTimeZoneError:
logging.warning(
"Unknown timezone '%s', falling back to offset", _col.timezone
)
if _col.offset:
df[_col.col_label] += timedelta(hours=_col.offset)
elif _col.offset:
df[_col.col_label] += timedelta(hours=_col.offset)
Comment on lines +2061 to 2064

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: Offset handling is incorrect: the code adds the configured offset hours to the datetime (using += timedelta(hours=_col.offset)) which moves times in the wrong direction when converting local boundaries to UTC; additionally, the code treats offset 0 as falsy so a valid zero offset is ignored. Change the logic to subtract the offset (and check is not None) so offsets of 0 are handled. [logic error]

Severity Level: Critical 🚨
- ❌ Dashboard time filters produce incorrect UTC boundaries.
- ❌ Queries to /data return wrong time ranges.
- ⚠️ Charts and aggregations misaligned by timezone offset.
- ⚠️ Dataset timezone/offset settings produce incorrect results.
Suggested change
if _col.offset:
df[_col.col_label] += timedelta(hours=_col.offset)
elif _col.offset:
df[_col.col_label] += timedelta(hours=_col.offset)
if _col.offset is not None:
# Subtract offset to convert local time to UTC (offset is hours ahead)
df[_col.col_label] -= timedelta(hours=_col.offset)
elif _col.offset is not None:
# Subtract configured offset (support zero offset)
df[_col.col_label] -= timedelta(hours=_col.offset)
Steps of Reproduction ✅
1. Open a Python REPL or test and import normalize_dttm_col from superset/utils/core.py
(function begins at file line ~1934 where `def normalize_dttm_col(` is defined).

2. Construct a DataFrame with a datetime column named "__timestamp", e.g. in code at
runtime by:

   - df = pd.DataFrame({"__timestamp": pd.to_datetime(["2026-01-09 00:00:00"])})

3. Create a DateColumn instance with offset set (example using class defined around lines
~1847 in superset/utils/core.py):

   - from superset.utils.core import DateColumn

   - col = DateColumn(col_label="__timestamp", offset=6)

4. Call normalize_dttm_col(df, (col,)) (function at superset/utils/core.py: ~1934).
Observe resulting df["__timestamp"] has been increased by +6 hours (current code does +=
timedelta), whereas the PR description and intended behavior expect the local midnight
(UTC+6) to be converted to UTC by subtracting 6 hours. Also observe that setting offset=0
(DateColumn(offset=0)) will be ignored by current code because `if _col.offset:` treats 0
as falsy, so no adjustment is applied.
Prompt for AI Agent 🤖
This is a comment left during a code review.

**Path:** superset/utils/core.py
**Line:** 1974:1977
**Comment:**
	*Logic Error: Offset handling is incorrect: the code adds the configured `offset` hours to the datetime (using `+= timedelta(hours=_col.offset)`) which moves times in the wrong direction when converting local boundaries to UTC; additionally, the code treats offset `0` as falsy so a valid zero offset is ignored. Change the logic to subtract the offset (and check `is not None`) so offsets of `0` are handled.

Validate the correctness of the flagged issue. If correct, How can I resolve this? If you propose a fix, implement it and please make it concise.


if _col.time_shift is not None:
df[_col.col_label] += parse_human_timedelta(_col.time_shift)

Expand Down
1 change: 1 addition & 0 deletions superset/viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ def get_df(self, query_obj: QueryObjectDict | None = None) -> pd.DataFrame:
timestamp_format=timestamp_format,
offset=self.datasource.offset,
time_shift=self.form_data.get("time_shift"),
timezone=self.datasource.get_dataset_timezone(),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing test for timezone feature

The added timezone parameter introduces behavioral change (UTC-to-dataset-timezone conversion) without corresponding unit test coverage. Existing test_get_df_handles_dttm_col mocks offset but not get_dataset_timezone(), so timezone scenarios will fall through without validation. Per BITO.md adaptive rule [11730], new functionality should have dedicated tests covering success paths and edge cases.

Code Review Run #5607e8


Should Bito avoid suggestions like this for future reviews? (Manage Rules)

  • Yes, avoid them

)
]
),
Expand Down
72 changes: 72 additions & 0 deletions tests/unit_tests/utils/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,78 @@ def test_normalize_dttm_col_with_offset_and_time_shift() -> None:
assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 04:00:00"


def test_normalize_dttm_col_with_timezone() -> None:
"""UTC-stored values are converted to the dataset's configured timezone."""
# Winter date: Europe/Berlin is UTC+1, so 00:00 UTC renders as 01:00 local.
df = pd.DataFrame({"date_col": ["2020-01-01 00:00:00"]})
dttm_cols = (
DateColumn(
col_label="date_col",
timestamp_format="%Y-%m-%d %H:%M:%S",
timezone="Europe/Berlin",
),
)

normalize_dttm_col(df, dttm_cols)

assert is_datetime64_dtype(df["date_col"])
# tz-naive after conversion (display value), shifted by the zone offset.
assert df["date_col"][0].tzinfo is None
assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 01:00:00"


def test_normalize_dttm_col_timezone_handles_dst() -> None:
"""The timezone path respects DST, unlike a fixed hour offset."""
# Summer date: Europe/Berlin is UTC+2 (CEST), so 00:00 UTC renders as 02:00.
df = pd.DataFrame({"date_col": ["2020-07-01 00:00:00"]})
dttm_cols = (
DateColumn(
col_label="date_col",
timestamp_format="%Y-%m-%d %H:%M:%S",
timezone="Europe/Berlin",
),
)

normalize_dttm_col(df, dttm_cols)

assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-07-01 02:00:00"


def test_normalize_dttm_col_timezone_takes_precedence_over_offset() -> None:
"""When both timezone and offset are set, the timezone conversion wins."""
df = pd.DataFrame({"date_col": ["2020-01-01 00:00:00"]})
dttm_cols = (
DateColumn(
col_label="date_col",
timestamp_format="%Y-%m-%d %H:%M:%S",
timezone="Europe/Berlin",
offset=10,
),
)

normalize_dttm_col(df, dttm_cols)

# +1h from the Berlin (winter) conversion, NOT +10h from the offset.
assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 01:00:00"


def test_normalize_dttm_col_invalid_timezone_falls_back_to_offset() -> None:
"""An unknown timezone falls back to the plain hour offset."""
df = pd.DataFrame({"date_col": ["2020-01-01 00:00:00"]})
dttm_cols = (
DateColumn(
col_label="date_col",
timestamp_format="%Y-%m-%d %H:%M:%S",
timezone="Not/AZone",
offset=3,
),
)

normalize_dttm_col(df, dttm_cols)

assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 03:00:00"


def test_normalize_dttm_col_invalid_date_coerced() -> None:
"""Test that invalid dates are coerced to NaT."""
df = pd.DataFrame({"date_col": ["2020-01-01", "invalid_date", "2022-01-01"]})
Expand Down
Loading