From 360bb8d04b199d4508bfdb0bb9f4256c35887c5a Mon Sep 17 00:00:00 2001 From: lepy Date: Mon, 29 Jun 2026 16:50:28 +0200 Subject: [PATCH] feat(dtypes): date / time / duration / decimal dtypes (reine stdlib) Vier neue Attribut-dtypes in der Registry (sdata/dtypes.py), strikt additiv: - date -> datetime.date (xsd:date) ISO 'YYYY-MM-DD'; datetime -> date - time -> datetime.time (xsd:time) ISO 'HH:MM:SS' - duration -> datetime.timedelta (xsd:duration) ISO 8601 (PnW/PnDTnHnMnS, negatives Vorzeichen, Zahl=Sekunden); Jahre/Monate nicht als timedelta darstellbar -> DtypeError - decimal -> decimal.Decimal (xsd:decimal) exakte Numerik (via str, keine Float-Binaer-Artefakte) Anbindung: DTYPES_INV (Klasse->Name; datetime.datetime bleibt timestamp), json_default (Roh-Objekte JSON-sicher), DtypeSpec.to_json (kanonische ISO-/ Decimal-Strings fuer JSON-LD @value), semantic._DTYPE_FROM_XSD (from_jsonld- Rueckabbildung). lenient/strict-Coercion wie bei den bestehenden dtypes. Tests: Coercion (gueltig/leer/ungueltig/strict), to_json, json_default, resolve, xsd-Map, JSON- und JSON-LD-Round-Trip. dtypes.py 244/244 100%, semantic.py 100%. Doku (metadata-jsonld) + CHANGELOG [1.3.0] aktualisiert (1.3.0 noch unveroeffentlicht). Kanonische CI: 611 passed, 7 skipped, TOTAL 100%; mkdocs --strict gruen. --- CHANGELOG.md | 5 ++ docs/usage/metadata-jsonld.md | 8 ++- sdata/dtypes.py | 132 +++++++++++++++++++++++++++++++++- sdata/semantic.py | 2 + tests/test_dtypes.py | 122 +++++++++++++++++++++++++++++++ 5 files changed, 264 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1302645..6421f39 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,11 @@ native, format-agnostic metadata embedding for images. 
Core dependencies remain ### Added +- **New attribute dtypes** `date`, `time`, `duration` and `decimal` (pure stdlib): + `date`/`time` (`xsd:date`/`xsd:time`), `duration` as a `datetime.timedelta` parsed + from ISO 8601 (`xsd:duration`), and `decimal` as `decimal.Decimal` for exact numerics + (`xsd:decimal`). Round-trip through JSON and JSON-LD; lenient/`strict=` coercion as + for the existing dtypes. - **Native image metadata (RFC 0005).** New pure-Python, Pillow-free module `sdata.imagemeta` embeds/reads sdata metadata **natively** into six containers with one API (`detect_format`/`embed`/`extract`/`supported_formats`): **PNG** (`iTXt`), diff --git a/docs/usage/metadata-jsonld.md b/docs/usage/metadata-jsonld.md index 8fd77c9..49d4e4a 100644 --- a/docs/usage/metadata-jsonld.md +++ b/docs/usage/metadata-jsonld.md @@ -18,9 +18,11 @@ pip install "sdata[schema]" # jsonschema -> JSON-Schema validation Every [`Attribute`][sdata.metadata.Attribute] carries `name, value, unit, dtype, description, label, required, ontology`. Supported -`dtype` values are `str, int, float, bool, list, timestamp, bytes, json, uri`. -Coercion is lenient by default; pass `strict=True` to raise -`sdata.dtypes.DtypeError` on invalid values. +`dtype` values are `str, int, float, bool, list, timestamp, bytes, json, uri, +date, time, duration, decimal`. Each maps to an XSD type for JSON-LD (e.g. `date` +→ `xsd:date`, `duration` → `xsd:duration` as ISO 8601 / `timedelta`, `decimal` → +`xsd:decimal` for exact numerics). Coercion is lenient by default; pass +`strict=True` to raise `sdata.dtypes.DtypeError` on invalid values. ```python import pandas as pd diff --git a/sdata/dtypes.py b/sdata/dtypes.py index 937ad16..8aefcce 100644 --- a/sdata/dtypes.py +++ b/sdata/dtypes.py @@ -15,12 +15,15 @@ * **Strikt opt-in** – ``strict=True`` wirft ``DtypeError`` statt still zu degradieren. 
import base64
import binascii
import datetime
import json as _json
import re
from decimal import Decimal, InvalidOperation
from urllib.parse import urlsplit

import numpy as np


def _c_date(value, strict):
    """Coerce *value* to a ``datetime.date`` (``xsd:date``).

    ``None`` and ``""`` yield ``None``.  A ``datetime.datetime`` is reduced to
    its date part, a ``datetime.date`` is returned unchanged, and anything
    else is parsed from ``str(value)`` with ``date.fromisoformat``.

    *strict* is part of the coercer signature but is not read here; a value
    that cannot be parsed always raises ``DtypeError``.
    """
    if value is None or value == "":
        return None
    if isinstance(value, datetime.datetime):  # check before date (subclass!)
        return value.date()
    if isinstance(value, datetime.date):
        return value
    try:
        return datetime.date.fromisoformat(str(value))
    except (ValueError, TypeError) as exp:
        raise DtypeError("date: {!r} (ISO 8601 'YYYY-MM-DD')".format(value)) from exp


def _c_time(value, strict):
    """Coerce *value* to a ``datetime.time`` (``xsd:time``).

    ``None`` and ``""`` yield ``None``.  A ``datetime.datetime`` contributes
    its ``timetz()``, a ``datetime.time`` is returned unchanged, and anything
    else is parsed from ``str(value)`` with ``time.fromisoformat``.

    *strict* is not read here; unparsable input always raises ``DtypeError``.
    """
    if value is None or value == "":
        return None
    if isinstance(value, datetime.datetime):
        return value.timetz()
    if isinstance(value, datetime.time):
        return value
    try:
        return datetime.time.fromisoformat(str(value))
    except (ValueError, TypeError) as exp:
        raise DtypeError("time: {!r} (ISO 8601 'HH:MM:SS')".format(value)) from exp


#: ISO 8601 duration without years/months (not representable as a ``timedelta``).
#: The group names double as ``datetime.timedelta`` keyword arguments.
_ISO_DURATION = re.compile(
    r"^(?P<sign>-?)P"
    r"(?:(?P<weeks>\d+(?:\.\d+)?)W)?"
    r"(?:(?P<days>\d+(?:\.\d+)?)D)?"
    r"(?:T"
    r"(?:(?P<hours>\d+(?:\.\d+)?)H)?"
    r"(?:(?P<minutes>\d+(?:\.\d+)?)M)?"
    r"(?:(?P<seconds>\d+(?:\.\d+)?)S)?"
    r")?$"
)


def _c_duration(value, strict):
    """Coerce *value* to a ``datetime.timedelta`` (``xsd:duration``).

    Accepted inputs:

    * ``None`` / ``""`` -> ``None``
    * ``datetime.timedelta`` -> returned unchanged
    * ``int`` / ``float`` (but not ``bool``) -> that many seconds
    * anything else -> ``str(value)`` parsed as an ISO 8601 duration made of
      weeks, days, hours, minutes and seconds, with an optional leading ``-``

    *strict* is not read here.

    Raises:
        DtypeError: for ``bool``, for text that does not match the pattern or
            names no component (``"P"``, ``"PT"``), for year/month designators,
            and for numbers a ``timedelta`` cannot hold (NaN, infinity,
            out-of-range magnitudes).
    """
    if value is None or value == "":
        return None
    if isinstance(value, datetime.timedelta):
        return value
    if isinstance(value, bool):  # bool is an int subclass -> reject
        raise DtypeError("duration: {!r}".format(value))
    if isinstance(value, (int, float)):
        parts, negative = {"seconds": value}, False  # bare number = seconds
    else:
        match = _ISO_DURATION.match(str(value).strip())
        parts = ({k: float(v) for k, v in match.groupdict().items()
                  if v is not None and k != "sign"} if match else None)
        if not parts:
            raise DtypeError("duration: {!r} (ISO 8601 like 'PT1H30M'; years/months "
                             "are not representable as a timedelta)".format(value))
        negative = match.group("sign") == "-"
    try:
        # timedelta raises ValueError for NaN and OverflowError for values
        # beyond its range; report both the same way the other coercers do.
        td = datetime.timedelta(**parts)
        return -td if negative else td
    except (ValueError, OverflowError) as exp:
        raise DtypeError("duration: {!r} (not representable as a "
                         "timedelta)".format(value)) from exp


def _c_decimal(value, strict):
    """Coerce *value* to a ``decimal.Decimal`` (``xsd:decimal``).

    ``None`` and ``""`` yield ``None``; a ``Decimal`` is returned unchanged;
    ``bool`` is rejected.  Every other value goes through ``str()`` first so
    that a float such as ``0.1`` becomes ``Decimal("0.1")`` rather than its
    binary expansion.

    *strict* is not read here; invalid input always raises ``DtypeError``.
    """
    if value is None or value == "":
        return None
    if isinstance(value, Decimal):
        return value
    if isinstance(value, bool):  # a bool is not a decimal value
        raise DtypeError("decimal: {!r}".format(value))
    try:
        return Decimal(str(value))  # via str: no float binary artefacts
    except (InvalidOperation, ValueError, TypeError) as exp:
        raise DtypeError("decimal: {!r}".format(value)) from exp


# --- JSON serialisation per dtype -------------------------------------------
def _date_to_json(value):
    """Return ``value.isoformat()`` for a ``date``; pass anything else through."""
    return value.isoformat() if isinstance(value, datetime.date) else value


def _time_to_json(value):
    """Return ``value.isoformat()`` for a ``time``; pass anything else through."""
    return value.isoformat() if isinstance(value, datetime.time) else value


def _duration_to_json(value):
    """Render a ``timedelta`` as a canonical ISO 8601 duration string.

    The result uses only the ``D``/``H``/``M``/``S`` designators, omits zero
    components, prefixes negative durations with ``-`` and renders the zero
    duration as ``PT0S``.  Non-``timedelta`` values are returned unchanged.

    The components come from the integer ``days``/``seconds``/``microseconds``
    fields, so microseconds are written exactly as a fixed-point fraction
    (``PT0.000001S``).  Float formatting such as ``"{:g}"`` would round to six
    significant digits or emit exponent notation, which is not valid ISO 8601.
    """
    if not isinstance(value, datetime.timedelta):
        return value
    negative = value < datetime.timedelta(0)
    magnitude = -value if negative else value
    hours, remainder = divmod(magnitude.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    micros = magnitude.microseconds
    date_part = "{:d}D".format(magnitude.days) if magnitude.days else ""
    time_part = ""
    if hours:
        time_part += "{:d}H".format(hours)
    if minutes:
        time_part += "{:d}M".format(minutes)
    if seconds or micros or not (date_part or time_part):
        sec_text = "{:d}".format(seconds)
        if micros:
            # micros is non-zero here, so at least one digit survives the strip
            sec_text += ".{:06d}".format(micros).rstrip("0")
        time_part += sec_text + "S"
    sign = "-" if negative else ""
    return sign + "P" + date_part + ("T" + time_part if time_part else "")


def _decimal_to_json(value):
    """Return ``str(value)`` for a ``Decimal``; pass anything else through."""
    return str(value) if isinstance(value, Decimal) else value


# Registry wiring that accompanies these helpers.  The enclosing literals (the
# ``DtypeSpec`` list fed to ``register`` and the ``DTYPES_INV`` dict) open
# outside this excerpt, so the new entries are only recorded here:
#   DtypeSpec("date", datetime.date, _c_date, "xsd:date", _date_to_json)
#   DtypeSpec("time", datetime.time, _c_time, "xsd:time", _time_to_json)
#   DtypeSpec("duration", datetime.timedelta, _c_duration, "xsd:duration", _duration_to_json)
#   DtypeSpec("decimal", Decimal, _c_decimal, "xsd:decimal", _decimal_to_json)
#   DTYPES_INV: datetime.date -> "date", datetime.time -> "time",
#               datetime.timedelta -> "duration", Decimal -> "decimal"
#               (datetime.datetime stays "timestamp")


def json_default(obj):
    """``default=`` hook for ``json.dumps`` covering the non-native dtype values.

    Serialises ``TimeStamp``, ``bytes``/``bytearray`` (base64), ``Decimal``,
    ``timedelta`` (ISO 8601 duration), ``date`` and ``time`` (ISO format).

    Raises:
        TypeError: for any other object type.
    """
    if isinstance(obj, TimeStamp):
        return str(obj.utc)
    if isinstance(obj, (bytes, bytearray)):
        return base64.b64encode(bytes(obj)).decode("ascii")
    if isinstance(obj, Decimal):
        return str(obj)
    if isinstance(obj, datetime.timedelta):
        return _duration_to_json(obj)
    if isinstance(obj, datetime.date):  # also catches datetime.datetime
        return obj.isoformat()
    if isinstance(obj, datetime.time):
        return obj.isoformat()
    raise TypeError("not JSON serializable: {!r}".format(type(obj)))
# --- date / time / duration / decimal (new) ---------------------------------
def test_date_dtype():
    """``date`` accepts ISO text, dates and datetimes; junk is lenient or raises."""
    expected = datetime.date(2024, 1, 15)
    # a datetime is handled before date because it is a subclass
    accepted = ("2024-01-15", expected, datetime.datetime(2024, 1, 15, 9, 30))
    for raw in accepted:
        assert Attribute("d", raw, dtype="date").value == expected
    for raw in (None, "", "nope"):  # "nope" exercises the lenient path
        assert Attribute("d", raw, dtype="date").value is None
    with pytest.raises(DtypeError):
        Attribute("d", "nope", dtype="date", strict=True)


def test_time_dtype():
    """``time`` accepts ISO text, times and datetimes; junk is lenient or raises."""
    expected = datetime.time(14, 30)
    accepted = ("14:30:00", expected, datetime.datetime(2024, 1, 15, 14, 30))
    for raw in accepted:
        assert Attribute("t", raw, dtype="time").value == expected
    for raw in (None, "25:99"):  # "25:99" exercises the lenient path
        assert Attribute("t", raw, dtype="time").value is None
    with pytest.raises(DtypeError):
        Attribute("t", "25:99", dtype="time", strict=True)


def test_duration_dtype():
    """``duration`` parses ISO 8601 text and numbers (seconds); rejects the rest."""
    td = datetime.timedelta
    accepted = [
        ("PT1H30M", td(hours=1, minutes=30)),
        ("P2DT3H", td(days=2, hours=3)),
        ("P1W", td(weeks=1)),
        ("-PT5S", td(seconds=-5)),
        (90, td(seconds=90)),  # number = seconds
        (td(minutes=5), td(minutes=5)),
    ]
    for raw, expected in accepted:
        assert Attribute("u", raw, dtype="duration").value == expected
    for raw in (None, True):  # bool is rejected (lenient -> None)
        assert Attribute("u", raw, dtype="duration").value is None
    # invalid / years-months / no components -> lenient None, strict raises
    for bad in ["P1Y", "P", "PT", "nope"]:
        assert Attribute("u", bad, dtype="duration").value is None
        with pytest.raises(DtypeError):
            Attribute("u", bad, dtype="duration", strict=True)
    with pytest.raises(DtypeError):
        Attribute("u", True, dtype="duration", strict=True)


def test_decimal_dtype():
    """``decimal`` yields exact ``Decimal`` values; junk is lenient or raises."""
    from decimal import Decimal
    accepted = [
        ("0.1", Decimal("0.1")),
        (0.1, Decimal("0.1")),  # via str -> exact
        (5, Decimal("5")),
        (Decimal("3.14"), Decimal("3.14")),
    ]
    for raw, expected in accepted:
        assert Attribute("p", raw, dtype="decimal").value == expected
    for raw in (None, "", "abc"):  # "abc" exercises the lenient path
        assert Attribute("p", raw, dtype="decimal").value is None
    for raw in ("abc", True):  # bool is rejected
        with pytest.raises(DtypeError):
            Attribute("p", raw, dtype="decimal", strict=True)


def test_new_dtypes_resolve_and_xsd():
    """Classes and names resolve to the new dtypes; the XSD map lists them."""
    from decimal import Decimal
    by_class = [
        (datetime.date, "date"),
        (datetime.time, "time"),
        (datetime.timedelta, "duration"),
        (Decimal, "decimal"),
        (datetime.datetime, "timestamp"),  # date subclass stays timestamp
    ]
    for cls, name in by_class:
        assert dtypes.resolve(cls) == name
    for name in ("date", "decimal"):
        assert dtypes.resolve(name) == name
    xsd = dtypes.xsd_map()
    for name in ("date", "time", "duration", "decimal"):
        assert xsd[name] == "xsd:" + name


def test_new_dtypes_to_json_and_default():
    """``to_json`` and ``json_default`` emit canonical ISO / decimal strings."""
    from decimal import Decimal
    td = datetime.timedelta
    rendered = [
        ("date", datetime.date(2024, 1, 15), "2024-01-15"),
        ("time", datetime.time(14, 30), "14:30:00"),
        ("duration", td(hours=1, minutes=30), "PT1H30M"),
        ("duration", td(days=2), "P2D"),
        ("duration", td(seconds=-5), "-PT5S"),
        ("duration", td(0), "PT0S"),
        ("decimal", Decimal("3.140"), "3.140"),  # precision preserved
    ]
    for name, raw, expected in rendered:
        assert dtypes.get(name).to_json(raw) == expected
    # passthrough (None / wrong type) for each to_json
    for name in ("date", "time", "duration", "decimal"):
        assert dtypes.get(name).to_json(None) is None
    # json_default covers the raw objects
    raw_objects = [
        (Decimal("1.5"), "1.5"),
        (td(minutes=90), "PT1H30M"),
        (datetime.date(2024, 1, 15), "2024-01-15"),
        (datetime.time(9, 0), "09:00:00"),
    ]
    for raw, expected in raw_objects:
        assert dtypes.json_default(raw) == expected


def test_new_dtypes_json_roundtrip():
    """All four new dtypes survive ``to_json`` / ``from_json``."""
    from decimal import Decimal
    original = Metadata()
    original.add("acquired_on", "2024-01-15", dtype="date")
    original.add("start_time", "09:30:00", dtype="time")
    original.add("test_duration", "PT2H", dtype="duration")
    original.add("price", "19.99", dtype="decimal")
    restored = Metadata.from_json(original.to_json())
    expected = {
        "acquired_on": datetime.date(2024, 1, 15),
        "start_time": datetime.time(9, 30),
        "test_duration": datetime.timedelta(hours=2),
        "price": Decimal("19.99"),
    }
    for key, value in expected.items():
        assert restored.get(key).value == value


def test_new_dtypes_jsonld_roundtrip():
    """``date`` and ``decimal`` keep their XSD type and value through JSON-LD."""
    from decimal import Decimal
    from sdata import semantic
    original = Metadata(name="probe")
    original.add("acquired_on", "2024-01-15", dtype="date")
    original.add("price", "19.99", dtype="decimal")
    doc = semantic.to_jsonld(original)
    assert doc["sdata:acquired_on"]["@type"] == "xsd:date"
    assert doc["sdata:price"]["@type"] == "xsd:decimal"
    back = semantic.from_jsonld(doc)
    assert back.get("acquired_on").value == datetime.date(2024, 1, 15)
    assert back.get("price").value == Decimal("19.99")