Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ native, format-agnostic metadata embedding for images. Core dependencies remain

### Added

- **New attribute dtypes** `date`, `time`, `duration` and `decimal` (pure stdlib):
`date`/`time` (`xsd:date`/`xsd:time`), `duration` as a `datetime.timedelta` parsed
from ISO 8601 (`xsd:duration`), and `decimal` as `decimal.Decimal` for exact numerics
(`xsd:decimal`). Round-trip through JSON and JSON-LD; lenient/`strict=` coercion as
for the existing dtypes.
- **Native image metadata (RFC 0005).** New pure-Python, Pillow-free module
`sdata.imagemeta` embeds/reads sdata metadata **natively** into six containers with
one API (`detect_format`/`embed`/`extract`/`supported_formats`): **PNG** (`iTXt`),
Expand Down
8 changes: 5 additions & 3 deletions docs/usage/metadata-jsonld.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ pip install "sdata[schema]" # jsonschema -> JSON-Schema validation

Every [`Attribute`][sdata.metadata.Attribute] carries
`name, value, unit, dtype, description, label, required, ontology`. Supported
`dtype` values are `str, int, float, bool, list, timestamp, bytes, json, uri`.
Coercion is lenient by default; pass `strict=True` to raise
`sdata.dtypes.DtypeError` on invalid values.
`dtype` values are `str, int, float, bool, list, timestamp, bytes, json, uri,
date, time, duration, decimal`. Each maps to an XSD type for JSON-LD (e.g. `date`
→ `xsd:date`, `duration` → `xsd:duration` as ISO 8601 / `timedelta`, `decimal` →
`xsd:decimal` for exact numerics). Coercion is lenient by default; pass
`strict=True` to raise `sdata.dtypes.DtypeError` on invalid values.

```python
import pandas as pd
Expand Down
132 changes: 130 additions & 2 deletions sdata/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@
* **Strikt opt-in** – ``strict=True`` wirft ``DtypeError`` statt still zu
degradieren.
* **Erweiterbar** – neben den 6 Alt-dtypes (str/int/float/bool/timestamp/list)
zusätzlich ``bytes`` (base64), ``json`` (dict/list) und ``uri``.
zusätzlich ``bytes`` (base64), ``json`` (dict/list), ``uri`` sowie ``date``,
``time``, ``duration`` (ISO 8601 / ``timedelta``) und ``decimal`` (exakt).
"""
import base64
import binascii
import datetime
import json as _json
import re
from decimal import Decimal, InvalidOperation
from urllib.parse import urlsplit

import numpy as np
Expand Down Expand Up @@ -158,6 +161,80 @@
return text


def _c_date(value, strict):
if value is None or value == "":
return None
if isinstance(value, datetime.datetime): # vor date prüfen (Subklasse!)
return value.date()
if isinstance(value, datetime.date):
return value
try:
return datetime.date.fromisoformat(str(value))
except (ValueError, TypeError) as exp:
raise DtypeError("date: {!r} (ISO 8601 'YYYY-MM-DD')".format(value)) from exp


def _c_time(value, strict):
if value is None or value == "":
return None
if isinstance(value, datetime.datetime):
return value.timetz()
if isinstance(value, datetime.time):
return value
try:
return datetime.time.fromisoformat(str(value))
except (ValueError, TypeError) as exp:
raise DtypeError("time: {!r} (ISO 8601 'HH:MM:SS')".format(value)) from exp


#: ISO-8601-Dauer ohne Jahre/Monate (nicht als ``timedelta`` darstellbar).
_ISO_DURATION = re.compile(
r"^(?P<sign>-?)P"
r"(?:(?P<weeks>\d+(?:\.\d+)?)W)?"
r"(?:(?P<days>\d+(?:\.\d+)?)D)?"
r"(?:T"
r"(?:(?P<hours>\d+(?:\.\d+)?)H)?"
r"(?:(?P<minutes>\d+(?:\.\d+)?)M)?"
r"(?:(?P<seconds>\d+(?:\.\d+)?)S)?"
r")?$"
)


def _c_duration(value, strict):

Check warning on line 203 in sdata/dtypes.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

sdata/dtypes.py#L203

Method _c_duration has a cyclomatic complexity of 12 (limit is 8)
if value is None or value == "":
return None
if isinstance(value, datetime.timedelta):
return value
if isinstance(value, bool): # bool ist int-Subklasse → ablehnen
raise DtypeError("duration: {!r}".format(value))
if isinstance(value, (int, float)):
return datetime.timedelta(seconds=value) # Zahl = Sekunden
match = _ISO_DURATION.match(str(value).strip())
parts = ({k: float(v) for k, v in match.groupdict().items()
if v is not None and k != "sign"} if match else None)
if not parts:
raise DtypeError("duration: {!r} (ISO 8601 like 'PT1H30M'; years/months "
"are not representable as a timedelta)".format(value))
td = datetime.timedelta(
weeks=parts.get("weeks", 0), days=parts.get("days", 0),
hours=parts.get("hours", 0), minutes=parts.get("minutes", 0),
seconds=parts.get("seconds", 0))
return -td if match.group("sign") == "-" else td


def _c_decimal(value, strict):
if value is None or value == "":
return None
if isinstance(value, Decimal):
return value
if isinstance(value, bool): # bool ist kein Dezimalwert
raise DtypeError("decimal: {!r}".format(value))
try:
return Decimal(str(value)) # via str: keine Float-Binär-Artefakte
except (InvalidOperation, ValueError, TypeError) as exp:
raise DtypeError("decimal: {!r}".format(value)) from exp


# --- JSON-Serialisierung je dtype -------------------------------------------
def _ts_to_json(value):
    """Return the string form of a ``TimeStamp``'s ``utc`` attribute; pass other values through."""
    if isinstance(value, TimeStamp):
        return str(value.utc)
    return value
Expand All @@ -167,6 +244,42 @@
return base64.b64encode(value).decode("ascii") if isinstance(value, bytes) else value


def _date_to_json(value):
return value.isoformat() if isinstance(value, datetime.date) else value


def _time_to_json(value):
return value.isoformat() if isinstance(value, datetime.time) else value


def _duration_to_json(value):

Check warning on line 255 in sdata/dtypes.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

sdata/dtypes.py#L255

Method _duration_to_json has a cyclomatic complexity of 10 (limit is 8)
"""``timedelta`` → kanonische ISO-8601-Dauer (``PT0S`` für null)."""
if not isinstance(value, datetime.timedelta):
return value
total = value.total_seconds()
sign = "-" if total < 0 else ""
total = abs(total)
days = int(total // 86400)
rem = total - days * 86400
hours = int(rem // 3600)
rem -= hours * 3600
minutes = int(rem // 60)
seconds = rem - minutes * 60
date_part = "{:d}D".format(days) if days else ""
time_part = ""
if hours:
time_part += "{:d}H".format(hours)
if minutes:
time_part += "{:d}M".format(minutes)
if seconds or not (date_part or time_part):
time_part += "{:g}S".format(seconds)
return sign + "P" + date_part + ("T" + time_part if time_part else "")


def _decimal_to_json(value):
return str(value) if isinstance(value, Decimal) else value


class DtypeSpec:
"""Beschreibt einen dtype: Coercion, JSON-Repräsentation, Klasse, XSD-Typ."""

Expand Down Expand Up @@ -201,6 +314,10 @@
DtypeSpec("bytes", bytes, _c_bytes, "xsd:base64Binary", _bytes_to_json),
DtypeSpec("json", dict, _c_json, "xsd:string"),
DtypeSpec("uri", str, _c_uri, "xsd:anyURI"),
DtypeSpec("date", datetime.date, _c_date, "xsd:date", _date_to_json),
DtypeSpec("time", datetime.time, _c_time, "xsd:time", _time_to_json),
DtypeSpec("duration", datetime.timedelta, _c_duration, "xsd:duration", _duration_to_json),
DtypeSpec("decimal", Decimal, _c_decimal, "xsd:decimal", _decimal_to_json),
]:
register(_spec)

Expand All @@ -222,6 +339,8 @@
float: "float", int: "int", str: "str",
datetime.datetime: "timestamp", bool: "bool", list: "list",
bytes: "bytes", dict: "json",
datetime.date: "date", datetime.time: "time",
datetime.timedelta: "duration", Decimal: "decimal",
}
XSD = {name: spec.xsd for name, spec in _REGISTRY.items()}

Expand Down Expand Up @@ -263,9 +382,18 @@


def json_default(obj):
    """Serialize dtype values that ``json.dumps`` cannot encode natively.

    Meant to be passed as the ``default=`` hook of ``json.dumps``. Handles
    ``TimeStamp``, ``bytes``/``bytearray`` (base64), ``Decimal``,
    ``timedelta`` (ISO 8601 duration), ``date``/``datetime`` and ``time``
    (ISO 8601 text).

    :param obj: object ``json.dumps`` could not serialize on its own
    :return: a JSON-safe string representation of *obj*
    :raises TypeError: if *obj* is none of the supported types
    """
    # order matters: the first matching entry wins
    converters = (
        (TimeStamp, lambda item: str(item.utc)),
        ((bytes, bytearray),
         lambda item: base64.b64encode(bytes(item)).decode("ascii")),
        (Decimal, str),
        (datetime.timedelta, _duration_to_json),
        # datetime.date also covers datetime.datetime (its subclass)
        ((datetime.date, datetime.time), lambda item: item.isoformat()),
    )
    for kinds, convert in converters:
        if isinstance(obj, kinds):
            return convert(obj)
    raise TypeError("not JSON serializable: {!r}".format(type(obj)))
2 changes: 2 additions & 0 deletions sdata/semantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
"xsd:string": "str", "xsd:integer": "int", "xsd:double": "float",
"xsd:boolean": "bool", "xsd:dateTime": "timestamp",
"xsd:base64Binary": "bytes", "xsd:anyURI": "uri",
"xsd:date": "date", "xsd:time": "time",
"xsd:duration": "duration", "xsd:decimal": "decimal",
}


Expand Down
122 changes: 122 additions & 0 deletions tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,128 @@ def test_uri_dtype():
Attribute("u", " ", dtype="uri", strict=True)


# --- date / time / duration / decimal (neu) --------------------------------
def test_date_dtype():
    """``date`` dtype: ISO text, date/datetime objects, empty and invalid input."""
    expected = datetime.date(2024, 1, 15)
    accepted = (
        "2024-01-15",
        datetime.date(2024, 1, 15),
        datetime.datetime(2024, 1, 15, 9, 30),  # datetime is reduced to its date
    )
    for raw in accepted:
        assert Attribute("d", raw, dtype="date").value == expected
    # None, empty string and (in lenient mode) unparsable text all give None
    for raw in (None, "", "nope"):
        assert Attribute("d", raw, dtype="date").value is None
    with pytest.raises(DtypeError):
        Attribute("d", "nope", dtype="date", strict=True)


def test_time_dtype():
    """``time`` dtype: ISO text, time/datetime objects, ``None`` and invalid input."""
    expected = datetime.time(14, 30)
    accepted = (
        "14:30:00",
        datetime.time(14, 30),
        datetime.datetime(2024, 1, 15, 14, 30),
    )
    for raw in accepted:
        assert Attribute("t", raw, dtype="time").value == expected
    # None and (in lenient mode) an out-of-range time both give None
    for raw in (None, "25:99"):
        assert Attribute("t", raw, dtype="time").value is None
    with pytest.raises(DtypeError):
        Attribute("t", "25:99", dtype="time", strict=True)


def test_duration_dtype():
    """``duration`` dtype: ISO 8601 text, seconds as numbers, and rejected input."""
    delta = datetime.timedelta
    accepted = (
        ("PT1H30M", delta(hours=1, minutes=30)),
        ("P2DT3H", delta(days=2, hours=3)),
        ("P1W", delta(weeks=1)),
        ("-PT5S", delta(seconds=-5)),
        (90, delta(seconds=90)),  # a plain number means seconds
        (delta(minutes=5), delta(minutes=5)),
    )
    for raw, expected in accepted:
        assert Attribute("u", raw, dtype="duration").value == expected
    assert Attribute("u", None, dtype="duration").value is None
    assert Attribute("u", True, dtype="duration").value is None  # bool rejected (lenient)
    # invalid / years-months / no components -> None when lenient, error when strict
    for bad in ("P1Y", "P", "PT", "nope"):
        assert Attribute("u", bad, dtype="duration").value is None
        with pytest.raises(DtypeError):
            Attribute("u", bad, dtype="duration", strict=True)
    with pytest.raises(DtypeError):
        Attribute("u", True, dtype="duration", strict=True)


def test_decimal_dtype():
    """``decimal`` dtype: exact conversion from text/numbers, and rejected input."""
    from decimal import Decimal
    accepted = (
        ("0.1", Decimal("0.1")),
        (0.1, Decimal("0.1")),  # converted via str -> no binary float artefacts
        (5, Decimal("5")),
        (Decimal("3.14"), Decimal("3.14")),
    )
    for raw, expected in accepted:
        assert Attribute("p", raw, dtype="decimal").value == expected
    # None, empty string and (in lenient mode) unparsable text all give None
    for raw in (None, "", "abc"):
        assert Attribute("p", raw, dtype="decimal").value is None
    # strict mode raises for unparsable text and for bool
    for bad in ("abc", True):
        with pytest.raises(DtypeError):
            Attribute("p", bad, dtype="decimal", strict=True)


def test_new_dtypes_resolve_and_xsd():
    """The new dtypes resolve from classes and names and expose their XSD types."""
    from decimal import Decimal
    by_class = (
        (datetime.date, "date"),
        (datetime.time, "time"),
        (datetime.timedelta, "duration"),
        (Decimal, "decimal"),
        (datetime.datetime, "timestamp"),  # the date subclass stays a timestamp
    )
    for cls, name in by_class:
        assert dtypes.resolve(cls) == name
    for name in ("date", "decimal"):
        assert dtypes.resolve(name) == name
    xsd = dtypes.xsd_map()
    for name in ("date", "time", "duration", "decimal"):
        assert xsd[name] == "xsd:" + name


def test_new_dtypes_to_json_and_default():
    """``to_json`` of each new dtype and ``json_default`` yield the expected text."""
    from decimal import Decimal
    delta = datetime.timedelta
    serialized = (
        ("date", datetime.date(2024, 1, 15), "2024-01-15"),
        ("time", datetime.time(14, 30), "14:30:00"),
        ("duration", delta(hours=1, minutes=30), "PT1H30M"),
        ("duration", delta(days=2), "P2D"),
        ("duration", delta(seconds=-5), "-PT5S"),
        ("duration", delta(0), "PT0S"),
        ("decimal", Decimal("3.140"), "3.140"),  # precision is preserved
    )
    for name, raw, expected in serialized:
        assert dtypes.get(name).to_json(raw) == expected
    # every to_json passes None through untouched
    for name in ("date", "time", "duration", "decimal"):
        assert dtypes.get(name).to_json(None) is None
    # json_default covers the raw objects as well
    via_default = (
        (Decimal("1.5"), "1.5"),
        (delta(minutes=90), "PT1H30M"),
        (datetime.date(2024, 1, 15), "2024-01-15"),
        (datetime.time(9, 0), "09:00:00"),
    )
    for raw, expected in via_default:
        assert dtypes.json_default(raw) == expected


def test_new_dtypes_json_roundtrip():
    """Values of the new dtypes survive ``to_json`` / ``from_json`` unchanged."""
    from decimal import Decimal
    cases = (
        ("acquired_on", "2024-01-15", "date", datetime.date(2024, 1, 15)),
        ("start_time", "09:30:00", "time", datetime.time(9, 30)),
        ("test_duration", "PT2H", "duration", datetime.timedelta(hours=2)),
        ("price", "19.99", "decimal", Decimal("19.99")),
    )
    meta = Metadata()
    for name, raw, dtype, _expected in cases:
        meta.add(name, raw, dtype=dtype)
    restored = Metadata.from_json(meta.to_json())
    for name, _raw, _dtype, expected in cases:
        assert restored.get(name).value == expected


def test_new_dtypes_jsonld_roundtrip():
    """JSON-LD export tags the new dtypes with their XSD type and re-imports them."""
    from decimal import Decimal
    from sdata import semantic
    cases = (
        ("acquired_on", "2024-01-15", "date", datetime.date(2024, 1, 15)),
        ("price", "19.99", "decimal", Decimal("19.99")),
    )
    meta = Metadata(name="probe")
    for name, raw, dtype, _expected in cases:
        meta.add(name, raw, dtype=dtype)
    doc = semantic.to_jsonld(meta)
    for name, _raw, dtype, _expected in cases:
        assert doc["sdata:" + name]["@type"] == "xsd:" + dtype
    back = semantic.from_jsonld(doc)
    for name, _raw, _dtype, expected in cases:
        assert back.get(name).value == expected


# --- dtype=class & Re-Cast --------------------------------------------------
def test_dtype_class_accepted():
assert Attribute("a", 1, dtype=int).dtype == "int"
Expand Down