Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@ native, format-agnostic metadata embedding for images. Core dependencies remain

### Added

- **New attribute dtypes** `date`, `time`, `duration`, `decimal`, `complex` and
`floatlist` (pure stdlib): `date`/`time` (`xsd:date`/`xsd:time`), `duration` as a
`datetime.timedelta` parsed from ISO 8601 (`xsd:duration`), `decimal` as
`decimal.Decimal` for exact numerics (`xsd:decimal`), `complex` numbers and
- **New attribute dtypes** `date`, `time`, `duration`, `decimal`, `complex`,
`floatlist` and `langstring` (pure stdlib): `date`/`time` (`xsd:date`/`xsd:time`),
`duration` as a `datetime.timedelta` parsed from ISO 8601 (`xsd:duration`), `decimal`
as `decimal.Decimal` for exact numerics (`xsd:decimal`), `complex` numbers and
`floatlist` (typed `list[float]`, also from numpy arrays) — the latter two use the
custom datatype CURIEs `sdata:complex` / `sdata:floatlist` (no standard XSD type) for
a lossless JSON-LD round-trip. Lenient/`strict=` coercion as for the existing dtypes.
custom datatype CURIEs `sdata:complex` / `sdata:floatlist`, and `langstring`
(`rdf:langString`, `"Hallo@de"`) is rendered via JSON-LD `@language` — all with a
lossless round-trip (a `langstring` without a language tag comes back from JSON-LD as a
plain `str`). Lenient/`strict=` coercion as for the existing dtypes.
- **Native image metadata (RFC 0005).** New pure-Python, Pillow-free module
`sdata.imagemeta` embeds/reads sdata metadata **natively** into six containers with
one API (`detect_format`/`embed`/`extract`/`supported_formats`): **PNG** (`iTXt`),
Expand Down
14 changes: 8 additions & 6 deletions docs/usage/metadata-jsonld.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ pip install "sdata[schema]" # jsonschema -> JSON-Schema validation
Every [`Attribute`][sdata.metadata.Attribute] carries
`name, value, unit, dtype, description, label, required, ontology`. Supported
`dtype` values are `str, int, float, bool, list, timestamp, bytes, json, uri,
date, time, duration, decimal, complex, floatlist`. Each maps to an XSD type for
JSON-LD (e.g. `date` → `xsd:date`, `duration` → `xsd:duration` as ISO 8601 /
`timedelta`, `decimal` → `xsd:decimal` for exact numerics); `complex` and
`floatlist` (a typed `list[float]`) have no standard XSD type and use the custom
datatype CURIEs `sdata:complex` / `sdata:floatlist`. Coercion is lenient by
default; pass `strict=True` to raise `sdata.dtypes.DtypeError` on invalid values.
date, time, duration, decimal, complex, floatlist, langstring`. Each maps to an XSD
type for JSON-LD (e.g. `date` → `xsd:date`, `duration` → `xsd:duration` as ISO 8601 /
`timedelta`, `decimal` → `xsd:decimal` for exact numerics); `complex` and `floatlist`
(a typed `list[float]`) have no standard XSD type and use the custom datatype CURIEs
`sdata:complex` / `sdata:floatlist`; `langstring` is a language-tagged string
(`"Hallo@de"` → `{"@value": "Hallo", "@language": "de"}` in JSON-LD). Coercion is
lenient by default; pass `strict=True` to raise `sdata.dtypes.DtypeError` on invalid
values.

```python
import pandas as pd
Expand Down
69 changes: 63 additions & 6 deletions sdata/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@
degradieren.
* **Erweiterbar** – neben den 6 Alt-dtypes (str/int/float/bool/timestamp/list)
zusätzlich ``bytes`` (base64), ``json`` (dict/list), ``uri``, ``date``, ``time``,
``duration`` (ISO 8601 / ``timedelta``), ``decimal`` (exakt), ``complex`` sowie
``floatlist`` (typisierte Float-Liste). ``complex``/``floatlist`` haben keinen
Standard-XSD-Typ und nutzen daher eigene Datentyp-CURIEs (``sdata:complex`` /
``sdata:floatlist``).
``duration`` (ISO 8601 / ``timedelta``), ``decimal`` (exakt), ``complex``,
``floatlist`` (typisierte Float-Liste) sowie ``langstring`` (sprach-getaggt,
``rdf:langString``). ``complex``/``floatlist`` haben keinen Standard-XSD-Typ und
nutzen eigene Datentyp-CURIEs (``sdata:complex`` / ``sdata:floatlist``);
``langstring`` wird in JSON-LD über ``@language`` ausgedrückt.
"""
import base64
import binascii
Expand All @@ -35,7 +36,7 @@
from sdata.timestamp import TimeStamp

__all__ = [
"DtypeError", "DtypeSpec", "register", "get", "names", "resolve",
"DtypeError", "DtypeSpec", "LangString", "register", "get", "names", "resolve",
"coerce", "xsd_map", "json_default", "XSD", "DTYPES", "DTYPES_INV",
]

Expand All @@ -44,6 +45,33 @@
"""Wert kann nicht in den Ziel-dtype überführt werden (v.a. im strict-Modus)."""


class LangString:
    """Language-tagged string (``rdf:langString``): ``text`` plus a BCP-47 ``lang`` tag.

    In JSON-LD the value is represented as ``{"@value": text, "@language": lang}``
    (not via ``@type``). The compact text form is ``"text@lang"`` (e.g.
    ``"Hallo@de"``); with an empty ``lang`` the compact form is just ``text``.
    """

    __slots__ = ("text", "lang")

    def __init__(self, text, lang=""):
        """Store ``text`` and ``lang`` as ``str``; a falsy ``lang`` becomes ``""``."""
        self.text = str(text)
        self.lang = str(lang or "")

    def __eq__(self, other):
        """Return True when ``other`` is a ``LangString`` with equal text and tag.

        For any other type ``NotImplemented`` is returned so Python can try the
        reflected comparison; if that is not implemented either, ``==`` is ``False``
        and ``!=`` is ``True``.
        """
        if not isinstance(other, LangString):
            return NotImplemented
        return self.text == other.text and self.lang == other.lang

    def __hash__(self):
        """Hash on ``(text, lang)`` so equal instances share a hash."""
        return hash((self.text, self.lang))

    def __str__(self):
        """Return the compact form: ``"text@lang"``, or just ``text`` without a tag."""
        return "{}@{}".format(self.text, self.lang) if self.lang else self.text

    def __repr__(self):
        """Return an unambiguous ``LangString(text, lang)`` representation."""
        return "LangString({!r}, {!r})".format(self.text, self.lang)


def _isna(value):
"""``pd.isna`` skalar; Array-Eingaben (Liste o.ä.) gelten als nicht-NA."""
result = pd.isna(value)
Expand Down Expand Up @@ -268,6 +296,27 @@
raise DtypeError("floatlist: {!r}".format(value)) from exp


#: Trailing language tag (BCP-47-like) used to split the compact form ``"text@lang"``.
#: Group 1 is the text, group 2 the tag: a primary subtag of 2-8 letters followed by
#: any number of ``-``-separated subtags of 1-8 alphanumerics. The greedy ``.*`` makes
#: the last ``@`` the split point; ``re.DOTALL`` lets the text part contain newlines.
_LANG_TAG = re.compile(r"^(.*)@([A-Za-z]{2,8}(?:-[A-Za-z0-9]{1,8})*)$", re.DOTALL)


def _c_langstring(value, strict):
    """Coerce ``value`` into a ``LangString`` (or ``None``).

    Accepted inputs:

    * ``None`` or an existing ``LangString`` -- returned unchanged.
    * a 2-element tuple/list -- taken as an explicit ``(text, lang)`` pair.
    * a dict -- text from ``"@value"`` (fallback ``"text"``), tag from
      ``"@language"`` (fallback ``"lang"``); missing keys default to ``""``.
    * anything else -- converted with ``str()``; an empty string yields ``None``,
      a trailing ``@lang`` matching ``_LANG_TAG`` is split off, otherwise the whole
      string becomes the text with an empty tag.

    ``strict`` is part of the coercer signature but is not consulted here.
    """
    if value is None or isinstance(value, LangString):
        return value

    # An explicit (text, lang) pair is unambiguous, so no parsing is needed.
    if isinstance(value, (tuple, list)) and len(value) == 2:
        text, lang = value
        return LangString(text, lang)

    if isinstance(value, dict):
        text = value.get("@value", value.get("text", ""))
        lang = value.get("@language", value.get("lang", ""))
        return LangString(text, lang)

    raw = str(value)
    if not raw:
        return None

    parsed = _LANG_TAG.match(raw)  # "text@lang" -> (text, lang)
    if parsed is None:
        return LangString(raw, "")
    return LangString(parsed.group(1), parsed.group(2))


# --- JSON-Serialisierung je dtype -------------------------------------------
def _ts_to_json(value):
    """Return ``str(value.utc)`` for a ``TimeStamp``; pass any other value through."""
    if isinstance(value, TimeStamp):
        return str(value.utc)
    return value
Expand Down Expand Up @@ -317,6 +366,10 @@
return str(value) if isinstance(value, complex) else value


def _langstring_to_json(value):
    """Return the ``str()`` form of a ``LangString``; pass any other value through."""
    if not isinstance(value, LangString):
        return value
    return str(value)


class DtypeSpec:
"""Beschreibt einen dtype: Coercion, JSON-Repräsentation, Klasse, XSD-Typ."""

Expand Down Expand Up @@ -359,6 +412,8 @@
# -> eigener Datentyp-CURIE in der sdata-Namespace (verlustfreier JSON-LD-Roundtrip).
DtypeSpec("complex", complex, _c_complex, "sdata:complex", _complex_to_json),
DtypeSpec("floatlist", list, _c_floatlist, "sdata:floatlist"),
# sprach-getaggter String: JSON-LD nutzt @language (nicht @type), siehe semantic.py
DtypeSpec("langstring", LangString, _c_langstring, "rdf:langString", _langstring_to_json),
]:
register(_spec)

Expand All @@ -382,7 +437,7 @@
bytes: "bytes", dict: "json",
datetime.date: "date", datetime.time: "time",
datetime.timedelta: "duration", Decimal: "decimal",
complex: "complex",
complex: "complex", LangString: "langstring",
}
XSD = {name: spec.xsd for name, spec in _REGISTRY.items()}

Expand Down Expand Up @@ -436,6 +491,8 @@
return str(obj)
if isinstance(obj, complex):
return str(obj)
if isinstance(obj, LangString):
return str(obj)
if isinstance(obj, datetime.timedelta):
return _duration_to_json(obj)
if isinstance(obj, datetime.date): # fängt auch datetime.datetime
Expand Down
18 changes: 16 additions & 2 deletions sdata/semantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,16 @@


def _value_literal(attr):
    """Build the JSON-LD value object for an attribute.

    The general form is the typed literal ``{"@value": …, "@type": xsd}``.
    Exception: a ``langstring`` attribute holding a ``LangString`` is expressed via
    ``@language`` instead of ``@type``; ``@language`` is omitted when the tag is empty.
    """
    is_tagged = attr.dtype == "langstring" and isinstance(attr.value, dtypes.LangString)
    if not is_tagged:
        # Unknown dtypes fall back to the "str" spec for serialisation.
        spec = dtypes.get(attr.dtype) or dtypes.get("str")
        return {"@value": spec.to_json(attr.value), "@type": vocab.xsd_for_dtype(attr.dtype)}

    tagged = attr.value
    literal = {"@value": tagged.text}
    if tagged.lang:
        literal["@language"] = tagged.lang
    return literal

Expand Down Expand Up @@ -176,8 +185,10 @@
"""Rekonstruiere ein User-Attribut aus Knoten/Literal."""
unit = "-"
ontology = ""
lang = None
if isinstance(node, dict) and "@value" in node:
raw, xsd = node.get("@value"), node.get("@type")
lang = node.get("@language")
elif isinstance(node, dict):
literal = node.get("value", {})
raw = literal.get("@value") if isinstance(literal, dict) else literal
Expand All @@ -189,11 +200,14 @@
ontology = next((t for t in type_list if t != "qudt:Quantity"), "")
else:
raw, xsd = node, None
# dtype bestimmen: JSON-Typ hat Vorrang (list/json), sonst XSD-Rückabbildung
# dtype bestimmen: JSON-Typ/@language haben Vorrang, sonst XSD-Rückabbildung
if isinstance(raw, list):
dtype = "floatlist" if xsd == "sdata:floatlist" else "list"
elif isinstance(raw, dict):
dtype = "json"
elif lang:
dtype = "langstring"
raw = "{}@{}".format(raw, lang) # text@lang rekonstruieren
else:
dtype = _DTYPE_FROM_XSD.get(xsd, "str")
metadata.set_attr(name, raw, dtype=dtype, unit=unit, ontology=ontology)
Expand Down
54 changes: 54 additions & 0 deletions tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,60 @@ def test_complex_floatlist_jsonld_roundtrip():
assert back.get("spectrum").value == [1.0, 2.5, 3.0] # floatlist, nicht str-list


# --- langstring (neu) ------------------------------------------------------
def test_langstring_coercion():
    """Coercion to ``LangString`` from strings, pairs, dicts and existing instances."""
    from sdata.dtypes import LangString

    def coerced(raw):
        return Attribute("l", raw, dtype="langstring").value

    hallo = coerced("Hallo@de")
    assert isinstance(hallo, LangString)
    assert (hallo.text, hallo.lang) == ("Hallo", "de")

    expectations = [
        ("Hello@en-US", LangString("Hello", "en-US")),
        ("plain", LangString("plain", "")),
        ("a@b.com", LangString("a@b.com", "")),  # not a language tag
        (("meeting@noon", "en"), LangString("meeting@noon", "en")),
        ({"@value": "Bonjour", "@language": "fr"}, LangString("Bonjour", "fr")),
        (LangString("x", "de"), LangString("x", "de")),
    ]
    for raw, expected in expectations:
        assert coerced(raw) == expected

    # None and the empty string both coerce to None.
    for empty in (None, ""):
        assert coerced(empty) is None


def test_langstring_class_and_json():
    """``LangString`` dunder behaviour plus registry lookup and JSON serialisation."""
    from sdata.dtypes import LangString

    tagged = LangString("Hallo", "de")
    assert str(tagged) == "Hallo@de"
    assert str(LangString("Hallo", "")) == "Hallo"  # no tag
    assert repr(tagged) == "LangString('Hallo', 'de')"

    assert LangString("a", "de") != "a@de"  # never equal to a non-LangString
    assert LangString("a", "de") != LangString("a", "en")  # tag is significant
    lookup = {LangString("a", "de"): 1}
    assert lookup[LangString("a", "de")] == 1  # hashable

    # Name resolution is case-insensitive and also accepts the class itself.
    for alias in ("langString", LangString):
        assert dtypes.resolve(alias) == "langstring"
    assert dtypes.xsd_map()["langstring"] == "rdf:langString"

    spec = dtypes.get("langstring")
    assert spec.to_json(tagged) == "Hallo@de"
    assert spec.to_json(None) is None  # passthrough
    assert dtypes.json_default(tagged) == "Hallo@de"


def test_langstring_json_roundtrip():
    """A langstring attribute survives ``to_json`` / ``from_json`` unchanged."""
    from sdata.dtypes import LangString

    original = Metadata()
    original.add("label", "Hallo@de", dtype="langstring")
    payload = original.to_json()
    reloaded = Metadata.from_json(payload)
    assert reloaded.get("label").value == LangString("Hallo", "de")


def test_langstring_jsonld_roundtrip():
    """JSON-LD export uses ``@language``; an untagged langstring comes back as ``str``."""
    from sdata import semantic
    from sdata.dtypes import LangString

    source = Metadata(name="probe")
    # "note" has no language tag and is expected to degrade to a plain str.
    for key, text in (("label", "Hallo@de"), ("note", "plain")):
        source.add(key, text, dtype="langstring")

    doc = semantic.to_jsonld(source)
    assert doc["sdata:label"] == {"@value": "Hallo", "@language": "de"}  # rdf:langString via @language
    assert doc["sdata:note"] == {"@value": "plain"}  # neither @language nor @type

    restored = semantic.from_jsonld(doc)
    assert restored.get("label").value == LangString("Hallo", "de")
    assert restored.get("note").value == "plain"  # no language tag -> str


# --- dtype=class & Re-Cast --------------------------------------------------
def test_dtype_class_accepted():
assert Attribute("a", 1, dtype=int).dtype == "int"
Expand Down