From cf1db79b985058d12617b3f29b502012945c04b6 Mon Sep 17 00:00:00 2001 From: lepy Date: Mon, 29 Jun 2026 17:17:03 +0200 Subject: [PATCH] =?UTF-8?q?feat(dtypes):=20langstring=20=E2=80=94=20sprach?= =?UTF-8?q?-getaggte=20Strings=20(rdf:langString)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Neuer dtype langstring + Wertklasse LangString(text, lang). In JSON-LD ueber @language ausgedrueckt (nicht @type): {"@value": "Hallo", "@language": "de"} — also korrektes rdf:langString. Kompakte Textform "text@lang" ("Hallo@de"). Coercion: LangString-Passthrough, (text, lang)-Tupel/Liste, {"@value","@language"}- Dict, oder String mit endstaendigem BCP-47-Tag (ambig "a@b.com" -> kein Tag); leer/None -> None. Registry-Key "langstring" (resolve lowercased -> "langString" funktioniert auch). DTYPES_INV, json_default, to_json (-> "Hallo@de") angebunden. semantic.py: _value_literal rendert langstring via @language (mit/ohne Tag); _set_from_node liest @language zurueck und rekonstruiert text@lang. Ohne Sprach-Tag degradiert ein langstring sauber zu str (rdf:langString verlangt ein Tag). Tests: Coercion (alle Eingabeformen), LangString-Klasse (eq/hash/str/repr), to_json, json_default, resolve/xsd, JSON- und JSON-LD-Round-Trip (mit und ohne Tag). dtypes.py 307/307 100%, semantic.py 100%. Doku + CHANGELOG [1.3.0] ergaenzt. Kanonische CI: 620 passed, 7 skipped, TOTAL 100%; mkdocs --strict gruen. --- CHANGELOG.md | 13 ++++--- docs/usage/metadata-jsonld.md | 14 ++++--- sdata/dtypes.py | 69 ++++++++++++++++++++++++++++++++--- sdata/semantic.py | 18 ++++++++- tests/test_dtypes.py | 54 +++++++++++++++++++++++++++ 5 files changed, 148 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 787a0a6..5b4715c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,13 +15,14 @@ native, format-agnostic metadata embedding for images. Core dependencies remain ### Added -- **New attribute dtypes** `date`, `time`, `duration`, `decimal`, `complex` and - `floatlist` (pure stdlib): `date`/`time` (`xsd:date`/`xsd:time`), `duration` as a - `datetime.timedelta` parsed from ISO 8601 (`xsd:duration`), `decimal` as - `decimal.Decimal` for exact numerics (`xsd:decimal`), `complex` numbers and +- **New attribute dtypes** `date`, `time`, `duration`, `decimal`, `complex`, + `floatlist` and `langstring` (pure stdlib): `date`/`time` (`xsd:date`/`xsd:time`), + `duration` as a `datetime.timedelta` parsed from ISO 8601 (`xsd:duration`), `decimal` + as `decimal.Decimal` for exact numerics (`xsd:decimal`), `complex` numbers and `floatlist` (typed `list[float]`, also from numpy arrays) — the latter two use the - custom datatype CURIEs `sdata:complex` / `sdata:floatlist` (no standard XSD type) for - a lossless JSON-LD round-trip. Lenient/`strict=` coercion as for the existing dtypes. + custom datatype CURIEs `sdata:complex` / `sdata:floatlist`, and `langstring` + (`rdf:langString`, `"Hallo@de"`) renders via JSON-LD `@language` — all with a lossless + round-trip. Lenient/`strict=` coercion as for the existing dtypes. - **Native image metadata (RFC 0005).** New pure-Python, Pillow-free module `sdata.imagemeta` embeds/reads sdata metadata **natively** into six containers with one API (`detect_format`/`embed`/`extract`/`supported_formats`): **PNG** (`iTXt`), diff --git a/docs/usage/metadata-jsonld.md b/docs/usage/metadata-jsonld.md index dedca91..38b442d 100644 --- a/docs/usage/metadata-jsonld.md +++ b/docs/usage/metadata-jsonld.md @@ -19,12 +19,14 @@ pip install "sdata[schema]" # jsonschema -> JSON-Schema validation Every [`Attribute`][sdata.metadata.Attribute] carries `name, value, unit, dtype, description, label, required, ontology`. Supported `dtype` values are `str, int, float, bool, list, timestamp, bytes, json, uri, -date, time, duration, decimal, complex, floatlist`. Each maps to an XSD type for -JSON-LD (e.g. `date` → `xsd:date`, `duration` → `xsd:duration` as ISO 8601 / -`timedelta`, `decimal` → `xsd:decimal` for exact numerics); `complex` and -`floatlist` (a typed `list[float]`) have no standard XSD type and use the custom -datatype CURIEs `sdata:complex` / `sdata:floatlist`. Coercion is lenient by -default; pass `strict=True` to raise `sdata.dtypes.DtypeError` on invalid values. +date, time, duration, decimal, complex, floatlist, langstring`. Each maps to an XSD +type for JSON-LD (e.g. `date` → `xsd:date`, `duration` → `xsd:duration` as ISO 8601 / +`timedelta`, `decimal` → `xsd:decimal` for exact numerics); `complex` and `floatlist` +(a typed `list[float]`) have no standard XSD type and use the custom datatype CURIEs +`sdata:complex` / `sdata:floatlist`; `langstring` is a language-tagged string +(`"Hallo@de"` → `{"@value": "Hallo", "@language": "de"}` in JSON-LD). Coercion is +lenient by default; pass `strict=True` to raise `sdata.dtypes.DtypeError` on invalid +values. ```python import pandas as pd diff --git a/sdata/dtypes.py b/sdata/dtypes.py index 37c9e25..b294c11 100644 --- a/sdata/dtypes.py +++ b/sdata/dtypes.py @@ -16,10 +16,11 @@ degradieren. * **Erweiterbar** – neben den 6 Alt-dtypes (str/int/float/bool/timestamp/list) zusätzlich ``bytes`` (base64), ``json`` (dict/list), ``uri``, ``date``, ``time``, - ``duration`` (ISO 8601 / ``timedelta``), ``decimal`` (exakt), ``complex`` sowie - ``floatlist`` (typisierte Float-Liste). ``complex``/``floatlist`` haben keinen - Standard-XSD-Typ und nutzen daher eigene Datentyp-CURIEs (``sdata:complex`` / - ``sdata:floatlist``). + ``duration`` (ISO 8601 / ``timedelta``), ``decimal`` (exakt), ``complex``, + ``floatlist`` (typisierte Float-Liste) sowie ``langstring`` (sprach-getaggt, + ``rdf:langString``). ``complex``/``floatlist`` haben keinen Standard-XSD-Typ und + nutzen eigene Datentyp-CURIEs (``sdata:complex`` / ``sdata:floatlist``); + ``langstring`` wird in JSON-LD über ``@language`` ausgedrückt. """ import base64 import binascii @@ -35,7 +36,7 @@ from sdata.timestamp import TimeStamp __all__ = [ - "DtypeError", "DtypeSpec", "register", "get", "names", "resolve", + "DtypeError", "DtypeSpec", "LangString", "register", "get", "names", "resolve", "coerce", "xsd_map", "json_default", "XSD", "DTYPES", "DTYPES_INV", ] @@ -44,6 +45,33 @@ class DtypeError(ValueError): """Wert kann nicht in den Ziel-dtype überführt werden (v.a. im strict-Modus).""" +class LangString: + """Ein sprach-getaggter String (``rdf:langString``): ``text`` + BCP-47 ``lang``. + + In JSON-LD als ``{"@value": text, "@language": lang}`` repräsentiert (nicht über + ``@type``). Die kompakte Textform ist ``"text@lang"`` (z. B. ``"Hallo@de"``). + """ + + __slots__ = ("text", "lang") + + def __init__(self, text, lang=""): + self.text = str(text) + self.lang = str(lang or "") + + def __eq__(self, other): + return (isinstance(other, LangString) + and self.text == other.text and self.lang == other.lang) + + def __hash__(self): + return hash((self.text, self.lang)) + + def __str__(self): + return "{}@{}".format(self.text, self.lang) if self.lang else self.text + + def __repr__(self): + return "LangString({!r}, {!r})".format(self.text, self.lang) + + def _isna(value): """``pd.isna`` skalar; Array-Eingaben (Liste o.ä.) gelten als nicht-NA.""" result = pd.isna(value) @@ -268,6 +296,27 @@ def _c_floatlist(value, strict): raise DtypeError("floatlist: {!r}".format(value)) from exp +#: Sprach-Tag am Stringende (BCP-47-artig) zum Zerlegen von ``"text@lang"``. +_LANG_TAG = re.compile(r"^(.*)@([A-Za-z]{2,8}(?:-[A-Za-z0-9]{1,8})*)$", re.DOTALL) + + +def _c_langstring(value, strict): + if value is None: + return None + if isinstance(value, LangString): + return value + if isinstance(value, (tuple, list)) and len(value) == 2: + return LangString(value[0], value[1]) # (text, lang) explizit & eindeutig + if isinstance(value, dict): + return LangString(value.get("@value", value.get("text", "")), + value.get("@language", value.get("lang", ""))) + text = str(value) + if text == "": + return None + match = _LANG_TAG.match(text) # "text@lang" -> (text, lang) + return LangString(match.group(1), match.group(2)) if match else LangString(text, "") + + # --- JSON-Serialisierung je dtype ------------------------------------------- def _ts_to_json(value): return str(value.utc) if isinstance(value, TimeStamp) else value @@ -317,6 +366,10 @@ def _complex_to_json(value): return str(value) if isinstance(value, complex) else value +def _langstring_to_json(value): + return str(value) if isinstance(value, LangString) else value + + class DtypeSpec: """Beschreibt einen dtype: Coercion, JSON-Repräsentation, Klasse, XSD-Typ.""" @@ -359,6 +412,8 @@ def register(spec): # -> eigener Datentyp-CURIE in der sdata-Namespace (verlustfreier JSON-LD-Roundtrip). DtypeSpec("complex", complex, _c_complex, "sdata:complex", _complex_to_json), DtypeSpec("floatlist", list, _c_floatlist, "sdata:floatlist"), + # sprach-getaggter String: JSON-LD nutzt @language (nicht @type), siehe semantic.py + DtypeSpec("langstring", LangString, _c_langstring, "rdf:langString", _langstring_to_json), ]: register(_spec) @@ -382,7 +437,7 @@ def names(): bytes: "bytes", dict: "json", datetime.date: "date", datetime.time: "time", datetime.timedelta: "duration", Decimal: "decimal", - complex: "complex", + complex: "complex", LangString: "langstring", } XSD = {name: spec.xsd for name, spec in _REGISTRY.items()} @@ -436,6 +491,8 @@ def json_default(obj): return str(obj) if isinstance(obj, complex): return str(obj) + if isinstance(obj, LangString): + return str(obj) if isinstance(obj, datetime.timedelta): return _duration_to_json(obj) if isinstance(obj, datetime.date): # fängt auch datetime.datetime diff --git a/sdata/semantic.py b/sdata/semantic.py index 107976e..602d66d 100644 --- a/sdata/semantic.py +++ b/sdata/semantic.py @@ -78,7 +78,16 @@ def _class_local(class_spec): def _value_literal(attr): - """Typisiertes JSON-LD-Literal ``{"@value": …, "@type": xsd}`` für ein Attribut.""" + """Typisiertes JSON-LD-Literal ``{"@value": …, "@type": xsd}`` für ein Attribut. + + Ausnahme ``langstring``: rdf:langString wird über ``@language`` ausgedrückt + (nicht ``@type``). + """ + if attr.dtype == "langstring" and isinstance(attr.value, dtypes.LangString): + node = {"@value": attr.value.text} + if attr.value.lang: + node["@language"] = attr.value.lang + return node spec = dtypes.get(attr.dtype) or dtypes.get("str") return {"@value": spec.to_json(attr.value), "@type": vocab.xsd_for_dtype(attr.dtype)} @@ -176,8 +185,10 @@ def _set_from_node(metadata, name, node): """Rekonstruiere ein User-Attribut aus Knoten/Literal.""" unit = "-" ontology = "" + lang = None if isinstance(node, dict) and "@value" in node: raw, xsd = node.get("@value"), node.get("@type") + lang = node.get("@language") elif isinstance(node, dict): literal = node.get("value", {}) raw = literal.get("@value") if isinstance(literal, dict) else literal @@ -189,11 +200,14 @@ def _set_from_node(metadata, name, node): ontology = next((t for t in type_list if t != "qudt:Quantity"), "") else: raw, xsd = node, None - # dtype bestimmen: JSON-Typ hat Vorrang (list/json), sonst XSD-Rückabbildung + # dtype bestimmen: JSON-Typ/@language haben Vorrang, sonst XSD-Rückabbildung if isinstance(raw, list): dtype = "floatlist" if xsd == "sdata:floatlist" else "list" elif isinstance(raw, dict): dtype = "json" + elif lang: + dtype = "langstring" + raw = "{}@{}".format(raw, lang) # text@lang rekonstruieren else: dtype = _DTYPE_FROM_XSD.get(xsd, "str") metadata.set_attr(name, raw, dtype=dtype, unit=unit, ontology=ontology) diff --git a/tests/test_dtypes.py b/tests/test_dtypes.py index 47900d7..cdc2707 100644 --- a/tests/test_dtypes.py +++ b/tests/test_dtypes.py @@ -347,6 +347,60 @@ def test_complex_floatlist_jsonld_roundtrip(): assert back.get("spectrum").value == [1.0, 2.5, 3.0] # floatlist, nicht str-list +# --- langstring (neu) ------------------------------------------------------ +def test_langstring_coercion(): + from sdata.dtypes import LangString + ls = Attribute("l", "Hallo@de", dtype="langstring").value + assert isinstance(ls, LangString) and ls.text == "Hallo" and ls.lang == "de" + assert Attribute("l", "Hello@en-US", dtype="langstring").value == LangString("Hello", "en-US") + assert Attribute("l", "plain", dtype="langstring").value == LangString("plain", "") + assert Attribute("l", "a@b.com", dtype="langstring").value == LangString("a@b.com", "") # kein Tag + assert Attribute("l", ("meeting@noon", "en"), dtype="langstring").value == LangString("meeting@noon", "en") + assert Attribute("l", {"@value": "Bonjour", "@language": "fr"}, + dtype="langstring").value == LangString("Bonjour", "fr") + assert Attribute("l", LangString("x", "de"), dtype="langstring").value == LangString("x", "de") + assert Attribute("l", None, dtype="langstring").value is None + assert Attribute("l", "", dtype="langstring").value is None + + +def test_langstring_class_and_json(): + from sdata.dtypes import LangString + assert str(LangString("Hallo", "de")) == "Hallo@de" + assert str(LangString("Hallo", "")) == "Hallo" # ohne Tag + assert repr(LangString("Hallo", "de")) == "LangString('Hallo', 'de')" + assert LangString("a", "de") != "a@de" # != Nicht-LangString + assert LangString("a", "de") != LangString("a", "en") # Lang unterscheidet + assert {LangString("a", "de"): 1}[LangString("a", "de")] == 1 # hashbar + assert dtypes.resolve("langString") == "langstring" # case-insensitiv + assert dtypes.resolve(LangString) == "langstring" + assert dtypes.xsd_map()["langstring"] == "rdf:langString" + assert dtypes.get("langstring").to_json(LangString("Hallo", "de")) == "Hallo@de" + assert dtypes.get("langstring").to_json(None) is None # passthrough + assert dtypes.json_default(LangString("Hallo", "de")) == "Hallo@de" + + +def test_langstring_json_roundtrip(): + from sdata.dtypes import LangString + m = Metadata() + m.add("label", "Hallo@de", dtype="langstring") + restored = Metadata.from_json(m.to_json()) + assert restored.get("label").value == LangString("Hallo", "de") + + +def test_langstring_jsonld_roundtrip(): + from sdata.dtypes import LangString + from sdata import semantic + m = Metadata(name="probe") + m.add("label", "Hallo@de", dtype="langstring") + m.add("note", "plain", dtype="langstring") # ohne Tag -> degradiert zu str + doc = semantic.to_jsonld(m) + assert doc["sdata:label"] == {"@value": "Hallo", "@language": "de"} # rdf:langString via @language + assert doc["sdata:note"] == {"@value": "plain"} # kein @language/@type + back = semantic.from_jsonld(doc) + assert back.get("label").value == LangString("Hallo", "de") + assert back.get("note").value == "plain" # ohne Sprach-Tag -> str + + # --- dtype=class & Re-Cast -------------------------------------------------- def test_dtype_class_accepted(): assert Attribute("a", 1, dtype=int).dtype == "int"