diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f18f968..071bd3e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -180,6 +180,8 @@ jobs: needs: [ts-build] # Python tests need TypeScript compiled steps: - uses: actions/checkout@v4 + with: + submodules: true # vendor/layers lexicons for interop validation - name: Install pnpm uses: pnpm/action-setup@v4 diff --git a/.gitignore b/.gitignore index 899b424..15f3c05 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,6 @@ tests/fixtures/cli_work/ /exports/ /trial_config_*.json /*.jzip -.claude/ \ No newline at end of file +.claude/ +# Hypothesis example database +.hypothesis/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..a1bc115 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,5 @@ +[submodule "vendor/layers"] + path = vendor/layers + url = https://github.com/layers-pub/layers.git + branch = main + shallow = true diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a85f09..170c5c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,94 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +## [0.6.0] - 2026-05-29 + +### Added + +#### `bead.corpus` — streaming corpus ingestion and structural sampling + +- New subpackage `bead.corpus` for turning raw text corpora into experimental + `Item`s. `CorpusRecord` carries text plus flat provenance; `CorpusSource` is + a streaming-source protocol. +- Sources: `JsonlCorpusSource` (JSON Lines, transparently decompressing + Zstandard `.zst` files), `CsvCorpusSource` (CSV/TSV), and + `CompletionCorpusSource` (a language model as a corpus source, via the new + `TextGenerator` protocol on the OpenAI and Anthropic adapters). 
+- Lazy pipeline: `parse_records`, `filter_by_structure`, `sample_corpus`, and + `record_to_item` stream records through a dependency parser and keep only + those whose parse satisfies a structural DSL constraint, producing `Item`s + with standoff parse annotations and source provenance. The pipeline never + loads the full corpus into memory. +- New `corpus` optional-dependency extra (`zstandard`). + +#### Dependency parsing in `bead.tokenization` + +- New `bead.tokenization.parsers`: `SpacyParser`, `StanzaParser`, and + `create_parser` produce a per-sentence `ParsedSentence` of `ParsedToken` + records (token, lemma, upos, xpos, head, deprel, morphology, offsets). +- `parse_to_spans` projects a dependency parse onto the standoff `Span` + + `SpanRelation` models: one single-token span per token (with its governor as + `head_index` and its features in `span_metadata`) and one directed + head-to-dependent relation per syntactic arc. + +#### Structural-query builtins in the constraint DSL + +- New `bead.dsl` standard-library functions query a dependency parse stored on + an `Item`: `upos`, `xpos`, `lemma_of`, `form_of`, `deprel`, `morph`, `head`, + `dependents`, `has_relation`, `root`, `subtree`, `path_to_root`, + `tokens_with_upos`, `tokens_with_deprel`, `any_deprel`, and `filter_upos`. + Constraints can now match syntactic structure, e.g. + `upos(self, root(self)) == "VERB" and len(dependents(self, root(self), "obj")) > 0`. + +#### Text transforms for corpus cleanup + +- New transforms in `bead.transforms.text`: `MarkdownStripTransform`, + `RedditCleanupTransform`, and the `split_sentences` helper (parser-backed or + regex fallback). The first two are registered in the default transform + registry. 
+ +#### `bead.corpus` buffering graph tier + +- New `bead.corpus.graph`: `CorpusGraph`, a typed directed multidigraph of + `CorpusNode`s and `CorpusEdge`s (parallel typed edges allowed; trees are a + special case), with traversal helpers (`successors`, `predecessors`, `roots`, + `out_edges`, `in_edges`, `descendants`, `reverse`, `node_by_id`). +- New `bead.corpus.assemble`: `assemble_graph` buffers a record stream into a + `CorpusGraph`, building edges from declarative `EdgeSpec`s or a runtime edge + function. Reconstructs thread structure such as Reddit reply trees from + `parent_id`/`link_id`. This tier is opt-in and layered on top of the + streaming pipeline, which is untouched. + +#### `bead.interop.layers` — lossless layers interop + +- New subpackage mapping bead data to and from the + [layers](https://github.com/layers-pub/layers) linguistic-annotation schema + as law-verified didactic lenses (`dx.Iso` for bijections, `dx.Lens` with a + complement for projections), so every round-trip is exact and verified. +- Faithful mirror models for the layers shared defs and record types, each with + a generic lossless `MirrorIso` to and from layers-shaped JSON (snake/camel + case, feature maps, slug+uri enums, integer confidence, `$type` unions). +- Bridge lenses map bead-native models onto layers constructs: `CorpusRecord` + to an `expression`, `CorpusGraph` to a property graph (`expression`s, + `graphNode`s, and a `graphEdgeSet`), and a dependency-parsed `ParsedSentence` + to a `tokenization` plus part-of-speech and dependency `annotationLayer`s. The + lens complement holds the bead-only remainder (framework identity and fields + layers has no slot for). Resource-overlap lenses map lexical items, lexicons, + and templates to the layers resource constructs. +- Mappings are validated against the layers lexicons, vendored as the + `vendor/layers` git submodule, using the ATProto lexicon validator + (`@atproto/lexicon`), proving every mapping produces schema-valid layers. 
+ +### Changed + +- Minimum `didactic` raised to `>=0.7.2` and `panproto` to `>=0.51.0`. +- Streaming corpus ingestion is now lossless by default: `JsonlCorpusSource` + and `CsvCorpusSource` retain every field (not just a configured subset), and + non-scalar values round-trip through JSON rather than being stringified, so + no source information is dropped at ingestion. + ## [0.5.0] - 2026-05-12 ### Added @@ -440,6 +528,10 @@ guards as type-checkers. - CI/CD: GitHub Actions for testing, docs, PyPI publishing - Read the Docs integration -[Unreleased]: https://github.com/FACTSlab/bead/compare/v0.2.0...HEAD +[Unreleased]: https://github.com/FACTSlab/bead/compare/v0.6.0...HEAD +[0.6.0]: https://github.com/FACTSlab/bead/compare/v0.5.0...v0.6.0 +[0.5.0]: https://github.com/FACTSlab/bead/compare/v0.4.0...v0.5.0 +[0.4.0]: https://github.com/FACTSlab/bead/compare/v0.3.0...v0.4.0 +[0.3.0]: https://github.com/FACTSlab/bead/compare/v0.2.0...v0.3.0 [0.2.0]: https://github.com/FACTSlab/bead/compare/v0.1.0...v0.2.0 [0.1.0]: https://github.com/FACTSlab/bead/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 94f9b9d..7e1daf1 100644 --- a/README.md +++ b/README.md @@ -30,12 +30,18 @@ uv pip install bead[training] # PyTorch Lightning, TensorBoard ### Development ```bash -git clone https://github.com/FACTSlab/bead.git +git clone --recurse-submodules https://github.com/FACTSlab/bead.git cd bead uv sync --all-extras uv run pytest tests/ ``` +The `vendor/layers` submodule holds the layers lexicons that the interop tests +validate against. If you cloned without `--recurse-submodules`, fetch them with +`git submodule update --init vendor/layers`, and refresh to the latest published +lexicons with `git submodule update --remote vendor/layers`. The lexicon +validation tests skip automatically when the submodule is absent. + Always use `uv run` to execute commands. 
## Quick Start diff --git a/bead/__init__.py b/bead/__init__.py index 4949b4b..b9111f0 100644 --- a/bead/__init__.py +++ b/bead/__init__.py @@ -6,6 +6,6 @@ from __future__ import annotations -__version__ = "0.5.0" +__version__ = "0.6.0" __author__ = "Aaron Steven White" __email__ = "aaron.white@rochester.edu" diff --git a/bead/corpus/__init__.py b/bead/corpus/__init__.py new file mode 100644 index 0000000..5af1eb2 --- /dev/null +++ b/bead/corpus/__init__.py @@ -0,0 +1,43 @@ +"""Streaming corpus ingestion and structural sampling. + +Turns raw external text (JSONL, optionally Zstandard-compressed; CSV/TSV) into +structurally filtered experimental ``Item``s: stream ``CorpusRecord``s from a +``CorpusSource``, dependency-parse them, and keep only those whose parse +satisfies a structural DSL constraint. +""" + +from __future__ import annotations + +from bead.corpus.assemble import EdgeSpec, assemble_graph +from bead.corpus.base import CorpusSource +from bead.corpus.graph import CorpusEdge, CorpusGraph, CorpusNode +from bead.corpus.pipeline import ( + filter_by_structure, + parse_records, + record_to_item, + sample_corpus, +) +from bead.corpus.records import CorpusRecord, ProvenanceValue +from bead.corpus.sources import ( + CompletionCorpusSource, + CsvCorpusSource, + JsonlCorpusSource, +) + +__all__ = [ + "CompletionCorpusSource", + "CorpusEdge", + "CorpusGraph", + "CorpusNode", + "CorpusRecord", + "CorpusSource", + "CsvCorpusSource", + "EdgeSpec", + "JsonlCorpusSource", + "ProvenanceValue", + "assemble_graph", + "filter_by_structure", + "parse_records", + "record_to_item", + "sample_corpus", +] diff --git a/bead/corpus/assemble.py b/bead/corpus/assemble.py new file mode 100644 index 0000000..4b9eda8 --- /dev/null +++ b/bead/corpus/assemble.py @@ -0,0 +1,118 @@ +"""Buffer a record stream into a typed multidigraph. 
+ +``assemble_graph`` is the opt-in buffering tier that sits on top of the lazy +streaming sources: it consumes ``CorpusRecord``s and reconstructs the structure +between them (e.g. a Reddit reply tree from ``parent_id``, or an arbitrary typed +graph) as a :class:`~bead.corpus.graph.CorpusGraph`. It holds the records in +memory, so it is a deliberate, explicit step distinct from streaming. +""" + +from __future__ import annotations + +from collections.abc import Callable, Iterable, Sequence + +import didactic.api as dx + +from bead.corpus.graph import CorpusEdge, CorpusGraph, CorpusNode +from bead.corpus.records import CorpusRecord +from bead.data.base import BeadBaseModel + + +class EdgeSpec(BeadBaseModel): + """Declarative rule for deriving one typed edge per record from a field. + + For each record, if ``target_field`` is present in the record's provenance, + an edge ``record_node -> target`` is created with type ``edge_type``. The + target id is the field value with any matching ``strip_prefixes`` removed + (e.g. Reddit's ``t1_``/``t3_`` fullname prefixes). + + Attributes + ---------- + target_field : str + Provenance field naming the other endpoint (e.g. ``"parent_id"``). + edge_type : str + Edge type slug for the created edge (e.g. ``"reply-to"``). + edge_type_uri : str | None + Optional canonical edge-type URI. + strip_prefixes : tuple[str, ...] + Prefixes to strip from the field value to recover the bare node id. + directed : bool + Whether the created edge is directed. + """ + + target_field: str + edge_type: str + edge_type_uri: str | None = None + strip_prefixes: tuple[str, ...] 
def _strip_prefix(value: str, prefixes: tuple[str, ...]) -> str:
    """Return *value* without the first prefix in *prefixes* it starts with.

    Prefixes are tried in order and empty prefixes are ignored. At most one
    prefix is removed; when none matches, *value* is returned unchanged.
    """
    matched = next((p for p in prefixes if p and value.startswith(p)), None)
    return value if matched is None else value[len(matched) :]


def assemble_graph(
    records: Iterable[CorpusRecord],
    *,
    node_id_field: str,
    edge_specs: Sequence[EdgeSpec] = (),
    edge_fn: Callable[[CorpusRecord, str], Iterable[CorpusEdge]] | None = None,
) -> CorpusGraph:
    """Materialize a stream of records as a typed multidigraph.

    Every record whose provenance carries ``node_id_field`` contributes one
    node (records without it are skipped entirely, edges included). For each
    such record, edges are emitted first from ``edge_specs`` (in order) and
    then from ``edge_fn``, if one is given.

    Parameters
    ----------
    records : Iterable[CorpusRecord]
        Records to consume; the iterable is walked exactly once.
    node_id_field : str
        Provenance key whose value (converted with ``str``) is the node id.
    edge_specs : Sequence[EdgeSpec]
        Declarative rules; each yields one edge per record that has the rule's
        ``target_field`` in its provenance.
    edge_fn : Callable[[CorpusRecord, str], Iterable[CorpusEdge]] | None
        Optional callback receiving the record and its node id and returning
        additional edges.

    Returns
    -------
    CorpusGraph
        Graph holding the collected nodes and edges. Edge targets are not
        checked against the node set, so dangling references are kept.
    """

    def spec_edges(rec: CorpusRecord, source_id: str) -> Iterable[CorpusEdge]:
        # One edge per spec whose target field is present on this record.
        for rule in edge_specs:
            raw_target = rec.provenance.get(rule.target_field)
            if raw_target is None:
                continue
            yield CorpusEdge(
                source_id=source_id,
                target_id=_strip_prefix(str(raw_target), rule.strip_prefixes),
                edge_type=rule.edge_type,
                edge_type_uri=rule.edge_type_uri,
                directed=rule.directed,
            )

    graph_nodes: list[CorpusNode] = []
    graph_edges: list[CorpusEdge] = []
    for rec in records:
        raw_id = rec.provenance.get(node_id_field)
        if raw_id is not None:
            source_id = str(raw_id)
            graph_nodes.append(CorpusNode(node_id=source_id, record=rec))
            graph_edges.extend(spec_edges(rec, source_id))
            if edge_fn is not None:
                graph_edges.extend(edge_fn(rec, source_id))
    return CorpusGraph(nodes=tuple(graph_nodes), edges=tuple(graph_edges))
class CorpusNode(BeadBaseModel):
    """A node in a corpus graph.

    Attributes
    ----------
    node_id : str
        Stable identifier, unique within the graph (e.g. a Reddit comment id).
        Must be non-blank; surrounding whitespace is stripped on validation.
    node_type : str
        Node type slug (``"expression"`` for a text record, or an abstract type
        such as ``"entity"``/``"concept"``). Mirrors layers' ``nodeType``.
    node_type_uri : str | None
        Optional canonical type URI (the layers slug+uri pattern).
    label : str | None
        Human-readable node label.
    record : CorpusRecord | None
        The expression this node wraps, if it is a text node.
    properties : dict[str, MetadataValue]
        Arbitrary node properties (maps to a layers feature map).
    """

    node_id: str
    node_type: str = "expression"
    node_type_uri: str | None = None
    label: str | None = None
    record: dx.Embed[CorpusRecord] | None = None
    properties: dict[str, MetadataValue] = dx.field(default_factory=dict)

    @dx.validates("node_id")
    def _check_node_id(self, value: str) -> str:
        # Reject empty / whitespace-only ids and normalize the rest.
        if not value or not value.strip():
            raise ValueError("node_id cannot be empty")
        return value.strip()


class CorpusEdge(BeadBaseModel):
    """A typed, directed edge between two corpus nodes.

    Attributes
    ----------
    source_id : str
        ``node_id`` of the source node.
    target_id : str
        ``node_id`` of the target node.
    edge_type : str
        Edge type slug (e.g. ``"reply-to"``, ``"coreference"``).
    edge_type_uri : str | None
        Optional canonical edge-type URI (the layers slug+uri pattern).
    directed : bool
        Whether the edge is directed (``True``) or symmetric.
    confidence : float | None
        Optional confidence, conventionally in ``[0, 1]`` (not validated here).
    features : dict[str, MetadataValue]
        Arbitrary edge features (maps to a layers feature map).
    """

    source_id: str
    target_id: str
    edge_type: str
    edge_type_uri: str | None = None
    directed: bool = True
    confidence: float | None = None
    features: dict[str, MetadataValue] = dx.field(default_factory=dict)


class CorpusGraph(BeadBaseModel):
    """A directed, typed multigraph over corpus nodes.

    Edges are directed ``source -> target``. Multiple edges (of the same or
    different types) may connect a pair, so this is a multidigraph; a tree is
    the special case where every node has at most one out-edge of the tree's
    edge type.

    Attributes
    ----------
    nodes : tuple[CorpusNode, ...]
        The graph's nodes.
    edges : tuple[CorpusEdge, ...]
        The graph's directed edges.
    graph_metadata : dict[str, MetadataValue]
        Graph-level metadata.
    """

    nodes: tuple[dx.Embed[CorpusNode], ...] = ()
    edges: tuple[dx.Embed[CorpusEdge], ...] = ()
    graph_metadata: dict[str, MetadataValue] = dx.field(default_factory=dict)

    def node_by_id(self, node_id: str) -> CorpusNode | None:
        """Return the first node whose id equals ``node_id``, or ``None``."""
        for node in self.nodes:
            if node.node_id == node_id:
                return node
        return None

    def out_edges(
        self, node_id: str, edge_type: str | None = None
    ) -> tuple[CorpusEdge, ...]:
        """Edges whose source is ``node_id`` (optionally filtered by type)."""
        return tuple(
            edge
            for edge in self.edges
            if edge.source_id == node_id
            and (edge_type is None or edge.edge_type == edge_type)
        )

    def in_edges(
        self, node_id: str, edge_type: str | None = None
    ) -> tuple[CorpusEdge, ...]:
        """Edges whose target is ``node_id`` (optionally filtered by type)."""
        return tuple(
            edge
            for edge in self.edges
            if edge.target_id == node_id
            and (edge_type is None or edge.edge_type == edge_type)
        )

    def successors(self, node_id: str, edge_type: str | None = None) -> tuple[str, ...]:
        """Target ids of ``node_id``'s out-edges, in edge order."""
        return tuple(edge.target_id for edge in self.out_edges(node_id, edge_type))

    def predecessors(
        self, node_id: str, edge_type: str | None = None
    ) -> tuple[str, ...]:
        """Source ids of ``node_id``'s in-edges, in edge order."""
        return tuple(edge.source_id for edge in self.in_edges(node_id, edge_type))

    def roots(self, edge_type: str | None = None) -> tuple[str, ...]:
        """Node ids with no in-edges (of the given type), in node order.

        The set of edge targets is collected in one pass over the edges, so
        the edge tuple is not rescanned once per node.
        """
        targeted = {
            edge.target_id
            for edge in self.edges
            if edge_type is None or edge.edge_type == edge_type
        }
        return tuple(
            node.node_id for node in self.nodes if node.node_id not in targeted
        )

    def descendants(
        self, node_id: str, edge_type: str | None = None
    ) -> tuple[str, ...]:
        """Transitive successors of ``node_id`` (cycle-guarded, excludes self).

        Ids are returned in breadth-first order, following edge order at each
        step. Ids that appear only as edge targets (no matching node) are
        included, since traversal follows edges rather than the node list.
        """
        from collections import deque

        # Index successors once so the traversal does not rescan every edge
        # for every visited node.
        adjacency: dict[str, list[str]] = {}
        for edge in self.edges:
            if edge_type is None or edge.edge_type == edge_type:
                adjacency.setdefault(edge.source_id, []).append(edge.target_id)

        seen: set[str] = {node_id}  # seeding with the start id excludes it
        order: list[str] = []
        queue: deque[str] = deque(adjacency.get(node_id, ()))
        while queue:
            current = queue.popleft()
            if current in seen:
                continue
            seen.add(current)
            order.append(current)
            queue.extend(adjacency.get(current, ()))
        return tuple(order)

    def reverse(self) -> CorpusGraph:
        """Return a copy of the graph with every edge's direction flipped.

        Each edge has its ``source_id`` and ``target_id`` swapped; all other
        edge fields, the nodes, and the metadata are carried over. The copy is
        passed through ``touched()`` (defined on the base model, not shown
        here; presumably it refreshes modification metadata).
        """
        flipped = tuple(
            edge.with_(source_id=edge.target_id, target_id=edge.source_id)
            for edge in self.edges
        )
        return self.with_(edges=flipped).touched()
def parse_records(
    source: Iterable[CorpusRecord],
    parser: DependencyParser,
    *,
    split_sentences: bool = True,
) -> Iterator[tuple[CorpusRecord, ParsedSentence]]:
    """Run the parser over each record and pair the record with its sentences.

    Parameters
    ----------
    source : Iterable[CorpusRecord]
        Records to parse; consumed lazily, one at a time.
    parser : DependencyParser
        Callable applied to ``record.text``; returns the record's sentences.
    split_sentences : bool
        ``True`` (default): emit one pair per sentence of every record.
        ``False``: emit a pair only for records whose parse has exactly one
        sentence; all other records are dropped.

    Yields
    ------
    tuple[CorpusRecord, ParsedSentence]
        The record together with one of its parsed sentences.
    """
    for rec in source:
        parsed_sentences = parser(rec.text)
        # Without fan-out, only single-sentence records pass through.
        if split_sentences or len(parsed_sentences) == 1:
            for sent in parsed_sentences:
                yield rec, sent


def filter_by_structure(
    parsed: Iterable[tuple[CorpusRecord, ParsedSentence]],
    constraint: str,
    *,
    item_template_id: UUID,
    tool: str,
    element_name: str = "text",
    formalism: str = UNIVERSAL_DEPENDENCIES,
    evaluator: DSLEvaluator | None = None,
) -> Iterator[Item]:
    """Build an item per parsed sentence and keep those matching a constraint.

    Parameters
    ----------
    parsed : Iterable[tuple[CorpusRecord, ParsedSentence]]
        ``(record, sentence)`` pairs, e.g. the output of ``parse_records``.
    constraint : str
        DSL expression; it is evaluated with the candidate item bound to both
        ``self`` and ``item``
        (e.g. ``'upos(self, root(self)) == "VERB"'``).
    item_template_id : UUID
        Template id passed to ``record_to_item`` for every item.
    tool : str
        Parser identifier passed to ``record_to_item``.
    element_name : str
        Rendered-element name passed to ``record_to_item``.
    formalism : str
        Dependency formalism slug passed to ``record_to_item``.
    evaluator : DSLEvaluator | None
        Evaluator to reuse; a fresh ``DSLEvaluator`` is created when ``None``.

    Yields
    ------
    Item
        Each item for which the constraint evaluates truthy.
    """
    dsl = DSLEvaluator() if evaluator is None else evaluator
    for rec, sent in parsed:
        candidate = record_to_item(
            rec,
            sent,
            item_template_id=item_template_id,
            tool=tool,
            element_name=element_name,
            formalism=formalism,
        )
        bindings = {"self": candidate, "item": candidate}
        if not dsl.evaluate(constraint, bindings):
            continue
        yield candidate


def sample_corpus(
    source: Iterable[CorpusRecord],
    parser: DependencyParser,
    constraint: str,
    *,
    item_template_id: UUID,
    element_name: str = "text",
    formalism: str = UNIVERSAL_DEPENDENCIES,
    split_sentences: bool = True,
    limit: int | None = None,
    evaluator: DSLEvaluator | None = None,
) -> Iterator[Item]:
    """Chain ``parse_records`` and ``filter_by_structure`` over a corpus.

    Parameters
    ----------
    source : Iterable[CorpusRecord]
        The corpus records.
    parser : DependencyParser
        Parser to apply; its ``tool`` attribute is forwarded as the ``tool``
        argument of ``filter_by_structure``.
    constraint : str
        Structural DSL constraint each item must satisfy.
    item_template_id : UUID
        Template the resulting items are associated with.
    element_name : str
        Rendered-element name for the parsed text.
    formalism : str
        Dependency formalism slug.
    split_sentences : bool
        Forwarded to ``parse_records``.
    limit : int | None
        Cap on the number of yielded items; ``None`` means no cap.
    evaluator : DSLEvaluator | None
        Forwarded to ``filter_by_structure``.

    Yields
    ------
    Item
        Matching items, at most ``limit`` of them when a limit is set.
    """
    matching = filter_by_structure(
        parse_records(source, parser, split_sentences=split_sentences),
        constraint,
        item_template_id=item_template_id,
        tool=parser.tool,
        element_name=element_name,
        formalism=formalism,
        evaluator=evaluator,
    )
    # islice stops pulling from the pipeline once the cap is reached.
    capped = matching if limit is None else itertools.islice(matching, limit)
    yield from capped
def _as_scalar(value: JsonInput) -> ProvenanceValue:
    """Flatten a parsed value into a provenance scalar.

    ``None``, strings, ints, floats and bools are returned as-is. Any other
    value (e.g. a list or a nested object) is encoded with ``json.dumps`` so
    the provenance mapping stays flat while the value can still be recovered
    with ``json.loads``.
    """
    if value is not None and not isinstance(value, (str, int, float, bool)):
        return json.dumps(value)
    return value


def _zstd_open(path: Path) -> IO[str]:
    """Open a Zstandard-compressed file as a UTF-8 text stream.

    ``zstandard`` is imported on demand; an ``ImportError`` with install
    instructions is raised when the package is missing.
    """
    try:
        zstd = importlib.import_module("zstandard")
    except ImportError as exc:
        raise ImportError(
            "zstandard is required to read .zst corpora. "
            "Install it with: pip install 'bead[corpus]'"
        ) from exc
    return zstd.open(path, "rt", encoding="utf-8")


class JsonlCorpusSource:
    """Stream JSON Lines (optionally Zstandard-compressed) as corpus records.

    Parameters
    ----------
    path : str | Path
        Path to the ``.jsonl`` or ``.jsonl.zst`` file.
    source_name : str | None
        Source identifier; defaults to the file name.
    text_field : str
        JSON field holding the record text.
    provenance_fields : tuple[str, ...] | None
        JSON fields to copy into each record's provenance. With ``None`` (the
        default) every field except ``text_field`` is kept; with an explicit
        tuple only the listed fields that are present on a line are kept.
    compression : str
        ``"zst"`` always decompresses; ``"auto"`` decompresses when the path
        suffix is ``.zst``; any other value reads the file as plain text.
    """

    def __init__(
        self,
        path: str | Path,
        *,
        source_name: str | None = None,
        text_field: str = "text",
        provenance_fields: tuple[str, ...] | None = None,
        compression: str = "auto",
    ) -> None:
        self._path = Path(path)
        if source_name is None:
            source_name = self._path.name
        self.source_name = source_name
        self._text_field = text_field
        self._provenance_fields = provenance_fields
        self._compression = compression

    def _open_fn(self) -> Callable[[Path], IO[str]] | None:
        """Return the Zstandard opener when needed, else ``None``."""
        if self._compression == "zst":
            return _zstd_open
        if self._compression == "auto" and self._path.suffix == ".zst":
            return _zstd_open
        return None

    def __iter__(self) -> Iterator[CorpusRecord]:
        """Yield one ``CorpusRecord`` per JSON-object line that has text.

        Lines that decode to something other than an object, or whose text
        field is missing or ``null``, are skipped. ``record_index`` is the
        line's position among the lines produced by ``iter_jsonl_lines``, so
        skipped lines still consume an index.
        """
        opener = self._open_fn()
        if opener is None:
            lines = iter_jsonl_lines(self._path)
        else:
            lines = iter_jsonl_lines(self._path, open_fn=opener)

        for position, (_, raw_line) in enumerate(lines):
            payload = json.loads(raw_line)
            if not isinstance(payload, dict):
                continue
            text_value = payload.get(self._text_field)
            if text_value is None:
                continue
            if self._provenance_fields is None:
                kept = tuple(key for key in payload if key != self._text_field)
            else:
                kept = self._provenance_fields
            provenance: dict[str, ProvenanceValue] = {
                name: _as_scalar(payload[name]) for name in kept if name in payload
            }
            yield CorpusRecord(
                text=str(text_value),
                source_name=self.source_name,
                record_index=position,
                provenance=provenance,
            )
+ temperature : float + Sampling temperature. + """ + + def __init__( + self, + generator: TextGenerator, + prompts: Sequence[str], + *, + source_name: str | None = None, + completions_per_prompt: int = 1, + max_tokens: int = 256, + temperature: float = 1.0, + ) -> None: + self._generator = generator + self._prompts = prompts + self.source_name = ( + source_name if source_name is not None else generator.model_name + ) + self._completions_per_prompt = completions_per_prompt + self._max_tokens = max_tokens + self._temperature = temperature + + def __iter__(self) -> Iterator[CorpusRecord]: + """Yield one ``CorpusRecord`` per generated completion.""" + index = 0 + for prompt in self._prompts: + for _ in range(self._completions_per_prompt): + text = self._generator.generate_completion( + prompt, + max_tokens=self._max_tokens, + temperature=self._temperature, + ) + provenance: dict[str, ProvenanceValue] = { + "tool": "completion", + "model": self._generator.model_name, + "prompt": prompt, + } + yield CorpusRecord( + text=text, + source_name=self.source_name, + record_index=index, + provenance=provenance, + ) + index += 1 + + +class CsvCorpusSource: + r"""Stream rows of a CSV/TSV file as corpus records. + + Parameters + ---------- + path : str | Path + Path to the CSV/TSV file. + text_column : str + Column holding the record text. + source_name : str | None + Source identifier; defaults to the file name. + provenance_columns : tuple[str, ...] | None + Columns to copy into each record's provenance. ``None`` (the default) + retains **every** column except ``text_column`` so no source information + is dropped; pass an explicit tuple to keep only a subset. + sep : str + Field separator (``","`` for CSV, ``"\\t"`` for TSV). + """ + + def __init__( + self, + path: str | Path, + *, + text_column: str, + source_name: str | None = None, + provenance_columns: tuple[str, ...] 
| None = None, + sep: str = ",", + ) -> None: + self._path = Path(path) + self.source_name = source_name if source_name is not None else self._path.name + self._text_column = text_column + self._provenance_columns = provenance_columns + self._sep = sep + + def __iter__(self) -> Iterator[CorpusRecord]: + """Yield one ``CorpusRecord`` per CSV row with a non-empty text cell.""" + frame = pd.read_csv(self._path, sep=self._sep, dtype=str, keep_default_na=False) + for index, row in enumerate(frame.to_dict(orient="records")): + raw_text = row.get(self._text_column, "") + if raw_text is None or str(raw_text) == "": + continue + columns = ( + tuple(c for c in row if c != self._text_column) + if self._provenance_columns is None + else self._provenance_columns + ) + provenance: dict[str, ProvenanceValue] = { + str(column): _as_scalar(row[column]) + for column in columns + if column in row + } + yield CorpusRecord( + text=str(raw_text), + source_name=self.source_name, + record_index=index, + provenance=provenance, + ) diff --git a/bead/data/serialization.py b/bead/data/serialization.py index ee1a369..8d17937 100644 --- a/bead/data/serialization.py +++ b/bead/data/serialization.py @@ -6,8 +6,9 @@ from __future__ import annotations -from collections.abc import Iterator, Sequence +from collections.abc import Callable, Iterator, Sequence from pathlib import Path +from typing import IO import didactic.api as dx @@ -20,6 +21,41 @@ class DeserializationError(Exception): """Raised when deserialization from JSONLines fails.""" +def _open_text(path: Path) -> IO[str]: + """Open *path* as a UTF-8 text stream (default JSONL line opener).""" + return path.open("r", encoding="utf-8") + + +def iter_jsonl_lines( + path: Path, + *, + open_fn: Callable[[Path], IO[str]] = _open_text, +) -> Iterator[tuple[int, str]]: + """Yield ``(line_number, stripped_line)`` for each non-empty line. 
+ + Single canonical line-iteration step shared by the JSONLines readers and + by streaming corpus sources (which pass a decompressing ``open_fn``). + + Parameters + ---------- + path : Path + File to read. + open_fn : Callable[[Path], IO[str]] + Opener returning a text stream; defaults to UTF-8 text. Pass a + decompressing opener (e.g. ``zstandard.open``) for compressed files. + + Yields + ------ + tuple[int, str] + 1-based line number and the stripped line (blank lines skipped). + """ + with open_fn(path) as handle: + for line_num, line in enumerate(handle, start=1): + stripped = line.strip() + if stripped: + yield line_num, stripped + + def write_jsonlines[T: dx.Model]( objects: Sequence[T], path: Path | str, @@ -66,19 +102,15 @@ def read_jsonlines[T: dx.Model]( path = Path(path) objects: list[T] = [] try: - with path.open("r", encoding="utf-8") as f: - for line_num, line in enumerate(f, start=1): - line = line.strip() - if not line: + for line_num, line in iter_jsonl_lines(path): + try: + objects.append(model_class.model_validate_json(line)) + except (dx.ValidationError, ValueError) as e: + if skip_errors: continue - try: - objects.append(model_class.model_validate_json(line)) - except (dx.ValidationError, ValueError) as e: - if skip_errors: - continue - raise DeserializationError( - f"Failed to parse line {line_num} in {path}: {e}" - ) from e + raise DeserializationError( + f"Failed to parse line {line_num} in {path}: {e}" + ) from e except OSError as e: raise DeserializationError(f"Failed to read from {path}: {e}") from e return objects @@ -93,17 +125,13 @@ def stream_jsonlines[T: dx.Model]( del validate path = Path(path) try: - with path.open("r", encoding="utf-8") as f: - for line_num, line in enumerate(f, start=1): - line = line.strip() - if not line: - continue - try: - yield model_class.model_validate_json(line) - except (dx.ValidationError, ValueError) as e: - raise DeserializationError( - f"Failed to parse line {line_num} in {path}: {e}" - ) from e + 
for line_num, line in iter_jsonl_lines(path): + try: + yield model_class.model_validate_json(line) + except (dx.ValidationError, ValueError) as e: + raise DeserializationError( + f"Failed to parse line {line_num} in {path}: {e}" + ) from e except OSError as e: raise DeserializationError(f"Failed to read from {path}: {e}") from e diff --git a/bead/dsl/evaluator.py b/bead/dsl/evaluator.py index d296ffb..2d16579 100644 --- a/bead/dsl/evaluator.py +++ b/bead/dsl/evaluator.py @@ -8,7 +8,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from collections.abc import Mapping +from typing import TYPE_CHECKING from bead.dsl import ast from bead.dsl.context import EvaluationContext @@ -17,10 +18,102 @@ from bead.dsl.stdlib import register_stdlib if TYPE_CHECKING: - from bead.items.item import Item + from bead.data.base import BeadBaseModel, JsonValue from bead.resources.constraints import ContextValue - from bead.resources.lexical_item import LexicalItem - from bead.templates.filler import FilledTemplate + +# Every value an expression can produce or operate on: DSL scalars, collections, +# and bead model objects (reached via attribute access on a bound ``self`` / +# ``item``). Attribute and subscript access ultimately bottom out in model +# fields, which are themselves of this shape. +type DslValue = ( + str + | int + | float + | bool + | None + | list["DslValue"] + | tuple["DslValue", ...] 
+ | dict[str, "DslValue"] + | set["DslValue"] + | frozenset["DslValue"] + | BeadBaseModel + | JsonValue +) + + +def _compare(operator: str, left: DslValue, right: DslValue) -> bool: + """Apply an ordering operator to two numeric or two string operands.""" + if isinstance(left, (int, float)) and isinstance(right, (int, float)): + lf, rf = float(left), float(right) + if operator == "<": + return lf < rf + if operator == ">": + return lf > rf + if operator == "<=": + return lf <= rf + return lf >= rf + if isinstance(left, str) and isinstance(right, str): + if operator == "<": + return left < right + if operator == ">": + return left > right + if operator == "<=": + return left <= right + return left >= right + raise EvaluationError( + f"Cannot compare {type(left).__name__} and {type(right).__name__}" + ) + + +def _arithmetic(operator: str, left: DslValue, right: DslValue) -> int | float | str: + """Apply an arithmetic operator, preserving int/float/str result types.""" + if isinstance(left, int) and isinstance(right, int): + if operator == "+": + return left + right + if operator == "-": + return left - right + if operator == "*": + return left * right + if operator == "/": + if right == 0: + raise EvaluationError("Division by zero") + return left / right + if right == 0: + raise EvaluationError("Modulo by zero") + return left % right + if isinstance(left, (int, float)) and isinstance(right, (int, float)): + lf, rf = float(left), float(right) + if operator == "+": + return lf + rf + if operator == "-": + return lf - rf + if operator == "*": + return lf * rf + if operator == "/": + if rf == 0: + raise EvaluationError("Division by zero") + return lf / rf + if rf == 0: + raise EvaluationError("Modulo by zero") + return lf % rf + if operator == "+" and isinstance(left, str) and isinstance(right, str): + return left + right + raise EvaluationError( + f"Cannot apply '{operator}' to {type(left).__name__} and {type(right).__name__}" + ) + + +def _contains(left: DslValue, 
right: DslValue) -> bool: + """Test membership of ``left`` in a container ``right``.""" + if isinstance(right, str): + if isinstance(left, str): + return left in right + raise EvaluationError("Substring test requires a string on the left") + if isinstance(right, (list, tuple, set, frozenset, dict)): + return left in right + raise EvaluationError( + f"Membership test requires a container, got {type(right).__name__}" + ) class Evaluator: @@ -53,9 +146,9 @@ class Evaluator: def __init__(self, use_cache: bool = True) -> None: self._use_cache = use_cache - self._cache: dict[tuple[str, ...], Any] = {} + self._cache: dict[tuple[str, ...], DslValue] = {} - def evaluate(self, node: ast.ASTNode, context: EvaluationContext) -> Any: + def evaluate(self, node: ast.ASTNode, context: EvaluationContext) -> DslValue: """Evaluate an AST node in the given context. Parameters @@ -67,7 +160,7 @@ def evaluate(self, node: ast.ASTNode, context: EvaluationContext) -> Any: Returns ------- - Any + DslValue Result of evaluation. Raises @@ -95,7 +188,9 @@ def evaluate(self, node: ast.ASTNode, context: EvaluationContext) -> Any: else: raise EvaluationError(f"Unknown node type: {type(node).__name__}") - def _evaluate_literal(self, node: ast.Literal, context: EvaluationContext) -> Any: + def _evaluate_literal( + self, node: ast.Literal, context: EvaluationContext + ) -> DslValue: """Evaluate literal node. Parameters @@ -107,12 +202,14 @@ def _evaluate_literal(self, node: ast.Literal, context: EvaluationContext) -> An Returns ------- - Any + DslValue Literal value. """ return node.value - def _evaluate_variable(self, node: ast.Variable, context: EvaluationContext) -> Any: + def _evaluate_variable( + self, node: ast.Variable, context: EvaluationContext + ) -> DslValue: """Evaluate variable node. Parameters @@ -124,7 +221,7 @@ def _evaluate_variable(self, node: ast.Variable, context: EvaluationContext) -> Returns ------- - Any + DslValue Variable value from context. 
Raises @@ -138,7 +235,7 @@ def _evaluate_variable(self, node: ast.Variable, context: EvaluationContext) -> def _evaluate_binary_op( self, node: ast.BinaryOp, context: EvaluationContext - ) -> Any: + ) -> DslValue: """Evaluate binary operation node. Parameters @@ -150,7 +247,7 @@ def _evaluate_binary_op( Returns ------- - Any + DslValue Result of binary operation. Raises @@ -174,51 +271,27 @@ def _evaluate_binary_op( left = self.evaluate(node.left, context) right = self.evaluate(node.right, context) - try: - # comparison operators - if node.operator == "==": - return left == right - elif node.operator == "!=": - return left != right - elif node.operator == "<": - return left < right - elif node.operator == ">": - return left > right - elif node.operator == "<=": - return left <= right - elif node.operator == ">=": - return left >= right - # membership operators - elif node.operator == "in": - return left in right - elif node.operator == "not in": - return left not in right - # arithmetic operators - elif node.operator == "+": - return left + right - elif node.operator == "-": - return left - right - elif node.operator == "*": - return left * right - elif node.operator == "/": - if right == 0: - raise EvaluationError("Division by zero") - return left / right - elif node.operator == "%": - if right == 0: - raise EvaluationError("Modulo by zero") - return left % right - else: - raise EvaluationError(f"Unknown operator: {node.operator}") - except TypeError as e: - raise EvaluationError( - f"Type error in operation '{node.operator}': " - f"cannot operate on {type(left).__name__} and {type(right).__name__}" - ) from e - except ZeroDivisionError as e: - raise EvaluationError("Division by zero") from e - - def _evaluate_unary_op(self, node: ast.UnaryOp, context: EvaluationContext) -> Any: + # equality works on any pair of values + if node.operator == "==": + return left == right + if node.operator == "!=": + return left != right + # ordering operators (numeric or string 
operands) + if node.operator in ("<", ">", "<=", ">="): + return _compare(node.operator, left, right) + # membership operators + if node.operator == "in": + return _contains(left, right) + if node.operator == "not in": + return not _contains(left, right) + # arithmetic operators + if node.operator in ("+", "-", "*", "/", "%"): + return _arithmetic(node.operator, left, right) + raise EvaluationError(f"Unknown operator: {node.operator}") + + def _evaluate_unary_op( + self, node: ast.UnaryOp, context: EvaluationContext + ) -> DslValue: """Evaluate unary operation node. Parameters @@ -230,7 +303,7 @@ def _evaluate_unary_op(self, node: ast.UnaryOp, context: EvaluationContext) -> A Returns ------- - Any + DslValue Result of unary operation. Raises @@ -240,24 +313,20 @@ def _evaluate_unary_op(self, node: ast.UnaryOp, context: EvaluationContext) -> A """ operand = self.evaluate(node.operand, context) - try: - if node.operator == "not": - return not operand - elif node.operator == "-": - return -operand - elif node.operator == "+": - return +operand - else: - raise EvaluationError(f"Unknown unary operator: {node.operator}") - except TypeError as e: - raise EvaluationError( - f"Type error in unary operation '{node.operator}': " - f"cannot operate on {type(operand).__name__}" - ) from e + if node.operator == "not": + return not operand + if node.operator in ("-", "+"): + if not isinstance(operand, (int, float)): + raise EvaluationError( + f"Unary '{node.operator}' requires a number, got " + f"{type(operand).__name__}" + ) + return -operand if node.operator == "-" else +operand + raise EvaluationError(f"Unknown unary operator: {node.operator}") def _evaluate_function_call( self, node: ast.FunctionCall, context: EvaluationContext - ) -> Any: + ) -> DslValue: """Evaluate function call node. Parameters @@ -269,7 +338,7 @@ def _evaluate_function_call( Returns ------- - Any + DslValue Function return value. 
Raises @@ -308,7 +377,7 @@ def _evaluate_function_call( def _evaluate_attribute_access( self, node: ast.AttributeAccess, context: EvaluationContext - ) -> Any: + ) -> DslValue: """Evaluate attribute access node. Parameters @@ -320,7 +389,7 @@ def _evaluate_attribute_access( Returns ------- - Any + DslValue Attribute value. Raises @@ -334,7 +403,7 @@ def _evaluate_attribute_access( if isinstance(obj, dict): if node.attribute not in obj: raise EvaluationError(f"Dictionary does not have key: {node.attribute}") - return obj[node.attribute] # type: ignore[reportUnknownVariableType] + return obj[node.attribute] # try attribute access try: @@ -347,7 +416,7 @@ def _evaluate_attribute_access( def _evaluate_subscript( self, node: ast.Subscript, context: EvaluationContext - ) -> Any: + ) -> DslValue: """Evaluate subscript access node. Parameters @@ -359,7 +428,7 @@ def _evaluate_subscript( Returns ------- - Any + DslValue Subscripted value. Raises @@ -371,16 +440,30 @@ def _evaluate_subscript( index = self.evaluate(node.index, context) try: - return obj[index] # type: ignore[reportUnknownVariableType] - except (KeyError, IndexError, TypeError) as e: + if isinstance(obj, dict): + if not isinstance(index, str): + raise EvaluationError( + f"Dictionary index must be a string, got {type(index).__name__}" + ) + return obj[index] + if isinstance(obj, (list, tuple, str)): + if not isinstance(index, int): + raise EvaluationError( + f"Sequence index must be an integer, got {type(index).__name__}" + ) + return obj[index] + raise EvaluationError( + f"Subscript access not supported on {type(obj).__name__}" + ) + except (KeyError, IndexError) as e: obj_type = type(obj).__name__ raise EvaluationError( - f"Subscript access failed on {obj_type} with index {index}: {e}" + f"Subscript access failed on {obj_type} with index {index!r}: {e}" ) from e def _evaluate_list_literal( self, node: ast.ListLiteral, context: EvaluationContext - ) -> list[Any]: + ) -> list[DslValue]: """Evaluate list literal 
node. Parameters @@ -392,7 +475,7 @@ def _evaluate_list_literal( Returns ------- - list[Any] + list[DslValue] Evaluated list elements. """ return [self.evaluate(element, context) for element in node.elements] @@ -454,24 +537,23 @@ def __init__(self) -> None: def evaluate( self, expression: str, - context: dict[str, ContextValue | LexicalItem | FilledTemplate | Item], - ) -> bool | str | int | float | list[Any]: + context: Mapping[str, DslValue], + ) -> DslValue: """Evaluate DSL expression with given context. Parameters ---------- expression : str DSL expression to evaluate. - context : dict[str, ContextValue | LexicalItem | FilledTemplate | Item] - Variables available during evaluation. Can include: - - ContextValue: primitive values, lists, sets - - LexicalItem: lexical items for single-slot constraints - - FilledTemplate: filled templates for multi-slot constraints - - Item: items for list partitioning + context : Mapping[str, DslValue] + Variables available during evaluation. Values may be DSL scalars, + collections, or bead models (e.g. a ``LexicalItem`` bound to + ``self`` for single-slot constraints, a ``FilledTemplate`` for + multi-slot constraints, or an ``Item`` for list partitioning). Returns ------- - bool | str | int | float | list[Any] + DslValue Result of evaluation. Raises @@ -510,10 +592,10 @@ def evaluate( def extract_property_value( self, - obj: Any, + obj: DslValue, property_expression: str, context: dict[str, ContextValue] | None = None, - ) -> Any: + ) -> DslValue: """Extract property value using DSL expression. This method is used by ListPartitioner to extract property values @@ -522,7 +604,7 @@ def extract_property_value( Parameters ---------- - obj : Any + obj : DslValue Object to extract property from (typically a LexicalItem or Item). property_expression : str DSL expression that accesses object properties (e.g., "item.lemma", @@ -532,7 +614,7 @@ def extract_property_value( Returns ------- - Any + DslValue Extracted property value. 
Raises @@ -549,9 +631,10 @@ def extract_property_value( >>> evaluator.extract_property_value(item, "len(item.lemma)") 4 """ - eval_context_dict: dict[str, Any] = {"item": obj} + eval_context_dict: dict[str, DslValue] = {"item": obj} if context: - eval_context_dict.update(context) + for key, value in context.items(): + eval_context_dict[key] = value return self.evaluate(property_expression, eval_context_dict) diff --git a/bead/dsl/stdlib.py b/bead/dsl/stdlib.py index d0affb5..48cd018 100644 --- a/bead/dsl/stdlib.py +++ b/bead/dsl/stdlib.py @@ -15,6 +15,7 @@ if TYPE_CHECKING: from bead.dsl.context import EvaluationContext from bead.items.item import Item + from bead.items.spans import Span # Type for DSL scalar values that can be compared/processed DslScalar = str | int | float | bool | None @@ -855,8 +856,206 @@ def preference_prob(score1: float, score2: float, temperature: float = 1.0) -> f return sigmoid((score1 - score2) / temperature) +# Structural query functions +# +# These operate over a dependency parse stored on an ``Item`` as token-level +# ``Span``s (``span_type == "token"``) plus directed ``SpanRelation``s +# (``source`` = head, ``target`` = dependent). Tokens are addressed by their +# sentence-local 0-based index. They let constraint expressions query syntactic +# structure, e.g. ``upos(self, root(self)) == "VERB"``. 
+def _token_spans(item: Item) -> dict[int, Span]: + """Map token index to its ``Span`` for token-level spans on the item.""" + result: dict[int, Span] = {} + for span in item.spans: + if span.span_type != "token" or not span.segments: + continue + indices = span.segments[0].indices + if indices: + result[indices[0]] = span + return result + + +def _span_id_index(token_spans: dict[int, Span]) -> dict[str, int]: + """Map ``span_id`` to token index for the given token spans.""" + return {span.span_id: index for index, span in token_spans.items()} + + +def _meta_str(span: Span, key: str) -> str | None: + """Read a string-valued metadata field from a span, else ``None``.""" + value = span.span_metadata.get(key) + return value if isinstance(value, str) else None + + +def upos(item: Item, index: int) -> str | None: + """Universal POS tag of the token at ``index``.""" + span = _token_spans(item).get(index) + return _meta_str(span, "upos") if span is not None else None + + +def xpos(item: Item, index: int) -> str | None: + """Treebank (language-specific) POS tag of the token at ``index``.""" + span = _token_spans(item).get(index) + return _meta_str(span, "xpos") if span is not None else None + + +def lemma_of(item: Item, index: int) -> str | None: + """Lemma of the token at ``index``.""" + span = _token_spans(item).get(index) + return _meta_str(span, "lemma") if span is not None else None + + +def form_of(item: Item, index: int) -> str | None: + """Surface form (token text) of the token at ``index``.""" + span = _token_spans(item).get(index) + return _meta_str(span, "form") if span is not None else None + + +def deprel(item: Item, index: int) -> str | None: + """Dependency relation of the token at ``index`` to its head.""" + span = _token_spans(item).get(index) + return _meta_str(span, "deprel") if span is not None else None + + +def morph(item: Item, index: int, feature: str) -> str | None: + """Value of a morphological ``feature`` for the token at ``index``.""" + span = 
_token_spans(item).get(index) + if span is None: + return None + features = span.span_metadata.get("morph") + if isinstance(features, dict): + value = features.get(feature) + return value if isinstance(value, str) else None + return None + + +def head(item: Item, index: int) -> int | None: + """Index of the syntactic head of the token at ``index`` (``None`` = root).""" + token_spans = _token_spans(item) + target = token_spans.get(index) + if target is None: + return None + id_to_index = _span_id_index(token_spans) + for relation in item.span_relations: + if relation.target_span_id == target.span_id: + return id_to_index.get(relation.source_span_id) + return None + + +def dependents(item: Item, index: int, relation: str | None = None) -> list[int]: + """Return token indices governed by ``index``, optionally filtered by deprel.""" + token_spans = _token_spans(item) + source = token_spans.get(index) + if source is None: + return [] + id_to_index = _span_id_index(token_spans) + found: list[int] = [] + for rel in item.span_relations: + if rel.source_span_id != source.span_id: + continue + if relation is not None and (rel.label is None or rel.label.label != relation): + continue + target_index = id_to_index.get(rel.target_span_id) + if target_index is not None: + found.append(target_index) + return sorted(found) + + +def has_relation( + item: Item, head_index: int, dep_index: int, relation: str | None = None +) -> bool: + """Whether a head -> dependent arc exists, optionally with the given deprel.""" + return dep_index in dependents(item, head_index, relation) + + +def root(item: Item) -> int | None: + """Index of the root token (``deprel == "root"`` or no incoming arc).""" + token_spans = _token_spans(item) + for index, span in token_spans.items(): + if _meta_str(span, "deprel") == "root": + return index + for index, span in token_spans.items(): + if span.head_index is None: + return index + return None + + +def tokens_with_upos(item: Item, tag: str) -> list[int]: + 
"""Return indices of all tokens whose UPOS equals ``tag``.""" + return sorted( + index + for index, span in _token_spans(item).items() + if _meta_str(span, "upos") == tag + ) + + +def tokens_with_deprel(item: Item, rel: str) -> list[int]: + """Return indices of all tokens whose dependency relation equals ``rel``.""" + return sorted( + index + for index, span in _token_spans(item).items() + if _meta_str(span, "deprel") == rel + ) + + +def path_to_root(item: Item, index: int) -> list[int]: + """Token indices from ``index`` up to the root (cycle-guarded).""" + path: list[int] = [] + seen: set[int] = set() + current: int | None = index + while current is not None and current not in seen: + path.append(current) + seen.add(current) + current = head(item, current) + return path + + +def subtree(item: Item, index: int) -> list[int]: + """All transitive dependents of ``index``, including ``index`` itself.""" + result: list[int] = [] + seen: set[int] = set() + queue: list[int] = [index] + while queue: + current = queue.pop() + if current in seen: + continue + seen.add(current) + result.append(current) + queue.extend(dependents(item, current)) + return sorted(result) + + +def any_deprel(item: Item, indices: list[int], rel: str) -> bool: + """Whether any token in ``indices`` has dependency relation ``rel``.""" + return any(deprel(item, index) == rel for index in indices) + + +def filter_upos(item: Item, indices: list[int], tag: str) -> list[int]: + """Subset of ``indices`` whose tokens have UPOS ``tag``.""" + return [index for index in indices if upos(item, index) == tag] + + # Type alias for DSL callable functions -DslFunction = Callable[..., DslScalar | list[DslScalar] | list[float]] +DslFunction = Callable[..., DslScalar | list[DslScalar] | list[float] | list[int]] + +# Register structural query functions +STRUCTURE_FUNCTIONS: dict[str, DslFunction] = { + "upos": upos, + "xpos": xpos, + "lemma_of": lemma_of, + "form_of": form_of, + "deprel": deprel, + "morph": morph, + 
"head": head, + "dependents": dependents, + "has_relation": has_relation, + "root": root, + "tokens_with_upos": tokens_with_upos, + "tokens_with_deprel": tokens_with_deprel, + "path_to_root": path_to_root, + "subtree": subtree, + "any_deprel": any_deprel, + "filter_upos": filter_upos, +} # Register simulation functions SIMULATION_FUNCTIONS: dict[str, DslFunction] = { @@ -905,8 +1104,9 @@ def preference_prob(score1: float, score2: float, temperature: float = 1.0) -> f "not": not_, } -# Update STDLIB_FUNCTIONS with simulation functions +# Update STDLIB_FUNCTIONS with simulation and structural-query functions STDLIB_FUNCTIONS.update(SIMULATION_FUNCTIONS) +STDLIB_FUNCTIONS.update(STRUCTURE_FUNCTIONS) def register_stdlib(context: EvaluationContext) -> None: diff --git a/bead/interop/__init__.py b/bead/interop/__init__.py new file mode 100644 index 0000000..64d10e6 --- /dev/null +++ b/bead/interop/__init__.py @@ -0,0 +1,5 @@ +"""Lossless interoperability mappings between bead and external schemas. + +Currently provides bidirectional, law-verified lenses between bead models and +the ``layers`` linguistic-annotation schema (``bead.interop.layers``). +""" diff --git a/bead/interop/layers/__init__.py b/bead/interop/layers/__init__.py new file mode 100644 index 0000000..56f7b36 --- /dev/null +++ b/bead/interop/layers/__init__.py @@ -0,0 +1,80 @@ +"""Lossless, law-verified lenses between bead models and the ``layers`` schema. + +Maps bead's corpus and annotation models to ``layers``-shaped JSON and back via +didactic lenses (``dx.Lens``/``dx.Iso``): the layers view is a faithful, +standalone projection; the lens complement holds the bead-only round-trip +remainder. Round-trip fidelity is guaranteed by the didactic GetPut/PutGet laws. 
+ +What is mapped: + +- The linguistic ``layers`` constructs (the shared object definitions and the + expression, segmentation, annotation, graph, media, and ontology records) are + mirrored as didactic models with a generic ``MirrorIso`` (see ``models``, + ``models_records``, and ``model_lenses``). +- bead's pipeline outputs bridge directly to layers: ``CorpusGraph`` to the + property graph, ``CorpusRecord`` to an ``expression``, and a dependency parse + to a ``tokenization`` with part-of-speech and dependency annotation layers. +- bead's resources map to their layers counterparts: ``LexicalItem`` to an + ``entry``, ``Lexicon`` to a ``collection``, and ``Template`` to a ``template`` + (see ``resource_lens``). +""" + +from __future__ import annotations + +from bead.interop.layers.bridges import ( + RECORD_EXPRESSION, + RecordExpressionLens, + record_to_expression, +) +from bead.interop.layers.graph_lens import ( + CORPUS_GRAPH_LAYERS, + CorpusGraphLayersLens, + graph_to_layers, +) +from bead.interop.layers.model_lenses import ( + ALL_MIRROR_ISOS, + RECORD_ISOS, + RECORD_MODELS, + SHARED_DEF_ISOS, + SHARED_DEF_MODELS, + MirrorIso, + mirror_iso, +) +from bead.interop.layers.parse_lens import ( + PARSED_SENTENCE_LAYERS, + ParsedSentenceLayersIso, + parse_to_layers, +) +from bead.interop.layers.resource_lens import ( + LEXICAL_ITEM_ENTRY, + LEXICON_COLLECTION, + TEMPLATE_LAYERS, + LexicalItemEntryLens, + LexiconCollectionLens, + TemplateLayersLens, +) + +__all__ = [ + "ALL_MIRROR_ISOS", + "CORPUS_GRAPH_LAYERS", + "LEXICAL_ITEM_ENTRY", + "LEXICON_COLLECTION", + "PARSED_SENTENCE_LAYERS", + "RECORD_EXPRESSION", + "TEMPLATE_LAYERS", + "LexicalItemEntryLens", + "LexiconCollectionLens", + "TemplateLayersLens", + "RECORD_ISOS", + "RECORD_MODELS", + "SHARED_DEF_ISOS", + "SHARED_DEF_MODELS", + "CorpusGraphLayersLens", + "MirrorIso", + "ParsedSentenceLayersIso", + "RecordExpressionLens", + "graph_to_layers", + "mirror_iso", + "parse_to_layers", + "record_to_expression", +] diff 
--git a/bead/interop/layers/_convert.py b/bead/interop/layers/_convert.py
new file mode 100644
index 0000000..6d56395
--- /dev/null
+++ b/bead/interop/layers/_convert.py
@@ -0,0 +1,208 @@
+"""Shared, reversible conversions between bead values and layers JSON shapes.
+
+These helpers centralize the mechanical, lossless conversions every layers lens
+relies on: feature maps, object references, confidence scaling, and capture /
+restore of a bead model's framework identity (the ``BeadBaseModel`` id and
+timestamps, which ``layers`` represents through its own identity scheme and so
+travel in a lens complement rather than the layers view).
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Mapping
+from datetime import datetime
+from typing import TYPE_CHECKING
+from uuid import UUID
+
+from bead.corpus.records import ProvenanceValue
+from bead.data.base import BeadBaseModel, JsonValue
+
+if TYPE_CHECKING:
+    from bead.items.item import MetadataValue
+
+
+def to_feature_map(features: Mapping[str, MetadataValue]) -> JsonValue:
+    """Encode a feature dict as a layers ``featureMap`` (values JSON-encoded).
+
+    Each value is serialized with ``json.dumps`` so arbitrary (including
+    non-string) values round-trip exactly via :func:`from_feature_map`. Entries
+    preserve the dict's insertion order so the round-trip is exact.
+    """
+    entries: tuple[JsonValue, ...] = tuple(
+        {"key": key, "value": json.dumps(features[key])} for key in features
+    )
+    return {"entries": entries}
+
+
+type _Loaded = str | int | float | bool | None | list["_Loaded"] | dict[str, "_Loaded"]
+
+
+def _tuplify(value: _Loaded) -> MetadataValue:
+    """Convert ``json.loads`` output (lists) into the tuple-based MetadataValue."""
+    if isinstance(value, list):
+        return tuple(_tuplify(item) for item in value)
+    if isinstance(value, dict):
+        return {str(key): _tuplify(val) for key, val in value.items()}
+    return value
+
+
+def from_feature_map(feature_map: JsonValue) -> dict[str, MetadataValue]:
+    """Decode a layers ``featureMap`` back into a feature dict.
+
+    Malformed input is skipped rather than rejected: a non-dict map, a
+    non-tuple ``entries`` value, or an entry without a string key and string
+    value contributes nothing to the result.
+    """
+    result: dict[str, MetadataValue] = {}
+    if not isinstance(feature_map, dict):
+        return result
+    entries = feature_map.get("entries")
+    if not isinstance(entries, tuple):
+        return result
+    for entry in entries:
+        if isinstance(entry, dict):
+            key = entry.get("key")
+            value = entry.get("value")
+            if isinstance(key, str) and isinstance(value, str):
+                result[key] = _tuplify(json.loads(value))
+    return result
+
+
+def from_feature_map_scalar(feature_map: JsonValue) -> dict[str, ProvenanceValue]:
+    """Decode a ``featureMap`` whose values are flat provenance scalars."""
+    result: dict[str, ProvenanceValue] = {}
+    if not isinstance(feature_map, dict):
+        return result
+    entries = feature_map.get("entries")
+    if not isinstance(entries, tuple):
+        return result
+    for entry in entries:
+        if isinstance(entry, dict):
+            key = entry.get("key")
+            value = entry.get("value")
+            if isinstance(key, str) and isinstance(value, str):
+                result[key] = json.loads(value)
+    return result
+
+
+def strip_nulls(value: JsonValue) -> JsonValue:
+    """Recursively drop dict entries whose value is ``None``.
+
+    The ATProto data model has no null: optional fields are omitted, not set to
+    null, and a lexicon rejects an explicit null for a typed optional field.
+    Layers views therefore omit absent optionals; the round-trip is unaffected
+    because the reverse direction defaults missing keys back to ``None``.
+    """
+    if isinstance(value, dict):
+        return {
+            key: strip_nulls(item) for key, item in value.items() if item is not None
+        }
+    if isinstance(value, tuple):
+        return tuple(strip_nulls(item) for item in value)
+    return value
+
+
+def object_ref(local_id: str) -> JsonValue:
+    """Build a layers ``objectRef`` to a local node by id."""
+    return {"localId": {"value": local_id}}
+
+
+def from_object_ref(ref: JsonValue) -> str:
+    """Read the local id out of a layers ``objectRef``."""
+    if isinstance(ref, dict):
+        local = ref.get("localId")
+        if isinstance(local, dict):
+            value = local.get("value")
+            if isinstance(value, str):
+                return value
+    raise ValueError("objectRef has no localId.value")
+
+
+def identity_of(model: BeadBaseModel) -> JsonValue:
+    """Capture a model's framework identity for a lens complement."""
+    return {
+        "id": str(model.id),
+        "created_at": model.created_at.isoformat(),
+        "modified_at": model.modified_at.isoformat(),
+        "version": model.version,
+        "metadata": dict(model.metadata),
+    }
+
+
+def apply_identity[T: BeadBaseModel](model: T, identity: JsonValue) -> T:
+    """Restore a model's captured framework identity onto a fresh instance.
+
+    The model is constructed with content fields (and default identity); this
+    overrides the framework identity (id, timestamps, version, metadata) with
+    the values captured by :func:`identity_of`, so a round-trip is exact.
+    """
+    fields = j_obj(identity)
+    metadata = fields["metadata"]
+    return model.with_(
+        id=UUID(_as_str(fields["id"])),
+        created_at=datetime.fromisoformat(_as_str(fields["created_at"])),
+        modified_at=datetime.fromisoformat(_as_str(fields["modified_at"])),
+        version=_as_str(fields["version"]),
+        metadata=metadata if isinstance(metadata, dict) else {},
+    )
+
+
+def _as_str(value: JsonValue) -> str:
+    """Return ``value`` unchanged when it is a string, raising otherwise."""
+    if not isinstance(value, str):
+        raise ValueError(f"expected str, got {type(value).__name__}")
+    return value
+
+
+def j_obj(value: JsonValue) -> dict[str, JsonValue]:
+    """Narrow a ``JsonValue`` to a JSON object, raising otherwise."""
+    if not isinstance(value, dict):
+        raise ValueError(f"expected JSON object, got {type(value).__name__}")
+    return value
+
+
+def j_list(value: JsonValue) -> tuple[JsonValue, ...]:
+    """Narrow a ``JsonValue`` to a JSON array, raising otherwise."""
+    if isinstance(value, tuple):
+        return value
+    raise ValueError(f"expected JSON array, got {type(value).__name__}")
+
+
+def j_str(value: JsonValue) -> str:
+    """Narrow a ``JsonValue`` to a string, raising otherwise."""
+    return _as_str(value)
+
+
+def j_str_or_none(value: JsonValue) -> str | None:
+    """Narrow a ``JsonValue`` to ``str | None``."""
+    if value is None or isinstance(value, str):
+        return value
+    raise ValueError(f"expected str or None, got {type(value).__name__}")
+
+
+def j_float_or_none(value: JsonValue) -> float | None:
+    """Narrow a ``JsonValue`` to ``float | None``.
+
+    ``bool`` is rejected even though it subclasses ``int``, matching
+    :func:`j_int`: a JSON ``true``/``false`` is not a number.
+    """
+    if value is None:
+        return None
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        raise ValueError(f"expected number or None, got {type(value).__name__}")
+    return value
+
+
+def j_bool(value: JsonValue) -> bool:
+    """Narrow a ``JsonValue`` to a bool, raising otherwise."""
+    if isinstance(value, bool):
+        return value
+    raise ValueError(f"expected bool, got {type(value).__name__}")
+
+
+def j_int(value: JsonValue) -> int:
+    """Narrow a ``JsonValue`` to an int, raising otherwise."""
+    if isinstance(value, bool) or not isinstance(value, int):
+        raise ValueError(f"expected int, got {type(value).__name__}")
+    return value
diff --git a/bead/interop/layers/_mirror.py b/bead/interop/layers/_mirror.py
new file mode 100644
index 0000000..7f349ef
--- /dev/null
+++ b/bead/interop/layers/_mirror.py
@@ -0,0 +1,115 @@
+"""Generic serialization between mirror models and layers JSON.
+
+The mirror models in :mod:`bead.interop.layers.models` match the ``layers``
+schema structurally: snake_case fields correspond to layers' camelCase, nested
+objects are embedded models, feature maps are :class:`FeatureMap`, and
+confidence is an integer. A single pair of conversions therefore serializes any
+of them to and from layers-shaped JSON.
+
+Conversion goes through each model's canonical JSON form (``model_dump_json`` /
+``model_validate_json``), so it does not depend on didactic's internal
+field-value types.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+
+import didactic.api as dx
+
+from bead.data.base import JsonValue
+from bead.interop.layers._convert import j_obj, strip_nulls
+
+_CAMEL_BOUNDARY = re.compile(r"([A-Z])")
+
+type _Loaded = str | int | float | bool | None | list["_Loaded"] | dict[str, "_Loaded"]
+
+
+def _to_camel(name: str) -> str:
+    head, *rest = name.split("_")
+    return head + "".join(part[:1].upper() + part[1:] for part in rest)
+
+
+def _to_snake(name: str) -> str:
+    return _CAMEL_BOUNDARY.sub(lambda match: "_" + match.group(1).lower(), name)
+
+
+def _camel_keys(value: _Loaded) -> JsonValue:
+    """Recursively camelCase dict keys and turn JSON arrays into tuples."""
+    if isinstance(value, dict):
+        return {_to_camel(key): _camel_keys(item) for key, item in value.items()}
+    if isinstance(value, list):
+        return tuple(_camel_keys(item) for item in value)
+    return value
+
+
+def _snake_keys(value: JsonValue) -> JsonValue:
+    """Recursively snake_case dict keys (arrays stay tuples)."""
+    if isinstance(value, dict):
+        return {_to_snake(key): _snake_keys(item) for key, item in
value.items()}
+    if isinstance(value, tuple):
+        return tuple(_snake_keys(item) for item in value)
+    return value
+
+
+_DEFS_NSID = "pub.layers.defs"
+
+#: The ``externalTarget.selector`` union variants (camelCase mirror keys, each
+#: also the layers def name). layers models this as an open ATProto union, so the
+#: wire value carries a ``$type`` discriminator rather than a wrapper key.
+_SELECTOR_VARIANTS = frozenset(
+    {"textQuoteSelector", "textPositionSelector", "fragmentSelector"}
+)
+
+
+def _wrap_unions(value: JsonValue) -> JsonValue:
+    """Rewrite ``selector`` wrappers into ATProto ``$type`` union members."""
+    if isinstance(value, dict):
+        result: dict[str, JsonValue] = {}
+        for key, item in value.items():
+            if key == "selector" and isinstance(item, dict) and len(item) == 1:
+                variant, payload = next(iter(item.items()))
+                if variant in _SELECTOR_VARIANTS and isinstance(payload, dict):
+                    member: dict[str, JsonValue] = {"$type": f"{_DEFS_NSID}#{variant}"}
+                    for inner_key, inner_item in payload.items():
+                        member[inner_key] = _wrap_unions(inner_item)
+                    result[key] = member
+                    continue  # handled; skip the generic recursion below
+            result[key] = _wrap_unions(item)
+        return result
+    if isinstance(value, tuple):
+        return tuple(_wrap_unions(item) for item in value)
+    return value
+
+
+def _unwrap_unions(value: JsonValue) -> JsonValue:
+    """Rewrite ATProto ``$type`` selector union members back to wrappers."""
+    if isinstance(value, dict):
+        result: dict[str, JsonValue] = {}
+        for key, item in value.items():
+            type_ref = item.get("$type") if isinstance(item, dict) else None
+            if key == "selector" and isinstance(type_ref, str):
+                variant = type_ref.rsplit("#", 1)[-1]  # def name after the NSID "#"
+                payload: dict[str, JsonValue] = {}
+                for inner_key, inner_item in j_obj(item).items():
+                    if inner_key != "$type":  # discriminator is not payload
+                        payload[inner_key] = _unwrap_unions(inner_item)
+                result[key] = {variant: payload}
+                continue  # handled; skip the generic recursion below
+            result[key] = _unwrap_unions(item)
+        return result
+    if isinstance(value, tuple):
+        return tuple(_unwrap_unions(item) for item in value)
+    return value
+
+
+def mirror_to_layers(model: dx.Model) -> JsonValue:
+    """Serialize a faithful mirror model to layers-shaped JSON (camelCase)."""
+    return _wrap_unions(strip_nulls(_camel_keys(json.loads(model.model_dump_json()))))
+
+
+def mirror_from_layers[M: dx.Model](model_type: type[M], data: JsonValue) -> M:
+    """Deserialize layers-shaped JSON back into a mirror model."""
+    restored = _snake_keys(j_obj(_unwrap_unions(data)))
+    return model_type.model_validate_json(json.dumps(restored))
diff --git a/bead/interop/layers/bridges.py b/bead/interop/layers/bridges.py
new file mode 100644
index 0000000..fc0d30a
--- /dev/null
+++ b/bead/interop/layers/bridges.py
@@ -0,0 +1,69 @@
+"""Bridge lenses between bead-native models and layers constructs.
+
+These map the things bead's pipeline actually produces onto layers records:
+
+- :class:`~bead.corpus.records.CorpusRecord` <-> a layers ``expression``.
+
+The layers view is a faithful, standalone projection; the lens complement holds
+the bead-only remainder (framework identity and fields layers has no slot for),
+so the round-trip is exact and the GetPut/PutGet laws hold.
+"""
+
+from __future__ import annotations
+
+import didactic.api as dx
+
+from bead.corpus.records import CorpusRecord
+from bead.data.base import JsonValue
+from bead.interop.layers._convert import (
+    apply_identity,
+    from_feature_map_scalar,
+    identity_of,
+    j_int,
+    j_obj,
+    j_str,
+    to_feature_map,
+)
+
+_EXPRESSION_KIND = "expression"
+
+
+class RecordExpressionLens(dx.Lens[CorpusRecord, JsonValue, JsonValue]):
+    """Lossless lens ``CorpusRecord <-> (layers expression view, complement)``."""
+
+    def forward(self, record: CorpusRecord) -> tuple[JsonValue, JsonValue]:
+        """Project a record to a layers expression view and bead complement."""
+        view: JsonValue = {
+            "id": str(record.id),
+            "kind": _EXPRESSION_KIND,
+            "text": record.text,
+            "features": to_feature_map(record.provenance),
+            "createdAt": record.created_at.isoformat(),
+        }
+        complement: JsonValue = {
+            "identity": identity_of(record),
+            "source_name": record.source_name,
+            "record_index": record.record_index,
+        }
+        return view, complement
+
+    def backward(self, view: JsonValue, complement: JsonValue) -> CorpusRecord:
+        """Reconstruct a record from its layers expression view and complement."""
+        view_obj = j_obj(view)
+        comp = j_obj(complement)
+        # Only "text" and "features" are read from the view here.
+        record = CorpusRecord(
+            text=j_str(view_obj["text"]),
+            source_name=j_str(comp["source_name"]),
+            record_index=j_int(comp["record_index"]),
+            provenance=from_feature_map_scalar(view_obj["features"]),
+        )
+        return apply_identity(record, comp["identity"])  # identity from complement
+
+
+RECORD_EXPRESSION = RecordExpressionLens()
+
+
+def record_to_expression(record: CorpusRecord) -> JsonValue:
+    """Return the standalone layers ``expression`` view of a corpus record."""
+    view, _complement = RECORD_EXPRESSION.forward(record)
+    return view
diff --git a/bead/interop/layers/graph_lens.py b/bead/interop/layers/graph_lens.py
new file mode 100644
index 0000000..6f861f2
--- /dev/null
+++ b/bead/interop/layers/graph_lens.py
@@ -0,0 +1,208 @@
+"""Lens between a ``CorpusGraph`` and the layers
property graph.
+
+The lens projects a :class:`~bead.corpus.graph.CorpusGraph` to a layers-shaped
+view (expression records, graph nodes, and a ``graphEdgeSet``) and keeps a
+complement holding the information layers' graph does not express directly (the
+bead framework identity, edge directedness, and exact float confidence).
+Together the view and complement reconstruct the graph exactly.
+"""
+
+from __future__ import annotations
+
+import didactic.api as dx
+
+from bead.corpus.graph import CorpusEdge, CorpusGraph, CorpusNode
+from bead.corpus.records import CorpusRecord
+from bead.data.base import JsonValue
+from bead.interop.layers._convert import (
+    apply_identity,
+    from_feature_map,
+    from_feature_map_scalar,
+    from_object_ref,
+    identity_of,
+    j_bool,
+    j_float_or_none,
+    j_int,
+    j_list,
+    j_obj,
+    j_str,
+    j_str_or_none,
+    object_ref,
+    to_feature_map,
+)
+
+_CONFIDENCE_SCALE = 1000  # view confidence = round(float confidence * 1000)
+
+
+class CorpusGraphLayersLens(dx.Lens[CorpusGraph, JsonValue, JsonValue]):
+    """Lossless lens ``CorpusGraph <-> (layers graph view, bead complement)``."""
+
+    def forward(self, graph: CorpusGraph) -> tuple[JsonValue, JsonValue]:
+        """Project a graph to its layers view and bead complement."""
+        expressions: dict[str, JsonValue] = {}
+        graph_nodes: dict[str, JsonValue] = {}
+        node_complements: dict[str, JsonValue] = {}
+        node_order: list[JsonValue] = []
+
+        for node in graph.nodes:
+            node_order.append(node.node_id)  # backward() rebuilds nodes in this order
+            if node.record is not None:
+                expr: dict[str, JsonValue] = {
+                    "id": node.node_id,
+                    "kind": node.node_type,
+                    "text": node.record.text,
+                    "features": to_feature_map(node.record.provenance),
+                    "createdAt": node.record.created_at.isoformat(),
+                }
+                if node.node_type_uri is not None:
+                    expr["kindUri"] = node.node_type_uri
+                expressions[node.node_id] = expr
+                node_complements[node.node_id] = {
+                    "is_expression": True,
+                    "identity": identity_of(node),
+                    "label": node.label,
+                    "properties": to_feature_map(node.properties),
+                    "record_identity": identity_of(node.record),
+                    "record_source_name": node.record.source_name,
+                    "record_index": node.record.record_index,
+                }
+            else:
+                graph_node: dict[str, JsonValue] = {
+                    "nodeType": node.node_type,
+                    "properties": to_feature_map(node.properties),
+                    "createdAt": node.created_at.isoformat(),
+                }
+                if node.node_type_uri is not None:
+                    graph_node["nodeTypeUri"] = node.node_type_uri
+                if node.label is not None:
+                    graph_node["label"] = node.label
+                graph_nodes[node.node_id] = graph_node
+                node_complements[node.node_id] = {
+                    "is_expression": False,
+                    "identity": identity_of(node),
+                }
+
+        edge_views: list[JsonValue] = []
+        edge_complements: list[JsonValue] = []
+        for edge in graph.edges:
+            edge_view: dict[str, JsonValue] = {
+                "edgeType": edge.edge_type,
+                "source": object_ref(edge.source_id),
+                "target": object_ref(edge.target_id),
+                "features": to_feature_map(edge.features),
+            }
+            if edge.edge_type_uri is not None:
+                edge_view["edgeTypeUri"] = edge.edge_type_uri
+            if edge.confidence is not None:
+                edge_view["confidence"] = round(edge.confidence * _CONFIDENCE_SCALE)
+            edge_view["uuid"] = {"value": str(edge.id)}
+            edge_views.append(edge_view)
+            edge_complements.append(
+                {
+                    "identity": identity_of(edge),
+                    "directed": edge.directed,
+                    "confidence": edge.confidence,  # exact float kept here
+                }
+            )
+
+        view: JsonValue = {
+            "expressions": expressions,
+            "graphNodes": graph_nodes,
+            "graphEdgeSet": {
+                "edges": tuple(edge_views),
+                "createdAt": graph.created_at.isoformat(),
+            },
+        }
+        complement: JsonValue = {
+            "graph_identity": identity_of(graph),
+            "graph_metadata": to_feature_map(graph.graph_metadata),
+            "node_order": tuple(node_order),
+            "node_complements": node_complements,
+            "edge_complements": tuple(edge_complements),
+        }
+        return view, complement
+
+    def backward(self, view: JsonValue, complement: JsonValue) -> CorpusGraph:
+        """Reconstruct the graph from its layers view and bead complement."""
+        view_obj = j_obj(view)
+        comp = j_obj(complement)
+        expressions = j_obj(view_obj["expressions"])
+        graph_nodes = j_obj(view_obj["graphNodes"])
+        node_complements = j_obj(comp["node_complements"])
+
+        nodes: list[CorpusNode] = []
+        for node_id_value in j_list(comp["node_order"]):
+            node_id = j_str(node_id_value)
+            node_comp = j_obj(node_complements[node_id])
+            if j_bool(node_comp["is_expression"]):
+                entry = j_obj(expressions[node_id])
+                record = apply_identity(
+                    CorpusRecord(
+                        text=j_str(entry["text"]),
+                        source_name=j_str(node_comp["record_source_name"]),
+                        record_index=j_int(node_comp["record_index"]),
+                        provenance=from_feature_map_scalar(entry["features"]),
+                    ),
+                    node_comp["record_identity"],
+                )
+                node = CorpusNode(
+                    node_id=node_id,
+                    node_type=j_str(entry["kind"]),
+                    node_type_uri=j_str_or_none(entry.get("kindUri")),
+                    label=j_str_or_none(node_comp["label"]),
+                    record=record,
+                    properties=from_feature_map(node_comp["properties"]),
+                )
+            else:
+                entry = j_obj(graph_nodes[node_id])
+                node = CorpusNode(
+                    node_id=node_id,
+                    node_type=j_str(entry["nodeType"]),
+                    node_type_uri=j_str_or_none(entry.get("nodeTypeUri")),
+                    label=j_str_or_none(entry.get("label")),
+                    record=None,
+                    properties=from_feature_map(entry["properties"]),
+                )
+            nodes.append(apply_identity(node, node_comp["identity"]))
+
+        edge_set = j_obj(view_obj["graphEdgeSet"])
+        edge_views = j_list(edge_set["edges"])
+        edge_complements = j_list(comp["edge_complements"])
+        edges: list[CorpusEdge] = []
+        for edge_view_value, edge_comp_value in zip(
+            edge_views, edge_complements, strict=True
+        ):
+            edge_view = j_obj(edge_view_value)
+            edge_comp = j_obj(edge_comp_value)
+            edges.append(
+                apply_identity(
+                    CorpusEdge(
+                        source_id=from_object_ref(edge_view["source"]),
+                        target_id=from_object_ref(edge_view["target"]),
+                        edge_type=j_str(edge_view["edgeType"]),
+                        edge_type_uri=j_str_or_none(edge_view.get("edgeTypeUri")),
+                        directed=j_bool(edge_comp["directed"]),
+                        confidence=j_float_or_none(edge_comp["confidence"]),
+                        features=from_feature_map(edge_view["features"]),
+                    ),
+                    edge_comp["identity"],
+                )
+            )
+
+        return apply_identity(
+            CorpusGraph(
+                nodes=tuple(nodes),
+                edges=tuple(edges),
+                graph_metadata=from_feature_map(comp["graph_metadata"]),
+            ),
+            comp["graph_identity"],
+        )
+
+
+CORPUS_GRAPH_LAYERS = CorpusGraphLayersLens()
+
+
+def graph_to_layers(graph: CorpusGraph) -> JsonValue:
+    """Return the standalone layers-shaped view of a corpus graph."""
+    view, _complement = CORPUS_GRAPH_LAYERS.forward(graph)
+    return view
diff --git a/bead/interop/layers/model_lenses.py b/bead/interop/layers/model_lenses.py
new file mode 100644
index 0000000..d26182a
--- /dev/null
+++ b/bead/interop/layers/model_lenses.py
@@ -0,0 +1,154 @@
+"""Generic lossless isos between faithful mirror models and layers JSON.
+
+A single :class:`MirrorIso` (parameterized by model type) serves every faithful
+mirror model, since they all serialize through the structural snake<->camel
+conversion in :mod:`bead.interop.layers._mirror`. ``SHARED_DEF_ISOS`` registers
+one iso per shared-def mirror so a coverage test can assert every construct has
+a law-passing mapping.
+"""
+
+from __future__ import annotations
+
+import didactic.api as dx
+
+from bead.data.base import JsonValue
+from bead.interop.layers._mirror import mirror_from_layers, mirror_to_layers
+from bead.interop.layers.models import (
+    AgentRef,
+    AlignmentLink,
+    Anchor,
+    AnnotationMetadata,
+    BoundingBox,
+    ExternalTarget,
+    Feature,
+    FeatureMap,
+    FragmentSelector,
+    Keyframe,
+    KnowledgeRef,
+    LayersConstraint,
+    LayersSpan,
+    LayersUuid,
+    ObjectRef,
+    PageAnchor,
+    Selector,
+    SpatialEntity,
+    SpatialExpression,
+    SpatialModifier,
+    SpatioTemporalAnchor,
+    TemporalEntity,
+    TemporalExpression,
+    TemporalModifier,
+    TemporalSpan,
+    TextPositionSelector,
+    TextQuoteSelector,
+    TokenRef,
+    TokenRefSequence,
+)
+from bead.interop.layers.models_records import (
+    Annotation,
+    AnnotationLayer,
+    ArgumentRef,
+    AudioInfo,
+    Cluster,
+    DocumentInfo,
+    Expression,
+    GraphEdge,
+    GraphEdgeEntry,
+    GraphEdgeSet,
+    GraphNode,
+    RoleSlot,
+    Token,
+    Tokenization,
+    TypeDef,
+    VideoInfo,
+)
+
+
+class MirrorIso[T: dx.Model](dx.Iso[T, JsonValue]):
+    """Lossless iso between a faithful mirror model and layers-shaped JSON."""
+
+    def __init__(self, model_type: type[T]) -> None:
+        self._model_type = model_type  # consumed by backward()
+
+    def forward(self, model: T) -> JsonValue:
+        """Serialize the mirror model to layers JSON."""
+        return mirror_to_layers(model)
+
+    def backward(self, data: JsonValue) -> T:
+        """Deserialize layers JSON back into the mirror model."""
+        return mirror_from_layers(self._model_type, data)
+
+
+def mirror_iso[T: dx.Model](model_type: type[T]) -> MirrorIso[T]:
+    """Build a :class:`MirrorIso` for a mirror model type."""
+    return MirrorIso(model_type)
+
+
+#: Every shared-def mirror model, for coverage and registration.
+SHARED_DEF_MODELS: tuple[type[dx.Model], ...] = (
+    LayersUuid,
+    Feature,
+    FeatureMap,
+    KnowledgeRef,
+    BoundingBox,
+    TemporalSpan,
+    AgentRef,
+    ObjectRef,
+    LayersSpan,
+    TokenRef,
+    TokenRefSequence,
+    Keyframe,
+    SpatioTemporalAnchor,
+    TemporalEntity,
+    TemporalModifier,
+    TemporalExpression,
+    SpatialEntity,
+    SpatialModifier,
+    SpatialExpression,
+    PageAnchor,
+    TextQuoteSelector,
+    TextPositionSelector,
+    FragmentSelector,
+    Selector,
+    ExternalTarget,
+    Anchor,
+    AlignmentLink,
+    AnnotationMetadata,
+    LayersConstraint,
+)
+
+#: One lossless iso per shared-def mirror model.
+SHARED_DEF_ISOS: dict[type[dx.Model], MirrorIso[dx.Model]] = {
+    model_type: MirrorIso(model_type) for model_type in SHARED_DEF_MODELS
+}
+
+#: Every linguistic record mirror model.
+RECORD_MODELS: tuple[type[dx.Model], ...] = (
+    Expression,
+    Token,
+    Tokenization,
+    ArgumentRef,
+    Annotation,
+    Cluster,
+    AnnotationLayer,
+    GraphNode,
+    GraphEdge,
+    GraphEdgeEntry,
+    GraphEdgeSet,
+    AudioInfo,
+    VideoInfo,
+    DocumentInfo,
+    RoleSlot,
+    TypeDef,
+)
+
+#: One lossless iso per record mirror model.
+RECORD_ISOS: dict[type[dx.Model], MirrorIso[dx.Model]] = {
+    model_type: MirrorIso(model_type) for model_type in RECORD_MODELS
+}
+
+#: All mirror isos (shared defs + records), keyed by model type.
+ALL_MIRROR_ISOS: dict[type[dx.Model], MirrorIso[dx.Model]] = {
+    **SHARED_DEF_ISOS,
+    **RECORD_ISOS,
+}
diff --git a/bead/interop/layers/models.py b/bead/interop/layers/models.py
new file mode 100644
index 0000000..1a7ff6f
--- /dev/null
+++ b/bead/interop/layers/models.py
@@ -0,0 +1,289 @@
+"""Faithful didactic mirrors of the ``layers`` shared object definitions.
+
+Each model mirrors a ``pub.layers.defs`` object field-for-field (snake_case
+names corresponding to layers' camelCase, nested objects as embedded models,
+feature maps as :class:`FeatureMap`, confidence as an integer 0-1000).
The
+structural fidelity lets :mod:`bead.interop.layers._mirror` serialize any of
+them to and from layers JSON losslessly with a single generic conversion.
+
+Names that would clash with bead's own models are prefixed ``Layers``.
+"""
+
+from __future__ import annotations
+
+import didactic.api as dx
+
+
+class LayersUuid(dx.Model):
+    """A layers ``uuid`` value object."""
+
+    value: str
+
+
+class Feature(dx.Model):
+    """A single key/value entry in a layers ``featureMap``."""
+
+    key: str
+    value: str
+
+
+class FeatureMap(dx.Model):
+    """A layers ``featureMap`` (ordered key/value entries)."""
+
+    entries: tuple[dx.Embed[Feature], ...] = ()
+
+
+class KnowledgeRef(dx.Model):
+    """A layers ``knowledgeRef`` grounding to an external knowledge base."""
+
+    source: str
+    identifier: str
+    source_uri: str | None = None
+    uri: str | None = None
+    label: str | None = None
+
+
+class BoundingBox(dx.Model):
+    """A layers ``boundingBox`` (pixel region)."""
+
+    x: int
+    y: int
+    width: int
+    height: int
+
+
+class TemporalSpan(dx.Model):
+    """A layers ``temporalSpan`` (millisecond interval)."""
+
+    start: int
+    ending: int
+
+
+class AgentRef(dx.Model):
+    """A layers ``agentRef`` (annotating agent)."""
+
+    did: str | None = None
+    id: str | None = None
+    name: str | None = None
+    knowledge_ref: dx.Embed[KnowledgeRef] | None = None
+
+
+class ObjectRef(dx.Model):
+    """A layers ``objectRef`` (local, record, or external reference)."""
+
+    local_id: dx.Embed[LayersUuid] | None = None
+    record_ref: str | None = None
+    object_id: dx.Embed[LayersUuid] | None = None
+    knowledge_ref: dx.Embed[KnowledgeRef] | None = None
+
+
+class LayersSpan(dx.Model):
+    """A layers ``span`` (UTF-8 byte offsets, optional char offsets)."""
+
+    byte_start: int
+    byte_end: int
+    char_start: int | None = None
+    char_end: int | None = None
+
+
+class TokenRef(dx.Model):
+    """A layers ``tokenRef`` (single token in a tokenization)."""
+
+    tokenization_id: dx.Embed[LayersUuid]
+    token_index: int
+
+
+class TokenRefSequence(dx.Model):
+    """A layers ``tokenRefSequence`` (ordered tokens, optional anchor)."""
+
+    tokenization_id: dx.Embed[LayersUuid]
+    token_indexes: tuple[int, ...] = ()
+    anchor_token_index: int | None = None
+
+
+class Keyframe(dx.Model):
+    """A layers ``keyframe`` (a bounding box at a video time)."""
+
+    time_ms: int
+    bbox: dx.Embed[BoundingBox]
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class SpatioTemporalAnchor(dx.Model):
+    """A layers ``spatioTemporalAnchor`` (time span plus keyframes)."""
+
+    temporal_span: dx.Embed[TemporalSpan]
+    keyframes: tuple[dx.Embed[Keyframe], ...] = ()
+    interpolation_uri: str | None = None
+    interpolation: str | None = None
+
+
+class TemporalEntity(dx.Model):
+    """A layers ``temporalEntity`` (instant/interval/duration value)."""
+
+    instant: str | None = None
+    interval_start: str | None = None
+    interval_end: str | None = None
+    duration: str | None = None
+    earliest: str | None = None
+    latest: str | None = None
+    granularity_uri: str | None = None
+    granularity: str | None = None
+    calendar_uri: str | None = None
+    calendar: str | None = None
+    recurrence: str | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class TemporalModifier(dx.Model):
+    """A layers ``temporalModifier``."""
+
+    mod_uri: str | None = None
+    mod: str | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class TemporalExpression(dx.Model):
+    """A layers ``temporalExpression``."""
+
+    type_uri: str | None = None
+    type: str | None = None
+    value: dx.Embed[TemporalEntity] | None = None
+    modifier: dx.Embed[TemporalModifier] | None = None
+    anchor_ref: dx.Embed[ObjectRef] | None = None
+    function_uri: str | None = None
+    function: str | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class SpatialEntity(dx.Model):
+    """A layers ``spatialEntity`` (geometry/region value)."""
+
+    bbox: dx.Embed[BoundingBox] | None = None
+    geometry: str | None = None
+    type_uri: str | None = None
+    type: str | None = None
+    geometry_format_uri: str | None = None
+    geometry_format: str | None = None
+    crs_uri: str | None = None
+    crs: str | None = None
+    dimensions: int | None = None
+    uncertainty: str | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class SpatialModifier(dx.Model):
+    """A layers ``spatialModifier``."""
+
+    mod_uri: str | None = None
+    mod: str | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class SpatialExpression(dx.Model):
+    """A layers ``spatialExpression``."""
+
+    type_uri: str | None = None
+    type: str | None = None
+    value: dx.Embed[SpatialEntity] | None = None
+    modifier: dx.Embed[SpatialModifier] | None = None
+    anchor_ref: dx.Embed[ObjectRef] | None = None
+    function_uri: str | None = None
+    function: str | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class PageAnchor(dx.Model):
+    """A layers ``pageAnchor`` (a region on a document page)."""
+
+    page: int
+    bounding_box: dx.Embed[BoundingBox] | None = None
+    text_span: dx.Embed[LayersSpan] | None = None
+
+
+class TextQuoteSelector(dx.Model):
+    """A W3C-style ``textQuoteSelector``."""
+
+    exact: str
+    prefix: str | None = None
+    suffix: str | None = None
+
+
+class TextPositionSelector(dx.Model):
+    """A W3C-style ``textPositionSelector``."""
+
+    byte_start: int
+    byte_end: int
+    char_start: int | None = None
+    char_end: int | None = None
+
+
+class FragmentSelector(dx.Model):
+    """A W3C-style ``fragmentSelector``."""
+
+    value: str
+    conforms_to: str | None = None
+
+
+class Selector(dx.Model):
+    """The selector union of a layers ``externalTarget``."""
+
+    text_quote_selector: dx.Embed[TextQuoteSelector] | None = None
+    text_position_selector: dx.Embed[TextPositionSelector] | None = None
+    fragment_selector: dx.Embed[FragmentSelector] | None = None
+
+
+class ExternalTarget(dx.Model):
+    """A layers ``externalTarget`` (a web resource + selector)."""
+
+    source: str
+    source_hash: str | None = None
+    title: str | None = None
+    selector: dx.Embed[Selector] | None = None
+
+
+class Anchor(dx.Model):
+    """A layers ``anchor`` (the polymorphic attachment point)."""
+
+    text_span: dx.Embed[LayersSpan] | None = None
+    token_ref: dx.Embed[TokenRef] | None = None
+    token_ref_sequence: dx.Embed[TokenRefSequence] | None = None
+    temporal_span: dx.Embed[TemporalSpan] | None = None
+    spatio_temporal_anchor: dx.Embed[SpatioTemporalAnchor] | None = None
+    page_anchor: dx.Embed[PageAnchor] | None = None
+    external_target: dx.Embed[ExternalTarget] | None = None
+
+
+class AlignmentLink(dx.Model):
+    """A layers ``alignmentLink`` (aligned token-index sets)."""
+
+    source_indices: tuple[int, ...] = ()
+    target_indices: tuple[int, ...] = ()
+    confidence: int | None = None  # integer 0-1000 (see module docstring)
+    label: str | None = None
+    knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...] = ()
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class AnnotationMetadata(dx.Model):
+    """A layers ``annotationMetadata`` (provenance for an annotation)."""
+
+    tool: str
+    agent: dx.Embed[AgentRef] | None = None
+    timestamp: str | None = None
+    confidence: int | None = None  # integer 0-1000 (see module docstring)
+    persona_ref: str | None = None
+    dependencies: tuple[dx.Embed[ObjectRef], ...] = ()
+    digest: str | None = None
+
+
+class LayersConstraint(dx.Model):
+    """A layers ``constraint`` (an expression with scope)."""
+
+    expression: str
+    expression_format_uri: str | None = None
+    expression_format: str | None = None
+    scope_uri: str | None = None
+    scope: str | None = None
+    context: tuple[str, ...] = ()
+    description: str | None = None
diff --git a/bead/interop/layers/models_records.py b/bead/interop/layers/models_records.py
new file mode 100644
index 0000000..0268af7
--- /dev/null
+++ b/bead/interop/layers/models_records.py
@@ -0,0 +1,252 @@
+"""Faithful didactic mirrors of the linguistic ``layers`` record types.
+
+Mirrors the expression, segmentation, annotation, graph, media, and ontology
+records field-for-field (reusing the shared-def mirrors in
+:mod:`bead.interop.layers.models`).
Like the shared defs, they serialize to and
+from layers JSON losslessly through the generic snake<->camel conversion.
+Binary ``blob`` fields are mirrored as their reference string.
+"""
+
+from __future__ import annotations
+
+import didactic.api as dx
+
+from bead.interop.layers.models import (
+    Anchor,
+    AnnotationMetadata,
+    FeatureMap,
+    KnowledgeRef,
+    LayersConstraint,
+    LayersSpan,
+    LayersUuid,
+    ObjectRef,
+    SpatialExpression,
+    TemporalExpression,
+    TemporalSpan,
+)
+
+
+class Expression(dx.Model):
+    """A layers ``expression`` (a text/linguistic unit, recursively nested)."""
+
+    id: str
+    kind: str
+    created_at: str
+    kind_uri: str | None = None
+    text: str | None = None
+    parent_ref: str | None = None
+    anchor: dx.Embed[Anchor] | None = None
+    media_ref: str | None = None
+    media_blob: str | None = None  # presumably a blob, kept as its reference string
+    metadata: dx.Embed[AnnotationMetadata] | None = None
+    features: dx.Embed[FeatureMap] | None = None
+    source_url: str | None = None
+    source_ref: str | None = None
+    eprint_ref: str | None = None
+    knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...] = ()
+    languages: tuple[str, ...] = ()
+
+
+class Token(dx.Model):
+    """A layers ``token`` within a tokenization."""
+
+    token_index: int
+    text: str | None = None
+    text_span: dx.Embed[LayersSpan] | None = None
+    temporal_span: dx.Embed[TemporalSpan] | None = None
+
+
+class Tokenization(dx.Model):
+    """A layers ``tokenization`` (one segmentation of an expression)."""
+
+    uuid: dx.Embed[LayersUuid]
+    kind: str
+    kind_uri: str | None = None
+    expression_ref: str | None = None
+    tokens: tuple[dx.Embed[Token], ...] = ()
+    metadata: dx.Embed[AnnotationMetadata] | None = None
+
+
+class ArgumentRef(dx.Model):
+    """A layers ``argumentRef`` (a role-filling argument of a predicate)."""
+
+    role: str
+    target: dx.Embed[ObjectRef]
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class Annotation(dx.Model):
+    """A layers ``annotation`` (the polymorphic annotation object)."""
+
+    uuid: dx.Embed[LayersUuid]
+    anchor: dx.Embed[Anchor] | None = None
+    token_index: int | None = None
+    label: str | None = None
+    value: str | None = None
+    text: str | None = None
+    parent_id: dx.Embed[LayersUuid] | None = None
+    child_ids: tuple[dx.Embed[LayersUuid], ...] = ()
+    head_index: int | None = None
+    target_index: int | None = None
+    arguments: tuple[dx.Embed[ArgumentRef], ...] = ()
+    confidence: int | None = None
+    ontology_type_ref: str | None = None
+    knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...] = ()
+    temporal: dx.Embed[TemporalExpression] | None = None
+    spatial: dx.Embed[SpatialExpression] | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class Cluster(dx.Model):
+    """A layers ``cluster`` (a coreference/equivalence set)."""
+
+    uuid: dx.Embed[LayersUuid]
+    canonical_label: str | None = None
+    members: tuple[dx.Embed[ObjectRef], ...] = ()
+    knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...] = ()
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class AnnotationLayer(dx.Model):
+    """A layers ``annotationLayer`` (a typed layer of annotations)."""
+
+    expression: str
+    kind: str
+    created_at: str
+    kind_uri: str | None = None
+    subkind_uri: str | None = None
+    subkind: str | None = None
+    formalism_uri: str | None = None
+    formalism: str | None = None
+    source_method_uri: str | None = None
+    source_method: str | None = None
+    label_set: str | None = None
+    ontology_ref: str | None = None
+    tokenization_id: dx.Embed[LayersUuid] | None = None
+    rank: int | None = None
+    alternatives_ref: str | None = None
+    parent_layer_ref: str | None = None
+    annotations: tuple[dx.Embed[Annotation], ...] = ()
+    metadata: dx.Embed[AnnotationMetadata] | None = None
+    languages: tuple[str, ...] = ()
+
+
+class GraphNode(dx.Model):
+    """A layers ``graphNode`` (a standalone property-graph node)."""
+
+    node_type: str
+    created_at: str
+    node_type_uri: str | None = None
+    label: str | None = None
+    properties: dx.Embed[FeatureMap] | None = None
+    knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...] = ()
+    metadata: dx.Embed[AnnotationMetadata] | None = None
+
+
+class GraphEdge(dx.Model):
+    """A layers ``graphEdge`` (a single typed, directed edge record)."""
+
+    source: dx.Embed[ObjectRef]
+    target: dx.Embed[ObjectRef]
+    edge_type: str
+    created_at: str
+    edge_type_uri: str | None = None
+    label: str | None = None
+    ordinal: int | None = None
+    confidence: int | None = None
+    properties: dx.Embed[FeatureMap] | None = None
+    metadata: dx.Embed[AnnotationMetadata] | None = None
+
+
+class GraphEdgeEntry(dx.Model):
+    """A layers ``graphEdgeEntry`` (one edge within a graphEdgeSet)."""
+
+    uuid: dx.Embed[LayersUuid]
+    edge_type: str
+    source: dx.Embed[ObjectRef]
+    target: dx.Embed[ObjectRef]
+    edge_type_uri: str | None = None
+    confidence: int | None = None
+    features: dx.Embed[FeatureMap] | None = None
+
+
+class GraphEdgeSet(dx.Model):
+    """A layers ``graphEdgeSet`` (a batch of typed, directed edges)."""
+
+    created_at: str
+    edges: tuple[dx.Embed[GraphEdgeEntry], ...] = ()
+    expression: str | None = None
+    edge_type_uri: str | None = None
+    edge_type: str | None = None
+    metadata: dx.Embed[AnnotationMetadata] | None = None
+    knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...]
= () + features: dx.Embed[FeatureMap] | None = None + + +class AudioInfo(dx.Model): + """A layers ``audioInfo`` media descriptor.""" + + sample_rate: int | None = None + channels: int | None = None + bit_depth: int | None = None + codec: str | None = None + bit_rate: int | None = None + bit_rate_mode: str | None = None + number_of_samples: int | None = None + speaker_count: int | None = None + transcript_ref: str | None = None + segmentation_ref: str | None = None + + +class VideoInfo(dx.Model): + """A layers ``videoInfo`` media descriptor.""" + + width: int | None = None + height: int | None = None + frame_rate: int | None = None + codec: str | None = None + aspect_ratio: str | None = None + color_space: str | None = None + bit_rate: int | None = None + scan_type: str | None = None + + +class DocumentInfo(dx.Model): + """A layers ``documentInfo`` media descriptor.""" + + dpi: int | None = None + color_mode: str | None = None + page_count: int | None = None + script_system: str | None = None + writing_direction: str | None = None + ocr_engine: str | None = None + + +class RoleSlot(dx.Model): + """A layers ``roleSlot`` (a role in a type definition).""" + + role_name: str + role_description: str | None = None + filler_type_refs: tuple[str, ...] = () + collection_ref: str | None = None + required: bool | None = None + default_value: str | None = None + constraints: tuple[dx.Embed[LayersConstraint], ...] = () + knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...] = () + features: dx.Embed[FeatureMap] | None = None + + +class TypeDef(dx.Model): + """A layers ``typeDef`` (an ontology type definition).""" + + ontology_ref: str + name: str + created_at: str + type_kind: str | None = None + type_kind_uri: str | None = None + gloss: str | None = None + parent_type_ref: str | None = None + allowed_roles: tuple[dx.Embed[RoleSlot], ...] = () + allowed_values: tuple[str, ...] = () + knowledge_refs: tuple[dx.Embed[KnowledgeRef], ...] 
= () + features: dx.Embed[FeatureMap] | None = None diff --git a/bead/interop/layers/parse_lens.py b/bead/interop/layers/parse_lens.py new file mode 100644 index 0000000..e2c3e83 --- /dev/null +++ b/bead/interop/layers/parse_lens.py @@ -0,0 +1,161 @@ +"""Lossless iso between a dependency parse and layers annotation records. + +A :class:`~bead.tokenization.parsers.ParsedSentence` maps to a layers +``tokenization`` plus two annotation layers (a part-of-speech ``token-tag`` +layer and a ``dependency`` ``relation`` layer). ``ParsedToken``/``ParsedSentence`` +carry no framework identity, so the mapping is a true bijection (``dx.Iso``): +the layers view captures everything and reconstructs the parse exactly. +""" + +from __future__ import annotations + +import didactic.api as dx + +from bead.data.base import JsonValue +from bead.interop.layers._convert import ( + from_feature_map, + j_bool, + j_int, + j_list, + j_obj, + j_str, + j_str_or_none, + strip_nulls, + to_feature_map, +) +from bead.tokenization.parsers import ( + UNIVERSAL_DEPENDENCIES, + ParsedSentence, + ParsedToken, +) + +_ROOT_HEAD = -1 + + +def _opt_str(value: JsonValue) -> str | None: + if value is None or isinstance(value, str): + return value + return str(value) + + +class ParsedSentenceLayersIso(dx.Iso[ParsedSentence, JsonValue]): + """Lossless ``ParsedSentence <-> layers tokenization + annotation layers``.""" + + def forward(self, sentence: ParsedSentence) -> JsonValue: + """Project a parsed sentence to layers tokenization + annotations.""" + text = sentence.original_text + + def _byte(char_index: int) -> int: + return len(text[:char_index].encode("utf-8")) + + token_views: tuple[JsonValue, ...] 
= tuple( + { + "tokenIndex": token.index, + "text": token.text, + "textSpan": { + "byteStart": _byte(token.start_char), + "byteEnd": _byte(token.end_char), + "charStart": token.start_char, + "charEnd": token.end_char, + }, + "spaceAfter": token.space_after, + } + for token in sentence.tokens + ) + pos_annotations: tuple[JsonValue, ...] = tuple( + { + "uuid": {"value": f"pos-{token.index}"}, + "tokenIndex": token.index, + "label": token.upos, + "features": to_feature_map( + { + "xpos": token.xpos, + "lemma": token.lemma, + "morph": dict(token.morph), + } + ), + } + for token in sentence.tokens + ) + dependency_annotations: tuple[JsonValue, ...] = tuple( + { + "uuid": {"value": f"dep-{token.index}"}, + "tokenIndex": token.index, + "label": token.deprel, + "headIndex": token.head if token.head is not None else _ROOT_HEAD, + } + for token in sentence.tokens + ) + return strip_nulls( + { + "originalText": sentence.original_text, + "tokenization": { + "uuid": {"value": "tokenization"}, + "kind": "parser", + "tokens": token_views, + }, + "posLayer": { + "kind": "token-tag", + "subkind": "pos", + "formalism": UNIVERSAL_DEPENDENCIES, + "annotations": pos_annotations, + }, + "dependencyLayer": { + "kind": "relation", + "subkind": "dependency", + "formalism": UNIVERSAL_DEPENDENCIES, + "annotations": dependency_annotations, + }, + } + ) + + def backward(self, view: JsonValue) -> ParsedSentence: + """Reconstruct a parsed sentence from its layers projection.""" + view_obj = j_obj(view) + tokenization = j_obj(view_obj["tokenization"]) + token_views = j_list(tokenization["tokens"]) + pos_annotations = j_list(j_obj(view_obj["posLayer"])["annotations"]) + dep_annotations = j_list(j_obj(view_obj["dependencyLayer"])["annotations"]) + + tokens: list[ParsedToken] = [] + for token_value, pos_value, dep_value in zip( + token_views, pos_annotations, dep_annotations, strict=True + ): + token_obj = j_obj(token_value) + pos_obj = j_obj(pos_value) + dep_obj = j_obj(dep_value) + span = 
j_obj(token_obj["textSpan"]) + features = from_feature_map(pos_obj["features"]) + raw_morph = features.get("morph") + morph = ( + {key: str(value) for key, value in raw_morph.items()} + if isinstance(raw_morph, dict) + else {} + ) + head_index = j_int(dep_obj["headIndex"]) + tokens.append( + ParsedToken( + index=j_int(token_obj["tokenIndex"]), + text=j_str(token_obj["text"]), + lemma=_opt_str(features.get("lemma")), + upos=j_str_or_none(pos_obj.get("label")), + xpos=_opt_str(features.get("xpos")), + deprel=j_str_or_none(dep_obj.get("label")), + head=None if head_index == _ROOT_HEAD else head_index, + morph=morph, + space_after=j_bool(token_obj["spaceAfter"]), + start_char=j_int(span["charStart"]), + end_char=j_int(span["charEnd"]), + ) + ) + return ParsedSentence( + original_text=j_str(view_obj["originalText"]), tokens=tuple(tokens) + ) + + +PARSED_SENTENCE_LAYERS = ParsedSentenceLayersIso() + + +def parse_to_layers(sentence: ParsedSentence) -> JsonValue: + """Return the layers tokenization + annotation-layer view of a parse.""" + return PARSED_SENTENCE_LAYERS.forward(sentence) diff --git a/bead/interop/layers/resource_lens.py b/bead/interop/layers/resource_lens.py new file mode 100644 index 0000000..aa2a34c --- /dev/null +++ b/bead/interop/layers/resource_lens.py @@ -0,0 +1,259 @@ +"""Lenses between bead resource models and layers resource records. + +Maps bead's lexical and template resources to their ``layers`` counterparts: + +- ``LexicalItem`` <-> a layers ``entry`` +- ``Lexicon`` <-> a layers ``collection`` with its ``entry`` records +- ``Template`` <-> a layers ``template`` (with its slots and constraints) + +Each lens produces a layers-shaped view and keeps the fields that have no layers +equivalent (the bead framework identity, the single language code, tags, the +``LexicalItem`` ``form``/``source`` fields, and DSL constraint context) in the +lens complement, so reconstruction is exact. 
+""" + +from __future__ import annotations + +import didactic.api as dx + +from bead.data.base import JsonValue +from bead.interop.layers._convert import ( + apply_identity, + from_feature_map, + identity_of, + j_bool, + j_list, + j_obj, + j_str, + j_str_or_none, + to_feature_map, +) +from bead.resources.constraints import Constraint +from bead.resources.lexical_item import LexicalItem +from bead.resources.lexicon import Lexicon +from bead.resources.template import Slot, Template + +_LEXICON_KIND = "lexicon" + + +class LexicalItemEntryLens(dx.Lens[LexicalItem, JsonValue, JsonValue]): + """Lossless lens ``LexicalItem <-> (layers entry view, complement)``.""" + + def forward(self, item: LexicalItem) -> tuple[JsonValue, JsonValue]: + """Project a lexical item to a layers entry view and complement.""" + view: dict[str, JsonValue] = { + "form": item.form if item.form is not None else item.lemma, + "lemma": item.lemma, + "features": to_feature_map(item.features), + } + if item.language_code is not None: + view["languages"] = (item.language_code,) + complement: JsonValue = { + "identity": identity_of(item), + "form": item.form, + "language_code": item.language_code, + "source": item.source, + } + return view, complement + + def backward(self, view: JsonValue, complement: JsonValue) -> LexicalItem: + """Reconstruct a lexical item from its layers entry view and complement.""" + view_obj = j_obj(view) + comp = j_obj(complement) + item = LexicalItem( + lemma=j_str(view_obj["lemma"]), + language_code=j_str_or_none(comp["language_code"]), + form=j_str_or_none(comp["form"]), + features=from_feature_map(view_obj["features"]), + source=j_str_or_none(comp["source"]), + ) + return apply_identity(item, comp["identity"]) + + +LEXICAL_ITEM_ENTRY = LexicalItemEntryLens() + + +class LexiconCollectionLens(dx.Lens[Lexicon, JsonValue, JsonValue]): + """Lossless lens ``Lexicon <-> (layers collection + entries, complement)``.""" + + def forward(self, lexicon: Lexicon) -> tuple[JsonValue, 
JsonValue]: + """Project a lexicon to a layers collection + entry views.""" + collection: dict[str, JsonValue] = {"name": lexicon.name, "kind": _LEXICON_KIND} + if lexicon.description is not None: + collection["description"] = lexicon.description + if lexicon.language_code is not None: + collection["languages"] = (lexicon.language_code,) + entries: list[JsonValue] = [] + item_complements: list[JsonValue] = [] + for item in lexicon.items: + entry_view, entry_complement = LEXICAL_ITEM_ENTRY.forward(item) + entries.append(entry_view) + item_complements.append(entry_complement) + view: JsonValue = {"collection": collection, "entries": tuple(entries)} + complement: JsonValue = { + "identity": identity_of(lexicon), + "description": lexicon.description, + "language_code": lexicon.language_code, + "tags": lexicon.tags, + "item_complements": tuple(item_complements), + } + return view, complement + + def backward(self, view: JsonValue, complement: JsonValue) -> Lexicon: + """Reconstruct a lexicon from its layers collection + complement.""" + view_obj = j_obj(view) + comp = j_obj(complement) + collection = j_obj(view_obj["collection"]) + entries = j_list(view_obj["entries"]) + item_complements = j_list(comp["item_complements"]) + items = tuple( + LEXICAL_ITEM_ENTRY.backward(entry, item_complement) + for entry, item_complement in zip(entries, item_complements, strict=True) + ) + lexicon = Lexicon( + name=j_str(collection["name"]), + description=j_str_or_none(comp["description"]), + language_code=j_str_or_none(comp["language_code"]), + items=items, + tags=tuple(j_str(tag) for tag in j_list(comp["tags"])), + ) + return apply_identity(lexicon, comp["identity"]) + + +LEXICON_COLLECTION = LexiconCollectionLens() + + +def _constraint_forward(constraint: Constraint) -> tuple[JsonValue, JsonValue]: + view: dict[str, JsonValue] = {"expression": constraint.expression} + if constraint.description is not None: + view["description"] = constraint.description + complement: JsonValue = { + 
"identity": identity_of(constraint), + "context": to_feature_map(constraint.context), + } + return view, complement + + +def _constraint_backward(view: JsonValue, complement: JsonValue) -> Constraint: + view_obj = j_obj(view) + comp = j_obj(complement) + constraint = Constraint( + expression=j_str(view_obj["expression"]), + context=from_feature_map(comp["context"]), + description=j_str_or_none(view_obj.get("description")), + ) + return apply_identity(constraint, comp["identity"]) + + +def _slot_forward(slot: Slot) -> tuple[JsonValue, JsonValue]: + view: dict[str, JsonValue] = {"name": slot.name, "required": slot.required} + if slot.description is not None: + view["description"] = slot.description + if slot.default_value is not None: + view["defaultValue"] = slot.default_value + constraint_views: list[JsonValue] = [] + constraint_complements: list[JsonValue] = [] + for constraint in slot.constraints: + constraint_view, constraint_complement = _constraint_forward(constraint) + constraint_views.append(constraint_view) + constraint_complements.append(constraint_complement) + view["constraints"] = tuple(constraint_views) + complement: JsonValue = { + "identity": identity_of(slot), + "constraint_complements": tuple(constraint_complements), + } + return view, complement + + +def _slot_backward(view: JsonValue, complement: JsonValue) -> Slot: + view_obj = j_obj(view) + comp = j_obj(complement) + constraint_views = j_list(view_obj["constraints"]) + constraint_complements = j_list(comp["constraint_complements"]) + constraints = tuple( + _constraint_backward(constraint_view, constraint_complement) + for constraint_view, constraint_complement in zip( + constraint_views, constraint_complements, strict=True + ) + ) + slot = Slot( + name=j_str(view_obj["name"]), + description=j_str_or_none(view_obj.get("description")), + constraints=constraints, + required=j_bool(view_obj["required"]), + default_value=j_str_or_none(view_obj.get("defaultValue")), + ) + return apply_identity(slot, 
comp["identity"]) + + +class TemplateLayersLens(dx.Lens[Template, JsonValue, JsonValue]): + """Lossless lens ``Template <-> (layers template view, complement)``.""" + + def forward(self, template: Template) -> tuple[JsonValue, JsonValue]: + """Project a template to a layers template view and complement.""" + slot_views: dict[str, JsonValue] = {} + slot_complements: dict[str, JsonValue] = {} + for slot_key, slot in template.slots.items(): + slot_view, slot_complement = _slot_forward(slot) + slot_views[slot_key] = slot_view + slot_complements[slot_key] = slot_complement + constraint_views: list[JsonValue] = [] + constraint_complements: list[JsonValue] = [] + for constraint in template.constraints: + constraint_view, constraint_complement = _constraint_forward(constraint) + constraint_views.append(constraint_view) + constraint_complements.append(constraint_complement) + view: dict[str, JsonValue] = { + "name": template.name, + "text": template.template_string, + "slots": slot_views, + "constraints": tuple(constraint_views), + } + if template.language_code is not None: + view["languages"] = (template.language_code,) + complement: JsonValue = { + "identity": identity_of(template), + "description": template.description, + "language_code": template.language_code, + "tags": template.tags, + "metadata": to_feature_map(template.metadata), + "slot_order": tuple(template.slots), + "slot_complements": slot_complements, + "constraint_complements": tuple(constraint_complements), + } + return view, complement + + def backward(self, view: JsonValue, complement: JsonValue) -> Template: + """Reconstruct a template from its layers template view and complement.""" + view_obj = j_obj(view) + comp = j_obj(complement) + slot_views = j_obj(view_obj["slots"]) + slot_complements = j_obj(comp["slot_complements"]) + slots: dict[str, Slot] = {} + for slot_key_value in j_list(comp["slot_order"]): + slot_key = j_str(slot_key_value) + slots[slot_key] = _slot_backward( + slot_views[slot_key], 
slot_complements[slot_key] + ) + constraint_views = j_list(view_obj["constraints"]) + constraint_complements = j_list(comp["constraint_complements"]) + constraints = tuple( + _constraint_backward(constraint_view, constraint_complement) + for constraint_view, constraint_complement in zip( + constraint_views, constraint_complements, strict=True + ) + ) + template = Template( + name=j_str(view_obj["name"]), + template_string=j_str(view_obj["text"]), + slots=slots, + constraints=constraints, + description=j_str_or_none(comp["description"]), + language_code=j_str_or_none(comp["language_code"]), + tags=tuple(j_str(tag) for tag in j_list(comp["tags"])), + metadata=from_feature_map(comp["metadata"]), + ) + return apply_identity(template, comp["identity"]) + + +TEMPLATE_LAYERS = TemplateLayersLens() diff --git a/bead/items/adapters/__init__.py b/bead/items/adapters/__init__.py index 490909b..1fc56ca 100644 --- a/bead/items/adapters/__init__.py +++ b/bead/items/adapters/__init__.py @@ -10,7 +10,7 @@ rate_limit, retry_with_backoff, ) -from bead.items.adapters.base import ModelAdapter +from bead.items.adapters.base import ModelAdapter, TextGenerator from bead.items.adapters.huggingface import ( HuggingFaceLanguageModel, HuggingFaceMaskedLanguageModel, @@ -50,6 +50,7 @@ __all__ = [ # Base "ModelAdapter", + "TextGenerator", # HuggingFace adapters "HuggingFaceLanguageModel", "HuggingFaceMaskedLanguageModel", diff --git a/bead/items/adapters/anthropic.py b/bead/items/adapters/anthropic.py index 7a98188..5e06a99 100644 --- a/bead/items/adapters/anthropic.py +++ b/bead/items/adapters/anthropic.py @@ -222,3 +222,31 @@ def compute_nli(self, premise: str, hypothesis: str) -> dict[str, float]: ) return scores + + def generate_completion( + self, prompt: str, *, max_tokens: int = 256, temperature: float = 1.0 + ) -> str: + """Generate a text completion for *prompt* via the messages API. + + Parameters + ---------- + prompt : str + The prompt to complete. 
+ max_tokens : int + Maximum number of tokens to generate. + temperature : float + Sampling temperature. + + Returns + ------- + str + The concatenated text of the response (empty if none). + """ + response = self.client.messages.create( + model=self.model_name, + max_tokens=max_tokens, + temperature=temperature, + messages=[{"role": "user", "content": prompt}], + ) + parts = [block.text for block in response.content if block.type == "text"] + return "".join(parts).strip() diff --git a/bead/items/adapters/base.py b/bead/items/adapters/base.py index bd1a4a3..4f2df31 100644 --- a/bead/items/adapters/base.py +++ b/bead/items/adapters/base.py @@ -11,12 +11,32 @@ from __future__ import annotations from abc import ABC, abstractmethod +from typing import Protocol, runtime_checkable import numpy as np from bead.items.cache import ModelOutputCache +@runtime_checkable +class TextGenerator(Protocol): + """A model that generates text from a prompt. + + Implemented by API adapters that can produce completions (e.g. OpenAI, + Anthropic). Used by ``CompletionCorpusSource`` to treat a language model as + a corpus source. Kept separate from ``ModelAdapter`` because most adapters + only score text, not generate it. + """ + + model_name: str + + def generate_completion( + self, prompt: str, *, max_tokens: int = 256, temperature: float = 1.0 + ) -> str: + """Generate a text completion for *prompt*.""" + ... + + class ModelAdapter(ABC): """Base class for model adapters used in item construction. @@ -213,4 +233,4 @@ def get_nli_label(self, premise: str, hypothesis: str) -> str: If NLI is not supported by the model type. 
""" scores = self.compute_nli(premise, hypothesis) - return max(scores, key=scores.get) # type: ignore[arg-type, return-value] + return max(scores, key=lambda label: scores[label]) diff --git a/bead/items/adapters/openai.py b/bead/items/adapters/openai.py index 3d20e87..70ef943 100644 --- a/bead/items/adapters/openai.py +++ b/bead/items/adapters/openai.py @@ -321,3 +321,31 @@ def compute_nli(self, premise: str, hypothesis: str) -> dict[str, float]: ) return scores + + def generate_completion( + self, prompt: str, *, max_tokens: int = 256, temperature: float = 1.0 + ) -> str: + """Generate a text completion for *prompt* via the chat API. + + Parameters + ---------- + prompt : str + The prompt to complete. + max_tokens : int + Maximum number of tokens to generate. + temperature : float + Sampling temperature. + + Returns + ------- + str + The generated text (empty if the API returns no content). + """ + response = self.client.chat.completions.create( + model=self.model_name, + messages=[{"role": "user", "content": prompt}], + temperature=temperature, + max_tokens=max_tokens, + ) + content = response.choices[0].message.content + return content if content is not None else "" diff --git a/bead/tokenization/__init__.py b/bead/tokenization/__init__.py index da26859..3fcc7b5 100644 --- a/bead/tokenization/__init__.py +++ b/bead/tokenization/__init__.py @@ -11,6 +11,16 @@ from __future__ import annotations from bead.tokenization.config import TokenizerBackend, TokenizerConfig +from bead.tokenization.parsers import ( + UNIVERSAL_DEPENDENCIES, + DependencyParser, + ParsedSentence, + ParsedToken, + SpacyParser, + StanzaParser, + create_parser, + parse_to_spans, +) from bead.tokenization.tokenizers import ( DisplayToken, SpacyTokenizer, @@ -21,12 +31,20 @@ ) __all__ = [ + "UNIVERSAL_DEPENDENCIES", + "DependencyParser", "DisplayToken", + "ParsedSentence", + "ParsedToken", + "SpacyParser", "SpacyTokenizer", + "StanzaParser", "StanzaTokenizer", "TokenizedText", "TokenizerBackend", 
"TokenizerConfig", "WhitespaceTokenizer", + "create_parser", "create_tokenizer", + "parse_to_spans", ] diff --git a/bead/tokenization/parsers.py b/bead/tokenization/parsers.py new file mode 100644 index 0000000..edfaf8f --- /dev/null +++ b/bead/tokenization/parsers.py @@ -0,0 +1,502 @@ +"""Dependency parsing into standoff spans. + +Provides dependency parsers (spaCy, Stanza) that produce a per-sentence +``ParsedSentence`` of ``ParsedToken`` records (token, lemma, upos, xpos, +morphological features, head, deprel), and ``parse_to_spans`` which projects a +parse onto bead's standoff ``Span`` + ``SpanRelation`` models. + +The projection is deliberately aligned with the ``layers`` linguistic +annotation model so a parse stored on an ``Item`` carries every field a layers +dependency ``AnnotationLayer``/``Annotation`` needs: each token becomes a +single-token ``Span`` whose ``head_index`` is its governor and whose +``span_metadata`` carries ``upos``/``xpos``/``lemma``/``deprel``/``formalism``/ +``tool`` plus morphological features and character offsets; each syntactic arc +becomes a directed ``SpanRelation`` from head to dependent labeled with the +dependency relation. The conventions below (Universal Dependencies labels, +``head -> dependent`` arc direction, retained character offsets) keep that +mapping lossless without coupling bead to layers' wire format. +""" + +from __future__ import annotations + +import importlib +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +import didactic.api as dx + +from bead.items.spans import ( + MetadataValue, + Span, + SpanLabel, + SpanRelation, + SpanSegment, +) +from bead.tokenization.config import TokenizerConfig +from bead.tokenization.tokenizers import spacy_space_after + +if TYPE_CHECKING: + from spacy.language import Language + +# layers-aligned conventions, recorded once so both projects stay matched. 
+UNIVERSAL_DEPENDENCIES = "universal-dependencies" +ROOT_DEPREL = "root" + + +@runtime_checkable +class DependencyParser(Protocol): + """A callable that dependency-parses text into sentences. + + Carries a ``tool`` identifier recorded in the layers-aligned provenance of + any spans projected from its output. + """ + + tool: str + + def __call__(self, text: str) -> tuple[ParsedSentence, ...]: + """Dependency-parse text into sentences.""" + ... + + +class ParsedToken(dx.Model): + """A dependency-parsed token. + + A superset of ``DisplayToken``: it adds the syntactic and morphological + fields produced by a dependency parser. Indices are sentence-local and + 0-based; ``head`` is the 0-based index of the governor token, or ``None`` + for the sentence root. + + Attributes + ---------- + index : int + Sentence-local 0-based token index. + text : str + Surface form of the token. + lemma : str | None + Lemma of the token. + upos : str | None + Universal part-of-speech tag. + xpos : str | None + Language-specific (treebank) part-of-speech tag. + deprel : str | None + Dependency relation of the token to its head. + head : int | None + Sentence-local 0-based index of the governor token; ``None`` for the + root. + morph : dict[str, str] + Morphological features (e.g. ``{"Number": "Sing"}``). + space_after : bool + Whether whitespace follows this token in the source text. + start_char : int + Character offset of the token start in the sentence text. + end_char : int + Character offset of the token end in the sentence text. + """ + + index: int + text: str + lemma: str | None = None + upos: str | None = None + xpos: str | None = None + deprel: str | None = None + head: int | None = None + morph: dict[str, str] = dx.field(default_factory=dict) + space_after: bool = True + start_char: int = 0 + end_char: int = 0 + + +class ParsedSentence(dx.Model): + """A single dependency-parsed sentence. + + Attributes + ---------- + original_text : str + The sentence text. 
+ tokens : tuple[ParsedToken, ...] + The parsed tokens, in order. + """ + + original_text: str + tokens: tuple[dx.Embed[ParsedToken], ...] = () + + +def _parse_feats(feats: str | None) -> dict[str, str]: + """Parse a CoNLL-U ``feats`` string into a feature dict. + + Parameters + ---------- + feats : str | None + Pipe-separated ``Key=Value`` morphological features, or ``None``. + + Returns + ------- + dict[str, str] + Parsed features (empty when ``feats`` is ``None`` or ``"_"``). + """ + if not feats or feats == "_": + return {} + result: dict[str, str] = {} + for pair in feats.split("|"): + if "=" in pair: + key, value = pair.split("=", 1) + result[key] = value + return result + + +class SpacyParser: + """spaCy-based dependency parser. + + Loads a spaCy pipeline with tagger, parser, lemmatizer, and morphologizer + components and yields one ``ParsedSentence`` per sentence. + + Parameters + ---------- + language : str + ISO 639 language code. + model_name : str | None + Explicit spaCy model name. When ``None``, uses + ``{language}_core_web_sm``. + """ + + tool = "spacy" + + def __init__(self, language: str = "en", model_name: str | None = None) -> None: + self._language = language + self._model_name = model_name + self._nlp: Language | None = None + + def _load(self) -> Language: + if self._nlp is not None: + return self._nlp + + try: + spacy = importlib.import_module("spacy") + except ImportError as e: + raise ImportError( + "spaCy is required for SpacyParser. " + "Install it with: pip install 'bead[tokenization]'" + ) from e + + model = self._model_name or f"{self._language}_core_web_sm" + try: + nlp: Language = spacy.load(model) + except OSError as e: + raise ImportError( + f"spaCy model {model!r} is required for dependency parsing. " + f"Install it with: python -m spacy download {model}" + ) from e + + self._nlp = nlp + return nlp + + def __call__(self, text: str) -> tuple[ParsedSentence, ...]: + """Parse text into dependency-parsed sentences. 
+ + Parameters + ---------- + text : str + Input text (may contain multiple sentences). + + Returns + ------- + tuple[ParsedSentence, ...] + One ``ParsedSentence`` per detected sentence. + """ + nlp = self._load() + doc = nlp(text) + sentences: list[ParsedSentence] = [] + for sent in doc.sents: + offset = sent.start + base_char = sent.start_char + tokens: list[ParsedToken] = [] + for token in sent: + local_index = token.i - offset + head_local = token.head.i - offset + head = None if token.head.i == token.i else head_local + tokens.append( + ParsedToken( + index=local_index, + text=token.text, + lemma=token.lemma_ or None, + upos=token.pos_ or None, + xpos=token.tag_ or None, + deprel=token.dep_.lower() or None, + head=head, + morph=_parse_feats(str(token.morph) or None), + space_after=spacy_space_after(token), + start_char=token.idx - base_char, + end_char=token.idx + len(token.text) - base_char, + ) + ) + sentences.append( + ParsedSentence(original_text=sent.text, tokens=tuple(tokens)) + ) + return tuple(sentences) + + +class StanzaParser: + """Stanza-based dependency parser. + + Loads a Stanza pipeline with ``tokenize,pos,lemma,depparse`` processors and + yields one ``ParsedSentence`` per sentence. + + Parameters + ---------- + language : str + ISO 639 language code. + model_name : str | None + Explicit Stanza package name. When ``None``, uses the default package. + """ + + tool = "stanza" + + def __init__(self, language: str = "en", model_name: str | None = None) -> None: + self._language = language + self._model_name = model_name + self._nlp: _StanzaPipelineProtocol | None = None + + def _load(self) -> _StanzaPipelineProtocol: + if self._nlp is not None: + return self._nlp + + try: + stanza = importlib.import_module("stanza") + except ImportError as e: + raise ImportError( + "Stanza is required for StanzaParser. 
" + "Install it with: pip install 'bead[tokenization]'" + ) from e + + pkg = self._model_name + pkg_kwarg = {"package": pkg} if pkg is not None else {} + processors = "tokenize,pos,lemma,depparse" + + try: + nlp: _StanzaPipelineProtocol = stanza.Pipeline( + lang=self._language, + processors=processors, + verbose=False, + **pkg_kwarg, + ) + except Exception: + stanza.download(self._language, verbose=False) + nlp = stanza.Pipeline( + lang=self._language, + processors=processors, + verbose=False, + **pkg_kwarg, + ) + + self._nlp = nlp + return nlp + + def __call__(self, text: str) -> tuple[ParsedSentence, ...]: + """Parse text into dependency-parsed sentences. + + Parameters + ---------- + text : str + Input text (may contain multiple sentences). + + Returns + ------- + tuple[ParsedSentence, ...] + One ``ParsedSentence`` per detected sentence. + """ + nlp = self._load() + doc = nlp(text) + sentences: list[ParsedSentence] = [] + for sentence in doc.sentences: + base_char = sentence.words[0].start_char if sentence.words else 0 + tokens: list[ParsedToken] = [] + for word in sentence.words: + # Stanza ids are 1-based within the sentence; head 0 is root. 
+ head = None if word.head == 0 else word.head - 1 + deprel = word.deprel.lower() if word.deprel else None + tokens.append( + ParsedToken( + index=word.id - 1, + text=word.text, + lemma=word.lemma or None, + upos=word.upos or None, + xpos=word.xpos or None, + deprel=deprel, + head=head, + morph=_parse_feats(word.feats), + space_after=_stanza_word_space_after(word, text), + start_char=word.start_char - base_char, + end_char=word.end_char - base_char, + ) + ) + sentences.append( + ParsedSentence(original_text=sentence.text, tokens=tuple(tokens)) + ) + return tuple(sentences) + + +def _stanza_word_space_after(word: _StanzaWordProtocol, text: str) -> bool: + """Whether whitespace follows a Stanza word in the source text.""" + if word.misc: + return "SpaceAfter=No" not in word.misc + if word.end_char < len(text): + return text[word.end_char] == " " + return True + + +def create_parser(config: TokenizerConfig) -> DependencyParser: + """Return a dependency-parsing function for the given config. + + Parameters + ---------- + config : TokenizerConfig + Tokenizer configuration. The ``backend`` selects the parser; the + ``whitespace`` backend cannot parse and raises. + + Returns + ------- + DependencyParser + A callable that dependency-parses text into sentences. + + Raises + ------ + ValueError + If the backend cannot produce a dependency parse. + """ + if config.backend == "spacy": + return SpacyParser(language=config.language, model_name=config.model_name) + if config.backend == "stanza": + return StanzaParser(language=config.language, model_name=config.model_name) + raise ValueError( + f"Backend {config.backend!r} cannot produce a dependency parse; " + "use 'spacy' or 'stanza'." + ) + + +def parse_to_spans( + sentence: ParsedSentence, + *, + element_name: str = "text", + tokenization_id: str, + formalism: str = UNIVERSAL_DEPENDENCIES, + tool: str, +) -> tuple[tuple[Span, ...], tuple[SpanRelation, ...]]: + """Project a parsed sentence onto standoff spans and relations. 
+ + Each token becomes a single-token ``Span`` (``span_type == "token"``) whose + ``head_index`` is the governor index and whose ``span_metadata`` carries the + layers-aligned fields. Each non-root token contributes one directed + ``SpanRelation`` from its head (``source``) to itself (``target``), labeled + with the dependency relation. This function is the single canonical owner of + the ``span_id`` scheme and the ``head -> dependent`` arc direction. + + Parameters + ---------- + sentence : ParsedSentence + The parsed sentence to project. + element_name : str + Rendered-element name the token indices refer to. + tokenization_id : str + Stable identifier of the tokenization these tokens belong to (mirrors + layers' ``TokenRef.tokenization_id``). Recorded in each span's metadata. + formalism : str + Dependency formalism slug (default ``"universal-dependencies"``). + tool : str + Identifier of the parser that produced the analysis. + + Returns + ------- + tuple[tuple[Span, ...], tuple[SpanRelation, ...]] + The token spans and the dependency-arc relations. 
+ """ + spans: list[Span] = [] + relations: list[SpanRelation] = [] + + for token in sentence.tokens: + span_metadata: dict[str, MetadataValue] = { + "tokenization_id": tokenization_id, + "formalism": formalism, + "tool": tool, + "start_char": token.start_char, + "end_char": token.end_char, + } + if token.upos is not None: + span_metadata["upos"] = token.upos + if token.xpos is not None: + span_metadata["xpos"] = token.xpos + if token.lemma is not None: + span_metadata["lemma"] = token.lemma + if token.deprel is not None: + span_metadata["deprel"] = token.deprel + if token.morph: + morph_value: dict[str, MetadataValue] = {} + for feature, value in token.morph.items(): + morph_value[feature] = value + span_metadata["morph"] = morph_value + + label = SpanLabel(label=token.upos) if token.upos is not None else None + spans.append( + Span( + span_id=f"{element_name}:tok:{token.index}", + segments=( + SpanSegment(element_name=element_name, indices=(token.index,)), + ), + head_index=token.head, + label=label, + span_type="token", + span_metadata=span_metadata, + ) + ) + + if token.head is not None: + relation_label = ( + SpanLabel(label=token.deprel) if token.deprel is not None else None + ) + relations.append( + SpanRelation( + relation_id=f"{element_name}:dep:{token.index}", + source_span_id=f"{element_name}:tok:{token.head}", + target_span_id=f"{element_name}:tok:{token.index}", + label=relation_label, + directed=True, + ) + ) + + return tuple(spans), tuple(relations) + + +# structural typing protocols for the untyped Stanza pipeline +class _StanzaWordProtocol(Protocol): + """Structural type for a parsed Stanza ``Word``.""" + + id: int + text: str + lemma: str | None + upos: str | None + xpos: str | None + feats: str | None + head: int + deprel: str | None + start_char: int + end_char: int + misc: str | None + + +class _StanzaSentenceProtocol(Protocol): + """Structural type for a parsed Stanza sentence.""" + + text: str + words: list[_StanzaWordProtocol] + + +class 
_StanzaDocProtocol(Protocol): + """Structural type for a parsed Stanza document.""" + + sentences: list[_StanzaSentenceProtocol] + + +class _StanzaPipelineProtocol(Protocol): + """Structural type for a Stanza ``Pipeline``.""" + + def __call__(self, text: str) -> _StanzaDocProtocol: + """Parse text into a Stanza document.""" + ... diff --git a/bead/tokenization/tokenizers.py b/bead/tokenization/tokenizers.py index 4d16130..c859b36 100644 --- a/bead/tokenization/tokenizers.py +++ b/bead/tokenization/tokenizers.py @@ -7,6 +7,7 @@ from __future__ import annotations +import importlib import re from collections.abc import Callable, Iterator from typing import Protocol @@ -79,6 +80,27 @@ def render(self) -> str: return "".join(parts).rstrip() +def spacy_space_after(token: _SpacyTokenProtocol) -> bool: + """Whether whitespace follows a spaCy token in the source text. + + Shared by ``SpacyTokenizer`` and ``SpacyParser`` (single canonical site). + """ + return token.whitespace_ != "" + + +def _stanza_space_after(token: _StanzaTokenProtocol, text: str) -> bool: + """Whether whitespace follows a Stanza token in the source text. + + Prefers the CoNLL-U ``SpaceAfter=No`` annotation when present, falling + back to inspecting the character immediately after the token. + """ + if getattr(token, "misc", None): + return "SpaceAfter=No" not in (token.misc or "") + if token.end_char < len(text): + return text[token.end_char] == " " + return True + + class WhitespaceTokenizer: """Simple whitespace-split tokenizer. @@ -142,7 +164,7 @@ def _load(self) -> Callable[..., _SpacyDocProtocol]: return self._nlp try: - import spacy # noqa: PLC0415 # type: ignore[reportMissingImports] + spacy = importlib.import_module("spacy") except ImportError as e: raise ImportError( "spaCy is required for SpacyTokenizer. 
" @@ -154,10 +176,10 @@ def _load(self) -> Callable[..., _SpacyDocProtocol]: model = f"{self._language}_core_web_sm" try: - nlp: Callable[..., _SpacyDocProtocol] = spacy.load(model) # type: ignore[assignment] + nlp: Callable[..., _SpacyDocProtocol] = spacy.load(model) except OSError: # fall back to blank model - nlp = spacy.blank(self._language) # type: ignore[assignment] + nlp = spacy.blank(self._language) self._nlp = nlp return nlp @@ -182,7 +204,7 @@ def __call__(self, text: str) -> TokenizedText: tokens.append( DisplayToken( text=token.text, - space_after=token.whitespace_ != "", + space_after=spacy_space_after(token), start_char=token.idx, end_char=token.idx + len(token.text), ) @@ -216,7 +238,7 @@ def _load(self) -> _StanzaPipelineProtocol: return self._nlp try: - import stanza # noqa: PLC0415 # type: ignore[reportMissingImports] + stanza = importlib.import_module("stanza") except ImportError as e: raise ImportError( "Stanza is required for StanzaTokenizer. " @@ -227,20 +249,20 @@ def _load(self) -> _StanzaPipelineProtocol: pkg_kwarg = {"package": pkg} if pkg is not None else {} try: - nlp: _StanzaPipelineProtocol = stanza.Pipeline( # type: ignore[assignment] + nlp: _StanzaPipelineProtocol = stanza.Pipeline( lang=self._language, processors="tokenize", verbose=False, - **pkg_kwarg, # type: ignore[reportArgumentType] + **pkg_kwarg, ) except Exception: # download model and retry stanza.download(self._language, verbose=False) - nlp = stanza.Pipeline( # type: ignore[assignment] + nlp = stanza.Pipeline( lang=self._language, processors="tokenize", verbose=False, - **pkg_kwarg, # type: ignore[reportArgumentType] + **pkg_kwarg, ) self._nlp = nlp @@ -264,24 +286,12 @@ def __call__(self, text: str) -> TokenizedText: tokens: list[DisplayToken] = [] for sentence in doc.sentences: for token in sentence.tokens: - start_char = token.start_char - end_char = token.end_char - # stanza tokens have a misc field; space_after can be - # inferred from character offsets or the 
SpaceAfter=No - # annotation in the misc field. - space_after = True - if hasattr(token, "misc") and token.misc: - if "SpaceAfter=No" in token.misc: - space_after = False - elif end_char < len(text): - space_after = text[end_char] == " " - tokens.append( DisplayToken( text=token.text, - space_after=space_after, - start_char=start_char, - end_char=end_char, + space_after=_stanza_space_after(token, text), + start_char=token.start_char, + end_char=token.end_char, ) ) return TokenizedText(tokens=tuple(tokens), original_text=text) @@ -317,15 +327,32 @@ def create_tokenizer(config: TokenizerConfig) -> Callable[[str], TokenizedText]: raise ValueError(f"Unknown tokenizer backend: {config.backend}") -# structural typing protocols for spaCy/Stanza (avoids hard imports) +# structural typing protocols for spaCy/Stanza (avoids hard imports). +# Attributes are read-only properties so a real spaCy ``Token`` (whose fields +# are properties) structurally satisfies the protocol. class _SpacyTokenProtocol(Protocol): - text: str - whitespace_: str - idx: int + @property + def text(self) -> str: + """Surface form of the token.""" + ... + + @property + def whitespace_(self) -> str: + """Trailing whitespace following the token.""" + ... + + @property + def idx(self) -> int: + """Character offset of the token start.""" + ... class _SpacyDocProtocol(Protocol): - def __iter__(self) -> Iterator[_SpacyTokenProtocol]: ... # noqa: D105 + """Structural type for a spaCy ``Doc``.""" + + def __iter__(self) -> Iterator[_SpacyTokenProtocol]: + """Iterate the tokens of the document.""" + ... class _StanzaTokenProtocol(Protocol): @@ -344,4 +371,8 @@ class _StanzaDocProtocol(Protocol): class _StanzaPipelineProtocol(Protocol): - def __call__(self, text: str) -> _StanzaDocProtocol: ... # noqa: D102 + """Structural type for a Stanza ``Pipeline``.""" + + def __call__(self, text: str) -> _StanzaDocProtocol: + """Parse text into a Stanza document.""" + ... 
diff --git a/bead/transforms/__init__.py b/bead/transforms/__init__.py index 4a9d06f..7fa1020 100644 --- a/bead/transforms/__init__.py +++ b/bead/transforms/__init__.py @@ -31,14 +31,19 @@ from bead.transforms.text import ( CapitalizeTransform, LowerTransform, + MarkdownStripTransform, + RedditCleanupTransform, TitleTransform, UpperTransform, + split_sentences, ) __all__ = [ "CapitalizeTransform", "LowerTransform", + "MarkdownStripTransform", "MorphologicalTransform", + "RedditCleanupTransform", "SpanTextTransform", "TitleTransform", "TransformContext", @@ -46,6 +51,7 @@ "TransformRegistry", "UpperTransform", "create_default_registry", + "split_sentences", ] @@ -54,8 +60,9 @@ def create_default_registry( ) -> TransformRegistry: """Create a registry pre-loaded with the built-in transforms. - Text transforms (``lower``, ``upper``, ``capitalize``, ``title``) are - always registered. If *language_code* is provided, morphological + Text transforms (``lower``, ``upper``, ``capitalize``, ``title``, + ``markdown_strip``, ``reddit_cleanup``) are always registered. If + *language_code* is provided, morphological transforms (``gerund``, ``past_tense``, ``present_3sg``, ``past_participle``, ``infinitive``) are also registered using the UniMorph backend. 
@@ -78,6 +85,8 @@ def create_default_registry( registry.register("upper", UpperTransform()) registry.register("capitalize", CapitalizeTransform()) registry.register("title", TitleTransform()) + registry.register("markdown_strip", MarkdownStripTransform()) + registry.register("reddit_cleanup", RedditCleanupTransform()) # morphological transforms — require a language if language_code is not None: diff --git a/bead/transforms/text.py b/bead/transforms/text.py index 5c5ca48..6e55c22 100644 --- a/bead/transforms/text.py +++ b/bead/transforms/text.py @@ -7,8 +7,28 @@ from __future__ import annotations +import html +import re +from typing import TYPE_CHECKING + +from bead.tokenization.parsers import create_parser from bead.transforms.base import TransformContext +if TYPE_CHECKING: + from bead.tokenization.config import TokenizerConfig + +# markdown / web text patterns (module-level so they compile once) +_MD_IMAGE = re.compile(r"!\[([^\]]*)\]\([^)]*\)") +_MD_LINK = re.compile(r"\[([^\]]*)\]\([^)]*\)") +_MD_EMPHASIS = re.compile(r"(\*\*|__|\*|_|~~)(.+?)\1") +_MD_INLINE_CODE = re.compile(r"`([^`]*)`") +_MD_HEADING = re.compile(r"^\s{0,3}#{1,6}\s*", re.MULTILINE) +_MD_BLOCKQUOTE = re.compile(r"^\s*>+\s?", re.MULTILINE) +_URL = re.compile(r"https?://\S+|www\.\S+") +_REDDIT_DELETED = re.compile(r"\[(?:deleted|removed)\]") +_WHITESPACE = re.compile(r"[^\S\n]+") +_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=\S)") + class LowerTransform: """Convert text to lowercase. @@ -64,3 +84,93 @@ class TitleTransform: def __call__(self, text: str, context: TransformContext) -> str: """Apply ``str.title`` to *text*.""" return text.title() + + +class MarkdownStripTransform: + """Strip common Markdown markup, keeping the human-readable text. + + Removes link/image targets (keeping the visible text), emphasis markers, + inline code backticks, heading markers, and blockquote markers. 
+ + Examples + -------- + >>> MarkdownStripTransform()("**bold** and [a link](http://x)", TransformContext()) + 'bold and a link' + """ + + def __call__(self, text: str, context: TransformContext) -> str: + """Strip Markdown markup from *text*.""" + text = _MD_IMAGE.sub(r"\1", text) + text = _MD_LINK.sub(r"\1", text) + text = _MD_INLINE_CODE.sub(r"\1", text) + # apply emphasis stripping repeatedly to handle nested markers + previous = None + while previous != text: + previous = text + text = _MD_EMPHASIS.sub(r"\2", text) + text = _MD_HEADING.sub("", text) + text = _MD_BLOCKQUOTE.sub("", text) + return text.strip() + + +class RedditCleanupTransform: + """Clean Reddit comment text into plain prose. + + Unescapes HTML entities, strips Markdown (reusing + :class:`MarkdownStripTransform`), removes URLs and ``[deleted]``/ + ``[removed]`` markers, and collapses runs of intra-line whitespace. + + Examples + -------- + >>> RedditCleanupTransform()("see [here](http://x) & more", TransformContext()) + 'see here & more' + """ + + def __init__(self) -> None: + self._markdown = MarkdownStripTransform() + + def __call__(self, text: str, context: TransformContext) -> str: + """Clean Reddit markup from *text*.""" + text = html.unescape(text) + text = self._markdown(text, context) + text = _URL.sub("", text) + text = _REDDIT_DELETED.sub("", text) + text = _WHITESPACE.sub(" ", text) + return text.strip() + + +def split_sentences( + text: str, + *, + tokenizer_config: TokenizerConfig | None = None, +) -> tuple[str, ...]: + """Split *text* into sentences. + + When *tokenizer_config* selects a ``spacy`` or ``stanza`` backend, sentence + boundaries come from that parser's segmenter. Otherwise a regular-expression + fallback splits on sentence-final punctuation followed by whitespace. + + Parameters + ---------- + text : str + Text to split. + tokenizer_config : TokenizerConfig | None + Backend selector. ``None`` or the ``whitespace`` backend uses the + regex fallback. 
+ + Returns + ------- + tuple[str, ...] + The sentences, with surrounding whitespace stripped (empties dropped). + """ + if tokenizer_config is not None and tokenizer_config.backend != "whitespace": + parser = create_parser(tokenizer_config) + return tuple( + sentence.original_text.strip() + for sentence in parser(text) + if sentence.original_text.strip() + ) + + return tuple( + part.strip() for part in _SENTENCE_BOUNDARY.split(text) if part.strip() + ) diff --git a/docs/api/corpus.md b/docs/api/corpus.md new file mode 100644 index 0000000..f7968ca --- /dev/null +++ b/docs/api/corpus.md @@ -0,0 +1,38 @@ +# bead.corpus + +Streaming corpus ingestion and structural sampling. Turns raw external text +(JSON Lines, optionally Zstandard-compressed; CSV/TSV; or language-model +completions) into structurally filtered experimental `Item`s: stream +`CorpusRecord`s from a `CorpusSource`, dependency-parse them, and keep only those +whose parse satisfies a structural DSL constraint. + +The whole pipeline is lazy, so a structural query can run over a multi-gigabyte +corpus without loading it into memory. 
+ +## Records + +::: bead.corpus.records + options: + show_root_heading: true + show_source: false + +## Source Protocol + +::: bead.corpus.base + options: + show_root_heading: true + show_source: false + +## Sources + +::: bead.corpus.sources + options: + show_root_heading: true + show_source: false + +## Pipeline + +::: bead.corpus.pipeline + options: + show_root_heading: true + show_source: false diff --git a/docs/api/dsl.md b/docs/api/dsl.md index 457efe1..9ebe903 100644 --- a/docs/api/dsl.md +++ b/docs/api/dsl.md @@ -18,6 +18,20 @@ Domain-Specific Language for constraint expressions used in template slot fillin ## Standard Library +The standard library includes string, collection, math, type-checking, and +model/simulation builtins, plus **structural-query builtins** that traverse a +dependency parse stored on an `Item` as token-level spans and relations +(`upos`, `xpos`, `lemma_of`, `form_of`, `deprel`, `morph`, `head`, `dependents`, +`has_relation`, `root`, `subtree`, `path_to_root`, `tokens_with_upos`, +`tokens_with_deprel`, `any_deprel`, `filter_upos`). These let a constraint query +syntactic structure, for example: + +```text +upos(self, root(self)) == "VERB" and len(dependents(self, root(self), "obj")) > 0 +``` + +which matches sentences whose root is a verb taking a direct object. + ::: bead.dsl.stdlib options: show_root_heading: true diff --git a/docs/api/interop.md b/docs/api/interop.md new file mode 100644 index 0000000..4973760 --- /dev/null +++ b/docs/api/interop.md @@ -0,0 +1,48 @@ +# bead.interop + +Lossless, law-verified interoperability mappings between bead models and +external schemas. The `layers` subpackage maps bead's corpus and annotation data +to the [layers](https://github.com/layers-pub/layers) linguistic-annotation +schema and back via didactic lenses (`dx.Iso` / `dx.Lens`); round-trip fidelity +is guaranteed by the GetPut/PutGet lens laws. + +See the [Layers Interoperability guide](../user-guide/api/layers-interop.md) for +usage. 
+ +## Bridge lenses (bead-native <-> layers) + +::: bead.interop.layers.graph_lens + options: + show_root_heading: true + show_source: false + +::: bead.interop.layers.bridges + options: + show_root_heading: true + show_source: false + +::: bead.interop.layers.parse_lens + options: + show_root_heading: true + show_source: false + +## Mirror models + +Faithful didactic mirrors of the layers constructs. + +::: bead.interop.layers.models + options: + show_root_heading: true + show_source: false + +::: bead.interop.layers.models_records + options: + show_root_heading: true + show_source: false + +## Generic mirror iso + +::: bead.interop.layers.model_lenses + options: + show_root_heading: true + show_source: false diff --git a/docs/api/tokenization.md b/docs/api/tokenization.md index eab394b..f96b5b1 100644 --- a/docs/api/tokenization.md +++ b/docs/api/tokenization.md @@ -16,6 +16,21 @@ Configurable multilingual tokenization for span annotation and UI display. show_root_heading: true show_source: false +## Dependency Parsing + +Dependency parsers (spaCy, Stanza) produce a per-sentence `ParsedSentence` of +`ParsedToken` records, and `parse_to_spans` projects a parse onto the standoff +`Span` + `SpanRelation` models used by `bead.items.Item`: one single-token +`Span` per token (carrying its governor as `head_index` and its +`upos`/`xpos`/`lemma`/`deprel`/morphology plus character offsets in +`span_metadata`), and one directed head-to-dependent `SpanRelation` per +syntactic arc labeled with the dependency relation. 
+ +::: bead.tokenization.parsers + options: + show_root_heading: true + show_source: false + ## Display-to-Subword Alignment ::: bead.tokenization.alignment diff --git a/docs/api/transforms.md b/docs/api/transforms.md new file mode 100644 index 0000000..03087c4 --- /dev/null +++ b/docs/api/transforms.md @@ -0,0 +1,33 @@ +# bead.transforms + +Value-level text transforms (`str -> str`, parameterised by a +`TransformContext`) used when rendering template slots and item prompts. +Transforms are registered by name in a `TransformRegistry`; any callable +conforming to the `SpanTextTransform` protocol can be registered. + +## Core Abstractions + +::: bead.transforms.base + options: + show_root_heading: true + show_source: false + +## Text Transforms + +Pure surface-string transforms. In addition to case transforms (`lower`, +`upper`, `capitalize`, `title`), this module provides `MarkdownStripTransform` +and `RedditCleanupTransform` for cleaning web/markdown text into plain prose, +and `split_sentences` for sentence segmentation (parser-backed when a +spaCy/Stanza config is given, with a regular-expression fallback otherwise). 
+ +::: bead.transforms.text + options: + show_root_heading: true + show_source: false + +## Morphological Transforms + +::: bead.transforms.morphology + options: + show_root_heading: true + show_source: false diff --git a/docs/installation.md b/docs/installation.md index 3ddf7c5..e27d514 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -61,10 +61,24 @@ uv sync --extra api # Active learning with PyTorch uv sync --extra training +# Tokenization and dependency parsing (spaCy, Stanza) +uv sync --extra tokenization + +# Corpus ingestion, including Zstandard-compressed (.zst) files +uv sync --extra corpus + # All dependencies uv sync --all-extras ``` +Structural corpus sampling (parsing a corpus and keeping only sentences whose +dependency structure matches a constraint) needs both the `corpus` and +`tokenization` extras: + +```bash +uv sync --extra corpus --extra tokenization +``` + ## TypeScript Development (jsPsych Plugins) If you need to modify or rebuild the jsPsych plugins, install Node.js dependencies: diff --git a/docs/user-guide/api/corpus.md b/docs/user-guide/api/corpus.md new file mode 100644 index 0000000..3ed66cb --- /dev/null +++ b/docs/user-guide/api/corpus.md @@ -0,0 +1,196 @@ +# Corpus Ingestion + +The `bead.corpus` package turns raw text corpora into experimental `Item`s. You +stream records from a source, dependency-parse them, and keep only those whose +syntactic structure matches a constraint. This is the natural way to build +naturalistic stimuli (for example, transitive-verb sentences drawn from a large +corpus) that then flow into the rest of the pipeline (items, lists, deployment). 
+ +## Installation + +```bash +# Streaming sources, including .zst corpora +uv sync --extra corpus + +# Dependency parsing (spaCy, Stanza) +uv sync --extra tokenization + +# Structural sampling needs both +uv sync --extra corpus --extra tokenization +``` + +## Sources + +A `CorpusSource` streams `CorpusRecord`s, each carrying `text`, a `source_name`, +a `record_index`, and a flat `provenance` dict. + +```python +from bead.corpus import CsvCorpusSource, JsonlCorpusSource + +# JSON Lines (a .jsonl.zst path is transparently decompressed) +reddit = JsonlCorpusSource( + "corpus/comments.jsonl", + text_field="body", + provenance_fields=("author", "subreddit", "score"), +) + +for record in reddit: + print(record.text, record.provenance["author"]) + #> The dog chased the cat in the yard. alice + #> She wrote a long and thoughtful letter. bob + #> They built a sturdy wooden fence. carol + +# CSV / TSV +items = CsvCorpusSource( + "corpus/sentences.csv", + text_column="sentence", + provenance_columns=("verb", "frequency"), +) +print([record.provenance["verb"] for record in items]) +#> ['chase', 'write'] +``` + +Sources are lazy iterators, so multi-gigabyte corpora (including +Zstandard-compressed `.jsonl.zst` files) are never loaded into memory. + +By default a source retains **every** field (not just the ones you name) so no +information is discarded: thread edges like Reddit's `parent_id`/`link_id` ride +along in `provenance` even if you do not list them, and nested values are +JSON-recoverable. Pass an explicit `provenance_fields` / `provenance_columns` +tuple only when you want to keep a subset. + +## Reconstructing Thread and Graph Structure + +Streaming is flat and fast. When you need the structure *between* records (a +Reddit reply tree, or any typed relation graph over expressions), buffer the +stream into a `CorpusGraph` with `assemble_graph`. This is an opt-in, in-memory +step on top of the streaming tier. 
+ +```python +from bead.corpus import CorpusRecord, EdgeSpec, assemble_graph + +# (records would normally come from a streaming source) +records = [ + CorpusRecord(text="the submission", source_name="r", provenance={"id": "sub"}), + CorpusRecord( + text="a reply", + source_name="r", + provenance={"id": "c1", "parent_id": "t3_sub"}, + ), + CorpusRecord( + text="a nested reply", + source_name="r", + provenance={"id": "c2", "parent_id": "t1_c1"}, + ), +] + +graph = assemble_graph( + records, + node_id_field="id", + edge_specs=[ + EdgeSpec( + target_field="parent_id", + edge_type="reply-to", + strip_prefixes=("t1_", "t3_"), # Reddit fullname prefixes + ) + ], +) + +# Edges point child -> parent ("reply-to"); reverse to walk the tree top-down. +tree = graph.reverse() +assert tree.roots("reply-to") == ("sub",) +assert set(tree.descendants("sub", "reply-to")) == {"c1", "c2"} +``` + +`CorpusGraph` is a directed, typed multigraph (parallel edges and multiple edge types +between a pair of nodes are allowed), so arbitrary expression graphs - not just trees - +are supported. It maps losslessly to the layers property graph; see the +[Layers Interoperability guide](layers-interop.md). + +## Structural Sampling + +`sample_corpus` streams a source through a dependency parser and yields only the +items whose parse satisfies a structural DSL constraint. The constraint is a +normal bead DSL expression with the item bound as `self`, using the structural +builtins (`root`, `dependents`, `upos`, `head`, `has_relation`, ...). + +```python +from uuid import uuid4 +from bead.corpus import JsonlCorpusSource, sample_corpus +from bead.tokenization.parsers import StanzaParser + +source = JsonlCorpusSource("comments.jsonl", text_field="body") +parser = StanzaParser(language="en") + +# Keep only sentences whose root verb takes a direct object. 
+constraint = ( + 'upos(self, root(self)) == "VERB" ' + 'and len(dependents(self, root(self), "obj")) > 0' +) + +items = list( + sample_corpus( + source, + parser, + constraint, + item_template_id=uuid4(), + limit=200, + ) +) +``` + +Each resulting `Item` carries the parse as standoff annotations: one token-level +`Span` per token (with its governor as `head_index` and its POS, lemma, deprel, +morphology, and character offsets in `span_metadata`) and one directed +head-to-dependent `SpanRelation` per syntactic arc. The record's provenance plus +the parser tool and formalism are recorded on `item.item_metadata`. + +## Composing the Pipeline by Hand + +`sample_corpus` is a convenience wrapper. The underlying generators can be +composed directly when you want to inspect or transform intermediate results: + +```python +from bead.corpus import parse_records, filter_by_structure + +pairs = parse_records(source, parser, split_sentences=True) +items = filter_by_structure(pairs, constraint, item_template_id=uuid4(), tool=parser.tool) +``` + +`parse_records` yields one `(record, sentence)` pair per sentence; set +`split_sentences=False` to keep only records that parse to a single sentence. + +## Cleaning Source Text + +Web and forum text often needs cleanup before parsing. The text transforms in +`bead.transforms` help: + +```python +from bead.transforms.base import TransformContext +from bead.transforms.text import RedditCleanupTransform, split_sentences + +clean = RedditCleanupTransform() +text = clean("see [the thread](http://x) & more", TransformContext()) +# -> "see the thread & more" + +sentences = split_sentences("First one. 
Second one.") +# -> ("First one.", "Second one.") +``` + +## Generated Corpora + +A language model can also act as a corpus source via `CompletionCorpusSource`, +which wraps any adapter implementing the `TextGenerator` protocol (for example +the OpenAI or Anthropic adapters): + +```python +from bead.corpus import CompletionCorpusSource +from bead.items.adapters import OpenAIAdapter # requires the `api` extra + +generator = OpenAIAdapter(model_name="gpt-4o", cache=...) +source = CompletionCorpusSource( + generator, + prompts=["Write a sentence about a cat.", "Write a sentence about a dog."], + completions_per_prompt=5, +) +``` diff --git a/docs/user-guide/api/index.md b/docs/user-guide/api/index.md index 8a33ab3..e6144f9 100644 --- a/docs/user-guide/api/index.md +++ b/docs/user-guide/api/index.md @@ -144,6 +144,11 @@ Each stage has detailed documentation: - [Stage 5: Deployment](deployment.md) - jsPsych generation, JATOS export - [Stage 6: Training](training.md) - Active learning, convergence detection +Upstream of Stage 1, you can build naturalistic stimuli directly from text: + +- [Corpus Ingestion](corpus.md) - Stream a corpus, dependency-parse it, and keep + only sentences whose syntactic structure matches a constraint + ## Complete Workflow See [workflows.md](workflows.md) for complete end-to-end examples with all configuration options. diff --git a/docs/user-guide/api/layers-interop.md b/docs/user-guide/api/layers-interop.md new file mode 100644 index 0000000..1be5d25 --- /dev/null +++ b/docs/user-guide/api/layers-interop.md @@ -0,0 +1,135 @@ +# Layers Interoperability + +bead maps its corpus and annotation data to the +[layers](https://github.com/layers-pub/layers) linguistic-annotation schema and +back **losslessly**, via didactic lenses (`dx.Iso` / `dx.Lens`). The forward +direction produces faithful, standalone layers-shaped JSON; the reverse +reconstructs the exact bead value. 
Because the mappings are lenses, the +round-trip is guaranteed by the didactic GetPut/PutGet laws (verified in the +test suite with `verify_iso` / `check_lens_laws`). + +There is no ATProto wire/network dependency: the lenses produce and consume +plain layers-shaped Python/JSON. + +## What is covered + +- Every linguistic `pub.layers` construct is mirrored as a faithful didactic + model in `bead.interop.layers.models` / `models_records` (the anchor union, + temporal/spatial expressions, token/text/page/external anchors, the + polymorphic annotation and annotation layer, the property graph, media + descriptors, ontology type definitions, knowledge references, and the shared + objects). Each has a lossless `MirrorIso` to layers JSON. +- bead's own pipeline outputs bridge directly to layers: + - `CorpusGraph` ↔ a layers property graph (expressions + graph nodes + a + `graphEdgeSet`). + - `CorpusRecord` ↔ a layers `expression`. + - a dependency `ParsedSentence` ↔ a layers `tokenization` plus part-of-speech + and dependency annotation layers. + +## Mapping a corpus graph + +```python +from bead.corpus.assemble import EdgeSpec, assemble_graph +from bead.corpus.records import CorpusRecord +from bead.interop.layers import CORPUS_GRAPH_LAYERS, graph_to_layers + +records = [ + CorpusRecord(text="the submission", source_name="r", provenance={"id": "sub"}), + CorpusRecord( + text="a reply", + source_name="r", + provenance={"id": "c1", "parent_id": "t3_sub"}, + ), +] +graph = assemble_graph( + records, + node_id_field="id", + edge_specs=[ + EdgeSpec( + target_field="parent_id", edge_type="reply-to", strip_prefixes=("t3_",) + ) + ], +) + +# Faithful, standalone layers-shaped projection. +view = graph_to_layers(graph) +assert set(view) == {"expressions", "graphNodes", "graphEdgeSet"} + +# Lossless round-trip via the lens (view + complement reconstruct exactly). 
+layers_view, complement = CORPUS_GRAPH_LAYERS.forward(graph) +assert CORPUS_GRAPH_LAYERS.backward(layers_view, complement) == graph +``` + +## Mapping a dependency parse + +```python +from bead.interop.layers import PARSED_SENTENCE_LAYERS, parse_to_layers +from bead.tokenization.parsers import ParsedSentence, ParsedToken + +sentence = ParsedSentence( + original_text="dogs bark", + tokens=( + ParsedToken( + index=0, + text="dogs", + upos="NOUN", + deprel="nsubj", + head=1, + start_char=0, + end_char=4, + ), + ParsedToken( + index=1, + text="bark", + upos="VERB", + deprel="root", + head=None, + start_char=5, + end_char=9, + ), + ), +) + +view = parse_to_layers(sentence) +assert view["dependencyLayer"]["subkind"] == "dependency" +# The root token is encoded with headIndex -1 (the layers convention). +assert view["dependencyLayer"]["annotations"][1]["headIndex"] == -1 +# Iso: the parse reconstructs exactly (no complement needed). +assert PARSED_SENTENCE_LAYERS.backward(view) == sentence +``` + +## Working with the mirror models directly + +Any layers construct can be built as a bead model and serialized to/from layers +JSON with its `MirrorIso`: + +```python +from bead.interop.layers import mirror_iso +from bead.interop.layers.models import Anchor, LayersUuid, TokenRef + +anchor = Anchor( + token_ref=TokenRef(tokenization_id=LayersUuid(value="tok-1"), token_index=4) +) +iso = mirror_iso(Anchor) + +layers_json = iso.forward(anchor) +assert layers_json["tokenRef"]["tokenIndex"] == 4 # camelCased, layers-shaped +assert iso.backward(layers_json) == anchor # exact round-trip +``` + +`bead.interop.layers.ALL_MIRROR_ISOS` maps every mirror model type to its iso, +and a coverage test guards that every targeted layers construct has a +law-passing mapping. 
+ +## Validating against the layers lexicons + +The mappings are checked against the canonical layers lexicons, vendored as the +`vendor/layers` git submodule pointing at +[`layers-pub/layers`](https://github.com/layers-pub/layers). The interop test +suite feeds every mapping's output through the ATProto lexicon validator +(`@atproto/lexicon`) and asserts each record validates against its lexicon, so a +schema drift in layers surfaces as a failing test. + +Fetch the lexicons with `git submodule update --init vendor/layers`, and pull the +latest published schemas with `git submodule update --remote vendor/layers`. The +validation tests skip when the submodule is not checked out. diff --git a/mkdocs.yml b/mkdocs.yml index 8c9ffed..c30d6a4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -75,6 +75,8 @@ nav: - Templates: user-guide/api/templates.md - Items: user-guide/api/items.md - Lists: user-guide/api/lists.md + - Corpus Ingestion: user-guide/api/corpus.md + - Layers Interoperability: user-guide/api/layers-interop.md - Deployment: user-guide/api/deployment.md - Training: user-guide/api/training.md - Workflows: user-guide/api/workflows.md @@ -86,6 +88,9 @@ nav: - bead.lists: api/lists.md - bead.deployment: api/deployment.md - bead.tokenization: api/tokenization.md + - bead.transforms: api/transforms.md + - bead.corpus: api/corpus.md + - bead.interop: api/interop.md - bead.active_learning: api/active_learning.md - bead.simulation: api/simulation.md - bead.evaluation: api/evaluation.md diff --git a/pyproject.toml b/pyproject.toml index 2af2ed5..70becde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "bead" -version = "0.5.0" +version = "0.6.0" description = "Lexicon and Template Collection Construction Pipeline for Acceptability and Inference Judgment Data" authors = [{name = "Aaron Steven White", email = "aaron.white@rochester.edu"}] readme = "README.md" @@ -23,8 +23,8 @@ classifiers = [ ] requires-python 
= ">=3.14" dependencies = [ - "didactic>=0.6.2", - "panproto>=0.43", + "didactic>=0.7.2", + "panproto>=0.51.0", "pyyaml>=6.0.0", "jinja2>=3.0.0", "uuid-utils>=0.7.0", @@ -72,6 +72,8 @@ dev = [ "pandas-stubs>=2.0.0", "spacy>=3.7", "stanza>=1.8", + "zstandard>=0.22", + "hypothesis>=6.155.0", ] api = [ "openai>=1.0.0", @@ -95,13 +97,16 @@ tokenization = [ "spacy>=3.7", "stanza>=1.8", ] +corpus = [ + "zstandard>=0.22", +] [project.scripts] bead = "bead.cli.main:cli" [tool.pytest.ini_options] testpaths = ["tests", "docs/user-guide/cli"] -norecursedirs = ["tests/fixtures"] +norecursedirs = ["tests/fixtures", ".hypothesis"] addopts = ["-ra", "--strict-markers", "--cov=bead", "--cov-report=term-missing"] markers = [ "slow_model_training: marks tests that train ML models (deselect with '-m \"not slow_model_training\"')", @@ -111,7 +116,7 @@ markers = [ [tool.ruff] line-length = 88 target-version = "py314" -exclude = ["gallery"] +exclude = ["gallery", "vendor"] [tool.ruff.lint] select = ["E", "F", "I", "N", "D", "UP", "ANN", "B", "A", "C4", "PLC"] @@ -178,3 +183,8 @@ exclude = [ # Items cache has numpy ndarray type issues "bead/items/cache.py", ] + +[dependency-groups] +dev = [ + "hypothesis>=6.155.0", +] diff --git a/tests/corpus/__init__.py b/tests/corpus/__init__.py new file mode 100644 index 0000000..a1646e1 --- /dev/null +++ b/tests/corpus/__init__.py @@ -0,0 +1 @@ +"""Corpus ingestion test package.""" diff --git a/tests/corpus/test_assemble.py b/tests/corpus/test_assemble.py new file mode 100644 index 0000000..7011921 --- /dev/null +++ b/tests/corpus/test_assemble.py @@ -0,0 +1,104 @@ +"""Tests for assembling a corpus graph from a record stream.""" + +from __future__ import annotations + +from collections.abc import Iterable + +from bead.corpus.assemble import EdgeSpec, assemble_graph +from bead.corpus.graph import CorpusEdge +from bead.corpus.records import CorpusRecord, ProvenanceValue + + +def _record(text: str, **provenance: ProvenanceValue) -> CorpusRecord: + 
return CorpusRecord(text=text, source_name="reddit", provenance=dict(provenance)) + + +def _reddit_thread() -> list[CorpusRecord]: + # submission + three comments forming a reply tree + return [ + _record("the submission", id="sub"), + _record("top reply", id="c1", parent_id="t3_sub"), + _record("nested reply", id="c2", parent_id="t1_c1"), + _record("another nested reply", id="c3", parent_id="t1_c1"), + ] + + +_REPLY = EdgeSpec( + target_field="parent_id", edge_type="reply-to", strip_prefixes=("t1_", "t3_") +) + + +class TestRedditReplyTree: + """Reconstructs a Reddit reply tree (edges child -> parent).""" + + def test_edges_and_prefix_stripping(self) -> None: + g = assemble_graph(_reddit_thread(), node_id_field="id", edge_specs=[_REPLY]) + assert {n.node_id for n in g.nodes} == {"sub", "c1", "c2", "c3"} + # c1 replies to the submission (t3_ prefix stripped) + assert g.successors("c1", "reply-to") == ("sub",) + # c2 and c3 reply to c1 (t1_ prefix stripped) + assert set(g.predecessors("c1", "reply-to")) == {"c2", "c3"} + # the submission replies to nothing + assert g.out_edges("sub", "reply-to") == () + + def test_full_tree_via_reverse(self) -> None: + # Reverse the child->parent edges to get parent->child, then the + # submission is the unique root and its descendants are the thread. 
+ g = assemble_graph( + _reddit_thread(), node_id_field="id", edge_specs=[_REPLY] + ).reverse() + assert g.roots("reply-to") == ("sub",) + assert set(g.descendants("sub", "reply-to")) == {"c1", "c2", "c3"} + + def test_records_preserved_on_nodes(self) -> None: + g = assemble_graph(_reddit_thread(), node_id_field="id", edge_specs=[_REPLY]) + node = g.node_by_id("c2") + assert node is not None + assert node.record is not None + assert node.record.text == "nested reply" + # losslessly retained provenance still present on the wrapped record + assert node.record.provenance["parent_id"] == "t1_c1" + + +class TestGeneralGraph: + """Arbitrary typed multidigraphs, dangling targets, and edge_fn.""" + + def test_multiple_edge_specs(self) -> None: + records = [ + _record("x", id="x", parent_id="root", author="alice"), + _record("y", id="y", parent_id="x", author="alice"), + ] + specs = [ + EdgeSpec(target_field="parent_id", edge_type="reply-to"), + EdgeSpec(target_field="author", edge_type="authored-by"), + ] + g = assemble_graph(records, node_id_field="id", edge_specs=specs) + assert g.successors("y", "reply-to") == ("x",) + assert g.successors("y", "authored-by") == ("alice",) + + def test_dangling_target_preserved(self) -> None: + # parent_id 'root' has no node; the edge is kept, not dropped. 
+ records = [_record("x", id="x", parent_id="root")] + g = assemble_graph(records, node_id_field="id", edge_specs=[_REPLY]) + assert g.successors("x", "reply-to") == ("root",) + assert g.node_by_id("root") is None + + def test_edge_fn(self) -> None: + def link_pairs(record: CorpusRecord, node_id: str) -> Iterable[CorpusEdge]: + mentions = record.provenance.get("mentions") + if isinstance(mentions, str): + return [ + CorpusEdge( + source_id=node_id, target_id=mentions, edge_type="mentions" + ) + ] + return [] + + records = [_record("x", id="x", mentions="y"), _record("y", id="y")] + g = assemble_graph(records, node_id_field="id", edge_fn=link_pairs) + assert g.successors("x", "mentions") == ("y",) + + def test_records_without_node_id_skipped(self) -> None: + records = [_record("x", id="x"), _record("no id")] + g = assemble_graph(records, node_id_field="id") + assert {n.node_id for n in g.nodes} == {"x"} diff --git a/tests/corpus/test_graph.py b/tests/corpus/test_graph.py new file mode 100644 index 0000000..c11693e --- /dev/null +++ b/tests/corpus/test_graph.py @@ -0,0 +1,83 @@ +"""Tests for the corpus graph (typed multidigraph) and its traversal.""" + +from __future__ import annotations + +from bead.corpus.graph import CorpusEdge, CorpusGraph, CorpusNode + + +def _graph() -> CorpusGraph: + # a -> b -> c, plus a parallel typed edge a =mentions=> c + nodes = ( + CorpusNode(node_id="a"), + CorpusNode(node_id="b"), + CorpusNode(node_id="c"), + ) + edges = ( + CorpusEdge(source_id="a", target_id="b", edge_type="next"), + CorpusEdge(source_id="b", target_id="c", edge_type="next"), + CorpusEdge(source_id="a", target_id="c", edge_type="mentions"), + ) + return CorpusGraph(nodes=nodes, edges=edges) + + +class TestTraversal: + """Tests for the graph traversal helpers.""" + + def test_node_by_id(self) -> None: + g = _graph() + assert g.node_by_id("b") is not None + assert g.node_by_id("missing") is None + + def test_out_in_edges_typed(self) -> None: + g = _graph() + assert 
len(g.out_edges("a")) == 2 + assert len(g.out_edges("a", "next")) == 1 + assert len(g.in_edges("c")) == 2 + assert len(g.in_edges("c", "mentions")) == 1 + + def test_successors_predecessors(self) -> None: + g = _graph() + assert set(g.successors("a")) == {"b", "c"} + assert g.successors("a", "next") == ("b",) + assert g.predecessors("c", "next") == ("b",) + assert g.predecessors("c", "mentions") == ("a",) + + def test_roots(self) -> None: + g = _graph() + # only 'a' has no incoming edge + assert g.roots() == ("a",) + + def test_descendants_follows_type(self) -> None: + g = _graph() + assert g.descendants("a", "next") == ("b", "c") + assert g.descendants("a", "mentions") == ("c",) + + def test_descendants_cycle_guarded(self) -> None: + nodes = (CorpusNode(node_id="x"), CorpusNode(node_id="y")) + edges = ( + CorpusEdge(source_id="x", target_id="y", edge_type="e"), + CorpusEdge(source_id="y", target_id="x", edge_type="e"), + ) + g = CorpusGraph(nodes=nodes, edges=edges) + # does not loop forever; visits the other node once + assert g.descendants("x") == ("y",) + + def test_reverse(self) -> None: + g = _graph().reverse() + # edges flipped: b->a, c->b, c->a + assert g.successors("c") == ("b", "a") + assert g.roots() == ("c",) + + +class TestMultidigraph: + """Parallel edges of the same type between a pair are permitted.""" + + def test_parallel_edges_same_type(self) -> None: + nodes = (CorpusNode(node_id="a"), CorpusNode(node_id="b")) + edges = ( + CorpusEdge(source_id="a", target_id="b", edge_type="cites"), + CorpusEdge(source_id="a", target_id="b", edge_type="cites"), + ) + g = CorpusGraph(nodes=nodes, edges=edges) + assert len(g.out_edges("a", "cites")) == 2 + assert g.successors("a") == ("b", "b") diff --git a/tests/corpus/test_pipeline.py b/tests/corpus/test_pipeline.py new file mode 100644 index 0000000..bcd9026 --- /dev/null +++ b/tests/corpus/test_pipeline.py @@ -0,0 +1,207 @@ +"""Tests for the streaming corpus pipeline.""" + +from __future__ import annotations 
+ +from uuid import uuid4 + +import pytest + +from bead.corpus.pipeline import ( + filter_by_structure, + parse_records, + record_to_item, + sample_corpus, +) +from bead.corpus.records import CorpusRecord +from bead.tokenization.parsers import ParsedSentence, ParsedToken, StanzaParser + +# A structural constraint: root is a verb that takes a direct object. +TRANSITIVE = ( + 'upos(self, root(self)) == "VERB" and len(dependents(self, root(self), "obj")) > 0' +) + + +def _transitive_parse() -> ParsedSentence: + return ParsedSentence( + original_text="The dog chased the cat", + tokens=( + ParsedToken(index=0, text="The", upos="DET", deprel="det", head=1), + ParsedToken(index=1, text="dog", upos="NOUN", deprel="nsubj", head=2), + ParsedToken(index=2, text="chased", upos="VERB", deprel="root", head=None), + ParsedToken(index=3, text="the", upos="DET", deprel="det", head=4), + ParsedToken(index=4, text="cat", upos="NOUN", deprel="obj", head=2), + ), + ) + + +def _intransitive_parse() -> ParsedSentence: + return ParsedSentence( + original_text="The dog slept", + tokens=( + ParsedToken(index=0, text="The", upos="DET", deprel="det", head=1), + ParsedToken(index=1, text="dog", upos="NOUN", deprel="nsubj", head=2), + ParsedToken(index=2, text="slept", upos="VERB", deprel="root", head=None), + ), + ) + + +class _StubParser: + """A deterministic parser keyed on text, satisfying DependencyParser.""" + + tool = "stub" + + def __init__(self, mapping: dict[str, tuple[ParsedSentence, ...]]) -> None: + self._mapping = mapping + + def __call__(self, text: str) -> tuple[ParsedSentence, ...]: + return self._mapping[text] + + +def _records() -> list[CorpusRecord]: + return [ + CorpusRecord( + text="The dog chased the cat", + source_name="corpus", + record_index=0, + provenance={"author": "alice"}, + ), + CorpusRecord( + text="The dog slept", + source_name="corpus", + record_index=1, + provenance={"author": "bob"}, + ), + ] + + +def _parser() -> _StubParser: + return _StubParser( + { + 
"The dog chased the cat": (_transitive_parse(),), + "The dog slept": (_intransitive_parse(),), + } + ) + + +class TestRecordToItem: + """Tests for building an item from a record and its parse.""" + + def test_builds_item_with_provenance(self) -> None: + template_id = uuid4() + record = _records()[0] + item = record_to_item( + record, _transitive_parse(), item_template_id=template_id, tool="stub" + ) + assert item.item_template_id == template_id + assert item.rendered_elements["text"] == "The dog chased the cat" + assert len(item.spans) == 5 + assert len(item.span_relations) == 4 + # layers-aligned + source provenance on item_metadata + assert item.item_metadata["author"] == "alice" + assert item.item_metadata["source_name"] == "corpus" + assert item.item_metadata["parser_tool"] == "stub" + assert item.item_metadata["subkind"] == "dependency" + assert item.item_metadata["corpus_record_id"] == str(record.id) + assert item.tokenized_elements["text"] == ( + "The", + "dog", + "chased", + "the", + "cat", + ) + + +class TestParseRecords: + """Tests for parsing records into sentence pairs.""" + + def test_one_pair_per_sentence(self) -> None: + multi = CorpusRecord(text="multi", source_name="c") + parser = _StubParser({"multi": (_transitive_parse(), _intransitive_parse())}) + pairs = list(parse_records([multi], parser)) + assert len(pairs) == 2 + + def test_split_sentences_false_skips_multi(self) -> None: + multi = CorpusRecord(text="multi", source_name="c") + single = CorpusRecord(text="single", source_name="c") + parser = _StubParser( + { + "multi": (_transitive_parse(), _intransitive_parse()), + "single": (_transitive_parse(),), + } + ) + pairs = list(parse_records([multi, single], parser, split_sentences=False)) + assert len(pairs) == 1 + assert pairs[0][0].text == "single" + + +class TestFilterByStructure: + """Tests for structural rejection sampling.""" + + def test_keeps_only_transitive(self) -> None: + pairs = list(parse_records(_records(), _parser())) + items = 
list( + filter_by_structure( + pairs, TRANSITIVE, item_template_id=uuid4(), tool="stub" + ) + ) + assert len(items) == 1 + assert items[0].rendered_elements["text"] == "The dog chased the cat" + + +class TestSampleCorpus: + """Tests for the end-to-end convenience generator.""" + + def test_filters_and_builds_items(self) -> None: + items = list( + sample_corpus( + _records(), + _parser(), + TRANSITIVE, + item_template_id=uuid4(), + ) + ) + assert len(items) == 1 + assert items[0].item_metadata["author"] == "alice" + + def test_limit(self) -> None: + # both records match a trivially-true constraint; limit caps output + items = list( + sample_corpus( + _records(), + _parser(), + "root(self) >= 0", + item_template_id=uuid4(), + limit=1, + ) + ) + assert len(items) == 1 + + +class TestSampleCorpusStanzaIntegration: + """End-to-end with a real Stanza parser (skips only if model unavailable).""" + + def test_filters_transitive_with_real_parser(self) -> None: + stanza = pytest.importorskip("stanza") + try: + stanza.download( + "en", processors="tokenize,pos,lemma,depparse", verbose=False + ) + except Exception as exc: # pragma: no cover - network dependent + pytest.skip(f"Stanza English model unavailable (no network?): {exc}") + + records = [ + CorpusRecord(text="The dog chased the cat.", source_name="c"), + CorpusRecord(text="The dog slept peacefully.", source_name="c"), + CorpusRecord(text="She wrote a long letter.", source_name="c"), + ] + items = list( + sample_corpus( + records, + StanzaParser(language="en"), + TRANSITIVE, + item_template_id=uuid4(), + ) + ) + kept = {item.rendered_elements["text"] for item in items} + assert kept == {"The dog chased the cat.", "She wrote a long letter."} + assert all(it.item_metadata["parser_tool"] == "stanza" for it in items) diff --git a/tests/corpus/test_sources.py b/tests/corpus/test_sources.py new file mode 100644 index 0000000..9280b92 --- /dev/null +++ b/tests/corpus/test_sources.py @@ -0,0 +1,221 @@ +"""Tests for streaming 
corpus sources.""" + +from __future__ import annotations + +import json +from collections.abc import Mapping, Sequence +from pathlib import Path + +import pytest + +from bead.corpus.records import CorpusRecord +from bead.corpus.sources import ( + CompletionCorpusSource, + CsvCorpusSource, + JsonlCorpusSource, +) +from bead.data.serialization import ( + read_jsonlines, + stream_jsonlines, + write_jsonlines, +) + +type _Json = str | int | float | bool | None | list["_Json"] | dict[str, "_Json"] + +_REDDIT_ROWS: list[dict[str, _Json]] = [ + {"body": "The dog chased the cat.", "author": "alice", "score": 12}, + {"body": "The dog slept.", "author": "bob", "score": 3}, + {"author": "carol", "score": 1}, # no body: skipped +] + + +def _write_jsonl(path: Path, rows: Sequence[Mapping[str, _Json]]) -> None: + path.write_text("\n".join(json.dumps(row) for row in rows) + "\n", encoding="utf-8") + + +class TestJsonlCorpusSource: + """Tests for plain and compressed JSONL ingestion.""" + + def test_plain_jsonl(self, tmp_path: Path) -> None: + path = tmp_path / "reddit.jsonl" + _write_jsonl(path, _REDDIT_ROWS) + source = JsonlCorpusSource( + path, text_field="body", provenance_fields=("author", "score") + ) + records = list(source) + assert len(records) == 2 # row without "body" is skipped + assert records[0].text == "The dog chased the cat." 
+ assert records[0].source_name == "reddit.jsonl" + assert records[0].record_index == 0 + assert records[0].provenance == {"author": "alice", "score": 12} + assert records[1].provenance["author"] == "bob" + + def test_zst_jsonl(self, tmp_path: Path) -> None: + zstandard = pytest.importorskip("zstandard") + path = tmp_path / "reddit.jsonl.zst" + payload = "\n".join(json.dumps(row) for row in _REDDIT_ROWS) + "\n" + with zstandard.open(path, "wt", encoding="utf-8") as handle: + handle.write(payload) + + source = JsonlCorpusSource( + path, text_field="body", provenance_fields=("author",) + ) + records = list(source) + assert [r.text for r in records] == [ + "The dog chased the cat.", + "The dog slept.", + ] + assert records[0].provenance == {"author": "alice"} + + def test_custom_source_name(self, tmp_path: Path) -> None: + path = tmp_path / "data.jsonl" + _write_jsonl(path, [{"text": "hello"}]) + source = JsonlCorpusSource(path, source_name="my-corpus") + assert list(source)[0].source_name == "my-corpus" + + def test_is_lazy(self, tmp_path: Path) -> None: + path = tmp_path / "data.jsonl" + _write_jsonl(path, [{"text": "a"}, {"text": "b"}, {"text": "c"}]) + source = JsonlCorpusSource(path) + iterator = iter(source) + first = next(iterator) + assert first.text == "a" # did not consume the whole file + + def test_retains_all_fields_by_default(self, tmp_path: Path) -> None: + # The default must not drop ANY field - thread edges (parent_id, + # link_id) survive without being enumerated, so structure is + # recoverable downstream. 
+ path = tmp_path / "reddit.jsonl" + rows: list[dict[str, _Json]] = [ + { + "body": "a reply", + "id": "t1_aaa", + "parent_id": "t1_root", + "link_id": "t3_sub", + "author": "alice", + "score": 4, + } + ] + _write_jsonl(path, rows) + record = next(iter(JsonlCorpusSource(path, text_field="body"))) + # every field except the text field is retained + assert record.provenance == { + "id": "t1_aaa", + "parent_id": "t1_root", + "link_id": "t3_sub", + "author": "alice", + "score": 4, + } + assert "body" not in record.provenance + + def test_nested_values_round_trip(self, tmp_path: Path) -> None: + # Non-scalar fields are JSON-serialized (not str()-ified), so they + # remain recoverable via json.loads. + path = tmp_path / "nested.jsonl" + rows: list[dict[str, _Json]] = [ + {"text": "hi", "edits": [1, 2], "meta": {"k": "v"}} + ] + _write_jsonl(path, rows) + record = next(iter(JsonlCorpusSource(path))) + assert json.loads(str(record.provenance["edits"])) == [1, 2] + assert json.loads(str(record.provenance["meta"])) == {"k": "v"} + + +class TestCsvCorpusSource: + """Tests for CSV/TSV ingestion.""" + + def test_csv(self, tmp_path: Path) -> None: + path = tmp_path / "items.csv" + path.write_text( + "sentence,verb,frequency\n" + "The dog chased the cat.,chase,100\n" + "The dog slept.,sleep,50\n", + encoding="utf-8", + ) + source = CsvCorpusSource( + path, text_column="sentence", provenance_columns=("verb", "frequency") + ) + records = list(source) + assert len(records) == 2 + assert records[0].text == "The dog chased the cat." + assert records[0].provenance == {"verb": "chase", "frequency": "100"} + + def test_tsv(self, tmp_path: Path) -> None: + path = tmp_path / "items.tsv" + path.write_text("sentence\tverb\nHello world.\tnone\n", encoding="utf-8") + source = CsvCorpusSource(path, text_column="sentence", sep="\t") + records = list(source) + assert len(records) == 1 + assert records[0].text == "Hello world." 
+ + def test_skips_empty_text(self, tmp_path: Path) -> None: + path = tmp_path / "items.csv" + path.write_text("sentence\nfull\n\nalso full\n", encoding="utf-8") + source = CsvCorpusSource(path, text_column="sentence") + assert [r.text for r in source] == ["full", "also full"] + + +class _StubGenerator: + """A deterministic text generator satisfying TextGenerator.""" + + model_name = "stub-model" + + def __init__(self, mapping: dict[str, str]) -> None: + self._mapping = mapping + self.calls: list[tuple[str, int, float]] = [] + + def generate_completion( + self, prompt: str, *, max_tokens: int = 256, temperature: float = 1.0 + ) -> str: + self.calls.append((prompt, max_tokens, temperature)) + return self._mapping[prompt] + + +class TestCompletionCorpusSource: + """Tests for generating a corpus from a language model.""" + + def test_yields_one_record_per_completion(self) -> None: + generator = _StubGenerator( + {"Write a sentence.": "The dog barked.", "Another one.": "Cats sleep."} + ) + source = CompletionCorpusSource( + generator, ["Write a sentence.", "Another one."] + ) + records = list(source) + assert [r.text for r in records] == ["The dog barked.", "Cats sleep."] + assert records[0].source_name == "stub-model" + assert records[0].provenance["model"] == "stub-model" + assert records[0].provenance["tool"] == "completion" + assert records[0].provenance["prompt"] == "Write a sentence." 
+ assert records[1].record_index == 1 + + def test_completions_per_prompt(self) -> None: + generator = _StubGenerator({"p": "out"}) + source = CompletionCorpusSource( + generator, ["p"], completions_per_prompt=3, max_tokens=10, temperature=0.5 + ) + records = list(source) + assert len(records) == 3 + assert generator.calls == [("p", 10, 0.5)] * 3 + + +class TestCorpusRecordRoundTrip: + """CorpusRecord is a BeadBaseModel and round-trips through JSONLines.""" + + def test_round_trip(self, tmp_path: Path) -> None: + records = [ + CorpusRecord( + text="hello", + source_name="s", + record_index=0, + provenance={"author": "x", "score": 1}, + ) + ] + path = tmp_path / "records.jsonl" + write_jsonlines(records, path) + loaded = read_jsonlines(path, CorpusRecord) + assert loaded[0].text == "hello" + assert loaded[0].provenance == {"author": "x", "score": 1} + # streaming reader (which now shares iter_jsonl_lines) agrees + streamed = list(stream_jsonlines(path, CorpusRecord)) + assert streamed[0].id == loaded[0].id diff --git a/tests/dsl/test_evaluator.py b/tests/dsl/test_evaluator.py index 3174133..6753524 100644 --- a/tests/dsl/test_evaluator.py +++ b/tests/dsl/test_evaluator.py @@ -705,7 +705,7 @@ def test_evaluate_type_error_in_operator() -> None: left=ast.Literal(kind="literal", value="hello"), right=ast.Literal(kind="literal", value=5), ) - with pytest.raises(EvaluationError, match="Type error in operation"): + with pytest.raises(EvaluationError, match="Cannot compare"): evaluator.evaluate(node, ctx) diff --git a/tests/dsl/test_structural.py b/tests/dsl/test_structural.py new file mode 100644 index 0000000..adf250b --- /dev/null +++ b/tests/dsl/test_structural.py @@ -0,0 +1,236 @@ +"""Tests for DSL structural-query builtins over a dependency parse. + +Also includes the layers no-drop smoke test: every field a layers dependency +``AnnotationLayer``/``Annotation`` needs must be reconstructable from a parsed +``Item``. 
+""" + +from __future__ import annotations + +from uuid import uuid4 + +from bead.dsl.evaluator import DSLEvaluator +from bead.items.item import Item +from bead.tokenization.parsers import ( + UNIVERSAL_DEPENDENCIES, + ParsedSentence, + ParsedToken, + parse_to_spans, +) + + +def _known_sentence() -> ParsedSentence: + """Hand-built parse of 'The dog chased the cat'.""" + return ParsedSentence( + original_text="The dog chased the cat", + tokens=( + ParsedToken( + index=0, + text="The", + lemma="the", + upos="DET", + deprel="det", + head=1, + start_char=0, + end_char=3, + ), + ParsedToken( + index=1, + text="dog", + lemma="dog", + upos="NOUN", + deprel="nsubj", + head=2, + morph={"Number": "Sing"}, + start_char=4, + end_char=7, + ), + ParsedToken( + index=2, + text="chased", + lemma="chase", + upos="VERB", + deprel="root", + head=None, + morph={"Tense": "Past"}, + start_char=8, + end_char=14, + ), + ParsedToken( + index=3, + text="the", + lemma="the", + upos="DET", + deprel="det", + head=4, + start_char=15, + end_char=18, + ), + ParsedToken( + index=4, + text="cat", + lemma="cat", + upos="NOUN", + deprel="obj", + head=2, + morph={"Number": "Sing"}, + start_char=19, + end_char=22, + ), + ), + ) + + +def _parsed_item() -> Item: + sentence = _known_sentence() + spans, relations = parse_to_spans( + sentence, element_name="text", tokenization_id="tok-1", tool="test" + ) + return Item( + item_template_id=uuid4(), + rendered_elements={"text": sentence.original_text}, + spans=spans, + span_relations=relations, + tokenized_elements={"text": tuple(t.text for t in sentence.tokens)}, + ) + + +def _eval(expression: str): + item = _parsed_item() + return DSLEvaluator().evaluate(expression, {"self": item, "item": item}) + + +class TestTokenAttributeBuiltins: + """Tests for per-token attribute accessors.""" + + def test_upos(self) -> None: + assert _eval("upos(self, 2)") == "VERB" + assert _eval("upos(self, 1)") == "NOUN" + + def test_lemma_and_deprel(self) -> None: + assert 
_eval("lemma_of(self, 2)") == "chase" + assert _eval("deprel(self, 1)") == "nsubj" + assert _eval("deprel(self, 2)") == "root" + + def test_morph(self) -> None: + assert _eval("morph(self, 1, 'Number')") == "Sing" + assert _eval("morph(self, 2, 'Tense')") == "Past" + assert _eval("morph(self, 0, 'Number')") is None + + def test_missing_token(self) -> None: + assert _eval("upos(self, 99)") is None + + +class TestGraphBuiltins: + """Tests for graph traversal accessors.""" + + def test_root(self) -> None: + assert _eval("root(self)") == 2 + + def test_head(self) -> None: + assert _eval("head(self, 1)") == 2 + assert _eval("head(self, 0)") == 1 + assert _eval("head(self, 2)") is None # root + + def test_dependents(self) -> None: + assert _eval("dependents(self, 2)") == [1, 4] + assert _eval("dependents(self, 2, 'obj')") == [4] + assert _eval("dependents(self, 2, 'nsubj')") == [1] + assert _eval("dependents(self, 0)") == [] + + def test_has_relation(self) -> None: + assert _eval("has_relation(self, 2, 4, 'obj')") is True + assert _eval("has_relation(self, 2, 1, 'obj')") is False + assert _eval("has_relation(self, 2, 4)") is True + + def test_tokens_with(self) -> None: + assert _eval("tokens_with_upos(self, 'NOUN')") == [1, 4] + assert _eval("tokens_with_deprel(self, 'det')") == [0, 3] + + def test_path_to_root(self) -> None: + assert _eval("path_to_root(self, 0)") == [0, 1, 2] + + def test_subtree(self) -> None: + assert _eval("subtree(self, 2)") == [0, 1, 2, 3, 4] + assert _eval("subtree(self, 4)") == [3, 4] + + def test_helpers_avoid_comprehensions(self) -> None: + assert _eval("any_deprel(self, [0, 1], 'nsubj')") is True + assert _eval("filter_upos(self, [0, 1, 2], 'DET')") == [0] + + +class TestStructuralConstraints: + """Tests for full constraint expressions over structure.""" + + def test_transitive_verb_constraint(self) -> None: + expr = ( + 'upos(self, root(self)) == "VERB" ' + 'and len(dependents(self, root(self), "obj")) > 0' + ) + assert _eval(expr) is True + 
+ def test_intransitive_fails_object_check(self) -> None: + # 'cat' (index 4) has no object dependent + assert _eval('len(dependents(self, 4, "obj")) > 0') is False + + +class TestLayersNoDropSmoke: + """Every field a layers dependency annotation needs is reconstructable.""" + + def test_all_layers_fields_present(self) -> None: + item = _parsed_item() + token_spans = { + span.segments[0].indices[0]: span + for span in item.spans + if span.span_type == "token" + } + # one token span per token + assert set(token_spans) == {0, 1, 2, 3, 4} + + for span in token_spans.values(): + md = span.span_metadata + # layer-level discriminators + assert md["tokenization_id"] == "tok-1" + assert md["formalism"] == UNIVERSAL_DEPENDENCIES + assert md["tool"] == "test" + # per-token annotation fields + assert "upos" in md + assert "lemma" in md + assert "deprel" in md + # char offsets (layers' canonical byte offsets derive from these) + assert isinstance(md["start_char"], int) + assert isinstance(md["end_char"], int) + # head_index present (None only for the root) + if md["deprel"] != "root": + assert span.head_index is not None + + # arcs reconstructable as head -> dependent with a deprel label + for relation in item.span_relations: + assert relation.directed + assert relation.label is not None + assert relation.source_span_id in {s.span_id for s in item.spans} + assert relation.target_span_id in {s.span_id for s in item.spans} + + def test_reconstruct_conllu_like_rows(self) -> None: + """Reconstruct (id, form, upos, head, deprel) rows from the Item.""" + item = _parsed_item() + evaluator = DSLEvaluator() + rows = [] + for index in range(5): + ctx = {"self": item, "item": item} + rows.append( + ( + index, + evaluator.evaluate(f"upos(self, {index})", ctx), + evaluator.evaluate(f"head(self, {index})", ctx), + evaluator.evaluate(f"deprel(self, {index})", ctx), + ) + ) + + assert rows == [ + (0, "DET", 1, "det"), + (1, "NOUN", 2, "nsubj"), + (2, "VERB", None, "root"), + (3, "DET", 4, 
"det"), + (4, "NOUN", 2, "obj"), + ] diff --git a/tests/fixtures/api_docs/corpus/comments.jsonl b/tests/fixtures/api_docs/corpus/comments.jsonl new file mode 100644 index 0000000..0034f3a --- /dev/null +++ b/tests/fixtures/api_docs/corpus/comments.jsonl @@ -0,0 +1,3 @@ +{"body": "The dog chased the cat in the yard.", "author": "alice", "subreddit": "animals", "score": 12} +{"body": "She wrote a long and thoughtful letter.", "author": "bob", "subreddit": "writing", "score": 7} +{"body": "They built a sturdy wooden fence.", "author": "carol", "subreddit": "diy", "score": 3} diff --git a/tests/fixtures/api_docs/corpus/sentences.csv b/tests/fixtures/api_docs/corpus/sentences.csv new file mode 100644 index 0000000..4081e9c --- /dev/null +++ b/tests/fixtures/api_docs/corpus/sentences.csv @@ -0,0 +1,3 @@ +sentence,verb,frequency +The dog chased the cat.,chase,120 +She wrote a letter.,write,95 diff --git a/tests/interop/__init__.py b/tests/interop/__init__.py new file mode 100644 index 0000000..4dc2d2f --- /dev/null +++ b/tests/interop/__init__.py @@ -0,0 +1 @@ +"""Tests for bead <-> layers interoperability lenses.""" diff --git a/tests/interop/lexicon_validator/.gitignore b/tests/interop/lexicon_validator/.gitignore new file mode 100644 index 0000000..504afef --- /dev/null +++ b/tests/interop/lexicon_validator/.gitignore @@ -0,0 +1,2 @@ +node_modules/ +package-lock.json diff --git a/tests/interop/lexicon_validator/package.json b/tests/interop/lexicon_validator/package.json new file mode 100644 index 0000000..dac3317 --- /dev/null +++ b/tests/interop/lexicon_validator/package.json @@ -0,0 +1,9 @@ +{ + "name": "bead-layers-lexicon-validator", + "private": true, + "type": "module", + "description": "Validates bead's layers-mapping output against the layers lexicons using @atproto/lexicon.", + "dependencies": { + "@atproto/lexicon": "^0.5.0" + } +} diff --git a/tests/interop/lexicon_validator/validate.mjs b/tests/interop/lexicon_validator/validate.mjs new file mode 100644 
index 0000000..92d4723 --- /dev/null +++ b/tests/interop/lexicon_validator/validate.mjs @@ -0,0 +1,45 @@ +// Validates JSON values against the layers lexicons using the ATProto lexicon +// validation machinery (@atproto/lexicon). Reads a JSON array of +// {lexUri, value} pairs from stdin and writes a JSON array of {ok, error?}. +import { readFileSync, readdirSync } from "fs"; +import { dirname, join } from "path"; +import { fileURLToPath } from "url"; +import { Lexicons } from "@atproto/lexicon"; + +const here = dirname(fileURLToPath(import.meta.url)); +const lexiconDir = + process.env.LAYERS_LEXICON_DIR || + join(here, "..", "..", "..", "vendor", "layers", "lexicons", "pub", "layers"); + +function lexiconFiles(dir) { + const out = []; + for (const entry of readdirSync(dir, { withFileTypes: true })) { + const full = join(dir, entry.name); + if (entry.isDirectory()) out.push(...lexiconFiles(full)); + else if (entry.name.endsWith(".json")) out.push(full); + } + return out.sort(); +} + +const lexicons = new Lexicons(); +for (const file of lexiconFiles(lexiconDir)) { + lexicons.add(JSON.parse(readFileSync(file, "utf8"))); +} + +let input = ""; +process.stdin.setEncoding("utf8"); +process.stdin.on("data", (chunk) => (input += chunk)); +process.stdin.on("end", () => { + const items = JSON.parse(input); + const results = items.map(({ lexUri, value }) => { + try { + const result = lexicons.validate(lexUri, value); + return result.success + ? { ok: true } + : { ok: false, lexUri, error: result.error?.message ?? "invalid" }; + } catch (err) { + return { ok: false, lexUri, error: String(err?.message ?? err) }; + } + }); + process.stdout.write(JSON.stringify(results)); +}); diff --git a/tests/interop/test_layers_coverage.py b/tests/interop/test_layers_coverage.py new file mode 100644 index 0000000..bd559d8 --- /dev/null +++ b/tests/interop/test_layers_coverage.py @@ -0,0 +1,73 @@ +"""Coverage guard: every targeted layers construct has a law-passing mapping. 
+ +If a new layers construct is mirrored, it must be registered (and round-trip +tested in the per-construct suites). This test fails loudly if a targeted +construct loses its registered iso. +""" + +from __future__ import annotations + +from bead.interop.layers.model_lenses import ALL_MIRROR_ISOS, MirrorIso + +# layers construct slug -> bead mirror model class name. +_EXPECTED: dict[str, str] = { + # pub.layers.defs shared objects + "uuid": "LayersUuid", + "feature": "Feature", + "featureMap": "FeatureMap", + "knowledgeRef": "KnowledgeRef", + "boundingBox": "BoundingBox", + "temporalSpan": "TemporalSpan", + "agentRef": "AgentRef", + "objectRef": "ObjectRef", + "span": "LayersSpan", + "tokenRef": "TokenRef", + "tokenRefSequence": "TokenRefSequence", + "keyframe": "Keyframe", + "spatioTemporalAnchor": "SpatioTemporalAnchor", + "temporalEntity": "TemporalEntity", + "temporalModifier": "TemporalModifier", + "temporalExpression": "TemporalExpression", + "spatialEntity": "SpatialEntity", + "spatialModifier": "SpatialModifier", + "spatialExpression": "SpatialExpression", + "pageAnchor": "PageAnchor", + "textQuoteSelector": "TextQuoteSelector", + "textPositionSelector": "TextPositionSelector", + "fragmentSelector": "FragmentSelector", + "externalTarget": "ExternalTarget", + "anchor": "Anchor", + "alignmentLink": "AlignmentLink", + "annotationMetadata": "AnnotationMetadata", + "constraint": "LayersConstraint", + # linguistic record types + "expression": "Expression", + "token": "Token", + "tokenization": "Tokenization", + "argumentRef": "ArgumentRef", + "annotation": "Annotation", + "cluster": "Cluster", + "annotationLayer": "AnnotationLayer", + "graphNode": "GraphNode", + "graphEdge": "GraphEdge", + "graphEdgeEntry": "GraphEdgeEntry", + "graphEdgeSet": "GraphEdgeSet", + "audioInfo": "AudioInfo", + "videoInfo": "VideoInfo", + "documentInfo": "DocumentInfo", + "roleSlot": "RoleSlot", + "typeDef": "TypeDef", +} + + +def test_every_targeted_construct_is_mapped() -> None: + 
mapped_model_names = {model_type.__name__ for model_type in ALL_MIRROR_ISOS} + missing = { + slug: name for slug, name in _EXPECTED.items() if name not in mapped_model_names + } + assert not missing, f"layers constructs without a mirror iso: {missing}" + + +def test_all_registrations_are_law_lenses() -> None: + assert ALL_MIRROR_ISOS + assert all(isinstance(iso, MirrorIso) for iso in ALL_MIRROR_ISOS.values()) diff --git a/tests/interop/test_layers_defs.py b/tests/interop/test_layers_defs.py new file mode 100644 index 0000000..27ef36f --- /dev/null +++ b/tests/interop/test_layers_defs.py @@ -0,0 +1,140 @@ +"""Round-trip law tests for the layers shared-def mirror isos.""" + +from __future__ import annotations + +import didactic.api as dx +import pytest +from didactic.lenses._testing import verify_iso +from hypothesis import strategies as st + +from bead.interop.layers import models as m +from bead.interop.layers.model_lenses import SHARED_DEF_ISOS, MirrorIso, mirror_iso + +_KR = m.KnowledgeRef(source="wikidata", identifier="Q5") +_UUID = m.LayersUuid(value="u1") +_BBOX = m.BoundingBox(x=1, y=2, width=3, height=4) +_FEATURES = m.FeatureMap(entries=(m.Feature(key="k", value="v"),)) + +# One representative instance per shared-def mirror model. +_EXAMPLES: tuple[dx.Model, ...] 
= ( + _UUID, + m.Feature(key="k", value="v"), + _FEATURES, + m.KnowledgeRef(source="wikidata", identifier="Q5", label="human"), + _BBOX, + m.TemporalSpan(start=0, ending=100), + m.AgentRef(did="did:plc:x", name="A", knowledge_ref=_KR), + m.ObjectRef(local_id=_UUID, knowledge_ref=_KR), + m.LayersSpan(byte_start=0, byte_end=5, char_start=0, char_end=5), + m.TokenRef(tokenization_id=_UUID, token_index=2), + m.TokenRefSequence( + tokenization_id=_UUID, token_indexes=(1, 2, 3), anchor_token_index=2 + ), + m.Keyframe(time_ms=10, bbox=_BBOX, features=_FEATURES), + m.SpatioTemporalAnchor( + temporal_span=m.TemporalSpan(start=0, ending=10), + keyframes=(m.Keyframe(time_ms=1, bbox=_BBOX),), + interpolation="linear", + ), + m.TemporalEntity(instant="2026-05-29", granularity="day", features=_FEATURES), + m.TemporalModifier(mod="approx"), + m.TemporalExpression( + type="date", + value=m.TemporalEntity(instant="2026-05-29"), + modifier=m.TemporalModifier(mod="approx"), + anchor_ref=m.ObjectRef(local_id=_UUID), + ), + m.SpatialEntity(geometry="POINT(0 0)", type="point", dimensions=2), + m.SpatialModifier(mod="near"), + m.SpatialExpression(type="loc", value=m.SpatialEntity(geometry="g")), + m.PageAnchor( + page=1, bounding_box=_BBOX, text_span=m.LayersSpan(byte_start=0, byte_end=2) + ), + m.TextQuoteSelector(exact="quote", prefix="a", suffix="b"), + m.TextPositionSelector(byte_start=0, byte_end=5), + m.FragmentSelector(value="#frag", conforms_to="https://example/spec"), + m.Selector(text_quote_selector=m.TextQuoteSelector(exact="q")), + m.ExternalTarget( + source="http://x", + title="t", + selector=m.Selector(fragment_selector=m.FragmentSelector(value="#f")), + ), + m.Anchor(token_ref=m.TokenRef(tokenization_id=_UUID, token_index=0)), + m.AlignmentLink( + source_indices=(0, 1), + target_indices=(2,), + confidence=900, + label="align", + knowledge_refs=(_KR,), + ), + m.AnnotationMetadata( + tool="spacy", + agent=m.AgentRef(name="A"), + timestamp="2026-05-29T00:00:00+00:00", + 
confidence=950, + dependencies=(m.ObjectRef(local_id=_UUID),), + ), + m.LayersConstraint( + expression="x>0", scope="token", context=("a", "b"), description="d" + ), +) + + +@pytest.mark.parametrize("example", _EXAMPLES, ids=lambda e: type(e).__name__) +def test_shared_def_roundtrip(example: dx.Model) -> None: + iso = SHARED_DEF_ISOS[type(example)] + view = iso.forward(example) + # GetPut: reconstruct exactly from the layers JSON. + assert iso.backward(view) == example + # PutGet: re-projection is stable. + assert iso.forward(iso.backward(view)) == view + + +def test_every_shared_def_has_a_law_passing_iso() -> None: + # Coverage guard: each example's type has a registered iso, and every + # registered iso is exercised by an example (no silent omission). + example_types = {type(example) for example in _EXAMPLES} + assert example_types == set(SHARED_DEF_ISOS) + + +def test_camelcase_projection() -> None: + view = mirror_iso(m.LayersSpan).forward( + m.LayersSpan(byte_start=1, byte_end=9, char_start=1, char_end=9) + ) + assert view == {"byteStart": 1, "byteEnd": 9, "charStart": 1, "charEnd": 9} + + +# --- didactic law verification on flat models ------------------------------- + + +def test_verify_iso_uuid() -> None: + iso: MirrorIso[m.LayersUuid] = mirror_iso(m.LayersUuid) + verify_iso(iso, st.builds(m.LayersUuid, value=st.text(max_size=8)), max_examples=30) + + +def test_verify_iso_bounding_box() -> None: + iso: MirrorIso[m.BoundingBox] = mirror_iso(m.BoundingBox) + ints = st.integers(0, 1000) + verify_iso( + iso, + st.builds(m.BoundingBox, x=ints, y=ints, width=ints, height=ints), + max_examples=30, + ) + + +def test_verify_iso_knowledge_ref() -> None: + iso: MirrorIso[m.KnowledgeRef] = mirror_iso(m.KnowledgeRef) + text = st.text(max_size=6) + opt = st.one_of(st.none(), text) + verify_iso( + iso, + st.builds( + m.KnowledgeRef, + source=text, + identifier=text, + source_uri=opt, + uri=opt, + label=opt, + ), + max_examples=30, + ) diff --git 
a/tests/interop/test_layers_graph_roundtrip.py b/tests/interop/test_layers_graph_roundtrip.py new file mode 100644 index 0000000..1e8a66e --- /dev/null +++ b/tests/interop/test_layers_graph_roundtrip.py @@ -0,0 +1,182 @@ +"""Round-trip law tests for the CorpusGraph <-> layers graph lens.""" + +from __future__ import annotations + +from hypothesis import HealthCheck, given, settings +from hypothesis import strategies as st + +from bead.corpus.assemble import EdgeSpec, assemble_graph +from bead.corpus.graph import CorpusEdge, CorpusGraph, CorpusNode +from bead.corpus.records import CorpusRecord +from bead.interop.layers.graph_lens import CORPUS_GRAPH_LAYERS, graph_to_layers + +LENS = CORPUS_GRAPH_LAYERS + + +def _assert_roundtrip(graph: CorpusGraph) -> None: + view, complement = LENS.forward(graph) + # GetPut: reconstructing from view + complement yields the original exactly. + assert LENS.backward(view, complement) == graph + # PutGet: re-projecting the reconstruction yields the same view + complement. 
+ view2, complement2 = LENS.forward(LENS.backward(view, complement)) + assert (view2, complement2) == (view, complement) + + +class TestExampleRoundTrips: + """Deterministic round-trips over representative graphs.""" + + def test_empty_graph(self) -> None: + _assert_roundtrip(CorpusGraph()) + + def test_reddit_thread(self) -> None: + records = [ + CorpusRecord(text="sub", source_name="r", provenance={"id": "sub"}), + CorpusRecord( + text="reply one", + source_name="r", + provenance={"id": "c1", "parent_id": "t3_sub", "score": 5}, + ), + CorpusRecord( + text="reply two", + source_name="r", + provenance={"id": "c2", "parent_id": "t1_c1"}, + ), + ] + graph = assemble_graph( + records, + node_id_field="id", + edge_specs=[ + EdgeSpec( + target_field="parent_id", + edge_type="reply-to", + strip_prefixes=("t1_", "t3_"), + ) + ], + ) + _assert_roundtrip(graph) + + def test_abstract_nodes_and_typed_multidigraph(self) -> None: + graph = CorpusGraph( + nodes=( + CorpusNode(node_id="a", node_type="entity", label="Alice"), + CorpusNode( + node_id="b", + node_type="concept", + node_type_uri="at://x#concept", + properties={"weight": 3, "tags": ("x", "y")}, + ), + ), + edges=( + CorpusEdge(source_id="a", target_id="b", edge_type="mentions"), + CorpusEdge( + source_id="a", + target_id="b", + edge_type="mentions", + edge_type_uri="at://x#mentions", + directed=False, + confidence=0.875, + features={"note": "parallel edge"}, + ), + ), + graph_metadata={"corpus": "demo"}, + ) + _assert_roundtrip(graph) + + def test_expression_node_preserves_provenance(self) -> None: + graph = CorpusGraph( + nodes=( + CorpusNode( + node_id="x", + record=CorpusRecord( + text="hello world", + source_name="src", + record_index=7, + provenance={"author": "a", "score": 2, "deleted": False}, + ), + label="kept", + properties={"k": "v"}, + ), + ), + ) + _assert_roundtrip(graph) + + def test_view_is_layers_shaped(self) -> None: + graph = CorpusGraph( + nodes=( + CorpusNode(node_id="x", 
record=CorpusRecord(text="t", source_name="s")), + ), + edges=(CorpusEdge(source_id="x", target_id="y", edge_type="e"),), + ) + view = graph_to_layers(graph) + assert set(view) == {"expressions", "graphNodes", "graphEdgeSet"} + edge = view["graphEdgeSet"]["edges"][0] + assert edge["edgeType"] == "e" + assert edge["source"] == {"localId": {"value": "x"}} + assert edge["target"] == {"localId": {"value": "y"}} + assert view["expressions"]["x"]["kind"] == "expression" + assert view["expressions"]["x"]["text"] == "t" + + +# --- property-based lens-law verification ----------------------------------- + +_scalar = st.one_of(st.text(max_size=6), st.integers(-50, 50), st.booleans(), st.none()) +_features = st.dictionaries( + st.text(alphabet="klm", min_size=1, max_size=3), _scalar, max_size=3 +) +_node_ids = st.lists( + st.text(alphabet="abcde", min_size=1, max_size=4), max_size=5, unique=True +) + + +@st.composite +def _graphs(draw: st.DrawFn) -> CorpusGraph: + ids = draw(_node_ids) + nodes: list[CorpusNode] = [] + for node_id in ids: + if draw(st.booleans()): + record = CorpusRecord( + text=draw(st.text(max_size=8)), + source_name=draw(st.text(max_size=4)), + record_index=draw(st.integers(0, 20)), + provenance=draw(_features), + ) + nodes.append( + CorpusNode(node_id=node_id, record=record, properties=draw(_features)) + ) + else: + nodes.append( + CorpusNode( + node_id=node_id, + node_type=draw(st.sampled_from(["entity", "concept"])), + label=draw(st.one_of(st.none(), st.text(max_size=5))), + properties=draw(_features), + ) + ) + endpoint = ( + st.sampled_from(ids) + if ids + else st.text(alphabet="abcde", min_size=1, max_size=4) + ) + edges: list[CorpusEdge] = [] + for _ in range(draw(st.integers(0, 4))): + edges.append( + CorpusEdge( + source_id=draw(endpoint), + target_id=draw(endpoint), + edge_type=draw(st.sampled_from(["e1", "e2"])), + directed=draw(st.booleans()), + confidence=draw(st.one_of(st.none(), st.floats(0.0, 1.0))), + features=draw(_features), + ) + ) + 
return CorpusGraph(nodes=tuple(nodes), edges=tuple(edges)) + + +class TestLensLaws: + """The didactic GetPut/PutGet laws hold across generated graphs.""" + + @settings(max_examples=60, suppress_health_check=[HealthCheck.too_slow]) + @given(_graphs()) + def test_get_put_law(self, graph: CorpusGraph) -> None: + view, complement = LENS.forward(graph) + assert LENS.backward(view, complement) == graph diff --git a/tests/interop/test_layers_lexicon_validation.py b/tests/interop/test_layers_lexicon_validation.py new file mode 100644 index 0000000..a68afdb --- /dev/null +++ b/tests/interop/test_layers_lexicon_validation.py @@ -0,0 +1,229 @@ +"""Validate bead's layers-mapping output against the layers lexicons. + +Uses the ATProto lexicon validation machinery (``@atproto/lexicon``) against the +layers lexicons vendored as the ``vendor/layers`` git submodule (checked out with +``git submodule update --init``) to prove every mapping produces schema-valid +layers records. The validator runs in Node; the suite skips if Node, the +validator dependency, or the submodule checkout is unavailable. +""" + +from __future__ import annotations + +import json +import shutil +import subprocess +from pathlib import Path + +import pytest + +from bead.corpus.graph import CorpusEdge, CorpusGraph, CorpusNode +from bead.corpus.records import CorpusRecord +from bead.interop.layers import models as m +from bead.interop.layers import models_records as r +from bead.interop.layers.bridges import RECORD_EXPRESSION +from bead.interop.layers.graph_lens import graph_to_layers +from bead.interop.layers.model_lenses import ALL_MIRROR_ISOS +from bead.interop.layers.parse_lens import parse_to_layers +from bead.tokenization.parsers import ParsedSentence, ParsedToken + +# Reuse the exact instances exercised by the round-trip suites. 
+from tests.interop.test_layers_defs import _EXAMPLES as _DEF_EXAMPLES +from tests.interop.test_layers_records import _EXAMPLES as _RECORD_EXAMPLES + +_VALIDATOR = Path(__file__).parent / "lexicon_validator" +_INSTALLED = _VALIDATOR / "node_modules" / "@atproto" / "lexicon" +_REPO_ROOT = Path(__file__).resolve().parents[2] +_LEXICON_DIR = _REPO_ROOT / "vendor" / "layers" / "lexicons" / "pub" / "layers" + +# Mirror model type -> the lexicon URI its JSON must validate against. +_LEX_URI: dict[type, str] = { + m.LayersUuid: "pub.layers.defs#uuid", + m.Feature: "pub.layers.defs#feature", + m.FeatureMap: "pub.layers.defs#featureMap", + m.KnowledgeRef: "pub.layers.defs#knowledgeRef", + m.BoundingBox: "pub.layers.defs#boundingBox", + m.TemporalSpan: "pub.layers.defs#temporalSpan", + m.AgentRef: "pub.layers.defs#agentRef", + m.ObjectRef: "pub.layers.defs#objectRef", + m.LayersSpan: "pub.layers.defs#span", + m.TokenRef: "pub.layers.defs#tokenRef", + m.TokenRefSequence: "pub.layers.defs#tokenRefSequence", + m.Keyframe: "pub.layers.defs#keyframe", + m.SpatioTemporalAnchor: "pub.layers.defs#spatioTemporalAnchor", + m.TemporalEntity: "pub.layers.defs#temporalEntity", + m.TemporalModifier: "pub.layers.defs#temporalModifier", + m.TemporalExpression: "pub.layers.defs#temporalExpression", + m.SpatialEntity: "pub.layers.defs#spatialEntity", + m.SpatialModifier: "pub.layers.defs#spatialModifier", + m.SpatialExpression: "pub.layers.defs#spatialExpression", + m.PageAnchor: "pub.layers.defs#pageAnchor", + m.TextQuoteSelector: "pub.layers.defs#textQuoteSelector", + m.TextPositionSelector: "pub.layers.defs#textPositionSelector", + m.FragmentSelector: "pub.layers.defs#fragmentSelector", + m.ExternalTarget: "pub.layers.defs#externalTarget", + m.Anchor: "pub.layers.defs#anchor", + m.AlignmentLink: "pub.layers.defs#alignmentLink", + m.AnnotationMetadata: "pub.layers.defs#annotationMetadata", + m.LayersConstraint: "pub.layers.defs#constraint", + r.Expression: 
"pub.layers.expression.expression", + r.Token: "pub.layers.segmentation.defs#token", + r.Tokenization: "pub.layers.segmentation.defs#tokenization", + r.ArgumentRef: "pub.layers.annotation.defs#argumentRef", + r.Annotation: "pub.layers.annotation.defs#annotation", + r.Cluster: "pub.layers.annotation.defs#cluster", + r.AnnotationLayer: "pub.layers.annotation.annotationLayer", + r.GraphNode: "pub.layers.graph.graphNode", + r.GraphEdge: "pub.layers.graph.graphEdge", + r.GraphEdgeEntry: "pub.layers.graph.defs#graphEdgeEntry", + r.GraphEdgeSet: "pub.layers.graph.graphEdgeSet", + r.AudioInfo: "pub.layers.media.defs#audioInfo", + r.VideoInfo: "pub.layers.media.defs#videoInfo", + r.DocumentInfo: "pub.layers.media.defs#documentInfo", + r.RoleSlot: "pub.layers.ontology.defs#roleSlot", + r.TypeDef: "pub.layers.ontology.typeDef", +} + + +@pytest.fixture(scope="module") +def validate_layers(): # noqa: ANN202 - returns an internal validator callable + """Provide a callable validating ``(lexUri, value)`` pairs via @atproto/lexicon.""" + if not _LEXICON_DIR.is_dir(): + pytest.skip( + "layers lexicons missing; run `git submodule update --init vendor/layers`" + ) + node = shutil.which("node") + if node is None: + pytest.skip("node is required for ATProto lexicon validation") + if not _INSTALLED.exists(): + npm = shutil.which("npm") + if npm is None: + pytest.skip("npm is required to install @atproto/lexicon") + proc = subprocess.run( + [npm, "install", "--no-audit", "--no-fund"], + cwd=_VALIDATOR, + capture_output=True, + text=True, + timeout=300, + ) + if proc.returncode != 0 or not _INSTALLED.exists(): + pytest.skip(f"could not install @atproto/lexicon: {proc.stderr[:200]}") + + def _validate(pairs: list[tuple[str, object]]) -> list[dict[str, object]]: + payload = json.dumps([{"lexUri": uri, "value": value} for uri, value in pairs]) + proc = subprocess.run( + [node, str(_VALIDATOR / "validate.mjs")], + input=payload, + capture_output=True, + text=True, + timeout=120, + ) + assert 
proc.returncode == 0, proc.stderr + return json.loads(proc.stdout) + + return _validate + + +def _failures(results, pairs): # noqa: ANN001, ANN202 + return [ + {"lexUri": uri, "error": res.get("error")} + for (uri, _value), res in zip(pairs, results, strict=True) + if not res["ok"] + ] + + +def test_all_mirror_models_validate(validate_layers) -> None: # noqa: ANN001 + pairs: list[tuple[str, object]] = [] + for example in (*_DEF_EXAMPLES, *_RECORD_EXAMPLES): + lex_uri = _LEX_URI.get(type(example)) + if lex_uri is None: # the Selector union has no standalone lexicon def + continue + pairs.append((lex_uri, ALL_MIRROR_ISOS[type(example)].forward(example))) + assert not _failures(validate_layers(pairs), pairs) + + +def _graph() -> CorpusGraph: + return CorpusGraph( + nodes=( + CorpusNode( + node_id="sub", record=CorpusRecord(text="submission", source_name="r") + ), + CorpusNode(node_id="alice", node_type="entity", label="Alice"), + ), + edges=( + CorpusEdge( + source_id="sub", + target_id="alice", + edge_type="authored-by", + confidence=0.9, + ), + ), + ) + + +def test_graph_bridge_outputs_validate(validate_layers) -> None: # noqa: ANN001 + view = graph_to_layers(_graph()) + assert isinstance(view, dict) + expressions = view["expressions"] + graph_nodes = view["graphNodes"] + assert isinstance(expressions, dict) and isinstance(graph_nodes, dict) + pairs: list[tuple[str, object]] = [] + for expression in expressions.values(): + pairs.append(("pub.layers.expression.expression", expression)) + for graph_node in graph_nodes.values(): + pairs.append(("pub.layers.graph.graphNode", graph_node)) + pairs.append(("pub.layers.graph.graphEdgeSet", view["graphEdgeSet"])) + assert not _failures(validate_layers(pairs), pairs) + + +def test_record_bridge_output_validates(validate_layers) -> None: # noqa: ANN001 + view, _complement = RECORD_EXPRESSION.forward( + CorpusRecord(text="hello", source_name="s", provenance={"author": "a"}) + ) + pairs = [("pub.layers.expression.expression", 
view)] + assert not _failures(validate_layers(pairs), pairs) + + +def test_parse_bridge_content_validates(validate_layers) -> None: # noqa: ANN001 + sentence = ParsedSentence( + original_text="dogs bark", + tokens=( + ParsedToken( + index=0, + text="dogs", + upos="NOUN", + deprel="nsubj", + head=1, + start_char=0, + end_char=4, + ), + ParsedToken( + index=1, + text="bark", + upos="VERB", + deprel="root", + head=None, + start_char=5, + end_char=9, + ), + ), + ) + view = parse_to_layers(sentence) + assert isinstance(view, dict) + tokenization = view["tokenization"] + pos_layer = view["posLayer"] + dep_layer = view["dependencyLayer"] + assert isinstance(tokenization, dict) + assert isinstance(pos_layer, dict) and isinstance(dep_layer, dict) + pairs: list[tuple[str, object]] = [ + ("pub.layers.segmentation.defs#tokenization", tokenization) + ] + tokens = tokenization["tokens"] + assert isinstance(tokens, tuple) + for token in tokens: + pairs.append(("pub.layers.segmentation.defs#token", token)) + for layer in (pos_layer, dep_layer): + annotations = layer["annotations"] + assert isinstance(annotations, tuple) + for annotation in annotations: + pairs.append(("pub.layers.annotation.defs#annotation", annotation)) + assert not _failures(validate_layers(pairs), pairs) diff --git a/tests/interop/test_layers_parse_iso.py b/tests/interop/test_layers_parse_iso.py new file mode 100644 index 0000000..2d43cb5 --- /dev/null +++ b/tests/interop/test_layers_parse_iso.py @@ -0,0 +1,157 @@ +"""Round-trip law tests for the ParsedSentence <-> layers annotation iso.""" + +from __future__ import annotations + +from hypothesis import given +from hypothesis import strategies as st + +from bead.interop.layers.parse_lens import PARSED_SENTENCE_LAYERS, parse_to_layers +from bead.tokenization.parsers import ParsedSentence, ParsedToken + +ISO = PARSED_SENTENCE_LAYERS + + +def _assert_roundtrip(sentence: ParsedSentence) -> None: + view = ISO.forward(sentence) + assert ISO.backward(view) == sentence 
+ # PutGet: re-projecting the reconstruction yields the same view. + assert ISO.forward(ISO.backward(view)) == view + + +def _known_sentence() -> ParsedSentence: + return ParsedSentence( + original_text="The dog chased the cat", + tokens=( + ParsedToken( + index=0, + text="The", + lemma="the", + upos="DET", + xpos="DT", + deprel="det", + head=1, + start_char=0, + end_char=3, + ), + ParsedToken( + index=1, + text="dog", + lemma="dog", + upos="NOUN", + xpos="NN", + deprel="nsubj", + head=2, + morph={"Number": "Sing"}, + start_char=4, + end_char=7, + ), + ParsedToken( + index=2, + text="chased", + lemma="chase", + upos="VERB", + xpos="VBD", + deprel="root", + head=None, + morph={"Tense": "Past"}, + start_char=8, + end_char=14, + ), + ParsedToken( + index=3, + text="the", + lemma="the", + upos="DET", + xpos="DT", + deprel="det", + head=4, + start_char=15, + end_char=18, + ), + ParsedToken( + index=4, + text="cat", + lemma="cat", + upos="NOUN", + xpos="NN", + deprel="obj", + head=2, + start_char=19, + end_char=22, + ), + ), + ) + + +class TestExampleRoundTrips: + """Deterministic round-trips over representative parses.""" + + def test_full_parse(self) -> None: + _assert_roundtrip(_known_sentence()) + + def test_root_head_minus_one(self) -> None: + view = parse_to_layers(_known_sentence()) + # the root token (index 2) is encoded with headIndex -1 + dep = view["dependencyLayer"]["annotations"][2] + assert dep["headIndex"] == -1 + assert dep["label"] == "root" + + def test_view_is_layers_shaped(self) -> None: + view = parse_to_layers(_known_sentence()) + assert set(view) == { + "originalText", + "tokenization", + "posLayer", + "dependencyLayer", + } + assert view["posLayer"]["subkind"] == "pos" + assert view["dependencyLayer"]["subkind"] == "dependency" + assert view["tokenization"]["tokens"][0]["textSpan"] == { + "byteStart": 0, + "byteEnd": 3, + "charStart": 0, + "charEnd": 3, + } + + def test_missing_optionals(self) -> None: + _assert_roundtrip( + ParsedSentence( + 
original_text="x", + tokens=(ParsedToken(index=0, text="x", start_char=0, end_char=1),), + ) + ) + + +_morph = st.dictionaries( + st.text(alphabet="AB", min_size=1, max_size=2), + st.text(alphabet="xy", min_size=1, max_size=2), + max_size=2, +) +_opt = st.one_of(st.none(), st.text(alphabet="pq", min_size=1, max_size=3)) + + +@st.composite +def _sentences(draw: st.DrawFn) -> ParsedSentence: + n = draw(st.integers(0, 5)) + tokens = tuple( + ParsedToken( + index=i, + text=draw(st.text(max_size=5)), + lemma=draw(_opt), + upos=draw(_opt), + xpos=draw(_opt), + deprel=draw(_opt), + head=draw(st.one_of(st.none(), st.integers(0, max(n - 1, 0)))), + morph=draw(_morph), + space_after=draw(st.booleans()), + start_char=draw(st.integers(0, 50)), + end_char=draw(st.integers(0, 50)), + ) + for i in range(n) + ) + return ParsedSentence(original_text=draw(st.text(max_size=20)), tokens=tokens) + + +@given(_sentences()) +def test_iso_round_trip_law(sentence: ParsedSentence) -> None: + assert ISO.backward(ISO.forward(sentence)) == sentence diff --git a/tests/interop/test_layers_record_bridge.py b/tests/interop/test_layers_record_bridge.py new file mode 100644 index 0000000..dea403b --- /dev/null +++ b/tests/interop/test_layers_record_bridge.py @@ -0,0 +1,70 @@ +"""Round-trip law tests for the CorpusRecord <-> layers expression lens.""" + +from __future__ import annotations + +from hypothesis import given +from hypothesis import strategies as st + +from bead.corpus.records import CorpusRecord +from bead.interop.layers.bridges import RECORD_EXPRESSION, record_to_expression + +LENS = RECORD_EXPRESSION + + +def _assert_roundtrip(record: CorpusRecord) -> None: + view, complement = LENS.forward(record) + assert LENS.backward(view, complement) == record + view2, complement2 = LENS.forward(LENS.backward(view, complement)) + assert (view2, complement2) == (view, complement) + + +class TestExampleRoundTrips: + """Deterministic round-trips over representative records.""" + + def 
test_minimal(self) -> None: + _assert_roundtrip(CorpusRecord(text="hello", source_name="s")) + + def test_with_scalar_provenance(self) -> None: + _assert_roundtrip( + CorpusRecord( + text="a reply", + source_name="reddit", + record_index=3, + provenance={"author": "alice", "score": 5, "deleted": False}, + ) + ) + + def test_view_is_layers_expression(self) -> None: + view = record_to_expression( + CorpusRecord(text="hi", source_name="s", provenance={"k": "v"}) + ) + assert view["kind"] == "expression" + assert view["text"] == "hi" + assert view["features"]["entries"][0] == {"key": "k", "value": '"v"'} + + +_scalar = st.one_of(st.text(max_size=6), st.integers(-50, 50), st.booleans(), st.none()) + + +@given( + text=st.text(max_size=20), + source_name=st.text(max_size=8), + record_index=st.integers(0, 1000), + provenance=st.dictionaries( + st.text(alphabet="abc", min_size=1, max_size=3), _scalar, max_size=4 + ), +) +def test_get_put_law( + text: str, + source_name: str, + record_index: int, + provenance: dict[str, str | int | bool | None], +) -> None: + record = CorpusRecord( + text=text, + source_name=source_name, + record_index=record_index, + provenance=provenance, + ) + view, complement = LENS.forward(record) + assert LENS.backward(view, complement) == record diff --git a/tests/interop/test_layers_records.py b/tests/interop/test_layers_records.py new file mode 100644 index 0000000..bc55da9 --- /dev/null +++ b/tests/interop/test_layers_records.py @@ -0,0 +1,134 @@ +"""Round-trip law tests for the layers record mirror isos.""" + +from __future__ import annotations + +import didactic.api as dx +import pytest + +from bead.interop.layers import models as m +from bead.interop.layers import models_records as r +from bead.interop.layers.model_lenses import RECORD_ISOS + +_UUID = m.LayersUuid(value="u1") +_KR = m.KnowledgeRef(source="wikidata", identifier="Q5") +_REF = m.ObjectRef(local_id=_UUID) +_META = m.AnnotationMetadata(tool="spacy", 
timestamp="2026-05-29T00:00:00+00:00") +_NOW = "2026-05-29T00:00:00+00:00" + +# One representative instance per record mirror model. +_EXAMPLES: tuple[dx.Model, ...] = ( + r.Expression( + id="doc1", + kind="document", + created_at=_NOW, + text="Hello world.", + anchor=m.Anchor(text_span=m.LayersSpan(byte_start=0, byte_end=12)), + metadata=_META, + features=m.FeatureMap(entries=(m.Feature(key="lang", value="en"),)), + knowledge_refs=(_KR,), + languages=("en",), + ), + r.Token( + token_index=0, text="Hello", text_span=m.LayersSpan(byte_start=0, byte_end=5) + ), + r.Tokenization( + uuid=_UUID, + kind="penn-treebank", + tokens=(r.Token(token_index=0, text="Hi"),), + metadata=_META, + ), + r.ArgumentRef(role="ARG0", target=_REF), + r.Annotation( + uuid=_UUID, + token_index=2, + label="nsubj", + head_index=3, + arguments=(r.ArgumentRef(role="ARG0", target=_REF),), + confidence=900, + knowledge_refs=(_KR,), + temporal=m.TemporalExpression(type="date"), + ), + r.Cluster(uuid=_UUID, canonical_label="Alice", members=(_REF,)), + r.AnnotationLayer( + expression="at://did:plc:abc123/pub.layers.expression.expression/self", + kind="relation", + subkind="dependency", + formalism="universal-dependencies", + created_at=_NOW, + tokenization_id=_UUID, + annotations=(r.Annotation(uuid=_UUID, token_index=0, label="root"),), + metadata=_META, + ), + r.GraphNode( + node_type="entity", + created_at=_NOW, + label="Alice", + properties=m.FeatureMap(entries=(m.Feature(key="k", value="v"),)), + knowledge_refs=(_KR,), + ), + r.GraphEdge( + source=_REF, + target=_REF, + edge_type="coreference", + created_at=_NOW, + ordinal=1, + confidence=800, + ), + r.GraphEdgeEntry(uuid=_UUID, edge_type="reply-to", source=_REF, target=_REF), + r.GraphEdgeSet( + created_at=_NOW, + edges=( + r.GraphEdgeEntry( + uuid=_UUID, edge_type="reply-to", source=_REF, target=_REF + ), + ), + expression="at://did:plc:abc123/pub.layers.expression.expression/self", + ), + r.AudioInfo(sample_rate=44100, channels=2, 
codec="pcm"), + r.VideoInfo(width=1920, height=1080, frame_rate=30, codec="h264"), + r.DocumentInfo(dpi=300, page_count=12, writing_direction="ltr"), + r.RoleSlot( + role_name="Agent", + filler_type_refs=("at://did:plc:abc123/pub.layers.ontology.typeDef/agent",), + required=True, + constraints=(m.LayersConstraint(expression="x>0"),), + ), + r.TypeDef( + ontology_ref="at://did:plc:abc123/pub.layers.ontology.ontology/self", + name="give", + created_at=_NOW, + type_kind="frame", + allowed_roles=(r.RoleSlot(role_name="Agent"),), + allowed_values=("a", "b"), + ), +) + + +@pytest.mark.parametrize("example", _EXAMPLES, ids=lambda e: type(e).__name__) +def test_record_roundtrip(example: dx.Model) -> None: + iso = RECORD_ISOS[type(example)] + view = iso.forward(example) + assert iso.backward(view) == example + assert iso.forward(iso.backward(view)) == view + + +def test_every_record_has_a_law_passing_iso() -> None: + example_types = {type(example) for example in _EXAMPLES} + assert example_types == set(RECORD_ISOS) + + +def test_annotation_layer_is_camelcased() -> None: + iso = RECORD_ISOS[r.AnnotationLayer] + expression_uri = "at://did:plc:abc123/pub.layers.expression.expression/self" + view = iso.forward( + r.AnnotationLayer( + expression=expression_uri, + kind="relation", + subkind="dependency", + created_at=_NOW, + ) + ) + assert isinstance(view, dict) + assert view["expression"] == expression_uri + assert view["subkind"] == "dependency" + assert "createdAt" in view diff --git a/tests/interop/test_layers_resource.py b/tests/interop/test_layers_resource.py new file mode 100644 index 0000000..0b24f18 --- /dev/null +++ b/tests/interop/test_layers_resource.py @@ -0,0 +1,107 @@ +"""Round-trip law tests for the resource overlap lenses.""" + +from __future__ import annotations + +from bead.interop.layers.resource_lens import ( + LEXICAL_ITEM_ENTRY, + LEXICON_COLLECTION, + TEMPLATE_LAYERS, +) +from bead.resources.constraints import Constraint +from bead.resources.lexical_item 
import LexicalItem +from bead.resources.lexicon import Lexicon +from bead.resources.template import Slot, Template + + +class TestLexicalItemEntry: + """LexicalItem <-> layers entry.""" + + def test_full(self) -> None: + item = LexicalItem( + lemma="run", + language_code="eng", + form="ran", + features={"pos": "VERB", "tense": "past"}, + source="UniMorph", + ) + view, complement = LEXICAL_ITEM_ENTRY.forward(item) + assert view["form"] == "ran" + assert view["lemma"] == "run" + assert LEXICAL_ITEM_ENTRY.backward(view, complement) == item + + def test_form_defaults_to_lemma_in_view(self) -> None: + item = LexicalItem(lemma="dog", language_code="eng") + view, complement = LEXICAL_ITEM_ENTRY.forward(item) + assert view["form"] == "dog" # faithful entry.form + # but the original None form is recovered exactly + restored = LEXICAL_ITEM_ENTRY.backward(view, complement) + assert restored.form is None + assert restored == item + + +class TestLexiconCollection: + """Lexicon <-> layers collection + entries.""" + + def test_roundtrip(self) -> None: + lexicon = Lexicon( + name="verbs", + description="motion verbs", + language_code="eng", + items=( + LexicalItem(lemma="run", language_code="eng", features={"pos": "VERB"}), + LexicalItem(lemma="walk", language_code="eng", form="walked"), + ), + tags=("motion", "manner"), + ) + view, complement = LEXICON_COLLECTION.forward(lexicon) + assert view["collection"]["kind"] == "lexicon" + assert len(view["entries"]) == 2 + assert LEXICON_COLLECTION.backward(view, complement) == lexicon + + def test_empty(self) -> None: + lexicon = Lexicon(name="empty") + view, complement = LEXICON_COLLECTION.forward(lexicon) + assert LEXICON_COLLECTION.backward(view, complement) == lexicon + + +class TestTemplateLayers: + """Template <-> layers template (with slots and constraints).""" + + def test_roundtrip(self) -> None: + template = Template( + name="transitive", + template_string="The {subj} {verb} the {obj}.", + slots={ + "subj": Slot(name="subj", 
required=True), + "verb": Slot( + name="verb", + description="a transitive verb", + constraints=( + Constraint(expression="self.pos == 'VERB'", description="verb"), + ), + ), + "obj": Slot(name="obj", default_value="ball"), + }, + constraints=( + Constraint( + expression="subj.number == obj.number", + context={"strict": True}, + ), + ), + description="a 2-argument frame", + language_code="eng", + tags=("syntax",), + metadata={"source": "manual"}, + ) + view, complement = TEMPLATE_LAYERS.forward(template) + assert view["text"] == "The {subj} {verb} the {obj}." + assert set(view["slots"]) == {"subj", "verb", "obj"} + assert view["slots"]["verb"]["constraints"][0]["expression"] == ( + "self.pos == 'VERB'" + ) + assert TEMPLATE_LAYERS.backward(view, complement) == template + + def test_minimal(self) -> None: + template = Template(name="t", template_string="{x}") + view, complement = TEMPLATE_LAYERS.forward(template) + assert TEMPLATE_LAYERS.backward(view, complement) == template diff --git a/tests/test_api_docs.py b/tests/test_api_docs.py index ed7d11c..da8ed7e 100644 --- a/tests/test_api_docs.py +++ b/tests/test_api_docs.py @@ -4,6 +4,7 @@ """ import os +import shutil import sys from pathlib import Path @@ -34,8 +35,6 @@ def setup_test_environment(): 3. Adds gallery to sys.path for imports 4. Cleans up after all tests complete """ - import shutil # noqa: PLC0415 - # Add gallery to sys.path so we can import utils if str(GALLERY_DIR) not in sys.path: sys.path.insert(0, str(GALLERY_DIR)) @@ -97,6 +96,23 @@ def test_api_docs_code_blocks( ): pytest.skip("Glazing data not available (run 'glazing download' first)") + # Skip examples that require optional NLP parser models (spaCy/Stanza) or + # external model APIs (OpenAI/Anthropic) - these resources are not available + # in CI, like glazing data above. 
+ optional_backend_indicators = [ + "StanzaParser", + "SpacyParser", + "create_parser", + "sample_corpus", + "parse_records", + "filter_by_structure", + "CompletionCorpusSource", + "OpenAIAdapter", + "AnthropicAdapter", + ] + if any(ind in example.source for ind in optional_backend_indicators): + pytest.skip("Requires an optional NLP parser model or model API") + # Ignore D100 (module docstrings), D102 (method docstrings), F821 (undefined), # F401 (unused imports), E402 (imports not at top), I001 (import sorting) - # isolated documentation snippets showing specific concepts, not complete scripts diff --git a/tests/tokenization/test_parsers.py b/tests/tokenization/test_parsers.py new file mode 100644 index 0000000..886a951 --- /dev/null +++ b/tests/tokenization/test_parsers.py @@ -0,0 +1,227 @@ +"""Tests for dependency parsing and span projection.""" + +from __future__ import annotations + +import pytest + +from bead.tokenization.config import TokenizerConfig +from bead.tokenization.parsers import ( + UNIVERSAL_DEPENDENCIES, + ParsedSentence, + ParsedToken, + StanzaParser, + _parse_feats, + create_parser, + parse_to_spans, +) + + +def _known_sentence() -> ParsedSentence: + """A hand-built parse of 'The dog chased the cat' (UD-style).""" + return ParsedSentence( + original_text="The dog chased the cat", + tokens=( + ParsedToken( + index=0, + text="The", + lemma="the", + upos="DET", + xpos="DT", + deprel="det", + head=1, + start_char=0, + end_char=3, + ), + ParsedToken( + index=1, + text="dog", + lemma="dog", + upos="NOUN", + xpos="NN", + deprel="nsubj", + head=2, + morph={"Number": "Sing"}, + start_char=4, + end_char=7, + ), + ParsedToken( + index=2, + text="chased", + lemma="chase", + upos="VERB", + xpos="VBD", + deprel="root", + head=None, + morph={"Tense": "Past"}, + start_char=8, + end_char=14, + ), + ParsedToken( + index=3, + text="the", + lemma="the", + upos="DET", + xpos="DT", + deprel="det", + head=4, + start_char=15, + end_char=18, + ), + ParsedToken( + 
index=4, + text="cat", + lemma="cat", + upos="NOUN", + xpos="NN", + deprel="obj", + head=2, + morph={"Number": "Sing"}, + start_char=19, + end_char=22, + ), + ), + ) + + +class TestParseFeats: + """Tests for CoNLL-U feature parsing.""" + + def test_empty(self) -> None: + assert _parse_feats(None) == {} + assert _parse_feats("_") == {} + + def test_parse(self) -> None: + assert _parse_feats("Number=Sing|Tense=Past") == { + "Number": "Sing", + "Tense": "Past", + } + + def test_skips_malformed(self) -> None: + assert _parse_feats("Number=Sing|garbage") == {"Number": "Sing"} + + +class TestParseToSpans: + """Tests for projecting a parse onto spans and relations.""" + + def test_one_token_span_per_token(self) -> None: + spans, _ = parse_to_spans( + _known_sentence(), tokenization_id="tok-1", tool="test" + ) + assert len(spans) == 5 + assert all(s.span_type == "token" for s in spans) + assert all(len(s.segments) == 1 for s in spans) + assert [s.segments[0].indices[0] for s in spans] == [0, 1, 2, 3, 4] + + def test_span_ids_and_metadata(self) -> None: + spans, _ = parse_to_spans( + _known_sentence(), + element_name="text", + tokenization_id="tok-1", + tool="stanza", + ) + chased = spans[2] + assert chased.span_id == "text:tok:2" + assert chased.head_index is None # root + assert chased.label is not None + assert chased.label.label == "VERB" + assert chased.span_metadata["upos"] == "VERB" + assert chased.span_metadata["xpos"] == "VBD" + assert chased.span_metadata["lemma"] == "chase" + assert chased.span_metadata["deprel"] == "root" + assert chased.span_metadata["formalism"] == UNIVERSAL_DEPENDENCIES + assert chased.span_metadata["tool"] == "stanza" + assert chased.span_metadata["tokenization_id"] == "tok-1" + assert chased.span_metadata["morph"] == {"Tense": "Past"} + assert chased.span_metadata["start_char"] == 8 + assert chased.span_metadata["end_char"] == 14 + + def test_head_index_is_governor(self) -> None: + spans, _ = parse_to_spans( + _known_sentence(), 
tokenization_id="tok-1", tool="test" + ) + # token 0 ("The") is governed by token 1 ("dog") + assert spans[0].head_index == 1 + # token 1 ("dog") is governed by token 2 ("chased") + assert spans[1].head_index == 2 + + def test_relations_are_head_to_dependent(self) -> None: + _, relations = parse_to_spans( + _known_sentence(), + element_name="text", + tokenization_id="tok-1", + tool="test", + ) + # 4 arcs (every token except the root) + assert len(relations) == 4 + arcs = { + (r.source_span_id, r.target_span_id): (r.label.label if r.label else None) + for r in relations + } + # head ("chased" = tok:2) -> dependent ("dog" = tok:1), labeled nsubj + assert arcs[("text:tok:2", "text:tok:1")] == "nsubj" + assert arcs[("text:tok:2", "text:tok:4")] == "obj" + assert arcs[("text:tok:1", "text:tok:0")] == "det" + assert arcs[("text:tok:4", "text:tok:3")] == "det" + assert all(r.directed for r in relations) + + def test_root_has_no_relation(self) -> None: + _, relations = parse_to_spans( + _known_sentence(), tokenization_id="tok-1", tool="test" + ) + targets = {r.target_span_id for r in relations} + assert "text:tok:2" not in targets # root is never a dependent + + +class TestCreateParser: + """Tests for parser construction.""" + + def test_whitespace_cannot_parse(self) -> None: + with pytest.raises(ValueError, match="cannot produce a dependency parse"): + create_parser(TokenizerConfig(backend="whitespace")) + + def test_spacy_and_stanza_construct(self) -> None: + # Construction is lazy; no model is loaded here. + assert create_parser(TokenizerConfig(backend="spacy")) is not None + assert create_parser(TokenizerConfig(backend="stanza")) is not None + + +def _require_stanza_en() -> None: + """Skip only if Stanza or its English model cannot be obtained. + + Once the model is present, callers run the real parse so genuine parse or + projection bugs surface as failures rather than being skipped. 
+ """ + stanza = pytest.importorskip("stanza") + try: + stanza.download("en", processors="tokenize,pos,lemma,depparse", verbose=False) + except Exception as exc: # pragma: no cover - network dependent + pytest.skip(f"Stanza English model unavailable (no network?): {exc}") + + +class TestStanzaParserIntegration: + """End-to-end parse via a real Stanza model (not skipped when available).""" + + def test_parse_transitive_sentence(self) -> None: + _require_stanza_en() + # Real parse; errors here are genuine failures, not skips. + sentences = StanzaParser(language="en")("The dog chased the cat.") + + assert len(sentences) == 1 + tokens = sentences[0].tokens + roots = [t for t in tokens if t.head is None] + assert len(roots) == 1 + assert roots[0].upos == "VERB" + assert roots[0].lemma == "chase" + obj = [t for t in tokens if t.deprel == "obj" and t.head == roots[0].index] + assert obj, "expected an object dependent of the root verb" + + def test_parse_projects_to_spans(self) -> None: + _require_stanza_en() + sentences = StanzaParser(language="en")("The dog chased the cat.") + spans, relations = parse_to_spans( + sentences[0], tokenization_id="tok-1", tool="stanza" + ) + assert len(spans) == len(sentences[0].tokens) + # exactly one root (no incoming arc); every other token has one + assert len(relations) == len(spans) - 1 + assert all(s.span_metadata["tool"] == "stanza" for s in spans) diff --git a/tests/transforms/test_text.py b/tests/transforms/test_text.py index ef904fe..4fabdd6 100644 --- a/tests/transforms/test_text.py +++ b/tests/transforms/test_text.py @@ -2,12 +2,16 @@ from __future__ import annotations +from bead.tokenization.config import TokenizerConfig from bead.transforms.base import TransformContext from bead.transforms.text import ( CapitalizeTransform, LowerTransform, + MarkdownStripTransform, + RedditCleanupTransform, TitleTransform, UpperTransform, + split_sentences, ) @@ -52,3 +56,63 @@ def test_basic(self) -> None: def test_already_title(self) -> 
None: assert TitleTransform()("Hello World", TransformContext()) == "Hello World" + + +class TestMarkdownStripTransform: + """Tests for MarkdownStripTransform.""" + + def test_link(self) -> None: + out = MarkdownStripTransform()("see [the docs](http://x)", TransformContext()) + assert out == "see the docs" + + def test_emphasis(self) -> None: + out = MarkdownStripTransform()("**bold** and *italic*", TransformContext()) + assert out == "bold and italic" + + def test_inline_code_and_heading(self) -> None: + out = MarkdownStripTransform()("# Title `code`", TransformContext()) + assert out == "Title code" + + def test_blockquote(self) -> None: + out = MarkdownStripTransform()("> quoted text", TransformContext()) + assert out == "quoted text" + + +class TestRedditCleanupTransform: + """Tests for RedditCleanupTransform.""" + + def test_unescape_and_markdown(self) -> None: + out = RedditCleanupTransform()( + "see [here](http://x) & more", TransformContext() + ) + assert out == "see here & more" + + def test_removes_url_and_deleted(self) -> None: + out = RedditCleanupTransform()( + "check https://example.com [deleted]", TransformContext() + ) + assert out == "check" + + def test_collapses_whitespace(self) -> None: + out = RedditCleanupTransform()("a b\tc", TransformContext()) + assert out == "a b c" + + +class TestSplitSentences: + """Tests for split_sentences.""" + + def test_regex_fallback(self) -> None: + result = split_sentences("Hello world. How are you? Fine!") + assert result == ("Hello world.", "How are you?", "Fine!") + + def test_single_sentence(self) -> None: + assert split_sentences("Just one sentence") == ("Just one sentence",) + + def test_empty(self) -> None: + assert split_sentences("") == () + + def test_whitespace_backend_uses_fallback(self) -> None: + result = split_sentences( + "One. 
Two.", tokenizer_config=TokenizerConfig(backend="whitespace") + ) + assert result == ("One.", "Two.") diff --git a/uv.lock b/uv.lock index da0e0c8..d17eae4 100644 --- a/uv.lock +++ b/uv.lock @@ -157,7 +157,7 @@ wheels = [ [[package]] name = "bead" -version = "0.5.0" +version = "0.6.0" source = { editable = "." } dependencies = [ { name = "accelerate" }, @@ -198,7 +198,11 @@ api = [ behavioral-analysis = [ { name = "slopit" }, ] +corpus = [ + { name = "zstandard" }, +] dev = [ + { name = "hypothesis" }, { name = "pandas-stubs" }, { name = "pyright" }, { name = "pytest" }, @@ -210,6 +214,7 @@ dev = [ { name = "ruff" }, { name = "spacy" }, { name = "stanza" }, + { name = "zstandard" }, ] stats = [ { name = "statsmodels" }, @@ -226,16 +231,22 @@ ui = [ { name = "textual" }, ] +[package.dev-dependencies] +dev = [ + { name = "hypothesis" }, +] + [package.metadata] requires-dist = [ { name = "accelerate", specifier = ">=0.25.0" }, { name = "anthropic", marker = "extra == 'api'", specifier = ">=0.8.0" }, { name = "click", specifier = ">=8.0.0" }, { name = "datasets", specifier = ">=2.14.0" }, - { name = "didactic", specifier = ">=0.6.2" }, + { name = "didactic", specifier = ">=0.7.2" }, { name = "evaluate", specifier = ">=0.4.0" }, { name = "glazing", specifier = ">=0.2.0" }, { name = "google-generativeai", marker = "extra == 'api'", specifier = ">=0.3.0" }, + { name = "hypothesis", marker = "extra == 'dev'", specifier = ">=6.155.0" }, { name = "jinja2", specifier = ">=3.0.0" }, { name = "krippendorff", specifier = ">=0.6.0" }, { name = "langcodes", specifier = ">=3.3.0" }, @@ -243,7 +254,7 @@ requires-dist = [ { name = "openai", marker = "extra == 'api'", specifier = ">=1.0.0" }, { name = "pandas", specifier = ">=2.0.0" }, { name = "pandas-stubs", marker = "extra == 'dev'", specifier = ">=2.0.0" }, - { name = "panproto", specifier = ">=0.43" }, + { name = "panproto", specifier = ">=0.51.0" }, { name = "peft", specifier = ">=0.6.0" }, { name = "polars", specifier = 
">=0.19.0" }, { name = "prompt-toolkit", specifier = ">=3.0.0" }, @@ -276,8 +287,13 @@ requires-dist = [ { name = "typer", specifier = ">=0.9.0" }, { name = "unimorph", specifier = ">=0.0.4" }, { name = "uuid-utils", specifier = ">=0.7.0" }, + { name = "zstandard", marker = "extra == 'corpus'", specifier = ">=0.22" }, + { name = "zstandard", marker = "extra == 'dev'", specifier = ">=0.22" }, ] -provides-extras = ["dev", "api", "training", "stats", "ui", "behavioral-analysis", "tokenization"] +provides-extras = ["dev", "api", "training", "stats", "ui", "behavioral-analysis", "tokenization", "corpus"] + +[package.metadata.requires-dev] +dev = [{ name = "hypothesis", specifier = ">=6.155.0" }] [[package]] name = "black" @@ -511,15 +527,15 @@ wheels = [ [[package]] name = "didactic" -version = "0.6.2" +version = "0.7.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, { name = "panproto" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/82/49/f20c2d920359c35a3196af220bd97e87e81d4fe4a93b1c604d5a14f4ae88/didactic-0.6.2.tar.gz", hash = "sha256:e782eeae17b03b027f6119dafcaeef7224c23468e255a7b0a487f9b437b92cb4", size = 108463, upload-time = "2026-05-06T20:03:47.514Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/e1/3694b7de53f9a09ee4a76f8496523362ffc25f913b0a958e4975452f22a5/didactic-0.7.2.tar.gz", hash = "sha256:279e4495908635f7facb41295fcb8122c7655cb85e758ece5453870132d5975b", size = 111030, upload-time = "2026-05-19T16:28:24.559Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/77/95/3f1e20bb65e78fea6d936ac94c79907bf36c28bf5c332b7e60b88546e124/didactic-0.6.2-py3-none-any.whl", hash = "sha256:34ef2e4df0b938ee7fbd4b352903b85dd3a959b34c2ee3e2b987773426dd2dfb", size = 134154, upload-time = "2026-05-06T20:03:45.934Z" }, + { url = "https://files.pythonhosted.org/packages/ef/c5/2db5aa15b3f83f4590273fae274f9f80e98e185eae7e9054a0a08a6b6ca8/didactic-0.7.2-py3-none-any.whl", hash = 
"sha256:8c314c7308d7cb15efe7382569c80d3999565a2ce26fbe6c24be070e069754c3", size = 136610, upload-time = "2026-05-19T16:28:22.881Z" }, ] [[package]] @@ -898,6 +914,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" }, ] +[[package]] +name = "hypothesis" +version = "6.155.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sortedcontainers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/86/7d/9569717766867495510712eba388f7ca0633549f9ff4d3c34398b919e5b4/hypothesis-6.155.0.tar.gz", hash = "sha256:cf09ac913b60b49750585a53152704468de666f35c9c29f8e61d82a01f64bbb5", size = 476704, upload-time = "2026-05-28T15:43:24.193Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/53/f8/31a6a6646c5b76b9746454318989340cea0290ba34e0f3ccd0668ce67868/hypothesis-6.155.0-py3-none-any.whl", hash = "sha256:d6ffa3062afabaf908491be707c60843f6671f7c3e9f2ed249d5827207ebbf33", size = 543120, upload-time = "2026-05-28T15:43:21.855Z" }, +] + [[package]] name = "idna" version = "3.11" @@ -1537,14 +1565,14 @@ wheels = [ [[package]] name = "panproto" -version = "0.44.0" +version = "0.51.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/7c/de999faaf87d6c41f3ce68a1cebd601ce33bd95705e005602ce86d2e99df/panproto-0.44.0-cp313-abi3-macosx_10_12_x86_64.whl", hash = "sha256:979532098b6144ac86061ee07cf1939fff7c106f164fe4117c3eb202f705cdf5", size = 11016442, upload-time = "2026-05-04T21:11:50.907Z" }, - { url = "https://files.pythonhosted.org/packages/b2/a2/781c3e278d3213d75fcb878731f50f43f2f378c3f15e82933b5db2874026/panproto-0.44.0-cp313-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:81703f973352a27e71ab8fab9b84bf0002b96829e021f5e220536c1e0678eef8", size = 10840291, upload-time = "2026-05-04T21:11:53.62Z" }, - { url = "https://files.pythonhosted.org/packages/01/3f/2eb8cddff877e1378599ee6fbdb7287231e3dfd51b8a2435949bbd50b22f/panproto-0.44.0-cp313-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:12ed132e268ee57b274a0f3c9df0115689f99e6d83f8fc04313216e4f85b370c", size = 11376363, upload-time = "2026-05-04T21:11:56.903Z" }, - { url = "https://files.pythonhosted.org/packages/84/b0/c20542dab93ec929ed9db3053227add8ddfbbc96676768671dd9846c7fca/panproto-0.44.0-cp313-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:bf386a1418e100239624ae686f8d6bee341c8071b95de3c7987e0ecbd2c1797c", size = 11889761, upload-time = "2026-05-04T21:11:59.807Z" }, - { url = "https://files.pythonhosted.org/packages/b0/1d/ba73ce471365ea8a9105f37e441c85cf50ec8dd2a2fc7fde6ddd29f95a39/panproto-0.44.0-cp313-abi3-win_amd64.whl", hash = "sha256:473565182f5e874f10445ac76c0bee29ae531fd715ea56f2b78a8404118cfcd7", size = 10907499, upload-time = "2026-05-04T21:12:02.591Z" }, + { url = "https://files.pythonhosted.org/packages/4d/dd/9810ef2efa8335dd527db653214d791a852b896d2a0a6a8c8e9f20ed656c/panproto-0.51.0-cp313-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d97a690159814cb482eb085494cb7b881a30e047c0d2847d88bf97203a3104bd", size = 11726180, upload-time = "2026-05-28T21:24:21.595Z" }, + { url = "https://files.pythonhosted.org/packages/12/99/8979129533b05012be4e6cbdc16b94ba3ed861b371c54606fd2e9c156b38/panproto-0.51.0-cp313-abi3-macosx_11_0_arm64.whl", hash = "sha256:69a68c137dc0c9d5bb7525fec907b4283316d512b0c77ca910ccd7eb6ca7f643", size = 11471081, upload-time = "2026-05-28T21:24:24.356Z" }, + { url = "https://files.pythonhosted.org/packages/3a/5f/deba52da6c9e15688f4d99651977d3f581d10486100f13a517763c7e6e38/panproto-0.51.0-cp313-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:6d735f7d1f1b1685dd9fd7466bf0bdd93d034e511a5d702228664c2966c9415a", size = 12016308, upload-time = 
"2026-05-28T21:24:27.323Z" }, + { url = "https://files.pythonhosted.org/packages/49/08/0893f0852a0c71eb59c7f76ed9cf321dad668377fee8714f88c3c631ad51/panproto-0.51.0-cp313-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b22ec1545ea0ba615683864b159bc3832eb311d0241d9fef18414b919980de50", size = 12599142, upload-time = "2026-05-28T21:24:30.547Z" }, + { url = "https://files.pythonhosted.org/packages/69/7d/966f47265f5048a86276938769a51946c05e745c40ce96d81d615b2f479f/panproto-0.51.0-cp313-abi3-win_amd64.whl", hash = "sha256:08c55da3a1659718b6b85be65c5b438567721b84e40105d03862b10b743d6235", size = 11678138, upload-time = "2026-05-28T21:24:33.38Z" }, ] [[package]] @@ -2391,6 +2419,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "spacy" version = "3.8.11" @@ -3030,3 +3067,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/48/b7/503c98092fb3b344a179579f55814b613c1fbb1c23b3ec14a7b008a66a6e/yarl-1.22.0-cp314-cp314t-win_arm64.whl", hash = 
"sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1", size = 85171, upload-time = "2025-10-06T14:12:16.935Z" }, { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" }, ] + +[[package]] +name = "zstandard" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/5c/f8923b595b55fe49e30612987ad8bf053aef555c14f05bb659dd5dbe3e8a/zstandard-0.25.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e29f0cf06974c899b2c188ef7f783607dbef36da4c242eb6c82dcd8b512855e3", size = 795887, upload-time = "2025-09-14T22:17:54.198Z" }, + { url = "https://files.pythonhosted.org/packages/8d/09/d0a2a14fc3439c5f874042dca72a79c70a532090b7ba0003be73fee37ae2/zstandard-0.25.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:05df5136bc5a011f33cd25bc9f506e7426c0c9b3f9954f056831ce68f3b6689f", size = 640658, upload-time = "2025-09-14T22:17:55.423Z" }, + { url = "https://files.pythonhosted.org/packages/5d/7c/8b6b71b1ddd517f68ffb55e10834388d4f793c49c6b83effaaa05785b0b4/zstandard-0.25.0-cp314-cp314-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:f604efd28f239cc21b3adb53eb061e2a205dc164be408e553b41ba2ffe0ca15c", size = 5379849, upload-time = "2025-09-14T22:17:57.372Z" }, + { url = 
"https://files.pythonhosted.org/packages/a4/86/a48e56320d0a17189ab7a42645387334fba2200e904ee47fc5a26c1fd8ca/zstandard-0.25.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223415140608d0f0da010499eaa8ccdb9af210a543fac54bce15babbcfc78439", size = 5058095, upload-time = "2025-09-14T22:17:59.498Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ad/eb659984ee2c0a779f9d06dbfe45e2dc39d99ff40a319895df2d3d9a48e5/zstandard-0.25.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e54296a283f3ab5a26fc9b8b5d4978ea0532f37b231644f367aa588930aa043", size = 5551751, upload-time = "2025-09-14T22:18:01.618Z" }, + { url = "https://files.pythonhosted.org/packages/61/b3/b637faea43677eb7bd42ab204dfb7053bd5c4582bfe6b1baefa80ac0c47b/zstandard-0.25.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ca54090275939dc8ec5dea2d2afb400e0f83444b2fc24e07df7fdef677110859", size = 6364818, upload-time = "2025-09-14T22:18:03.769Z" }, + { url = "https://files.pythonhosted.org/packages/31/dc/cc50210e11e465c975462439a492516a73300ab8caa8f5e0902544fd748b/zstandard-0.25.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e09bb6252b6476d8d56100e8147b803befa9a12cea144bbe629dd508800d1ad0", size = 5560402, upload-time = "2025-09-14T22:18:05.954Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ae/56523ae9c142f0c08efd5e868a6da613ae76614eca1305259c3bf6a0ed43/zstandard-0.25.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a9ec8c642d1ec73287ae3e726792dd86c96f5681eb8df274a757bf62b750eae7", size = 4955108, upload-time = "2025-09-14T22:18:07.68Z" }, + { url = "https://files.pythonhosted.org/packages/98/cf/c899f2d6df0840d5e384cf4c4121458c72802e8bda19691f3b16619f51e9/zstandard-0.25.0-cp314-cp314-musllinux_1_2_i686.whl", hash = 
"sha256:a4089a10e598eae6393756b036e0f419e8c1d60f44a831520f9af41c14216cf2", size = 5269248, upload-time = "2025-09-14T22:18:09.753Z" }, + { url = "https://files.pythonhosted.org/packages/1b/c0/59e912a531d91e1c192d3085fc0f6fb2852753c301a812d856d857ea03c6/zstandard-0.25.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f67e8f1a324a900e75b5e28ffb152bcac9fbed1cc7b43f99cd90f395c4375344", size = 5430330, upload-time = "2025-09-14T22:18:11.966Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/7e31db1240de2df22a58e2ea9a93fc6e38cc29353e660c0272b6735d6669/zstandard-0.25.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:9654dbc012d8b06fc3d19cc825af3f7bf8ae242226df5f83936cb39f5fdc846c", size = 5811123, upload-time = "2025-09-14T22:18:13.907Z" }, + { url = "https://files.pythonhosted.org/packages/f6/49/fac46df5ad353d50535e118d6983069df68ca5908d4d65b8c466150a4ff1/zstandard-0.25.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:4203ce3b31aec23012d3a4cf4a2ed64d12fea5269c49aed5e4c3611b938e4088", size = 5359591, upload-time = "2025-09-14T22:18:16.465Z" }, + { url = "https://files.pythonhosted.org/packages/c2/38/f249a2050ad1eea0bb364046153942e34abba95dd5520af199aed86fbb49/zstandard-0.25.0-cp314-cp314-win32.whl", hash = "sha256:da469dc041701583e34de852d8634703550348d5822e66a0c827d39b05365b12", size = 444513, upload-time = "2025-09-14T22:18:20.61Z" }, + { url = "https://files.pythonhosted.org/packages/3a/43/241f9615bcf8ba8903b3f0432da069e857fc4fd1783bd26183db53c4804b/zstandard-0.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:c19bcdd826e95671065f8692b5a4aa95c52dc7a02a4c5a0cac46deb879a017a2", size = 516118, upload-time = "2025-09-14T22:18:17.849Z" }, + { url = "https://files.pythonhosted.org/packages/f0/ef/da163ce2450ed4febf6467d77ccb4cd52c4c30ab45624bad26ca0a27260c/zstandard-0.25.0-cp314-cp314-win_arm64.whl", hash = "sha256:d7541afd73985c630bafcd6338d2518ae96060075f9463d7dc14cfb33514383d", size = 476940, upload-time = "2025-09-14T22:18:19.088Z" }, +] diff 
--git a/vendor/layers b/vendor/layers new file mode 160000 index 0000000..6f3bfef --- /dev/null +++ b/vendor/layers @@ -0,0 +1 @@ +Subproject commit 6f3bfef92ea69a065a7331e76ca51d90cd8faf62