diff --git a/duui-anonymize/.gitignore b/duui-anonymize/.gitignore new file mode 100644 index 00000000..05703aca --- /dev/null +++ b/duui-anonymize/.gitignore @@ -0,0 +1,14 @@ +/../* +../ +../* +../duui-mm/* +.venv/** +.vscode/** +__pycache__/** +*.pyc + +target/** +dist/** +build/** + +src/test/results/** \ No newline at end of file diff --git a/duui-anonymize/README.md b/duui-anonymize/README.md new file mode 100644 index 00000000..8c005b75 --- /dev/null +++ b/duui-anonymize/README.md @@ -0,0 +1,39 @@ +#### OpenAI Privacy Filter component for DUUI + +OpenAI Privacy Filter: https://github.com/openai/privacy-filter + +#### Input/Output: + +input: Text in the Sofa. Optional selection offsets can be passed through Lua options. + +output: structured redaction spans and redacted text + +#### Output Shape: + +Privacy Filter detects 8 privacy span categories: + +- `account_number` +- `private_address` +- `private_email` +- `private_person` +- `private_phone` +- `private_url` +- `private_date` +- `secret` + +The model emits BIOES token classes for these categories plus `O`, and the service turns the resulting spans into DUUI annotations and redacted text. + +#### Parameter: + +[optional] OPF redaction options such as `model`, `context_window_length`, `trim_whitespace`, `device`, `output_mode`, `decode_mode`, `discard_overlapping_predicted_spans`, `viterbi_calibration_path`, and selection offsets (`selection_begin` / `selection_end`). + +#### Modes: + +- `placeholder`: default mode, replaces each detected span with its category in brackets (e.g. `[private_person]`); `remove` deletes the detected span instead. +- `pseudo`: kept as a stub / TODO mode and currently returns the input unchanged. +- `mode` is passed through Lua options. + +#### Entry points: + +- `src/main/python/duui_opf.py`: new OPF entrypoint wrapper. +- `src/main/python/duui_whisperx.py`: compatibility implementation file while the migration is in progress. 
diff --git a/duui-anonymize/pom.xml b/duui-anonymize/pom.xml new file mode 100644 index 00000000..3e7b0b79 --- /dev/null +++ b/duui-anonymize/pom.xml @@ -0,0 +1,138 @@ + + + 4.0.0 + + org.texttechnology + duui-anonymize + 1.0-SNAPSHOT + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + + mehler + Prof. Dr. Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + aabusale + Ali Abusaleh + a.abusaleh@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/ali-abusaleh/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + Research assistant + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + 21 + 21 + UTF-8 + 2.4.0 + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + fac60bef3f + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.5 + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-anomaly-asl + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + + diff --git a/duui-anonymize/requirements.txt b/duui-anonymize/requirements.txt new file mode 100644 index 00000000..49fb44b8 --- /dev/null +++ b/duui-anonymize/requirements.txt @@ -0,0 +1,14 @@ +numpy +dkpro_cassis +fastapi +pydantic +pydantic-settings +pydantic_core +starlette +uvicorn +torch +torchvision +torchaudio 
+transformers +accelerate +setuptools diff --git a/duui-anonymize/src/main/docker/Dockerfile b/duui-anonymize/src/main/docker/Dockerfile new file mode 100644 index 00000000..660e32ce --- /dev/null +++ b/duui-anonymize/src/main/docker/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.10 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +COPY ./src/main/python/communication.lua ./communication.lua +COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py +COPY ./src/main/python/typesystem.xml ./typesystem.xml +COPY ./requirements.txt ./requirements.txt + +RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cpu +RUN pip install -r requirements.txt + +ENTRYPOINT ["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git a/duui-anonymize/src/main/docker/Dockerfile-cuda b/duui-anonymize/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..fa72894c --- /dev/null +++ b/duui-anonymize/src/main/docker/Dockerfile-cuda @@ -0,0 +1,26 @@ +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + +COPY ./src/main/python/communication.lua ./communication.lua +COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py +COPY ./src/main/python/typesystem.xml ./typesystem.xml +COPY ./requirements.txt ./requirements.txt + +RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 +RUN pip install -r requirements.txt + +ENTRYPOINT 
["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git a/duui-anonymize/src/main/python/communication.lua b/duui-anonymize/src/main/python/communication.lua new file mode 100644 index 00000000..038d7167 --- /dev/null +++ b/duui-anonymize/src/main/python/communication.lua @@ -0,0 +1,110 @@ +-- Bind static classes from java +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +util = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") + +-- Read a parameter from params regardless of whether it is a Lua table or a +-- LuaJ-wrapped Java Map. Direct table indexing works for Lua tables; Java +-- Map objects (HashMap, etc.) require params:get(key) instead. +local function param_get(params, key) + if params == nil then return nil end + local v = params[key] + if v ~= nil then return tostring(v) end + local ok, r = pcall(function() return params:get(key) end) + if ok and r ~= nil then return tostring(r) end + return nil +end + +-- Known option keys forwarded to the Python service. 
+local OPTION_KEYS = { + "mode", "model", "device", + "context_window_length", "trim_whitespace", + "output_mode", "discard_overlapping_predicted_spans", +} + +local function copy_options(params) + local options = {} + print("Copying options:") + for _, key in ipairs(OPTION_KEYS) do + local value = param_get(params, key) + if value ~= nil then + print(" ", key, "=", value) + options[key] = value + end + end + return options +end + +local function resolve_selection(params) + if params == nil then return nil end + + -- selection passed as a nested table + local selection = params["selection"] + if selection == nil then + local ok, r = pcall(function() return params:get("selection") end) + if ok then selection = r end + end + if type(selection) == "table" then + local b = selection["begin"] or selection["start"] + local e = selection["end"] or selection["stop"] + if b ~= nil and e ~= nil then + return { begin = b, ["end"] = e } + end + end + + -- selection passed as flat begin/end keys + local b = param_get(params, "selection_begin") or param_get(params, "selection_start") + local e = param_get(params, "selection_end") or param_get(params, "selection_stop") + if b ~= nil and e ~= nil then + return { begin = tonumber(b), ["end"] = tonumber(e) } + end + + return nil +end + +-- Serialize the CAS into a JSON request sent to the Python service. +function serialize(inputCas, outputStream, params) + local text = inputCas:getSofaDataString() + if text == nil then text = "" end + + local options = copy_options(params) + + outputStream:write(json.encode({ + text = text, + options = options, + selection = resolve_selection(params), + })) +end + +-- Deserialize the JSON response from the Python service back into the CAS. +-- +-- Anomaly annotations are added to the *original* CAS view so their +-- character offsets remain valid against the original document text. +-- The redacted text is stored as the sofa of a separate "opf_redacted" view. 
-- Deserialize the JSON response from the Python service back into the CAS.
--
-- inputCas:    the CAS the annotations are written to.
-- inputStream: stream holding the UTF-8 encoded JSON response.
--
-- The redacted text becomes the sofa of a separate "opf_redacted" view, and
-- every entry of "detected_spans" becomes an Anomaly annotation on inputCas.
function deserialize(inputCas, inputStream)
    local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)
    local results = json.decode(inputString)

    -- Guard against an empty/undecodable response instead of failing on the
    -- first index operation below.
    if results == nil then
        print("deserialize: empty response from service; nothing added to the CAS")
        return
    end

    -- Store redacted text in its own view.
    if results["redacted_text"] ~= nil then
        local ok, view = pcall(function() return inputCas:createView("opf_redacted") end)
        if ok and view ~= nil then
            view:setSofaDataString(results["redacted_text"], "text/plain")
        else
            -- On failure pcall returns the error as its second value. Report it
            -- so the missing view is not mistaken for "nothing was redacted".
            print("deserialize: could not create view 'opf_redacted'; redacted text dropped:", tostring(view))
        end
    end

    -- Add Anomaly annotations to the original view; offsets reference original text.
    -- NOTE(review): the offsets are produced by Python string indexing; confirm
    -- they match the CAS offset unit for text outside the Basic Multilingual Plane.
    if results["detected_spans"] ~= nil then
        for _, span in ipairs(results["detected_spans"]) do
            local anomaly = luajava.newInstance(
                "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", inputCas)
            anomaly:setBegin(span["start"])
            anomaly:setEnd(span["end"])
            anomaly:setCategory(span["label"])
            -- description = replacement used (e.g. "[private_person]") or original word
            anomaly:setDescription(
                (span["placeholder"] ~= nil and span["placeholder"] ~= "") and span["placeholder"]
                or span["text"] or span["label"])
            anomaly:addToIndexes()
        end
    end
end
class DUUIRequest(BaseModel):
    # Body accepted by /v1/process: the document text, a free-form options
    # mapping, and an optional selection mapping.
    text: str
    options: dict[str, Any] = Field(default_factory=dict)
    selection: Optional[dict] = None

    @field_validator("text", mode="before")
    @classmethod
    def coerce_text(cls, v: Any) -> str:
        # A missing text is treated as the empty document; anything else is
        # stringified before normal validation runs.
        if v is None:
            return ""
        return str(v)

    @field_validator("options", mode="before")
    @classmethod
    def coerce_options(cls, v: Any) -> dict:
        # Only a real mapping is kept; None, lists and scalars all collapse
        # to "no options".
        # NOTE(review): lists presumably show up because an empty Lua table
        # is encoded as a JSON array — confirm against the Lua json encoder.
        return v if isinstance(v, dict) else {}
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
    """Answer request-validation failures with a 422 and log a summary.

    The response still carries the full error list and echoes the request
    body back to the caller. The log only records the type, location and
    message of each error plus the body size. The body of this service is
    the un-redacted document, so neither it nor the per-error ``input``
    values are written to the log.

    Args:
        request: The request that failed validation.
        exc: The validation error raised for it.

    Returns:
        A 422 JSON response with ``detail`` (the errors) and ``body``.
    """
    errors = exc.errors()
    raw_body = (await request.body()).decode("utf-8", errors="replace")

    # Keep the log free of request content: drop the "input" entry of each
    # error and replace the body dump by its length.
    summary = [
        {"type": err.get("type"), "loc": err.get("loc"), "msg": err.get("msg")}
        for err in errors
        if isinstance(err, dict)
    ]
    logger.error("422 validation errors: %s", summary)
    logger.error("Raw body omitted from log (%d characters)", len(raw_body))

    return JSONResponse(
        status_code=422,
        content=jsonable_encoder({"detail": errors, "body": raw_body}),
    )
@app.post("/v1/process")
async def post_process(raw_request: Request) -> DUUIResponse:
    """Parse the raw request body, validate it and run the redaction.

    The body is read and decoded by hand, so every parsing or validation
    failure is converted into ``RequestValidationError``. That is the
    exception the application's 422 handler is registered for; a plain
    pydantic ``ValidationError`` raised here would surface as a 500.

    Args:
        raw_request: The incoming HTTP request.

    Returns:
        The response produced by ``_process``.

    Raises:
        RequestValidationError: If the body is not decodable JSON or does
            not match ``DUUIRequest``.
    """
    # Local import: the exception type is only needed by this handler.
    from pydantic import ValidationError

    body = await raw_request.body()
    try:
        data = json.loads(body)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # json.loads decodes bytes itself, so invalid byte sequences raise
        # UnicodeDecodeError rather than JSONDecodeError.
        raise RequestValidationError(
            [{
                "type": "json_invalid",
                "loc": ("body",),
                "msg": str(exc),
                # Decoded leniently so the error stays JSON-serialisable.
                "input": body.decode("utf-8", errors="replace"),
            }]
        ) from exc

    try:
        request = DUUIRequest.model_validate(data)
    except ValidationError as exc:
        raise RequestValidationError(exc.errors()) from exc

    return _process(request)
def _process(request: DUUIRequest) -> DUUIResponse:
    """Detect privacy spans in the request text and build the redacted text.

    Steps:
      1. ``pseudo`` mode and empty text return early without loading a model.
      2. The selection is taken from the top-level ``request.selection``
         (where the Lua communication layer puts it) and, if that yields
         nothing, from the ``selection*`` keys inside ``request.options``.
      3. The token-classification pipeline runs on the selected window only;
         span offsets are shifted back so they refer to the full text.
      4. The redacted text is built from the full text and those spans, so
         everything outside the window is left untouched.

    Args:
        request: Validated request with text, options and optional selection.

    Returns:
        The response with the original text, the detected spans and the
        redacted text. ``warning`` is set for ``pseudo`` mode and for an
        unrecognised mode.

    Raises:
        ValueError: Propagated from ``_resolve_selection`` when the selection
            lies outside the text.
    """
    options = request.options
    # "or" instead of a .get() default so an empty or null option also falls
    # back to the configured default.
    model = str(options.get("model") or settings.default_model)
    device = str(options.get("device") or ("cuda" if torch.cuda.is_available() else "cpu"))
    mode = str(options.get("mode", MODE_PLACEHOLDER))
    text_length = len(request.text)

    logger.info(
        "Processing request: model=%s device=%s mode=%s text_length=%d",
        model, device, mode, text_length,
    )

    # pseudo mode - not yet supported
    if mode == MODE_PSEUDO:
        return DUUIResponse(
            text=request.text,
            detected_spans=[],
            redacted_text=request.text,
            warning="pseudo mode is not yet supported - input returned unchanged",
        )

    if not request.text:
        return DUUIResponse(text="", detected_spans=[], redacted_text="")

    # Any mode other than placeholder is handled as removal further down;
    # tell the caller when that happens for a value that is not "remove".
    warning: Optional[str] = None
    if mode not in (MODE_PLACEHOLDER, MODE_REMOVE):
        warning = f"unknown mode {mode!r} - detected spans are removed without a placeholder"
        logger.warning("Unknown mode %r; treating it like %r", mode, MODE_REMOVE)

    # The request-level selection wins; the options are the fallback.
    sel = None
    if isinstance(request.selection, dict):
        sel = _resolve_selection({"selection": request.selection}, text_length=text_length)
    if sel is None:
        sel = _resolve_selection(options, text_length=text_length)

    begin, end = sel if sel is not None else (0, text_length)
    window = request.text[begin:end]

    # An empty window has nothing to classify; skip loading/running the model.
    raw = _load_pipeline(model, device)(window) if window else []

    spans = [
        DetectedSpan(
            label=item["entity_group"],
            # Pipeline offsets are relative to the window; shift to full text.
            start=int(item["start"]) + begin,
            end=int(item["end"]) + begin,
            text=str(item["word"]).strip(),
            placeholder=f"[{item['entity_group']}]" if mode == MODE_PLACEHOLDER else "",
        )
        for item in raw
    ]

    # Spans already carry full-text offsets, so one pass over the full text
    # rebuilds the text around the window as well.
    redacted_text = _build_redacted(request.text, spans, mode)

    return DUUIResponse(
        text=request.text,
        detected_spans=spans,
        redacted_text=redacted_text,
        warning=warning,
    )
uvicorn.run("duui_anonymize:app", host="0.0.0.0", port=9714, workers=1) diff --git a/duui-anonymize/src/main/python/duui_opf.py b/duui-anonymize/src/main/python/duui_opf.py new file mode 100644 index 00000000..5ff804bd --- /dev/null +++ b/duui-anonymize/src/main/python/duui_opf.py @@ -0,0 +1,3 @@ +from duui_anonymize import app + +__all__ = ["app"] diff --git a/duui-anonymize/src/main/python/duui_opf_core.py b/duui-anonymize/src/main/python/duui_opf_core.py new file mode 100644 index 00000000..88670158 --- /dev/null +++ b/duui-anonymize/src/main/python/duui_opf_core.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Mapping + +DEFAULT_MODE = "replacement" +PSEUDO_MODE = "pseudo" +DEFAULT_PLACEHOLDER = "" + +_SELECTION_KEYS = { + "selection", + "selection_begin", + "selection_end", + "selection_start", + "selection_stop", +} + +_SERVICE_OPTION_KEYS = { + "model", + "context_window_length", + "trim_whitespace", + "device", + "output_mode", + "discard_overlapping_predicted_spans", + "mode", + "placeholder", +} + +_DECODE_OPTION_KEYS = { + "decode_mode", + "viterbi_calibration_path", + "calibration_path", +} + + +@dataclass(frozen=True) +class SelectionRange: + begin: int + end: int + + +@dataclass(frozen=True) +class RedactionSpan: + label: str + start: int + end: int + text: str + placeholder: str = DEFAULT_PLACEHOLDER + + +def split_options( + options: Mapping[str, Any], +) -> tuple[dict[str, Any], dict[str, Any], str, str]: + service_options: dict[str, Any] = {} + decode_options: dict[str, Any] = {} + mode = DEFAULT_MODE + placeholder = DEFAULT_PLACEHOLDER + + for key, value in options.items(): + if key in _SELECTION_KEYS: + continue + if key == "mode": + mode = str(value) + elif key == "placeholder": + placeholder = str(value) + elif key in _SERVICE_OPTION_KEYS: + service_options[key] = value + elif key == "decode": + continue + elif key in _DECODE_OPTION_KEYS: + if key == "calibration_path": + 
def apply_replacement_text(
    text: str,
    spans: list[RedactionSpan],
    *,
    placeholder: str = DEFAULT_PLACEHOLDER,
) -> str:
    """Return *text* with every span replaced by *placeholder*.

    Spans are processed in ``(start, end)`` order. A span that starts inside
    an already redacted region is merged into it: no second placeholder is
    emitted, but the region is extended to the span's end. This keeps the
    tail of an overlapping span from staying in the output un-redacted.

    Args:
        text: The text the span offsets refer to.
        spans: Spans to redact; may overlap and need not be sorted.
        placeholder: Replacement emitted once per redacted region.

    Returns:
        The redacted text, or *text* itself when *spans* is empty.
    """
    if not spans:
        return text

    redacted_parts: list[str] = []
    cursor = 0
    for span in sorted(spans, key=lambda item: (item.start, item.end)):
        if span.start >= cursor:
            # New region: keep the clear text before it, then the placeholder.
            redacted_parts.append(text[cursor:span.start])
            redacted_parts.append(placeholder)
        # For an overlapping span only the end of the region moves.
        cursor = max(cursor, span.end)
    redacted_parts.append(text[cursor:])
    return "".join(redacted_parts)
if selection is None: + return replacement + return text[:selection.begin] + replacement + text[selection.end:] diff --git a/duui-anonymize/src/main/python/duui_whisperx.py b/duui-anonymize/src/main/python/duui_whisperx.py new file mode 100644 index 00000000..05fdc719 --- /dev/null +++ b/duui-anonymize/src/main/python/duui_whisperx.py @@ -0,0 +1,430 @@ +from functools import lru_cache +import json +from enum import Enum +from typing import Any, List + +import torch +import uvicorn +from cassis import load_typesystem +from fastapi import FastAPI, Response +from fastapi.encoders import jsonable_encoder +from fastapi.responses import PlainTextResponse +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings +from starlette.responses import JSONResponse + +from opf import DecodeOptions, OPF + + +class DetectedSpan(BaseModel): + """One detected privacy span returned by OPF.""" + + label: str + start: int + end: int + text: str + placeholder: str + + +class SelectionRange(BaseModel): + """Optional text selection inside the source document.""" + + begin: int + end: int + + +class DUUIRequest(BaseModel): + """Request sent by DUUI and transformed by the Lua communication layer.""" + + text: str + options: dict[str, Any] = Field(default_factory=dict) + selection: SelectionRange | None = None + + +class DUUIResponse(BaseModel): + """Response of this annotator.""" + + schema_version: int + summary: dict[str, Any] + text: str + detected_spans: List[DetectedSpan] + redacted_text: str + warning: str | None = None + selection: SelectionRange | None = None + + +class DUUIDocumentation(BaseModel): + """Documentation response.""" + + annotator_name: str + version: str + implementation_lang: str + + +class Settings(BaseSettings): + """Runtime settings for the DUUI service.""" + + duui_tool_name: str = "OpenAI Privacy Filter" + duui_tool_version: str = "1.0" + default_model: str | None = None + + +class RedactionMode(str, Enum): + REPLACEMENT = "replacement" + 
PSEUDO = "pseudo" + + +class PrivacyFilterService: + """Class-based service wrapper for OPF redaction.""" + + def __init__(self, settings: Settings) -> None: + self.settings = settings + + def split_options(self, options: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]: + return _split_options(options) + + def selection_from_options( + self, + request_selection: SelectionRange | None, + options: dict[str, Any], + *, + text_length: int, + ) -> SelectionRange | None: + return _selection_from_options( + request_selection, + options, + text_length=text_length, + ) + + def redact_text( + self, + text: str, + request_selection: SelectionRange | None, + options: dict[str, Any], + ) -> DUUIResponse: + return _redact_text(text, request_selection, options) + + +settings = Settings() +service = PrivacyFilterService(settings) +DEFAULT_PLACEHOLDER = "" +DEFAULT_MODE = RedactionMode.REPLACEMENT.value +PSEUDO_MODE = RedactionMode.PSEUDO.value + + +app = FastAPI( + docs_url="/api", + redoc_url=None, + title="OpenAI Privacy Filter", + description="Text privacy redaction for TTLab DUUI", + version="1.0", + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + contact={ + "name": "Daniel Bundan", + "url": "bundan.me", + "email": "s1486849@stud.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + +communication = "communication.lua" +with open(communication, 'rb') as f: + communication = f.read().decode("utf-8") + + +# Load the predefined typesystem that is needed for this annotator to work +typesystem_filename = 'typesystem.xml' +with open(typesystem_filename, 'rb') as f: + typesystem = load_typesystem(f) + + +# Get input / output of the annotator +@app.get("/v1/details/input_output") +def get_input_output() -> JSONResponse: + json_item = { + "inputs": [], + "outputs": ["de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly"] + } + + json_compatible_item_data = 
jsonable_encoder(json_item) + return JSONResponse(content=json_compatible_item_data) + + +# Get typesystem of this annotator +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + # TODO remove cassis dependency, as only needed for typesystem at the moment? + xml = typesystem.to_xml() + xml_content = xml.encode("utf-8") + + return Response( + content=xml_content, + media_type="application/xml" + ) + + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return communication + + +# Return documentation info +@app.get("/v1/documentation") +def get_documentation() -> DUUIDocumentation: + + documentation = DUUIDocumentation( + annotator_name=settings.duui_tool_name, + version=settings.duui_tool_version, + implementation_lang="Python", + ) + return documentation + + +def _selection_from_options( + request_selection: SelectionRange | None, + options: dict[str, Any], + *, + text_length: int, +) -> SelectionRange | None: + if request_selection is not None: + begin = int(request_selection.begin) + end = int(request_selection.end) + else: + selection = options.pop("selection", None) + if isinstance(selection, dict): + begin = selection.get("begin") + end = selection.get("end") + else: + begin = options.pop("selection_begin", options.pop("selection_start", None)) + end = options.pop("selection_end", options.pop("selection_stop", None)) + + if begin is None or end is None: + return None + + begin = int(begin) + end = int(end) + if begin < 0 or end < begin or end > text_length: + raise ValueError("selection must satisfy 0 <= begin <= end <= text length") + return SelectionRange(begin=begin, end=end) + + +def _json_key(payload: dict[str, Any]) -> str: + return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str) + + +def _split_options(options: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]: + redactor_options: dict[str, Any] = {} + decode_options: 
dict[str, Any] = {} + + for key, value in options.items(): + if key in {"decode", "selection", "selection_begin", "selection_end", "selection_start", "selection_stop"}: + continue + if key == "model": + redactor_options["model"] = value + elif key == "context_window_length": + redactor_options["context_window_length"] = value + elif key == "trim_whitespace": + redactor_options["trim_whitespace"] = value + elif key == "device": + redactor_options["device"] = value + elif key == "output_mode": + redactor_options["output_mode"] = value + elif key == "discard_overlapping_predicted_spans": + redactor_options["discard_overlapping_predicted_spans"] = value + elif key == "mode": + redactor_options["mode"] = value + elif key == "placeholder": + redactor_options["placeholder"] = value + elif key == "decode_mode": + decode_options["decode_mode"] = value + elif key in {"viterbi_calibration_path", "calibration_path"}: + decode_options["viterbi_calibration_path"] = value + elif key == "output_text_only": + continue + + return redactor_options, decode_options + + +@lru_cache(maxsize=8) +def _build_redactor(options_json: str) -> OPF: + options = json.loads(options_json) + device = options.get("device") + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + + redactor = OPF( + model=options.get("model", settings.default_model), + context_window_length=options.get("context_window_length"), + trim_whitespace=bool(options.get("trim_whitespace", True)), + device=device, + output_mode=options.get("output_mode", "typed"), + discard_overlapping_predicted_spans=bool( + options.get("discard_overlapping_predicted_spans", False) + ), + output_text_only=False, + ) + + return redactor + + +def _compose_replacement_text( + text: str, + spans: list[DetectedSpan], + *, + placeholder: str = DEFAULT_PLACEHOLDER, +) -> str: + if not spans: + return text + + redacted_parts: list[str] = [] + cursor = 0 + for span in sorted(spans, key=lambda item: (item.start, item.end)): + if 
def _detect_spans(payload: Any, *, offset: int = 0) -> list[DetectedSpan]:
    """Convert raw span records into ``DetectedSpan`` models.

    Each record may be a dict or an object; the fields ``label``, ``start``,
    ``end``, ``text`` and ``placeholder`` are read by key or by attribute.
    A missing ``text`` or ``placeholder`` becomes the empty string, so the
    literal string ``"None"`` never ends up in the response.

    Args:
        payload: Iterable of span records.
        offset: Added to every ``start``/``end`` to shift offsets from the
            selected window to the full text.

    Returns:
        One ``DetectedSpan`` per record, in input order.
    """

    def _text_or_empty(value: Any) -> str:
        # str(None) would produce the four-letter string "None".
        return "" if value is None else str(value)

    detected_spans: list[DetectedSpan] = []
    for span in payload:
        if isinstance(span, dict):
            read = span.get
        else:
            # Bind the current span as a default so the accessor is self-contained.
            def read(field: str, _span: Any = span) -> Any:
                return getattr(_span, field, None)

        detected_spans.append(
            DetectedSpan(
                label=str(read("label")),
                start=int(read("start")) + offset,
                end=int(read("end")) + offset,
                text=_text_or_empty(read("text")),
                placeholder=_text_or_empty(read("placeholder")),
            )
        )

    return detected_spans
if decode_options else None + + selected_text = text + selection_offset = 0 + if request_selection is not None: + selection_offset = request_selection.begin + selected_text = text[request_selection.begin:request_selection.end] + + result = redactor.redact(selected_text, decode=decode) + + if isinstance(result, str): + redacted_text = result if request_selection is None else ( + text[:request_selection.begin] + result + text[request_selection.end:] + ) + return DUUIResponse( + schema_version=1, + summary={ + "mode": mode, + "span_count": 0, + "by_label": {}, + "decoded_mismatch": False, + }, + text=text, + detected_spans=[], + redacted_text=redacted_text, + warning=None, + selection=request_selection, + ) + + detected_spans = _detect_spans(result.detected_spans, offset=selection_offset) + redacted_text = _compose_replacement_text( + selected_text, + [ + DetectedSpan( + label=span.label, + start=span.start - selection_offset, + end=span.end - selection_offset, + text=span.text, + placeholder=placeholder, + ) + for span in detected_spans + ], + placeholder=placeholder, + ) + if request_selection is not None: + redacted_text = text[:request_selection.begin] + redacted_text + text[request_selection.end:] + + return DUUIResponse( + schema_version=int(result.schema_version), + summary={**dict(result.summary), "mode": mode}, + text=text, + detected_spans=detected_spans, + redacted_text=redacted_text, + warning=result.warning, + selection=request_selection, + ) + + +# Process request from DUUI +@app.post("/v1/process") +def post_process(request: DUUIRequest) -> DUUIResponse: + selection = service.selection_from_options( + request.selection, + dict(request.options), + text_length=len(request.text), + ) + return service.redact_text(request.text, selection, dict(request.options)) + + +if __name__ == "__main__": + uvicorn.run("duui_opf:app", host="0.0.0.0", port=9714, workers=1) diff --git a/duui-anonymize/src/main/python/typesystem.xml 
b/duui-anonymize/src/main/python/typesystem.xml new file mode 100644 index 00000000..5032b07b --- /dev/null +++ b/duui-anonymize/src/main/python/typesystem.xml @@ -0,0 +1,2202 @@ + + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + + uima.tcas.Annotation + + + description + + uima.cas.String + + + suggestions + An array of the suggested actions to be taken for this anomaly. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + + + category + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + + uima.tcas.Annotation + + + replacement + The text covered by the Anomaly annotation should be replaced with the contents of this + feature. + + uima.cas.String + + + certainty + A score representing how certain is this suggested action. + Usually in [0,1]. + + uima.cas.Float + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain + Marks the beginning of a chain. + uima.cas.AnnotationBase + + + first + This is the first corefernce link in coreference chain + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + A link in the coreference chain. + uima.tcas.Annotation + + + next + If there is one, it is the next coreference link to the current coreference link + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + referenceType + The role or type which the covered text has in the coreference chain. + uima.cas.String + + + referenceRelation + The type of relation between this link and the next link in the chain. 
+ uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf + Annotates the tf.idf score of a token, stem, or lemma. + uima.tcas.Annotation + + + tfidfValue + The tf.idf score. + uima.cas.Double + + + term + The string that was used to compute this tf.idf score. + If a stem or lemma was used, the covered text of this annotation does not need to be equal to + this string. + + This string can be used to construct a vector space with the right terms without having to + access the indexes again. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme + + uima.tcas.Annotation + + + morphTag + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + Morphological categories that can be attached to tokens. + + The features are supposed to match the Universal Dependency v1 features. + + uima.tcas.Annotation + + + gender + + uima.cas.String + + + number + Singular/plural + uima.cas.String + + + case + Nouns: nominative, genetiv, dative, ... + uima.cas.String + + + degree + Adjectives: comparative/Superlative + uima.cas.String + + + verbForm + + uima.cas.String + + + tense + Verbs: past tense, present tense, future tense, etc. + uima.cas.String + + + mood + Verbs: indicative, imperative, subjunctive + uima.cas.String + + + voice + Verbs: active/passive + uima.cas.String + + + definiteness + Definite or indefinite + uima.cas.String + + + value + The original morphological analysis results as produced by a tool or as recorded in a + corpus (if available). If the categories were originally encoded in such a string, the other + features are filled by analyzing this string. If the categories were provided separately, e.g. + by different attributed in an XML-encoded corpus, this field may remain empty. 
+ + uima.cas.String + + + person + Verbs: 1st, 2nd, 3rd person + uima.cas.String + + + aspect + Verbs: perfective, imperfective + uima.cas.String + + + animacy + + uima.cas.String + + + negative + + uima.cas.String + + + numType + + uima.cas.String + + + possessive + + uima.cas.String + + + pronType + + uima.cas.String + + + reflex + + uima.cas.String + + + transitivity + Verbs: transitive/intransitive + + @deprecated + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + The part of speech of a word or a phrase. + uima.tcas.Annotation + + + PosValue + Fine-grained POS tag. This is the tag as produced by a POS tagger or obtained from a + reader. + + uima.cas.String + + + coarseValue + Coarse-grained POS tag. This may be produced by a POS tagger or reader in addition to + the fine-grained tag. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADJ + Adjective + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADP + Adposition + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADV + Adverb + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_AUX + Auxiliary verb + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_CONJ + Conjunction + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_DET + Determiner + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_INTJ + Interjection + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN + Noun + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + 
de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NUM + Numeral + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PART + Particle + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PRON + Pronoun + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PROPN + Proper noun + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PUNCT + Punctuation + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_SCONJ + Subordinating conjunction + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_SYM + Symbol + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_VERB + Verb + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + Other + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_AT + at-mention (indicates another user as a recipient of a tweet) + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_DM + discourse marker, indications of continuation of a message across multiple tweets + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_EMO + emoticon + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_HASH + Hashtag (indicates topic/category for tweet) + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + 
de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_INT + proper noun + verbal + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_NNV + nominal + verbal + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_NPV + proper noun + verbal + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_URL + URL or email address + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField + <p>A general purpose annotation to store document-wide information in the form of + arbitrary key-value string pairs.</p> + + uima.tcas.Annotation + + + key + Name of a metadata field. + uima.cas.String + + + value + The field value. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + Description of an individual tag. + uima.cas.TOP + + + name + The name of the tag. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription + Information about a tagset (controlled vocabulary). + uima.tcas.Annotation + + + layer + The layer to which the tagset applies. This is + typically the name of an UIMA type such as + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS". + + uima.cas.String + + + name + The name of the tagset. + uima.cas.String + + + tags + Descriptions of the tags belonging to this tagset. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + + + componentName + + uima.cas.String + + + modelLocation + + uima.cas.String + + + modelVariant + + uima.cas.String + + + modelLanguage + + uima.cas.String + + + modelVersion + + uima.cas.String + + + input + True if the tagset is used as input by the component/model, otherwise false. 
+ + uima.cas.Boolean + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Animal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Cardinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ContactInfo + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Date + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Disease + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Event + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Fac + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.FacDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Game + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Gpe + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.GpeDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Language + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Law + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Money + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + Named entities refer e.g. to persons, locations, organizations and so on. They often consist of + multiple tokens. 
+ + uima.tcas.Annotation + + + value + The class/category of the named entity, e.g. person, location, etc. + uima.cas.String + + + identifier + Identifier of the named entity, e.g. a reference into a person database. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Nationality + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Norp + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Ordinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.OrgDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.PerDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Percent + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Plant + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Product + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ProductDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Quantity + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Substance + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Time + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.WorkOfArt + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + 
de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription + <p>Represents the phonetic transcription of some textual element (usually a Token). + Phonetic transcriptions are e.g. generated by transcription processes like Soundex or Metaphone.</p> + + uima.tcas.Annotation + + + transcription + The actual transcription + uima.cas.String + + + name + The name of the transcription process that was used + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound + This type represents a decompounding word, i.e.: flowerpot. Each Compound one have at least two + Splits. + + uima.tcas.Annotation + + + splits + A word that can be decomposed into different parts. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart + <p>A CompoundPart represents one fragment from the compounding word. Besides that, it can + store other CompoundParts if it can be split again. The way it stores a decompounding word represents a + decompounding tree.</p> + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + Document structure element. + uima.tcas.Annotation + + + divType + + uima.cas.String + + + id + If this unit had an ID in the source format from which it was imported, it may be + stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it + should be respected by writers. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Document + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading + Document title, section heading, etc. 
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase + + uima.tcas.Annotation + + + text + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme + This type represents a linking morpheme between two CompoundParts. + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram + + uima.tcas.Annotation + + + text + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence + + uima.tcas.Annotation + + + id + If this unit had an ID in the source format from which it was imported, it may be + stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it + should be respected by writers. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + This type represents a part of a decompounding word. A Split can be either a CompoundPart or a + LinkingMorpheme. + + uima.tcas.Annotation + + + splits + Sub-splits of the current split. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm + This annotation can be used to indicate an alternate surface form. E.g. some corpora consider a + normalized form of the text with resolved contractions as the canonical form and only maintain the + original surface form as a secondary information. 
One example is the Conll-U format. + + uima.tcas.Annotation + + + value + Alternate surface form. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + <p>Token is one of the two types commonly produced by a segmenter (the other being + Sentence). A Token usually represents a word, although it may be used to represent multiple tightly + connected words (e.g. "New York") or parts of a word (e.g. the possessive "'s"). One may choose to split + compound words into multiple tokens, e.g. ("CamelCase" -&gt; "Camel", "Case"; "Zauberstab" -&gt; + "Zauber", "stab"). Most processing components operate on Tokens, usually within the limits of the + surrounding Sentence. E.g. a part-of-speech tagger analyses each Token in a Sentence and assigns a + part-of-speech to each Token.</p> + + uima.tcas.Annotation + + + parent + the parent of this token. This feature is meant to be used in when the token + participates in a constituency parse and then refers to a constituent containing this token. The + type of this feature is {@link Annotation} to avoid adding a dependency on the syntax API + module. + + uima.tcas.Annotation + + + lemma + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + + stem + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + + pos + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + morph + The morphological feature associated with this token. + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + + + + id + If this unit had an ID in the source format from which it was imported, it may be + stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it + should be respected by writers. + + uima.cas.String + + + form + Potentially normalized form of the token text that should be used instead of the + covered text if set. 
+ + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm + + + syntacticFunction + + uima.cas.String + + + order + Disambiguates the token order for tokens which have the same offsets, e.g. when the + contraction "à" is analyzed as two tokens "a" and "a". + + uima.cas.Integer + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm + A alternative token text which should be used instead of the covered text if set on a token. + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg + The SemArg annotation is attached to semantic arguments of semantic + predicates. Semantic arguments are characterized by their semantic role, e.g. Agent, + Experiencer, Topic. The semantic role of an argument is related to its semantic type + (for communication verbs, the Agent can be a person or an organization, but + typically not food). + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink + The SemArgLink type is used to attach SemPred annotations to their respective SemArg + annotations while giving each link a role. + + uima.cas.TOP + + + role + The role which the argument takes. The value depends on the theory being used, e.g. + Arg0, Arg1, etc. or Buyer, Seller, etc. + + uima.cas.String + + + target + The target argument. + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred + One of the predicates of a sentence (often a main verb, but nouns and adjectives can also be + predicates). + The SemPred annotation can be attached to predicates in a sentence. + Semantic predicates express events or situations and take semantic arguments + expressing the participants in these events or situations. All forms of main verbs + can be annotated with a SemPred. However, there are also many nouns and + adjectives that take arguments and can thus be annotated with a SemanticPredicate, + e.g. 
event nouns, such as "suggestion" (with arguments what and by whom), or + relational adjectives, such as "proud" (with arguments who and of what). + + uima.tcas.Annotation + + + arguments + The predicate's arguments. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink + + + category + A more detailed specification of the predicate type depending on the theory being used, + e.g. a frame name. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + The SemanticArgument annotation is attached to semantic arguments of semantic + predicates. Semantic arguments are characterized by their semantic role, e.g. Agent, + Experiencer, Topic. The semantic role of an argument is related to its semantic type + (for communication verbs, the Agent can be a person or an organization, but + typically not food). The semantic type of arguments is not yet covered by the + SemanticType. + + @deprecated Use SemArg instead. + + uima.tcas.Annotation + + + role + The role which the argument takes. The value depends on the theory being used, e.g. + Arg0, Arg1, etc. or Buyer, Seller, etc. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField + The SemanticField is a coarse-grained semantic category that can be attached to + nouns, verbs or adjectives. Semantic field information is present e.g. in WordNet as + lexicographer file names. Previously, this kind of semantic information has also + been called supersenses or semantic types. + + uima.tcas.Annotation + + + value + The value or name of the semantic field. Examples of semantic field values are: + location, artifact, event, communication, attribute + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate + One of the predicates of a sentence (often a main verb, but nouns and adjectives can also be + predicates). The SemanticPredicate annotation can be attached to predicates in a sentence. 
+ Semantic predicates express events or situations and take semantic arguments + expressing the participants in these events ore situations. All forms of main verbs + can be annotated with a SemanticPredicate. However, there are also many nouns and + adjectives that take arguments and can thus be annotated with a SemanticPredicate, + e.g. event nouns, such as "suggestion" (with arguments what and by whom), or + relational adjectives, such as "proud" (with arguments who and of what). + + @deprecated use SemPred instead + + uima.tcas.Annotation + + + category + A more detailed specification of the predicate type depending on the theory being used, + e.g. a frame name. + + uima.cas.String + + + arguments + The predicate's arguments. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense + + uima.tcas.Annotation + + + value + The sense identifier. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field + + uima.tcas.Annotation + + + name + the name of the tag + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree + <p>The Penn Treebank-style phrase structure string.</p> + uima.tcas.Annotation + + + PennTree + Contains a Penn Treebank-style representation of a tree. 
+ uima.cas.String + + + TransformationNames + The name(s) of the transformation(s) that have been performed on the PennTree + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADJC + adjective chunks + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADVC + adverb chunks + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.CONCJ + complex coordinating conjunctions such as "as well (as)" or "rather (than)" + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + uima.tcas.Annotation + + + chunkValue + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.INTJ + interjection + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.LST + enumeration symbol + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.NC + noun chunk (non-recursive noun phrase) + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.O + other or outside a chunk + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PC + prepositional chunk + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PRT + verb particle + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.VC + verb complex + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADVP + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + uima.tcas.Annotation + + + constituentType + + uima.cas.String + + + parent + The parent constituent + uima.tcas.Annotation + + + children + + uima.cas.FSArray + uima.tcas.Annotation + + + syntacticFunction + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.FRAG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.INTJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.LST + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NAC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NX + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PARN + This cateory is called PRN in the Penn Treebank tagset. However, PRN is a reserved device name + on Window. Thus we had to rename this category. The old PRN type is still present in the DKPro Core type + system, but it is deprecated, no longer used, and no JCas classes are generated for it. 
+ + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.QP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.RRC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBAR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBARQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SINV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.UCP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.VP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADJP + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADVP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHNP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHPP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.X + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ABBREV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ACOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AGENT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.APPOS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ATTR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUX0 + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUXPASS + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COMPLM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ_YET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DEP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + A dependency relation between two tokens. The dependency annotation begin and end offsets + correspond to those of the dependent. 
+ + uima.tcas.Annotation + + + Governor + The governor word + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + Dependent + The dependent word + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + DependencyType + The dependency type + uima.cas.String + + + flavor + Flavor of the dependency relation (basic, collapsed, enhanced, etc...) + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.EXPL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.INFMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.IOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MARK + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MEASURE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MWE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NEG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NN + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NPADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUMBER + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARATAXIS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSSESSIVE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRECONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRED + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREDET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREPC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PUNCT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PURPCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.QUANTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.RCMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REF + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT + Dependency tree root. + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.TMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation + Encodes an edit operation that can be interpreted by the ApplyChangesAnnotator. + uima.tcas.Annotation + + + value + In case of an "insert" or "replace" operation, this feature indicates the value to be + inserted or replaced. + + uima.cas.String + + + operation + Operation to perform: "insert", "replace", "delete" + uima.cas.String + + + reason + The reason for the change. 
+ uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.ArticleInfo + Contains basic information about the article. + uima.tcas.Annotation + + + Authors + Number of unique authors of this article + uima.cas.Integer + + + Revisions + Number of revisions of this article. + uima.cas.Integer + + + FirstAppearance + The Timestamp of the first appearance of this article. + uima.cas.Long + + + LastAppearance + The Timestamp of the last appearance of this article. + uima.cas.Long + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig + Database configuration for the connection to the database where the CAS data was retrieved. + + uima.tcas.Annotation + + + Host + DB Host + uima.cas.String + + + DB + Database + uima.cas.String + + + User + Username + uima.cas.String + + + Password + User password + uima.cas.String + + + Language + Wikipedia Language Versions + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink + Wikipedia link + uima.tcas.Annotation + + + LinkType + The type of the link, e.g. internal, external, image, ... + uima.cas.String + + + Target + The link target url + uima.cas.String + + + Anchor + The anchor of the link + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaRevision + Represents a revision in Wikipedia. + uima.tcas.Annotation + + + revisionId + The ID of the revision. + uima.cas.Integer + + + pageId + The pageId of the Wikipedia page of this revision. + uima.cas.Integer + + + contributorName + The username of the user/contributor who edited this revision. + uima.cas.String + + + comment + The comment that the editor entered for this revision. 
+ uima.cas.String + + + contributorId + The userId of the user/contributor who created this revision + uima.cas.Integer + + + timestamp + The timestamp of the revision, given in milliseconds since the standard base time + (January 1, 1970, 00:00:00 GMT) + + uima.cas.Long + + + minor + Whether this revision has been marked as minor edit by its contributor. + uima.cas.Boolean + + + + + de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution + An array representing the topic proportions in a document. + uima.tcas.Annotation + + + TopicProportions + Each topic's proportion in the document. + uima.cas.DoubleArray + + + TopicAssignment + Pointers to topics the document has been assigned to. + uima.cas.IntegerArray + + + + + de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding + An array representing the word embedding vector. + uima.tcas.Annotation + + + WordEmbedding + A word embedding vector. + uima.cas.FloatArray + + + + + de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + kana + + uima.cas.String + + + ibo + + uima.cas.String + + + kei + + uima.cas.String + + + dan + Specifies the kind of the verb if the current token is a verb. Either it is a vowel + stem verb (ichi-dan) or a consonant stem verb (go-dan). Blank if not a verb. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.performance.type.TimerAnnotation + Used for storing timing information (e.g. for performance testing). + uima.tcas.Annotation + + + startTime + + uima.cas.Long + + + endTime + + uima.cas.Long + + + name + The name of the timer. + Used to automatically determine whether this is an upstream or downstream timer. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore + + uima.tcas.Annotation + + + measureName + + uima.cas.String + + + score + + uima.cas.Double + + + + + org.dkpro.core.api.xml.type.XmlAttribute + + uima.cas.TOP + + + uri + Namespace URI of the attribute. 
+ uima.cas.String + + + localName + Local name of the attribute. + uima.cas.String + + + value + Value of the XML attribute. + uima.cas.String + + + qName + + uima.cas.String + + + valueType + + uima.cas.String + + + + + org.dkpro.core.api.xml.type.XmlDocument + XML document + uima.tcas.Annotation + + + root + Root element of the XML document. + org.dkpro.core.api.xml.type.XmlElement + + + + + org.dkpro.core.api.xml.type.XmlElement + XML element + org.dkpro.core.api.xml.type.XmlNode + + + uri + Namespace URI of the element. + uima.cas.String + + + localName + Local name of the XML element. + uima.cas.String + + + attributes + Array of attributes of the XML element. + uima.cas.FSArray + org.dkpro.core.api.xml.type.XmlAttribute + + + children + Children of this XML element. + uima.cas.FSArray + org.dkpro.core.api.xml.type.XmlNode + + + qName + + uima.cas.String + + + + + org.dkpro.core.api.xml.type.XmlNode + Supertype for XmlElements and XmlTextNodes. + uima.tcas.Annotation + + + parent + + org.dkpro.core.api.xml.type.XmlElement + + + + + org.dkpro.core.api.xml.type.XmlTextNode + XML text node. + org.dkpro.core.api.xml.type.XmlNode + + + text + + uima.cas.String + + + captured + Whether the text node has been added to the document text. + uima.cas.Boolean + + + + + org.dkpro.core.io.nift.metadata.ArticleMetaData + A document annotation that describes the metadata of a + newspaper article. + + uima.cas.AnnotationBase + + + guid + The GUID field specifies a (4-byte) integer that is + guaranteed + to be unique for every document + in the corpus. + + uima.cas.Integer + + + alternateUrl + This field specifies the location on nytimes.com of + the article. When present, this URL is preferred to the URL field + on articles published on or after April 02, + 2006, as the linked + page will have richer content. + + uima.cas.String + + + url + This field specifies the location on nytimes.com of + the article. 
The 'Alternative Url' + field is preferred to this field + on articles published on or after + April 02, 2006, as the + linked page + will have richer content. + + uima.cas.String + + + publicationDate + This field specifies the date of the article's + publication. This field is specified in the + format + YYYYMMDD'T'HHMMSS where: + 1. YYYY is the four-digit year. + 2. MM is + the two-digit month [01-12]. + 3. DD is the two-digit day [01-31]. + 4. + T is a constant value. + 5. HH is the two-digit hour [00-23]. + 6. MM is + the two-digit minutes-past-the-hour [00-59] + 7. SS is the two-digit + seconds-past-the-minute [00-59]. + Please note that values for HH,MM, + and SS are not defined for this + corpus, that is to say + HH,MM, and SS + are always defined to be '00'. + + uima.cas.String + + + typesOfMaterial + This field specifies a normalized list of terms + describing the general editorial category of the article. + These + tags are algorithmically assigned and + manually verified by + nytimes.com production staff. + Examples Include: + * REVIEW + * OBITUARY + * ANALYSIS + + uima.cas.StringArray + + + headline + This field specifies the headline of the article as it + appeared in the + print edition of the New York + Times. + + uima.cas.String + + + onlineHeadline + This field specifies the headline displayed with the + article on + nytimes.com. Often + this differs from the headline used in + print. + + uima.cas.String + + + columnName + If the article is part of a regular column, this field + specifies the + name of that column. + Sample Column Names: + 1. World News + Briefs + 2. WEDDINGS + 3. The Accessories Channel + + uima.cas.String + + + author + This field is based on the normalized byline in the + original corpus data: "The Normalized Byline field is the byline + normalized to the form (last name, first + name)". 
+ + uima.cas.String + + + descriptors + The 'descriptors' field specifies a list of + descriptive terms drawn + from a normalized controlled + vocabulary + corresponding to subjects mentioned in the article. These tags + are + hand-assigned by + a team of library scientists working in the New + York Times Indexing + service. + Examples Include: + * ECONOMIC CONDITIONS + AND TRENDS + * AIRPLANES + * VIOLINS + + uima.cas.StringArray + + + onlineDescriptors + This field specifies a list of descriptors from a + normalized + controlled + vocabulary that + correspond to topics mentioned + in the article. These + tags are + algorithmically + assigned and manually + verified by + nytimes.com production staff. + Examples Include: + * Marriages + * Parks and Other Recreation Areas + * Cooking and Cookbooks + + uima.cas.StringArray + + + generalOnlineDescriptors + The 'general online descriptors' field specifies a + list of descriptors that are at a higher level of + generality than + the other tags associated with the article. These tags are + algorithmically + assigned and manually verified by nytimes.com + production staff. + Examples Include: + * Surfing + * Venice Biennale + * Ranches + + uima.cas.String + + + onlineSection + This field specifies the section(s) on nytimes.com in + which the + article is placed. If + the article is placed in multiple + sections, this field will be + specified as a ';' delineated + list. + + uima.cas.String + + + section + This field specifies the section of the paper in which + the article + appears. This is not + the name of the section, but rather + a letter or number that indicates + the section. + + uima.cas.String + + + taxonomicClassifiers + This field specifies a list of taxonomic classifiers + that place this + article into a + hierarchy of articles. The individual + terms of each taxonomic classifier + are separated with the '/' character. 
+ These tags are algorithmically assigned and manually + verified + by nytimes.com production staff. + Examples Include: + * Top/Features/Travel/Guides/Destinations/North America/United States/Arizona + * Top/News/U.S./Rockies + * Top/Opinion + + uima.cas.StringArray + + + + + diff --git a/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml b/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml new file mode 100644 index 00000000..bdf518f0 --- /dev/null +++ b/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml @@ -0,0 +1,46 @@ + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + Marks a span of text as a privacy-sensitive entity detected by the anonymizer. + uima.tcas.Annotation + + + description + Human-readable description or replacement placeholder for the detected span. + uima.cas.String + + + suggestions + Suggested replacement actions for this anomaly. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + + + category + Privacy category of the detected span, e.g. private_person, private_email, private_phone. + uima.cas.String + + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + A suggested replacement for an anomaly span. + uima.tcas.Annotation + + + replacement + The text to substitute for the anomaly-covered span. + uima.cas.String + + + certainty + Confidence score in [0,1] for this replacement suggestion. 
+ uima.cas.Float + + + + + diff --git a/duui-anonymize/src/test/java/AnonymizeTests.java b/duui-anonymize/src/test/java/AnonymizeTests.java new file mode 100644 index 00000000..c292b5bc --- /dev/null +++ b/duui-anonymize/src/test/java/AnonymizeTests.java @@ -0,0 +1,577 @@ +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.xml.sax.SAXException; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver; +import org.dkpro.core.io.xmi.XmiWriter; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.junit.jupiter.api.Assertions.*; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collection; + +/** + * Integration tests for the DUUI anonymization annotator. + * + * Prerequisites: the Python service must be running on {@value #SERVICE_URL}. + * Start it with: + * uvicorn duui:app --host 0.0.0.0 --port 9714 --workers 1 + * + * Each test loads English text containing personally identifiable information (PII), + * sends it through the OPF anonymizer via DUUI, and asserts that + * {@link Anomaly} annotations are written to the CAS output view. 
+ */ +public class AnonymizeTests { + + static final String SERVICE_URL = "http://127.0.0.1:9714"; + + static final String RESULTS_DIR = "src/test/results"; + + static DUUIComposer composer; + static JCas cas; + + // ------------------------------------------------------------------- + // JUnit lifecycle + // ------------------------------------------------------------------- + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + Files.createDirectories(Paths.get(RESULTS_DIR)); + + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver().withDebug(false); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver, uimaDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + void afterEach(TestInfo testInfo) throws IOException, SAXException { + composer.resetPipeline(); + + String methodName = testInfo.getTestMethod() + .map(m -> m.getName()) + .orElseGet(() -> testInfo.getDisplayName().replaceAll("[^a-zA-Z0-9_]", "_")); + + String inputText = cas.getDocumentText() != null ? 
cas.getDocumentText() : ""; + String redactedText = extractRedactedText(); + Collection anomalies = collectAnomalies(); + + // Write XMI + ByteArrayOutputStream xmiBytes = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, xmiBytes); + String xmiString = xmiBytes.toString(StandardCharsets.UTF_8); + Files.writeString(Paths.get(RESULTS_DIR, methodName + ".xmi"), xmiString); + + // Write JSON summary for later comparison + String json = buildResultJson(methodName, inputText, redactedText, anomalies); + Files.writeString(Paths.get(RESULTS_DIR, methodName + ".json"), json); + + System.out.println("=== " + methodName + " ==="); + System.out.println(json); + + cas.reset(); + } + + // ------------------------------------------------------------------- + // Helpers + // ------------------------------------------------------------------- + + /** Populate the CAS with the given text and language code. */ + private static void createCas(String language, String text) throws UIMAException { + cas.setDocumentLanguage(language); + cas.setDocumentText(text); + } + + /** + * Collect all {@link Anomaly} annotations from the default CAS view. + * Anomalies are always indexed against the original document text so that + * their character offsets are valid. The "opf_redacted" view only carries + * the redacted sofa string and no annotations. + */ + private static Collection collectAnomalies() { + return JCasUtil.select(cas, Anomaly.class); + } + + /** Returns the sofa string of the opf_redacted view, or the default view's string. */ + private static String extractRedactedText() { + try { + JCas view = cas.getView("opf_redacted"); + String s = view.getSofaDataString(); + return s != null ? s : ""; + } catch (Exception ignored) { + String s = cas.getSofaDataString(); + return s != null ? 
s : ""; + } + } + + private static String buildResultJson( + String testName, + String inputText, + String redactedText, + Collection anomalies) { + + StringBuilder sb = new StringBuilder(); + sb.append("{\n"); + sb.append(" \"test\": ").append(jsonStr(testName)).append(",\n"); + sb.append(" \"input\": ").append(jsonStr(inputText)).append(",\n"); + sb.append(" \"output\": ").append(jsonStr(redactedText)).append(",\n"); + sb.append(" \"anomaly_count\": ").append(anomalies.size()).append(",\n"); + sb.append(" \"anomalies\": [\n"); + int idx = 0; + for (Anomaly a : anomalies) { + String spanText = ""; + try { + if (a.getBegin() >= 0 && a.getEnd() <= inputText.length()) { + spanText = inputText.substring(a.getBegin(), a.getEnd()); + } + } catch (Exception ignored) {} + sb.append(" {\n"); + sb.append(" \"begin\": ").append(a.getBegin()).append(",\n"); + sb.append(" \"end\": ").append(a.getEnd()).append(",\n"); + sb.append(" \"category\": ").append(jsonStr(a.getCategory())).append(",\n"); + sb.append(" \"description\": ").append(jsonStr(a.getDescription())).append(",\n"); + sb.append(" \"text\": ").append(jsonStr(spanText)).append("\n"); + sb.append(" }"); + if (++idx < anomalies.size()) sb.append(","); + sb.append("\n"); + } + sb.append(" ]\n"); + sb.append("}\n"); + return sb.toString(); + } + + private static String jsonStr(String s) { + if (s == null) return "null"; + return "\"" + s + .replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t") + + "\""; + } + + // ------------------------------------------------------------------- + // Mode tests + // ------------------------------------------------------------------- + + /** Placeholder mode (default): PII replaced with [category] tag in redacted_text and Anomaly description. 
*/ + @Test + @DisplayName("Placeholder mode: PII replaced with [category] tag") + void testPlaceholderMode() throws Exception { + String text = "Send the report to max.mustermann@uni-frankfurt.de by Friday."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "remove")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] category=%s description=%s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected at least one Anomaly for the email address"); + assertTrue(anomalies.stream().anyMatch(a -> { + String d = a.getDescription(); + return d != null && d.startsWith("[") && d.endsWith("]"); + }), "Anomaly description should be a bracketed [category] tag in placeholder mode"); + + String redacted = extractRedactedText(); + assertFalse(redacted.contains("max.mustermann@uni-frankfurt.de"), + "Redacted text should not contain the original email"); + assertTrue(redacted.contains("[private_email]") || redacted.contains("[private_person]"), + "Redacted text should contain a [category] replacement tag"); + } + + /** Remove mode: PII spans are deleted from redacted_text; Anomaly description is the original word. 
*/ + @Test + @DisplayName("Remove mode: PII deleted from redacted text") + void testRemoveMode() throws Exception { + String text = "Call John Smith at john.smith@company.com or +1-800-555-0199 for help."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "remove")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] category=%s text=%s%n", + a.getBegin(), a.getEnd(), a.getCategory(), text.substring(a.getBegin(), a.getEnd()))); + + assertFalse(anomalies.isEmpty(), "Expected anomalies in remove mode"); + + String redacted = extractRedactedText(); + System.out.printf(" original (%d): %s%n", text.length(), text); + System.out.printf(" redacted (%d): %s%n", redacted.length(), redacted); + assertTrue(redacted.length() < text.length(), + "Redacted text should be shorter after PII removal"); + // original PII tokens must be absent from the redacted string + for (Anomaly a : anomalies) { + String pii = text.substring(a.getBegin(), a.getEnd()); + assertFalse(redacted.contains(pii), + "Removed PII token '" + pii + "' should not appear in redacted text"); + } + } + + /** Pseudo mode: not yet supported - service returns input unchanged with no annotations. 
*/ + @Test + @DisplayName("Pseudo mode: not yet supported, returns input unchanged") + void testPseudoMode() throws Exception { + String text = "Alice and Bob met at the Frankfurt main station."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "pseudo")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count (pseudo mode): " + anomalies.size()); + assertTrue(anomalies.isEmpty(), + "Pseudo mode (unsupported stub) should produce no Anomaly annotations"); + } + + // ------------------------------------------------------------------- + // PII type tests (mode=placeholder so description = [category]) + // ------------------------------------------------------------------- + + /** private_person: full name in a simple sentence. */ + @Test + @DisplayName("Type: private_person") + void testTypePerson() throws Exception { + String text = "John Smith called the bank to report a fraud."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected at least one annotation"); + assertTrue(anomalies.stream().anyMatch(a -> "private_person".equals(a.getCategory())), + "Expected category 'private_person' for 'John Smith'"); + } + + /** private_email: plain email address. 
*/ + @Test + @DisplayName("Type: private_email") + void testTypeEmail() throws Exception { + String text = "Please contact alice@example.com for further assistance."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertTrue(anomalies.stream().anyMatch(a -> "private_email".equals(a.getCategory())), + "Expected category 'private_email' for 'alice@example.com'"); + } + + /** private_phone: international phone number. */ + @Test + @DisplayName("Type: private_phone") + void testTypePhone() throws Exception { + String text = "You can reach Dr. Miller at +49 69 1234 5678 during office hours."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected phone or person annotation"); + long phoneCount = anomalies.stream().filter(a -> "private_phone".equals(a.getCategory())).count(); + System.out.println("private_phone spans: " + phoneCount); + assertTrue(phoneCount > 0, "Expected category 'private_phone' for '+49 69 1234 5678'"); + } + + /** private_address: street address with postcode. 
*/ + @Test + @DisplayName("Type: private_address") + void testTypeAddress() throws Exception { + String text = "She lives at 742 Evergreen Terrace, Springfield, IL 62704."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long addrCount = anomalies.stream().filter(a -> "private_address".equals(a.getCategory())).count(); + System.out.println("private_address spans: " + addrCount); + assertTrue(addrCount > 0, "Expected category 'private_address' for the street address"); + } + + /** private_url: personal homepage URL. */ + @Test + @DisplayName("Type: private_url") + void testTypeUrl() throws Exception { + String text = "My personal page is at https://janedoe.personal-site.com/about and I post there."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long urlCount = anomalies.stream().filter(a -> "private_url".equals(a.getCategory())).count(); + System.out.println("private_url spans: " + urlCount); + assertTrue(urlCount > 0, "Expected category 'private_url' for the personal URL"); + } + + /** private_date: personally identifying date (e.g. birth date). 
*/ + @Test + @DisplayName("Type: private_date") + void testTypeDate() throws Exception { + String text = "Jane Doe was born on March 15, 1990 in Chicago."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected at least one annotation (person or date)"); + long dateCount = anomalies.stream().filter(a -> "private_date".equals(a.getCategory())).count(); + System.out.println("private_date spans: " + dateCount); + assertTrue(dateCount > 0, "Expected category 'private_date' for 'March 15, 1990'"); + } + + /** account_number: credit-card style number string. */ + @Test + @DisplayName("Type: account_number") + void testTypeAccountNumber() throws Exception { + String text = "Please transfer funds to account number 4532-0151-1283-0366 at Deutsche Bank."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long acctCount = anomalies.stream().filter(a -> "account_number".equals(a.getCategory())).count(); + System.out.println("account_number spans: " + acctCount); + assertTrue(acctCount > 0, "Expected category 'account_number' for the card number"); + } + + /** secret: API key / credential in text. 
*/ + @Test + @DisplayName("Type: secret") + void testTypeSecret() throws Exception { + String text = "The API key is sk-proj-abc123XYZ987 and the password is H@nt3r2secure!."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long secretCount = anomalies.stream().filter(a -> "secret".equals(a.getCategory())).count(); + System.out.println("secret spans: " + secretCount); + assertTrue(secretCount > 0, "Expected category 'secret' for API key / password"); + } + + // ------------------------------------------------------------------- + // Feature / combination tests + // ------------------------------------------------------------------- + + /** Multiple PII types in one document; verifies distinct categories are detected. */ + @Test + @DisplayName("Multiple PII types in one document") + void testMultiplePiiEntities() throws Exception { + String text = + "Patient: Jane Doe, DOB: 1985-03-22. " + + "Contact: jane.doe@hospital.org, Tel: 069-9876-5432. 
" + + "Address: 60325 Frankfurt am Main, Goethe-Platz 1."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = '%s'%n", + a.getBegin(), a.getEnd(), a.getCategory(), text.substring(a.getBegin(), a.getEnd()))); + + assertTrue(anomalies.size() >= 2, + "Expected at least 2 distinct PII annotations"); + + long distinctCategories = anomalies.stream().map(Anomaly::getCategory).distinct().count(); + System.out.println("Distinct categories: " + distinctCategories); + assertTrue(distinctCategories >= 2, + "Expected annotations from at least 2 different PII categories"); + } + + /** Smoke test with two PII types in one sentence. */ + @Test + @DisplayName("Smoke test: person + email in one sentence") + void testSimplePerson() throws Exception { + String text = "My name is Harry Potter and my email is harry.potter@hogwarts.edu."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), + "Expected at least one Anomaly annotation"); + } + + /** Ambiguous context where person identity is inferred from surrounding detail. 
*/ + @Test + @DisplayName("Complex context: identity inferred from description") + void testComplexContext() throws Exception { + String text = "His name is Harry, he works at the TTLAB in Frankfurt, " + + "he's the only Chinese guy in the office."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "remove")); // or remove/placeholder mode, should still detect the same spans + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected at least one annotation in complex context"); + } + + /** Selection window: only span offsets within [selBegin, selEnd] must be annotated. */ + @Test + @DisplayName("Selection window constrains annotation range") + void testSelectionWindow() throws Exception { + // window [9, 30] covers "John Adams at 555-0100" + String text = "Call Dr. John Adams at 555-0100 today."; + int selBegin = 9; + int selEnd = 30; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder") + .withParameter("selection_begin", String.valueOf(selBegin)) + .withParameter("selection_end", String.valueOf(selEnd))); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count (selection window): " + anomalies.size()); + for (Anomaly a : anomalies) { + assertTrue(a.getBegin() >= selBegin && a.getEnd() <= selEnd, + String.format("Anomaly [%d-%d] outside window [%d-%d]", + a.getBegin(), a.getEnd(), selBegin, selEnd)); + } + } + + /** Empty document must not throw and must return zero annotations. 
*/ + @Test + @DisplayName("Empty document produces no anomalies") + void testEmptyDocument() throws Exception { + createCas("en", ""); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + assertTrue(collectAnomalies().isEmpty(), + "An empty document should produce zero Anomaly annotations"); + } + + /** German text must not throw; detection quality may vary. */ + @Test + @DisplayName("German text does not cause an exception") + void testGermanText() throws Exception { + String text = "Herr Klaus Muller wohnt in der Goethestrasse 12, 60313 Frankfurt am Main."; + createCas("de", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + + assertDoesNotThrow(() -> composer.run(cas)); + System.out.println("German Anomaly count: " + collectAnomalies().size()); + } + + /** XMI round-trip: annotate and write to src/test/results/ for manual inspection. */ + @Test + @DisplayName("XMI output is written to src/test/results/") + void testXmiOutput() throws Exception { + String text = "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 
5, 10115 Berlin."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + + composer.add(new DUUIUIMADriver.Component( + createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, RESULTS_DIR, + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1" + ) + ).build()); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("XMI test Anomaly count: " + anomalies.size()); + assertFalse(anomalies.isEmpty(), + "Expected at least one Anomaly annotation for the PII-rich document"); + } +} diff --git a/duui-anonymize/src/test/resources/sample_pii_de.txt b/duui-anonymize/src/test/resources/sample_pii_de.txt new file mode 100644 index 00000000..391ff970 --- /dev/null +++ b/duui-anonymize/src/test/resources/sample_pii_de.txt @@ -0,0 +1,4 @@ +Frau Anna Müller (anna.mueller@beispiel.de) hat am 15. März 2024 angerufen. +Ihre Telefonnummer lautet 069-8765-4321. +Wohnadresse: Goethestraße 3, 60313 Frankfurt am Main. +Geburtsdatum: 12.07.1985. diff --git a/duui-anonymize/src/test/resources/sample_pii_en.txt b/duui-anonymize/src/test/resources/sample_pii_en.txt new file mode 100644 index 00000000..1acc74b6 --- /dev/null +++ b/duui-anonymize/src/test/resources/sample_pii_en.txt @@ -0,0 +1,4 @@ +John Smith called the helpdesk on Monday. +His email is john.smith@company.org and his phone number is +49 69 1234 5678. +He lives at Mainzer Landstraße 50, 60329 Frankfurt am Main. +Date of birth: 1978-11-04. Employee ID: EMP-00421. 
diff --git a/duui-anonymize/tests/test_communication_contract.py b/duui-anonymize/tests/test_communication_contract.py
new file mode 100644
index 00000000..d2df6286
--- /dev/null
+++ b/duui-anonymize/tests/test_communication_contract.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from pathlib import Path
+import unittest
+
+
+class CommunicationContractTests(unittest.TestCase):  # static checks on the Lua script's text; the script is never executed here
+    def test_lua_contract_mentions_text_options_selection_and_redacted_view(self) -> None:
+        lua_path = Path(__file__).resolve().parents[1] / "src/main/python/communication.lua"  # parents[1] is the directory that contains tests/
+        contents = lua_path.read_text(encoding="utf-8")
+
+        self.assertIn('text = text', contents)  # plain substring checks, so reformatting the Lua source can break them
+        self.assertIn('options = copy_options(params)', contents)
+        self.assertIn('selection = resolve_selection(params)', contents)
+        self.assertIn('createView("opf_redacted")', contents)
+        self.assertIn('detected_spans', contents)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/duui-anonymize/tests/test_duui_opf_core.py b/duui-anonymize/tests/test_duui_opf_core.py
new file mode 100644
index 00000000..ce71f6c5
--- /dev/null
+++ b/duui-anonymize/tests/test_duui_opf_core.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import unittest
+from pathlib import Path
+import sys
+
+
+ROOT = Path(__file__).resolve().parents[1]  # the directory that contains tests/
+sys.path.insert(0, str(ROOT / "src/main/python"))  # must run before the import below so duui_opf_core resolves without installation
+
+from duui_opf_core import (
+    DEFAULT_PLACEHOLDER,
+    DEFAULT_MODE,
+    PSEUDO_MODE,
+    RedactionSpan,
+    SelectionRange,
+    apply_replacement_text,
+    apply_selection,
+    compose_selection_output,
+    resolve_selection,
+    split_options,
+)
+
+
+class DuuiOpfCoreTests(unittest.TestCase):  # unit tests for the helper functions imported from duui_opf_core
+    def test_split_options_separates_service_and_decode_values(self) -> None:
+        service_options, decode_options, mode, placeholder = split_options(  # one flat dict in, a 4-tuple out
+            {
+                "model": "local-checkpoint",
+                "context_window_length": 128,
+                "trim_whitespace": False,
+                "device": "cpu",
+                "output_mode": "typed",
+                "discard_overlapping_predicted_spans": True,
+                "mode": PSEUDO_MODE,
+                "placeholder": "",
+                "decode_mode": "argmax",
+                "calibration_path": "/tmp/calibration.json",
+                "selection_begin": 2,
+                "selection_end": 8,
+            }
+        )
+
+        self.assertEqual(service_options["model"], "local-checkpoint")
+        self.assertEqual(service_options["device"], "cpu")
+        self.assertEqual(decode_options["decode_mode"], "argmax")
+        self.assertEqual(decode_options["viterbi_calibration_path"], "/tmp/calibration.json")  # input key "calibration_path" is expected under this name
+        self.assertEqual(mode, PSEUDO_MODE)
+        self.assertEqual(placeholder, "")
+
+    def test_resolve_selection_accepts_nested_or_flat_offsets(self) -> None:
+        nested = resolve_selection({"selection": {"begin": 4, "end": 9}}, text_length=20)  # nested form: a "selection" mapping
+        flat = resolve_selection({"selection_begin": 1, "selection_end": 3}, text_length=20)  # flat form: two top-level keys
+
+        self.assertEqual(nested, SelectionRange(begin=4, end=9))
+        self.assertEqual(flat, SelectionRange(begin=1, end=3))
+
+    def test_apply_replacement_text_uses_one_placeholder(self) -> None:
+        redacted = apply_replacement_text(  # no placeholder argument is passed here
+            "Alice called Bob.",
+            [
+                RedactionSpan(label="private_person", start=0, end=5, text="Alice"),
+                RedactionSpan(label="private_person", start=13, end=16, text="Bob"),
+            ],
+        )
+
+        self.assertEqual(redacted, f"{DEFAULT_PLACEHOLDER} called {DEFAULT_PLACEHOLDER}.")  # both spans receive the same DEFAULT_PLACEHOLDER
+
+    def test_apply_selection_and_compose_output(self) -> None:
+        selection = SelectionRange(begin=6, end=11)  # end is exclusive: 11 == len("hello world")
+        selected_text, offset = apply_selection("hello world", selection)
+
+        self.assertEqual(selected_text, "world")
+        self.assertEqual(offset, 6)  # the returned offset equals the selection begin
+        self.assertEqual(compose_selection_output("hello world", selection, "there"), "hello there")  # only the selected slice is replaced
+
+
+if __name__ == "__main__":
+    unittest.main()