diff --git a/duui-anonymize/.gitignore b/duui-anonymize/.gitignore
new file mode 100644
index 00000000..05703aca
--- /dev/null
+++ b/duui-anonymize/.gitignore
@@ -0,0 +1,14 @@
+/../*
+../
+../*
+../duui-mm/*
+.venv/**
+.vscode/**
+__pycache__/**
+*.pyc
+
+target/**
+dist/**
+build/**
+
+src/test/results/**
\ No newline at end of file
diff --git a/duui-anonymize/README.md b/duui-anonymize/README.md
new file mode 100644
index 00000000..8c005b75
--- /dev/null
+++ b/duui-anonymize/README.md
@@ -0,0 +1,39 @@
+#### OpenAI Privacy Filter component for DUUI
+
+OpenAI Privacy Filter: https://github.com/openai/privacy-filter
+
+#### Input/Output:
+
+input: Text in the Sofa. Optional selection offsets can be passed through Lua options.
+
+output: structured redaction spans and redacted text
+
+#### Output Shape:
+
+Privacy Filter detects 8 privacy span categories:
+
+- `account_number`
+- `private_address`
+- `private_email`
+- `private_person`
+- `private_phone`
+- `private_url`
+- `private_date`
+- `secret`
+
+The model emits BIOES token classes for these categories plus `O`, and the service turns the resulting spans into DUUI annotations and redacted text.
+
+#### Parameter:
+
+[optional] OPF redaction options such as `model`, `context_window_length`, `trim_whitespace`, `device`, `output_mode`, `decode_mode`, `discard_overlapping_predicted_spans`, `viterbi_calibration_path`, and selection offsets (`selection_begin` / `selection_end`).
+
+#### Modes:
+
+- `placeholder`: default mode, replaces each detected span with its bracketed category (e.g. `[private_person]`); `remove` deletes the span instead.
+- `pseudo`: kept as a stub / TODO mode and currently returns the input unchanged.
+- `mode` is passed through Lua options.
+
+#### Entry points:
+
+- `src/main/python/duui_opf.py`: new OPF entrypoint wrapper.
+- `src/main/python/duui_whisperx.py`: compatibility implementation file while the migration is in progress.
diff --git a/duui-anonymize/pom.xml b/duui-anonymize/pom.xml
new file mode 100644
index 00000000..3e7b0b79
--- /dev/null
+++ b/duui-anonymize/pom.xml
@@ -0,0 +1,138 @@
+
+
+ 4.0.0
+
+ org.texttechnology
+ duui-anonymize
+ 1.0-SNAPSHOT
+
+
+
+ AGPL-3.0-or-later
+ https://www.gnu.org/licenses/agpl.txt
+ repo
+ GNU Affero General Public License v3.0 or later
+
+
+
+
+ Texttechnology Lab
+ https://www.texttechnologylab.org
+
+
+
+
+ mehler
+ Prof. Dr. Alexander Mehler
+ mehler@em.uni-frankfurt.de
+ https://www.texttechnologylab.org/team/alexander-abrami/
+ Goethe University Frankfurt / Texttechnology Lab
+ https://www.texttechnologylab.org
+
+ head of department
+
+
+
+ aabusale
+ Ali Abusaleh
+ a.abusaleh@em.uni-frankfurt.de
+ https://www.texttechnologylab.org/team/ali-abusaleh/
+ Goethe University Frankfurt / Texttechnology Lab
+ https://www.texttechnologylab.org
+
+ Research assistant
+
+ Europe/Berlin
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 2.22.0
+
+
+ --illegal-access=permit
+ --add-opens java.base/java.util=ALL-UNNAMED
+
+
+
+
+
+
+
+ 21
+ 21
+ UTF-8
+ 2.4.0
+
+
+
+
+ jitpack.io
+ https://jitpack.io
+
+
+
+
+
+
+ org.dkpro.core
+ dkpro-core-asl
+ ${dkpro.core.version}
+ pom
+ import
+
+
+
+
+
+
+ com.github.texttechnologylab
+ DockerUnifiedUIMAInterface
+ fac60bef3f
+
+
+
+ com.github.texttechnologylab
+ UIMATypeSystem
+ 3.0.5
+
+
+
+ org.junit.jupiter
+ junit-jupiter
+ 5.9.0
+ test
+
+
+
+ org.dkpro.core
+ dkpro-core-api-anomaly-asl
+ test
+
+
+
+ org.dkpro.core
+ dkpro-core-api-segmentation-asl
+ test
+
+
+
+ org.dkpro.core
+ dkpro-core-io-xmi-asl
+ test
+
+
+
+ org.dkpro.core
+ dkpro-core-api-resources-asl
+ test
+
+
+
+
diff --git a/duui-anonymize/requirements.txt b/duui-anonymize/requirements.txt
new file mode 100644
index 00000000..49fb44b8
--- /dev/null
+++ b/duui-anonymize/requirements.txt
@@ -0,0 +1,14 @@
+numpy
+dkpro_cassis
+fastapi
+pydantic
+pydantic-settings
+pydantic_core
+starlette
+uvicorn
+torch
+torchvision
+torchaudio
+transformers
+accelerate
+setuptools
diff --git a/duui-anonymize/src/main/docker/Dockerfile b/duui-anonymize/src/main/docker/Dockerfile
new file mode 100644
index 00000000..660e32ce
--- /dev/null
+++ b/duui-anonymize/src/main/docker/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.10
+
+WORKDIR /usr/src/app
+EXPOSE 9714
+
+# Install dependencies before copying sources so code edits keep the pip layers cached.
+COPY ./requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY ./src/main/python/communication.lua ./communication.lua
+COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py
+COPY ./src/main/python/typesystem.xml ./typesystem.xml
+
+ENTRYPOINT ["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"]
+CMD ["--workers", "1"]
\ No newline at end of file
diff --git a/duui-anonymize/src/main/docker/Dockerfile-cuda b/duui-anonymize/src/main/docker/Dockerfile-cuda
new file mode 100644
index 00000000..fa72894c
--- /dev/null
+++ b/duui-anonymize/src/main/docker/Dockerfile-cuda
@@ -0,0 +1,26 @@
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+
+RUN apt update && \
+ DEBIAN_FRONTEND=noninteractive \
+ apt install --no-install-recommends -y build-essential software-properties-common && \
+ add-apt-repository -y ppa:deadsnakes/ppa && \
+ apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \
+ apt clean && rm -rf /var/lib/apt/lists/*
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+RUN python -m pip install --upgrade pip
+
+WORKDIR /usr/src/app
+EXPOSE 9714
+
+# Install dependencies before copying sources so code edits keep the pip layers cached.
+COPY ./requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY ./src/main/python/communication.lua ./communication.lua
+COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py
+COPY ./src/main/python/typesystem.xml ./typesystem.xml
+
+ENTRYPOINT ["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"]
+CMD ["--workers", "1"]
\ No newline at end of file
diff --git a/duui-anonymize/src/main/python/communication.lua b/duui-anonymize/src/main/python/communication.lua
new file mode 100644
index 00000000..038d7167
--- /dev/null
+++ b/duui-anonymize/src/main/python/communication.lua
@@ -0,0 +1,110 @@
+-- Bind static classes from java
+StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets")
+util = luajava.bindClass("org.apache.uima.fit.util.JCasUtil")
+
+-- Read a parameter from params regardless of whether it is a Lua table or a
+-- LuaJ-wrapped Java Map. Direct table indexing works for Lua tables; Java
+-- Map objects (HashMap, etc.) require params:get(key) instead.
+local function param_get(params, key)
+    if params == nil then return nil end
+    local direct = params[key]  -- plain Lua table lookup
+    if direct ~= nil then return tostring(direct) end
+    -- Fall back to the Java Map accessor; pcall absorbs values lacking get().
+    local ok, fetched = pcall(function() return params:get(key) end)
+    return (ok and fetched ~= nil) and tostring(fetched) or nil
+end
+
+-- Known option keys forwarded to the Python service.
+local OPTION_KEYS = {  -- selection keys are not listed here; resolve_selection reads them
+ "mode", "model", "device",
+ "context_window_length", "trim_whitespace",
+ "output_mode", "discard_overlapping_predicted_spans",
+}
+
+local function copy_options(params)
+    local forwarded = {}
+    print("Copying options:")
+    for index = 1, #OPTION_KEYS do
+        local name, found = OPTION_KEYS[index], param_get(params, OPTION_KEYS[index])
+        if found ~= nil then
+            -- Only options that are actually present are logged and forwarded.
+            print(" ", name, "=", found)
+            forwarded[name] = found
+        end
+    end
+    return forwarded
+end
+
+local function resolve_selection(params)
+    if params == nil then return nil end
+
+    -- selection passed as a nested table
+    local selection = params["selection"]
+    if selection == nil then
+        local ok, r = pcall(function() return params:get("selection") end)
+        if ok then selection = r end
+    end
+    if type(selection) == "table" then
+        local b = selection["begin"] or selection["start"]
+        local e = selection["end"] or selection["stop"]
+        if b ~= nil and e ~= nil then
+            return { begin = b, ["end"] = e }
+        end
+    end
+
+    -- selection passed as flat begin/end keys; tonumber yields nil for
+    -- non-numeric text, and a table holding nil bounds would reach the
+    -- service as an empty selection instead of being omitted
+    local b = param_get(params, "selection_begin") or param_get(params, "selection_start")
+    local e = param_get(params, "selection_end") or param_get(params, "selection_stop")
+    local nb, ne = b and tonumber(b), e and tonumber(e)
+    if nb and ne then return { begin = nb, ["end"] = ne } end
+    return nil
+end
+
+-- Serialize the CAS into a JSON request sent to the Python service.
+function serialize(inputCas, outputStream, params)
+    -- A CAS without sofa data is sent as an empty document.
+    local document = inputCas:getSofaDataString()
+    if document == nil then document = "" end
+
+    -- Options are gathered first so their debug output precedes selection lookup.
+    local forwarded = copy_options(params)
+    local window = resolve_selection(params)
+
+    local payload = { text = document, options = forwarded, selection = window }
+    outputStream:write(json.encode(payload))
+end
+
+-- Deserialize the JSON response from the Python service back into the CAS.
+--
+-- Anomaly annotations are added to the *original* CAS view so their
+-- character offsets remain valid against the original document text.
+-- The redacted text is stored as the sofa of a separate "opf_redacted" view.
+function deserialize(inputCas, inputStream)
+ local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)  -- decode the whole response as UTF-8
+ local results = json.decode(inputString)
+
+ -- Store redacted text in its own view.
+ if results["redacted_text"] ~= nil then
+ local ok, view = pcall(function() return inputCas:createView("opf_redacted") end)  -- a failing createView is swallowed and the text is then not stored
+ if ok and view ~= nil then
+ view:setSofaDataString(results["redacted_text"], "text/plain")
+ end
+ end
+
+ -- Add Anomaly annotations to the original view; offsets reference original text.
+ if results["detected_spans"] ~= nil then
+ for i, span in ipairs(results["detected_spans"]) do  -- the index i is not used
+ local anomaly = luajava.newInstance(
+ "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", inputCas)
+ anomaly:setBegin(span["start"])
+ anomaly:setEnd(span["end"])
+ anomaly:setCategory(span["label"])
+ -- description = non-empty placeholder, else the span text, else the label
+ anomaly:setDescription(
+ (span["placeholder"] ~= nil and span["placeholder"] ~= "") and span["placeholder"]
+ or span["text"] or span["label"])
+ anomaly:addToIndexes()
+ end
+ end
+end
diff --git a/duui-anonymize/src/main/python/duui_anonymize.py b/duui-anonymize/src/main/python/duui_anonymize.py
new file mode 100644
index 00000000..effe5371
--- /dev/null
+++ b/duui-anonymize/src/main/python/duui_anonymize.py
@@ -0,0 +1,281 @@
+from __future__ import annotations
+
+import logging
+import json
+from functools import lru_cache
+from typing import Any, List, Optional
+
+import torch
+import uvicorn
+from cassis import load_typesystem
+from fastapi import FastAPI, Request, Response
+from fastapi.encoders import jsonable_encoder
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse, PlainTextResponse
+from pydantic import BaseModel, Field, field_validator
+from pydantic_settings import BaseSettings
+from transformers import pipeline as hf_pipeline
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+DEFAULT_MODEL = "openai/privacy-filter"
+
+MODE_REMOVE = "remove"
+MODE_PLACEHOLDER = "placeholder" # default: replace with [category]
+MODE_PSEUDO = "pseudo" # TODO: not yet supported
+
+
+# ---------------------------------------------------------------------------
+# Pydantic models
+# ---------------------------------------------------------------------------
+
+class DetectedSpan(BaseModel):  # one detected span as returned in the response
+    label: str  # category, filled from the pipeline's entity_group
+    start: int  # begin offset in the full request text
+    end: int  # end offset in the full request text
+    text: str  # stripped "word" reported by the pipeline
+    placeholder: str  # replacement text used; empty string for remove mode
+
+
+class DUUIRequest(BaseModel):
+    """Payload produced by the Lua layer: document text, options, selection."""
+    text: str
+    options: dict[str, Any] = Field(default_factory=dict)
+    selection: Optional[dict] = None
+
+    @field_validator("options", mode="before")
+    @classmethod
+    def coerce_options(cls, v: Any) -> dict:
+        # Anything that is not a JSON object (null, list, scalar) means "no options".
+        return v if isinstance(v, dict) else {}
+
+    @field_validator("text", mode="before")
+    @classmethod
+    def coerce_text(cls, v: Any) -> str:
+        if v is None:
+            return ""
+        return str(v)
+
+
+class DUUIResponse(BaseModel):  # body returned by /v1/process
+    text: str  # the request text, unchanged
+    detected_spans: List[DetectedSpan]
+    redacted_text: str  # text after applying the selected mode
+    warning: Optional[str] = None  # set when pseudo mode returns the input unchanged
+
+
+class DUUIDocumentation(BaseModel):  # body returned by /v1/documentation
+    annotator_name: str  # filled from settings.duui_tool_name
+    version: str  # filled from settings.duui_tool_version
+    implementation_lang: str  # always "Python" for this service
+
+
+# ---------------------------------------------------------------------------
+# Settings
+# ---------------------------------------------------------------------------
+
+class Settings(BaseSettings):  # defaults for the documentation endpoint and model choice
+    duui_tool_name: str = "DUUI Anonymize"
+    duui_tool_version: str = "1.0"
+    default_model: str = DEFAULT_MODEL  # used when the request options carry no "model"
+
+
+settings = Settings()
+
+# ---------------------------------------------------------------------------
+# FastAPI app
+# ---------------------------------------------------------------------------
+
+app = FastAPI(
+    docs_url="/api",  # interactive docs are served under /api
+    redoc_url=None,  # no ReDoc page
+    title="DUUI Anonymize",
+    description="PII detection and redaction for TTLab DUUI using openai/privacy-filter",
+    version="1.0",
+    terms_of_service="https://www.texttechnologylab.org/legal_notice/",
+    contact={
+        "name": "Ali Abusaleh",
+        "url": "https://www.texttechnologylab.org",
+        "email": "abusaleh@em.uni-frankfurt.de",
+    },
+    license_info={
+        "name": "AGPL",
+        "url": "http://www.gnu.org/licenses/agpl-3.0.en.html",
+    },
+)
+
+
+@app.exception_handler(RequestValidationError)
+async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
+    """Log a rejected request and echo the errors plus raw body with HTTP 422."""
+    # NOTE(review): the raw body holds the un-redacted document; confirm logging it is acceptable.
+    raw_text = (await request.body()).decode("utf-8", errors="replace")
+    problems = exc.errors()
+    logger.error("422 validation errors: %s", problems)
+    logger.error("Raw body: %s", raw_text)
+    return JSONResponse(status_code=422, content=jsonable_encoder({"detail": problems, "body": raw_text}))
+
+
+# ---------------------------------------------------------------------------
+# Static assets
+# ---------------------------------------------------------------------------
+
+with open("communication.lua", "rb") as _f:  # path is relative to the working directory
+    _communication_lua: str = _f.read().decode("utf-8")
+
+with open("typesystem.xml", "rb") as _f:  # parsed once, when the module is imported
+    _typesystem = load_typesystem(_f)
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+@app.get("/v1/details/input_output")
+def get_input_output() -> JSONResponse:
+    """Report the annotation types this component consumes and produces."""
+    produced = ["de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly"]
+    details = {"inputs": [], "outputs": produced}
+    return JSONResponse(content=jsonable_encoder(details))
+
+
+@app.get("/v1/typesystem")
+def get_typesystem() -> Response:  # serves the loaded typesystem as UTF-8 encoded XML
+    return Response(content=_typesystem.to_xml().encode("utf-8"), media_type="application/xml")
+
+
+@app.get("/v1/communication_layer", response_class=PlainTextResponse)
+def get_communication_layer() -> str:  # returns the Lua script text read at import time
+    return _communication_lua
+
+
+@app.get("/v1/documentation")
+def get_documentation() -> DUUIDocumentation:
+    """Describe this annotator using the configured tool name and version."""
+    fields = {
+        "annotator_name": settings.duui_tool_name, "version": settings.duui_tool_version,
+    }
+    return DUUIDocumentation(implementation_lang="Python", **fields)
+
+
+@app.post("/v1/process")
+async def post_process(raw_request: Request) -> DUUIResponse:
+    """Validate the raw JSON body and run redaction; malformed payloads yield HTTP 422, not 500."""
+    body = await raw_request.body()
+    try:
+        request = DUUIRequest.model_validate(json.loads(body))
+    except ValueError as exc:  # JSONDecodeError and pydantic's ValidationError both subclass ValueError
+        raise RequestValidationError(exc.errors() if hasattr(exc, "errors") else [{"type": "json_invalid", "loc": ("body",), "msg": str(exc), "input": body}]) from exc
+    return _process(request)
+
+
+# ---------------------------------------------------------------------------
+# Business logic
+# ---------------------------------------------------------------------------
+
+@lru_cache(maxsize=4)
+def _load_pipeline(model: str, device: str):
+    """Build the token-classification pipeline, cached per (model, device) pair."""
+    # "cuda" keeps GPU 0; indexed names like "cuda:1" pass through instead of falling back to CPU.
+    dev = 0 if device == "cuda" else (device if device.startswith("cuda:") else -1)
+    logger.info("Loading pipeline: model=%s device=%s", model, device)
+    return hf_pipeline(
+        task="token-classification", model=model,
+        aggregation_strategy="simple", device=dev,
+    )
+
+
+def _resolve_selection(options: dict[str, Any], text_length: int) -> Optional[tuple[int, int]]:
+    """Read an optional (begin, end) window from options and bounds-check it."""
+    nested = options.get("selection")
+    if isinstance(nested, dict):
+        raw = (nested.get("begin", nested.get("start")),
+               nested.get("end", nested.get("stop")))
+    else:
+        raw = (options.get("selection_begin", options.get("selection_start")),
+               options.get("selection_end", options.get("selection_stop")))
+    if any(value is None for value in raw):
+        return None  # no complete selection was supplied
+    first, last = (int(value) for value in raw)
+    if not (0 <= first <= last <= text_length):
+        raise ValueError(f"selection must satisfy 0 <= begin <= end <= {text_length}")
+    return first, last
+
+
+def _build_redacted(text: str, spans: list[DetectedSpan], mode: str) -> str:
+    """Return text with each non-overlapping span replaced (placeholder mode) or dropped."""
+    if not spans:
+        return text
+    use_placeholder = mode == MODE_PLACEHOLDER
+    pieces: list[str] = []
+    consumed = 0
+    for item in sorted(spans, key=lambda s: s.start):
+        # Spans that start inside an already handled region are skipped.
+        if item.start >= consumed:
+            pieces.append(text[consumed:item.start])
+            if use_placeholder:
+                pieces.append(item.placeholder)
+            consumed = item.end
+    pieces.append(text[consumed:])
+    return "".join(pieces)
+
+
+def _process(request: DUUIRequest) -> DUUIResponse:
+    """Detect PII spans in the request text and build the redacted output.
+
+    Span offsets in the response refer to the full input text, also when only
+    a selected window was run through the model. Raises ValueError for a
+    non-numeric or out-of-range selection.
+    """
+    options = request.options
+    model = str(options.get("model", settings.default_model))
+    device = str(options.get("device") or ("cuda" if torch.cuda.is_available() else "cpu"))
+    mode = str(options.get("mode", MODE_PLACEHOLDER))
+    if mode == "replacement":
+        # README and the OPF helper modules call the default mode "replacement";
+        # unaliased it would hit the removal branch and silently delete the text.
+        mode = MODE_PLACEHOLDER
+
+    logger.info("Processing request: model=%s device=%s mode=%s text_length=%d",
+                model, device, mode, len(request.text))
+
+    # pseudo mode - not yet supported
+    if mode == MODE_PSEUDO:
+        return DUUIResponse(
+            text=request.text,
+            detected_spans=[],
+            redacted_text=request.text,
+            warning="pseudo mode is not yet supported - input returned unchanged",
+        )
+
+    if not request.text:
+        return DUUIResponse(text="", detected_spans=[], redacted_text="")
+
+    # The Lua layer sends the selection as a top-level request field, so it
+    # takes precedence; selection keys inside options remain a fallback.
+    selection_source = {"selection": request.selection} if request.selection else options
+    sel = _resolve_selection(selection_source, text_length=len(request.text))
+    selected_text = request.text[sel[0]:sel[1]] if sel else request.text
+    offset = sel[0] if sel else 0
+
+    pipe = _load_pipeline(model, device)
+    spans = [
+        DetectedSpan(
+            label=item["entity_group"],
+            start=int(item["start"]) + offset,
+            end=int(item["end"]) + offset,
+            text=str(item["word"]).strip(),
+            placeholder=f"[{item['entity_group']}]" if mode == MODE_PLACEHOLDER else "",
+        )
+        for item in pipe(selected_text)
+    ]
+
+    # Spans carry full-text offsets, so a single pass over the whole text also
+    # covers the selection case (nothing outside the window has a span).
+    redacted_text = _build_redacted(request.text, spans, mode)
+    return DUUIResponse(text=request.text, detected_spans=spans, redacted_text=redacted_text)
+
+
+if __name__ == "__main__":  # direct execution serves the same app and port as the Docker entrypoint
+    uvicorn.run("duui_anonymize:app", host="0.0.0.0", port=9714, workers=1)
diff --git a/duui-anonymize/src/main/python/duui_opf.py b/duui-anonymize/src/main/python/duui_opf.py
new file mode 100644
index 00000000..5ff804bd
--- /dev/null
+++ b/duui-anonymize/src/main/python/duui_opf.py
@@ -0,0 +1,3 @@
+from duui_anonymize import app
+
+__all__ = ["app"]
diff --git a/duui-anonymize/src/main/python/duui_opf_core.py b/duui-anonymize/src/main/python/duui_opf_core.py
new file mode 100644
index 00000000..88670158
--- /dev/null
+++ b/duui-anonymize/src/main/python/duui_opf_core.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Mapping
+
+DEFAULT_MODE = "replacement"
+PSEUDO_MODE = "pseudo"
+DEFAULT_PLACEHOLDER = ""
+
+_SELECTION_KEYS = {
+ "selection",
+ "selection_begin",
+ "selection_end",
+ "selection_start",
+ "selection_stop",
+}
+
+_SERVICE_OPTION_KEYS = {
+ "model",
+ "context_window_length",
+ "trim_whitespace",
+ "device",
+ "output_mode",
+ "discard_overlapping_predicted_spans",
+ "mode",
+ "placeholder",
+}
+
+_DECODE_OPTION_KEYS = {
+ "decode_mode",
+ "viterbi_calibration_path",
+ "calibration_path",
+}
+
+
+@dataclass(frozen=True)
+class SelectionRange:  # window of the source text; built by _validate_selection
+    begin: int  # slice start, validated as >= 0
+    end: int  # slice stop, validated as >= begin and <= text length
+
+
+@dataclass(frozen=True)
+class RedactionSpan:  # one span to redact; start/end are used as slice bounds
+    label: str
+    start: int
+    end: int
+    text: str
+    placeholder: str = DEFAULT_PLACEHOLDER  # not read by apply_replacement_text, which takes its own argument
+
+
+def split_options(
+    options: Mapping[str, Any],
+) -> tuple[dict[str, Any], dict[str, Any], str, str]:
+    """Partition raw options into (service options, decode options, mode, placeholder).
+
+    Selection keys, ``decode`` and unrecognized keys are dropped; ``calibration_path``
+    is stored under ``viterbi_calibration_path``.
+    """
+    service: dict[str, Any] = {}
+    decode: dict[str, Any] = {}
+    mode, placeholder = DEFAULT_MODE, DEFAULT_PLACEHOLDER
+
+    for name, raw in options.items():
+        if name in _SELECTION_KEYS or name == "decode":
+            continue
+        if name == "mode":
+            mode = str(raw)
+        elif name == "placeholder":
+            placeholder = str(raw)
+        elif name in _SERVICE_OPTION_KEYS:
+            service[name] = raw
+        elif name in _DECODE_OPTION_KEYS:
+            target = "viterbi_calibration_path" if name == "calibration_path" else name
+            decode[target] = raw
+
+    return service, decode, mode, placeholder
+
+
+def resolve_selection(
+    options: Mapping[str, Any],
+    *,
+    text_length: int,
+) -> SelectionRange | None:
+    """Return the validated selection from a nested mapping or the flat keys, if any."""
+    nested = options.get("selection")
+    if isinstance(nested, Mapping):
+        # A nested mapping is authoritative: the flat keys are not consulted.
+        bounds = (nested.get("begin", nested.get("start")),
+                  nested.get("end", nested.get("stop")))
+    else:
+        bounds = (options.get("selection_begin", options.get("selection_start")),
+                  options.get("selection_end", options.get("selection_stop")))
+
+    if bounds[0] is None or bounds[1] is None:
+        return None
+    return _validate_selection(bounds[0], bounds[1], text_length=text_length)
+
+
+def _validate_selection(
+    begin: Any,
+    end: Any,
+    *,
+    text_length: int,
+) -> SelectionRange:
+    """Coerce both bounds to int and require 0 <= begin <= end <= text_length."""
+    lower, upper = int(begin), int(end)
+    if not (0 <= lower <= upper <= text_length):
+        raise ValueError("selection must satisfy 0 <= begin <= end <= text length")
+    return SelectionRange(begin=lower, end=upper)
+
+
+def apply_replacement_text(
+    text: str,
+    spans: list[RedactionSpan],
+    *,
+    placeholder: str = DEFAULT_PLACEHOLDER,
+) -> str:
+    """Replace every non-overlapping span of text with the one shared placeholder."""
+    if not spans:
+        return text
+
+    pieces: list[str] = []
+    consumed = 0
+    for current in sorted(spans, key=lambda item: (item.start, item.end)):
+        if current.start >= consumed:  # skip spans that begin inside a redacted region
+            pieces += [text[consumed:current.start], placeholder]
+            if current.end > consumed:
+                consumed = current.end
+    pieces.append(text[consumed:])
+    return "".join(pieces)
+
+
+def apply_selection(
+    text: str,
+    selection: SelectionRange | None,
+) -> tuple[str, int]:
+    """Return the text to analyse and its start offset (whole text, 0 without a selection)."""
+    window = (text, 0) if selection is None else (text[selection.begin:selection.end], selection.begin)
+    return window
+
+
+def compose_selection_output(
+    text: str,
+    selection: SelectionRange | None,
+    replacement: str,
+) -> str:
+    """Splice replacement into text in place of the selected window, if there is one."""
+    has_window = selection is not None
+    return text[:selection.begin] + replacement + text[selection.end:] if has_window else replacement
diff --git a/duui-anonymize/src/main/python/duui_whisperx.py b/duui-anonymize/src/main/python/duui_whisperx.py
new file mode 100644
index 00000000..05fdc719
--- /dev/null
+++ b/duui-anonymize/src/main/python/duui_whisperx.py
@@ -0,0 +1,430 @@
+from functools import lru_cache
+import json
+from enum import Enum
+from typing import Any, List
+
+import torch
+import uvicorn
+from cassis import load_typesystem
+from fastapi import FastAPI, Response
+from fastapi.encoders import jsonable_encoder
+from fastapi.responses import PlainTextResponse
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings
+from starlette.responses import JSONResponse
+
+from opf import DecodeOptions, OPF
+
+
+class DetectedSpan(BaseModel):
+    """One detected privacy span returned by OPF."""
+
+    label: str
+    start: int  # _detect_spans adds the selection offset to this value
+    end: int  # _detect_spans adds the selection offset to this value
+    text: str
+    placeholder: str
+
+
+class SelectionRange(BaseModel):
+    """Optional text selection inside the source document."""
+
+    begin: int  # used as slice start into the request text
+    end: int  # used as slice stop into the request text
+
+
+class DUUIRequest(BaseModel):
+    """Request sent by DUUI and transformed by the Lua communication layer."""
+
+    text: str
+    options: dict[str, Any] = Field(default_factory=dict)  # raw options; split later by _split_options
+    selection: SelectionRange | None = None  # takes precedence over selection keys in options
+
+
+class DUUIResponse(BaseModel):
+    """Response of this annotator."""
+
+    schema_version: int  # 1 for locally built responses, otherwise taken from the OPF result
+    summary: dict[str, Any]  # always carries the "mode" key
+    text: str  # the full request text, unchanged
+    detected_spans: List[DetectedSpan]
+    redacted_text: str
+    warning: str | None = None
+    selection: SelectionRange | None = None  # echoes the selection that was applied
+
+
+class DUUIDocumentation(BaseModel):
+    """Documentation response."""
+
+    annotator_name: str  # filled from settings.duui_tool_name
+    version: str  # filled from settings.duui_tool_version
+    implementation_lang: str  # always "Python" for this service
+
+
+class Settings(BaseSettings):
+    """Runtime settings for the DUUI service."""
+
+    duui_tool_name: str = "OpenAI Privacy Filter"
+    duui_tool_version: str = "1.0"
+    default_model: str | None = None  # passed to OPF when the options carry no "model"
+
+
+class RedactionMode(str, Enum):  # the two mode strings this module recognizes
+    REPLACEMENT = "replacement"  # default mode (see DEFAULT_MODE)
+    PSEUDO = "pseudo"  # stub mode: input is returned unchanged
+
+
+class PrivacyFilterService:
+    """Class-based service wrapper for OPF redaction."""
+
+    def __init__(self, settings: Settings) -> None:
+        self.settings = settings  # stored only; the methods below do not read it
+
+    def split_options(self, options: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
+        return _split_options(options)  # thin delegation to the module-level helper
+
+    def selection_from_options(
+        self,
+        request_selection: SelectionRange | None,
+        options: dict[str, Any],
+        *,
+        text_length: int,
+    ) -> SelectionRange | None:
+        return _selection_from_options(  # note: the helper pops selection keys from options
+            request_selection,
+            options,
+            text_length=text_length,
+        )
+
+    def redact_text(
+        self,
+        text: str,
+        request_selection: SelectionRange | None,
+        options: dict[str, Any],
+    ) -> DUUIResponse:
+        return _redact_text(text, request_selection, options)  # thin delegation
+
+
+settings = Settings()
+service = PrivacyFilterService(settings)  # single shared instance used by post_process
+DEFAULT_PLACEHOLDER = ""  # with this default, replaced spans become empty strings
+DEFAULT_MODE = RedactionMode.REPLACEMENT.value
+PSEUDO_MODE = RedactionMode.PSEUDO.value
+
+
+app = FastAPI(
+    docs_url="/api",
+    redoc_url=None,
+    title="OpenAI Privacy Filter",
+    description="Text privacy redaction for TTLab DUUI",
+    version="1.0",
+    terms_of_service="https://www.texttechnologylab.org/legal_notice/",
+    contact={
+        "name": "Daniel Bundan",
+        "url": "https://bundan.me",  # must be an absolute URL, or OpenAPI schema generation fails validation
+        "email": "s1486849@stud.uni-frankfurt.de",
+    },
+    license_info={
+        "name": "AGPL",
+        "url": "http://www.gnu.org/licenses/agpl-3.0.en.html",
+    },
+)
+
+communication = "communication.lua"
+with open(communication, 'rb') as f:
+    communication = f.read().decode("utf-8")  # the name is rebound from file path to file content
+
+
+# Load the predefined typesystem that is needed for this annotator to work
+typesystem_filename = 'typesystem.xml'
+with open(typesystem_filename, 'rb') as f:
+    typesystem = load_typesystem(f)
+
+
+# Get input / output of the annotator
+@app.get("/v1/details/input_output")
+def get_input_output() -> JSONResponse:
+    """Report the annotation types this annotator takes as input and emits."""
+    produced_types = ["de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly"]
+    description = {"inputs": [], "outputs": produced_types}
+
+    # The dict goes through jsonable_encoder before being handed to JSONResponse.
+    encoded = jsonable_encoder(description)
+    return JSONResponse(content=encoded)
+
+
+# Get typesystem of this annotator
+@app.get("/v1/typesystem")
+def get_typesystem() -> Response:
+    """Serve the loaded typesystem serialized as UTF-8 encoded XML."""
+    # TODO remove cassis dependency, as only needed for typesystem at the moment?
+    serialized = typesystem.to_xml()
+
+    return Response(
+        media_type="application/xml",
+        content=serialized.encode("utf-8"),
+    )
+
+
+# Return Lua communication script
+@app.get("/v1/communication_layer", response_class=PlainTextResponse)
+def get_communication_layer() -> str:  # returns the Lua script text read at import time
+    return communication
+
+
+# Return documentation info
+@app.get("/v1/documentation")
+def get_documentation() -> DUUIDocumentation:
+    """Describe this annotator using the configured tool name and version."""
+    # Name and version come from settings; the language is fixed.
+    return DUUIDocumentation(
+        implementation_lang="Python",
+        version=settings.duui_tool_version,
+        annotator_name=settings.duui_tool_name,
+    )
+
+
+def _selection_from_options(
+    request_selection: SelectionRange | None,
+    options: dict[str, Any],
+    *,
+    text_length: int,
+) -> SelectionRange | None:
+    """Resolve the selection from the request field or else from options.
+
+    Selection keys in options are popped; invalid bounds raise ValueError.
+    """
+    if request_selection is not None:
+        begin, end = request_selection.begin, request_selection.end
+    else:
+        selection = options.pop("selection", None)
+        if isinstance(selection, dict):
+            begin = selection.get("begin", selection.get("start"))  # start/stop aliases, as for the flat keys
+            end = selection.get("end", selection.get("stop"))
+        else:
+            begin = options.pop("selection_begin", options.pop("selection_start", None))
+            end = options.pop("selection_end", options.pop("selection_stop", None))
+    if begin is None or end is None:
+        return None
+    begin, end = int(begin), int(end)
+    if begin < 0 or end < begin or end > text_length:
+        raise ValueError("selection must satisfy 0 <= begin <= end <= text length")
+    return SelectionRange(begin=begin, end=end)
+
+
+def _json_key(payload: dict[str, Any]) -> str:  # canonical JSON (sorted keys, compact) used as a hashable cache key
+    return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str)
+
+
+def _split_options(options: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
+    """Separate raw options into redactor options and decode options.
+
+    Selection keys, ``decode``, ``output_text_only`` and unknown keys are dropped;
+    ``calibration_path`` is stored as ``viterbi_calibration_path``.
+    """
+    redactor_keys = {
+        "model",
+        "context_window_length",
+        "trim_whitespace",
+        "device",
+        "output_mode",
+        "discard_overlapping_predicted_spans",
+        "mode",
+        "placeholder",
+    }
+    calibration_aliases = {"viterbi_calibration_path", "calibration_path"}
+    redactor_options: dict[str, Any] = {}
+    decode_options: dict[str, Any] = {}
+
+    for name, setting in options.items():
+        if name in redactor_keys:
+            redactor_options[name] = setting
+        elif name == "decode_mode":
+            decode_options[name] = setting
+        elif name in calibration_aliases:
+            decode_options["viterbi_calibration_path"] = setting
+        # Everything else (selection keys, "decode", "output_text_only", unknown
+        # names) is intentionally ignored.
+
+    return redactor_options, decode_options
+
+
+@lru_cache(maxsize=8)
+def _build_redactor(options_json: str) -> OPF:
+    """Create an OPF redactor, cached per distinct options JSON."""
+    options = json.loads(options_json)
+    device = options.get("device")
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+    def flag(name: str, default: bool) -> bool:
+        # The Lua layer sends "true"/"false" strings; bool("false") would be True.
+        value = options.get(name, default)
+        return value.strip().lower() in {"1", "true", "yes", "on"} if isinstance(value, str) else bool(value)
+    return OPF(
+        model=options.get("model", settings.default_model),
+        context_window_length=options.get("context_window_length"),
+        trim_whitespace=flag("trim_whitespace", True),
+        device=device,
+        output_mode=options.get("output_mode", "typed"),
+        discard_overlapping_predicted_spans=flag("discard_overlapping_predicted_spans", False),
+        output_text_only=False,
+    )
+
+
+def _compose_replacement_text(
+    text: str,
+    spans: list[DetectedSpan],
+    *,
+    placeholder: str = DEFAULT_PLACEHOLDER,
+) -> str:
+    """Swap each non-overlapping span of text for the given placeholder."""
+    if not spans:
+        return text
+    ordered = sorted(spans, key=lambda item: (item.start, item.end))
+    chunks: list[str] = []
+    position = 0
+    for entry in ordered:
+        if entry.start < position:
+            continue  # overlaps a span that was already replaced
+        chunks.extend((text[position:entry.start], placeholder))
+        position = max(position, entry.end)
+    chunks.append(text[position:])
+    return "".join(chunks)
+
+
+def _detect_spans(payload: Any, *, offset: int = 0) -> list[DetectedSpan]:
+    """Convert OPF span records into DetectedSpan models.
+
+    Each record may be a dict or an object exposing the same names as
+    attributes. ``offset`` is added to start and end so the offsets refer to
+    the full document rather than to the selected window.
+
+    Missing ``text`` / ``placeholder`` values become empty strings; passing
+    None through ``str()`` would store the literal string "None".
+    """
+
+    def read(span: Any, name: str) -> Any:
+        return span.get(name) if isinstance(span, dict) else getattr(span, name, None)
+
+    def text_of(value: Any) -> str:
+        return "" if value is None else str(value)
+
+    return [
+        DetectedSpan(
+            label=str(read(span, "label")),
+            start=int(read(span, "start")) + offset,
+            end=int(read(span, "end")) + offset,
+            text=text_of(read(span, "text")),
+            placeholder=text_of(read(span, "placeholder")),
+        )
+        for span in payload
+    ]
+
+
+def _render_pseudo_response(
+    *,
+    text: str,
+    request_selection: SelectionRange | None,
+    options: dict[str, Any],
+) -> DUUIResponse:
+    """Build the stub response for pseudo mode: no spans, text returned as-is."""
+    # options is accepted for signature compatibility but is not consulted.
+    # The summary mirrors the shape used elsewhere, with zero spans.
+    return DUUIResponse(
+        schema_version=1,
+        summary={
+            "mode": PSEUDO_MODE, "span_count": 0,
+            "by_label": {}, "decoded_mismatch": False,
+        },
+        text=text,
+        detected_spans=[],
+        redacted_text=text,
+        warning="pseudo mode is a stub and returns the input unchanged",
+        selection=request_selection,
+    )
+
+
+def _redact_text(text: str, request_selection: SelectionRange | None, options: dict[str, Any]) -> DUUIResponse:
+    """Run OPF over the text (or its selected window) and assemble the DUUI response.
+
+    Span offsets and redacted_text in the response refer to the full text.
+    """
+    constructor_options, decode_options = _split_options(options)
+    mode = str(constructor_options.get("mode", DEFAULT_MODE))
+    placeholder = str(constructor_options.get("placeholder", DEFAULT_PLACEHOLDER))
+
+    if mode == PSEUDO_MODE:
+        return _render_pseudo_response(
+            text=text,
+            request_selection=request_selection,
+            options=constructor_options,
+        )
+
+    # mode and placeholder never reach the OPF constructor, so keep them out of
+    # the cache key; otherwise each distinct placeholder would load the model
+    # again and evict another entry from the small redactor cache.
+    redactor_key = {
+        key: value
+        for key, value in constructor_options.items()
+        if key not in {"mode", "placeholder"}
+    }
+    redactor = _build_redactor(_json_key(redactor_key))
+    decode = DecodeOptions(**decode_options) if decode_options else None
+
+    selected_text = text
+    selection_offset = 0
+    if request_selection is not None:
+        selection_offset = request_selection.begin
+        selected_text = text[request_selection.begin:request_selection.end]
+
+    result = redactor.redact(selected_text, decode=decode)
+
+    if isinstance(result, str):
+        # Text-only result: no span data, so splice the redacted window back in.
+        redacted_text = result if request_selection is None else (
+            text[:request_selection.begin] + result + text[request_selection.end:]
+        )
+        return DUUIResponse(
+            schema_version=1,
+            summary={
+                "mode": mode,
+                "span_count": 0,
+                "by_label": {},
+                "decoded_mismatch": False,
+            },
+            text=text,
+            detected_spans=[],
+            redacted_text=redacted_text,
+            warning=None,
+            selection=request_selection,
+        )
+
+    # Offsets are shifted to full-document coordinates, so composing over the
+    # whole text leaves everything outside the selection untouched.
+    detected_spans = _detect_spans(result.detected_spans, offset=selection_offset)
+    redacted_text = _compose_replacement_text(text, detected_spans, placeholder=placeholder)
+
+    return DUUIResponse(
+        schema_version=int(result.schema_version),
+        summary={**dict(result.summary), "mode": mode},
+        text=text,
+        detected_spans=detected_spans,
+        redacted_text=redacted_text,
+        warning=result.warning,
+        selection=request_selection,
+    )
+
+
+# Process request from DUUI
+@app.post("/v1/process")
+def post_process(request: DUUIRequest) -> DUUIResponse:
+    """Resolve the selection, then redact; each call gets its own copy of the options."""
+    # Selection resolution pops keys, so it works on a throwaway copy.
+    window = service.selection_from_options(
+        request.selection, dict(request.options), text_length=len(request.text),
+    )
+    return service.redact_text(request.text, window, dict(request.options))
+
+
+if __name__ == "__main__":  # NOTE(review): this launches "duui_opf:app", not the app defined in this module - confirm intended
+    uvicorn.run("duui_opf:app", host="0.0.0.0", port=9714, workers=1)
diff --git a/duui-anonymize/src/main/python/typesystem.xml b/duui-anonymize/src/main/python/typesystem.xml
new file mode 100644
index 00000000..5032b07b
--- /dev/null
+++ b/duui-anonymize/src/main/python/typesystem.xml
@@ -0,0 +1,2202 @@
+
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly
+
+ uima.tcas.Annotation
+
+
+ description
+
+ uima.cas.String
+
+
+ suggestions
+ An array of the suggested actions to be taken for this anomaly.
+ uima.cas.FSArray
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction
+
+
+ category
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction
+
+ uima.tcas.Annotation
+
+
+ replacement
+ The text covered by the Anomaly annotation should be replaced with the contents of this
+ feature.
+
+ uima.cas.String
+
+
+ certainty
+ A score representing how certain this suggested action is.
+ Usually in [0,1].
+
+ uima.cas.Float
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain
+ Marks the beginning of a chain.
+ uima.cas.AnnotationBase
+
+
+ first
+ This is the first coreference link in the coreference chain
+ de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink
+ A link in the coreference chain.
+ uima.tcas.Annotation
+
+
+ next
+ If there is one, it is the next coreference link to the current coreference link
+
+ de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink
+
+
+ referenceType
+ The role or type which the covered text has in the coreference chain.
+ uima.cas.String
+
+
+ referenceRelation
+ The type of relation between this link and the next link in the chain.
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf
+ Annotates the tf.idf score of a token, stem, or lemma.
+ uima.tcas.Annotation
+
+
+ tfidfValue
+ The tf.idf score.
+ uima.cas.Double
+
+
+ term
+ The string that was used to compute this tf.idf score.
+ If a stem or lemma was used, the covered text of this annotation does not need to be equal to
+ this string.
+
+ This string can be used to construct a vector space with the right terms without having to
+ access the indexes again.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme
+
+ uima.tcas.Annotation
+
+
+ morphTag
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures
+ Morphological categories that can be attached to tokens.
+
+ The features are supposed to match the Universal Dependency v1 features.
+
+ uima.tcas.Annotation
+
+
+ gender
+
+ uima.cas.String
+
+
+ number
+ Singular/plural
+ uima.cas.String
+
+
+ case
+ Nouns: nominative, genitive, dative, ...
+ uima.cas.String
+
+
+ degree
+ Adjectives: comparative/Superlative
+ uima.cas.String
+
+
+ verbForm
+
+ uima.cas.String
+
+
+ tense
+ Verbs: past tense, present tense, future tense, etc.
+ uima.cas.String
+
+
+ mood
+ Verbs: indicative, imperative, subjunctive
+ uima.cas.String
+
+
+ voice
+ Verbs: active/passive
+ uima.cas.String
+
+
+ definiteness
+ Definite or indefinite
+ uima.cas.String
+
+
+ value
+ The original morphological analysis results as produced by a tool or as recorded in a
+ corpus (if available). If the categories were originally encoded in such a string, the other
+ features are filled by analyzing this string. If the categories were provided separately, e.g.
+ by different attributes in an XML-encoded corpus, this field may remain empty.
+
+ uima.cas.String
+
+
+ person
+ Verbs: 1st, 2nd, 3rd person
+ uima.cas.String
+
+
+ aspect
+ Verbs: perfective, imperfective
+ uima.cas.String
+
+
+ animacy
+
+ uima.cas.String
+
+
+ negative
+
+ uima.cas.String
+
+
+ numType
+
+ uima.cas.String
+
+
+ possessive
+
+ uima.cas.String
+
+
+ pronType
+
+ uima.cas.String
+
+
+ reflex
+
+ uima.cas.String
+
+
+ transitivity
+ Verbs: transitive/intransitive
+
+ @deprecated
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+ The part of speech of a word or a phrase.
+ uima.tcas.Annotation
+
+
+ PosValue
+ Fine-grained POS tag. This is the tag as produced by a POS tagger or obtained from a
+ reader.
+
+ uima.cas.String
+
+
+ coarseValue
+ Coarse-grained POS tag. This may be produced by a POS tagger or reader in addition to
+ the fine-grained tag.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADJ
+ Adjective
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADP
+ Adposition
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADV
+ Adverb
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_AUX
+ Auxiliary verb
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_CONJ
+ Conjunction
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_DET
+ Determiner
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_INTJ
+ Interjection
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN
+ Noun
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NUM
+ Numeral
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PART
+ Particle
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PRON
+ Pronoun
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PROPN
+ Proper noun
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PUNCT
+ Punctuation
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_SCONJ
+ Subordinating conjunction
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_SYM
+ Symbol
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_VERB
+ Verb
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X
+ Other
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_AT
+ at-mention (indicates another user as a recipient of a tweet)
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_DM
+ discourse marker, indications of continuation of a message across multiple tweets
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_EMO
+ emoticon
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_HASH
+ Hashtag (indicates topic/category for tweet)
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_INT
+ proper noun + verbal
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_NNV
+ nominal + verbal
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_NPV
+ proper noun + verbal
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_URL
+ URL or email address
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField
+ <p>A general purpose annotation to store document-wide information in the form of
+ arbitrary key-value string pairs.</p>
+
+ uima.tcas.Annotation
+
+
+ key
+ Name of a metadata field.
+ uima.cas.String
+
+
+ value
+ The field value.
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription
+ Description of an individual tag.
+ uima.cas.TOP
+
+
+ name
+ The name of the tag.
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription
+ Information about a tagset (controlled vocabulary).
+ uima.tcas.Annotation
+
+
+ layer
+ The layer to which the tagset applies. This is
+ typically the name of an UIMA type such as
+ "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS".
+
+ uima.cas.String
+
+
+ name
+ The name of the tagset.
+ uima.cas.String
+
+
+ tags
+ Descriptions of the tags belonging to this tagset.
+ uima.cas.FSArray
+ de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription
+
+
+ componentName
+
+ uima.cas.String
+
+
+ modelLocation
+
+ uima.cas.String
+
+
+ modelVariant
+
+ uima.cas.String
+
+
+ modelLanguage
+
+ uima.cas.String
+
+
+ modelVersion
+
+ uima.cas.String
+
+
+ input
+ True if the tagset is used as input by the component/model, otherwise false.
+
+ uima.cas.Boolean
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Animal
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Cardinal
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.ContactInfo
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Date
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Disease
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Event
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Fac
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.FacDesc
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Game
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Gpe
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.GpeDesc
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Language
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Law
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Money
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+ Named entities refer e.g. to persons, locations, organizations and so on. They often consist of
+ multiple tokens.
+
+ uima.tcas.Annotation
+
+
+ value
+ The class/category of the named entity, e.g. person, location, etc.
+ uima.cas.String
+
+
+ identifier
+ Identifier of the named entity, e.g. a reference into a person database.
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Nationality
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Norp
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Ordinal
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.OrgDesc
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.PerDesc
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Percent
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Plant
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Product
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.ProductDesc
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Quantity
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Substance
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.Time
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.WorkOfArt
+
+ de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription
+ <p>Represents the phonetic transcription of some textual element (usually a Token).
+ Phonetic transcriptions are e.g. generated by transcription processes like Soundex or Metaphone.</p>
+
+ uima.tcas.Annotation
+
+
+ transcription
+ The actual transcription
+ uima.cas.String
+
+
+ name
+ The name of the transcription process that was used
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound
+ This type represents a decompounding word, e.g. flowerpot. Each Compound has at least two
+ Splits.
+
+ uima.tcas.Annotation
+
+
+ splits
+ A word that can be decomposed into different parts.
+ uima.cas.FSArray
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart
+ <p>A CompoundPart represents one fragment from the compounding word. Besides that, it can
+ store other CompoundParts if it can be split again. The way it stores a decompounding word represents a
+ decompounding tree.</p>
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div
+ Document structure element.
+ uima.tcas.Annotation
+
+
+ divType
+
+ uima.cas.String
+
+
+ id
+ If this unit had an ID in the source format from which it was imported, it may be
+ stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it
+ should be respected by writers.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Document
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading
+ Document title, section heading, etc.
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma
+
+ uima.tcas.Annotation
+
+
+ value
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase
+
+ uima.tcas.Annotation
+
+
+ text
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme
+ This type represents a linking morpheme between two CompoundParts.
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram
+
+ uima.tcas.Annotation
+
+
+ text
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence
+
+ uima.tcas.Annotation
+
+
+ id
+ If this unit had an ID in the source format from which it was imported, it may be
+ stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it
+ should be respected by writers.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split
+ This type represents a part of a decompounding word. A Split can be either a CompoundPart or a
+ LinkingMorpheme.
+
+ uima.tcas.Annotation
+
+
+ splits
+ Sub-splits of the current split.
+ uima.cas.FSArray
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem
+
+ uima.tcas.Annotation
+
+
+ value
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord
+
+ uima.tcas.Annotation
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm
+ This annotation can be used to indicate an alternate surface form. E.g. some corpora consider a
+ normalized form of the text with resolved contractions as the canonical form and only maintain the
+ original surface form as a secondary information. One example is the Conll-U format.
+
+ uima.tcas.Annotation
+
+
+ value
+ Alternate surface form.
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token
+ <p>Token is one of the two types commonly produced by a segmenter (the other being
+ Sentence). A Token usually represents a word, although it may be used to represent multiple tightly
+ connected words (e.g. "New York") or parts of a word (e.g. the possessive "'s"). One may choose to split
+ compound words into multiple tokens, e.g. ("CamelCase" -> "Camel", "Case"; "Zauberstab" ->
+ "Zauber", "stab"). Most processing components operate on Tokens, usually within the limits of the
+ surrounding Sentence. E.g. a part-of-speech tagger analyses each Token in a Sentence and assigns a
+ part-of-speech to each Token.</p>
+
+ uima.tcas.Annotation
+
+
+ parent
+ the parent of this token. This feature is meant to be used when the token
+ participates in a constituency parse and then refers to a constituent containing this token. The
+ type of this feature is {@link Annotation} to avoid adding a dependency on the syntax API
+ module.
+
+ uima.tcas.Annotation
+
+
+ lemma
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma
+
+
+ stem
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem
+
+
+ pos
+
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS
+
+
+ morph
+ The morphological feature associated with this token.
+ de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures
+
+
+
+ id
+ If this unit had an ID in the source format from which it was imported, it may be
+ stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it
+ should be respected by writers.
+
+ uima.cas.String
+
+
+ form
+ Potentially normalized form of the token text that should be used instead of the
+ covered text if set.
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm
+
+
+ syntacticFunction
+
+ uima.cas.String
+
+
+ order
+ Disambiguates the token order for tokens which have the same offsets, e.g. when the
+ contraction "à" is analyzed as two tokens "a" and "a".
+
+ uima.cas.Integer
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm
+ An alternative token text which should be used instead of the covered text if set on a token.
+
+ uima.tcas.Annotation
+
+
+ value
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg
+ The SemArg annotation is attached to semantic arguments of semantic
+ predicates. Semantic arguments are characterized by their semantic role, e.g. Agent,
+ Experiencer, Topic. The semantic role of an argument is related to its semantic type
+ (for communication verbs, the Agent can be a person or an organization, but
+ typically not food).
+
+ uima.tcas.Annotation
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink
+ The SemArgLink type is used to attach SemPred annotations to their respective SemArg
+ annotations while giving each link a role.
+
+ uima.cas.TOP
+
+
+ role
+ The role which the argument takes. The value depends on the theory being used, e.g.
+ Arg0, Arg1, etc. or Buyer, Seller, etc.
+
+ uima.cas.String
+
+
+ target
+ The target argument.
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred
+ One of the predicates of a sentence (often a main verb, but nouns and adjectives can also be
+ predicates).
+ The SemPred annotation can be attached to predicates in a sentence.
+ Semantic predicates express events or situations and take semantic arguments
+ expressing the participants in these events or situations. All forms of main verbs
+ can be annotated with a SemPred. However, there are also many nouns and
+ adjectives that take arguments and can thus be annotated with a SemanticPredicate,
+ e.g. event nouns, such as "suggestion" (with arguments what and by whom), or
+ relational adjectives, such as "proud" (with arguments who and of what).
+
+ uima.tcas.Annotation
+
+
+ arguments
+ The predicate's arguments.
+ uima.cas.FSArray
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink
+
+
+ category
+ A more detailed specification of the predicate type depending on the theory being used,
+ e.g. a frame name.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument
+ The SemanticArgument annotation is attached to semantic arguments of semantic
+ predicates. Semantic arguments are characterized by their semantic role, e.g. Agent,
+ Experiencer, Topic. The semantic role of an argument is related to its semantic type
+ (for communication verbs, the Agent can be a person or an organization, but
+ typically not food). The semantic type of arguments is not yet covered by the
+ SemanticType.
+
+ @deprecated Use SemArg instead.
+
+ uima.tcas.Annotation
+
+
+ role
+ The role which the argument takes. The value depends on the theory being used, e.g.
+ Arg0, Arg1, etc. or Buyer, Seller, etc.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField
+ The SemanticField is a coarse-grained semantic category that can be attached to
+ nouns, verbs or adjectives. Semantic field information is present e.g. in WordNet as
+ lexicographer file names. Previously, this kind of semantic information has also
+ been called supersenses or semantic types.
+
+ uima.tcas.Annotation
+
+
+ value
+ The value or name of the semantic field. Examples of semantic field values are:
+ location, artifact, event, communication, attribute
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate
+ One of the predicates of a sentence (often a main verb, but nouns and adjectives can also be
+ predicates). The SemanticPredicate annotation can be attached to predicates in a sentence.
+ Semantic predicates express events or situations and take semantic arguments
+ expressing the participants in these events or situations. All forms of main verbs
+ can be annotated with a SemanticPredicate. However, there are also many nouns and
+ adjectives that take arguments and can thus be annotated with a SemanticPredicate,
+ e.g. event nouns, such as "suggestion" (with arguments what and by whom), or
+ relational adjectives, such as "proud" (with arguments who and of what).
+
+ @deprecated use SemPred instead
+
+ uima.tcas.Annotation
+
+
+ category
+ A more detailed specification of the predicate type depending on the theory being used,
+ e.g. a frame name.
+
+ uima.cas.String
+
+
+ arguments
+ The predicate's arguments.
+ uima.cas.FSArray
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense
+
+ uima.tcas.Annotation
+
+
+ value
+ The sense identifier.
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field
+
+ uima.tcas.Annotation
+
+
+ name
+ the name of the tag
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree
+ <p>The Penn Treebank-style phrase structure string.</p>
+ uima.tcas.Annotation
+
+
+ PennTree
+ Contains a Penn Treebank-style representation of a tree.
+ uima.cas.String
+
+
+ TransformationNames
+ The name(s) of the transformation(s) that have been performed on the PennTree
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADJC
+ adjective chunks
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADVC
+ adverb chunks
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.CONCJ
+ complex coordinating conjunctions such as "as well (as)" or "rather (than)"
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+ uima.tcas.Annotation
+
+
+ chunkValue
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.INTJ
+ interjection
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.LST
+ enumeration symbol
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.NC
+ noun chunk (non-recursive noun phrase)
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.O
+ other or outside a chunk
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PC
+ prepositional chunk
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PRT
+ verb particle
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.VC
+ verb complex
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADJP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADVP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.CONJP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+ uima.tcas.Annotation
+
+
+ constituentType
+
+ uima.cas.String
+
+
+ parent
+ The parent constituent
+ uima.tcas.Annotation
+
+
+ children
+
+ uima.cas.FSArray
+ uima.tcas.Annotation
+
+
+ syntacticFunction
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.FRAG
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.INTJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.LST
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NAC
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NX
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PARN
+ This category is called PRN in the Penn Treebank tagset. However, PRN is a reserved device name
+ on Windows. Thus we had to rename this category. The old PRN type is still present in the DKPro Core type
+ system, but it is deprecated, no longer used, and no JCas classes are generated for it.
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRT
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.QP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.RRC
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBAR
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBARQ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SINV
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SQ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.UCP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.VP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADJP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADVP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHNP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHPP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.X
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ABBREV
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ACOMP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVCL
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AGENT
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.APPOS
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ATTR
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUX0
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUXPASS
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CC
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CCOMP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COMPLM
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ_YET
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJPASS
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DEP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DET
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DOBJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+ A dependency relation between two tokens. The dependency annotation begin and end offsets
+ correspond to those of the dependent.
+
+ uima.tcas.Annotation
+
+
+ Governor
+ The governor word
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token
+
+
+ Dependent
+ The dependent word
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token
+
+
+ DependencyType
+ The dependency type
+ uima.cas.String
+
+
+ flavor
+ Flavor of the dependency relation (basic, collapsed, enhanced, etc...)
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.EXPL
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.INFMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.IOBJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MARK
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MEASURE
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MWE
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NEG
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NN
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NPADVMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJPASS
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUM
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUMBER
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARATAXIS
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARTMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PCOMP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POBJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSS
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSSESSIVE
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRECONJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRED
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREDET
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREPC
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRT
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PUNCT
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PURPCL
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.QUANTMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.RCMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REF
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REL
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT
+ Dependency tree root.
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.TMOD
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XCOMP
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XSUBJ
+
+ de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation
+ Encodes an edit operation that can be interpreted by the ApplyChangesAnnotator.
+ uima.tcas.Annotation
+
+
+ value
+ In case of an "insert" or "replace" operation, this feature indicates the value to be
+ inserted or replaced.
+
+ uima.cas.String
+
+
+ operation
+ Operation to perform: "insert", "replace", "delete"
+ uima.cas.String
+
+
+ reason
+ The reason for the change.
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.ArticleInfo
+ Contains basic information about the article.
+ uima.tcas.Annotation
+
+
+ Authors
+ Number of unique authors of this article
+ uima.cas.Integer
+
+
+ Revisions
+ Number of revisions of this article.
+ uima.cas.Integer
+
+
+ FirstAppearance
+ The Timestamp of the first appearance of this article.
+ uima.cas.Long
+
+
+ LastAppearance
+ The Timestamp of the last appearance of this article.
+ uima.cas.Long
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig
+ Database configuration for the connection to the database where the CAS data was retrieved.
+
+ uima.tcas.Annotation
+
+
+ Host
+ DB Host
+ uima.cas.String
+
+
+ DB
+ Database
+ uima.cas.String
+
+
+ User
+ Username
+ uima.cas.String
+
+
+ Password
+ User password
+ uima.cas.String
+
+
+ Language
+ Wikipedia Language Versions
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink
+ Wikipedia link
+ uima.tcas.Annotation
+
+
+ LinkType
+ The type of the link, e.g. internal, external, image, ...
+ uima.cas.String
+
+
+ Target
+ The link target url
+ uima.cas.String
+
+
+ Anchor
+ The anchor of the link
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaRevision
+ Represents a revision in Wikipedia.
+ uima.tcas.Annotation
+
+
+ revisionId
+ The ID of the revision.
+ uima.cas.Integer
+
+
+ pageId
+ The pageId of the Wikipedia page of this revision.
+ uima.cas.Integer
+
+
+ contributorName
+ The username of the user/contributor who edited this revision.
+ uima.cas.String
+
+
+ comment
+ The comment that the editor entered for this revision.
+ uima.cas.String
+
+
+ contributorId
+ The userId of the user/contributor who created this revision
+ uima.cas.Integer
+
+
+ timestamp
+ The timestamp of the revision, given in milliseconds since the standard base time
+ (January 1, 1970, 00:00:00 GMT)
+
+ uima.cas.Long
+
+
+ minor
+ Whether this revision has been marked as minor edit by its contributor.
+ uima.cas.Boolean
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution
+ An array representing the topic proportions in a document.
+ uima.tcas.Annotation
+
+
+ TopicProportions
+ Each topic's proportion in the document.
+ uima.cas.DoubleArray
+
+
+ TopicAssignment
+ Pointers to topics the document has been assigned to.
+ uima.cas.IntegerArray
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding
+ An array representing the word embedding vector.
+ uima.tcas.Annotation
+
+
+ WordEmbedding
+ A word embedding vector.
+ uima.cas.FloatArray
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken
+
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token
+
+
+ kana
+
+ uima.cas.String
+
+
+ ibo
+
+ uima.cas.String
+
+
+ kei
+
+ uima.cas.String
+
+
+ dan
+ Specifies the kind of the verb if the current token is a verb. Either it is a vowel
+ stem verb (ichi-dan) or a consonant stem verb (go-dan). Blank if not a verb.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.performance.type.TimerAnnotation
+ Used for storing timing information (e.g. for performance testing).
+ uima.tcas.Annotation
+
+
+ startTime
+
+ uima.cas.Long
+
+
+ endTime
+
+ uima.cas.Long
+
+
+ name
+ The name of the timer.
+ Used to automatically determine whether this is an upstream or downstream timer.
+
+ uima.cas.String
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore
+
+ uima.tcas.Annotation
+
+
+ measureName
+
+ uima.cas.String
+
+
+ score
+
+ uima.cas.Double
+
+
+
+
+ org.dkpro.core.api.xml.type.XmlAttribute
+
+ uima.cas.TOP
+
+
+ uri
+ Namespace URI of the attribute.
+ uima.cas.String
+
+
+ localName
+ Local name of the attribute.
+ uima.cas.String
+
+
+ value
+ Value of the XML attribute.
+ uima.cas.String
+
+
+ qName
+
+ uima.cas.String
+
+
+ valueType
+
+ uima.cas.String
+
+
+
+
+ org.dkpro.core.api.xml.type.XmlDocument
+ XML document
+ uima.tcas.Annotation
+
+
+ root
+ Root element of the XML document.
+ org.dkpro.core.api.xml.type.XmlElement
+
+
+
+
+ org.dkpro.core.api.xml.type.XmlElement
+ XML element
+ org.dkpro.core.api.xml.type.XmlNode
+
+
+ uri
+ Namespace URI of the element.
+ uima.cas.String
+
+
+ localName
+ Local name of the XML element.
+ uima.cas.String
+
+
+ attributes
+ Array of attributes of the XML element.
+ uima.cas.FSArray
+ org.dkpro.core.api.xml.type.XmlAttribute
+
+
+ children
+ Children of this XML element.
+ uima.cas.FSArray
+ org.dkpro.core.api.xml.type.XmlNode
+
+
+ qName
+
+ uima.cas.String
+
+
+
+
+ org.dkpro.core.api.xml.type.XmlNode
+ Supertype for XmlElements and XmlTextNodes.
+ uima.tcas.Annotation
+
+
+ parent
+
+ org.dkpro.core.api.xml.type.XmlElement
+
+
+
+
+ org.dkpro.core.api.xml.type.XmlTextNode
+ XML text node.
+ org.dkpro.core.api.xml.type.XmlNode
+
+
+ text
+
+ uima.cas.String
+
+
+ captured
+ Whether the text node has been added to the document text.
+ uima.cas.Boolean
+
+
+
+
+ org.dkpro.core.io.nift.metadata.ArticleMetaData
+ A document annotation that describes the metadata of a
+ newspaper article.
+
+ uima.cas.AnnotationBase
+
+
+ guid
+ The GUID field specifies a (4-byte) integer that is
+ guaranteed
+ to be unique for every document
+ in the corpus.
+
+ uima.cas.Integer
+
+
+ alternateUrl
+ This field specifies the location on nytimes.com of
+ the article. When present, this URL is preferred to the URL field
+ on articles published on or after April 02,
+ 2006, as the linked
+ page will have richer content.
+
+ uima.cas.String
+
+
+ url
+ This field specifies the location on nytimes.com of
+ the article. The 'Alternative Url'
+ field is preferred to this field
+ on articles published on or after
+ April 02, 2006, as the
+ linked page
+ will have richer content.
+
+ uima.cas.String
+
+
+ publicationDate
+ This field specifies the date of the article's
+ publication. This field is specified in the
+ format
+ YYYYMMDD'T'HHMMSS where:
+ 1. YYYY is the four-digit year.
+ 2. MM is
+ the two-digit month [01-12].
+ 3. DD is the two-digit day [01-31].
+ 4.
+ T is a constant value.
+ 5. HH is the two-digit hour [00-23].
+ 6. MM is
+ the two-digit minute-past-the-hour [00-59]
+ 7. SS is the two-digit
+ seconds-past-the-minute [00-59].
+ Please note that values for HH,MM,
+ and SS are not defined for this
+ corpus, that is to say
+ HH,MM, and SS
+ are always defined to be '00'.
+
+ uima.cas.String
+
+
+ typesOfMaterial
+ This field specifies a normalized list of terms
+ describing the general editorial category of the article.
+ These
+ tags are algorithmically assigned and
+ manually verified by
+ nytimes.com production staff.
+ Examples Include:
+ * REVIEW
+ * OBITUARY
+ * ANALYSIS
+
+ uima.cas.StringArray
+
+
+ headline
+ This field specifies the headline of the article as it
+ appeared in the
+ print edition of the New York
+ Times.
+
+ uima.cas.String
+
+
+ onlineHeadline
+ This field specifies the headline displayed with the
+ article on
+ nytimes.com. Often
+ this differs from the headline used in
+ print.
+
+ uima.cas.String
+
+
+ columnName
+ If the article is part of a regular column, this field
+ specifies the
+ name of that column.
+ Sample Column Names:
+ 1. World News
+ Briefs
+ 2. WEDDINGS
+ 3. The Accessories Channel
+
+ uima.cas.String
+
+
+ author
+ This field is based on the normalized byline in the
+ original corpus data: "The Normalized Byline field is the byline
+ normalized to the form (last name, first
+ name)".
+
+ uima.cas.String
+
+
+ descriptors
+ The 'descriptors' field specifies a list of
+ descriptive terms drawn
+ from a normalized controlled
+ vocabulary
+ corresponding to subjects mentioned in the article. These tags
+ are
+ hand-assigned by
+ a team of library scientists working in the New
+ York Times Indexing
+ service.
+ Examples Include:
+ * ECONOMIC CONDITIONS
+ AND TRENDS
+ * AIRPLANES
+ * VIOLINS
+
+ uima.cas.StringArray
+
+
+ onlineDescriptors
+ This field specifies a list of descriptors from a
+ normalized
+ controlled
+ vocabulary that
+ correspond to topics mentioned
+ in the article. These
+ tags are
+ algorithmically
+ assigned and manually
+ verified by
+ nytimes.com production staff.
+ Examples Include:
+ * Marriages
+ * Parks and Other Recreation Areas
+ * Cooking and Cookbooks
+
+ uima.cas.StringArray
+
+
+ generalOnlineDescriptors
+ The 'general online descriptors' field specifies a
+ list of descriptors that are at a higher level of
+ generality than
+ the other tags associated with the article. These tags are
+ algorithmically
+ assigned and manually verified by nytimes.com
+ production staff.
+ Examples Include:
+ * Surfing
+ * Venice Biennale
+ * Ranches
+
+ uima.cas.String
+
+
+ onlineSection
+ This field specifies the section(s) on nytimes.com in
+ which the
+ article is placed. If
+ the article is placed in multiple
+ sections, this field will be
+ specified as a ';' delineated
+ list.
+
+ uima.cas.String
+
+
+ section
+ This field specifies the section of the paper in which
+ the article
+ appears. This is not
+ the name of the section, but rather
+ a letter or number that indicates
+ the section.
+
+ uima.cas.String
+
+
+ taxonomicClassifiers
+ This field specifies a list of taxonomic classifiers
+ that place this
+ article into a
+ hierarchy of articles. The individual
+ terms of each taxonomic classifier
+ are separated with the '/' character.
+ These tags are algorithmically assigned and manually
+ verified
+ by nytimes.com production staff.
+ Examples Include:
+ * Top/Features/Travel/Guides/Destinations/North America/United States/Arizona
+ * Top/News/U.S./Rockies
+ * Top/Opinion
+
+ uima.cas.StringArray
+
+
+
+
+
diff --git a/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml b/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml
new file mode 100644
index 00000000..bdf518f0
--- /dev/null
+++ b/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml
@@ -0,0 +1,46 @@
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly
+ Marks a span of text as a privacy-sensitive entity detected by the anonymizer.
+ uima.tcas.Annotation
+
+
+ description
+ Human-readable description or replacement placeholder for the detected span.
+ uima.cas.String
+
+
+ suggestions
+ Suggested replacement actions for this anomaly.
+ uima.cas.FSArray
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction
+
+
+ category
+ Privacy category of the detected span, e.g. private_person, private_email, private_phone.
+ uima.cas.String
+
+
+
+
+
+ de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction
+ A suggested replacement for an anomaly span.
+ uima.tcas.Annotation
+
+
+ replacement
+ The text to substitute for the anomaly-covered span.
+ uima.cas.String
+
+
+ certainty
+ Confidence score in [0,1] for this replacement suggestion.
+ uima.cas.Float
+
+
+
+
+
diff --git a/duui-anonymize/src/test/java/AnonymizeTests.java b/duui-anonymize/src/test/java/AnonymizeTests.java
new file mode 100644
index 00000000..c292b5bc
--- /dev/null
+++ b/duui-anonymize/src/test/java/AnonymizeTests.java
@@ -0,0 +1,577 @@
+import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly;
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.uima.UIMAException;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.util.XmlCasSerializer;
+import org.junit.jupiter.api.*;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext;
+import org.xml.sax.SAXException;
+import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver;
+import org.dkpro.core.io.xmi.XmiWriter;
+
+import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.net.UnknownHostException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Collection;
+
+/**
+ * Integration tests for the DUUI anonymization annotator.
+ *
+ * Prerequisites: the Python service must be running on {@value #SERVICE_URL}.
+ * Start it with:
+ * uvicorn duui:app --host 0.0.0.0 --port 9714 --workers 1
+ *
+ * Each test loads English text containing personally identifiable information (PII),
+ * sends it through the OPF anonymizer via DUUI, and asserts that
+ * {@link Anomaly} annotations are written to the CAS output view.
+ */
+public class AnonymizeTests {
+
+ static final String SERVICE_URL = "http://127.0.0.1:9714"; // base URL of the already-running anonymizer service
+
+ static final String RESULTS_DIR = "src/test/results"; // afterEach() writes <method>.xmi and <method>.json here
+
+ static DUUIComposer composer; // shared by all tests; created in beforeAll(), shut down in afterAll()
+ static JCas cas; // shared by all tests; created in beforeAll(), reset in afterEach()
+
+ // -------------------------------------------------------------------
+ // JUnit lifecycle
+ // -------------------------------------------------------------------
+
+    @BeforeAll
+    static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException {
+        // afterEach() writes its XMI/JSON artefacts here, so the directory must exist up front.
+        Files.createDirectories(Paths.get(RESULTS_DIR));
+
+        // A single composer and a single CAS are shared by all tests of this class.
+        DUUIComposer pipeline = new DUUIComposer().withSkipVerification(true);
+        composer = pipeline.withLuaContext(new DUUILuaContext().withJsonLibrary());
+
+        DUUIUIMADriver localDriver = new DUUIUIMADriver().withDebug(false);
+        composer.addDriver(new DUUIRemoteDriver(), localDriver);
+
+        cas = JCasFactory.createJCas();
+    }
+
+    @AfterAll
+    static void afterAll() throws UnknownHostException {
+        if (composer != null) composer.shutdown(); // composer stays null when beforeAll() failed before creating it
+    }
+
+    @AfterEach
+    void afterEach(TestInfo testInfo) throws IOException, SAXException {
+        composer.resetPipeline();
+        // Dump the finished test's CAS as <method>.xmi plus a <method>.json summary into RESULTS_DIR.
+        try {
+            String methodName = testInfo.getTestMethod()
+                    .map(m -> m.getName())
+                    .orElseGet(() -> testInfo.getDisplayName().replaceAll("[^a-zA-Z0-9_]", "_"));
+            String inputText = cas.getDocumentText() != null ? cas.getDocumentText() : "";
+            String redactedText = extractRedactedText();
+            Collection<Anomaly> anomalies = collectAnomalies();
+
+            // Write XMI
+            ByteArrayOutputStream xmiBytes = new ByteArrayOutputStream();
+            XmlCasSerializer.serialize(cas.getCas(), null, xmiBytes);
+            Files.writeString(Paths.get(RESULTS_DIR, methodName + ".xmi"), xmiBytes.toString(StandardCharsets.UTF_8));
+
+            // Write JSON summary for later comparison
+            String json = buildResultJson(methodName, inputText, redactedText, anomalies);
+            Files.writeString(Paths.get(RESULTS_DIR, methodName + ".json"), json);
+            System.out.println("=== " + methodName + " ===");
+            System.out.println(json);
+        } finally {
+            // Reset even when dumping fails, so stale CAS content cannot leak into the next test.
+            cas.reset();
+        }
+    }
+
+ // -------------------------------------------------------------------
+ // Helpers
+ // -------------------------------------------------------------------
+
+ /** Sets {@code language} as document language and {@code text} as document text on the shared {@code cas}. */
+ private static void createCas(String language, String text) throws UIMAException {
+ cas.setDocumentLanguage(language);
+ cas.setDocumentText(text);
+ }
+
+    /**
+     * Collect all {@link Anomaly} annotations from the default CAS view.
+     * Anomalies are always indexed against the original document text so that
+     * their character offsets are valid. The "opf_redacted" view only carries
+     * the redacted sofa string and no annotations.
+     */
+    private static Collection<Anomaly> collectAnomalies() {
+        return JCasUtil.select(cas, Anomaly.class);
+    }
+
+    /** Sofa text of the "opf_redacted" view; falls back to the default view when that lookup fails. Never null. */
+    private static String extractRedactedText() {
+        String sofaText;
+        try {
+            sofaText = cas.getView("opf_redacted").getSofaDataString();
+        } catch (Exception ignored) {
+            // Any failure to reach the redacted view means: use the default view's text instead.
+            sofaText = cas.getSofaDataString();
+        }
+        return sofaText == null ? "" : sofaText;
+    }
+
+    /** Serialises one test outcome (input, redacted output, every anomaly with its covered text) as a JSON object. */
+    private static String buildResultJson(
+            String testName,
+            String inputText,
+            String redactedText,
+            Collection<Anomaly> anomalies) {
+
+        StringBuilder sb = new StringBuilder();
+        sb.append("{\n");
+        sb.append(" \"test\": ").append(jsonStr(testName)).append(",\n");
+        sb.append(" \"input\": ").append(jsonStr(inputText)).append(",\n");
+        sb.append(" \"output\": ").append(jsonStr(redactedText)).append(",\n");
+        sb.append(" \"anomaly_count\": ").append(anomalies.size()).append(",\n");
+        sb.append(" \"anomalies\": [\n");
+        int idx = 0;
+        for (Anomaly a : anomalies) {
+            // Offsets refer to inputText; missing text, out-of-range or inverted spans yield "" instead of throwing.
+            int begin = a.getBegin();
+            int end = a.getEnd();
+            boolean inRange = inputText != null && 0 <= begin && begin <= end && end <= inputText.length();
+            String spanText = inRange ? inputText.substring(begin, end) : "";
+            sb.append(" {\n");
+            sb.append(" \"begin\": ").append(a.getBegin()).append(",\n");
+            sb.append(" \"end\": ").append(a.getEnd()).append(",\n");
+            sb.append(" \"category\": ").append(jsonStr(a.getCategory())).append(",\n");
+            sb.append(" \"description\": ").append(jsonStr(a.getDescription())).append(",\n");
+            sb.append(" \"text\": ").append(jsonStr(spanText)).append("\n");
+            sb.append(" }");
+            if (++idx < anomalies.size()) sb.append(",");
+            sb.append("\n");
+        }
+        sb.append(" ]\n");
+        sb.append("}\n");
+        return sb.toString();
+    }
+
+    /** Quotes {@code s} as a JSON string ({@code null} becomes the literal null); escapes quote, backslash and all chars below U+0020. */
+    private static String jsonStr(String s) {
+        if (s == null) return "null";
+        StringBuilder out = new StringBuilder("\"");
+        for (char c : s.toCharArray()) {
+            int named = "\"\\\n\r\t".indexOf(c); // characters that have a two-character escape: quote, backslash, LF, CR, TAB
+            out.append(named >= 0 ? "\\" + "\"\\nrt".charAt(named) : c < 0x20 ? String.format("\\u%04x", (int) c) : String.valueOf(c));
+        }
+        return out.append('"').toString();
+    }
+
+ // -------------------------------------------------------------------
+ // Mode tests
+ // -------------------------------------------------------------------
+
+    /** Placeholder mode (default): PII replaced with [category] tag in redacted_text and Anomaly description. */
+    @Test
+    @DisplayName("Placeholder mode: PII replaced with [category] tag")
+    void testPlaceholderMode() throws Exception {
+        String text = "Send the report to max.mustermann@uni-frankfurt.de by Friday.";
+        createCas("en", text);
+
+        // Must request "placeholder" (not "remove"): the assertions below expect bracketed [category] tags.
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL).withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies();
+        System.out.println("Anomaly count: " + anomalies.size());
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] category=%s description=%s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        assertFalse(anomalies.isEmpty(), "Expected at least one Anomaly for the email address");
+        assertTrue(anomalies.stream().anyMatch(a -> {
+            String d = a.getDescription();
+            return d != null && d.startsWith("[") && d.endsWith("]");
+        }), "Anomaly description should be a bracketed [category] tag in placeholder mode");
+
+        String redacted = extractRedactedText();
+        assertFalse(redacted.contains("max.mustermann@uni-frankfurt.de"),
+                "Redacted text should not contain the original email");
+        assertTrue(redacted.contains("[private_email]") || redacted.contains("[private_person]"),
+                "Redacted text should contain a [category] replacement tag");
+    }
+
+    /** Remove mode: PII spans are deleted from redacted_text; Anomaly description is the original word. */
+    @Test
+    @DisplayName("Remove mode: PII deleted from redacted text")
+    void testRemoveMode() throws Exception {
+        String text = "Call John Smith at john.smith@company.com or +1-800-555-0199 for help.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "remove"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies();
+        System.out.println("Anomaly count: " + anomalies.size());
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] category=%s text=%s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), text.substring(a.getBegin(), a.getEnd())));
+
+        assertFalse(anomalies.isEmpty(), "Expected anomalies in remove mode");
+
+        String redacted = extractRedactedText();
+        System.out.printf(" original (%d): %s%n", text.length(), text);
+        System.out.printf(" redacted (%d): %s%n", redacted.length(), redacted);
+        assertTrue(redacted.length() < text.length(),
+                "Redacted text should be shorter after PII removal");
+        // original PII tokens must be absent from the redacted string
+        for (Anomaly a : anomalies) {
+            String pii = text.substring(a.getBegin(), a.getEnd());
+            assertFalse(redacted.contains(pii),
+                    "Removed PII token '" + pii + "' should not appear in redacted text");
+        }
+    }
+
+    /** Pseudo mode: not yet supported - service returns input unchanged with no annotations. */
+    @Test
+    @DisplayName("Pseudo mode: not yet supported, returns input unchanged")
+    void testPseudoMode() throws Exception {
+        String text = "Alice and Bob met at the Frankfurt main station.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "pseudo"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies();
+        System.out.println("Anomaly count (pseudo mode): " + anomalies.size());
+        assertTrue(anomalies.isEmpty(),
+                "Pseudo mode (unsupported stub) should produce no Anomaly annotations");
+    }
+
+ // -------------------------------------------------------------------
+ // PII type tests (mode=placeholder so description = [category])
+ // -------------------------------------------------------------------
+
+    /** private_person: full name in a simple sentence. */
+    @Test
+    @DisplayName("Type: private_person")
+    void testTypePerson() throws Exception {
+        String text = "John Smith called the bank to report a fraud.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies();
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        assertFalse(anomalies.isEmpty(), "Expected at least one annotation");
+        assertTrue(anomalies.stream().anyMatch(a -> "private_person".equals(a.getCategory())),
+                "Expected category 'private_person' for 'John Smith'");
+    }
+
+    /** private_email: plain email address. */
+    @Test
+    @DisplayName("Type: private_email")
+    void testTypeEmail() throws Exception {
+        String text = "Please contact alice@example.com for further assistance.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies();
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        assertTrue(anomalies.stream().anyMatch(a -> "private_email".equals(a.getCategory())),
+                "Expected category 'private_email' for 'alice@example.com'");
+    }
+
+    /** private_phone: international phone number. */
+    @Test
+    @DisplayName("Type: private_phone")
+    void testTypePhone() throws Exception {
+        String text = "You can reach Dr. Miller at +49 69 1234 5678 during office hours.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies();
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        assertFalse(anomalies.isEmpty(), "Expected phone or person annotation");
+        long phoneCount = anomalies.stream().filter(a -> "private_phone".equals(a.getCategory())).count();
+        System.out.println("private_phone spans: " + phoneCount);
+        assertTrue(phoneCount > 0, "Expected category 'private_phone' for '+49 69 1234 5678'");
+    }
+
+    /** private_address: street address with postcode. */
+    @Test
+    @DisplayName("Type: private_address")
+    void testTypeAddress() throws Exception {
+        String text = "She lives at 742 Evergreen Terrace, Springfield, IL 62704.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies();
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        long addrCount = anomalies.stream().filter(a -> "private_address".equals(a.getCategory())).count();
+        System.out.println("private_address spans: " + addrCount);
+        assertTrue(addrCount > 0, "Expected category 'private_address' for the street address");
+    }
+
+    /** private_url: a personal homepage URL must yield at least one {@code private_url} span. */
+    @Test
+    @DisplayName("Type: private_url")
+    void testTypeUrl() throws Exception {
+        String text = "My personal page is at https://janedoe.personal-site.com/about and I post there.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so the lambdas see Anomaly rather than Object
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        long urlCount = anomalies.stream().filter(a -> "private_url".equals(a.getCategory())).count();
+        System.out.println("private_url spans: " + urlCount);
+        assertTrue(urlCount > 0, "Expected category 'private_url' for the personal URL");
+    }
+
+    /** private_date: a personally identifying date (here a birth date) must yield a {@code private_date} span. */
+    @Test
+    @DisplayName("Type: private_date")
+    void testTypeDate() throws Exception {
+        String text = "Jane Doe was born on March 15, 1990 in Chicago.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so the lambdas see Anomaly rather than Object
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        assertFalse(anomalies.isEmpty(), "Expected at least one annotation (person or date)");
+        long dateCount = anomalies.stream().filter(a -> "private_date".equals(a.getCategory())).count();
+        System.out.println("private_date spans: " + dateCount);
+        assertTrue(dateCount > 0, "Expected category 'private_date' for 'March 15, 1990'");
+    }
+
+    /** account_number: a credit-card style digit string must yield at least one {@code account_number} span. */
+    @Test
+    @DisplayName("Type: account_number")
+    void testTypeAccountNumber() throws Exception {
+        String text = "Please transfer funds to account number 4532-0151-1283-0366 at Deutsche Bank.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so the lambdas see Anomaly rather than Object
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        long acctCount = anomalies.stream().filter(a -> "account_number".equals(a.getCategory())).count();
+        System.out.println("account_number spans: " + acctCount);
+        assertTrue(acctCount > 0, "Expected category 'account_number' for the card number");
+    }
+
+    /** secret: an API key and a password embedded in prose must yield at least one {@code secret} span. */
+    @Test
+    @DisplayName("Type: secret")
+    void testTypeSecret() throws Exception {
+        String text = "The API key is sk-proj-abc123XYZ987 and the password is H@nt3r2secure!.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so the lambdas see Anomaly rather than Object
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        long secretCount = anomalies.stream().filter(a -> "secret".equals(a.getCategory())).count();
+        System.out.println("secret spans: " + secretCount);
+        assertTrue(secretCount > 0, "Expected category 'secret' for API key / password");
+    }
+
+ // -------------------------------------------------------------------
+ // Feature / combination tests
+ // -------------------------------------------------------------------
+
+    /** Multiple PII types in one document: at least two annotations from two distinct categories are required. */
+    @Test
+    @DisplayName("Multiple PII types in one document")
+    void testMultiplePiiEntities() throws Exception {
+        String text =
+                "Patient: Jane Doe, DOB: 1985-03-22. " +
+                "Contact: jane.doe@hospital.org, Tel: 069-9876-5432. " +
+                "Address: 60325 Frankfurt am Main, Goethe-Platz 1.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so Anomaly::getCategory resolves on the stream
+        System.out.println("Anomaly count: " + anomalies.size());
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = '%s'%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), text.substring(a.getBegin(), a.getEnd())));
+
+        assertTrue(anomalies.size() >= 2,
+                "Expected at least 2 distinct PII annotations");
+
+        long distinctCategories = anomalies.stream().map(Anomaly::getCategory).distinct().count();
+        System.out.println("Distinct categories: " + distinctCategories);
+        assertTrue(distinctCategories >= 2,
+                "Expected annotations from at least 2 different PII categories");
+    }
+
+    /** Smoke test: a sentence holding a person name and an email must produce at least one Anomaly. */
+    @Test
+    @DisplayName("Smoke test: person + email in one sentence")
+    void testSimplePerson() throws Exception {
+        String text = "My name is Harry Potter and my email is harry.potter@hogwarts.edu.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so the lambda sees Anomaly rather than Object
+        System.out.println("Anomaly count: " + anomalies.size());
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        assertFalse(anomalies.isEmpty(),
+                "Expected at least one Anomaly annotation");
+    }
+
+    /** Ambiguous context: identity is only implied by surrounding detail; at least one span must still be found. */
+    @Test
+    @DisplayName("Complex context: identity inferred from description")
+    void testComplexContext() throws Exception {
+        String text = "His name is Harry, he works at the TTLAB in Frankfurt, " +
+                "he's the only Chinese guy in the office.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "remove")); // NOTE(review): detection should not depend on mode; confirm "remove" is supported
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so the lambda sees Anomaly rather than Object
+        System.out.println("Anomaly count: " + anomalies.size());
+        anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n",
+                a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()));
+
+        assertFalse(anomalies.isEmpty(), "Expected at least one annotation in complex context");
+    }
+
+    /** Selection window: every reported span must lie inside the half-open offset range [selBegin, selEnd). */
+    @Test
+    @DisplayName("Selection window constrains annotation range")
+    void testSelectionWindow() throws Exception {
+        String text = "Call Dr. John Adams at 555-0100 today.";
+        String window = "John Adams at 555-0100"; // offsets are derived from the text instead of hard-coded
+        int selBegin = text.indexOf(window); // 9
+        int selEnd = selBegin + window.length(); // 31, exclusive; the former 30 cut the last digit off the number
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder")
+                .withParameter("selection_begin", String.valueOf(selBegin))
+                .withParameter("selection_end", String.valueOf(selEnd)));
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed, so the for-each can bind Anomaly elements
+        System.out.println("Anomaly count (selection window): " + anomalies.size());
+        for (Anomaly a : anomalies) {
+            assertTrue(a.getBegin() >= selBegin && a.getEnd() <= selEnd,
+                    String.format("Anomaly [%d-%d] outside window [%d-%d]",
+                            a.getBegin(), a.getEnd(), selBegin, selEnd));
+        }
+    }
+
+    /** Running the component on empty document text must succeed and leave no Anomaly annotations behind. */
+    @Test
+    @DisplayName("Empty document produces no anomalies")
+    void testEmptyDocument() throws Exception {
+        createCas("en", "");
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+        composer.run(cas);
+
+        // Nothing to redact in an empty text, so the collected annotations must be empty as well.
+        boolean nothingDetected = collectAnomalies().isEmpty();
+        assertTrue(nothingDetected, "An empty document should produce zero Anomaly annotations");
+    }
+
+    /** German input: the pipeline run must complete without throwing; detection quality is not asserted. */
+    @Test
+    @DisplayName("German text does not cause an exception")
+    void testGermanText() throws Exception {
+        createCas("de", "Herr Klaus Muller wohnt in der Goethestrasse 12, 60313 Frankfurt am Main.");
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+
+        // Only robustness is checked here; the span count is printed purely for information.
+        assertDoesNotThrow(() -> composer.run(cas));
+        int germanCount = collectAnomalies().size();
+        System.out.println("German Anomaly count: " + germanCount);
+    }
+
+    /** XMI output: annotate a PII-rich sentence, then write the CAS via XmiWriter for manual inspection. */
+    @Test
+    @DisplayName("XMI output is written to src/test/results/")
+    void testXmiOutput() throws Exception {
+        String text = "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 5, 10115 Berlin.";
+        createCas("en", text);
+
+        composer.add(new DUUIRemoteDriver.Component(SERVICE_URL)
+                .withParameter("mode", "placeholder"));
+
+        composer.add(new DUUIUIMADriver.Component(
+                createEngineDescription(XmiWriter.class,
+                        XmiWriter.PARAM_TARGET_LOCATION, RESULTS_DIR,
+                        XmiWriter.PARAM_PRETTY_PRINT, true,
+                        XmiWriter.PARAM_OVERWRITE, true,
+                        XmiWriter.PARAM_VERSION, "1.1"
+                )
+        ).build());
+
+        composer.run(cas);
+
+        Collection<Anomaly> anomalies = collectAnomalies(); // typed instead of a raw Collection
+        System.out.println("XMI test Anomaly count: " + anomalies.size());
+        assertFalse(anomalies.isEmpty(),
+                "Expected at least one Anomaly annotation for the PII-rich document");
+    }
+}
diff --git a/duui-anonymize/src/test/resources/sample_pii_de.txt b/duui-anonymize/src/test/resources/sample_pii_de.txt
new file mode 100644
index 00000000..391ff970
--- /dev/null
+++ b/duui-anonymize/src/test/resources/sample_pii_de.txt
@@ -0,0 +1,4 @@
+Frau Anna Müller (anna.mueller@beispiel.de) hat am 15. März 2024 angerufen.
+Ihre Telefonnummer lautet 069-8765-4321.
+Wohnadresse: Goethestraße 3, 60313 Frankfurt am Main.
+Geburtsdatum: 12.07.1985.
diff --git a/duui-anonymize/src/test/resources/sample_pii_en.txt b/duui-anonymize/src/test/resources/sample_pii_en.txt
new file mode 100644
index 00000000..1acc74b6
--- /dev/null
+++ b/duui-anonymize/src/test/resources/sample_pii_en.txt
@@ -0,0 +1,4 @@
+John Smith called the helpdesk on Monday.
+His email is john.smith@company.org and his phone number is +49 69 1234 5678.
+He lives at Mainzer Landstraße 50, 60329 Frankfurt am Main.
+Date of birth: 1978-11-04. Employee ID: EMP-00421.
diff --git a/duui-anonymize/tests/test_communication_contract.py b/duui-anonymize/tests/test_communication_contract.py
new file mode 100644
index 00000000..d2df6286
--- /dev/null
+++ b/duui-anonymize/tests/test_communication_contract.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from pathlib import Path
+import unittest
+
+
+class CommunicationContractTests(unittest.TestCase):
+ def test_lua_contract_mentions_text_options_selection_and_redacted_view(self) -> None:
+ lua_path = Path(__file__).resolve().parents[1] / "src/main/python/communication.lua"
+ contents = lua_path.read_text(encoding="utf-8")
+
+ self.assertIn('text = text', contents)
+ self.assertIn('options = copy_options(params)', contents)
+ self.assertIn('selection = resolve_selection(params)', contents)
+ self.assertIn('createView("opf_redacted")', contents)
+ self.assertIn('detected_spans', contents)
+
+
+if __name__ == "__main__":  # executed only when this file is run directly, not on import
+ unittest.main()
diff --git a/duui-anonymize/tests/test_duui_opf_core.py b/duui-anonymize/tests/test_duui_opf_core.py
new file mode 100644
index 00000000..ce71f6c5
--- /dev/null
+++ b/duui-anonymize/tests/test_duui_opf_core.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import unittest
+from pathlib import Path
+import sys
+
+
+ROOT = Path(__file__).resolve().parents[1]  # parent of the directory that contains this test file
+sys.path.insert(0, str(ROOT / "src/main/python"))  # presumably where duui_opf_core lives, so the import below resolves
+
+from duui_opf_core import (
+ DEFAULT_PLACEHOLDER,
+ DEFAULT_MODE,
+ PSEUDO_MODE,
+ RedactionSpan,
+ SelectionRange,
+ apply_replacement_text,
+ apply_selection,
+ compose_selection_output,
+ resolve_selection,
+ split_options,
+)
+
+
+class DuuiOpfCoreTests(unittest.TestCase):
+ def test_split_options_separates_service_and_decode_values(self) -> None:
+ service_options, decode_options, mode, placeholder = split_options(
+ {
+ "model": "local-checkpoint",
+ "context_window_length": 128,
+ "trim_whitespace": False,
+ "device": "cpu",
+ "output_mode": "typed",
+ "discard_overlapping_predicted_spans": True,
+ "mode": PSEUDO_MODE,
+ "placeholder": "",
+ "decode_mode": "argmax",
+ "calibration_path": "/tmp/calibration.json",
+ "selection_begin": 2,
+ "selection_end": 8,
+ }
+ )
+
+ self.assertEqual(service_options["model"], "local-checkpoint")
+ self.assertEqual(service_options["device"], "cpu")
+ self.assertEqual(decode_options["decode_mode"], "argmax")
+ self.assertEqual(decode_options["viterbi_calibration_path"], "/tmp/calibration.json")
+ self.assertEqual(mode, PSEUDO_MODE)
+ self.assertEqual(placeholder, "")
+
+ def test_resolve_selection_accepts_nested_or_flat_offsets(self) -> None:
+ nested = resolve_selection({"selection": {"begin": 4, "end": 9}}, text_length=20)
+ flat = resolve_selection({"selection_begin": 1, "selection_end": 3}, text_length=20)
+
+ self.assertEqual(nested, SelectionRange(begin=4, end=9))
+ self.assertEqual(flat, SelectionRange(begin=1, end=3))
+
+ def test_apply_replacement_text_uses_one_placeholder(self) -> None:
+ redacted = apply_replacement_text(
+ "Alice called Bob.",
+ [
+ RedactionSpan(label="private_person", start=0, end=5, text="Alice"),
+ RedactionSpan(label="private_person", start=13, end=16, text="Bob"),
+ ],
+ )
+
+ self.assertEqual(redacted, f"{DEFAULT_PLACEHOLDER} called {DEFAULT_PLACEHOLDER}.")
+
+ def test_apply_selection_and_compose_output(self) -> None:
+ selection = SelectionRange(begin=6, end=11)
+ selected_text, offset = apply_selection("hello world", selection)
+
+ self.assertEqual(selected_text, "world")
+ self.assertEqual(offset, 6)
+ self.assertEqual(compose_selection_output("hello world", selection, "there"), "hello there")
+
+
+if __name__ == "__main__":  # executed only when this file is run directly, not on import
+ unittest.main()