From 0db4825de978e0195586533d05a4a151480c530b Mon Sep 17 00:00:00 2001 From: Ali Abusaleh Date: Tue, 26 May 2026 14:16:59 +0200 Subject: [PATCH 1/3] add openAI privacy filter skeleton --- duui-anonymize/.gitignore | 9 + duui-anonymize/README.md | 39 + duui-anonymize/pom.xml | 138 ++ duui-anonymize/requirements.txt | 13 + duui-anonymize/src/main/docker/Dockerfile | 20 + .../src/main/docker/Dockerfile-cuda | 30 + .../src/main/docker/python/communication.lua | 105 + .../src/main/python/communication.lua | 105 + .../src/main/python/duui_anonymize.py | 375 +++ duui-anonymize/src/main/python/duui_opf.py | 3 + .../src/main/python/duui_opf_core.py | 149 ++ .../src/main/python/duui_whisperx.py | 430 ++++ duui-anonymize/src/main/python/typesystem.xml | 2202 +++++++++++++++++ .../main/resources/TypeSystemAnonymize.xml | 46 + .../src/test/java/AnonymizeTests.java | 447 ++++ .../src/test/resources/sample_pii_de.txt | 4 + .../src/test/resources/sample_pii_en.txt | 4 + .../test/results/testCustomPlaceholder.json | 15 + .../test/results/testCustomPlaceholder.xmi | 1 + .../src/test/results/testEmailRedaction.json | 8 + .../src/test/results/testEmailRedaction.xmi | 1 + .../src/test/results/testEmptyDocument.json | 8 + .../src/test/results/testEmptyDocument.xmi | 1 + .../src/test/results/testGermanText.json | 15 + .../src/test/results/testGermanText.xmi | 1 + .../test/results/testMultiplePiiEntities.json | 22 + .../test/results/testMultiplePiiEntities.xmi | 1 + .../results/testPhoneNumberRedaction.json | 15 + .../test/results/testPhoneNumberRedaction.xmi | 1 + .../src/test/results/testPseudoMode.json | 15 + .../src/test/results/testPseudoMode.xmi | 1 + .../src/test/results/testSelectionWindow.json | 22 + .../src/test/results/testSelectionWindow.xmi | 1 + .../test/results/testSimplePersonName.json | 8 + .../src/test/results/testSimplePersonName.xmi | 1 + .../src/test/results/testXmiOutput.json | 15 + .../src/test/results/testXmiOutput.xmi | 1 + .../tests/test_communication_contract.py 
| 20 + duui-anonymize/tests/test_duui_opf_core.py | 79 + 39 files changed, 4371 insertions(+) create mode 100644 duui-anonymize/.gitignore create mode 100644 duui-anonymize/README.md create mode 100644 duui-anonymize/pom.xml create mode 100644 duui-anonymize/requirements.txt create mode 100644 duui-anonymize/src/main/docker/Dockerfile create mode 100644 duui-anonymize/src/main/docker/Dockerfile-cuda create mode 100644 duui-anonymize/src/main/docker/python/communication.lua create mode 100644 duui-anonymize/src/main/python/communication.lua create mode 100644 duui-anonymize/src/main/python/duui_anonymize.py create mode 100644 duui-anonymize/src/main/python/duui_opf.py create mode 100644 duui-anonymize/src/main/python/duui_opf_core.py create mode 100644 duui-anonymize/src/main/python/duui_whisperx.py create mode 100644 duui-anonymize/src/main/python/typesystem.xml create mode 100644 duui-anonymize/src/main/resources/TypeSystemAnonymize.xml create mode 100644 duui-anonymize/src/test/java/AnonymizeTests.java create mode 100644 duui-anonymize/src/test/resources/sample_pii_de.txt create mode 100644 duui-anonymize/src/test/resources/sample_pii_en.txt create mode 100644 duui-anonymize/src/test/results/testCustomPlaceholder.json create mode 100644 duui-anonymize/src/test/results/testCustomPlaceholder.xmi create mode 100644 duui-anonymize/src/test/results/testEmailRedaction.json create mode 100644 duui-anonymize/src/test/results/testEmailRedaction.xmi create mode 100644 duui-anonymize/src/test/results/testEmptyDocument.json create mode 100644 duui-anonymize/src/test/results/testEmptyDocument.xmi create mode 100644 duui-anonymize/src/test/results/testGermanText.json create mode 100644 duui-anonymize/src/test/results/testGermanText.xmi create mode 100644 duui-anonymize/src/test/results/testMultiplePiiEntities.json create mode 100644 duui-anonymize/src/test/results/testMultiplePiiEntities.xmi create mode 100644 duui-anonymize/src/test/results/testPhoneNumberRedaction.json 
create mode 100644 duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi create mode 100644 duui-anonymize/src/test/results/testPseudoMode.json create mode 100644 duui-anonymize/src/test/results/testPseudoMode.xmi create mode 100644 duui-anonymize/src/test/results/testSelectionWindow.json create mode 100644 duui-anonymize/src/test/results/testSelectionWindow.xmi create mode 100644 duui-anonymize/src/test/results/testSimplePersonName.json create mode 100644 duui-anonymize/src/test/results/testSimplePersonName.xmi create mode 100644 duui-anonymize/src/test/results/testXmiOutput.json create mode 100644 duui-anonymize/src/test/results/testXmiOutput.xmi create mode 100644 duui-anonymize/tests/test_communication_contract.py create mode 100644 duui-anonymize/tests/test_duui_opf_core.py diff --git a/duui-anonymize/.gitignore b/duui-anonymize/.gitignore new file mode 100644 index 00000000..6649f848 --- /dev/null +++ b/duui-anonymize/.gitignore @@ -0,0 +1,9 @@ +/../* +../ +../* +../duui-mm/* +.venv/** +.vscode/** +__pycache__/** +*.pyc + diff --git a/duui-anonymize/README.md b/duui-anonymize/README.md new file mode 100644 index 00000000..8c005b75 --- /dev/null +++ b/duui-anonymize/README.md @@ -0,0 +1,39 @@ +#### OpenAI Privacy Filter component for DUUI + +OpenAI Privacy Filter: https://github.com/openai/privacy-filter + +#### Input/Output: + +input: Text in the Sofa. Optional selection offsets can be passed through Lua options. + +output: structured redaction spans and redacted text + +#### Output Shape: + +Privacy Filter detects 8 privacy span categories: + +- `account_number` +- `private_address` +- `private_email` +- `private_person` +- `private_phone` +- `private_url` +- `private_date` +- `secret` + +The model emits BIOES token classes for these categories plus `O`, and the service turns the resulting spans into DUUI annotations and redacted text. 
+ +#### Parameter: + +[optional] OPF redaction options such as `model`, `context_window_length`, `trim_whitespace`, `device`, `output_mode`, `decode_mode`, `discard_overlapping_predicted_spans`, `viterbi_calibration_path`, and selection offsets (`selection_begin` / `selection_end`). + +#### Modes: + +- `replacement`: default mode, replaces detected spans with a consistent placeholder. +- `pseudo`: kept as a stub / TODO mode and currently returns the input unchanged. +- `mode` is passed through Lua options. + +#### Entry points: + +- `src/main/python/duui_opf.py`: new OPF entrypoint wrapper. +- `src/main/python/duui_whisperx.py`: compatibility implementation file while the migration is in progress. diff --git a/duui-anonymize/pom.xml b/duui-anonymize/pom.xml new file mode 100644 index 00000000..3e7b0b79 --- /dev/null +++ b/duui-anonymize/pom.xml @@ -0,0 +1,138 @@ + + + 4.0.0 + + org.texttechnology + duui-anonymize + 1.0-SNAPSHOT + + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + + mehler + Prof. Dr. 
Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + aabusale + Ali Abusaleh + a.abusaleh@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/ali-abusaleh/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + Research assistant + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + 21 + 21 + UTF-8 + 2.4.0 + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + fac60bef3f + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.5 + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-anomaly-asl + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + + diff --git a/duui-anonymize/requirements.txt b/duui-anonymize/requirements.txt new file mode 100644 index 00000000..002c4342 --- /dev/null +++ b/duui-anonymize/requirements.txt @@ -0,0 +1,13 @@ +numpy +dkpro_cassis +fastapi +pydantic +pydantic-settings +pydantic_core +starlette +uvicorn +torch +torchvision +torchaudio +setuptools +opf @ git+https://github.com/openai/privacy-filter.git diff --git a/duui-anonymize/src/main/docker/Dockerfile b/duui-anonymize/src/main/docker/Dockerfile new file mode 100644 index 00000000..5818820a --- /dev/null +++ b/duui-anonymize/src/main/docker/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.10 + +WORKDIR /usr/src/app + +EXPOSE 9714 + +COPY ./src/main/python/communication.lua ./communication.lua +COPY 
./src/main/python/duui_anonymize.py ./duui_anonymize.py +COPY ./src/main/python/duui_opf_core.py ./duui_opf_core.py +COPY ./src/main/python/typesystem.xml ./typesystem.xml +COPY ./requirements.txt ./requirements.txt + +RUN apt-get update +RUN apt-get install ffmpeg -y + +RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cpu +RUN pip install -r requirements.txt + +ENTRYPOINT ["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git a/duui-anonymize/src/main/docker/Dockerfile-cuda b/duui-anonymize/src/main/docker/Dockerfile-cuda new file mode 100644 index 00000000..4af68c5f --- /dev/null +++ b/duui-anonymize/src/main/docker/Dockerfile-cuda @@ -0,0 +1,30 @@ +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip + +WORKDIR /usr/src/app + +EXPOSE 9714 + +COPY ./src/main/python/communication.lua ./communication.lua +COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py +COPY ./src/main/python/duui_opf_core.py ./duui_opf_core.py +COPY ./src/main/python/typesystem.xml ./typesystem.xml +COPY ./requirements.txt ./requirements.txt + +RUN apt-get update +RUN apt-get install ffmpeg -y + +RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 +RUN pip install -r requirements.txt + +ENTRYPOINT ["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git 
-- Build the option table forwarded to the annotator: every entry of `params`
-- except the selection-related keys, which are kept out of `options`.
-- A nil `params` yields an empty table.
local function copy_options(params)
    local reserved = {
        selection = true,
        selection_begin = true,
        selection_end = true,
        selection_start = true,
        selection_stop = true,
    }

    local forwarded = {}
    for name, setting in pairs(params or {}) do
        if not reserved[name] then
            forwarded[name] = setting
        end
    end
    return forwarded
end
-- Serialize the CAS into the JSON request sent to the annotator.
-- Inputs:
-- - inputCas: The actual CAS object to serialize
-- - outputStream: Stream that is sent to the annotator
-- - params: Optional pipeline parameters; selection keys are split off into
--   the dedicated `selection` field, everything else goes into `options`
function serialize(inputCas, outputStream, params)
    -- A CAS without sofa data is sent as an empty document.
    local text = inputCas:getSofaDataString()
    if text == nil then
        text = ""
    end

    -- copy_options() removes every selection key from `options`, so the
    -- selection has to be forwarded explicitly here; otherwise a requested
    -- selection window is silently dropped. resolve_selection() returns nil
    -- when no complete selection was supplied.
    outputStream:write(json.encode({
        text = text,
        options = copy_options(params),
        selection = resolve_selection(params)
    }))
end
-- Derive the selection window from the pipeline parameters.
-- A nested `selection` table (keys begin/start and end/stop) is tried first,
-- then the flat selection_begin/selection_start and selection_end/selection_stop
-- keys. Returns a table { begin = ..., ["end"] = ... } or nil when no complete
-- pair of bounds is available.
local function resolve_selection(params)
    if params == nil then
        return nil
    end

    -- Package two bounds as a range, or nil when either bound is missing.
    local function as_range(lower, upper)
        if lower == nil or upper == nil then
            return nil
        end
        return {
            begin = lower,
            ["end"] = upper,
        }
    end

    local nested = params["selection"]
    if type(nested) == "table" then
        local range = as_range(
            nested["begin"] or nested["start"],
            nested["end"] or nested["stop"]
        )
        if range ~= nil then
            return range
        end
    end

    return as_range(
        params["selection_begin"] or params["selection_start"],
        params["selection_end"] or params["selection_stop"]
    )
end
function serialize(inputCas, outputStream, params)
    -- A CAS without sofa data is sent as an empty document.
    local text = inputCas:getSofaDataString()
    if text == nil then
        text = ""
    end

    -- Encode data as JSON object and write to stream.
    -- Selection keys are removed from `options` by copy_options() and sent
    -- separately as `selection` (nil when no complete selection was given).
    outputStream:write(json.encode({
        text = text,
        options = copy_options(params),
        selection = resolve_selection(params)
    }))
end

-- This "deserialize" function is called on receiving the results from the annotator that have to be transformed into a CAS object
-- Inputs:
-- - inputCas: The actual CAS object to deserialize into
-- - inputStream: Stream that is received from the annotator, can be e.g. a string, JSON payload, ...
function deserialize(inputCas, inputStream)
    -- Get string from stream, assume UTF-8 encoding
    local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8)

    -- Parse JSON data from string into object
    local results = json.decode(inputString)

    -- Prefer a dedicated "opf_redacted" view; fall back to the input CAS when
    -- the view cannot be created (pcall swallows the error).
    -- NOTE(review): in the fallback case setSofaDataString below is called on
    -- inputCas itself, which presumably already has sofa data -- confirm that
    -- this cannot raise.
    local targetCas = inputCas
    if inputCas.createView ~= nil then
        local ok, view = pcall(function()
            return inputCas:createView("opf_redacted")
        end)
        if ok and view ~= nil then
            targetCas = view
        end
    end

    -- Store the redacted text, or the unchanged text when no redacted text was returned.
    if results["redacted_text"] ~= nil then
        targetCas:setSofaDataString(results["redacted_text"], "text/plain")
    elseif results["text"] ~= nil then
        targetCas:setSofaDataString(results["text"], "text/plain")
    end

    -- One Anomaly annotation per detected span.
    -- NOTE(review): the service appears to report span offsets relative to the
    -- original text, while the annotations are created on targetCas, which
    -- holds the redacted text -- verify that begin/end are meaningful there.
    if results["detected_spans"] ~= nil then
        for i, sent in ipairs(results["detected_spans"]) do
            local anomaly = luajava.newInstance("de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", targetCas)
            anomaly:setBegin(sent["start"])
            anomaly:setEnd(sent["end"])
            anomaly:setCategory(sent["label"])
            -- Description falls back from placeholder to span text to label.
            anomaly:setDescription(sent["placeholder"] or sent["text"] or sent["label"])
            anomaly:addToIndexes()
        end
    end

    if results["selection"] ~= nil then
        -- Selection metadata is available in the JSON response for downstream consumers.
        -- Intentionally a no-op: nothing is written to the CAS for it.
    end
end
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
    """Convert a request-validation failure into an HTTP 422 JSON response.

    The response still echoes the validation errors and the raw body back to
    the caller, who sent that data in the first place. The server log receives
    only the error type, location and message plus the body size: this service
    exists to redact personal data, so unredacted request text must not end up
    in log files.

    Args:
        request: The failing request; its body is read for the response echo.
        exc: The validation error raised for the request.

    Returns:
        A 422 response with ``detail`` (the errors) and ``body`` (the raw
        request body decoded as UTF-8 with replacement characters).
    """
    body = await request.body()
    errors = exc.errors()

    # An error entry can carry an ``input`` field holding the submitted
    # document, so only the non-content fields are logged.
    loggable = [
        {field: err.get(field) for field in ("type", "loc", "msg")}
        if isinstance(err, dict)
        else type(err).__name__
        for err in errors
    ]
    logger.error("422 Unprocessable Entity — validation errors: %s", loggable)
    logger.error("Raw request body withheld from log (%d bytes)", len(body))

    return JSONResponse(
        status_code=422,
        content=jsonable_encoder({"detail": errors, "body": body.decode("utf-8", errors="replace")}),
    )
def _resolve_selection(
    request_selection: Optional[SelectionRange],
    options: dict[str, Any],
    *,
    text_length: int,
) -> Optional[SelectionRange]:
    """Determine the selection window for a request.

    Precedence: the explicit ``request_selection``; otherwise a nested
    ``options["selection"]`` mapping (``begin``/``start`` and ``end``/``stop``);
    otherwise the flat ``selection_begin``/``selection_start`` and
    ``selection_end``/``selection_stop`` options. The alias handling and the
    fall-through from an incomplete nested mapping to the flat keys match the
    Lua communication layer.

    Args:
        request_selection: Selection given at the top level of the request.
        options: Request options; when no explicit selection was given, the
            selection keys are removed from this dict in place.
        text_length: Length of the document text, used for bounds checking.

    Returns:
        The validated selection, or ``None`` when no complete pair of bounds
        was supplied.

    Raises:
        ValueError: If the bounds violate ``0 <= begin <= end <= text_length``
            or cannot be converted to ``int``.
    """
    if request_selection is not None:
        begin = int(request_selection.begin)
        end_val = int(request_selection.end)
    else:
        begin = None
        end_val = None
        nested = options.pop("selection", None)
        if isinstance(nested, dict):
            begin = nested.get("begin")
            if begin is None:
                begin = nested.get("start")
            end_val = nested.get("end")
            if end_val is None:
                end_val = nested.get("stop")

        # The flat keys are always consumed so they never leak into the
        # redactor options; they are only used when the nested form did not
        # yield a complete pair.
        flat_begin = options.pop("selection_begin", options.pop("selection_start", None))
        flat_end = options.pop("selection_end", options.pop("selection_stop", None))
        if begin is None or end_val is None:
            begin, end_val = flat_begin, flat_end

    if begin is None or end_val is None:
        return None

    begin = int(begin)
    end_val = int(end_val)
    if begin < 0 or end_val < begin or end_val > text_length:
        raise ValueError("selection must satisfy 0 <= begin <= end <= text length")
    return SelectionRange(begin=begin, end=end_val)
def _compose_redacted(text: str, spans: list[DetectedSpan], *, placeholder: str) -> str:
    """Return ``text`` with every detected span replaced by ``placeholder``.

    Spans are processed in ``(start, end)`` order. Spans that overlap an
    already redacted region are merged into it: no second placeholder is
    emitted, but the redacted region is extended to the later end so that no
    part of an overlapping span survives in the output.

    Args:
        text: The text the span offsets refer to.
        spans: Objects exposing integer ``start`` and ``end`` offsets.
        placeholder: Replacement string emitted once per merged region.

    Returns:
        The redacted text; ``text`` itself when ``spans`` is empty.
    """
    if not spans:
        return text

    parts: list[str] = []
    cursor = 0
    for span in sorted(spans, key=lambda s: (s.start, s.end)):
        if span.start < 0:
            # Invalid offset: ignored rather than interpreted as a negative index.
            continue
        if span.start < cursor:
            # Overlaps the previous redacted region. Skipping it outright would
            # leave text[cursor:span.end] unredacted, so extend the region.
            cursor = max(cursor, span.end)
            continue
        parts.append(text[cursor:span.start])
        parts.append(placeholder)
        # max() with span.start keeps the cursor from moving backwards on an
        # inverted span (end < start), which would duplicate text.
        cursor = max(span.start, span.end)
    parts.append(text[cursor:])
    return "".join(parts)
def _redact_text(
    text: str,
    selection: Optional[SelectionRange],
    options: dict[str, Any],
) -> DUUIResponse:
    """Redact ``text`` (or only the selected window of it) and build the response.

    Args:
        text: Full document text.
        selection: Optional window; when given, only ``text[begin:end]`` is sent
            to the redactor and the result is spliced back into the full text.
        options: Request options; split into redactor and decode options.

    Returns:
        A ``DUUIResponse`` whose ``detected_spans`` offsets refer to the full
        ``text`` and whose ``redacted_text`` is the full text with the detected
        spans replaced by the placeholder. In pseudo mode the input is returned
        unchanged with a warning.
    """
    redactor_opts, decode_opts = _split_options(options)
    mode = str(redactor_opts.get("mode", DEFAULT_MODE))
    placeholder = str(redactor_opts.get("placeholder", DEFAULT_PLACEHOLDER))

    if mode == PSEUDO_MODE:
        # Pseudonymisation is a stub: report nothing and echo the input.
        return DUUIResponse(
            schema_version=1,
            summary={"mode": PSEUDO_MODE, "span_count": 0, "by_label": {}, "decoded_mismatch": False},
            text=text,
            detected_spans=[],
            redacted_text=text,
            warning="pseudo mode returns the input unchanged",
            selection=selection,
        )

    # ``mode`` and ``placeholder`` only affect post-processing here and are not
    # read by _build_redactor. Keeping them out of the cache key stops every
    # new placeholder value from constructing (and caching) another redactor.
    engine_opts = {
        key: value
        for key, value in redactor_opts.items()
        if key not in ("mode", "placeholder")
    }
    redactor = _build_redactor(_json_cache_key(engine_opts))
    decode = DecodeOptions(**decode_opts) if decode_opts else None

    selected_text = text
    offset = 0
    if selection is not None:
        offset = selection.begin
        selected_text = text[selection.begin:selection.end]

    result = redactor.redact(selected_text, decode=decode)

    if isinstance(result, str):
        # The redactor returned plain text without span information.
        redacted_text = result if selection is None else (
            text[:selection.begin] + result + text[selection.end:]
        )
        return DUUIResponse(
            schema_version=1,
            summary={"mode": mode, "span_count": 0, "by_label": {}, "decoded_mismatch": False},
            text=text,
            detected_spans=[],
            redacted_text=redacted_text,
            selection=selection,
        )

    # Spans are reported relative to the full document ...
    detected_spans = _parse_spans(result.detected_spans, offset=offset)
    # ... but the replacement is composed on the selected window, so shift them
    # back and attach the effective placeholder.
    # NOTE(review): the returned ``detected_spans`` keep the placeholder parsed
    # from the redactor result, not the effective ``placeholder`` -- confirm
    # that this is intended.
    local_spans = [
        DetectedSpan(label=s.label, start=s.start - offset, end=s.end - offset,
                     text=s.text, placeholder=placeholder)
        for s in detected_spans
    ]
    redacted_text = _compose_redacted(selected_text, local_spans, placeholder=placeholder)
    if selection is not None:
        redacted_text = text[:selection.begin] + redacted_text + text[selection.end:]

    return DUUIResponse(
        schema_version=int(result.schema_version),
        summary={**dict(result.summary), "mode": mode},
        text=text,
        detected_spans=detected_spans,
        redacted_text=redacted_text,
        warning=result.warning,
        selection=selection,
    )
def apply_replacement_text(
    text: str,
    spans: list[RedactionSpan],
    *,
    placeholder: str = DEFAULT_PLACEHOLDER,
) -> str:
    """Return ``text`` with every span replaced by ``placeholder``.

    Spans are processed in ``(start, end)`` order. A span overlapping an
    already redacted region is merged into it: no additional placeholder is
    emitted, but the region is extended to the later end so that no part of an
    overlapping span remains readable.

    Args:
        text: The text the span offsets refer to.
        spans: Spans to redact; only ``start`` and ``end`` are read.
        placeholder: Replacement string emitted once per merged region.

    Returns:
        The redacted text; ``text`` itself when ``spans`` is empty.
    """
    if not spans:
        return text

    redacted_parts: list[str] = []
    cursor = 0
    for span in sorted(spans, key=lambda item: (item.start, item.end)):
        if span.start < 0:
            # Invalid offset: ignored rather than interpreted as a negative index.
            continue
        if span.start < cursor:
            # Overlap with the previous region: skipping the span entirely
            # would leave text[cursor:span.end] unredacted.
            cursor = max(cursor, span.end)
            continue
        redacted_parts.append(text[cursor:span.start])
        redacted_parts.append(placeholder)
        # Never move the cursor backwards, even for an inverted span.
        cursor = max(span.start, span.end)
    redacted_parts.append(text[cursor:])
    return "".join(redacted_parts)
text[selection.begin:selection.end], selection.begin + + +def compose_selection_output( + text: str, + selection: SelectionRange | None, + replacement: str, +) -> str: + if selection is None: + return replacement + return text[:selection.begin] + replacement + text[selection.end:] diff --git a/duui-anonymize/src/main/python/duui_whisperx.py b/duui-anonymize/src/main/python/duui_whisperx.py new file mode 100644 index 00000000..05fdc719 --- /dev/null +++ b/duui-anonymize/src/main/python/duui_whisperx.py @@ -0,0 +1,430 @@ +from functools import lru_cache +import json +from enum import Enum +from typing import Any, List + +import torch +import uvicorn +from cassis import load_typesystem +from fastapi import FastAPI, Response +from fastapi.encoders import jsonable_encoder +from fastapi.responses import PlainTextResponse +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings +from starlette.responses import JSONResponse + +from opf import DecodeOptions, OPF + + +class DetectedSpan(BaseModel): + """One detected privacy span returned by OPF.""" + + label: str + start: int + end: int + text: str + placeholder: str + + +class SelectionRange(BaseModel): + """Optional text selection inside the source document.""" + + begin: int + end: int + + +class DUUIRequest(BaseModel): + """Request sent by DUUI and transformed by the Lua communication layer.""" + + text: str + options: dict[str, Any] = Field(default_factory=dict) + selection: SelectionRange | None = None + + +class DUUIResponse(BaseModel): + """Response of this annotator.""" + + schema_version: int + summary: dict[str, Any] + text: str + detected_spans: List[DetectedSpan] + redacted_text: str + warning: str | None = None + selection: SelectionRange | None = None + + +class DUUIDocumentation(BaseModel): + """Documentation response.""" + + annotator_name: str + version: str + implementation_lang: str + + +class Settings(BaseSettings): + """Runtime settings for the DUUI service.""" + + 
# FastAPI application object. The interactive docs are served under "/api"
# and ReDoc is disabled.
app = FastAPI(
    docs_url="/api",
    redoc_url=None,
    title="OpenAI Privacy Filter",
    description="Text privacy redaction for TTLab DUUI",
    version="1.0",
    terms_of_service="https://www.texttechnologylab.org/legal_notice/",
    contact={
        "name": "Daniel Bundan",
        # The OpenAPI contact URL must be an absolute URL; a bare host name
        # is rejected when the schema behind the docs page is generated.
        "url": "https://bundan.me",
        "email": "s1486849@stud.uni-frankfurt.de",
    },
    license_info={
        "name": "AGPL",
        "url": "http://www.gnu.org/licenses/agpl-3.0.en.html",
    },
)
-> JSONResponse: + json_item = { + "inputs": [], + "outputs": ["de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly"] + } + + json_compatible_item_data = jsonable_encoder(json_item) + return JSONResponse(content=json_compatible_item_data) + + +# Get typesystem of this annotator +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + # TODO remove cassis dependency, as only needed for typesystem at the moment? + xml = typesystem.to_xml() + xml_content = xml.encode("utf-8") + + return Response( + content=xml_content, + media_type="application/xml" + ) + + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return communication + + +# Return documentation info +@app.get("/v1/documentation") +def get_documentation() -> DUUIDocumentation: + + documentation = DUUIDocumentation( + annotator_name=settings.duui_tool_name, + version=settings.duui_tool_version, + implementation_lang="Python", + ) + return documentation + + +def _selection_from_options( + request_selection: SelectionRange | None, + options: dict[str, Any], + *, + text_length: int, +) -> SelectionRange | None: + if request_selection is not None: + begin = int(request_selection.begin) + end = int(request_selection.end) + else: + selection = options.pop("selection", None) + if isinstance(selection, dict): + begin = selection.get("begin") + end = selection.get("end") + else: + begin = options.pop("selection_begin", options.pop("selection_start", None)) + end = options.pop("selection_end", options.pop("selection_stop", None)) + + if begin is None or end is None: + return None + + begin = int(begin) + end = int(end) + if begin < 0 or end < begin or end > text_length: + raise ValueError("selection must satisfy 0 <= begin <= end <= text length") + return SelectionRange(begin=begin, end=end) + + +def _json_key(payload: dict[str, Any]) -> str: + return json.dumps(payload, sort_keys=True, separators=(",", ":"), 
def _compose_replacement_text(
    text: str,
    spans: list[DetectedSpan],
    *,
    placeholder: str = DEFAULT_PLACEHOLDER,
) -> str:
    """Replace every detected span in ``text`` with ``placeholder``.

    Spans are processed in ``(start, end)`` order. A span that starts inside
    an already redacted region is merged into it: no second placeholder is
    emitted, but the redacted region is extended to the end of that span so
    its tail is not copied to the output.

    Args:
        text: Text the span offsets refer to.
        spans: Detected spans; only ``start`` and ``end`` are read.
        placeholder: Replacement written once per merged region.

    Returns:
        The redacted text, or ``text`` unchanged when ``spans`` is empty.
    """
    if not spans:
        return text

    redacted_parts: list[str] = []
    cursor = 0
    for span in sorted(spans, key=lambda item: (item.start, item.end)):
        if span.start < cursor:
            # Overlap: keep redacting up to the furthest end seen so far.
            cursor = max(cursor, span.end)
            continue
        redacted_parts.append(text[cursor:span.start])
        redacted_parts.append(placeholder)
        cursor = max(cursor, span.end)
    redacted_parts.append(text[cursor:])
    return "".join(redacted_parts)
# Process request from DUUI
@app.post("/v1/process")
def post_process(request: DUUIRequest) -> DUUIResponse:
    """Redact the request text, optionally restricted to a selection.

    The selection is resolved from ``request.selection`` or from the request
    options. Each call below receives its own copy of the options, so the
    keys removed while resolving the selection do not affect redaction.

    Raises:
        HTTPException: With status 422 when the selection is invalid, so the
            client gets a validation error instead of an internal error.
    """
    # Imported locally so this handler does not depend on edits to the
    # module-level import block.
    from fastapi import HTTPException

    try:
        selection = service.selection_from_options(
            request.selection,
            dict(request.options),
            text_length=len(request.text),
        )
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    return service.redact_text(request.text, selection, dict(request.options))
"__main__": + uvicorn.run("duui_opf:app", host="0.0.0.0", port=9714, workers=1) diff --git a/duui-anonymize/src/main/python/typesystem.xml b/duui-anonymize/src/main/python/typesystem.xml new file mode 100644 index 00000000..5032b07b --- /dev/null +++ b/duui-anonymize/src/main/python/typesystem.xml @@ -0,0 +1,2202 @@ + + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + + uima.tcas.Annotation + + + description + + uima.cas.String + + + suggestions + An array of the suggested actions to be taken for this anomaly. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + + + category + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.GrammarAnomaly + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SpellingAnomaly + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + + uima.tcas.Annotation + + + replacement + The text covered by the Anomaly annotation should be replaced with the contents of this + feature. + + uima.cas.String + + + certainty + A score representing how certain is this suggested action. + Usually in [0,1]. + + uima.cas.Float + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain + Marks the beginning of a chain. + uima.cas.AnnotationBase + + + first + This is the first corefernce link in coreference chain + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + A link in the coreference chain. + uima.tcas.Annotation + + + next + If there is one, it is the next coreference link to the current coreference link + + de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink + + + referenceType + The role or type which the covered text has in the coreference chain. 
+ uima.cas.String + + + referenceRelation + The type of relation between this link and the next link in the chain. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.frequency.tfidf.type.Tfidf + Annotates the tf.idf score of a token, stem, or lemma. + uima.tcas.Annotation + + + tfidfValue + The tf.idf score. + uima.cas.Double + + + term + The string that was used to compute this tf.idf score. + If a stem or lemma was used, the covered text of this annotation does not need to be equal to + this string. + + This string can be used to construct a vector space with the right terms without having to + access the indexes again. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme + + uima.tcas.Annotation + + + morphTag + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + Morphological categories that can be attached to tokens. + + The features are supposed to match the Universal Dependency v1 features. + + uima.tcas.Annotation + + + gender + + uima.cas.String + + + number + Singular/plural + uima.cas.String + + + case + Nouns: nominative, genetiv, dative, ... + uima.cas.String + + + degree + Adjectives: comparative/Superlative + uima.cas.String + + + verbForm + + uima.cas.String + + + tense + Verbs: past tense, present tense, future tense, etc. + uima.cas.String + + + mood + Verbs: indicative, imperative, subjunctive + uima.cas.String + + + voice + Verbs: active/passive + uima.cas.String + + + definiteness + Definite or indefinite + uima.cas.String + + + value + The original morphological analysis results as produced by a tool or as recorded in a + corpus (if available). If the categories were originally encoded in such a string, the other + features are filled by analyzing this string. If the categories were provided separately, e.g. + by different attributed in an XML-encoded corpus, this field may remain empty. 
+ + uima.cas.String + + + person + Verbs: 1st, 2nd, 3rd person + uima.cas.String + + + aspect + Verbs: perfective, imperfective + uima.cas.String + + + animacy + + uima.cas.String + + + negative + + uima.cas.String + + + numType + + uima.cas.String + + + possessive + + uima.cas.String + + + pronType + + uima.cas.String + + + reflex + + uima.cas.String + + + transitivity + Verbs: transitive/intransitive + + @deprecated + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + The part of speech of a word or a phrase. + uima.tcas.Annotation + + + PosValue + Fine-grained POS tag. This is the tag as produced by a POS tagger or obtained from a + reader. + + uima.cas.String + + + coarseValue + Coarse-grained POS tag. This may be produced by a POS tagger or reader in addition to + the fine-grained tag. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADJ + Adjective + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADP + Adposition + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_ADV + Adverb + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_AUX + Auxiliary verb + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_CONJ + Conjunction + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_DET + Determiner + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_INTJ + Interjection + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN + Noun + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + 
de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NUM + Numeral + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PART + Particle + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PRON + Pronoun + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PROPN + Proper noun + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_PUNCT + Punctuation + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_SCONJ + Subordinating conjunction + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_SYM + Symbol + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_VERB + Verb + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + Other + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_AT + at-mention (indicates another user as a recipient of a tweet) + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_DM + discourse marker, indications of continuation of a message across multiple tweets + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_EMO + emoticon + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_HASH + Hashtag (indicates topic/category for tweet) + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + 
de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_INT + proper noun + verbal + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_NNV + nominal + verbal + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_NPV + proper noun + verbal + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_NOUN + + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.tweet.POS_URL + URL or email address + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS_X + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField + <p>A general purpose annotation to store document-wide information in the form of + arbitrary key-value string pairs.</p> + + uima.tcas.Annotation + + + key + Name of a metadata field. + uima.cas.String + + + value + The field value. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + Description of an individual tag. + uima.cas.TOP + + + name + The name of the tag. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription + Information about a tagset (controlled vocabulary). + uima.tcas.Annotation + + + layer + The layer to which the tagset applies. This is + typically the name of an UIMA type such as + "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS". + + uima.cas.String + + + name + The name of the tagset. + uima.cas.String + + + tags + Descriptions of the tags belonging to this tagset. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription + + + componentName + + uima.cas.String + + + modelLocation + + uima.cas.String + + + modelVariant + + uima.cas.String + + + modelLanguage + + uima.cas.String + + + modelVersion + + uima.cas.String + + + input + True if the tagset is used as input by the component/model, otherwise false. 
+ + uima.cas.Boolean + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Animal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Cardinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ContactInfo + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Date + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Disease + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Event + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Fac + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.FacDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Game + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Gpe + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.GpeDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Language + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Law + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Money + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + Named entities refer e.g. to persons, locations, organizations and so on. They often consist of + multiple tokens. 
+ + uima.tcas.Annotation + + + value + The class/category of the named entity, e.g. person, location, etc. + uima.cas.String + + + identifier + Identifier of the named entity, e.g. a reference into a person database. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Nationality + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Norp + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Ordinal + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.OrgDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.PerDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Percent + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Person + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Plant + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Product + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.ProductDesc + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Quantity + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Substance + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.Time + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.WorkOfArt + + de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity + + + 
de.tudarmstadt.ukp.dkpro.core.api.phonetics.type.PhoneticTranscription + <p>Represents the phonetic transcription of some textual element (usually a Token). + Phonetic transcriptions are e.g. generated by transcription processes like Soundex or Metaphone.</p> + + uima.tcas.Annotation + + + transcription + The actual transcription + uima.cas.String + + + name + The name of the transcription process that was used + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound + This type represents a decompounding word, i.e.: flowerpot. Each Compound one have at least two + Splits. + + uima.tcas.Annotation + + + splits + A word that can be decomposed into different parts. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart + <p>A CompoundPart represents one fragment from the compounding word. Besides that, it can + store other CompoundParts if it can be split again. The way it stores a decompounding word represents a + decompounding tree.</p> + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + Document structure element. + uima.tcas.Annotation + + + divType + + uima.cas.String + + + id + If this unit had an ID in the source format from which it was imported, it may be + stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it + should be respected by writers. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Document + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading + Document title, section heading, etc. 
+ de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LexicalPhrase + + uima.tcas.Annotation + + + text + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme + This type represents a linking morpheme between two CompoundParts. + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram + + uima.tcas.Annotation + + + text + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence + + uima.tcas.Annotation + + + id + If this unit had an ID in the source format from which it was imported, it may be + stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it + should be respected by writers. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + This type represents a part of a decompounding word. A Split can be either a CompoundPart or a + LinkingMorpheme. + + uima.tcas.Annotation + + + splits + Sub-splits of the current split. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm + This annotation can be used to indicate an alternate surface form. E.g. some corpora consider a + normalized form of the text with resolved contractions as the canonical form and only maintain the + original surface form as a secondary information. 
One example is the Conll-U format. + + uima.tcas.Annotation + + + value + Alternate surface form. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + <p>Token is one of the two types commonly produced by a segmenter (the other being + Sentence). A Token usually represents a word, although it may be used to represent multiple tightly + connected words (e.g. "New York") or parts of a word (e.g. the possessive "'s"). One may choose to split + compound words into multiple tokens, e.g. ("CamelCase" -&gt; "Camel", "Case"; "Zauberstab" -&gt; + "Zauber", "stab"). Most processing components operate on Tokens, usually within the limits of the + surrounding Sentence. E.g. a part-of-speech tagger analyses each Token in a Sentence and assigns a + part-of-speech to each Token.</p> + + uima.tcas.Annotation + + + parent + the parent of this token. This feature is meant to be used in when the token + participates in a constituency parse and then refers to a constituent containing this token. The + type of this feature is {@link Annotation} to avoid adding a dependency on the syntax API + module. + + uima.tcas.Annotation + + + lemma + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma + + + stem + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem + + + pos + + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS + + + morph + The morphological feature associated with this token. + de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures + + + + id + If this unit had an ID in the source format from which it was imported, it may be + stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it + should be respected by writers. + + uima.cas.String + + + form + Potentially normalized form of the token text that should be used instead of the + covered text if set. 
+ + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm + + + syntacticFunction + + uima.cas.String + + + order + Disambiguates the token order for tokens which have the same offsets, e.g. when the + contraction "à" is analyzed as two tokens "a" and "a". + + uima.cas.Integer + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm + A alternative token text which should be used instead of the covered text if set on a token. + + uima.tcas.Annotation + + + value + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg + The SemArg annotation is attached to semantic arguments of semantic + predicates. Semantic arguments are characterized by their semantic role, e.g. Agent, + Experiencer, Topic. The semantic role of an argument is related to its semantic type + (for communication verbs, the Agent can be a person or an organization, but + typically not food). + + uima.tcas.Annotation + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink + The SemArgLink type is used to attach SemPred annotations to their respective SemArg + annotations while giving each link a role. + + uima.cas.TOP + + + role + The role which the argument takes. The value depends on the theory being used, e.g. + Arg0, Arg1, etc. or Buyer, Seller, etc. + + uima.cas.String + + + target + The target argument. + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred + One of the predicates of a sentence (often a main verb, but nouns and adjectives can also be + predicates). + The SemPred annotation can be attached to predicates in a sentence. + Semantic predicates express events or situations and take semantic arguments + expressing the participants in these events or situations. All forms of main verbs + can be annotated with a SemPred. However, there are also many nouns and + adjectives that take arguments and can thus be annotated with a SemanticPredicate, + e.g. 
event nouns, such as "suggestion" (with arguments what and by whom), or + relational adjectives, such as "proud" (with arguments who and of what). + + uima.tcas.Annotation + + + arguments + The predicate's arguments. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink + + + category + A more detailed specification of the predicate type depending on the theory being used, + e.g. a frame name. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + The SemanticArgument annotation is attached to semantic arguments of semantic + predicates. Semantic arguments are characterized by their semantic role, e.g. Agent, + Experiencer, Topic. The semantic role of an argument is related to its semantic type + (for communication verbs, the Agent can be a person or an organization, but + typically not food). The semantic type of arguments is not yet covered by the + SemanticType. + + @deprecated Use SemArg instead. + + uima.tcas.Annotation + + + role + The role which the argument takes. The value depends on the theory being used, e.g. + Arg0, Arg1, etc. or Buyer, Seller, etc. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField + The SemanticField is a coarse-grained semantic category that can be attached to + nouns, verbs or adjectives. Semantic field information is present e.g. in WordNet as + lexicographer file names. Previously, this kind of semantic information has also + been called supersenses or semantic types. + + uima.tcas.Annotation + + + value + The value or name of the semantic field. Examples of semantic field values are: + location, artifact, event, communication, attribute + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate + One of the predicates of a sentence (often a main verb, but nouns and adjectives can also be + predicates). The SemanticPredicate annotation can be attached to predicates in a sentence. 
+ Semantic predicates express events or situations and take semantic arguments + expressing the participants in these events or situations. All forms of main verbs + can be annotated with a SemanticPredicate. However, there are also many nouns and + adjectives that take arguments and can thus be annotated with a SemanticPredicate, + e.g. event nouns, such as "suggestion" (with arguments what and by whom), or + relational adjectives, such as "proud" (with arguments who and of what). + + @deprecated use SemPred instead + + uima.tcas.Annotation + + + category + A more detailed specification of the predicate type depending on the theory being used, + e.g. a frame name. + + uima.cas.String + + + arguments + The predicate's arguments. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument + + + + + de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense + + uima.tcas.Annotation + + + value + The sense identifier. + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.structure.type.Field + + uima.tcas.Annotation + + + name + the name of the tag + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree + <p>The Penn Treebank-style phrase structure string.</p> + uima.tcas.Annotation + + + PennTree + Contains a Penn Treebank-style representation of a tree.
+ uima.cas.String + + + TransformationNames + The name(s) of the transformation(s) that have been performed on the PennTree + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADJC + adjective chunks + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.ADVC + adverb chunks + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.CONCJ + complex coordinating conjunctions such as "as well (as)" or "rather (than)" + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + uima.tcas.Annotation + + + chunkValue + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.INTJ + interjection + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.LST + enumeration symbol + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.NC + noun chunk (non-recursive noun phrase) + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.O + other or outside a chunk + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PC + prepositional chunk + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.PRT + verb particle + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.VC + verb complex + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ADVP + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + uima.tcas.Annotation + + + constituentType + + uima.cas.String + + + parent + The parent constituent + uima.tcas.Annotation + + + children + + uima.cas.FSArray + uima.tcas.Annotation + + + syntacticFunction + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.FRAG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.INTJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.LST + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NAC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.NX + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PARN + This category is called PRN in the Penn Treebank tagset. However, PRN is a reserved device name + on Windows. Thus we had to rename this category. The old PRN type is still present in the DKPro Core type + system, but it is deprecated, no longer used, and no JCas classes are generated for it.
+ + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.QP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.RRC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.S + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBAR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SBARQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SINV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.SQ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.UCP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.VP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADJP + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHADVP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHNP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.WHPP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.X + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ABBREV + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ACOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AGENT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.APPOS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ATTR + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUX0 + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.AUXPASS + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COMPLM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CONJ_YET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.COP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.CSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DEP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + A dependency relation between two tokens. The dependency annotation begin and end offsets + correspond to those of the dependent. 
+ + uima.tcas.Annotation + + + Governor + The governor word + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + Dependent + The dependent word + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + DependencyType + The dependency type + uima.cas.String + + + flavor + Flavor of the dependency relation (basic, collapsed, enhanced, etc...) + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.EXPL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.INFMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.IOBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MARK + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MEASURE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.MWE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NEG + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NN + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NPADVMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NSUBJPASS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUM + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.NUMBER + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARATAXIS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PARTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSS + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.POSSESSIVE + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRECONJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRED + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREDET + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PREPC + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PRT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PUNCT + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.PURPCL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.QUANTMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.RCMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REF + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.REL + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT + Dependency tree root. + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.TMOD + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XCOMP + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.XSUBJ + + de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency + + + de.tudarmstadt.ukp.dkpro.core.api.transform.type.SofaChangeAnnotation + Encodes an edit operation that can be interpreted by the ApplyChangesAnnotator. + uima.tcas.Annotation + + + value + In case of an "insert" or "replace" operation, this feature indicates the value to be + inserted or replaced. + + uima.cas.String + + + operation + Operation to perform: "insert", "replace", "delete" + uima.cas.String + + + reason + The reason for the change. 
+ uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.ArticleInfo + Contains basic information about the article. + uima.tcas.Annotation + + + Authors + Number of unique authors of this article + uima.cas.Integer + + + Revisions + Number of revisions of this article. + uima.cas.Integer + + + FirstAppearance + The Timestamp of the first appearance of this article. + uima.cas.Long + + + LastAppearance + The Timestamp of the last appearance of this article. + uima.cas.Long + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig + Database configuration for the connection to the database where the CAS data was retrieved. + + uima.tcas.Annotation + + + Host + DB Host + uima.cas.String + + + DB + Database + uima.cas.String + + + User + Username + uima.cas.String + + + Password + User password + uima.cas.String + + + Language + Wikipedia Language Versions + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaLink + Wikipedia link + uima.tcas.Annotation + + + LinkType + The type of the link, e.g. internal, external, image, ... + uima.cas.String + + + Target + The link target url + uima.cas.String + + + Anchor + The anchor of the link + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.WikipediaRevision + Represents a revision in Wikipedia. + uima.tcas.Annotation + + + revisionId + The ID of the revision. + uima.cas.Integer + + + pageId + The pageId of the Wikipedia page of this revision. + uima.cas.Integer + + + contributorName + The username of the user/contributor who edited this revision. + uima.cas.String + + + comment + The comment that the editor entered for this revision. 
+ uima.cas.String + + + contributorId + The userId of the user/contributor who created this revision + uima.cas.Integer + + + timestamp + The timestamp of the revision, given in milliseconds since the standard base time + (January 1, 1970, 00:00:00 GMT) + + uima.cas.Long + + + minor + Whether this revision has been marked as minor edit by its contributor. + uima.cas.Boolean + + + + + de.tudarmstadt.ukp.dkpro.core.mallet.type.TopicDistribution + An array representing the topic proportions in a document. + uima.tcas.Annotation + + + TopicProportions + Each topic's proportion in the document. + uima.cas.DoubleArray + + + TopicAssignment + Pointers to topics the document has been assigned to. + uima.cas.IntegerArray + + + + + de.tudarmstadt.ukp.dkpro.core.mallet.type.WordEmbedding + An array representing the word embedding vector. + uima.tcas.Annotation + + + WordEmbedding + A word embedding vector. + uima.cas.FloatArray + + + + + de.tudarmstadt.ukp.dkpro.core.mecab.type.JapaneseToken + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + + kana + + uima.cas.String + + + ibo + + uima.cas.String + + + kei + + uima.cas.String + + + dan + Specifies the kind of the verb if the current token is a verb. Either it is a vowel + stem verb (ichi-dan) or a consonant stem verb (go-dan). Blank if not a verb. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.performance.type.TimerAnnotation + Used for storing timing information (e.g. for performance testing). + uima.tcas.Annotation + + + startTime + + uima.cas.Long + + + endTime + + uima.cas.Long + + + name + The name of the timer. + Used to automatically determine whether this is an upstream or downstream timer. + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.type.ReadabilityScore + + uima.tcas.Annotation + + + measureName + + uima.cas.String + + + score + + uima.cas.Double + + + + + org.dkpro.core.api.xml.type.XmlAttribute + + uima.cas.TOP + + + uri + Namespace URI of the attribute. 
+ uima.cas.String + + + localName + Local name of the attribute. + uima.cas.String + + + value + Value of the XML attribute. + uima.cas.String + + + qName + + uima.cas.String + + + valueType + + uima.cas.String + + + + + org.dkpro.core.api.xml.type.XmlDocument + XML document + uima.tcas.Annotation + + + root + Root element of the XML document. + org.dkpro.core.api.xml.type.XmlElement + + + + + org.dkpro.core.api.xml.type.XmlElement + XML element + org.dkpro.core.api.xml.type.XmlNode + + + uri + Namespace URI of the element. + uima.cas.String + + + localName + Local name of the XML element. + uima.cas.String + + + attributes + Array of attributes of the XML element. + uima.cas.FSArray + org.dkpro.core.api.xml.type.XmlAttribute + + + children + Children of this XML element. + uima.cas.FSArray + org.dkpro.core.api.xml.type.XmlNode + + + qName + + uima.cas.String + + + + + org.dkpro.core.api.xml.type.XmlNode + Supertype for XmlElements and XmlTextNodes. + uima.tcas.Annotation + + + parent + + org.dkpro.core.api.xml.type.XmlElement + + + + + org.dkpro.core.api.xml.type.XmlTextNode + XML text node. + org.dkpro.core.api.xml.type.XmlNode + + + text + + uima.cas.String + + + captured + Whether the text node has been added to the document text. + uima.cas.Boolean + + + + + org.dkpro.core.io.nift.metadata.ArticleMetaData + A document annotation that describes the metadata of a + newspaper article. + + uima.cas.AnnotationBase + + + guid + The GUID field specifies a (4-byte) integer that is + guaranteed + to be unique for every document + in the corpus. + + uima.cas.Integer + + + alternateUrl + This field specifies the location on nytimes.com of + the article. When present, this URL is preferred to the URL field + on articles published on or after April 02, + 2006, as the linked + page will have richer content. + + uima.cas.String + + + url + This field specifies the location on nytimes.com of + the article. 
The 'Alternative Url' + field is preferred to this field + on articles published on or after + April 02, 2006, as the + linked page + will have richer content. + + uima.cas.String + + + publicationDate + This field specifies the date of the article's + publication. This field is specified in the + format + YYYYMMDD'T'HHMMSS where: + 1. YYYY is the four-digit year. + 2. MM is + the two-digit month [01-12]. + 3. DD is the two-digit day [01-31]. + 4. + T is a constant value. + 5. HH is the two-digit hour [00-23]. + 6. MM is + the two-digit minute-past-the-hour [00-59] + 7. SS is the two-digit + seconds-past-the-minute [00-59]. + Please note that values for HH,MM, + and SS are not defined for this + corpus, that is to say + HH,MM, and SS + are always defined to be '00'. + + uima.cas.String + + + typesOfMaterial + This field specifies a normalized list of terms + describing the general editorial category of the article. + These + tags are algorithmically assigned and + manually verified by + nytimes.com production staff. + Examples Include: + * REVIEW + * OBITUARY + * ANALYSIS + + uima.cas.StringArray + + + headline + This field specifies the headline of the article as it + appeared in the + print edition of the New York + Times. + + uima.cas.String + + + onlineHeadline + This field specifies the headline displayed with the + article on + nytimes.com. Often + this differs from the headline used in + print. + + uima.cas.String + + + columnName + If the article is part of a regular column, this field + specifies the + name of that column. + Sample Column Names: + 1. World News + Briefs + 2. WEDDINGS + 3. The Accessories Channel + + uima.cas.String + + + author + This field is based on the normalized byline in the + original corpus data: "The Normalized Byline field is the byline + normalized to the form (last name, first + name)".
+ + uima.cas.String + + + descriptors + The 'descriptors' field specifies a list of + descriptive terms drawn + from a normalized controlled + vocabulary + corresponding to subjects mentioned in the article. These tags + are + hand-assigned by + a team of library scientists working in the New + York Times Indexing + service. + Examples Include: + * ECONOMIC CONDITIONS + AND TRENDS + * AIRPLANES + * VIOLINS + + uima.cas.StringArray + + + onlineDescriptors + This field specifies a list of descriptors from a + normalized + controlled + vocabulary that + correspond to topics mentioned + in the article. These + tags are + algorithmically + assigned and manually + verified by + nytimes.com production staff. + Examples Include: + * Marriages + * Parks and Other Recreation Areas + * Cooking and Cookbooks + + uima.cas.StringArray + + + generalOnlineDescriptors + The 'general online descriptors' field specifies a + list of descriptors that are at a higher level of + generality than + the other tags associated with the article. These tags are + algorithmically + assigned and manually verified by nytimes.com + production staff. + Examples Include: + * Surfing + * Venice Biennale + * Ranches + + uima.cas.String + + + onlineSection + This field specifies the section(s) on nytimes.com in + which the + article is placed. If + the article is placed in multiple + sections, this field will be + specified as a ';' delineated + list. + + uima.cas.String + + + section + This field specifies the section of the paper in which + the article + appears. This is not + the name of the section, but rather + a letter or number that indicates + the section. + + uima.cas.String + + + taxonomicClassifiers + This field specifies a list of taxonomic classifiers + that place this + article into a + hierarchy of articles. The individual + terms of each taxonomic classifier + are separated with the '/' character. 
+ These tags are algorithmically assigned and manually + verified + by nytimes.com production staff. + Examples Include: + * Top/Features/Travel/Guides/Destinations/North America/United States/Arizona + * Top/News/U.S./Rockies + * Top/Opinion + + uima.cas.StringArray + + + + + diff --git a/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml b/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml new file mode 100644 index 00000000..bdf518f0 --- /dev/null +++ b/duui-anonymize/src/main/resources/TypeSystemAnonymize.xml @@ -0,0 +1,46 @@ + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly + Marks a span of text as a privacy-sensitive entity detected by the anonymizer. + uima.tcas.Annotation + + + description + Human-readable description or replacement placeholder for the detected span. + uima.cas.String + + + suggestions + Suggested replacement actions for this anomaly. + uima.cas.FSArray + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + + + category + Privacy category of the detected span, e.g. private_person, email_address, phone_number. + uima.cas.String + + + + + + de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.SuggestedAction + A suggested replacement for an anomaly span. + uima.tcas.Annotation + + + replacement + The text to substitute for the anomaly-covered span. + uima.cas.String + + + certainty + Confidence score in [0,1] for this replacement suggestion. 
+ uima.cas.Float + + + + + diff --git a/duui-anonymize/src/test/java/AnonymizeTests.java b/duui-anonymize/src/test/java/AnonymizeTests.java new file mode 100644 index 00000000..b1c9d316 --- /dev/null +++ b/duui-anonymize/src/test/java/AnonymizeTests.java @@ -0,0 +1,447 @@ +import de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.util.XmlCasSerializer; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.xml.sax.SAXException; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver; +import org.dkpro.core.io.xmi.XmiWriter; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.junit.jupiter.api.Assertions.*; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collection; + +/** + * Integration tests for the DUUI anonymization annotator. + * + * Prerequisites: the Python service must be running on {@value #SERVICE_URL}. + * Start it with: + * uvicorn duui:app --host 0.0.0.0 --port 9714 --workers 1 + * + * Each test loads English text containing personally identifiable information (PII), + * sends it through the OPF anonymizer via DUUI, and asserts that + * {@link Anomaly} annotations are written to the CAS output view. 
+ */ +public class AnonymizeTests { + + static final String SERVICE_URL = "http://127.0.0.1:9714"; + + static final String RESULTS_DIR = "src/test/results"; + + static DUUIComposer composer; + static JCas cas; + + // ------------------------------------------------------------------- + // JUnit lifecycle + // ------------------------------------------------------------------- + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + Files.createDirectories(Paths.get(RESULTS_DIR)); + + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uimaDriver = new DUUIUIMADriver().withDebug(false); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver, uimaDriver); + + cas = JCasFactory.createJCas(); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + void afterEach(TestInfo testInfo) throws IOException, SAXException { + composer.resetPipeline(); + + String methodName = testInfo.getTestMethod() + .map(m -> m.getName()) + .orElseGet(() -> testInfo.getDisplayName().replaceAll("[^a-zA-Z0-9_]", "_")); + + String inputText = cas.getDocumentText() != null ? 
cas.getDocumentText() : ""; + String redactedText = extractRedactedText(); + Collection anomalies = collectAnomalies(); + + // Write XMI + ByteArrayOutputStream xmiBytes = new ByteArrayOutputStream(); + XmlCasSerializer.serialize(cas.getCas(), null, xmiBytes); + String xmiString = xmiBytes.toString(StandardCharsets.UTF_8); + Files.writeString(Paths.get(RESULTS_DIR, methodName + ".xmi"), xmiString); + + // Write JSON summary for later comparison + String json = buildResultJson(methodName, inputText, redactedText, anomalies); + Files.writeString(Paths.get(RESULTS_DIR, methodName + ".json"), json); + + System.out.println("=== " + methodName + " ==="); + System.out.println(json); + + cas.reset(); + } + + // ------------------------------------------------------------------- + // Helpers + // ------------------------------------------------------------------- + + /** Populate the CAS with the given text and language code. */ + private static void createCas(String language, String text) throws UIMAException { + cas.setDocumentLanguage(language); + cas.setDocumentText(text); + } + + /** + * Collect {@link Anomaly} annotations from a single CAS view (views are not merged). + * Annotations in the "opf_redacted" SOFA view are returned when that view exists and + * holds at least one; otherwise those of the default view are returned. + */ + private static Collection collectAnomalies() { + // prefer the dedicated redaction view when available + try { + JCas redactedView = cas.getView("opf_redacted"); + Collection spans = JCasUtil.select(redactedView, Anomaly.class); + if (!spans.isEmpty()) { + return spans; + } + } catch (Exception ignored) { + // view does not exist — fall through to default view + } + return JCasUtil.select(cas, Anomaly.class); + } + + /** Returns the sofa string of the opf_redacted view, or the default view's string. */ + private static String extractRedactedText() { + try { + JCas view = cas.getView("opf_redacted"); + String s = view.getSofaDataString(); + return s != null ?
s : ""; + } catch (Exception ignored) { + String s = cas.getSofaDataString(); + return s != null ? s : ""; + } + } + + private static String buildResultJson( + String testName, + String inputText, + String redactedText, + Collection anomalies) { + + StringBuilder sb = new StringBuilder(); + sb.append("{\n"); + sb.append(" \"test\": ").append(jsonStr(testName)).append(",\n"); + sb.append(" \"input\": ").append(jsonStr(inputText)).append(",\n"); + sb.append(" \"redacted\": ").append(jsonStr(redactedText)).append(",\n"); + sb.append(" \"anomaly_count\": ").append(anomalies.size()).append(",\n"); + sb.append(" \"anomalies\": [\n"); + int idx = 0; + for (Anomaly a : anomalies) { + String spanText = ""; + try { + if (a.getBegin() >= 0 && a.getEnd() <= inputText.length()) { + spanText = inputText.substring(a.getBegin(), a.getEnd()); + } + } catch (Exception ignored) {} + sb.append(" {\n"); + sb.append(" \"begin\": ").append(a.getBegin()).append(",\n"); + sb.append(" \"end\": ").append(a.getEnd()).append(",\n"); + sb.append(" \"category\": ").append(jsonStr(a.getCategory())).append(",\n"); + sb.append(" \"description\": ").append(jsonStr(a.getDescription())).append(",\n"); + sb.append(" \"text\": ").append(jsonStr(spanText)).append("\n"); + sb.append(" }"); + if (++idx < anomalies.size()) sb.append(","); + sb.append("\n"); + } + sb.append(" ]\n"); + sb.append("}\n"); + return sb.toString(); + } + + private static String jsonStr(String s) { + if (s == null) return "null"; + return "\"" + s + .replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t") + + "\""; + } + + // ------------------------------------------------------------------- + // Tests + // ------------------------------------------------------------------- + + /** + * Smoke test: plain English sentence with a person name. + * Expects at least one Anomaly annotation to be produced. 
+ */ + @Test + @DisplayName("Simple person-name redaction") + void testSimplePersonName() throws Exception { + String text = "John Smith called the bank to report a fraud."; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + for (Anomaly a : anomalies) { + System.out.printf(" [%d-%d] category=%s description=%s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()); + } + + assertFalse(anomalies.isEmpty(), + "Expected at least one Anomaly annotation for 'John Smith'"); + } + + /** + * Email address redaction. + */ + @Test + @DisplayName("Email address redaction") + void testEmailRedaction() throws Exception { + String text = "Please contact support at alice@example.com for further assistance."; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + assertFalse(anomalies.isEmpty(), + "Expected at least one Anomaly for the email address"); + } + + /** + * Phone number redaction. + */ + @Test + @DisplayName("Phone number redaction") + void testPhoneNumberRedaction() throws Exception { + String text = "You can reach Dr. Miller at +49 69 1234 5678 during office hours."; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + assertFalse(anomalies.isEmpty(), + "Expected at least one Anomaly for the phone number or person name"); + } + + /** + * Multiple PII entities in a single document. + * Asserts only that at least two Anomaly annotations are returned; it does not check which entities they cover.
+ */ + @Test + @DisplayName("Multiple PII entities in one document") + void testMultiplePiiEntities() throws Exception { + String text = + "Patient: Jane Doe, DOB: 1985-03-22. " + + "Contact: jane.doe@hospital.org, Tel: 069-9876-5432. " + + "Address: 60325 Frankfurt am Main, Goethe-Platz 1."; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + for (Anomaly a : anomalies) { + System.out.printf(" [%d-%d] category=%s text=%s%n", + a.getBegin(), a.getEnd(), a.getCategory(), + text.substring(a.getBegin(), a.getEnd())); + } + + assertTrue(anomalies.size() >= 2, + "Expected at least 2 Anomaly annotations for a document with multiple PII entities"); + } + + /** + * Pseudo-mode: the service should return the text unchanged (stub behavior). + * Asserts no Anomaly annotations are created. + */ + @Test + @DisplayName("Pseudo mode returns unchanged text") + void testPseudoMode() throws Exception { + String text = "Alice and Bob met at the Frankfurt main station."; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "pseudo") + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count (pseudo mode): " + anomalies.size()); + assertTrue(anomalies.isEmpty(), + "Pseudo mode should produce no Anomaly annotations (stub returns input unchanged)"); + } + + /** + * Custom placeholder: verifies that at least one {@link Anomaly#getDescription()} equals the + * user-supplied placeholder string instead of the service's default placeholder.
+ */ + @Test + @DisplayName("Custom placeholder is reflected in Anomaly description") + void testCustomPlaceholder() throws Exception { + String text = "Send the report to max.mustermann@uni-frankfurt.de by Friday."; + String placeholder = "***PRIVATE***"; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("placeholder", placeholder) + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + + boolean foundCustomPlaceholder = anomalies.stream() + .anyMatch(a -> placeholder.equals(a.getDescription())); + assertTrue(foundCustomPlaceholder, + "At least one Anomaly should carry the custom placeholder '" + placeholder + "'"); + } + + /** + * Selection window: only the text between offsets [9, 30] (selBegin, selEnd) should be analysed. + * Entities outside that window must not be annotated. + */ + @Test + @DisplayName("Selection window constrains annotation range") + void testSelectionWindow() throws Exception { + // offsets: 0123456789012345678901234567890123456789 + // Call Dr. John Adams at 555-0100 today. + // window [9, 30] starts at "John Adams"; NOTE(review): "555-0100" is the span [23, 31), so selEnd = 30 drops its last digit if the end is exclusive - confirm + String text = "Call Dr. John Adams at 555-0100 today."; + int selBegin = 9; + int selEnd = 30; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("selection_begin", String.valueOf(selBegin)) + .withParameter("selection_end", String.valueOf(selEnd)) + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count (selection window): " + anomalies.size()); + for (Anomaly a : anomalies) { + assertTrue(a.getBegin() >= selBegin && a.getEnd() <= selEnd, + String.format("Anomaly [%d-%d] falls outside the selection window [%d-%d]", + a.getBegin(), a.getEnd(), selBegin, selEnd)); + } + } + + /** + * Empty document: the annotator must not throw and must return no anomalies.
+ */ + @Test + @DisplayName("Empty document produces no anomalies") + void testEmptyDocument() throws Exception { + createCas("en", ""); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + ); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + assertTrue(anomalies.isEmpty(), + "An empty document should produce zero Anomaly annotations"); + } + + /** + * German text: verifies the annotator handles non-English input without crashing. + * The model may or may not detect German PII depending on the loaded checkpoint; + * we only assert no exception is thrown. + */ + @Test + @DisplayName("German text does not cause an exception") + void testGermanText() throws Exception { + String text = "Herr Klaus Muller wohnt in der Goethestrasse 12, 60313 Frankfurt am Main."; + createCas("de", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + ); + + // Should complete without throwing + assertDoesNotThrow(() -> composer.run(cas)); + + Collection anomalies = collectAnomalies(); + System.out.println("German Anomaly count: " + anomalies.size()); + } + + /** + * XMI serialisation round-trip: runs the annotator and writes the CAS to an XMI + * file so the result can be inspected with the UIMA CAS Editor. + */ + @Test + @DisplayName("XMI output is written to src/test/results/") + void testXmiOutput() throws Exception { + String text = + "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 
5, 10115 Berlin."; + createCas("en", text); + + composer.add( + new DUUIRemoteDriver.Component(SERVICE_URL) + ); + + composer.add(new DUUIUIMADriver.Component( + createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, RESULTS_DIR, + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1" + ) + ).build()); + + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("XMI test Anomaly count: " + anomalies.size()); + assertFalse(anomalies.isEmpty(), + "Expected at least one Anomaly annotation for the PII-rich document"); + } +} diff --git a/duui-anonymize/src/test/resources/sample_pii_de.txt b/duui-anonymize/src/test/resources/sample_pii_de.txt new file mode 100644 index 00000000..391ff970 --- /dev/null +++ b/duui-anonymize/src/test/resources/sample_pii_de.txt @@ -0,0 +1,4 @@ +Frau Anna Müller (anna.mueller@beispiel.de) hat am 15. März 2024 angerufen. +Ihre Telefonnummer lautet 069-8765-4321. +Wohnadresse: Goethestraße 3, 60313 Frankfurt am Main. +Geburtsdatum: 12.07.1985. diff --git a/duui-anonymize/src/test/resources/sample_pii_en.txt b/duui-anonymize/src/test/resources/sample_pii_en.txt new file mode 100644 index 00000000..1acc74b6 --- /dev/null +++ b/duui-anonymize/src/test/resources/sample_pii_en.txt @@ -0,0 +1,4 @@ +John Smith called the helpdesk on Monday. +His email is john.smith@company.org and his phone number is +49 69 1234 5678. +He lives at Mainzer Landstraße 50, 60329 Frankfurt am Main. +Date of birth: 1978-11-04. Employee ID: EMP-00421. 
diff --git a/duui-anonymize/src/test/results/testCustomPlaceholder.json b/duui-anonymize/src/test/results/testCustomPlaceholder.json new file mode 100644 index 00000000..fc0010e6 --- /dev/null +++ b/duui-anonymize/src/test/results/testCustomPlaceholder.json @@ -0,0 +1,15 @@ +{ + "test": "testCustomPlaceholder", + "input": "Send the report to max.mustermann@uni-frankfurt.de by Friday.", + "redacted": "Send the report to ann@uni-frankfurt.de by Friday.", + "anomaly_count": 1, + "anomalies": [ + { + "begin": 19, + "end": 30, + "category": "private_person", + "description": "", + "text": "max.musterm" + } + ] +} diff --git a/duui-anonymize/src/test/results/testCustomPlaceholder.xmi b/duui-anonymize/src/test/results/testCustomPlaceholder.xmi new file mode 100644 index 00000000..5876ca42 --- /dev/null +++ b/duui-anonymize/src/test/results/testCustomPlaceholder.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testEmailRedaction.json b/duui-anonymize/src/test/results/testEmailRedaction.json new file mode 100644 index 00000000..af7f2370 --- /dev/null +++ b/duui-anonymize/src/test/results/testEmailRedaction.json @@ -0,0 +1,8 @@ +{ + "test": "testEmailRedaction", + "input": "Please contact support at alice@example.com for further assistance.", + "redacted": "Please contact support at alice@example.com for further assistance.", + "anomaly_count": 0, + "anomalies": [ + ] +} diff --git a/duui-anonymize/src/test/results/testEmailRedaction.xmi b/duui-anonymize/src/test/results/testEmailRedaction.xmi new file mode 100644 index 00000000..a53ad21e --- /dev/null +++ b/duui-anonymize/src/test/results/testEmailRedaction.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testEmptyDocument.json b/duui-anonymize/src/test/results/testEmptyDocument.json new file mode 100644 index 00000000..8400af10 --- /dev/null +++ b/duui-anonymize/src/test/results/testEmptyDocument.json @@ -0,0 +1,8 @@ +{ + "test": 
"testEmptyDocument", + "input": "", + "redacted": "", + "anomaly_count": 0, + "anomalies": [ + ] +} diff --git a/duui-anonymize/src/test/results/testEmptyDocument.xmi b/duui-anonymize/src/test/results/testEmptyDocument.xmi new file mode 100644 index 00000000..294d340e --- /dev/null +++ b/duui-anonymize/src/test/results/testEmptyDocument.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testGermanText.json b/duui-anonymize/src/test/results/testGermanText.json new file mode 100644 index 00000000..d5373b0e --- /dev/null +++ b/duui-anonymize/src/test/results/testGermanText.json @@ -0,0 +1,15 @@ +{ + "test": "testGermanText", + "input": "Herr Klaus Muller wohnt in der Goethestrasse 12, 60313 Frankfurt am Main.", + "redacted": "Herr Klaus wohnt in der Goethestrasse 12, 60313 Frankfurt am Main.", + "anomaly_count": 1, + "anomalies": [ + { + "begin": 11, + "end": 17, + "category": "private_person", + "description": "", + "text": "Muller" + } + ] +} diff --git a/duui-anonymize/src/test/results/testGermanText.xmi b/duui-anonymize/src/test/results/testGermanText.xmi new file mode 100644 index 00000000..dbaeaf6e --- /dev/null +++ b/duui-anonymize/src/test/results/testGermanText.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testMultiplePiiEntities.json b/duui-anonymize/src/test/results/testMultiplePiiEntities.json new file mode 100644 index 00000000..9810c920 --- /dev/null +++ b/duui-anonymize/src/test/results/testMultiplePiiEntities.json @@ -0,0 +1,22 @@ +{ + "test": "testMultiplePiiEntities", + "input": "Patient: Jane Doe, DOB: 1985-03-22. Contact: jane.doe@hospital.org, Tel: 069-9876-5432. Address: 60325 Frankfurt am Main, Goethe-Platz 1.", + "redacted": "Patient: , DOB: 3-22. Contact: jane.doe@hospital.org, Tel: 069-9876-5432. 
Address: 60325 Frankfurt am Main, Goethe-Platz 1.", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 9, + "end": 17, + "category": "private_person", + "description": "", + "text": "Jane Doe" + }, + { + "begin": 24, + "end": 30, + "category": "private_date", + "description": "", + "text": "1985-0" + } + ] +} diff --git a/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi b/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi new file mode 100644 index 00000000..e05d339d --- /dev/null +++ b/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPhoneNumberRedaction.json b/duui-anonymize/src/test/results/testPhoneNumberRedaction.json new file mode 100644 index 00000000..aff12c6f --- /dev/null +++ b/duui-anonymize/src/test/results/testPhoneNumberRedaction.json @@ -0,0 +1,15 @@ +{ + "test": "testPhoneNumberRedaction", + "input": "You can reach Dr. Miller at +49 69 1234 5678 during office hours.", + "redacted": "You can reach at +49 69 1234 5678 during office hours.", + "anomaly_count": 1, + "anomalies": [ + { + "begin": 14, + "end": 24, + "category": "private_person", + "description": "", + "text": "Dr. 
Miller" + } + ] +} diff --git a/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi b/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi new file mode 100644 index 00000000..4249f4ac --- /dev/null +++ b/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPseudoMode.json b/duui-anonymize/src/test/results/testPseudoMode.json new file mode 100644 index 00000000..a712cc0c --- /dev/null +++ b/duui-anonymize/src/test/results/testPseudoMode.json @@ -0,0 +1,15 @@ +{ + "test": "testPseudoMode", + "input": "Alice and Bob met at the Frankfurt main station.", + "redacted": "Alice and met at the Frankfurt main station.", + "anomaly_count": 1, + "anomalies": [ + { + "begin": 10, + "end": 13, + "category": "private_person", + "description": "", + "text": "Bob" + } + ] +} diff --git a/duui-anonymize/src/test/results/testPseudoMode.xmi b/duui-anonymize/src/test/results/testPseudoMode.xmi new file mode 100644 index 00000000..8a4b914e --- /dev/null +++ b/duui-anonymize/src/test/results/testPseudoMode.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testSelectionWindow.json b/duui-anonymize/src/test/results/testSelectionWindow.json new file mode 100644 index 00000000..e073b535 --- /dev/null +++ b/duui-anonymize/src/test/results/testSelectionWindow.json @@ -0,0 +1,22 @@ +{ + "test": "testSelectionWindow", + "input": "Call Dr. John Adams at 555-0100 today.", + "redacted": "Call Dr. 
at 0 today.", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 9, + "end": 19, + "category": "private_person", + "description": "", + "text": "John Adams" + }, + { + "begin": 23, + "end": 30, + "category": "private_phone", + "description": "", + "text": "555-010" + } + ] +} diff --git a/duui-anonymize/src/test/results/testSelectionWindow.xmi b/duui-anonymize/src/test/results/testSelectionWindow.xmi new file mode 100644 index 00000000..76f71548 --- /dev/null +++ b/duui-anonymize/src/test/results/testSelectionWindow.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testSimplePersonName.json b/duui-anonymize/src/test/results/testSimplePersonName.json new file mode 100644 index 00000000..30b0e20c --- /dev/null +++ b/duui-anonymize/src/test/results/testSimplePersonName.json @@ -0,0 +1,8 @@ +{ + "test": "testSimplePersonName", + "input": "John Smith called the bank to report a fraud.", + "redacted": "John Smith called the bank to report a fraud.", + "anomaly_count": 0, + "anomalies": [ + ] +} diff --git a/duui-anonymize/src/test/results/testSimplePersonName.xmi b/duui-anonymize/src/test/results/testSimplePersonName.xmi new file mode 100644 index 00000000..f1a952e4 --- /dev/null +++ b/duui-anonymize/src/test/results/testSimplePersonName.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testXmiOutput.json b/duui-anonymize/src/test/results/testXmiOutput.json new file mode 100644 index 00000000..c8e7cb5c --- /dev/null +++ b/duui-anonymize/src/test/results/testXmiOutput.json @@ -0,0 +1,15 @@ +{ + "test": "testXmiOutput", + "input": "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 5, 10115 Berlin.", + "redacted": "Maria Schmidt (le.de) lives at Berliner Str. 
5, 10115 Berlin.", + "anomaly_count": 1, + "anomalies": [ + { + "begin": 15, + "end": 30, + "category": "private_email", + "description": "", + "text": "m.schmidt@examp" + } + ] +} diff --git a/duui-anonymize/src/test/results/testXmiOutput.xmi b/duui-anonymize/src/test/results/testXmiOutput.xmi new file mode 100644 index 00000000..4aff8e61 --- /dev/null +++ b/duui-anonymize/src/test/results/testXmiOutput.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/tests/test_communication_contract.py b/duui-anonymize/tests/test_communication_contract.py new file mode 100644 index 00000000..d2df6286 --- /dev/null +++ b/duui-anonymize/tests/test_communication_contract.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from pathlib import Path +import unittest + + +class CommunicationContractTests(unittest.TestCase): + def test_lua_contract_mentions_text_options_selection_and_redacted_view(self) -> None: + lua_path = Path(__file__).resolve().parents[1] / "src/main/python/communication.lua" + contents = lua_path.read_text(encoding="utf-8") + + self.assertIn('text = text', contents) + self.assertIn('options = copy_options(params)', contents) + self.assertIn('selection = resolve_selection(params)', contents) + self.assertIn('createView("opf_redacted")', contents) + self.assertIn('detected_spans', contents) + + +if __name__ == "__main__": + unittest.main() diff --git a/duui-anonymize/tests/test_duui_opf_core.py b/duui-anonymize/tests/test_duui_opf_core.py new file mode 100644 index 00000000..ce71f6c5 --- /dev/null +++ b/duui-anonymize/tests/test_duui_opf_core.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +import unittest +from pathlib import Path +import sys + + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "src/main/python")) + +from duui_opf_core import ( + DEFAULT_PLACEHOLDER, + DEFAULT_MODE, + PSEUDO_MODE, + RedactionSpan, + SelectionRange, + apply_replacement_text, + apply_selection, + 
compose_selection_output, + resolve_selection, + split_options, +) + + +class DuuiOpfCoreTests(unittest.TestCase): + def test_split_options_separates_service_and_decode_values(self) -> None: + service_options, decode_options, mode, placeholder = split_options( + { + "model": "local-checkpoint", + "context_window_length": 128, + "trim_whitespace": False, + "device": "cpu", + "output_mode": "typed", + "discard_overlapping_predicted_spans": True, + "mode": PSEUDO_MODE, + "placeholder": "", + "decode_mode": "argmax", + "calibration_path": "/tmp/calibration.json", + "selection_begin": 2, + "selection_end": 8, + } + ) + + self.assertEqual(service_options["model"], "local-checkpoint") + self.assertEqual(service_options["device"], "cpu") + self.assertEqual(decode_options["decode_mode"], "argmax") + self.assertEqual(decode_options["viterbi_calibration_path"], "/tmp/calibration.json") + self.assertEqual(mode, PSEUDO_MODE) + self.assertEqual(placeholder, "") + + def test_resolve_selection_accepts_nested_or_flat_offsets(self) -> None: + nested = resolve_selection({"selection": {"begin": 4, "end": 9}}, text_length=20) + flat = resolve_selection({"selection_begin": 1, "selection_end": 3}, text_length=20) + + self.assertEqual(nested, SelectionRange(begin=4, end=9)) + self.assertEqual(flat, SelectionRange(begin=1, end=3)) + + def test_apply_replacement_text_uses_one_placeholder(self) -> None: + redacted = apply_replacement_text( + "Alice called Bob.", + [ + RedactionSpan(label="private_person", start=0, end=5, text="Alice"), + RedactionSpan(label="private_person", start=13, end=16, text="Bob"), + ], + ) + + self.assertEqual(redacted, f"{DEFAULT_PLACEHOLDER} called {DEFAULT_PLACEHOLDER}.") + + def test_apply_selection_and_compose_output(self) -> None: + selection = SelectionRange(begin=6, end=11) + selected_text, offset = apply_selection("hello world", selection) + + self.assertEqual(selected_text, "world") + self.assertEqual(offset, 6) + 
self.assertEqual(compose_selection_output("hello world", selection, "there"), "hello there") + + +if __name__ == "__main__": + unittest.main() From 2219bc4a55d795dcd64ed9954532baa8713ff857 Mon Sep 17 00:00:00 2001 From: Ali Abusaleh Date: Tue, 26 May 2026 15:16:25 +0200 Subject: [PATCH 2/3] add mode options --- duui-anonymize/.gitignore | 3 + duui-anonymize/requirements.txt | 3 +- duui-anonymize/src/main/docker/Dockerfile | 4 - .../src/main/docker/Dockerfile-cuda | 4 - .../src/main/docker/python/communication.lua | 99 ++--- .../src/main/python/communication.lua | 125 +++--- .../src/main/python/duui_anonymize.py | 278 ++++-------- .../src/test/java/AnonymizeTests.java | 416 ++++++++++++------ .../src/test/results/testComplexContext.json | 15 + .../src/test/results/testComplexContext.xmi | 1 + .../test/results/testCustomPlaceholder.json | 21 +- .../test/results/testCustomPlaceholder.xmi | 2 +- .../src/test/results/testEmailRedaction.json | 18 +- .../src/test/results/testEmailRedaction.xmi | 2 +- .../src/test/results/testEmptyDocument.json | 2 +- .../src/test/results/testEmptyDocument.xmi | 2 +- .../src/test/results/testGermanText.json | 31 +- .../src/test/results/testGermanText.xmi | 2 +- .../test/results/testMultiplePiiEntities.json | 72 ++- .../test/results/testMultiplePiiEntities.xmi | 2 +- .../results/testPhoneNumberRedaction.json | 31 +- .../test/results/testPhoneNumberRedaction.xmi | 2 +- .../src/test/results/testPlaceholderMode.json | 22 + .../src/test/results/testPlaceholderMode.xmi | 1 + .../src/test/results/testPseudoMode.json | 17 +- .../src/test/results/testPseudoMode.xmi | 2 +- .../src/test/results/testRemoveMode.json | 50 +++ .../src/test/results/testRemoveMode.xmi | 1 + .../src/test/results/testSelectionWindow.json | 30 +- .../src/test/results/testSelectionWindow.xmi | 2 +- .../src/test/results/testSimplePerson.json | 36 ++ .../src/test/results/testSimplePerson.xmi | 1 + .../test/results/testSimplePersonName.json | 18 +- 
.../src/test/results/testSimplePersonName.xmi | 2 +- .../test/results/testTypeAccountNumber.json | 22 + .../test/results/testTypeAccountNumber.xmi | 1 + .../src/test/results/testTypeAddress.json | 22 + .../src/test/results/testTypeAddress.xmi | 1 + .../src/test/results/testTypeDate.json | 36 ++ .../src/test/results/testTypeDate.xmi | 1 + .../src/test/results/testTypeEmail.json | 22 + .../src/test/results/testTypeEmail.xmi | 1 + .../src/test/results/testTypePerson.json | 22 + .../src/test/results/testTypePerson.xmi | 1 + .../src/test/results/testTypePhone.json | 36 ++ .../src/test/results/testTypePhone.xmi | 1 + .../src/test/results/testTypeSecret.json | 36 ++ .../src/test/results/testTypeSecret.xmi | 1 + .../src/test/results/testTypeUrl.json | 22 + .../src/test/results/testTypeUrl.xmi | 1 + .../src/test/results/testXmiOutput.json | 45 +- .../src/test/results/testXmiOutput.xmi | 2 +- 52 files changed, 1077 insertions(+), 513 deletions(-) create mode 100644 duui-anonymize/src/test/results/testComplexContext.json create mode 100644 duui-anonymize/src/test/results/testComplexContext.xmi create mode 100644 duui-anonymize/src/test/results/testPlaceholderMode.json create mode 100644 duui-anonymize/src/test/results/testPlaceholderMode.xmi create mode 100644 duui-anonymize/src/test/results/testRemoveMode.json create mode 100644 duui-anonymize/src/test/results/testRemoveMode.xmi create mode 100644 duui-anonymize/src/test/results/testSimplePerson.json create mode 100644 duui-anonymize/src/test/results/testSimplePerson.xmi create mode 100644 duui-anonymize/src/test/results/testTypeAccountNumber.json create mode 100644 duui-anonymize/src/test/results/testTypeAccountNumber.xmi create mode 100644 duui-anonymize/src/test/results/testTypeAddress.json create mode 100644 duui-anonymize/src/test/results/testTypeAddress.xmi create mode 100644 duui-anonymize/src/test/results/testTypeDate.json create mode 100644 duui-anonymize/src/test/results/testTypeDate.xmi create mode 100644 
duui-anonymize/src/test/results/testTypeEmail.json create mode 100644 duui-anonymize/src/test/results/testTypeEmail.xmi create mode 100644 duui-anonymize/src/test/results/testTypePerson.json create mode 100644 duui-anonymize/src/test/results/testTypePerson.xmi create mode 100644 duui-anonymize/src/test/results/testTypePhone.json create mode 100644 duui-anonymize/src/test/results/testTypePhone.xmi create mode 100644 duui-anonymize/src/test/results/testTypeSecret.json create mode 100644 duui-anonymize/src/test/results/testTypeSecret.xmi create mode 100644 duui-anonymize/src/test/results/testTypeUrl.json create mode 100644 duui-anonymize/src/test/results/testTypeUrl.xmi diff --git a/duui-anonymize/.gitignore b/duui-anonymize/.gitignore index 6649f848..fc82091d 100644 --- a/duui-anonymize/.gitignore +++ b/duui-anonymize/.gitignore @@ -7,3 +7,6 @@ __pycache__/** *.pyc +target/** +dist/** +build/** \ No newline at end of file diff --git a/duui-anonymize/requirements.txt b/duui-anonymize/requirements.txt index 002c4342..49fb44b8 100644 --- a/duui-anonymize/requirements.txt +++ b/duui-anonymize/requirements.txt @@ -9,5 +9,6 @@ uvicorn torch torchvision torchaudio +transformers +accelerate setuptools -opf @ git+https://github.com/openai/privacy-filter.git diff --git a/duui-anonymize/src/main/docker/Dockerfile b/duui-anonymize/src/main/docker/Dockerfile index 5818820a..660e32ce 100644 --- a/duui-anonymize/src/main/docker/Dockerfile +++ b/duui-anonymize/src/main/docker/Dockerfile @@ -6,13 +6,9 @@ EXPOSE 9714 COPY ./src/main/python/communication.lua ./communication.lua COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py -COPY ./src/main/python/duui_opf_core.py ./duui_opf_core.py COPY ./src/main/python/typesystem.xml ./typesystem.xml COPY ./requirements.txt ./requirements.txt -RUN apt-get update -RUN apt-get install ffmpeg -y - RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cpu RUN pip install -r 
requirements.txt diff --git a/duui-anonymize/src/main/docker/Dockerfile-cuda b/duui-anonymize/src/main/docker/Dockerfile-cuda index 4af68c5f..fa72894c 100644 --- a/duui-anonymize/src/main/docker/Dockerfile-cuda +++ b/duui-anonymize/src/main/docker/Dockerfile-cuda @@ -16,13 +16,9 @@ EXPOSE 9714 COPY ./src/main/python/communication.lua ./communication.lua COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py -COPY ./src/main/python/duui_opf_core.py ./duui_opf_core.py COPY ./src/main/python/typesystem.xml ./typesystem.xml COPY ./requirements.txt ./requirements.txt -RUN apt-get update -RUN apt-get install ffmpeg -y - RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118 RUN pip install -r requirements.txt diff --git a/duui-anonymize/src/main/docker/python/communication.lua b/duui-anonymize/src/main/docker/python/communication.lua index 0aa23413..9c1876a2 100644 --- a/duui-anonymize/src/main/docker/python/communication.lua +++ b/duui-anonymize/src/main/docker/python/communication.lua @@ -2,15 +2,23 @@ StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") util = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +-- Known option keys forwarded to the Python service. +-- pairs() does not iterate Java map objects in LuaJ, so we read each key explicitly. 
+local OPTION_KEYS = { + "mode", "model", "device", + "context_window_length", "trim_whitespace", + "output_mode", "discard_overlapping_predicted_spans", +} + local function copy_options(params) + if params == nil then return {} end local options = {} - - for key, value in pairs(params or {}) do - if key ~= "selection" and key ~= "selection_begin" and key ~= "selection_end" and key ~= "selection_start" and key ~= "selection_stop" then + for _, key in ipairs(OPTION_KEYS) do + local value = params[key] + if value ~= nil then options[key] = value end end - return options end @@ -21,85 +29,64 @@ local function resolve_selection(params) local selection = params["selection"] if type(selection) == "table" then - local begin = selection["begin"] or selection["start"] - local ending = selection["end"] or selection["stop"] + local begin = selection["begin"] or selection["start"] + local ending = selection["end"] or selection["stop"] if begin ~= nil and ending ~= nil then - return { - begin = begin, - ["end"] = ending, - } + return { begin = begin, ["end"] = ending } end end - local begin = params["selection_begin"] or params["selection_start"] - local ending = params["selection_end"] or params["selection_stop"] + local begin = params["selection_begin"] or params["selection_start"] + local ending = params["selection_end"] or params["selection_stop"] if begin ~= nil and ending ~= nil then - return { - begin = begin, - ["end"] = ending, - } + return { begin = begin, ["end"] = ending } end return nil end --- This "serialize" function is called to transform the CAS object into an stream that is sent to the annotator --- Inputs: --- - inputCas: The actual CAS object to serialize --- - outputStream: Stream that is sent to the annotator, can be e.g. a string, JSON payload, ... +-- Serialize the CAS into a JSON request sent to the Python service. 
function serialize(inputCas, outputStream, params) local text = inputCas:getSofaDataString() - if text == nil then - text = "" - end + if text == nil then text = "" end - -- Encode data as JSON object and write to stream outputStream:write(json.encode({ - text = text, - options = copy_options(params), - -- selection = resolve_selection(params) + text = text, + options = copy_options(params), + selection = resolve_selection(params), })) end --- This "deserialize" function is called on receiving the results from the annotator that have to be transformed into a CAS object --- Inputs: --- - inputCas: The actual CAS object to deserialize into --- - inputStream: Stream that is received from to the annotator, can be e.g. a string, JSON payload, ... +-- Deserialize the JSON response from the Python service back into the CAS. +-- +-- Anomaly annotations are added to the *original* CAS view so their +-- character offsets remain valid against the original document text. +-- The redacted text is stored as the sofa of a separate "opf_redacted" view. function deserialize(inputCas, inputStream) - -- Get string from stream, assume UTF-8 encoding local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) - - -- Parse JSON data from string into object local results = json.decode(inputString) - local targetCas = inputCas - if inputCas.createView ~= nil then - local ok, view = pcall(function() - return inputCas:createView("opf_redacted") - end) + -- Store redacted text in its own view (offsets here belong to redacted text, + -- so we do NOT add Anomaly annotations to this view). 
+ if results["redacted_text"] ~= nil then + local ok, view = pcall(function() return inputCas:createView("opf_redacted") end) if ok and view ~= nil then - targetCas = view + view:setSofaDataString(results["redacted_text"], "text/plain") end end - if results["redacted_text"] ~= nil then - targetCas:setSofaDataString(results["redacted_text"], "text/plain") - elseif results["text"] ~= nil then - targetCas:setSofaDataString(results["text"], "text/plain") - end - + -- Add Anomaly annotations to the original view; offsets reference original text. if results["detected_spans"] ~= nil then - for i, sent in ipairs(results["detected_spans"]) do - local anomaly = luajava.newInstance("de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", targetCas) - anomaly:setBegin(sent["start"]) - anomaly:setEnd(sent["end"]) - anomaly:setCategory(sent["label"]) - anomaly:setDescription(sent["placeholder"] or sent["text"] or sent["label"]) + for i, span in ipairs(results["detected_spans"]) do + local anomaly = luajava.newInstance( + "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", inputCas) + anomaly:setBegin(span["start"]) + anomaly:setEnd(span["end"]) + anomaly:setCategory(span["label"]) + -- description = replacement used (e.g. "[private_person]") or original word + anomaly:setDescription(span["placeholder"] ~= "" and span["placeholder"] + or span["text"] or span["label"]) anomaly:addToIndexes() end end - - if results["selection"] ~= nil then - -- Selection metadata is available in the JSON response for downstream consumers. 
- end end diff --git a/duui-anonymize/src/main/python/communication.lua b/duui-anonymize/src/main/python/communication.lua index 288ae5f3..12156714 100644 --- a/duui-anonymize/src/main/python/communication.lua +++ b/duui-anonymize/src/main/python/communication.lua @@ -2,104 +2,107 @@ StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") util = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +-- Read a parameter from params regardless of whether it is a Lua table or a +-- LuaJ-wrapped Java Map. Direct table indexing works for Lua tables; Java +-- Map objects (HashMap, etc.) require params:get(key) instead. +local function param_get(params, key) + if params == nil then return nil end + local v = params[key] + if v ~= nil then return tostring(v) end + local ok, r = pcall(function() return params:get(key) end) + if ok and r ~= nil then return tostring(r) end + return nil +end + +-- Known option keys forwarded to the Python service. +local OPTION_KEYS = { + "mode", "model", "device", + "context_window_length", "trim_whitespace", + "output_mode", "discard_overlapping_predicted_spans", +} + local function copy_options(params) local options = {} - - for key, value in pairs(params or {}) do - if key ~= "selection" and key ~= "selection_begin" and key ~= "selection_end" and key ~= "selection_start" and key ~= "selection_stop" then + for _, key in ipairs(OPTION_KEYS) do + local value = param_get(params, key) + if value ~= nil then options[key] = value end end - return options end local function resolve_selection(params) - if params == nil then - return nil - end + if params == nil then return nil end + -- selection passed as a nested table local selection = params["selection"] + if selection == nil then + local ok, r = pcall(function() return params:get("selection") end) + if ok then selection = r end + end if type(selection) == "table" then - local begin = selection["begin"] or selection["start"] - local ending = selection["end"] or 
selection["stop"] - if begin ~= nil and ending ~= nil then - return { - begin = begin, - ["end"] = ending, - } + local b = selection["begin"] or selection["start"] + local e = selection["end"] or selection["stop"] + if b ~= nil and e ~= nil then + return { begin = b, ["end"] = e } end end - local begin = params["selection_begin"] or params["selection_start"] - local ending = params["selection_end"] or params["selection_stop"] - if begin ~= nil and ending ~= nil then - return { - begin = begin, - ["end"] = ending, - } + -- selection passed as flat begin/end keys + local b = param_get(params, "selection_begin") or param_get(params, "selection_start") + local e = param_get(params, "selection_end") or param_get(params, "selection_stop") + if b ~= nil and e ~= nil then + return { begin = tonumber(b), ["end"] = tonumber(e) } end return nil end --- This "serialize" function is called to transform the CAS object into an stream that is sent to the annotator --- Inputs: --- - inputCas: The actual CAS object to serialize --- - outputStream: Stream that is sent to the annotator, can be e.g. a string, JSON payload, ... +-- Serialize the CAS into a JSON request sent to the Python service. function serialize(inputCas, outputStream, params) local text = inputCas:getSofaDataString() - if text == nil then - text = "" - end + if text == nil then text = "" end + + local options = copy_options(params) - -- Encode data as JSON object and write to stream outputStream:write(json.encode({ - text = text, - options = copy_options(params), - selection = resolve_selection(params) + text = text, + options = options, + selection = resolve_selection(params), })) end --- This "deserialize" function is called on receiving the results from the annotator that have to be transformed into a CAS object --- Inputs: --- - inputCas: The actual CAS object to deserialize into --- - inputStream: Stream that is received from to the annotator, can be e.g. a string, JSON payload, ... 
+-- Deserialize the JSON response from the Python service back into the CAS. +-- +-- Anomaly annotations are added to the *original* CAS view so their +-- character offsets remain valid against the original document text. +-- The redacted text is stored as the sofa of a separate "opf_redacted" view. function deserialize(inputCas, inputStream) - -- Get string from stream, assume UTF-8 encoding local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) - - -- Parse JSON data from string into object local results = json.decode(inputString) - local targetCas = inputCas - if inputCas.createView ~= nil then - local ok, view = pcall(function() - return inputCas:createView("opf_redacted") - end) + -- Store redacted text in its own view. + if results["redacted_text"] ~= nil then + local ok, view = pcall(function() return inputCas:createView("opf_redacted") end) if ok and view ~= nil then - targetCas = view + view:setSofaDataString(results["redacted_text"], "text/plain") end end - if results["redacted_text"] ~= nil then - targetCas:setSofaDataString(results["redacted_text"], "text/plain") - elseif results["text"] ~= nil then - targetCas:setSofaDataString(results["text"], "text/plain") - end - + -- Add Anomaly annotations to the original view; offsets reference original text. 
if results["detected_spans"] ~= nil then - for i, sent in ipairs(results["detected_spans"]) do - local anomaly = luajava.newInstance("de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", targetCas) - anomaly:setBegin(sent["start"]) - anomaly:setEnd(sent["end"]) - anomaly:setCategory(sent["label"]) - anomaly:setDescription(sent["placeholder"] or sent["text"] or sent["label"]) + for i, span in ipairs(results["detected_spans"]) do + local anomaly = luajava.newInstance( + "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", inputCas) + anomaly:setBegin(span["start"]) + anomaly:setEnd(span["end"]) + anomaly:setCategory(span["label"]) + -- description = replacement used (e.g. "[private_person]") or original word + anomaly:setDescription( + (span["placeholder"] ~= nil and span["placeholder"] ~= "") and span["placeholder"] + or span["text"] or span["label"]) anomaly:addToIndexes() end end - - if results["selection"] ~= nil then - -- Selection metadata is available in the JSON response for downstream consumers. 
- end end diff --git a/duui-anonymize/src/main/python/duui_anonymize.py b/duui-anonymize/src/main/python/duui_anonymize.py index 76d6df52..effe5371 100644 --- a/duui-anonymize/src/main/python/duui_anonymize.py +++ b/duui-anonymize/src/main/python/duui_anonymize.py @@ -1,10 +1,9 @@ from __future__ import annotations import logging -from functools import lru_cache import json -from enum import Enum -from typing import Any, List, Optional, Union +from functools import lru_cache +from typing import Any, List, Optional import torch import uvicorn @@ -15,12 +14,17 @@ from fastapi.responses import JSONResponse, PlainTextResponse from pydantic import BaseModel, Field, field_validator from pydantic_settings import BaseSettings - -from opf import DecodeOptions, OPF +from transformers import pipeline as hf_pipeline logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) +DEFAULT_MODEL = "openai/privacy-filter" + +MODE_REMOVE = "remove" +MODE_PLACEHOLDER = "placeholder" # default: replace with [category] +MODE_PSEUDO = "pseudo" # TODO: not yet supported + # --------------------------------------------------------------------------- # Pydantic models @@ -31,29 +35,18 @@ class DetectedSpan(BaseModel): start: int end: int text: str - placeholder: str - - -class SelectionRange(BaseModel): - begin: int - end: int + placeholder: str # replacement text used; empty string for remove mode class DUUIRequest(BaseModel): text: str options: dict[str, Any] = Field(default_factory=dict) - selection: Optional[SelectionRange] = None + selection: Optional[dict] = None @field_validator("options", mode="before") @classmethod def coerce_options(cls, v: Any) -> dict: - """ - Lua JSON libraries encode empty tables as [] instead of {}. - Accept None, empty list, or any list by falling back to an empty dict. 
- """ - if v is None: - return {} - if isinstance(v, list): + if v is None or isinstance(v, list): return {} if not isinstance(v, dict): return {} @@ -62,20 +55,14 @@ def coerce_options(cls, v: Any) -> dict: @field_validator("text", mode="before") @classmethod def coerce_text(cls, v: Any) -> str: - """Tolerate Java String objects forwarded via LuaJ.""" - if v is None: - return "" - return str(v) + return "" if v is None else str(v) class DUUIResponse(BaseModel): - schema_version: int - summary: dict[str, Any] text: str detected_spans: List[DetectedSpan] redacted_text: str warning: Optional[str] = None - selection: Optional[SelectionRange] = None class DUUIDocumentation(BaseModel): @@ -91,18 +78,10 @@ class DUUIDocumentation(BaseModel): class Settings(BaseSettings): duui_tool_name: str = "DUUI Anonymize" duui_tool_version: str = "1.0" - default_model: Optional[str] = None - - -class RedactionMode(str, Enum): - REPLACEMENT = "replacement" - PSEUDO = "pseudo" + default_model: str = DEFAULT_MODEL settings = Settings() -DEFAULT_PLACEHOLDER = "" -DEFAULT_MODE = RedactionMode.REPLACEMENT.value -PSEUDO_MODE = RedactionMode.PSEUDO.value # --------------------------------------------------------------------------- # FastAPI app @@ -112,7 +91,7 @@ class RedactionMode(str, Enum): docs_url="/api", redoc_url=None, title="DUUI Anonymize", - description="Text anonymization / PII redaction for TTLab DUUI using the OpenAI Privacy Filter", + description="PII detection and redaction for TTLab DUUI using openai/privacy-filter", version="1.0", terms_of_service="https://www.texttechnologylab.org/legal_notice/", contact={ @@ -130,8 +109,8 @@ class RedactionMode(str, Enum): @app.exception_handler(RequestValidationError) async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse: body = await request.body() - logger.error("422 Unprocessable Entity — validation errors: %s", exc.errors()) - logger.error("Raw request body: %s", body.decode("utf-8", 
errors="replace")) + logger.error("422 validation errors: %s", exc.errors()) + logger.error("Raw body: %s", body.decode("utf-8", errors="replace")) return JSONResponse( status_code=422, content=jsonable_encoder({"detail": exc.errors(), "body": body.decode("utf-8", errors="replace")}), @@ -139,7 +118,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE # --------------------------------------------------------------------------- -# Static assets loaded at startup +# Static assets # --------------------------------------------------------------------------- with open("communication.lua", "rb") as _f: @@ -163,8 +142,7 @@ def get_input_output() -> JSONResponse: @app.get("/v1/typesystem") def get_typesystem() -> Response: - xml_content = _typesystem.to_xml().encode("utf-8") - return Response(content=xml_content, media_type="application/xml") + return Response(content=_typesystem.to_xml().encode("utf-8"), media_type="application/xml") @app.get("/v1/communication_layer", response_class=PlainTextResponse) @@ -183,191 +161,119 @@ def get_documentation() -> DUUIDocumentation: @app.post("/v1/process") async def post_process(raw_request: Request) -> DUUIResponse: - # DUUI does not set Content-Type: application/json, so FastAPI will not - # deserialize the body automatically. We parse it manually here. 
body = await raw_request.body() try: data = json.loads(body) except json.JSONDecodeError as exc: raise RequestValidationError([{"type": "json_invalid", "loc": ("body",), "msg": str(exc), "input": body}]) request = DUUIRequest.model_validate(data) - options = dict(request.options) - selection = _resolve_selection(request.selection, options, text_length=len(request.text)) - return _redact_text(request.text, selection, options) + return _process(request) # --------------------------------------------------------------------------- # Business logic # --------------------------------------------------------------------------- -def _resolve_selection( - request_selection: Optional[SelectionRange], - options: dict[str, Any], - *, - text_length: int, -) -> Optional[SelectionRange]: - if request_selection is not None: - begin = int(request_selection.begin) - end_val = int(request_selection.end) - else: - raw = options.pop("selection", None) - if isinstance(raw, dict): - begin = raw.get("begin") - end_val = raw.get("end") - else: - begin = options.pop("selection_begin", options.pop("selection_start", None)) - end_val = options.pop("selection_end", options.pop("selection_stop", None)) - - if begin is None or end_val is None: - return None - - begin = int(begin) - end_val = int(end_val) - if begin < 0 or end_val < begin or end_val > text_length: - raise ValueError("selection must satisfy 0 <= begin <= end <= text length") - return SelectionRange(begin=begin, end=end_val) - - -def _json_cache_key(payload: dict[str, Any]) -> str: - return json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str) +@lru_cache(maxsize=4) +def _load_pipeline(model: str, device: str): + dev = 0 if device == "cuda" else -1 + logger.info("Loading pipeline: model=%s device=%s", model, device) + return hf_pipeline( + task="token-classification", + model=model, + aggregation_strategy="simple", + device=dev, + ) -def _split_options(options: dict[str, Any]) -> tuple[dict[str, Any], dict[str, 
Any]]: - _skip = {"decode", "selection", "selection_begin", "selection_end", "selection_start", "selection_stop"} - _redactor_keys = {"model", "context_window_length", "trim_whitespace", "device", - "output_mode", "discard_overlapping_predicted_spans", "mode", "placeholder"} - _decode_keys = {"decode_mode", "viterbi_calibration_path", "calibration_path"} +def _resolve_selection(options: dict[str, Any], text_length: int) -> Optional[tuple[int, int]]: + sel = options.get("selection") + if isinstance(sel, dict): + begin = sel.get("begin", sel.get("start")) + end = sel.get("end", sel.get("stop")) + else: + begin = options.get("selection_begin", options.get("selection_start")) + end = options.get("selection_end", options.get("selection_stop")) - redactor_opts: dict[str, Any] = {} - decode_opts: dict[str, Any] = {} - for key, value in options.items(): - if key in _skip: - continue - if key in _redactor_keys: - redactor_opts[key] = value - elif key in _decode_keys: - k = "viterbi_calibration_path" if key == "calibration_path" else key - decode_opts[k] = value - return redactor_opts, decode_opts - - -@lru_cache(maxsize=8) -def _build_redactor(options_json: str) -> OPF: - opts = json.loads(options_json) - device = opts.get("device") or ("cuda" if torch.cuda.is_available() else "cpu") - return OPF( - model=opts.get("model", settings.default_model), - context_window_length=opts.get("context_window_length"), - trim_whitespace=bool(opts.get("trim_whitespace", True)), - device=device, - output_mode=opts.get("output_mode", "typed"), - discard_overlapping_predicted_spans=bool(opts.get("discard_overlapping_predicted_spans", False)), - output_text_only=False, - ) + if begin is None or end is None: + return None + begin, end = int(begin), int(end) + if begin < 0 or end < begin or end > text_length: + raise ValueError(f"selection must satisfy 0 <= begin <= end <= {text_length}") + return begin, end -def _compose_redacted(text: str, spans: list[DetectedSpan], *, placeholder: str) -> 
str: +def _build_redacted(text: str, spans: list[DetectedSpan], mode: str) -> str: + """Apply mode transformation to text using already-computed spans.""" if not spans: return text parts: list[str] = [] cursor = 0 - for span in sorted(spans, key=lambda s: (s.start, s.end)): + for span in sorted(spans, key=lambda s: s.start): if span.start < cursor: continue parts.append(text[cursor:span.start]) - parts.append(placeholder) - cursor = max(cursor, span.end) + if mode == MODE_PLACEHOLDER: + parts.append(span.placeholder) # e.g. [private_person] + # MODE_REMOVE: append nothing - the PII is deleted + cursor = span.end parts.append(text[cursor:]) return "".join(parts) -def _parse_spans(payload: Any, *, offset: int = 0) -> list[DetectedSpan]: - spans: list[DetectedSpan] = [] - for item in payload: - if isinstance(item, dict): - label = item.get("label") - start = item.get("start") - end_val = item.get("end") - text = item.get("text") - placeholder = item.get("placeholder") - else: - label = getattr(item, "label", None) - start = getattr(item, "start", None) - end_val = getattr(item, "end", None) - text = getattr(item, "text", None) - placeholder = getattr(item, "placeholder", None) - spans.append(DetectedSpan( - label=str(label), - start=int(start) + offset, - end=int(end_val) + offset, - text=str(text), - placeholder=str(placeholder), - )) - return spans - - -def _redact_text( - text: str, - selection: Optional[SelectionRange], - options: dict[str, Any], -) -> DUUIResponse: - redactor_opts, decode_opts = _split_options(options) - mode = str(redactor_opts.get("mode", DEFAULT_MODE)) - placeholder = str(redactor_opts.get("placeholder", DEFAULT_PLACEHOLDER)) - - if mode == PSEUDO_MODE: +def _process(request: DUUIRequest) -> DUUIResponse: + options = request.options + model = str(options.get("model", settings.default_model)) + device = str(options.get("device") or ("cuda" if torch.cuda.is_available() else "cpu")) + mode = str(options.get("mode", MODE_PLACEHOLDER)) + + 
print(f"Processing request: model={model} device={device} mode={mode} text_length={len(request.text)}") + + # pseudo mode - not yet supported + if mode == MODE_PSEUDO: return DUUIResponse( - schema_version=1, - summary={"mode": PSEUDO_MODE, "span_count": 0, "by_label": {}, "decoded_mismatch": False}, - text=text, + text=request.text, detected_spans=[], - redacted_text=text, - warning="pseudo mode returns the input unchanged", - selection=selection, + redacted_text=request.text, + warning="pseudo mode is not yet supported - input returned unchanged", ) - redactor = _build_redactor(_json_cache_key(redactor_opts)) - decode = DecodeOptions(**decode_opts) if decode_opts else None + if not request.text: + return DUUIResponse(text="", detected_spans=[], redacted_text="") - selected_text = text - offset = 0 - if selection is not None: - offset = selection.begin - selected_text = text[selection.begin:selection.end] + sel = _resolve_selection(options, text_length=len(request.text)) + selected_text = request.text[sel[0]:sel[1]] if sel else request.text + offset = sel[0] if sel else 0 - result = redactor.redact(selected_text, decode=decode) + pipe = _load_pipeline(model, device) + raw = pipe(selected_text) - if isinstance(result, str): - redacted_text = result if selection is None else ( - text[:selection.begin] + result + text[selection.end:] - ) - return DUUIResponse( - schema_version=1, - summary={"mode": mode, "span_count": 0, "by_label": {}, "decoded_mismatch": False}, - text=text, - detected_spans=[], - redacted_text=redacted_text, - selection=selection, + spans = [ + DetectedSpan( + label=item["entity_group"], + start=int(item["start"]) + offset, + end=int(item["end"]) + offset, + text=str(item["word"]).strip(), + placeholder=f"[{item['entity_group']}]" if mode == MODE_PLACEHOLDER else "", ) - - detected_spans = _parse_spans(result.detected_spans, offset=offset) - local_spans = [ - DetectedSpan(label=s.label, start=s.start - offset, end=s.end - offset, - text=s.text, 
placeholder=placeholder) - for s in detected_spans + for item in raw ] - redacted_text = _compose_redacted(selected_text, local_spans, placeholder=placeholder) - if selection is not None: - redacted_text = text[:selection.begin] + redacted_text + text[selection.end:] + + redacted_text = _build_redacted(request.text, spans, mode) + if sel is not None: + # only the selected window was processed; rebuild full text around it + local_spans = [ + DetectedSpan(label=s.label, start=s.start - offset, end=s.end - offset, + text=s.text, placeholder=s.placeholder) + for s in spans + ] + redacted_window = _build_redacted(selected_text, local_spans, mode) + redacted_text = request.text[:sel[0]] + redacted_window + request.text[sel[1]:] return DUUIResponse( - schema_version=int(result.schema_version), - summary={**dict(result.summary), "mode": mode}, - text=text, - detected_spans=detected_spans, + text=request.text, + detected_spans=spans, redacted_text=redacted_text, - warning=result.warning, - selection=selection, ) diff --git a/duui-anonymize/src/test/java/AnonymizeTests.java b/duui-anonymize/src/test/java/AnonymizeTests.java index b1c9d316..55ff1ec5 100644 --- a/duui-anonymize/src/test/java/AnonymizeTests.java +++ b/duui-anonymize/src/test/java/AnonymizeTests.java @@ -109,21 +109,12 @@ private static void createCas(String language, String text) throws UIMAException } /** - * Collect all {@link Anomaly} annotations across every CAS view. - * The anonymizer writes results to an "opf_redacted" SOFA view, but also - * to the default view depending on the service configuration. + * Collect all {@link Anomaly} annotations from the default CAS view. + * Anomalies are always indexed against the original document text so that + * their character offsets are valid. The "opf_redacted" view only carries + * the redacted sofa string and no annotations. 
*/ private static Collection collectAnomalies() { - // prefer the dedicated redaction view when available - try { - JCas redactedView = cas.getView("opf_redacted"); - Collection spans = JCasUtil.select(redactedView, Anomaly.class); - if (!spans.isEmpty()) { - return spans; - } - } catch (Exception ignored) { - // view does not exist — fall through to default view - } return JCasUtil.select(cas, Anomaly.class); } @@ -149,7 +140,7 @@ private static String buildResultJson( sb.append("{\n"); sb.append(" \"test\": ").append(jsonStr(testName)).append(",\n"); sb.append(" \"input\": ").append(jsonStr(inputText)).append(",\n"); - sb.append(" \"redacted\": ").append(jsonStr(redactedText)).append(",\n"); + sb.append(" \"output\": ").append(jsonStr(redactedText)).append(",\n"); sb.append(" \"anomaly_count\": ").append(anomalies.size()).append(",\n"); sb.append(" \"anomalies\": [\n"); int idx = 0; @@ -187,84 +178,258 @@ private static String jsonStr(String s) { } // ------------------------------------------------------------------- - // Tests + // Mode tests // ------------------------------------------------------------------- - /** - * Smoke test: plain English sentence with a person name. - * Expects at least one Anomaly annotation to be produced. - */ + /** Placeholder mode (default): PII replaced with [category] tag in redacted_text and Anomaly description. 
*/ @Test - @DisplayName("Simple person-name redaction") - void testSimplePersonName() throws Exception { - String text = "John Smith called the bank to report a fraud."; + @DisplayName("Placeholder mode: PII replaced with [category] tag") + void testPlaceholderMode() throws Exception { + String text = "Send the report to max.mustermann@uni-frankfurt.de by Friday."; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - ); + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] category=%s description=%s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected at least one Anomaly for the email address"); + assertTrue(anomalies.stream().anyMatch(a -> { + String d = a.getDescription(); + return d != null && d.startsWith("[") && d.endsWith("]"); + }), "Anomaly description should be a bracketed [category] tag in placeholder mode"); + + String redacted = extractRedactedText(); + assertFalse(redacted.contains("max.mustermann@uni-frankfurt.de"), + "Redacted text should not contain the original email"); + assertTrue(redacted.contains("[private_email]") || redacted.contains("[private_person]"), + "Redacted text should contain a [category] replacement tag"); + } + + /** Remove mode: PII spans are deleted from redacted_text; Anomaly description is the original word. 
*/ + @Test + @DisplayName("Remove mode: PII deleted from redacted text") + void testRemoveMode() throws Exception { + String text = "Call John Smith at john.smith@company.com or +1-800-555-0199 for help."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "remove")); composer.run(cas); Collection anomalies = collectAnomalies(); System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] category=%s text=%s%n", + a.getBegin(), a.getEnd(), a.getCategory(), text.substring(a.getBegin(), a.getEnd()))); + + assertFalse(anomalies.isEmpty(), "Expected anomalies in remove mode"); + + String redacted = extractRedactedText(); + System.out.printf(" original (%d): %s%n", text.length(), text); + System.out.printf(" redacted (%d): %s%n", redacted.length(), redacted); + assertTrue(redacted.length() < text.length(), + "Redacted text should be shorter after PII removal"); + // original PII tokens must be absent from the redacted string for (Anomaly a : anomalies) { - System.out.printf(" [%d-%d] category=%s description=%s%n", - a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription()); + String pii = text.substring(a.getBegin(), a.getEnd()); + assertFalse(redacted.contains(pii), + "Removed PII token '" + pii + "' should not appear in redacted text"); } + } - assertFalse(anomalies.isEmpty(), - "Expected at least one Anomaly annotation for 'John Smith'"); + /** Pseudo mode: not yet supported - service returns input unchanged with no annotations. 
*/ + @Test + @DisplayName("Pseudo mode: not yet supported, returns input unchanged") + void testPseudoMode() throws Exception { + String text = "Alice and Bob met at the Frankfurt main station."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "pseudo")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + System.out.println("Anomaly count (pseudo mode): " + anomalies.size()); + assertTrue(anomalies.isEmpty(), + "Pseudo mode (unsupported stub) should produce no Anomaly annotations"); } - /** - * Email address redaction. - */ + // ------------------------------------------------------------------- + // PII type tests (mode=placeholder so description = [category]) + // ------------------------------------------------------------------- + + /** private_person: full name in a simple sentence. */ @Test - @DisplayName("Email address redaction") - void testEmailRedaction() throws Exception { - String text = "Please contact support at alice@example.com for further assistance."; + @DisplayName("Type: private_person") + void testTypePerson() throws Exception { + String text = "John Smith called the bank to report a fraud."; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - ); + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected at least one annotation"); + assertTrue(anomalies.stream().anyMatch(a -> "private_person".equals(a.getCategory())), + "Expected category 'private_person' for 'John Smith'"); + } + + /** private_email: plain email address. 
*/ + @Test + @DisplayName("Type: private_email") + void testTypeEmail() throws Exception { + String text = "Please contact alice@example.com for further assistance."; + createCas("en", text); + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); composer.run(cas); Collection anomalies = collectAnomalies(); - System.out.println("Anomaly count: " + anomalies.size()); - assertFalse(anomalies.isEmpty(), - "Expected at least one Anomaly for the email address"); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertTrue(anomalies.stream().anyMatch(a -> "private_email".equals(a.getCategory())), + "Expected category 'private_email' for 'alice@example.com'"); } - /** - * Phone number redaction. - */ + /** private_phone: international phone number. */ @Test - @DisplayName("Phone number redaction") - void testPhoneNumberRedaction() throws Exception { + @DisplayName("Type: private_phone") + void testTypePhone() throws Exception { String text = "You can reach Dr. Miller at +49 69 1234 5678 during office hours."; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - ); + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + assertFalse(anomalies.isEmpty(), "Expected phone or person annotation"); + long phoneCount = anomalies.stream().filter(a -> "private_phone".equals(a.getCategory())).count(); + System.out.println("private_phone spans: " + phoneCount); + assertTrue(phoneCount > 0, "Expected category 'private_phone' for '+49 69 1234 5678'"); + } + + /** private_address: street address with postcode. 
*/ + @Test + @DisplayName("Type: private_address") + void testTypeAddress() throws Exception { + String text = "She lives at 742 Evergreen Terrace, Springfield, IL 62704."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); composer.run(cas); Collection anomalies = collectAnomalies(); - System.out.println("Anomaly count: " + anomalies.size()); - assertFalse(anomalies.isEmpty(), - "Expected at least one Anomaly for the phone number or person name"); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long addrCount = anomalies.stream().filter(a -> "private_address".equals(a.getCategory())).count(); + System.out.println("private_address spans: " + addrCount); + assertTrue(addrCount > 0, "Expected category 'private_address' for the street address"); } - /** - * Multiple PII entities in a single document. - * Asserts that distinct spans covering the name, email, and phone are returned. - */ + /** private_url: personal homepage URL. */ @Test - @DisplayName("Multiple PII entities in one document") + @DisplayName("Type: private_url") + void testTypeUrl() throws Exception { + String text = "My personal page is at https://janedoe.personal-site.com/about and I post there."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long urlCount = anomalies.stream().filter(a -> "private_url".equals(a.getCategory())).count(); + System.out.println("private_url spans: " + urlCount); + assertTrue(urlCount > 0, "Expected category 'private_url' for the personal URL"); + } + + /** private_date: personally identifying date (e.g. birth date). 
*/ + @Test + @DisplayName("Type: private_date") + void testTypeDate() throws Exception { + String text = "Jane Doe was born on March 15, 1990 in Chicago."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), "Expected at least one annotation (person or date)"); + long dateCount = anomalies.stream().filter(a -> "private_date".equals(a.getCategory())).count(); + System.out.println("private_date spans: " + dateCount); + assertTrue(dateCount > 0, "Expected category 'private_date' for 'March 15, 1990'"); + } + + /** account_number: credit-card style number string. */ + @Test + @DisplayName("Type: account_number") + void testTypeAccountNumber() throws Exception { + String text = "Please transfer funds to account number 4532-0151-1283-0366 at Deutsche Bank."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long acctCount = anomalies.stream().filter(a -> "account_number".equals(a.getCategory())).count(); + System.out.println("account_number spans: " + acctCount); + assertTrue(acctCount > 0, "Expected category 'account_number' for the card number"); + } + + /** secret: API key / credential in text. 
*/ + @Test + @DisplayName("Type: secret") + void testTypeSecret() throws Exception { + String text = "The API key is sk-proj-abc123XYZ987 and the password is H@nt3r2secure!."; + createCas("en", text); + + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); + composer.run(cas); + + Collection anomalies = collectAnomalies(); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + long secretCount = anomalies.stream().filter(a -> "secret".equals(a.getCategory())).count(); + System.out.println("secret spans: " + secretCount); + assertTrue(secretCount > 0, "Expected category 'secret' for API key / password"); + } + + // ------------------------------------------------------------------- + // Feature / combination tests + // ------------------------------------------------------------------- + + /** Multiple PII types in one document; verifies distinct categories are detected. */ + @Test + @DisplayName("Multiple PII types in one document") void testMultiplePiiEntities() throws Exception { String text = "Patient: Jane Doe, DOB: 1985-03-22. 
" + @@ -272,161 +437,126 @@ void testMultiplePiiEntities() throws Exception { "Address: 60325 Frankfurt am Main, Goethe-Platz 1."; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - ); - + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); composer.run(cas); Collection anomalies = collectAnomalies(); System.out.println("Anomaly count: " + anomalies.size()); - for (Anomaly a : anomalies) { - System.out.printf(" [%d-%d] category=%s text=%s%n", - a.getBegin(), a.getEnd(), a.getCategory(), - text.substring(a.getBegin(), a.getEnd())); - } + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = '%s'%n", + a.getBegin(), a.getEnd(), a.getCategory(), text.substring(a.getBegin(), a.getEnd()))); assertTrue(anomalies.size() >= 2, - "Expected at least 2 Anomaly annotations for a document with multiple PII entities"); + "Expected at least 2 distinct PII annotations"); + + long distinctCategories = anomalies.stream().map(Anomaly::getCategory).distinct().count(); + System.out.println("Distinct categories: " + distinctCategories); + assertTrue(distinctCategories >= 2, + "Expected annotations from at least 2 different PII categories"); } - /** - * Pseudo-mode: the service should return the text unchanged (stub behavior). - * Asserts no Anomaly annotations are created. - */ + /** Smoke test with two PII types in one sentence. 
*/ @Test - @DisplayName("Pseudo mode returns unchanged text") - void testPseudoMode() throws Exception { - String text = "Alice and Bob met at the Frankfurt main station."; + @DisplayName("Smoke test: person + email in one sentence") + void testSimplePerson() throws Exception { + String text = "My name is Harry Potter and my email is harry.potter@hogwarts.edu."; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - .withParameter("mode", "pseudo") - ); - + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); composer.run(cas); Collection anomalies = collectAnomalies(); - System.out.println("Anomaly count (pseudo mode): " + anomalies.size()); - assertTrue(anomalies.isEmpty(), - "Pseudo mode should produce no Anomaly annotations (stub returns input unchanged)"); + System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); + + assertFalse(anomalies.isEmpty(), + "Expected at least one Anomaly annotation"); } - /** - * Custom placeholder: verifies the {@link Anomaly#getDescription()} contains the - * user-supplied placeholder string instead of the default {@code }. - */ + /** Ambiguous context where person identity is inferred from surrounding detail. 
*/ @Test - @DisplayName("Custom placeholder is reflected in Anomaly description") - void testCustomPlaceholder() throws Exception { - String text = "Send the report to max.mustermann@uni-frankfurt.de by Friday."; - String placeholder = "***PRIVATE***"; + @DisplayName("Complex context: identity inferred from description") + void testComplexContext() throws Exception { + String text = "His name is Harry, he works at the TTLAB in Frankfurt, " + + "he's the only Chinese guy in the office."; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - .withParameter("placeholder", placeholder) - ); - + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "remove")); // remove and placeholder modes should detect the same spans composer.run(cas); Collection anomalies = collectAnomalies(); System.out.println("Anomaly count: " + anomalies.size()); + anomalies.forEach(a -> System.out.printf(" [%d-%d] %s = %s%n", + a.getBegin(), a.getEnd(), a.getCategory(), a.getDescription())); - boolean foundCustomPlaceholder = anomalies.stream() - .anyMatch(a -> placeholder.equals(a.getDescription())); - assertTrue(foundCustomPlaceholder, - "At least one Anomaly should carry the custom placeholder '" + placeholder + "'"); + assertFalse(anomalies.isEmpty(), "Expected at least one annotation in complex context"); } - /** - * Selection window: only the text between offsets [8, 36] should be analysed. - * Entities outside that window must not be annotated. - */ + /** Selection window: every annotation must lie within [selBegin, selEnd]. */ @Test @DisplayName("Selection window constrains annotation range") void testSelectionWindow() throws Exception { - // offsets: 0123456789012345678901234567890123456789 - // Call Dr. John Adams at 555-0100 today. - // window [8, 28] covers "John Adams at 555-0100" + // window [9, 30] covers "John Adams at 555-0100" String text = "Call Dr. 
John Adams at 555-0100 today."; int selBegin = 9; int selEnd = 30; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - .withParameter("selection_begin", String.valueOf(selBegin)) - .withParameter("selection_end", String.valueOf(selEnd)) - ); - + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder") + .withParameter("selection_begin", String.valueOf(selBegin)) + .withParameter("selection_end", String.valueOf(selEnd))); composer.run(cas); Collection anomalies = collectAnomalies(); System.out.println("Anomaly count (selection window): " + anomalies.size()); for (Anomaly a : anomalies) { assertTrue(a.getBegin() >= selBegin && a.getEnd() <= selEnd, - String.format("Anomaly [%d-%d] falls outside the selection window [%d-%d]", + String.format("Anomaly [%d-%d] outside window [%d-%d]", a.getBegin(), a.getEnd(), selBegin, selEnd)); } } - /** - * Empty document: the annotator must not throw and must return no anomalies. - */ + /** Empty document must not throw and must return zero annotations. */ @Test @DisplayName("Empty document produces no anomalies") void testEmptyDocument() throws Exception { createCas("en", ""); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - ); - + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); composer.run(cas); - Collection anomalies = collectAnomalies(); - assertTrue(anomalies.isEmpty(), + assertTrue(collectAnomalies().isEmpty(), "An empty document should produce zero Anomaly annotations"); } - /** - * German text: verifies the annotator handles non-English input without crashing. - * The model may or may not detect German PII depending on the loaded checkpoint; - * we only assert no exception is thrown. - */ + /** German text must not throw; detection quality may vary. 
*/ @Test @DisplayName("German text does not cause an exception") void testGermanText() throws Exception { String text = "Herr Klaus Muller wohnt in der Goethestrasse 12, 60313 Frankfurt am Main."; createCas("de", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - ); + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); - // Should complete without throwing assertDoesNotThrow(() -> composer.run(cas)); - - Collection anomalies = collectAnomalies(); - System.out.println("German Anomaly count: " + anomalies.size()); + System.out.println("German Anomaly count: " + collectAnomalies().size()); } - /** - * XMI serialisation round-trip: runs the annotator and writes the CAS to an XMI - * file so the result can be inspected with the UIMA CAS Editor. - */ + /** XMI round-trip: annotate and write to src/test/results/ for manual inspection. */ @Test @DisplayName("XMI output is written to src/test/results/") void testXmiOutput() throws Exception { - String text = - "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 5, 10115 Berlin."; + String text = "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 
5, 10115 Berlin."; createCas("en", text); - composer.add( - new DUUIRemoteDriver.Component(SERVICE_URL) - ); + composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) + .withParameter("mode", "placeholder")); composer.add(new DUUIUIMADriver.Component( createEngineDescription(XmiWriter.class, diff --git a/duui-anonymize/src/test/results/testComplexContext.json b/duui-anonymize/src/test/results/testComplexContext.json new file mode 100644 index 00000000..ec52ea8b --- /dev/null +++ b/duui-anonymize/src/test/results/testComplexContext.json @@ -0,0 +1,15 @@ +{ + "test": "testComplexContext", + "input": "His name is Harry, he works at the TTLAB in Frankfurt, he's the only Chinese guy in the office.", + "output": "His name is[private_person], he works at the TTLAB in Frankfurt, he's the only Chinese guy in the office.", + "anomaly_count": 1, + "anomalies": [ + { + "begin": 11, + "end": 17, + "category": "private_person", + "description": "[private_person]", + "text": " Harry" + } + ] +} diff --git a/duui-anonymize/src/test/results/testComplexContext.xmi b/duui-anonymize/src/test/results/testComplexContext.xmi new file mode 100644 index 00000000..d223c1f1 --- /dev/null +++ b/duui-anonymize/src/test/results/testComplexContext.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testCustomPlaceholder.json b/duui-anonymize/src/test/results/testCustomPlaceholder.json index fc0010e6..05ef427d 100644 --- a/duui-anonymize/src/test/results/testCustomPlaceholder.json +++ b/duui-anonymize/src/test/results/testCustomPlaceholder.json @@ -1,15 +1,22 @@ { "test": "testCustomPlaceholder", "input": "Send the report to max.mustermann@uni-frankfurt.de by Friday.", - "redacted": "Send the report to ann@uni-frankfurt.de by Friday.", - "anomaly_count": 1, + "redacted": "Send the report to[private_email][private_email] by Friday.", + "anomaly_count": 2, "anomalies": [ { - "begin": 19, - "end": 30, - "category": "private_person", - "description": "", - 
"text": "max.musterm" + "begin": 18, + "end": 47, + "category": "private_email", + "description": "[private_email]", + "text": " max.mustermann@uni-frankfurt" + }, + { + "begin": 47, + "end": 50, + "category": "private_email", + "description": "[private_email]", + "text": ".de" } ] } diff --git a/duui-anonymize/src/test/results/testCustomPlaceholder.xmi b/duui-anonymize/src/test/results/testCustomPlaceholder.xmi index 5876ca42..ff0c6045 100644 --- a/duui-anonymize/src/test/results/testCustomPlaceholder.xmi +++ b/duui-anonymize/src/test/results/testCustomPlaceholder.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testEmailRedaction.json b/duui-anonymize/src/test/results/testEmailRedaction.json index af7f2370..a54fa4ca 100644 --- a/duui-anonymize/src/test/results/testEmailRedaction.json +++ b/duui-anonymize/src/test/results/testEmailRedaction.json @@ -1,8 +1,22 @@ { "test": "testEmailRedaction", "input": "Please contact support at alice@example.com for further assistance.", - "redacted": "Please contact support at alice@example.com for further assistance.", - "anomaly_count": 0, + "redacted": "Please contact support at[private_email][private_email] for further assistance.", + "anomaly_count": 2, "anomalies": [ + { + "begin": 25, + "end": 39, + "category": "private_email", + "description": "[private_email]", + "text": " alice@example" + }, + { + "begin": 39, + "end": 43, + "category": "private_email", + "description": "[private_email]", + "text": ".com" + } ] } diff --git a/duui-anonymize/src/test/results/testEmailRedaction.xmi b/duui-anonymize/src/test/results/testEmailRedaction.xmi index a53ad21e..df0fcb56 100644 --- a/duui-anonymize/src/test/results/testEmailRedaction.xmi +++ b/duui-anonymize/src/test/results/testEmailRedaction.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testEmptyDocument.json 
b/duui-anonymize/src/test/results/testEmptyDocument.json index 8400af10..6886cf97 100644 --- a/duui-anonymize/src/test/results/testEmptyDocument.json +++ b/duui-anonymize/src/test/results/testEmptyDocument.json @@ -1,7 +1,7 @@ { "test": "testEmptyDocument", "input": "", - "redacted": "", + "output": "", "anomaly_count": 0, "anomalies": [ ] diff --git a/duui-anonymize/src/test/results/testEmptyDocument.xmi b/duui-anonymize/src/test/results/testEmptyDocument.xmi index 294d340e..45caf477 100644 --- a/duui-anonymize/src/test/results/testEmptyDocument.xmi +++ b/duui-anonymize/src/test/results/testEmptyDocument.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testGermanText.json b/duui-anonymize/src/test/results/testGermanText.json index d5373b0e..f59c3220 100644 --- a/duui-anonymize/src/test/results/testGermanText.json +++ b/duui-anonymize/src/test/results/testGermanText.json @@ -1,15 +1,36 @@ { "test": "testGermanText", "input": "Herr Klaus Muller wohnt in der Goethestrasse 12, 60313 Frankfurt am Main.", - "redacted": "Herr Klaus wohnt in der Goethestrasse 12, 60313 Frankfurt am Main.", - "anomaly_count": 1, + "output": "[private_person][private_person] wohnt in der[private_address][private_address].", + "anomaly_count": 4, "anomalies": [ { - "begin": 11, + "begin": 0, + "end": 10, + "category": "private_person", + "description": "[private_person]", + "text": "Herr Klaus" + }, + { + "begin": 10, "end": 17, "category": "private_person", - "description": "", - "text": "Muller" + "description": "[private_person]", + "text": " Muller" + }, + { + "begin": 30, + "end": 67, + "category": "private_address", + "description": "[private_address]", + "text": " Goethestrasse 12, 60313 Frankfurt am" + }, + { + "begin": 67, + "end": 72, + "category": "private_address", + "description": "[private_address]", + "text": " Main" } ] } diff --git a/duui-anonymize/src/test/results/testGermanText.xmi 
b/duui-anonymize/src/test/results/testGermanText.xmi index dbaeaf6e..d76766e6 100644 --- a/duui-anonymize/src/test/results/testGermanText.xmi +++ b/duui-anonymize/src/test/results/testGermanText.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testMultiplePiiEntities.json b/duui-anonymize/src/test/results/testMultiplePiiEntities.json index 9810c920..047322f6 100644 --- a/duui-anonymize/src/test/results/testMultiplePiiEntities.json +++ b/duui-anonymize/src/test/results/testMultiplePiiEntities.json @@ -1,22 +1,78 @@ { "test": "testMultiplePiiEntities", "input": "Patient: Jane Doe, DOB: 1985-03-22. Contact: jane.doe@hospital.org, Tel: 069-9876-5432. Address: 60325 Frankfurt am Main, Goethe-Platz 1.", - "redacted": "Patient: , DOB: 3-22. Contact: jane.doe@hospital.org, Tel: 069-9876-5432. Address: 60325 Frankfurt am Main, Goethe-Platz 1.", - "anomaly_count": 2, + "output": "Patient:[private_person][private_person], DOB: [private_date][private_date]. Contact:[private_email][private_email], Tel: [private_phone][private_phone]. 
Address: [private_address][private_address].", + "anomaly_count": 10, "anomalies": [ { - "begin": 9, + "begin": 8, + "end": 13, + "category": "private_person", + "description": "[private_person]", + "text": " Jane" + }, + { + "begin": 13, "end": 17, "category": "private_person", - "description": "", - "text": "Jane Doe" + "description": "[private_person]", + "text": " Doe" }, { "begin": 24, - "end": 30, + "end": 32, "category": "private_date", - "description": "", - "text": "1985-0" + "description": "[private_date]", + "text": "1985-03-" + }, + { + "begin": 32, + "end": 34, + "category": "private_date", + "description": "[private_date]", + "text": "22" + }, + { + "begin": 44, + "end": 62, + "category": "private_email", + "description": "[private_email]", + "text": " jane.doe@hospital" + }, + { + "begin": 62, + "end": 66, + "category": "private_email", + "description": "[private_email]", + "text": ".org" + }, + { + "begin": 73, + "end": 85, + "category": "private_phone", + "description": "[private_phone]", + "text": "069-9876-543" + }, + { + "begin": 85, + "end": 86, + "category": "private_phone", + "description": "[private_phone]", + "text": "2" + }, + { + "begin": 97, + "end": 135, + "category": "private_address", + "description": "[private_address]", + "text": "60325 Frankfurt am Main, Goethe-Platz " + }, + { + "begin": 135, + "end": 136, + "category": "private_address", + "description": "[private_address]", + "text": "1" } ] } diff --git a/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi b/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi index e05d339d..7e0beb85 100644 --- a/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi +++ b/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPhoneNumberRedaction.json b/duui-anonymize/src/test/results/testPhoneNumberRedaction.json index aff12c6f..025d709a 100644 --- 
a/duui-anonymize/src/test/results/testPhoneNumberRedaction.json +++ b/duui-anonymize/src/test/results/testPhoneNumberRedaction.json @@ -1,15 +1,36 @@ { "test": "testPhoneNumberRedaction", "input": "You can reach Dr. Miller at +49 69 1234 5678 during office hours.", - "redacted": "You can reach at +49 69 1234 5678 during office hours.", - "anomaly_count": 1, + "redacted": "You can reach[private_person][private_person] at[private_phone][private_phone] during office hours.", + "anomaly_count": 4, "anomalies": [ { - "begin": 14, + "begin": 13, + "end": 17, + "category": "private_person", + "description": "[private_person]", + "text": " Dr." + }, + { + "begin": 17, "end": 24, "category": "private_person", - "description": "", - "text": "Dr. Miller" + "description": "[private_person]", + "text": " Miller" + }, + { + "begin": 27, + "end": 43, + "category": "private_phone", + "description": "[private_phone]", + "text": " +49 69 1234 567" + }, + { + "begin": 43, + "end": 44, + "category": "private_phone", + "description": "[private_phone]", + "text": "8" } ] } diff --git a/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi b/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi index 4249f4ac..832fbcf6 100644 --- a/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi +++ b/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPlaceholderMode.json b/duui-anonymize/src/test/results/testPlaceholderMode.json new file mode 100644 index 00000000..2e65f167 --- /dev/null +++ b/duui-anonymize/src/test/results/testPlaceholderMode.json @@ -0,0 +1,22 @@ +{ + "test": "testPlaceholderMode", + "input": "Send the report to max.mustermann@uni-frankfurt.de by Friday.", + "output": "Send the report to[private_email][private_email] by Friday.", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 18, + "end": 47, + "category": 
"private_email", + "description": "[private_email]", + "text": " max.mustermann@uni-frankfurt" + }, + { + "begin": 47, + "end": 50, + "category": "private_email", + "description": "[private_email]", + "text": ".de" + } + ] +} diff --git a/duui-anonymize/src/test/results/testPlaceholderMode.xmi b/duui-anonymize/src/test/results/testPlaceholderMode.xmi new file mode 100644 index 00000000..113975bb --- /dev/null +++ b/duui-anonymize/src/test/results/testPlaceholderMode.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPseudoMode.json b/duui-anonymize/src/test/results/testPseudoMode.json index a712cc0c..ff34d374 100644 --- a/duui-anonymize/src/test/results/testPseudoMode.json +++ b/duui-anonymize/src/test/results/testPseudoMode.json @@ -1,15 +1,22 @@ { "test": "testPseudoMode", "input": "Alice and Bob met at the Frankfurt main station.", - "redacted": "Alice and met at the Frankfurt main station.", - "anomaly_count": 1, + "output": "[private_person] and[private_person] met at the Frankfurt main station.", + "anomaly_count": 2, "anomalies": [ { - "begin": 10, + "begin": 0, + "end": 5, + "category": "private_person", + "description": "[private_person]", + "text": "Alice" + }, + { + "begin": 9, "end": 13, "category": "private_person", - "description": "", - "text": "Bob" + "description": "[private_person]", + "text": " Bob" } ] } diff --git a/duui-anonymize/src/test/results/testPseudoMode.xmi b/duui-anonymize/src/test/results/testPseudoMode.xmi index 8a4b914e..fd9a06c1 100644 --- a/duui-anonymize/src/test/results/testPseudoMode.xmi +++ b/duui-anonymize/src/test/results/testPseudoMode.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testRemoveMode.json b/duui-anonymize/src/test/results/testRemoveMode.json new file mode 100644 index 00000000..d37ad17d --- /dev/null +++ b/duui-anonymize/src/test/results/testRemoveMode.json @@ -0,0 +1,50 @@ +{ + "test": 
"testRemoveMode", + "input": "Call John Smith at john.smith@company.com or +1-800-555-0199 for help.", + "output": "Call[private_person][private_person] at[private_email][private_email] or[private_phone][private_phone] for help.", + "anomaly_count": 6, + "anomalies": [ + { + "begin": 4, + "end": 9, + "category": "private_person", + "description": "[private_person]", + "text": " John" + }, + { + "begin": 9, + "end": 15, + "category": "private_person", + "description": "[private_person]", + "text": " Smith" + }, + { + "begin": 18, + "end": 37, + "category": "private_email", + "description": "[private_email]", + "text": " john.smith@company" + }, + { + "begin": 37, + "end": 41, + "category": "private_email", + "description": "[private_email]", + "text": ".com" + }, + { + "begin": 44, + "end": 59, + "category": "private_phone", + "description": "[private_phone]", + "text": " +1-800-555-019" + }, + { + "begin": 59, + "end": 60, + "category": "private_phone", + "description": "[private_phone]", + "text": "9" + } + ] +} diff --git a/duui-anonymize/src/test/results/testRemoveMode.xmi b/duui-anonymize/src/test/results/testRemoveMode.xmi new file mode 100644 index 00000000..ec65475f --- /dev/null +++ b/duui-anonymize/src/test/results/testRemoveMode.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testSelectionWindow.json b/duui-anonymize/src/test/results/testSelectionWindow.json index e073b535..3c9d39bf 100644 --- a/duui-anonymize/src/test/results/testSelectionWindow.json +++ b/duui-anonymize/src/test/results/testSelectionWindow.json @@ -1,22 +1,36 @@ { "test": "testSelectionWindow", "input": "Call Dr. John Adams at 555-0100 today.", - "redacted": "Call Dr. 
at 0 today.", - "anomaly_count": 2, + "output": "Call[private_person][private_person] at [private_phone][private_phone].", + "anomaly_count": 4, "anomalies": [ { - "begin": 9, + "begin": 4, + "end": 13, + "category": "private_person", + "description": "[private_person]", + "text": " Dr. John" + }, + { + "begin": 13, "end": 19, "category": "private_person", - "description": "", - "text": "John Adams" + "description": "[private_person]", + "text": " Adams" }, { "begin": 23, - "end": 30, + "end": 31, + "category": "private_phone", + "description": "[private_phone]", + "text": "555-0100" + }, + { + "begin": 31, + "end": 37, "category": "private_phone", - "description": "", - "text": "555-010" + "description": "[private_phone]", + "text": " today" } ] } diff --git a/duui-anonymize/src/test/results/testSelectionWindow.xmi b/duui-anonymize/src/test/results/testSelectionWindow.xmi index 76f71548..080206f1 100644 --- a/duui-anonymize/src/test/results/testSelectionWindow.xmi +++ b/duui-anonymize/src/test/results/testSelectionWindow.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testSimplePerson.json b/duui-anonymize/src/test/results/testSimplePerson.json new file mode 100644 index 00000000..9a671f3a --- /dev/null +++ b/duui-anonymize/src/test/results/testSimplePerson.json @@ -0,0 +1,36 @@ +{ + "test": "testSimplePerson", + "input": "My name is Harry Potter and my email is harry.potter@hogwarts.edu.", + "output": "My name is[private_person][private_person] and my email is[private_email][private_email].", + "anomaly_count": 4, + "anomalies": [ + { + "begin": 10, + "end": 16, + "category": "private_person", + "description": "[private_person]", + "text": " Harry" + }, + { + "begin": 16, + "end": 23, + "category": "private_person", + "description": "[private_person]", + "text": " Potter" + }, + { + "begin": 39, + "end": 61, + "category": "private_email", + "description": "[private_email]", + "text": " 
harry.potter@hogwarts" + }, + { + "begin": 61, + "end": 65, + "category": "private_email", + "description": "[private_email]", + "text": ".edu" + } + ] +} diff --git a/duui-anonymize/src/test/results/testSimplePerson.xmi b/duui-anonymize/src/test/results/testSimplePerson.xmi new file mode 100644 index 00000000..c6ecea54 --- /dev/null +++ b/duui-anonymize/src/test/results/testSimplePerson.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testSimplePersonName.json b/duui-anonymize/src/test/results/testSimplePersonName.json index 30b0e20c..cd1e49a6 100644 --- a/duui-anonymize/src/test/results/testSimplePersonName.json +++ b/duui-anonymize/src/test/results/testSimplePersonName.json @@ -1,8 +1,22 @@ { "test": "testSimplePersonName", "input": "John Smith called the bank to report a fraud.", - "redacted": "John Smith called the bank to report a fraud.", - "anomaly_count": 0, + "redacted": "[private_person][private_person] called the bank to report a fraud.", + "anomaly_count": 2, "anomalies": [ + { + "begin": 0, + "end": 4, + "category": "private_person", + "description": "[private_person]", + "text": "John" + }, + { + "begin": 4, + "end": 10, + "category": "private_person", + "description": "[private_person]", + "text": " Smith" + } ] } diff --git a/duui-anonymize/src/test/results/testSimplePersonName.xmi b/duui-anonymize/src/test/results/testSimplePersonName.xmi index f1a952e4..45df7107 100644 --- a/duui-anonymize/src/test/results/testSimplePersonName.xmi +++ b/duui-anonymize/src/test/results/testSimplePersonName.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeAccountNumber.json b/duui-anonymize/src/test/results/testTypeAccountNumber.json new file mode 100644 index 00000000..c9fb1855 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeAccountNumber.json @@ -0,0 +1,22 @@ +{ + "test": "testTypeAccountNumber", + "input": "Please transfer funds 
to account number 4532-0151-1283-0366 at Deutsche Bank.", + "output": "Please transfer funds to account number [account_number][account_number] at Deutsche Bank.", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 40, + "end": 58, + "category": "account_number", + "description": "[account_number]", + "text": "4532-0151-1283-036" + }, + { + "begin": 58, + "end": 59, + "category": "account_number", + "description": "[account_number]", + "text": "6" + } + ] +} diff --git a/duui-anonymize/src/test/results/testTypeAccountNumber.xmi b/duui-anonymize/src/test/results/testTypeAccountNumber.xmi new file mode 100644 index 00000000..fe02421c --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeAccountNumber.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeAddress.json b/duui-anonymize/src/test/results/testTypeAddress.json new file mode 100644 index 00000000..b0021aee --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeAddress.json @@ -0,0 +1,22 @@ +{ + "test": "testTypeAddress", + "input": "She lives at 742 Evergreen Terrace, Springfield, IL 62704.", + "output": "She lives at [private_address][private_address].", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 13, + "end": 55, + "category": "private_address", + "description": "[private_address]", + "text": "742 Evergreen Terrace, Springfield, IL 627" + }, + { + "begin": 55, + "end": 57, + "category": "private_address", + "description": "[private_address]", + "text": "04" + } + ] +} diff --git a/duui-anonymize/src/test/results/testTypeAddress.xmi b/duui-anonymize/src/test/results/testTypeAddress.xmi new file mode 100644 index 00000000..45f6b8f8 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeAddress.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeDate.json b/duui-anonymize/src/test/results/testTypeDate.json new file mode 100644 index 00000000..3ae5700c --- /dev/null +++ 
b/duui-anonymize/src/test/results/testTypeDate.json @@ -0,0 +1,36 @@ +{ + "test": "testTypeDate", + "input": "Jane Doe was born on March 15, 1990 in Chicago.", + "output": "[private_person][private_person] was born on[private_date][private_date] in Chicago.", + "anomaly_count": 4, + "anomalies": [ + { + "begin": 0, + "end": 4, + "category": "private_person", + "description": "[private_person]", + "text": "Jane" + }, + { + "begin": 4, + "end": 8, + "category": "private_person", + "description": "[private_person]", + "text": " Doe" + }, + { + "begin": 20, + "end": 34, + "category": "private_date", + "description": "[private_date]", + "text": " March 15, 199" + }, + { + "begin": 34, + "end": 35, + "category": "private_date", + "description": "[private_date]", + "text": "0" + } + ] +} diff --git a/duui-anonymize/src/test/results/testTypeDate.xmi b/duui-anonymize/src/test/results/testTypeDate.xmi new file mode 100644 index 00000000..09708693 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeDate.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeEmail.json b/duui-anonymize/src/test/results/testTypeEmail.json new file mode 100644 index 00000000..ee484679 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeEmail.json @@ -0,0 +1,22 @@ +{ + "test": "testTypeEmail", + "input": "Please contact alice@example.com for further assistance.", + "output": "Please contact[private_email][private_email] for further assistance.", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 14, + "end": 28, + "category": "private_email", + "description": "[private_email]", + "text": " alice@example" + }, + { + "begin": 28, + "end": 32, + "category": "private_email", + "description": "[private_email]", + "text": ".com" + } + ] +} diff --git a/duui-anonymize/src/test/results/testTypeEmail.xmi b/duui-anonymize/src/test/results/testTypeEmail.xmi new file mode 100644 index 00000000..2ef3adcd --- /dev/null +++ 
b/duui-anonymize/src/test/results/testTypeEmail.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypePerson.json b/duui-anonymize/src/test/results/testTypePerson.json new file mode 100644 index 00000000..cac4b168 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypePerson.json @@ -0,0 +1,22 @@ +{ + "test": "testTypePerson", + "input": "John Smith called the bank to report a fraud.", + "output": "[private_person][private_person] called the bank to report a fraud.", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 0, + "end": 4, + "category": "private_person", + "description": "[private_person]", + "text": "John" + }, + { + "begin": 4, + "end": 10, + "category": "private_person", + "description": "[private_person]", + "text": " Smith" + } + ] +} diff --git a/duui-anonymize/src/test/results/testTypePerson.xmi b/duui-anonymize/src/test/results/testTypePerson.xmi new file mode 100644 index 00000000..02f64cc8 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypePerson.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypePhone.json b/duui-anonymize/src/test/results/testTypePhone.json new file mode 100644 index 00000000..537891ff --- /dev/null +++ b/duui-anonymize/src/test/results/testTypePhone.json @@ -0,0 +1,36 @@ +{ + "test": "testTypePhone", + "input": "You can reach Dr. Miller at +49 69 1234 5678 during office hours.", + "output": "You can reach[private_person][private_person] at[private_phone][private_phone] during office hours.", + "anomaly_count": 4, + "anomalies": [ + { + "begin": 13, + "end": 17, + "category": "private_person", + "description": "[private_person]", + "text": " Dr." 
+ }, + { + "begin": 17, + "end": 24, + "category": "private_person", + "description": "[private_person]", + "text": " Miller" + }, + { + "begin": 27, + "end": 43, + "category": "private_phone", + "description": "[private_phone]", + "text": " +49 69 1234 567" + }, + { + "begin": 43, + "end": 44, + "category": "private_phone", + "description": "[private_phone]", + "text": "8" + } + ] +} diff --git a/duui-anonymize/src/test/results/testTypePhone.xmi b/duui-anonymize/src/test/results/testTypePhone.xmi new file mode 100644 index 00000000..fd689e5a --- /dev/null +++ b/duui-anonymize/src/test/results/testTypePhone.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeSecret.json b/duui-anonymize/src/test/results/testTypeSecret.json new file mode 100644 index 00000000..25b8f1fb --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeSecret.json @@ -0,0 +1,36 @@ +{ + "test": "testTypeSecret", + "input": "The API key is sk-proj-abc123XYZ987 and the password is H@nt3r2secure!.", + "output": "The API key is[secret][secret] and the password is[secret][secret]", + "anomaly_count": 4, + "anomalies": [ + { + "begin": 14, + "end": 32, + "category": "secret", + "description": "[secret]", + "text": " sk-proj-abc123XYZ" + }, + { + "begin": 32, + "end": 35, + "category": "secret", + "description": "[secret]", + "text": "987" + }, + { + "begin": 55, + "end": 69, + "category": "secret", + "description": "[secret]", + "text": " H@nt3r2secure" + }, + { + "begin": 69, + "end": 71, + "category": "secret", + "description": "[secret]", + "text": "!." 
+ } + ] +} diff --git a/duui-anonymize/src/test/results/testTypeSecret.xmi b/duui-anonymize/src/test/results/testTypeSecret.xmi new file mode 100644 index 00000000..0318ff59 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeSecret.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeUrl.json b/duui-anonymize/src/test/results/testTypeUrl.json new file mode 100644 index 00000000..bc2b97ea --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeUrl.json @@ -0,0 +1,22 @@ +{ + "test": "testTypeUrl", + "input": "My personal page is at https://janedoe.personal-site.com/about and I post there.", + "output": "My personal page is at[private_url][private_url] and I post there.", + "anomaly_count": 2, + "anomalies": [ + { + "begin": 22, + "end": 56, + "category": "private_url", + "description": "[private_url]", + "text": " https://janedoe.personal-site.com" + }, + { + "begin": 56, + "end": 62, + "category": "private_url", + "description": "[private_url]", + "text": "/about" + } + ] +} diff --git a/duui-anonymize/src/test/results/testTypeUrl.xmi b/duui-anonymize/src/test/results/testTypeUrl.xmi new file mode 100644 index 00000000..6b6ef186 --- /dev/null +++ b/duui-anonymize/src/test/results/testTypeUrl.xmi @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testXmiOutput.json b/duui-anonymize/src/test/results/testXmiOutput.json index c8e7cb5c..d53f9c66 100644 --- a/duui-anonymize/src/test/results/testXmiOutput.json +++ b/duui-anonymize/src/test/results/testXmiOutput.json @@ -1,15 +1,50 @@ { "test": "testXmiOutput", "input": "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 5, 10115 Berlin.", - "redacted": "Maria Schmidt (le.de) lives at Berliner Str. 
5, 10115 Berlin.", - "anomaly_count": 1, + "output": "[private_person][private_person] ([private_email][private_email]) lives at[private_address][private_address]", + "anomaly_count": 6, "anomalies": [ + { + "begin": 0, + "end": 5, + "category": "private_person", + "description": "[private_person]", + "text": "Maria" + }, + { + "begin": 5, + "end": 13, + "category": "private_person", + "description": "[private_person]", + "text": " Schmidt" + }, { "begin": 15, - "end": 30, + "end": 32, + "category": "private_email", + "description": "[private_email]", + "text": "m.schmidt@example" + }, + { + "begin": 32, + "end": 35, "category": "private_email", - "description": "", - "text": "m.schmidt@examp" + "description": "[private_email]", + "text": ".de" + }, + { + "begin": 45, + "end": 75, + "category": "private_address", + "description": "[private_address]", + "text": " Berliner Str. 5, 10115 Berlin" + }, + { + "begin": 75, + "end": 76, + "category": "private_address", + "description": "[private_address]", + "text": "." 
} ] } diff --git a/duui-anonymize/src/test/results/testXmiOutput.xmi b/duui-anonymize/src/test/results/testXmiOutput.xmi index 4aff8e61..e79bf930 100644 --- a/duui-anonymize/src/test/results/testXmiOutput.xmi +++ b/duui-anonymize/src/test/results/testXmiOutput.xmi @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file From 28d77d32273a23311b33103dea5d3291dc52d830 Mon Sep 17 00:00:00 2001 From: Ali Abusaleh Date: Tue, 26 May 2026 15:38:23 +0200 Subject: [PATCH 3/3] fix mode passing --- duui-anonymize/.gitignore | 4 +- .../src/main/docker/python/communication.lua | 92 ------------------- .../src/main/python/communication.lua | 2 + .../src/test/java/AnonymizeTests.java | 2 +- .../src/test/results/testComplexContext.json | 15 --- .../src/test/results/testComplexContext.xmi | 1 - .../test/results/testCustomPlaceholder.json | 22 ----- .../test/results/testCustomPlaceholder.xmi | 1 - .../src/test/results/testEmailRedaction.json | 22 ----- .../src/test/results/testEmailRedaction.xmi | 1 - .../src/test/results/testEmptyDocument.json | 8 -- .../src/test/results/testEmptyDocument.xmi | 1 - .../src/test/results/testGermanText.json | 36 -------- .../src/test/results/testGermanText.xmi | 1 - .../test/results/testMultiplePiiEntities.json | 78 ---------------- .../test/results/testMultiplePiiEntities.xmi | 1 - .../results/testPhoneNumberRedaction.json | 36 -------- .../test/results/testPhoneNumberRedaction.xmi | 1 - .../src/test/results/testPlaceholderMode.json | 22 ----- .../src/test/results/testPlaceholderMode.xmi | 1 - .../src/test/results/testPseudoMode.json | 22 ----- .../src/test/results/testPseudoMode.xmi | 1 - .../src/test/results/testRemoveMode.json | 50 ---------- .../src/test/results/testRemoveMode.xmi | 1 - .../src/test/results/testSelectionWindow.json | 36 -------- .../src/test/results/testSelectionWindow.xmi | 1 - .../src/test/results/testSimplePerson.json | 36 -------- .../src/test/results/testSimplePerson.xmi | 1 - 
.../test/results/testSimplePersonName.json | 22 ----- .../src/test/results/testSimplePersonName.xmi | 1 - .../test/results/testTypeAccountNumber.json | 22 ----- .../test/results/testTypeAccountNumber.xmi | 1 - .../src/test/results/testTypeAddress.json | 22 ----- .../src/test/results/testTypeAddress.xmi | 1 - .../src/test/results/testTypeDate.json | 36 -------- .../src/test/results/testTypeDate.xmi | 1 - .../src/test/results/testTypeEmail.json | 22 ----- .../src/test/results/testTypeEmail.xmi | 1 - .../src/test/results/testTypePerson.json | 22 ----- .../src/test/results/testTypePerson.xmi | 1 - .../src/test/results/testTypePhone.json | 36 -------- .../src/test/results/testTypePhone.xmi | 1 - .../src/test/results/testTypeSecret.json | 36 -------- .../src/test/results/testTypeSecret.xmi | 1 - .../src/test/results/testTypeUrl.json | 22 ----- .../src/test/results/testTypeUrl.xmi | 1 - .../src/test/results/testXmiOutput.json | 50 ---------- .../src/test/results/testXmiOutput.xmi | 1 - 48 files changed, 6 insertions(+), 789 deletions(-) delete mode 100644 duui-anonymize/src/main/docker/python/communication.lua delete mode 100644 duui-anonymize/src/test/results/testComplexContext.json delete mode 100644 duui-anonymize/src/test/results/testComplexContext.xmi delete mode 100644 duui-anonymize/src/test/results/testCustomPlaceholder.json delete mode 100644 duui-anonymize/src/test/results/testCustomPlaceholder.xmi delete mode 100644 duui-anonymize/src/test/results/testEmailRedaction.json delete mode 100644 duui-anonymize/src/test/results/testEmailRedaction.xmi delete mode 100644 duui-anonymize/src/test/results/testEmptyDocument.json delete mode 100644 duui-anonymize/src/test/results/testEmptyDocument.xmi delete mode 100644 duui-anonymize/src/test/results/testGermanText.json delete mode 100644 duui-anonymize/src/test/results/testGermanText.xmi delete mode 100644 duui-anonymize/src/test/results/testMultiplePiiEntities.json delete mode 100644 
duui-anonymize/src/test/results/testMultiplePiiEntities.xmi delete mode 100644 duui-anonymize/src/test/results/testPhoneNumberRedaction.json delete mode 100644 duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi delete mode 100644 duui-anonymize/src/test/results/testPlaceholderMode.json delete mode 100644 duui-anonymize/src/test/results/testPlaceholderMode.xmi delete mode 100644 duui-anonymize/src/test/results/testPseudoMode.json delete mode 100644 duui-anonymize/src/test/results/testPseudoMode.xmi delete mode 100644 duui-anonymize/src/test/results/testRemoveMode.json delete mode 100644 duui-anonymize/src/test/results/testRemoveMode.xmi delete mode 100644 duui-anonymize/src/test/results/testSelectionWindow.json delete mode 100644 duui-anonymize/src/test/results/testSelectionWindow.xmi delete mode 100644 duui-anonymize/src/test/results/testSimplePerson.json delete mode 100644 duui-anonymize/src/test/results/testSimplePerson.xmi delete mode 100644 duui-anonymize/src/test/results/testSimplePersonName.json delete mode 100644 duui-anonymize/src/test/results/testSimplePersonName.xmi delete mode 100644 duui-anonymize/src/test/results/testTypeAccountNumber.json delete mode 100644 duui-anonymize/src/test/results/testTypeAccountNumber.xmi delete mode 100644 duui-anonymize/src/test/results/testTypeAddress.json delete mode 100644 duui-anonymize/src/test/results/testTypeAddress.xmi delete mode 100644 duui-anonymize/src/test/results/testTypeDate.json delete mode 100644 duui-anonymize/src/test/results/testTypeDate.xmi delete mode 100644 duui-anonymize/src/test/results/testTypeEmail.json delete mode 100644 duui-anonymize/src/test/results/testTypeEmail.xmi delete mode 100644 duui-anonymize/src/test/results/testTypePerson.json delete mode 100644 duui-anonymize/src/test/results/testTypePerson.xmi delete mode 100644 duui-anonymize/src/test/results/testTypePhone.json delete mode 100644 duui-anonymize/src/test/results/testTypePhone.xmi delete mode 100644 
duui-anonymize/src/test/results/testTypeSecret.json delete mode 100644 duui-anonymize/src/test/results/testTypeSecret.xmi delete mode 100644 duui-anonymize/src/test/results/testTypeUrl.json delete mode 100644 duui-anonymize/src/test/results/testTypeUrl.xmi delete mode 100644 duui-anonymize/src/test/results/testXmiOutput.json delete mode 100644 duui-anonymize/src/test/results/testXmiOutput.xmi diff --git a/duui-anonymize/.gitignore b/duui-anonymize/.gitignore index fc82091d..05703aca 100644 --- a/duui-anonymize/.gitignore +++ b/duui-anonymize/.gitignore @@ -9,4 +9,6 @@ __pycache__/** target/** dist/** -build/** \ No newline at end of file +build/** + +src/test/results/** \ No newline at end of file diff --git a/duui-anonymize/src/main/docker/python/communication.lua b/duui-anonymize/src/main/docker/python/communication.lua deleted file mode 100644 index 9c1876a2..00000000 --- a/duui-anonymize/src/main/docker/python/communication.lua +++ /dev/null @@ -1,92 +0,0 @@ --- Bind static classes from java -StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") -util = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") - --- Known option keys forwarded to the Python service. --- pairs() does not iterate Java map objects in LuaJ, so we read each key explicitly. 
-local OPTION_KEYS = { - "mode", "model", "device", - "context_window_length", "trim_whitespace", - "output_mode", "discard_overlapping_predicted_spans", -} - -local function copy_options(params) - if params == nil then return {} end - local options = {} - for _, key in ipairs(OPTION_KEYS) do - local value = params[key] - if value ~= nil then - options[key] = value - end - end - return options -end - -local function resolve_selection(params) - if params == nil then - return nil - end - - local selection = params["selection"] - if type(selection) == "table" then - local begin = selection["begin"] or selection["start"] - local ending = selection["end"] or selection["stop"] - if begin ~= nil and ending ~= nil then - return { begin = begin, ["end"] = ending } - end - end - - local begin = params["selection_begin"] or params["selection_start"] - local ending = params["selection_end"] or params["selection_stop"] - if begin ~= nil and ending ~= nil then - return { begin = begin, ["end"] = ending } - end - - return nil -end - --- Serialize the CAS into a JSON request sent to the Python service. -function serialize(inputCas, outputStream, params) - local text = inputCas:getSofaDataString() - if text == nil then text = "" end - - outputStream:write(json.encode({ - text = text, - options = copy_options(params), - selection = resolve_selection(params), - })) -end - --- Deserialize the JSON response from the Python service back into the CAS. --- --- Anomaly annotations are added to the *original* CAS view so their --- character offsets remain valid against the original document text. --- The redacted text is stored as the sofa of a separate "opf_redacted" view. 
-function deserialize(inputCas, inputStream) - local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) - local results = json.decode(inputString) - - -- Store redacted text in its own view (offsets here belong to redacted text, - -- so we do NOT add Anomaly annotations to this view). - if results["redacted_text"] ~= nil then - local ok, view = pcall(function() return inputCas:createView("opf_redacted") end) - if ok and view ~= nil then - view:setSofaDataString(results["redacted_text"], "text/plain") - end - end - - -- Add Anomaly annotations to the original view; offsets reference original text. - if results["detected_spans"] ~= nil then - for i, span in ipairs(results["detected_spans"]) do - local anomaly = luajava.newInstance( - "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", inputCas) - anomaly:setBegin(span["start"]) - anomaly:setEnd(span["end"]) - anomaly:setCategory(span["label"]) - -- description = replacement used (e.g. 
"[private_person]") or original word - anomaly:setDescription(span["placeholder"] ~= "" and span["placeholder"] - or span["text"] or span["label"]) - anomaly:addToIndexes() - end - end -end diff --git a/duui-anonymize/src/main/python/communication.lua b/duui-anonymize/src/main/python/communication.lua index 12156714..038d7167 100644 --- a/duui-anonymize/src/main/python/communication.lua +++ b/duui-anonymize/src/main/python/communication.lua @@ -23,9 +23,11 @@ local OPTION_KEYS = { local function copy_options(params) local options = {} + print("Copying options:") for _, key in ipairs(OPTION_KEYS) do local value = param_get(params, key) if value ~= nil then + print(" ", key, "=", value) options[key] = value end end diff --git a/duui-anonymize/src/test/java/AnonymizeTests.java b/duui-anonymize/src/test/java/AnonymizeTests.java index 55ff1ec5..c292b5bc 100644 --- a/duui-anonymize/src/test/java/AnonymizeTests.java +++ b/duui-anonymize/src/test/java/AnonymizeTests.java @@ -189,7 +189,7 @@ void testPlaceholderMode() throws Exception { createCas("en", text); composer.add(new DUUIRemoteDriver.Component(SERVICE_URL) - .withParameter("mode", "placeholder")); + .withParameter("mode", "remove")); composer.run(cas); Collection anomalies = collectAnomalies(); diff --git a/duui-anonymize/src/test/results/testComplexContext.json b/duui-anonymize/src/test/results/testComplexContext.json deleted file mode 100644 index ec52ea8b..00000000 --- a/duui-anonymize/src/test/results/testComplexContext.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "test": "testComplexContext", - "input": "His name is Harry, he works at the TTLAB in Frankfurt, he's the only Chinese guy in the office.", - "output": "His name is[private_person], he works at the TTLAB in Frankfurt, he's the only Chinese guy in the office.", - "anomaly_count": 1, - "anomalies": [ - { - "begin": 11, - "end": 17, - "category": "private_person", - "description": "[private_person]", - "text": " Harry" - } - ] -} diff --git 
a/duui-anonymize/src/test/results/testComplexContext.xmi b/duui-anonymize/src/test/results/testComplexContext.xmi deleted file mode 100644 index d223c1f1..00000000 --- a/duui-anonymize/src/test/results/testComplexContext.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testCustomPlaceholder.json b/duui-anonymize/src/test/results/testCustomPlaceholder.json deleted file mode 100644 index 05ef427d..00000000 --- a/duui-anonymize/src/test/results/testCustomPlaceholder.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testCustomPlaceholder", - "input": "Send the report to max.mustermann@uni-frankfurt.de by Friday.", - "redacted": "Send the report to[private_email][private_email] by Friday.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 18, - "end": 47, - "category": "private_email", - "description": "[private_email]", - "text": " max.mustermann@uni-frankfurt" - }, - { - "begin": 47, - "end": 50, - "category": "private_email", - "description": "[private_email]", - "text": ".de" - } - ] -} diff --git a/duui-anonymize/src/test/results/testCustomPlaceholder.xmi b/duui-anonymize/src/test/results/testCustomPlaceholder.xmi deleted file mode 100644 index ff0c6045..00000000 --- a/duui-anonymize/src/test/results/testCustomPlaceholder.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testEmailRedaction.json b/duui-anonymize/src/test/results/testEmailRedaction.json deleted file mode 100644 index a54fa4ca..00000000 --- a/duui-anonymize/src/test/results/testEmailRedaction.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testEmailRedaction", - "input": "Please contact support at alice@example.com for further assistance.", - "redacted": "Please contact support at[private_email][private_email] for further assistance.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 25, - "end": 39, - "category": "private_email", - "description": "[private_email]", - "text": 
" alice@example" - }, - { - "begin": 39, - "end": 43, - "category": "private_email", - "description": "[private_email]", - "text": ".com" - } - ] -} diff --git a/duui-anonymize/src/test/results/testEmailRedaction.xmi b/duui-anonymize/src/test/results/testEmailRedaction.xmi deleted file mode 100644 index df0fcb56..00000000 --- a/duui-anonymize/src/test/results/testEmailRedaction.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testEmptyDocument.json b/duui-anonymize/src/test/results/testEmptyDocument.json deleted file mode 100644 index 6886cf97..00000000 --- a/duui-anonymize/src/test/results/testEmptyDocument.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "test": "testEmptyDocument", - "input": "", - "output": "", - "anomaly_count": 0, - "anomalies": [ - ] -} diff --git a/duui-anonymize/src/test/results/testEmptyDocument.xmi b/duui-anonymize/src/test/results/testEmptyDocument.xmi deleted file mode 100644 index 45caf477..00000000 --- a/duui-anonymize/src/test/results/testEmptyDocument.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testGermanText.json b/duui-anonymize/src/test/results/testGermanText.json deleted file mode 100644 index f59c3220..00000000 --- a/duui-anonymize/src/test/results/testGermanText.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "test": "testGermanText", - "input": "Herr Klaus Muller wohnt in der Goethestrasse 12, 60313 Frankfurt am Main.", - "output": "[private_person][private_person] wohnt in der[private_address][private_address].", - "anomaly_count": 4, - "anomalies": [ - { - "begin": 0, - "end": 10, - "category": "private_person", - "description": "[private_person]", - "text": "Herr Klaus" - }, - { - "begin": 10, - "end": 17, - "category": "private_person", - "description": "[private_person]", - "text": " Muller" - }, - { - "begin": 30, - "end": 67, - "category": "private_address", - "description": "[private_address]", - "text": " 
Goethestrasse 12, 60313 Frankfurt am" - }, - { - "begin": 67, - "end": 72, - "category": "private_address", - "description": "[private_address]", - "text": " Main" - } - ] -} diff --git a/duui-anonymize/src/test/results/testGermanText.xmi b/duui-anonymize/src/test/results/testGermanText.xmi deleted file mode 100644 index d76766e6..00000000 --- a/duui-anonymize/src/test/results/testGermanText.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testMultiplePiiEntities.json b/duui-anonymize/src/test/results/testMultiplePiiEntities.json deleted file mode 100644 index 047322f6..00000000 --- a/duui-anonymize/src/test/results/testMultiplePiiEntities.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "test": "testMultiplePiiEntities", - "input": "Patient: Jane Doe, DOB: 1985-03-22. Contact: jane.doe@hospital.org, Tel: 069-9876-5432. Address: 60325 Frankfurt am Main, Goethe-Platz 1.", - "output": "Patient:[private_person][private_person], DOB: [private_date][private_date]. Contact:[private_email][private_email], Tel: [private_phone][private_phone]. 
Address: [private_address][private_address].", - "anomaly_count": 10, - "anomalies": [ - { - "begin": 8, - "end": 13, - "category": "private_person", - "description": "[private_person]", - "text": " Jane" - }, - { - "begin": 13, - "end": 17, - "category": "private_person", - "description": "[private_person]", - "text": " Doe" - }, - { - "begin": 24, - "end": 32, - "category": "private_date", - "description": "[private_date]", - "text": "1985-03-" - }, - { - "begin": 32, - "end": 34, - "category": "private_date", - "description": "[private_date]", - "text": "22" - }, - { - "begin": 44, - "end": 62, - "category": "private_email", - "description": "[private_email]", - "text": " jane.doe@hospital" - }, - { - "begin": 62, - "end": 66, - "category": "private_email", - "description": "[private_email]", - "text": ".org" - }, - { - "begin": 73, - "end": 85, - "category": "private_phone", - "description": "[private_phone]", - "text": "069-9876-543" - }, - { - "begin": 85, - "end": 86, - "category": "private_phone", - "description": "[private_phone]", - "text": "2" - }, - { - "begin": 97, - "end": 135, - "category": "private_address", - "description": "[private_address]", - "text": "60325 Frankfurt am Main, Goethe-Platz " - }, - { - "begin": 135, - "end": 136, - "category": "private_address", - "description": "[private_address]", - "text": "1" - } - ] -} diff --git a/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi b/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi deleted file mode 100644 index 7e0beb85..00000000 --- a/duui-anonymize/src/test/results/testMultiplePiiEntities.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPhoneNumberRedaction.json b/duui-anonymize/src/test/results/testPhoneNumberRedaction.json deleted file mode 100644 index 025d709a..00000000 --- a/duui-anonymize/src/test/results/testPhoneNumberRedaction.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "test": 
"testPhoneNumberRedaction", - "input": "You can reach Dr. Miller at +49 69 1234 5678 during office hours.", - "redacted": "You can reach[private_person][private_person] at[private_phone][private_phone] during office hours.", - "anomaly_count": 4, - "anomalies": [ - { - "begin": 13, - "end": 17, - "category": "private_person", - "description": "[private_person]", - "text": " Dr." - }, - { - "begin": 17, - "end": 24, - "category": "private_person", - "description": "[private_person]", - "text": " Miller" - }, - { - "begin": 27, - "end": 43, - "category": "private_phone", - "description": "[private_phone]", - "text": " +49 69 1234 567" - }, - { - "begin": 43, - "end": 44, - "category": "private_phone", - "description": "[private_phone]", - "text": "8" - } - ] -} diff --git a/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi b/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi deleted file mode 100644 index 832fbcf6..00000000 --- a/duui-anonymize/src/test/results/testPhoneNumberRedaction.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPlaceholderMode.json b/duui-anonymize/src/test/results/testPlaceholderMode.json deleted file mode 100644 index 2e65f167..00000000 --- a/duui-anonymize/src/test/results/testPlaceholderMode.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testPlaceholderMode", - "input": "Send the report to max.mustermann@uni-frankfurt.de by Friday.", - "output": "Send the report to[private_email][private_email] by Friday.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 18, - "end": 47, - "category": "private_email", - "description": "[private_email]", - "text": " max.mustermann@uni-frankfurt" - }, - { - "begin": 47, - "end": 50, - "category": "private_email", - "description": "[private_email]", - "text": ".de" - } - ] -} diff --git a/duui-anonymize/src/test/results/testPlaceholderMode.xmi b/duui-anonymize/src/test/results/testPlaceholderMode.xmi deleted file mode 
100644 index 113975bb..00000000 --- a/duui-anonymize/src/test/results/testPlaceholderMode.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testPseudoMode.json b/duui-anonymize/src/test/results/testPseudoMode.json deleted file mode 100644 index ff34d374..00000000 --- a/duui-anonymize/src/test/results/testPseudoMode.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testPseudoMode", - "input": "Alice and Bob met at the Frankfurt main station.", - "output": "[private_person] and[private_person] met at the Frankfurt main station.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 0, - "end": 5, - "category": "private_person", - "description": "[private_person]", - "text": "Alice" - }, - { - "begin": 9, - "end": 13, - "category": "private_person", - "description": "[private_person]", - "text": " Bob" - } - ] -} diff --git a/duui-anonymize/src/test/results/testPseudoMode.xmi b/duui-anonymize/src/test/results/testPseudoMode.xmi deleted file mode 100644 index fd9a06c1..00000000 --- a/duui-anonymize/src/test/results/testPseudoMode.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testRemoveMode.json b/duui-anonymize/src/test/results/testRemoveMode.json deleted file mode 100644 index d37ad17d..00000000 --- a/duui-anonymize/src/test/results/testRemoveMode.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "test": "testRemoveMode", - "input": "Call John Smith at john.smith@company.com or +1-800-555-0199 for help.", - "output": "Call[private_person][private_person] at[private_email][private_email] or[private_phone][private_phone] for help.", - "anomaly_count": 6, - "anomalies": [ - { - "begin": 4, - "end": 9, - "category": "private_person", - "description": "[private_person]", - "text": " John" - }, - { - "begin": 9, - "end": 15, - "category": "private_person", - "description": "[private_person]", - "text": " Smith" - }, - { - "begin": 18, - "end": 37, - "category": 
"private_email", - "description": "[private_email]", - "text": " john.smith@company" - }, - { - "begin": 37, - "end": 41, - "category": "private_email", - "description": "[private_email]", - "text": ".com" - }, - { - "begin": 44, - "end": 59, - "category": "private_phone", - "description": "[private_phone]", - "text": " +1-800-555-019" - }, - { - "begin": 59, - "end": 60, - "category": "private_phone", - "description": "[private_phone]", - "text": "9" - } - ] -} diff --git a/duui-anonymize/src/test/results/testRemoveMode.xmi b/duui-anonymize/src/test/results/testRemoveMode.xmi deleted file mode 100644 index ec65475f..00000000 --- a/duui-anonymize/src/test/results/testRemoveMode.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testSelectionWindow.json b/duui-anonymize/src/test/results/testSelectionWindow.json deleted file mode 100644 index 3c9d39bf..00000000 --- a/duui-anonymize/src/test/results/testSelectionWindow.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "test": "testSelectionWindow", - "input": "Call Dr. John Adams at 555-0100 today.", - "output": "Call[private_person][private_person] at [private_phone][private_phone].", - "anomaly_count": 4, - "anomalies": [ - { - "begin": 4, - "end": 13, - "category": "private_person", - "description": "[private_person]", - "text": " Dr. 
John" - }, - { - "begin": 13, - "end": 19, - "category": "private_person", - "description": "[private_person]", - "text": " Adams" - }, - { - "begin": 23, - "end": 31, - "category": "private_phone", - "description": "[private_phone]", - "text": "555-0100" - }, - { - "begin": 31, - "end": 37, - "category": "private_phone", - "description": "[private_phone]", - "text": " today" - } - ] -} diff --git a/duui-anonymize/src/test/results/testSelectionWindow.xmi b/duui-anonymize/src/test/results/testSelectionWindow.xmi deleted file mode 100644 index 080206f1..00000000 --- a/duui-anonymize/src/test/results/testSelectionWindow.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testSimplePerson.json b/duui-anonymize/src/test/results/testSimplePerson.json deleted file mode 100644 index 9a671f3a..00000000 --- a/duui-anonymize/src/test/results/testSimplePerson.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "test": "testSimplePerson", - "input": "My name is Harry Potter and my email is harry.potter@hogwarts.edu.", - "output": "My name is[private_person][private_person] and my email is[private_email][private_email].", - "anomaly_count": 4, - "anomalies": [ - { - "begin": 10, - "end": 16, - "category": "private_person", - "description": "[private_person]", - "text": " Harry" - }, - { - "begin": 16, - "end": 23, - "category": "private_person", - "description": "[private_person]", - "text": " Potter" - }, - { - "begin": 39, - "end": 61, - "category": "private_email", - "description": "[private_email]", - "text": " harry.potter@hogwarts" - }, - { - "begin": 61, - "end": 65, - "category": "private_email", - "description": "[private_email]", - "text": ".edu" - } - ] -} diff --git a/duui-anonymize/src/test/results/testSimplePerson.xmi b/duui-anonymize/src/test/results/testSimplePerson.xmi deleted file mode 100644 index c6ecea54..00000000 --- a/duui-anonymize/src/test/results/testSimplePerson.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline 
at end of file diff --git a/duui-anonymize/src/test/results/testSimplePersonName.json b/duui-anonymize/src/test/results/testSimplePersonName.json deleted file mode 100644 index cd1e49a6..00000000 --- a/duui-anonymize/src/test/results/testSimplePersonName.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testSimplePersonName", - "input": "John Smith called the bank to report a fraud.", - "redacted": "[private_person][private_person] called the bank to report a fraud.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 0, - "end": 4, - "category": "private_person", - "description": "[private_person]", - "text": "John" - }, - { - "begin": 4, - "end": 10, - "category": "private_person", - "description": "[private_person]", - "text": " Smith" - } - ] -} diff --git a/duui-anonymize/src/test/results/testSimplePersonName.xmi b/duui-anonymize/src/test/results/testSimplePersonName.xmi deleted file mode 100644 index 45df7107..00000000 --- a/duui-anonymize/src/test/results/testSimplePersonName.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeAccountNumber.json b/duui-anonymize/src/test/results/testTypeAccountNumber.json deleted file mode 100644 index c9fb1855..00000000 --- a/duui-anonymize/src/test/results/testTypeAccountNumber.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testTypeAccountNumber", - "input": "Please transfer funds to account number 4532-0151-1283-0366 at Deutsche Bank.", - "output": "Please transfer funds to account number [account_number][account_number] at Deutsche Bank.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 40, - "end": 58, - "category": "account_number", - "description": "[account_number]", - "text": "4532-0151-1283-036" - }, - { - "begin": 58, - "end": 59, - "category": "account_number", - "description": "[account_number]", - "text": "6" - } - ] -} diff --git a/duui-anonymize/src/test/results/testTypeAccountNumber.xmi 
b/duui-anonymize/src/test/results/testTypeAccountNumber.xmi deleted file mode 100644 index fe02421c..00000000 --- a/duui-anonymize/src/test/results/testTypeAccountNumber.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeAddress.json b/duui-anonymize/src/test/results/testTypeAddress.json deleted file mode 100644 index b0021aee..00000000 --- a/duui-anonymize/src/test/results/testTypeAddress.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testTypeAddress", - "input": "She lives at 742 Evergreen Terrace, Springfield, IL 62704.", - "output": "She lives at [private_address][private_address].", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 13, - "end": 55, - "category": "private_address", - "description": "[private_address]", - "text": "742 Evergreen Terrace, Springfield, IL 627" - }, - { - "begin": 55, - "end": 57, - "category": "private_address", - "description": "[private_address]", - "text": "04" - } - ] -} diff --git a/duui-anonymize/src/test/results/testTypeAddress.xmi b/duui-anonymize/src/test/results/testTypeAddress.xmi deleted file mode 100644 index 45f6b8f8..00000000 --- a/duui-anonymize/src/test/results/testTypeAddress.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeDate.json b/duui-anonymize/src/test/results/testTypeDate.json deleted file mode 100644 index 3ae5700c..00000000 --- a/duui-anonymize/src/test/results/testTypeDate.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "test": "testTypeDate", - "input": "Jane Doe was born on March 15, 1990 in Chicago.", - "output": "[private_person][private_person] was born on[private_date][private_date] in Chicago.", - "anomaly_count": 4, - "anomalies": [ - { - "begin": 0, - "end": 4, - "category": "private_person", - "description": "[private_person]", - "text": "Jane" - }, - { - "begin": 4, - "end": 8, - "category": "private_person", - "description": "[private_person]", - "text": " Doe" - 
}, - { - "begin": 20, - "end": 34, - "category": "private_date", - "description": "[private_date]", - "text": " March 15, 199" - }, - { - "begin": 34, - "end": 35, - "category": "private_date", - "description": "[private_date]", - "text": "0" - } - ] -} diff --git a/duui-anonymize/src/test/results/testTypeDate.xmi b/duui-anonymize/src/test/results/testTypeDate.xmi deleted file mode 100644 index 09708693..00000000 --- a/duui-anonymize/src/test/results/testTypeDate.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeEmail.json b/duui-anonymize/src/test/results/testTypeEmail.json deleted file mode 100644 index ee484679..00000000 --- a/duui-anonymize/src/test/results/testTypeEmail.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testTypeEmail", - "input": "Please contact alice@example.com for further assistance.", - "output": "Please contact[private_email][private_email] for further assistance.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 14, - "end": 28, - "category": "private_email", - "description": "[private_email]", - "text": " alice@example" - }, - { - "begin": 28, - "end": 32, - "category": "private_email", - "description": "[private_email]", - "text": ".com" - } - ] -} diff --git a/duui-anonymize/src/test/results/testTypeEmail.xmi b/duui-anonymize/src/test/results/testTypeEmail.xmi deleted file mode 100644 index 2ef3adcd..00000000 --- a/duui-anonymize/src/test/results/testTypeEmail.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypePerson.json b/duui-anonymize/src/test/results/testTypePerson.json deleted file mode 100644 index cac4b168..00000000 --- a/duui-anonymize/src/test/results/testTypePerson.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testTypePerson", - "input": "John Smith called the bank to report a fraud.", - "output": "[private_person][private_person] called the bank to report a fraud.", - "anomaly_count": 2, 
- "anomalies": [ - { - "begin": 0, - "end": 4, - "category": "private_person", - "description": "[private_person]", - "text": "John" - }, - { - "begin": 4, - "end": 10, - "category": "private_person", - "description": "[private_person]", - "text": " Smith" - } - ] -} diff --git a/duui-anonymize/src/test/results/testTypePerson.xmi b/duui-anonymize/src/test/results/testTypePerson.xmi deleted file mode 100644 index 02f64cc8..00000000 --- a/duui-anonymize/src/test/results/testTypePerson.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypePhone.json b/duui-anonymize/src/test/results/testTypePhone.json deleted file mode 100644 index 537891ff..00000000 --- a/duui-anonymize/src/test/results/testTypePhone.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "test": "testTypePhone", - "input": "You can reach Dr. Miller at +49 69 1234 5678 during office hours.", - "output": "You can reach[private_person][private_person] at[private_phone][private_phone] during office hours.", - "anomaly_count": 4, - "anomalies": [ - { - "begin": 13, - "end": 17, - "category": "private_person", - "description": "[private_person]", - "text": " Dr." 
- }, - { - "begin": 17, - "end": 24, - "category": "private_person", - "description": "[private_person]", - "text": " Miller" - }, - { - "begin": 27, - "end": 43, - "category": "private_phone", - "description": "[private_phone]", - "text": " +49 69 1234 567" - }, - { - "begin": 43, - "end": 44, - "category": "private_phone", - "description": "[private_phone]", - "text": "8" - } - ] -} diff --git a/duui-anonymize/src/test/results/testTypePhone.xmi b/duui-anonymize/src/test/results/testTypePhone.xmi deleted file mode 100644 index fd689e5a..00000000 --- a/duui-anonymize/src/test/results/testTypePhone.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeSecret.json b/duui-anonymize/src/test/results/testTypeSecret.json deleted file mode 100644 index 25b8f1fb..00000000 --- a/duui-anonymize/src/test/results/testTypeSecret.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "test": "testTypeSecret", - "input": "The API key is sk-proj-abc123XYZ987 and the password is H@nt3r2secure!.", - "output": "The API key is[secret][secret] and the password is[secret][secret]", - "anomaly_count": 4, - "anomalies": [ - { - "begin": 14, - "end": 32, - "category": "secret", - "description": "[secret]", - "text": " sk-proj-abc123XYZ" - }, - { - "begin": 32, - "end": 35, - "category": "secret", - "description": "[secret]", - "text": "987" - }, - { - "begin": 55, - "end": 69, - "category": "secret", - "description": "[secret]", - "text": " H@nt3r2secure" - }, - { - "begin": 69, - "end": 71, - "category": "secret", - "description": "[secret]", - "text": "!." 
- } - ] -} diff --git a/duui-anonymize/src/test/results/testTypeSecret.xmi b/duui-anonymize/src/test/results/testTypeSecret.xmi deleted file mode 100644 index 0318ff59..00000000 --- a/duui-anonymize/src/test/results/testTypeSecret.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testTypeUrl.json b/duui-anonymize/src/test/results/testTypeUrl.json deleted file mode 100644 index bc2b97ea..00000000 --- a/duui-anonymize/src/test/results/testTypeUrl.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test": "testTypeUrl", - "input": "My personal page is at https://janedoe.personal-site.com/about and I post there.", - "output": "My personal page is at[private_url][private_url] and I post there.", - "anomaly_count": 2, - "anomalies": [ - { - "begin": 22, - "end": 56, - "category": "private_url", - "description": "[private_url]", - "text": " https://janedoe.personal-site.com" - }, - { - "begin": 56, - "end": 62, - "category": "private_url", - "description": "[private_url]", - "text": "/about" - } - ] -} diff --git a/duui-anonymize/src/test/results/testTypeUrl.xmi b/duui-anonymize/src/test/results/testTypeUrl.xmi deleted file mode 100644 index 6b6ef186..00000000 --- a/duui-anonymize/src/test/results/testTypeUrl.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/duui-anonymize/src/test/results/testXmiOutput.json b/duui-anonymize/src/test/results/testXmiOutput.json deleted file mode 100644 index d53f9c66..00000000 --- a/duui-anonymize/src/test/results/testXmiOutput.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "test": "testXmiOutput", - "input": "Maria Schmidt (m.schmidt@example.de) lives at Berliner Str. 
5, 10115 Berlin.", - "output": "[private_person][private_person] ([private_email][private_email]) lives at[private_address][private_address]", - "anomaly_count": 6, - "anomalies": [ - { - "begin": 0, - "end": 5, - "category": "private_person", - "description": "[private_person]", - "text": "Maria" - }, - { - "begin": 5, - "end": 13, - "category": "private_person", - "description": "[private_person]", - "text": " Schmidt" - }, - { - "begin": 15, - "end": 32, - "category": "private_email", - "description": "[private_email]", - "text": "m.schmidt@example" - }, - { - "begin": 32, - "end": 35, - "category": "private_email", - "description": "[private_email]", - "text": ".de" - }, - { - "begin": 45, - "end": 75, - "category": "private_address", - "description": "[private_address]", - "text": " Berliner Str. 5, 10115 Berlin" - }, - { - "begin": 75, - "end": 76, - "category": "private_address", - "description": "[private_address]", - "text": "." - } - ] -} diff --git a/duui-anonymize/src/test/results/testXmiOutput.xmi b/duui-anonymize/src/test/results/testXmiOutput.xmi deleted file mode 100644 index e79bf930..00000000 --- a/duui-anonymize/src/test/results/testXmiOutput.xmi +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file