Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions duui-anonymize/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/../*
../
../*
../duui-mm/*
.venv/**
.vscode/**
__pycache__/**
*.pyc

target/**
dist/**
build/**

src/test/results/**
39 changes: 39 additions & 0 deletions duui-anonymize/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#### OpenAI Privacy Filter component for DUUI

OpenAI Privacy Filter: https://github.com/openai/privacy-filter

#### Input/Output:

input: Text in the Sofa. Optional selection offsets can be passed through Lua options.

output: structured redaction spans and redacted text

#### Output Shape:

Privacy Filter detects 8 privacy span categories:

- `account_number`
- `private_address`
- `private_email`
- `private_person`
- `private_phone`
- `private_url`
- `private_date`
- `secret`

The model emits BIOES token classes for these categories plus `O`, and the service turns the resulting spans into DUUI annotations and redacted text.

#### Parameter:

[optional] OPF redaction options such as `model`, `context_window_length`, `trim_whitespace`, `device`, `output_mode`, `decode_mode`, `discard_overlapping_predicted_spans`, `viterbi_calibration_path`, and selection offsets (`selection_begin` / `selection_end`).

#### Modes:

- `replacement`: default mode, replaces detected spans with a consistent placeholder.
- `pseudo`: kept as a stub / TODO mode and currently returns the input unchanged.
- `mode` is passed through Lua options.

#### Entry points:

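- `src/main/python/duui_anonymize.py`: service entry point; the Docker images start it with uvicorn as `duui_anonymize:app` on port 9714.
- `src/main/python/communication.lua`: Lua script that serializes the CAS into the JSON request and writes the service response back into the CAS.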
138 changes: 138 additions & 0 deletions duui-anonymize/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.texttechnology</groupId>
<artifactId>duui-anonymize</artifactId>
<version>1.0-SNAPSHOT</version>

<licenses>
<license>
<name>AGPL-3.0-or-later</name>
<url>https://www.gnu.org/licenses/agpl.txt</url>
<distribution>repo</distribution>
<comments>GNU Affero General Public License v3.0 or later</comments>
</license>
</licenses>

<organization>
<name>Texttechnology Lab</name>
<url>https://www.texttechnologylab.org</url>
</organization>

<developers>
<developer>
<id>mehler</id>
<name>Prof. Dr. Alexander Mehler</name>
<email>mehler@em.uni-frankfurt.de</email>
<url>https://www.texttechnologylab.org/team/alexander-abrami/</url>
<organization>Goethe University Frankfurt / Texttechnology Lab</organization>
<organizationUrl>https://www.texttechnologylab.org</organizationUrl>
<roles>
<role>head of department</role>
</roles>
</developer>
<developer>
<id>aabusale</id>
<name>Ali Abusaleh</name>
<email>a.abusaleh@em.uni-frankfurt.de</email>
<url>https://www.texttechnologylab.org/team/ali-abusaleh/</url>
<organization>Goethe University Frankfurt / Texttechnology Lab</organization>
<organizationUrl>https://www.texttechnologylab.org</organizationUrl>
<roles>
<role>Research assistant</role>
</roles>
<timezone>Europe/Berlin</timezone>
</developer>
</developers>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.0</version>
<configuration>
<argLine>
--illegal-access=permit
--add-opens java.base/java.util=ALL-UNNAMED
</argLine>
</configuration>
</plugin>
</plugins>
</build>

<properties>
<maven.compiler.source>21</maven.compiler.source>
<maven.compiler.target>21</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dkpro.core.version>2.4.0</dkpro.core.version>
</properties>

<repositories>
<repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
</repository>
</repositories>

<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-asl</artifactId>
<version>${dkpro.core.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>

<dependencies>
<dependency>
<groupId>com.github.texttechnologylab</groupId>
<artifactId>DockerUnifiedUIMAInterface</artifactId>
<version>fac60bef3f</version>
</dependency>

<dependency>
<groupId>com.github.texttechnologylab</groupId>
<artifactId>UIMATypeSystem</artifactId>
<version>3.0.5</version>
</dependency>

<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.9.0</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-anomaly-asl</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-segmentation-asl</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-io-xmi-asl</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-resources-asl</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

</project>
14 changes: 14 additions & 0 deletions duui-anonymize/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
numpy
dkpro_cassis
fastapi
pydantic
pydantic-settings
pydantic_core
starlette
uvicorn
torch
torchvision
torchaudio
transformers
accelerate
setuptools
16 changes: 16 additions & 0 deletions duui-anonymize/src/main/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
FROM python:3.10

WORKDIR /usr/src/app

EXPOSE 9714

COPY ./src/main/python/communication.lua ./communication.lua
COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py
COPY ./src/main/python/typesystem.xml ./typesystem.xml
COPY ./requirements.txt ./requirements.txt

RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cpu
RUN pip install -r requirements.txt

ENTRYPOINT ["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"]
CMD ["--workers", "1"]
26 changes: 26 additions & 0 deletions duui-anonymize/src/main/docker/Dockerfile-cuda
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

RUN apt update && \
DEBIAN_FRONTEND=noninteractive \
apt install --no-install-recommends -y build-essential software-properties-common && \
add-apt-repository -y ppa:deadsnakes/ppa && \
apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils && \
apt clean && rm -rf /var/lib/apt/lists/*

RUN ln -s /usr/bin/python3 /usr/bin/python
RUN python -m pip install --upgrade pip

WORKDIR /usr/src/app

EXPOSE 9714

COPY ./src/main/python/communication.lua ./communication.lua
COPY ./src/main/python/duui_anonymize.py ./duui_anonymize.py
COPY ./src/main/python/typesystem.xml ./typesystem.xml
COPY ./requirements.txt ./requirements.txt

RUN pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118
RUN pip install -r requirements.txt

ENTRYPOINT ["uvicorn", "duui_anonymize:app", "--host", "0.0.0.0", "--port" ,"9714"]
CMD ["--workers", "1"]
110 changes: 110 additions & 0 deletions duui-anonymize/src/main/python/communication.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
-- Bind static classes from java
StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets")
util = luajava.bindClass("org.apache.uima.fit.util.JCasUtil")

-- Look up `key` in `params` and return its value as a string, or nil when
-- `params` is nil or the key is absent.
--
-- Plain Lua tables answer the direct index. For anything that does not (for
-- example a Java Map exposed through LuaJ), the lookup is retried through a
-- `params:get(key)` method call; that call is wrapped in pcall so containers
-- without a `get` method simply yield nil.
local function param_get(params, key)
    if params ~= nil then
        -- First attempt: ordinary table indexing.
        local direct = params[key]
        if direct ~= nil then
            return tostring(direct)
        end

        -- Second attempt: method-style lookup; any error counts as "not found".
        local succeeded, fetched = pcall(function()
            return params:get(key)
        end)
        if succeeded and fetched ~= nil then
            return tostring(fetched)
        end
    end
    return nil
end

-- Known option keys forwarded to the Python service.
-- Only keys listed here are copied out of the DUUI parameters; anything else
-- is dropped. `decode_mode` and `viterbi_calibration_path` are included
-- because the component README documents them as supported OPF options.
-- NOTE(review): confirm the Python service accepts these two keys.
local OPTION_KEYS = {
    "mode", "model", "device",
    "context_window_length", "trim_whitespace",
    "output_mode", "decode_mode",
    "discard_overlapping_predicted_spans",
    "viterbi_calibration_path",
}

-- Build the options table sent to the Python service.
-- Walks OPTION_KEYS in order, reads each key from `params` via param_get
-- (so values arrive as strings) and keeps only the keys that are present.
-- Every forwarded option is echoed to stdout.
local function copy_options(params)
    local forwarded = {}
    print("Copying options:")
    for index = 1, #OPTION_KEYS do
        local name = OPTION_KEYS[index]
        local setting = param_get(params, name)
        if setting ~= nil then
            print(" ", name, "=", setting)
            forwarded[name] = setting
        end
    end
    return forwarded
end

-- Resolve the optional selection range from `params`.
--
-- Two input shapes are accepted:
--   * a nested Lua table under "selection" with begin/end (or start/stop);
--   * flat keys selection_begin/selection_end (or selection_start/selection_stop).
--
-- Returns { begin = <number>, ["end"] = <number> } or nil when no complete,
-- numeric range is available. Both shapes are normalised to numbers so the
-- service always receives the same types, and a non-numeric offset yields
-- nil instead of a table with missing fields.
local function resolve_selection(params)
    if params == nil then return nil end

    -- Convert a pair of raw offsets to a range table; nil unless both are numeric.
    local function to_range(raw_begin, raw_end)
        if raw_begin == nil or raw_end == nil then return nil end
        local b, e = tonumber(raw_begin), tonumber(raw_end)
        if b == nil or e == nil then return nil end
        return { begin = b, ["end"] = e }
    end

    -- selection passed as a nested table
    local selection = params["selection"]
    if selection == nil then
        -- Retry through a method call for containers that do not support
        -- direct indexing; errors (e.g. no `get` method) are ignored.
        local ok, r = pcall(function() return params:get("selection") end)
        if ok then selection = r end
    end
    if type(selection) == "table" then
        local range = to_range(
            selection["begin"] or selection["start"],
            selection["end"] or selection["stop"])
        if range ~= nil then
            return range
        end
    end

    -- selection passed as flat begin/end keys
    return to_range(
        param_get(params, "selection_begin") or param_get(params, "selection_start"),
        param_get(params, "selection_end") or param_get(params, "selection_stop"))
end

-- Write the JSON request for the Python service to `outputStream`.
-- The payload carries the sofa text of `inputCas` (empty string when the CAS
-- has no sofa string), the whitelisted options from `params`, and the
-- optional selection range (left out of the payload table when nil).
function serialize(inputCas, outputStream, params)
    local document_text = inputCas:getSofaDataString()
    if document_text == nil then
        document_text = ""
    end

    -- Options are gathered before the selection so the log output order is unchanged.
    local forwarded_options = copy_options(params)
    local selected_range = resolve_selection(params)

    local payload = {
        text = document_text,
        options = forwarded_options,
        selection = selected_range,
    }
    outputStream:write(json.encode(payload))
end

-- Apply the JSON response of the Python service to the CAS.
--
-- * "redacted_text", when present, becomes the sofa string of a new view
--   named "opf_redacted". View creation runs under pcall; if it fails the
--   redacted text is skipped and processing continues.
-- * Each entry of "detected_spans" becomes a DKPro Anomaly annotation created
--   on `inputCas` itself, using the span's start/end offsets and label.
function deserialize(inputCas, inputStream)
    local raw_bytes = inputStream:readAllBytes()
    local response_json = luajava.newInstance("java.lang.String", raw_bytes, StandardCharsets.UTF_8)
    local response = json.decode(response_json)

    -- Redacted text goes into its own view.
    local redacted_text = response["redacted_text"]
    if redacted_text ~= nil then
        local created, redacted_view = pcall(function()
            return inputCas:createView("opf_redacted")
        end)
        if created and redacted_view ~= nil then
            redacted_view:setSofaDataString(redacted_text, "text/plain")
        end
    end

    local spans = response["detected_spans"]
    if spans == nil then
        return
    end

    for _, span in ipairs(spans) do
        -- Description: the placeholder if it is non-empty, otherwise the
        -- span text, otherwise the label.
        local description = span["placeholder"]
        if not description or description == "" then
            description = span["text"] or span["label"]
        end

        local annotation = luajava.newInstance(
            "de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.Anomaly", inputCas)
        annotation:setBegin(span["start"])
        annotation:setEnd(span["end"])
        annotation:setCategory(span["label"])
        annotation:setDescription(description)
        annotation:addToIndexes()
    end
end
Loading