Skip to content

Commit c0a6889

Browse files
author
zhanglongbin
committed
2 parents 5564ad1 + fe8f1bf commit c0a6889

37 files changed

Lines changed: 233 additions & 1329 deletions

.github/workflows/build.yml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
name: Docker Image CI
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
tags:
8+
- 'v[0-9]+.[0-9]+.[0-9]+'
9+
10+
jobs:
11+
build:
12+
runs-on: ubuntu-latest
13+
14+
steps:
15+
- name: Checkout repository
16+
uses: actions/checkout@v4
17+
18+
- name: Set up QEMU
19+
uses: docker/setup-qemu-action@v3
20+
21+
- name: Set up Docker Buildx
22+
uses: docker/setup-buildx-action@v3
23+
24+
- name: Log in to Docker Hub
25+
uses: docker/login-action@v3
26+
with:
27+
username: ${{ secrets.DOCKER_USERNAME }}
28+
password: ${{ secrets.DOCKER_PASSWORD }}
29+
30+
- name: Log in to ACR
31+
uses: docker/login-action@v3
32+
with:
33+
username: ${{ secrets.ACR_USERNAME }}
34+
password: ${{ secrets.ACR_PASSWORD }}
35+
registry: ${{ secrets.ACR_REGISTRY }}
36+
37+
- name: Set Docker image tag
38+
id: tag
39+
run: |
40+
if [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then
41+
echo "TAGS=opencsghq/dataflow:${GITHUB_REF_NAME},${{ secrets.ACR_REGISTRY }}/opencsghq/dataflow:${GITHUB_REF_NAME}" >> $GITHUB_ENV
42+
else
43+
echo "TAGS=opencsghq/dataflow:latest,${{ secrets.ACR_REGISTRY }}/opencsghq/dataflow:latest" >> $GITHUB_ENV
44+
fi
45+
46+
- name: Build and push Docker image
47+
uses: docker/build-push-action@v6
48+
with:
49+
context: .
50+
file: ./Dockerfile
51+
push: true
52+
provenance: false
53+
tags: ${{ env.TAGS }}
54+
platforms: linux/amd64,linux/arm64

Dockerfile

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
FROM swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/python:3.10.14
2-
# FROM python:3.10.14
1+
ARG BUILD_CN=false
2+
3+
FROM docker.io/python:3.10.14
4+
35
# prepare the java env
46
WORKDIR /opt
57
# download jdk
@@ -13,22 +15,33 @@ ENV JAVA_HOME=/opt/jdk
1315

1416
WORKDIR /dataflow
1517

16-
17-
RUN echo "deb http://mirrors.aliyun.com/debian bookworm main contrib non-free" > /etc/apt/sources.list && \
18-
echo "deb http://mirrors.aliyun.com/debian-security bookworm-security main contrib non-free" >> /etc/apt/sources.list && \
19-
echo "deb http://mirrors.aliyun.com/debian bookworm-updates main contrib non-free" >> /etc/apt/sources.list
18+
# Optionally switch apt to the Aliyun mirror when building with --build-arg BUILD_CN=true.
# BUILD_CN is declared before FROM, which makes it visible to FROM lines only; it has to be
# re-declared inside the stage, otherwise "$BUILD_CN" expands to an empty string here and in
# every later RUN of this stage. The value-less form inherits the pre-FROM default.
ARG BUILD_CN
RUN if [ "$BUILD_CN" = "true" ]; then \
19+
echo "deb http://mirrors.aliyun.com/debian bookworm main contrib non-free" > /etc/apt/sources.list; \
20+
echo "deb http://mirrors.aliyun.com/debian-security bookworm-security main contrib non-free" >> /etc/apt/sources.list; \
21+
echo "deb http://mirrors.aliyun.com/debian bookworm-updates main contrib non-free" >> /etc/apt/sources.list; \
22+
fi
2023

2124
# install 3rd-party system dependencies
2225
# RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 libpq-dev -y
23-
RUN apt-get update && apt-get install libpq-dev libgl1-mesa-glx -y
24-
RUN apt install git-lfs && git lfs install && apt clean && rm -rf /var/lib/apt/lists/*
26+
RUN apt-get update && \
27+
apt-get install --no-install-recommends -y \
28+
libpq-dev \
29+
libgl1-mesa-glx \
30+
git-lfs && \
31+
apt-get clean && rm -rf /var/lib/apt/lists/* && \
32+
git lfs install
2533

2634
# install data-flow then
2735
COPY . .
2836

29-
ENV PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/
3037
# Install deps
31-
RUN pip install --no-cache-dir --use-deprecated=legacy-resolver -r docker/dataflow_requirements.txt
38+
# RUN pip install --no-cache-dir --use-deprecated=legacy-resolver -r docker/dataflow_requirements.txt
39+
RUN if [ "$BUILD_CN" = "true" ]; then \
40+
pip install --no-cache-dir -r docker/dataflow_requirements.txt -i https://mirrors.aliyun.com/pypi/simple/; \
41+
else \
42+
pip install --no-cache-dir -r docker/dataflow_requirements.txt; \
43+
fi
44+
3245
# compile code
3346
# RUN python -m compileall .
3447
# RUN find ./ -name "*.py" -delete
@@ -37,9 +50,9 @@ RUN pip install --no-cache-dir --use-deprecated=legacy-resolver -r docker/datafl
3750
#ENV PLAYWRIGHT_DOWNLOAD_HOST=https://storage.aliyun.com/playwright
3851
#RUN playwright install --with-deps
3952

40-
RUN git config --global user.email "dataflow@opencsg.com"
41-
RUN git config --global user.name "dataflow"
42-
RUN git config --global --add safe.directory '*'
53+
RUN git config --global user.email "dataflow@opencsg.com" && \
54+
git config --global user.name "dataflow" && \
55+
git config --global --add safe.directory '*'
4356

4457
# Start fastapi API Server
4558
EXPOSE 8000

README.md

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,11 @@ This project inherits the [Apache License 2.0](LICENSE) from Data Juicer.
3232

3333
```
3434
docker build -t dataflow . -f Dockerfile
35-
```
3635
36+
docker buildx build --provenance false --platform linux/amd64 -t dataflow . -f Dockerfile
37+
38+
docker buildx build --provenance false --platform linux/arm64 -t dataflow . -f Dockerfile
39+
```
3740

3841
## Prerequisites
3942

@@ -42,7 +45,7 @@ Launch postgres container
4245
```bash
4346
docker run -d --name dataflow-pg \
4447
-p 5433:5432 \
45-
-v /home/pgdata:/var/lib/postgresql/data \
48+
-v /tmp/data_flow/pgdata:/var/lib/postgresql/data \
4649
-e POSTGRES_DB=data_flow \
4750
-e POSTGRES_USER=postgres \
4851
-e POSTGRES_PASSWORD=postgres \
@@ -54,7 +57,7 @@ Launch mongoDB container
5457
```bash
5558
docker run -d --name dataflow-mongo \
5659
-p 27017:27017 \
57-
-v /home/mongodata:/data/db \
60+
-v /tmp/data_flow/mongodata:/data/db \
5861
-e MONGO_INITDB_ROOT_USERNAME=root \
5962
-e MONGO_INITDB_ROOT_PASSWORD=example \
6063
opencsg-registry.cn-beijing.cr.aliyuncs.com/opencsghq/mongo:8.0.12
@@ -64,8 +67,8 @@ Launch redis container
6467

6568
```bash
6669
docker run -d --name dataflow-redis \
67-
-p 6379:6379 \
68-
-v /home/redisdata:/data \
70+
-p 16379:6379 \
71+
-v /tmp/data_flow/redisdata:/data \
6972
opencsg-registry.cn-beijing.cr.aliyuncs.com/opencsghq/redis:7.2.5
7073
```
7174

@@ -74,14 +77,14 @@ docker run -d --name dataflow-redis \
7477
```bash
7578

7679
docker run -d --name dataflow-api -p 8000:8000 \
77-
-v /home/apidata:/data/dataflow_data \
80+
-v /tmp/data_flow/apidata:/data/dataflow_data \
7881
-c "uvicorn data_server.main:app --host 0.0.0.0 --port 8000" \
7982
-e DATA_DIR=/data/dataflow_data \
8083
-e CSGHUB_ENDPOINT=https://hub.opencsg.com \
8184
-e MAX_WORKERS=99 \
8285
-e RAY_ADDRESS=auto \
8386
-e RAY_ENABLE=False \
84-
-e RAY_LOG_DIR=/home/output \
87+
-e RAY_LOG_DIR=/data/ray_output \
8588
-e API_SERVER=0.0.0.0 \
8689
-e API_PORT=8000 \
8790
-e ENABLE_OPENTELEMETRY=False \
@@ -91,7 +94,7 @@ docker run -d --name dataflow-api -p 8000:8000 \
9194
-e DATABASE_HOSTNAME=127.0.0.1 \
9295
-e DATABASE_PORT=5433 \
9396
-e STUDIO_JUMP_URL=https://data-label.opencsg.com \
94-
-e REDIS_HOST_URL=redis://127.0.0.1:6379 \
97+
-e REDIS_HOST_URL=redis://127.0.0.1:16379 \
9598
-e MONG_HOST_URL=mongodb://root:example@127.0.0.1:27017 \
9699
dataflow
97100

@@ -102,14 +105,14 @@ docker run -d --name dataflow-api -p 8000:8000 \
102105
```bash
103106

104107
docker run -d --name celery-work -p 8001:8001 \
105-
-v /home/celery-data:/data/dataflow_celery \
108+
-v /tmp/data_flow/celery-data:/data/dataflow_celery \
106109
-c "celery -A data_celery.main:celery_app worker --loglevel=info --pool=gevent" \
107110
-e DATA_DIR=/data/dataflow_celery \
108111
-e CSGHUB_ENDPOINT=https://hub.opencsg.com \
109112
-e MAX_WORKERS=99 \
110113
-e RAY_ADDRESS=auto \
111114
-e RAY_ENABLE=False \
112-
-e RAY_LOG_DIR=/home/output \
115+
-e RAY_LOG_DIR=/data/ray_output \
113116
-e API_SERVER=0.0.0.0 \
114117
-e API_PORT=8001 \
115118
-e ENABLE_OPENTELEMETRY=False \
@@ -118,23 +121,35 @@ docker run -d --name celery-work -p 8001:8001 \
118121
-e DATABASE_PASSWORD=postgres \
119122
-e DATABASE_HOSTNAME=127.0.0.1 \
120123
-e DATABASE_PORT=5433 \
121-
-e REDIS_HOST_URL=redis://127.0.0.1:6379 \
124+
-e REDIS_HOST_URL=redis://127.0.0.1:16379 \
122125
-e MONG_HOST_URL=mongodb://root:example@127.0.0.1:27017 \
123126
dataflow-celery
124127

125128
```
126129

127130
## Run data-flow server in development mode locally
128131

132+
### Create a Virtual Environment
133+
129134
```bash
130-
# Create virtual python 3.10 environment
135+
uv venv --python 3.10
136+
137+
source .venv/bin/activate
138+
139+
# or
140+
131141
conda create -n dataflow python=3.10
142+
```
143+
144+
```bash
132145

133146
# Install dependencies
134-
pip install '.[dist]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
135-
pip install '.[tools]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
136-
pip install '.[sci]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
137-
pip install -r docker/requirements.txt
147+
#pip install '.[dist]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
148+
#pip install '.[tools]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
149+
#pip install '.[sci]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
150+
#pip install -r docker/requirements.txt
151+
152+
uv pip install -r docker/dataflow_requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
138153

139154
# Run the server locally
140155
uvicorn data_server.main:app --reload

data_agents/__init__.py

Whitespace-only changes.

data_agents/administrator_agent.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

0 commit comments

Comments
 (0)