Skip to content

Commit 5e5d348

Browse files
committed
use spark base image
1 parent 03c1da9 commit 5e5d348

3 files changed

Lines changed: 55 additions & 93 deletions

File tree

dev/Dockerfile

Lines changed: 54 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -13,81 +13,66 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
FROM python:3.12-bullseye
16+
ARG IMAGE_SPARK_VERSION=3.5.6
1717

18-
RUN apt-get -qq update && \
19-
apt-get -qq install -y --no-install-recommends \
20-
sudo \
21-
curl \
22-
vim \
23-
unzip \
24-
openjdk-11-jdk \
25-
build-essential \
26-
software-properties-common \
27-
ssh && \
28-
apt-get -qq clean && \
29-
rm -rf /var/lib/apt/lists/*
18+
FROM apache/spark:${IMAGE_SPARK_VERSION}
3019

31-
# Optional env variables
32-
ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
33-
ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
34-
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
20+
ARG SPARK_VERSION=3.5.6
21+
ARG SCALA_VERSION=2.12
22+
ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
23+
ARG ICEBERG_VERSION=1.10.0
24+
ARG HADOOP_VERSION=3.3.4
25+
ARG AWS_SDK_VERSION=1.12.753
26+
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
3527

36-
RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
37-
WORKDIR ${SPARK_HOME}
28+
# Install dependencies and download JARs in single layer
29+
USER root
30+
RUN apt-get update && \
31+
apt-get install -y --no-install-recommends \
32+
wget \
33+
curl && \
34+
# Create temporary directory for downloads
35+
mkdir -p /tmp/jars && \
36+
cd /tmp/jars && \
37+
# Download JARs with error handling
38+
for url in \
39+
"${MAVEN_MIRROR}/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar" \
40+
"${MAVEN_MIRROR}/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
41+
"${MAVEN_MIRROR}/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
42+
"${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
43+
"${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" \
44+
; do \
45+
echo "Downloading: ${url}" && \
46+
wget --progress=dot:giga --retry-connrefused --waitretry=1 --timeout=60 "${url}" || exit 1; \
47+
done && \
48+
# Move JARs to Spark directory
49+
mv *.jar "${SPARK_HOME}/jars/" && \
50+
chown spark:spark "${SPARK_HOME}/jars"/*.jar && \
51+
# Create Spark events directory
52+
mkdir -p "/home/iceberg/spark-events" && \
53+
chown spark:spark "/home/iceberg/spark-events" && \
54+
# Cleanup
55+
cd / && \
56+
rm -rf /tmp/jars && \
57+
apt-get remove -y wget && \
58+
apt-get autoremove -y && \
59+
apt-get clean && \
60+
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
3861

39-
ENV SPARK_VERSION=3.5.6
40-
ENV SCALA_VERSION=2.12
41-
ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION}
42-
ENV ICEBERG_VERSION=1.10.0
43-
ENV PYICEBERG_VERSION=0.10.0
44-
ENV HADOOP_VERSION=3.3.4
45-
ENV AWS_SDK_VERSION=1.12.753
62+
# Switch back to spark user
63+
USER spark
4664

47-
# Try the primary Apache mirror (downloads.apache.org) first, then fall back to the archive
48-
RUN set -eux; \
49-
FILE=spark-${SPARK_VERSION}-bin-hadoop3.tgz; \
50-
URLS="https://downloads.apache.org/spark/spark-${SPARK_VERSION}/${FILE} https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${FILE}"; \
51-
for url in $URLS; do \
52-
echo "Attempting download: $url"; \
53-
if curl --retry 3 --retry-delay 5 -f -s -C - "$url" -o "$FILE"; then \
54-
echo "Downloaded from: $url"; \
55-
break; \
56-
else \
57-
echo "Failed to download from: $url"; \
58-
fi; \
59-
done; \
60-
if [ ! -f "$FILE" ]; then echo "Failed to download Spark from all mirrors" >&2; exit 1; fi; \
61-
tar xzf "$FILE" --directory /opt/spark --strip-components 1; \
62-
rm -rf "$FILE"
65+
# Working directory
66+
WORKDIR "${SPARK_HOME}"
6367

64-
# Download Spark Connect server JAR
65-
RUN curl --retry 5 -s -L https://repo1.maven.org/maven2/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
66-
-Lo /opt/spark/jars/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar
68+
# Copy Spark configuration
69+
COPY spark-defaults.conf "${SPARK_HOME}/conf/"
6770

68-
# Download iceberg spark runtime
69-
RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
70-
-Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
71+
# Create healthcheck
72+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
73+
CMD curl -f http://localhost:15002/ || exit 1
7174

72-
# Download AWS bundle
73-
RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
74-
-Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
75+
# Expose Spark Connect port (default is 15002)
76+
EXPOSE 15002
7577

76-
# Download hadoop-aws (required for S3 support)
77-
RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
78-
-Lo /opt/spark/jars/hadoop-aws-${HADOOP_VERSION}.jar
79-
80-
# Download AWS SDK bundle
81-
RUN curl --retry 5 -s https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar \
82-
-Lo /opt/spark/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar
83-
84-
COPY spark-defaults.conf /opt/spark/conf
85-
ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
86-
87-
RUN chmod u+x /opt/spark/sbin/* && \
88-
chmod u+x /opt/spark/bin/*
89-
90-
COPY entrypoint.sh .
91-
92-
ENTRYPOINT ["./entrypoint.sh"]
93-
CMD ["notebook"]
78+
CMD ["/bin/bash", "-c", "${SPARK_HOME}/sbin/start-connect-server.sh && tail -f /dev/null"]

dev/docker-compose-integration.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ services:
8686
"
8787
hive:
8888
build: hive/
89-
container_name: hive
89+
container_name: pyiceberg-hive
9090
hostname: hive
9191
networks:
9292
iceberg_net:

dev/entrypoint.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

0 commit comments

Comments (0)