|
13 | 13 | # See the License for the specific language governing permissions and |
14 | 14 | # limitations under the License. |
15 | 15 |
|
16 | | -FROM python:3.12-bullseye |
| 16 | +ARG IMAGE_SPARK_VERSION=3.5.6 |
17 | 17 |
|
18 | | -RUN apt-get -qq update && \ |
19 | | - apt-get -qq install -y --no-install-recommends \ |
20 | | - sudo \ |
21 | | - curl \ |
22 | | - vim \ |
23 | | - unzip \ |
24 | | - openjdk-11-jdk \ |
25 | | - build-essential \ |
26 | | - software-properties-common \ |
27 | | - ssh && \ |
28 | | - apt-get -qq clean && \ |
29 | | - rm -rf /var/lib/apt/lists/* |
| 18 | +FROM apache/spark:${IMAGE_SPARK_VERSION} |
30 | 19 |
|
31 | | -# Optional env variables |
32 | | -ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} |
33 | | -ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"} |
34 | | -ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH |
| 20 | +ARG SPARK_VERSION=3.5.6 |
| 21 | +ARG SCALA_VERSION=2.12 |
| 22 | +ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION} |
| 23 | +ARG ICEBERG_VERSION=1.10.0 |
| 24 | +ARG HADOOP_VERSION=3.3.4 |
| 25 | +ARG AWS_SDK_VERSION=1.12.753 |
| 26 | +ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2 |
35 | 27 |
|
36 | | -RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events |
37 | | -WORKDIR ${SPARK_HOME} |
| 28 | +# Fetch the Spark Connect, Iceberg, and AWS jars in a single layer |
| 29 | +# (root is required for apt and ${SPARK_HOME}/jars; USER spark is restored below) |
| 30 | +USER root |
| 31 | +RUN apt-get update && \
| 32 | +    apt-get install -y --no-install-recommends \
| 33 | +        ca-certificates \
| 34 | +        curl && \
| 35 | +    # Scratch directory for downloads -- removed below, in this same layer |
| 36 | +    mkdir -p /tmp/jars && \
| 37 | +    cd /tmp/jars && \
| 38 | +    # Fail the build on the first jar that cannot be fetched |
| 39 | +    for url in \
| 40 | +        "${MAVEN_MIRROR}/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar" \
| 41 | +        "${MAVEN_MIRROR}/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
| 42 | +        "${MAVEN_MIRROR}/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
| 43 | +        "${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
| 44 | +        "${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" \
| 45 | +    ; do \
| 46 | +        echo "Downloading: ${url}" && \
| 47 | +        curl -fSL --retry 3 --retry-delay 1 --connect-timeout 60 -O "${url}" || exit 1; \
| 48 | +    done && \
| 49 | +    # Put the jars on Spark's default classpath, owned by the runtime user |
| 50 | +    mv *.jar "${SPARK_HOME}/jars/" && \
| 51 | +    chown spark:spark "${SPARK_HOME}/jars"/*.jar && \
| 52 | +    # Event-log directory referenced by spark-defaults.conf |
| 53 | +    mkdir -p "/home/iceberg/spark-events" && \
| 54 | +    chown spark:spark "/home/iceberg/spark-events" && \
| 55 | +    # Clean up inside the same layer so nothing bloats the image |
| 56 | +    cd / && \
| 57 | +    rm -rf /tmp/jars && \
| 58 | +    apt-get autoremove -y && \
| 59 | +    apt-get clean && \
| 60 | +    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* |
38 | 61 |
|
39 | | -ENV SPARK_VERSION=3.5.6 |
40 | | -ENV SCALA_VERSION=2.12 |
41 | | -ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_${SCALA_VERSION} |
42 | | -ENV ICEBERG_VERSION=1.10.0 |
43 | | -ENV PYICEBERG_VERSION=0.10.0 |
44 | | -ENV HADOOP_VERSION=3.3.4 |
45 | | -ENV AWS_SDK_VERSION=1.12.753 |
| 62 | +# Drop root: run the Connect server as the image's dedicated non-root spark user |
| 63 | +USER spark |
46 | 64 |
|
47 | | -# Try the primary Apache mirror (downloads.apache.org) first, then fall back to the archive |
48 | | -RUN set -eux; \ |
49 | | - FILE=spark-${SPARK_VERSION}-bin-hadoop3.tgz; \ |
50 | | - URLS="https://downloads.apache.org/spark/spark-${SPARK_VERSION}/${FILE} https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${FILE}"; \ |
51 | | - for url in $URLS; do \ |
52 | | - echo "Attempting download: $url"; \ |
53 | | - if curl --retry 3 --retry-delay 5 -f -s -C - "$url" -o "$FILE"; then \ |
54 | | - echo "Downloaded from: $url"; \ |
55 | | - break; \ |
56 | | - else \ |
57 | | - echo "Failed to download from: $url"; \ |
58 | | - fi; \ |
59 | | - done; \ |
60 | | - if [ ! -f "$FILE" ]; then echo "Failed to download Spark from all mirrors" >&2; exit 1; fi; \ |
61 | | - tar xzf "$FILE" --directory /opt/spark --strip-components 1; \ |
62 | | - rm -rf "$FILE" |
| 65 | +# Run from the Spark install root so relative sbin/bin paths resolve |
| 66 | +WORKDIR "${SPARK_HOME}" |
63 | 67 |
|
64 | | -# Download Spark Connect server JAR |
65 | | -RUN curl --retry 5 -s -L https://repo1.maven.org/maven2/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \ |
66 | | - -Lo /opt/spark/jars/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar |
| 68 | +# Spark configuration; --chown so the non-root spark runtime user owns its conf |
| 69 | +COPY --chown=spark:spark spark-defaults.conf "${SPARK_HOME}/conf/" |
67 | 70 |
|
68 | | -# Download iceberg spark runtime |
69 | | -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \ |
70 | | - -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar |
| 71 | +# Health check: Spark Connect speaks gRPC (not plain HTTP) on 15002, so probe TCP reachability |
| 72 | +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
| 73 | +    CMD bash -c '</dev/tcp/localhost/15002' || exit 1 |
71 | 74 |
|
72 | | -# Download AWS bundle |
73 | | -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \ |
74 | | - -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar |
| 75 | +# Document Spark Connect's default gRPC port (EXPOSE does not publish it) |
| 76 | +EXPOSE 15002 |
75 | 77 |
|
76 | | -# Download hadoop-aws (required for S3 support) |
77 | | -RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \ |
78 | | - -Lo /opt/spark/jars/hadoop-aws-${HADOOP_VERSION}.jar |
79 | | - |
80 | | -# Download AWS SDK bundle |
81 | | -RUN curl --retry 5 -s https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar \ |
82 | | - -Lo /opt/spark/jars/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar |
83 | | - |
84 | | -COPY spark-defaults.conf /opt/spark/conf |
85 | | -ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" |
86 | | - |
87 | | -RUN chmod u+x /opt/spark/sbin/* && \ |
88 | | - chmod u+x /opt/spark/bin/* |
89 | | - |
90 | | -COPY entrypoint.sh . |
91 | | - |
92 | | -ENTRYPOINT ["./entrypoint.sh"] |
93 | | -CMD ["notebook"] |
| 78 | +CMD ["/bin/bash", "-c", "exec env SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"] |
0 commit comments