# See the License for the specific language governing permissions and
# limitations under the License.

# Base image version, parameterized so CI can build against other Spark releases.
# No whitespace around "=" — `ARG NAME =value` is a Dockerfile syntax error.
ARG BASE_IMAGE_SPARK_VERSION=3.5.6

FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}

# Dependency versions - keep these compatible with each other and with the
# Spark version of the base image above.
ARG SPARK_VERSION=3.5.6
ARG SCALA_VERSION=2.12
ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
# NOTE(review): an `ARG ICEBERG_VERSION=...` declaration is elided by the diff
# hunk header at this point in the source; it is referenced below and must
# remain declared — confirm against the full file.
ARG HADOOP_VERSION=3.3.4
ARG AWS_SDK_VERSION=1.12.753
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2

# Root is required for apt and for writing into ${SPARK_HOME}; we drop back to
# the `spark` user before the final runtime instructions.
USER root
WORKDIR ${SPARK_HOME}

# Install curl (used for the JAR downloads below and by the HEALTHCHECK).
# update + install share one layer so the apt cache can never go stale, and
# the list files are removed in the same layer to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl && \
    rm -rf /var/lib/apt/lists/*

# Copy configuration early for better layer caching (config changes less often
# than the JAR set below). `--chown=spark:spark` must have no space after "=".
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/

# Create the Spark event-log directory referenced by spark-defaults.conf and
# hand it to the runtime user so the server can write to it.
RUN mkdir -p /home/iceberg/spark-events && \
    chown -R spark:spark /home/iceberg

# Required JAR dependencies as Maven-repository-relative paths.
# Declared with ARG, not ENV: the list is only needed at build time, and ENV
# would leak it into every running container's environment.
ARG JARS_TO_DOWNLOAD="\
org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar"

# Download the JARs straight into Spark's classpath directory with retry logic.
# curl -f makes HTTP errors fatal, so a failed download fails the build under
# `set -e` instead of leaving a truncated/HTML "jar" behind.
# NOTE(review): downloads are not checksum-verified — consider pinning SHA-256
# sums if the Maven mirror is not fully trusted.
RUN set -e && \
    cd "${SPARK_HOME}/jars" && \
    for jar_path in ${JARS_TO_DOWNLOAD}; do \
        jar_name="$(basename "${jar_path}")" && \
        echo "Downloading ${jar_name}..." && \
        curl -fsSL --retry 3 --retry-delay 5 \
            -o "${jar_name}" \
            "${MAVEN_MIRROR}/${jar_path}" && \
        echo "✓ Downloaded ${jar_name}"; \
    done && \
    chown -R spark:spark "${SPARK_HOME}/jars"

# All root-only work is done — run the server as the unprivileged spark user.
USER spark
WORKDIR ${SPARK_HOME}

# Health check for the Spark Connect server.
# NOTE(review): port 15002 speaks gRPC, so a plain HTTP probe only proves the
# port accepts connections — confirm this is the intended liveness signal.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -sf http://localhost:15002/ || exit 1

# Spark Connect default port (documentation for operators/tooling; EXPOSE
# does not publish the port by itself).
EXPOSE 15002

# Start the Spark Connect server in the foreground. SPARK_NO_DAEMONIZE keeps
# the launcher script from forking, and `exec env` replaces the shell so the
# server runs as PID 1 and receives SIGTERM from `docker stop`.
CMD ["/bin/sh", "-c", "exec env SPARK_NO_DAEMONIZE=true \"${SPARK_HOME}/sbin/start-connect-server.sh\""]