Skip to content

Commit 4cc22b5

Browse files
committed
refactor
1 parent 5e5d348 commit 4cc22b5

2 files changed

Lines changed: 40 additions & 46 deletions

File tree

Makefile

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,6 @@ test-integration-setup: ## Start Docker services for integration tests
101101
docker compose -f dev/docker-compose-integration.yml kill
102102
docker compose -f dev/docker-compose-integration.yml rm -f
103103
docker compose -f dev/docker-compose-integration.yml up -d
104-
sleep 10
105104
${TEST_RUNNER} python dev/provision.py
106105

107106
test-integration-exec: ## Run integration tests (excluding provision)

dev/Dockerfile

Lines changed: 40 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,11 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
ARG IMAGE_SPARK_VERSION=3.5.6
16+
ARG BASE_IMAGE_SPARK_VERSION=3.5.6
1717

18-
FROM apache/spark:${IMAGE_SPARK_VERSION}
18+
FROM apache/spark:${BASE_IMAGE_SPARK_VERSION}
1919

20+
# Dependency versions - keep these compatible
2021
ARG SPARK_VERSION=3.5.6
2122
ARG SCALA_VERSION=2.12
2223
ARG ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
@@ -25,54 +26,48 @@ ARG HADOOP_VERSION=3.3.4
2526
ARG AWS_SDK_VERSION=1.12.753
2627
ARG MAVEN_MIRROR=https://repo.maven.apache.org/maven2
2728

28-
# Install dependencies and download JARs in single layer
2929
USER root
30+
WORKDIR ${SPARK_HOME}
31+
32+
# Install curl for JAR downloads
3033
RUN apt-get update && \
31-
apt-get install -y --no-install-recommends \
32-
wget \
33-
curl && \
34-
# Create temporary directory for downloads
35-
mkdir -p /tmp/jars && \
36-
cd /tmp/jars && \
37-
# Download JARs with error handling
38-
for url in \
39-
"${MAVEN_MIRROR}/org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar" \
40-
"${MAVEN_MIRROR}/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar" \
41-
"${MAVEN_MIRROR}/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" \
42-
"${MAVEN_MIRROR}/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" \
43-
"${MAVEN_MIRROR}/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" \
44-
; do \
45-
echo "Downloading: ${url}" && \
46-
wget --progress=dot:giga --retry-connrefused --waitretry=1 --timeout=60 "${url}" || exit 1; \
47-
done && \
48-
# Move JARs to Spark directory
49-
mv *.jar "${SPARK_HOME}/jars/" && \
50-
chown spark:spark "${SPARK_HOME}/jars"/*.jar && \
51-
# Create Spark events directory
52-
mkdir -p "/home/iceberg/spark-events" && \
53-
chown spark:spark "/home/iceberg/spark-events" && \
54-
# Cleanup
55-
cd / && \
56-
rm -rf /tmp/jars && \
57-
apt-get remove -y wget && \
58-
apt-get autoremove -y && \
59-
apt-get clean && \
60-
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
34+
apt-get install -y --no-install-recommends curl && \
35+
rm -rf /var/lib/apt/lists/*
6136

62-
# Switch back to spark user
63-
USER spark
37+
# Copy configuration (early for better caching)
38+
COPY --chown=spark:spark spark-defaults.conf ${SPARK_HOME}/conf/
6439

65-
# Working directory
66-
WORKDIR "${SPARK_HOME}"
40+
# Create event log directory
41+
RUN mkdir -p /home/iceberg/spark-events && \
42+
chown -R spark:spark /home/iceberg
6743

68-
# Copy Spark configuration
69-
COPY spark-defaults.conf "${SPARK_HOME}/conf/"
44+
# Required JAR dependencies
45+
ENV JARS_TO_DOWNLOAD="\
46+
org/apache/spark/spark-connect_${SCALA_VERSION}/${SPARK_VERSION}/spark-connect_${SCALA_VERSION}-${SPARK_VERSION}.jar \
47+
org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
48+
org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
49+
org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar \
50+
com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar"
7051

71-
# Create healthcheck
72-
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
73-
CMD curl -f http://localhost:15002/ || exit 1
52+
# Download JARs with retry logic
53+
RUN set -e && \
54+
cd "${SPARK_HOME}/jars" && \
55+
for jar_path in ${JARS_TO_DOWNLOAD}; do \
56+
jar_name=$(basename "${jar_path}") && \
57+
echo "Downloading ${jar_name}..." && \
58+
curl -fsSL --retry 3 --retry-delay 5 \
59+
-o "${jar_name}" \
60+
"${MAVEN_MIRROR}/${jar_path}" && \
61+
echo "✓ Downloaded ${jar_name}"; \
62+
done && \
63+
chown -R spark:spark "${SPARK_HOME}/jars"
64+
65+
USER spark
66+
WORKDIR ${SPARK_HOME}
7467

75-
# Expose Spark Connect port (default is 15002)
76-
EXPOSE 15002
68+
# Health check for Spark Connect server
69+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
70+
CMD curl -sf http://localhost:15002/ || exit 1
7771

78-
CMD ["/bin/bash", "-c", "${SPARK_HOME}/sbin/start-connect-server.sh && tail -f /dev/null"]
72+
# Start Spark Connect server
73+
CMD ["sh", "-c", "SPARK_NO_DAEMONIZE=true ${SPARK_HOME}/sbin/start-connect-server.sh"]

0 commit comments

Comments
 (0)