Skip to content

Commit

Permalink
[CI] Docker build improvements (#1425)
Browse files Browse the repository at this point in the history
* Improve docker file

* Make it work for both latest version and released version

* Fix build for macos

* Fix buildx for macos

* Install docker-buildx for macos-12

* Change matrix definition for docker build

* Check total physical memory and DRIVER_MEM, EXECUTOR_MEM configurations before launching

* Fix gresearch spark-extension version and improved code for detecting spark versions

* Use array form of CMD to allow control signals to be passed to jupyter lab
  • Loading branch information
Kontinuation committed May 24, 2024
1 parent 7e41f6b commit 2de9160
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 34 deletions.
23 changes: 16 additions & 7 deletions .github/workflows/docker-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,17 @@ jobs:
fail-fast: true
matrix:
os: ['ubuntu-latest', 'macos-12']
spark: ['3.5.1', '3.4.3', '3.3.4']
include:
- spark: 3.4.1
sedona: 1.4.1
- spark: 3.4.1
sedona: latest
- spark: 3.3.2
sedona: latest
- spark: 3.5.1
sedona: "latest"
geotools: "auto"
- spark: 3.4.3
sedona: 1.6.0
geotools: 28.2
- spark: 3.3.4
sedona: 1.6.0
geotools: 28.2
runs-on: ${{ matrix.os }}
defaults:
run:
Expand All @@ -54,7 +58,12 @@ jobs:
run: |
brew install docker
colima start
DOCKER_CONFIG=${DOCKER_CONFIG:-$HOME/.docker}
mkdir -p $DOCKER_CONFIG/cli-plugins
curl -SL https://github.com/docker/buildx/releases/download/v0.14.1/buildx-v0.14.1.darwin-amd64 -o $DOCKER_CONFIG/cli-plugins/docker-buildx
chmod +x $DOCKER_CONFIG/cli-plugins/docker-buildx
- env:
SPARK_VERSION: ${{ matrix.spark }}
SEDONA_VERSION: ${{ matrix.sedona }}
run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION} ${SEDONA_VERSION}
GEOTOOLS_VERSION: ${{ matrix.geotools }}
run: ./docker/sedona-spark-jupyterlab/build.sh ${SPARK_VERSION} ${SEDONA_VERSION} local ${GEOTOOLS_VERSION}
3 changes: 0 additions & 3 deletions docker/sedona-spark-jupyterlab/.dockerignore

This file was deleted.

64 changes: 58 additions & 6 deletions docker/sedona-spark-jupyterlab/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,77 @@
SPARK_VERSION=$1
SEDONA_VERSION=$2
BUILD_MODE=$3
GEOTOOLS_VERSION=${4:-auto}

lower_version=$(echo -e ${SPARK_VERSION}"\n3.4" | sort -V | head -n1)
if [ $lower_version = "3.4" ]; then
SEDONA_SPARK_VERSION=3.4
else
SEDONA_SPARK_VERSION=${SPARK_VERSION:0:3}
if [ ${SPARK_VERSION:0:1} -eq "3" ] && [ ${SPARK_VERSION:2:1} -le "3" ]; then
# 3.0, 3.1, 3.2, 3.3
SEDONA_SPARK_VERSION=3.0
fi

# Succeed (return 0) when version $1 is strictly newer than version $2;
# fail (return 1) when $1 is older than or equal to $2.
version_gt() {
  local smallest
  # sort -V orders version strings; if $1 is not the smallest of the
  # pair, it must be the (strictly) greater one.
  smallest=$(printf '%s\n' "$1" "$2" | sort -V | head -n 1)
  [ "$smallest" != "$1" ]
}

# Print the newest version of a Maven artifact whose version string ends
# with a given suffix.
# Arguments:
#   $1 - Maven repository directory URL for the artifact (must end with '/')
#   $2 - required version suffix (e.g. a GeoTools version such as "28.2")
# Outputs: the selected version on stdout
# Returns: 0 on success, 1 when no published version matches the suffix
#          (the original used `exit 1`, which only worked because callers
#          invoke this inside a command substitution subshell; `return 1`
#          is equivalent there and safe if ever called directly)
get_latest_version_with_suffix() {
  local base_url=$1
  local suffix=$2
  local metadata_url="${base_url}maven-metadata.xml"
  local metadata_xml versions version latest=""

  # maven-metadata.xml enumerates every published version of the artifact.
  metadata_xml=$(curl -s "$metadata_url") || return 1

  # Extract the text content of each <version>...</version> element.
  versions=$(echo "$metadata_xml" | grep -o '<version>[^<]*</version>' | awk -F'[<>]' '{print $3}')

  # Keep only versions ending with the suffix, tracking the greatest one.
  for version in $versions; do
    if [[ $version == *$suffix ]]; then
      if [[ -z $latest ]] || version_gt "$version" "$latest"; then
        latest=$version
      fi
    fi
  done

  if [[ -z $latest ]]; then
    return 1
  fi
  echo "$latest"
}

if [ "$GEOTOOLS_VERSION" = "auto" ]; then
GEOTOOLS_VERSION=$(mvn help:evaluate -Dexpression=geotools.version -q -DforceStdout)
echo "GeoTools version inferred from pom.xml: $GEOTOOLS_VERSION"
fi

GEOTOOLS_WRAPPER_VERSION="${SEDONA_VERSION}-${GEOTOOLS_VERSION}"
if [ "$SEDONA_VERSION" = "latest" ]; then
GEOTOOLS_WRAPPER_VERSION=$(get_latest_version_with_suffix "https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/" "$GEOTOOLS_VERSION")
if [ -z "$GEOTOOLS_WRAPPER_VERSION" ]; then
echo "No geotools-wrapper version with suffix $GEOTOOLS_VERSION"
exit 1
fi
echo "Using latest geotools-wrapper version: $GEOTOOLS_WRAPPER_VERSION"

# The compilation must take place outside Docker to avoid unnecessary maven packages
mvn clean install -DskipTests -Dspark=${SEDONA_SPARK_VERSION} -Dgeotools -Dscala=2.12
mvn clean install -DskipTests -Dspark=${SEDONA_SPARK_VERSION} -Dscala=2.12
fi

# -- Building the image

if [ -z "$BUILD_MODE" ] || [ "$BUILD_MODE" = "local" ]; then
# If local, build the image for the local environment
docker build \
docker buildx build \
--build-arg spark_version="${SPARK_VERSION}" \
--build-arg sedona_version="${SEDONA_VERSION}" \
--build-arg geotools_wrapper_version="${GEOTOOLS_WRAPPER_VERSION}" \
-f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
-t apache/sedona:${SEDONA_VERSION} .
else
Expand All @@ -50,6 +101,7 @@ else
--output type=registry \
--build-arg spark_version="${SPARK_VERSION}" \
--build-arg sedona_version="${SEDONA_VERSION}" \
--build-arg geotools_wrapper_version="${GEOTOOLS_WRAPPER_VERSION}" \
-f docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile \
-t apache/sedona:${SEDONA_VERSION} .
fi
11 changes: 2 additions & 9 deletions docker/sedona-spark-jupyterlab/sedona-jupyterlab.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,5 @@ EXPOSE 4040

WORKDIR ${SHARED_WORKSPACE}



CMD DRIVER_MEM=${DRIVER_MEM:-4g} && \
EXECUTOR_MEM=${EXECUTOR_MEM:-4g} && \
echo "spark.driver.memory $DRIVER_MEM" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
echo "spark.executor.memory $EXECUTOR_MEM" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
service ssh start && \
${SPARK_HOME}/sbin/start-all.sh && \
jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=
COPY docker/sedona-spark-jupyterlab/start.sh /opt/
CMD ["/bin/bash", "/opt/start.sh"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Ignore everything
*

# Allow files and folders with a pattern starting with !
!docker/**
!docs/usecases/**
!python/**
!spark-shaded/target/**
66 changes: 66 additions & 0 deletions docker/sedona-spark-jupyterlab/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env bash

# Spark driver/executor memory for the standalone cluster started below.
# Override at container start, e.g.:
#   docker run -e DRIVER_MEM=2g -e EXECUTOR_MEM=2g ...
DRIVER_MEM=${DRIVER_MEM:-4g}
EXECUTOR_MEM=${EXECUTOR_MEM:-4g}

# Translate a memory size string such as "4g" or "512M" into megabytes.
# Arguments: $1 - size with a g/G (gigabytes) or m/M (megabytes) suffix
# Outputs: the size in MB on stdout
# Returns: 0 on success, 1 (with a message on stderr) for any other suffix
convert_to_mb() {
  local size=$1
  local amount=${size%[gGmM]}   # numeric part (suffix stripped)
  local unit=${size: -1}        # last character is the unit

  if [[ $unit == [gG] ]]; then
    echo $((amount * 1024))
  elif [[ $unit == [mM] ]]; then
    echo "$amount"
  else
    echo "Invalid memory unit: $size" >&2
    return 1
  fi
}

# Convert DRIVER_MEM and EXECUTOR_MEM to megabytes so they can be compared
# against the host's physical memory. Expansions are quoted (the original
# passed them unquoted) so malformed values cannot word-split.
DRIVER_MEM_MB=$(convert_to_mb "$DRIVER_MEM")
if [ $? -ne 0 ]; then
  echo "Error converting DRIVER_MEM to megabytes." >&2
  exit 1
fi

EXECUTOR_MEM_MB=$(convert_to_mb "$EXECUTOR_MEM")
if [ $? -ne 0 ]; then
  echo "Error converting EXECUTOR_MEM to megabytes." >&2
  exit 1
fi

# Total physical memory in megabytes as reported by free(1).
# NOTE(review): free is Linux-only; presumably this always runs inside the
# Linux container image — confirm.
TOTAL_PHYSICAL_MEM_MB=$(free -m | awk '/^Mem:/{print $2}')

# Driver and executor run on the same host, so both must fit at once.
TOTAL_REQUIRED_MEM_MB=$((DRIVER_MEM_MB + EXECUTOR_MEM_MB))

# Fail fast when the requested memory cannot be satisfied, rather than
# letting Spark die with an OOM later.
if [ "$TOTAL_REQUIRED_MEM_MB" -gt "$TOTAL_PHYSICAL_MEM_MB" ]; then
  echo "Error: Insufficient memory" >&2
  echo "  total: $TOTAL_PHYSICAL_MEM_MB MB" >&2
  echo "  required: $TOTAL_REQUIRED_MEM_MB MB (driver: $DRIVER_MEM_MB MB, executor: $EXECUTOR_MEM_MB MB)" >&2
  echo "Please tune DRIVER_MEM and EXECUTOR_MEM to smaller values." >&2
  echo "e.g: docker run -e DRIVER_MEM=2g -e EXECUTOR_MEM=2g ..." >&2
  exit 1
fi

# Configure spark: worker memory via spark-env.sh, driver/executor memory
# via spark-defaults.conf (appended so existing defaults are kept).
cp "${SPARK_HOME}/conf/spark-env.sh.template" "${SPARK_HOME}/conf/spark-env.sh"
echo "SPARK_WORKER_MEMORY=${EXECUTOR_MEM}" >> "${SPARK_HOME}/conf/spark-env.sh"
echo "spark.driver.memory $DRIVER_MEM" >> "${SPARK_HOME}/conf/spark-defaults.conf"
echo "spark.executor.memory $EXECUTOR_MEM" >> "${SPARK_HOME}/conf/spark-defaults.conf"

# Start the spark standalone cluster; start-all.sh reaches the (local)
# worker over ssh, so sshd must be running first.
service ssh start
"${SPARK_HOME}/sbin/start-all.sh"

# exec replaces this shell with jupyter lab so that control signals
# (e.g. SIGTERM from `docker stop`) are delivered to jupyter directly.
exec jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=
16 changes: 7 additions & 9 deletions docker/sedona.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,33 +23,31 @@ geotools_wrapper_version=$2
spark_version=$3
spark_extension_version=$4

lower_version=$(echo -e $spark_version"\n3.4" | sort -V | head -n1)
if [ $lower_version = "3.4" ]; then
sedona_spark_version=3.4
else
spark_compat_version=${spark_version:0:3}
sedona_spark_version=${spark_compat_version}
if [ ${spark_version:0:1} -eq "3" ] && [ ${spark_version:2:1} -le "3" ]; then
# 3.0, 3.1, 3.2, 3.3
sedona_spark_version=3.0
fi

if [ $sedona_version = "latest" ]; then
# Code to execute when SEDONA_VERSION is "latest"
cp ${SEDONA_HOME}/spark-shaded/target/sedona-spark-shaded-*.jar ${SPARK_HOME}/jars/
cd ${SEDONA_HOME}/python;pip3 install shapely==1.8.4;pip3 install .
cd ${SEDONA_HOME}/python;pip3 install .
else
# Code to execute when SEDONA_VERSION is not "latest"
# Download Sedona
curl https://repo1.maven.org/maven2/org/apache/sedona/sedona-spark-shaded-${sedona_spark_version}_2.12/${sedona_version}/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar -o $SPARK_HOME/jars/sedona-spark-shaded-${sedona_spark_version}_2.12-${sedona_version}.jar

# Install Sedona Python
pip3 install shapely==1.8.4
pip3 install apache-sedona==${sedona_version}

fi

# Download gresearch spark extension
curl https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.12/${spark_extension_version}-${sedona_spark_version}/spark-extension_2.12-${spark_extension_version}-${sedona_spark_version}.jar -o $SPARK_HOME/jars/spark-extension_2.12-${spark_extension_version}-${sedona_spark_version}.jar
curl https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_2.12/${spark_extension_version}-${spark_compat_version}/spark-extension_2.12-${spark_extension_version}-${spark_compat_version}.jar -o $SPARK_HOME/jars/spark-extension_2.12-${spark_extension_version}-${spark_compat_version}.jar

# Install Spark extension Python
pip3 install pyspark-extension==${spark_extension_version}.${sedona_spark_version}
pip3 install pyspark-extension==${spark_extension_version}.${spark_compat_version}

# Download GeoTools jar
curl https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/${geotools_wrapper_version}/geotools-wrapper-${geotools_wrapper_version}.jar -o $SPARK_HOME/jars/geotools-wrapper-${geotools_wrapper_version}.jar

0 comments on commit 2de9160

Please sign in to comment.