[GSProcessing] Bump GSProcessing version to 0.4.1 (#1172)

*Issue #, if available:* *Description of changes:* * Copy over new Dockerfiles, moving to new images where possible. * Update pyproject.toml, making requirements less strict and bump PyTorch to 2.3.1 By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
awslabs · Feb 18, 2025 · 15672bd · 15672bd
1 parent fdb2575
commit 15672bd
Show file tree

Hide file tree

Showing 5 changed files with 1,005 additions and 7 deletions.
diff --git a/graphstorm-processing/docker/0.4.1/emr-serverless/Dockerfile.cpu b/graphstorm-processing/docker/0.4.1/emr-serverless/Dockerfile.cpu
@@ -0,0 +1,57 @@
+ARG ARCH=x86_64
+FROM public.ecr.aws/emr-serverless/spark/emr-7.3.0:20241008-${ARCH} as base
+
+USER root
+ENV PYTHON_VERSION=3.9.18
+
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+
+
+FROM base AS arch-x86_64
+
+FROM base AS arch-arm64
+RUN yum install -y python3-devel && \
+        rm -rf /var/cache/yum
+
+FROM arch-${ARCH} AS runtime
+
+
+WORKDIR /usr/lib/spark/code/
+
+# Install GSProcessing requirements to pyenv Python
+COPY requirements.txt requirements.txt
+# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
+# https://github.com/moby/buildkit/issues/1512
+RUN pip install -r /usr/lib/spark/code/requirements.txt \
+    && rm -rf /root/.cache
+
+# Install Huggingface model cache if it is necessary
+ARG MODEL=""
+ENV HF_HOME=/home/hadoop/.cache/huggingface/hub
+RUN if [ -z "${MODEL}" ]; then \
+        echo "Skip installing model cache"; \
+else \
+        echo "Installing model cache for $MODEL" && \
+        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')"; \
+        python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \
+fi
+
+# We use this file as an indicator of the execution environment
+RUN touch /usr/lib/spark/code/EMR_SERVERLESS_EXECUTION
+
+# GSProcessing codebase
+COPY code/ /usr/lib/spark/code/
+
+RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/
+FROM base AS prod
+
+FROM base AS test
+RUN python3 -m pip install mock pytest && \
+    rm -rf /root/.cache
+
+USER hadoop:hadoop
+WORKDIR /home/hadoop
diff --git a/graphstorm-processing/docker/0.4.1/emr/Dockerfile.cpu b/graphstorm-processing/docker/0.4.1/emr/Dockerfile.cpu
@@ -0,0 +1,78 @@
+# TODO: Pin image version
+FROM public.ecr.aws/amazoncorretto/amazoncorretto:17 as base
+
+ENV PYTHON_VERSION=3.9.18
+
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+
+ENV PYENV_ROOT="${HOME}/.pyenv"
+ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}"
+
+ENV PYSPARK_DRIVER_PYTHON=${PYENV_ROOT}/shims/python
+ENV PYSPARK_PYTHON=${PYENV_ROOT}/shims/python
+
+# pyenv and Spark/YARN dependencies
+RUN yum erase -y openssl-devel && \
+    yum install -y \
+        bzip2-devel\
+        gcc \
+        git \
+        headless \
+        hostname \
+        java-17-amazon-corretto-headless \
+        libffi-devel \
+        make \
+        ncurses-devel \
+        openssl11-devel \
+        readline-devel \
+        sqlite-devel \
+        sudo \
+        tar \
+        xz-devel && \
+        rm -rf /var/cache/yum
+
+# Install Python through pyenv
+RUN git clone https://github.com/pyenv/pyenv.git ${PYENV_ROOT} --single-branch && \
+    pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION}
+
+FROM base AS runtime
+
+WORKDIR /usr/lib/spark/code/
+
+
+# Install GSProcessing requirements to pyenv Python
+COPY requirements.txt requirements.txt
+# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
+# https://github.com/moby/buildkit/issues/1512
+RUN pip3 install -r /usr/lib/spark/code/requirements.txt \
+    && rm -rf /root/.cache
+
+# Install Huggingface model cache if it is necessary
+# This needs to happen after the transformers library has been installed above
+ARG MODEL=""
+ENV HF_HOME=/usr/lib/spark/.cache/huggingface/hub
+RUN if [ -z "${MODEL}" ]; then \
+        echo "Skip installing model cache"; \
+else \
+        echo "Installing model cache for $MODEL" && \
+        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')"; \
+        python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \
+fi
+
+# We use this file as an indicator of the execution environment
+RUN touch /usr/lib/spark/code/EMR_EXECUTION
+
+# GSProcessing codebase
+COPY code/ /usr/lib/spark/code/
+
+RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/
+FROM base AS prod
+
+FROM base AS test
+RUN python3 -m pip install mock pytest && \
+    rm -rf /root/.cache