Skip to content

Commit

Permalink
[GSProcessing] Bump GSProcessing version to 0.4.1 (#1172)
Browse files Browse the repository at this point in the history
*Issue #, if available:*

*Description of changes:*

* Copy over new Dockerfiles, moving to new images where possible.
* Update pyproject.toml, making requirements less strict and bumping
PyTorch to 2.3.1

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.
  • Loading branch information
thvasilo authored Feb 18, 2025
1 parent fdb2575 commit 15672bd
Show file tree
Hide file tree
Showing 5 changed files with 1,005 additions and 7 deletions.
57 changes: 57 additions & 0 deletions graphstorm-processing/docker/0.4.1/emr-serverless/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Target CPU architecture. It is interpolated into the base image tag below and
# into the name of the architecture-specific stage chosen later in this file.
ARG ARCH=x86_64
FROM public.ecr.aws/emr-serverless/spark/emr-7.3.0:20241008-${ARCH} AS base

# The following build steps run as root.
USER root

# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files when
#   importing source modules.
# PYTHONUNBUFFERED: force stdin, stdout and stderr to be totally unbuffered,
#   which is good for logging.
ENV PYTHON_VERSION=3.9.18 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8


# One stage per supported architecture; the stage matching ARCH is selected by
# the "FROM arch-${ARCH}" line further down.

# x86_64 adds nothing on top of the base stage.
FROM base AS arch-x86_64

# arm64 additionally installs the Python development package.
# NOTE(review): presumably needed to build native extensions on arm64 -- confirm.
FROM base AS arch-arm64
RUN yum install -y python3-devel \
    && rm -rf /var/cache/yum

# Runtime stage: built on top of the architecture-specific stage selected by ARCH.
FROM arch-${ARCH} AS runtime

WORKDIR /usr/lib/spark/code/

# Install the GSProcessing Python requirements.
# requirements.txt is copied on its own so this layer stays cached until the
# requirements themselves change.
COPY requirements.txt requirements.txt
# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
# https://github.com/moby/buildkit/issues/1512
RUN pip install -r /usr/lib/spark/code/requirements.txt \
    && rm -rf /root/.cache

# Install Huggingface model cache if it is necessary.
# MODEL is an optional build argument; when it is empty the download is skipped.
# The steps are chained with && so that a failed tokenizer download fails the
# build instead of being masked by a successful model download.
ARG MODEL=""
ENV HF_HOME=/home/hadoop/.cache/huggingface/hub
RUN if [ -z "${MODEL}" ]; then \
        echo "Skip installing model cache"; \
    else \
        echo "Installing model cache for $MODEL" && \
        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')" && \
        python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \
    fi

# We use this file as an indicator of the execution environment
RUN touch /usr/lib/spark/code/EMR_SERVERLESS_EXECUTION

# GSProcessing codebase
COPY code/ /usr/lib/spark/code/

# --no-deps: dependencies were already installed from requirements.txt above.
RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/

# Production image: inherits the fully provisioned runtime stage (requirements,
# optional model cache and the installed GSProcessing package). Building from
# "base" here would discard everything the runtime stage installed.
# NOTE(review): only the test stage below switches to hadoop:hadoop; prod keeps
# the user inherited from the earlier stages (root) -- confirm this is intended.
FROM runtime AS prod

# Test image: the runtime stage plus the test-only dependencies.
FROM runtime AS test
RUN python3 -m pip install mock pytest && \
    rm -rf /root/.cache

USER hadoop:hadoop
WORKDIR /home/hadoop
78 changes: 78 additions & 0 deletions graphstorm-processing/docker/0.4.1/emr/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# TODO: Pin image version
FROM public.ecr.aws/amazoncorretto/amazoncorretto:17 AS base

# PYTHON_VERSION is the interpreter version installed through pyenv below.
# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files when
#   importing source modules.
# PYTHONUNBUFFERED: force stdin, stdout and stderr to be totally unbuffered,
#   which is good for logging.
ENV PYTHON_VERSION=3.9.18 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8

# PYENV_ROOT must be set in its own instruction: the ENV lines below reference
# it, and variables set within one ENV instruction are not visible to the other
# values of that same instruction.
# NOTE(review): ${HOME} is resolved from build-time ENV/ARG values, not from the
# shell -- confirm the base image defines it, otherwise this becomes "/.pyenv".
ENV PYENV_ROOT="${HOME}/.pyenv"

# Put the pyenv shims first on PATH and point PySpark's driver and worker
# interpreters at the pyenv-managed Python.
ENV PATH="${PYENV_ROOT}/shims:${PYENV_ROOT}/bin:${PATH}" \
    PYSPARK_DRIVER_PYTHON=${PYENV_ROOT}/shims/python \
    PYSPARK_PYTHON=${PYENV_ROOT}/shims/python

# pyenv and Spark/YARN dependencies.
# openssl-devel is erased first and openssl11-devel is installed in its place.
# Every package sits on its own continuation line with a space before the
# backslash, so names cannot fuse together when lines are joined.
# The yum cache is removed in the same layer to keep the image small.
# NOTE(review): "headless" is kept from the original list; it looks like a stray
# token rather than a real package name -- confirm and drop it if so.
RUN yum erase -y openssl-devel && \
    yum install -y \
        bzip2-devel \
        gcc \
        git \
        headless \
        hostname \
        java-17-amazon-corretto-headless \
        libffi-devel \
        make \
        ncurses-devel \
        openssl11-devel \
        readline-devel \
        sqlite-devel \
        sudo \
        tar \
        xz-devel && \
    rm -rf /var/cache/yum

# Build the requested Python with pyenv and make it the global default, so the
# shims placed on PATH earlier resolve to this interpreter.
RUN git clone --single-branch https://github.com/pyenv/pyenv.git "${PYENV_ROOT}" \
    && pyenv install "${PYTHON_VERSION}" \
    && pyenv global "${PYTHON_VERSION}"

# Runtime stage: the base image plus GSProcessing and its requirements.
FROM base AS runtime

WORKDIR /usr/lib/spark/code/

# Install GSProcessing requirements to pyenv Python.
# requirements.txt is copied on its own so this layer stays cached until the
# requirements themselves change.
COPY requirements.txt requirements.txt
# Use --mount=type=cache,target=/root/.cache when Buildkit CI issue is fixed:
# https://github.com/moby/buildkit/issues/1512
RUN pip3 install -r /usr/lib/spark/code/requirements.txt \
    && rm -rf /root/.cache

# Install Huggingface model cache if it is necessary.
# This needs to happen after the transformers library has been installed above.
# MODEL is an optional build argument; when it is empty the download is skipped.
# The steps are chained with && so that a failed tokenizer download fails the
# build instead of being masked by a successful model download.
ARG MODEL=""
ENV HF_HOME=/usr/lib/spark/.cache/huggingface/hub
RUN if [ -z "${MODEL}" ]; then \
        echo "Skip installing model cache"; \
    else \
        echo "Installing model cache for $MODEL" && \
        python3 -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('${MODEL}')" && \
        python3 -c "from transformers import AutoModel; AutoModel.from_pretrained('${MODEL}')"; \
    fi

# We use this file as an indicator of the execution environment
RUN touch /usr/lib/spark/code/EMR_EXECUTION

# GSProcessing codebase
COPY code/ /usr/lib/spark/code/

# --no-deps: dependencies were already installed from requirements.txt above.
RUN python3 -m pip install --no-deps /usr/lib/spark/code/graphstorm-processing/

# Production image: inherits the fully provisioned runtime stage (requirements,
# optional model cache and the installed GSProcessing package). Building from
# "base" here would discard everything the runtime stage installed.
FROM runtime AS prod

# Test image: the runtime stage plus the test-only dependencies.
FROM runtime AS test
RUN python3 -m pip install mock pytest && \
    rm -rf /root/.cache
Loading

0 comments on commit 15672bd

Please sign in to comment.