Improved containerization for use on GCP, etc. #1546

Draft · wants to merge 31 commits into base: dev

Changes from all commits (31 commits)
bc31f60: Converting hosted demos images to multi-stage builds (RyanMullins, Aug 12, 2024)
ed0d8c9: Adding initial dockerfile for GCP use case (RyanMullins, Aug 13, 2024)
3134735: Update lit-on-gcp requirements (llcourage, Aug 14, 2024)
6cc14e9: Updating date on license declaration (RyanMullins, Aug 14, 2024)
4b810f0: Updating dockerfile for LIT on GCP (RyanMullins, Aug 14, 2024)
9c7bb9d: Add model_server to lit-on-gcp (llcourage, Aug 20, 2024)
d9e2cd0: Updating copyright year in Dockerfile (RyanMullins, Aug 21, 2024)
dc894e2: Add unit test for model_server (llcourage, Aug 21, 2024)
84884cc: Add webtest package to lit_on_gcp requirement file (llcourage, Aug 21, 2024)
b6fa06e: Adding gunicorn config for model_server.py (RyanMullins, Aug 22, 2024)
aa0c26e: Add unit test for model_server of LIT on GCP (llcourage, Aug 22, 2024)
d15dd97: Update pyproject.toml and requirements_test files (llcourage, Aug 22, 2024)
1dc5e83: Fetch model_name from model_config for lit-on-gcp model server (llcourage, Aug 22, 2024)
f733467: Fixing KeyError in tests related to default name change (RyanMullins, Aug 22, 2024)
6c76325: Fixing requirements and updating README (RyanMullins, Aug 22, 2024)
ec341e2: Adding testing instructions to README (RyanMullins, Aug 22, 2024)
4db8f52: Correct the model name of model_server (llcourage, Sep 4, 2024)
ab993aa: Fix wsgi handler name (llcourage, Sep 5, 2024)
457af00: Modify the output to be a list to ensure compatibility with JSON seri… (llcourage, Sep 5, 2024)
60a82a5: Adopting the generate_model_group_names() API from PD Utils (RyanMullins, Sep 5, 2024)
b83b428: Using generate_model_group_names() API in model server test (RyanMullins, Sep 5, 2024)
584f63f: Code health update on model server tests (RyanMullins, Sep 5, 2024)
2929183: Linting cleanup in the model server (RyanMullins, Sep 6, 2024)
812bf7e: Adds LIT app server code (RyanMullins, Sep 10, 2024)
fa2aae4: Adding LIT App server for LIT LLMs on GCP (RyanMullins, Sep 17, 2024)
6be57d0: Fixing enum accessor in UTs (RyanMullins, Sep 17, 2024)
27ce77d: Migrate to updated pd_models.get_models kw args (RyanMullins, Sep 23, 2024)
9b9ea6a: Fix errors in gcp model_server and models (llcourage, Sep 24, 2024)
ad86bee: Combine vertexai docker image and gcp_server image (llcourage, Sep 25, 2024)
2bfea15: Add identity_token as LLM_ON_GCP init spec (llcourage, Sep 26, 2024)
867b409: Rename the model loader name (llcourage, Sep 26, 2024)
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -73,4 +73,5 @@ jobs:
      - name: Build Docker image
        uses: docker/build-push-action@v4
        with:
          target: lit-nlp-prod
          tags: lit-nlp:ci-${{ github.sha }}
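
For context, the new `target` key tells docker/build-push-action to build only the production stage of the multi-stage Dockerfile introduced below; a roughly equivalent local invocation (tag name illustrative) is:

```shell
# Build only the lit-nlp-prod stage from the root Dockerfile.
docker build --target lit-nlp-prod -t lit-nlp:ci-local .
```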
37 changes: 28 additions & 9 deletions Dockerfile
@@ -14,22 +14,44 @@
# ==============================================================================
# Use the official lightweight Python image.
# https://hub.docker.com/_/python
FROM python:3.10-slim

# ---- LIT Base Container ----

FROM python:3.11-slim AS lit-nlp-base

# Update Ubuntu packages and install basic utils
RUN apt-get update
RUN apt-get install -y wget curl gnupg2 gcc g++ git

# Copy local code to the container image.
ENV APP_HOME /app
WORKDIR $APP_HOME

COPY ./lit_nlp/examples/gunicorn_config.py ./



# ---- LIT Container for Hosted Demos ----

FROM lit-nlp-base AS lit-nlp-prod

RUN python -m pip install 'lit-nlp[examples]'

WORKDIR $APP_HOME
ENTRYPOINT ["gunicorn", "--config=gunicorn_config.py"]



# ---- LIT Container for Developing and Testing Hosted Demos ----

FROM lit-nlp-base AS lit-nlp-dev

# Install yarn
RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
RUN echo "deb https://dl.yarnpkg.com/debian/ stable main" | \
tee /etc/apt/sources.list.d/yarn.list
RUN apt update && apt -y install yarn

# Copy local code to the container image.
ENV APP_HOME=/app
WORKDIR $APP_HOME

# Set up python environment with production dependencies
# This step is slow as it installs many packages.
COPY ./requirements*.txt ./
@@ -47,7 +69,4 @@ RUN yarn && yarn build && rm -rf node_modules/*
# Note that the config file supports configuring the LIT demo that is launched
# via the DEMO_NAME and DEMO_PORT environment variables.
WORKDIR $APP_HOME
ENTRYPOINT [ \
"gunicorn", \
"--config=lit_nlp/examples/gunicorn_config.py" \
]
ENTRYPOINT ["gunicorn", "--config=gunicorn_config.py"]
163 changes: 163 additions & 0 deletions lit_nlp/examples/gcp/Dockerfile
@@ -0,0 +1,163 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Use the official lightweight Python image.
# https://hub.docker.com/_/python

# ---- LIT on GCP Base Images ----

FROM python:3.11-slim AS lit-gcp-app-server-base

# Update Ubuntu packages and install basic utils
RUN apt-get update
RUN apt-get install -y wget curl gnupg2 gcc g++ git

# Copy local code to the container image.
ENV APP_HOME /app
WORKDIR $APP_HOME

COPY ./lit_nlp/examples/gcp/server_gunicorn_config.py ./gunicorn_config.py



FROM nvidia/cuda:12.5.1-base-ubuntu22.04 AS lit-gcp-model-server-base
ENV DEBIAN_FRONTEND=noninteractive
ENV LANG C.UTF-8

ARG PYTHON_VERSION=python3.11

RUN apt-get update

# Install the CUDA Keyring package
# See https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#network-repo-installation-for-ubuntu
RUN apt-get install -y curl gnupg ca-certificates
RUN curl https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
-o cuda-keyring_1.1-1_all.deb
RUN dpkg -i cuda-keyring_1.1-1_all.deb

# Install system and CUDA packages
RUN apt-get install -y --no-install-recommends \
cuda-command-line-tools-12-3 \
cuda-cudart-dev-12-3 \
cuda-nvcc-12-3 \
cuda-cupti-12-3 \
cuda-nvprune-12-3 \
cuda-libraries-12-3 \
cuda-nvrtc-12-3 \
libcufft-12-3 \
libcurand-12-3 \
libcusolver-12-3 \
libcusparse-12-3 \
libcublas-12-3 \
libcudnn8=8.9.6.50-1+cuda12.2 \
libnvinfer-plugin8=8.6.1.6-1+cuda12.0 \
libnvinfer8=8.6.1.6-1+cuda12.0 \
build-essential \
pkg-config \
software-properties-common \
unzip

# Install Python 3.11
RUN apt-get install -y --no-install-recommends \
$PYTHON_VERSION \
$PYTHON_VERSION-venv \
$PYTHON_VERSION-distutils \
$PYTHON_VERSION-dev
RUN ln -sf /usr/bin/$PYTHON_VERSION /usr/bin/python3
RUN ln -sf /usr/bin/$PYTHON_VERSION /usr/bin/python

# Install pip
RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
RUN python3 get-pip.py
RUN python3 -m pip install --no-cache-dir --upgrade pip

RUN apt-get clean
RUN rm -rf /var/lib/apt/lists/*



# ---- LIT on GCP Development Images ----

FROM lit-gcp-app-server-base AS lit-gcp-app-server-dev

# Install yarn
RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
RUN echo "deb https://dl.yarnpkg.com/debian/ stable main" | \
tee /etc/apt/sources.list.d/yarn.list
RUN apt update && apt -y install yarn

# Set up python environment with production dependencies
# This step is slow as it installs many packages.
COPY requirements_core.txt ./
COPY lit_nlp/examples/prompt_debugging/requirements.txt \
lit_nlp/examples/prompt_debugging/requirements.txt
COPY lit_nlp/examples/gcp/requirements.txt \
lit_nlp/examples/gcp/requirements.txt
RUN python -m pip install -r lit_nlp/examples/gcp/requirements.txt

# Copy the rest of the lit_nlp package
COPY . ./

# Build front-end with yarn
WORKDIR $APP_HOME/lit_nlp/client
ENV NODE_OPTIONS "--openssl-legacy-provider"
RUN yarn && yarn build && rm -rf node_modules/*

# Run LIT server
# Note that the config file supports configuring the LIT demo that is launched
# via the DEMO_NAME and DEMO_PORT environment variables.
WORKDIR $APP_HOME
ENTRYPOINT ["gunicorn", "--config=gunicorn_config.py"]



FROM lit-gcp-model-server-base AS lit-gcp-model-server-dev
ENV APP_HOME /app
WORKDIR $APP_HOME

# Install Node.js v18 (the base image ships with Node.js v12)
# See https://github.com/nodesource/distributions
RUN curl -fsSL https://deb.nodesource.com/setup_18.x -o nodesource_setup.sh
RUN bash nodesource_setup.sh
RUN apt-get install -y nodejs

# Install yarn
RUN curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
RUN echo "deb https://dl.yarnpkg.com/debian/ stable main" | \
tee /etc/apt/sources.list.d/yarn.list
RUN apt update && apt -y install yarn

COPY ./lit_nlp/examples/gcp/model_server_gunicorn_config.py ./

# TODO(b/353980272): Replace this with a requirements file specific to the GCP
# example; this should include the core lit-nlp package.
COPY requirements_core.txt ./
COPY lit_nlp/examples/prompt_debugging/requirements.txt \
lit_nlp/examples/prompt_debugging/requirements.txt
COPY lit_nlp/examples/gcp/requirements.txt \
lit_nlp/examples/gcp/requirements.txt
RUN python -m pip install -r lit_nlp/examples/gcp/requirements.txt

# Copy the rest of the lit_nlp package
COPY . ./

# Build front-end with yarn
WORKDIR $APP_HOME/lit_nlp/client
ENV NODE_OPTIONS "--openssl-legacy-provider"
RUN yarn && yarn build && rm -rf node_modules/*

# TODO(b/353980272): Replace this with the GCP-specific config
# See https://github.com/PAIR-code/lit/blob/main/Dockerfile
WORKDIR $APP_HOME
ENTRYPOINT ["gunicorn", "--config=model_server_gunicorn_config.py"]
32 changes: 32 additions & 0 deletions lit_nlp/examples/gcp/README.md
@@ -0,0 +1,32 @@
# Using LLMs in LIT on Google Cloud Platform

## Developing

### Use a virtual environment

```shell
# Create and activate the virtual environment
python3 -m venv ~/.venvs/lit-on-gcp
source ~/.venvs/lit-on-gcp/bin/activate

# Install the requirements and LIT in editable mode
pip install -r ./lit_nlp/examples/gcp/requirements.txt
pip install -e .

# Optionally, install testing requirements and run the tests
pip install -r ./requirements_test.txt
pytest lit_nlp/examples/gcp
```

### Build the Docker image

```shell
docker build -f ./lit_nlp/examples/gcp/Dockerfile -t lit-app:gcp-dev .
```

### Run GPT-2 in a Docker container

```shell
# Runs GPT-2 in Keras on TensorFlow
docker run --rm -p 5432:5432 -e MODEL_CONFIG=gpt2:gpt2_base_en lit-app:gcp-dev
```
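
Once the container is up, the model server can be exercised directly over HTTP. The endpoint names (`predict`, `salience`, `tokenize`) come from `constants.py` and the request shape from `model.py` below; the `prompt` field name is an assumption about `pd_constants.INPUT_SPEC`:

```shell
# Hypothetical request against the GPT-2 container started above.
curl -X POST http://localhost:5432/predict \
  -H "Content-Type: application/json" \
  -d '{"inputs": [{"prompt": "The quick brown fox"}]}'
```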
6 changes: 6 additions & 0 deletions lit_nlp/examples/gcp/constants.py
@@ -0,0 +1,6 @@
import enum

class LlmHTTPEndpoints(enum.Enum):
  GENERATE = 'predict'
  SALIENCE = 'salience'
  TOKENIZE = 'tokenize'
122 changes: 122 additions & 0 deletions lit_nlp/examples/gcp/model.py
@@ -0,0 +1,122 @@
"""Wrapper for connetecting to LLMs on GCP via the model_server HTTP API."""

from lit_nlp import app as lit_app
from lit_nlp.api import model as lit_model
from lit_nlp.api import types as lit_types
from lit_nlp.examples.gcp import constants as lit_gcp_constants
from lit_nlp.examples.prompt_debugging import constants as pd_constants
from lit_nlp.examples.prompt_debugging import utils as pd_utils
from lit_nlp.lib import serialize
import requests

"""
Plan for this module:

"""

_LlmHTTPEndpoints = lit_gcp_constants.LlmHTTPEndpoints

LLM_ON_GCP_INIT_SPEC: lit_types.Spec = {
    # Note that `new_name` is not actually passed to LlmOverHTTP but the
    # `/create_model` API will validate the config with a `new_name` in it.
    'new_name': lit_types.String(required=False),
    'base_url': lit_types.String(),
    'identity_token': lit_types.String(default=''),
    'max_concurrent_requests': lit_types.Integer(default=1),
    'max_qps': lit_types.Integer(default=25, required=False),
}


class LlmOverHTTP(lit_model.BatchedRemoteModel):
  """Calls an LLM served behind the model_server HTTP API in this PR."""

  def __init__(
      self,
      base_url: str,
      identity_token: str,
      endpoint: str | _LlmHTTPEndpoints,
      max_concurrent_requests: int = 4,
      max_qps: int | float = 25,
  ):
    super().__init__(max_concurrent_requests, max_qps)
    self.endpoint = _LlmHTTPEndpoints(endpoint)
    self.url = f'{base_url}/{self.endpoint.value}'
    self.identity_token = identity_token

  def input_spec(self) -> lit_types.Spec:
    input_spec = pd_constants.INPUT_SPEC

    if self.endpoint == _LlmHTTPEndpoints.SALIENCE:
      input_spec |= pd_constants.INPUT_SPEC_SALIENCE

    return input_spec

  def output_spec(self) -> lit_types.Spec:
    if self.endpoint == _LlmHTTPEndpoints.GENERATE:
      return (
          pd_constants.OUTPUT_SPEC_GENERATION
          | pd_constants.OUTPUT_SPEC_GENERATION_EMBEDDINGS
      )
    elif self.endpoint == _LlmHTTPEndpoints.SALIENCE:
      return pd_constants.OUTPUT_SPEC_SALIENCE
    else:
      return pd_constants.OUTPUT_SPEC_TOKENIZER

  def predict_minibatch(
      self, inputs: list[lit_types.JsonDict]
  ) -> list[lit_types.JsonDict]:
    """Runs prediction on a batch of inputs via the HTTP API.

    Args:
      inputs: sequence of inputs, following model.input_spec()

    Returns:
      list of outputs, following model.output_spec()
    """
    payload = {'inputs': inputs}
    headers = {
        'Authorization': f'Bearer {self.identity_token}',
        'Content-Type': 'application/json',
    }
    response = requests.post(
        self.url, headers=headers, data=serialize.to_json(payload, simple=True)
    )

    if not (200 <= response.status_code < 300):
      raise RuntimeError(
          f'Request to {self.url} failed with status'
          f' {response.status_code}: {response.text}'
      )

    outputs = serialize.from_json(response.text)
    return outputs


def initialize_model_group_for_salience(
    new_name: str, base_url: str, *args, **kw
) -> lit_model.ModelMap:
  """Creates '{name}' and '_{name}_salience' and '_{name}_tokenizer'."""
  salience_name, tokenizer_name = pd_utils.generate_model_group_names(new_name)

  generation_model = LlmOverHTTP(
      *args, base_url=base_url, endpoint=_LlmHTTPEndpoints.GENERATE, **kw
  )
  salience_model = LlmOverHTTP(
      *args, base_url=base_url, endpoint=_LlmHTTPEndpoints.SALIENCE, **kw
  )
  tokenizer_model = LlmOverHTTP(
      *args, base_url=base_url, endpoint=_LlmHTTPEndpoints.TOKENIZE, **kw
  )

  return {
      new_name: generation_model,
      salience_name: salience_model,
      tokenizer_name: tokenizer_model,
  }


def get_model_loaders() -> lit_app.ModelLoadersMap:
  return {
      'LLM (self hosted)': (
          initialize_model_group_for_salience,
          LLM_ON_GCP_INIT_SPEC,
      )
  }
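
For reviewers, a minimal usage sketch of the loader defined above, assuming a model server is already listening locally on the README's port; the model name and empty identity token are illustrative:

```python
from lit_nlp.examples.gcp import model as gcp_model

# Build the generation, salience, and tokenizer wrappers against a
# hypothetical local model server (port matches the README's docker run).
models = gcp_model.initialize_model_group_for_salience(
    new_name='gpt2',  # Illustrative; used to derive the group's model names.
    base_url='http://localhost:5432',
    identity_token='',  # Assumed empty for unauthenticated local testing.
)

# Contains 'gpt2' plus the derived salience and tokenizer entries from
# pd_utils.generate_model_group_names('gpt2').
print(sorted(models.keys()))
```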