From c1be68fb95c4f0bc7534ee25babd8d2a408a078d Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 29 Jan 2025 15:25:44 +0000
Subject: [PATCH 1/6] remove HF token from .env in tt-studio

---
 app/.env.default | 1 -
 1 file changed, 1 deletion(-)

diff --git a/app/.env.default b/app/.env.default
index 6b6f24ba..db81d8fe 100644
--- a/app/.env.default
+++ b/app/.env.default
@@ -6,4 +6,3 @@ VLLM_LLAMA31_ENV_FILE=""
 # SECURITY WARNING: keep these secret in production!
 JWT_SECRET=test-secret-456
 DJANGO_SECRET_KEY=django-insecure-default
-HF_TOKEN= # Get this from Hugging Face

From aed2b297dc37d7e7e73775298e150c7304e7b217 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 29 Jan 2025 15:31:25 +0000
Subject: [PATCH 2/6] startup.sh makes HOST_PERSISTENT_STORAGE_VOLUME if it doesn't exist

---
 startup.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/startup.sh b/startup.sh
index 7eb6e04a..54c59ec5 100755
--- a/startup.sh
+++ b/startup.sh
@@ -156,8 +156,16 @@ else
     exit 1
 fi
 
-# Step 2: Source env vars
+# Step 2: Source env vars, ensure directories
 source "${ENV_FILE_PATH}"
+# make persistent volume on host with user permissions
+if [ ! -d "$HOST_PERSISTENT_STORAGE_VOLUME" ]; then
+    mkdir "$HOST_PERSISTENT_STORAGE_VOLUME"
+    if [ $? -ne 0 ]; then
+        echo "⛔ Error: Failed to create directory $HOST_PERSISTENT_STORAGE_VOLUME"
+        exit 1
+    fi
+fi
 
 # Step 3: Check if the Docker network already exists
 NETWORK_NAME="tt_studio_network"

From 3a55bbb31153ba1d67131d9bfe45a788b077c3e2 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 29 Jan 2025 15:32:08 +0000
Subject: [PATCH 3/6] startup.sh uses set -euo pipefail for safety

---
 startup.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/startup.sh b/startup.sh
index 54c59ec5..ab86be0a 100755
--- a/startup.sh
+++ b/startup.sh
@@ -4,6 +4,8 @@
 #
 # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
 
+set -euo pipefail  # Exit on error, treat unset variables as errors, and fail if any command in a pipeline fails
+
 # Define setup script path
 SETUP_SCRIPT="./setup.sh"

From 4fa20eed8eff28183eb43c61afa55b3934755c61 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 29 Jan 2025 16:37:54 +0000
Subject: [PATCH 4/6] remove HF_TOKEN from app/docker-compose.yml

---
 app/docker-compose.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/app/docker-compose.yml b/app/docker-compose.yml
index 76895274..3ae07cba 100644
--- a/app/docker-compose.yml
+++ b/app/docker-compose.yml
@@ -40,7 +40,6 @@ services:
       - BACKEND_API_HOSTNAME
       - VLLM_LLAMA31_ENV_FILE
       - JWT_SECRET
-      - HF_TOKEN
     volumes:
       # mounting docker unix socket allows for backend container to run docker cmds
       - /var/run/docker.sock:/var/run/docker.sock

From ea577a70d3094245e10ba2218328674879a69aca Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Wed, 29 Jan 2025 20:05:00 +0000
Subject: [PATCH 5/6] remove VLLM_LLAMA31_ENV_FILE, now redundant

---
 app/api/shared_config/backend_config.py | 2 --
 app/docker-compose.yml                  | 1 -
 2 files changed, 3 deletions(-)

diff --git a/app/api/shared_config/backend_config.py b/app/api/shared_config/backend_config.py
index 88cd722c..b6f6a56c 100644
--- a/app/api/shared_config/backend_config.py
+++ b/app/api/shared_config/backend_config.py
@@ -18,7 +18,6 @@ class BackendConfig:
     weights_dir: str
     model_container_cache_root: str
     jwt_secret: str
-    hf_token: str
 
 
 # environment variables are ideally terminated on import to fail-fast and provide obvious
@@ -36,7 +35,6 @@ class BackendConfig:
     weights_dir="model_weights",
     model_container_cache_root="/home/user/cache_root",
     jwt_secret=os.environ["JWT_SECRET"],
-    hf_token=os.environ["HF_TOKEN"],
 )
 
 # make backend volume if not existing
diff --git a/app/docker-compose.yml b/app/docker-compose.yml
index 3ae07cba..e2ffb8ce 100644
--- a/app/docker-compose.yml
+++ b/app/docker-compose.yml
@@ -38,7 +38,6 @@ services:
       - HOST_PERSISTENT_STORAGE_VOLUME
       - INTERNAL_PERSISTENT_STORAGE_VOLUME
       - BACKEND_API_HOSTNAME
-      - VLLM_LLAMA31_ENV_FILE
       - JWT_SECRET
     volumes:
       # mounting docker unix socket allows for backend container to run docker cmds

From c4cc18d0d8381b617e31dbf6a6c29e1b33fe43c6 Mon Sep 17 00:00:00 2001
From: Tom Stesco
Date: Fri, 31 Jan 2025 05:28:32 +0000
Subject: [PATCH 6/6] Adding Llama 3.x integration using new setup.sh and LLM code base

* support multiple models using same container, adds support for MODEL_ID environment variable in tt-inference-server.
* update volume initialization for new file permissions strategy
* add SetupTypes to handle different first run and validation behaviour
* hf_model_id is used to define model_id and model_name if provided (rename hf_model_path to hf_model_id)
* /home/user/cache_root changed to /home/container_app_user/cache_root
* fix get_devices_mounts, add mapping
* use MODEL_ID if in container env_vars to map to impl model config
* set defaults for ModelImpl
* add configs for llama 3.x models
* remove HF_TOKEN from tt-studio .env for ease of setup
* add environment file processing
---
 app/api/docker_control/docker_utils.py  |  46 +++--
 app/api/model_control/apps.py           |   2 +-
 app/api/model_control/views.py          |   2 +-
 app/api/shared_config/backend_config.py |   2 +-
 app/api/shared_config/device_config.py  |   3 +
 app/api/shared_config/model_config.py   | 216 +++++++++++++++---------
 app/api/shared_config/setup_config.py   |   6 +
 7 files changed, 177 insertions(+), 100 deletions(-)
 create mode 100644 app/api/shared_config/setup_config.py

diff --git a/app/api/docker_control/docker_utils.py b/app/api/docker_control/docker_utils.py
index c9d1cc56..163f4ce5 100644
--- a/app/api/docker_control/docker_utils.py
+++ b/app/api/docker_control/docker_utils.py
@@ -35,7 +35,9 @@ def run_container(impl, weights_id):
     logger.info(f"run_container called for {impl.model_name}")
     run_kwargs = copy.deepcopy(impl.docker_config)
     # handle runtime configuration changes to docker kwargs
-    run_kwargs.update({"devices": get_devices_mounts(impl)})
+    device_mounts = get_devices_mounts(impl)
+    if device_mounts:
+        run_kwargs.update({"devices": device_mounts})
     run_kwargs.update({"ports": get_port_mounts(impl)})
     # add bridge inter-container network
     run_kwargs.update({"network": backend_config.docker_bridge_network_name})
@@ -87,14 +89,18 @@ def get_devices_mounts(impl):
     device_config = get_runtime_device_configuration(impl.device_configurations)
     assert isinstance(device_config, DeviceConfigurations)
     # TODO: add logic to handle multiple devices and multiple containers
-    # e.g. running falcon-7B and mistral-7B on 2x n150 machine
-    if device_config in {DeviceConfigurations.N150, DeviceConfigurations.E150}:
-        devices = ["/dev/tenstorrent/0:/dev/tenstorrent/0"]
-    elif device_config == DeviceConfigurations.N300x4:
-        devices = ["/dev/tenstorrent:/dev/tenstorrent"]
-    elif device_config == DeviceConfigurations.CPU:
-        devices = None
-    return devices
+    single_device_mounts = ["/dev/tenstorrent/0:/dev/tenstorrent/0"]
+    all_device_mounts = ["/dev/tenstorrent:/dev/tenstorrent"]
+    device_map = {
+        DeviceConfigurations.E150: single_device_mounts,
+        DeviceConfigurations.N150: single_device_mounts,
+        DeviceConfigurations.N150_WH_ARCH_YAML: single_device_mounts,
+        DeviceConfigurations.N300: single_device_mounts,
+        DeviceConfigurations.N300x4_WH_ARCH_YAML: all_device_mounts,
+        DeviceConfigurations.N300x4: all_device_mounts,
+    }
+    device_mounts = device_map.get(device_config)
+    return device_mounts
 
 
 def get_port_mounts(impl):
@@ -187,15 +193,19 @@ def get_container_status():
 def update_deploy_cache():
     data = get_container_status()
     for con_id, con in data.items():
-        model_impl = [
-            v
-            for k, v in model_implmentations.items()
-            if v.image_version == con["image_name"]
-        ]
-        assert (
-            len(model_impl) == 1
-        ), f"Cannot find model_impl={model_impl} for {con['image_name']}"
-        model_impl = model_impl[0]
+        con_model_id = con['env_vars'].get("MODEL_ID")
+        model_impl = model_implmentations.get(con_model_id)
+        if not model_impl:
+            # fallback to finding first impl that uses that container
+            model_impl = [
+                v
+                for k, v in model_implmentations.items()
+                if v.image_version == con["image_name"]
+            ]
+            assert (
+                len(model_impl) == 1
+            ), f"Cannot find model_impl={model_impl} for {con['image_name']}"
+            model_impl = model_impl[0]
         con["model_id"] = model_impl.model_id
         con["weights_id"] = con["env_vars"].get("MODEL_WEIGHTS_ID")
         con["model_impl"] = model_impl
diff --git a/app/api/model_control/apps.py b/app/api/model_control/apps.py
index b310145a..e7a0b543 100644
--- a/app/api/model_control/apps.py
+++ b/app/api/model_control/apps.py
@@ -19,4 +19,4 @@ def ready(self):
         # run once
         logger.info("Initializing models API")
         for model_id, impl in model_implmentations.items():
-            impl.init_volumes()
+            impl.setup()
diff --git a/app/api/model_control/views.py b/app/api/model_control/views.py
index b4b4e1af..7123819e 100644
--- a/app/api/model_control/views.py
+++ b/app/api/model_control/views.py
@@ -38,7 +38,7 @@ def post(self, request, *args, **kwargs):
             internal_url = "http://" + deploy["internal_url"]
             logger.info(f"internal_url:= {internal_url}")
             logger.info(f"using vllm model:= {deploy["model_impl"].model_name}")
-            data["model"] = deploy["model_impl"].hf_model_path
+            data["model"] = deploy["model_impl"].hf_model_id
             response_stream = stream_response_from_external_api(internal_url, data)
             return StreamingHttpResponse(response_stream, content_type="text/plain")
         else:
diff --git a/app/api/shared_config/backend_config.py b/app/api/shared_config/backend_config.py
index b6f6a56c..555b205b 100644
--- a/app/api/shared_config/backend_config.py
+++ b/app/api/shared_config/backend_config.py
@@ -33,7 +33,7 @@ class BackendConfig:
     django_deploy_cache_name="deploy_cache",
     docker_bridge_network_name="tt_studio_network",
     weights_dir="model_weights",
-    model_container_cache_root="/home/user/cache_root",
+    model_container_cache_root="/home/container_app_user/cache_root",
     jwt_secret=os.environ["JWT_SECRET"],
 )
 
diff --git a/app/api/shared_config/device_config.py b/app/api/shared_config/device_config.py
index e4035204..151b1433 100644
--- a/app/api/shared_config/device_config.py
+++ b/app/api/shared_config/device_config.py
@@ -10,6 +10,9 @@ class DeviceConfigurations(Enum):
     CPU = auto()
     E150 = auto()
     N150 = auto()
+    N300 = auto()
+    T3K_RING = auto()
+    T3K_LINE = auto()
     N150_WH_ARCH_YAML = auto()
     N300x4 = auto()
     N300x4_WH_ARCH_YAML = auto()
diff --git a/app/api/shared_config/model_config.py b/app/api/shared_config/model_config.py
index 590ea2a9..e33c3a33 100644
--- a/app/api/shared_config/model_config.py
+++ b/app/api/shared_config/model_config.py
@@ -9,6 +9,7 @@
 
 from shared_config.device_config import DeviceConfigurations
 from shared_config.backend_config import backend_config
+from shared_config.setup_config import SetupTypes
 from shared_config.logger_config import get_logger
 
 logger = get_logger(__name__)
@@ -16,18 +17,24 @@
 
 
 def load_dotenv_dict(env_path: Union[str, Path]) -> Dict[str, str]:
+    if not env_path:
+        return {}
+
+    # instead, use tt-studio configured JWT_SECRET
+    excluded_keys = ["JWT_SECRET"]
     env_path = Path(env_path)
     if not env_path.exists():
         logger.error(f"Env file not found: {env_path}")
         env_dict = {}
+    logger.info(f"Using env file: {env_path}")
     with open(env_path) as f:
         lines = f.readlines()
     for line in lines:
         if line.strip() and not line.startswith('#'):
             key, value = line.strip().split('=', 1)
             # expand any $VAR or ${VAR} and ~
-            value = os.path.expandvars(value)
-            env_dict[key] = value
+            if key not in excluded_keys:
+                env_dict[key] = value
     return env_dict
@@ -37,25 +44,29 @@ class ModelImpl:
     """Model implementation configuration defines everything known about a model
     implementations before runtime, e.g. not handling ports, available devices"""
 
-    model_name: str
-    model_id: str
     image_name: str
     image_tag: str
     device_configurations: Set["DeviceConfigurations"]
     docker_config: Dict[str, Any]
-    user_uid: int  # user inside docker container uid (for file permissions)
-    user_gid: int  # user inside docker container gid (for file permissions)
-    shm_size: str
-    service_port: int
     service_route: str
+    setup_type: SetupTypes
+    hf_model_id: str = None
+    model_name: str = None  # uses defaults based on hf_model_id
+    model_id: str = None  # uses defaults based on hf_model_id
+    impl_id: str = "tt-metal"  # implementation ID
+    version: str = "0.0.1"
+    shm_size: str = "32G"
+    service_port: int = 7000
     env_file: str = ""
     health_route: str = "/health"
-    hf_model_path: str = ""
 
     def __post_init__(self):
+        # _init methods compute values that are dependent on other values
+        self._init_model_name()
+
         self.docker_config.update({"volumes": self.get_volume_mounts()})
         self.docker_config["shm_size"] = self.shm_size
-        self.docker_config["environment"]["HF_MODEL_PATH"] = self.hf_model_path
+        self.docker_config["environment"]["HF_MODEL_PATH"] = self.hf_model_id
         self.docker_config["environment"]["HF_HOME"] = Path(
             backend_config.model_container_cache_root
         ).joinpath("huggingface")
@@ -64,14 +75,6 @@
         if DeviceConfigurations.N150 in self.device_configurations or DeviceConfigurations.N300x4 in self.device_configurations:
             self.docker_config["environment"]["WH_ARCH_YAML"] = "wormhole_b0_80_arch_eth_dispatch.yaml"
 
-        if self.env_file:
-            logger.info(f"Using env file: {self.env_file}")
-            # env file should be in persistent volume mounted
-            env_dict = load_dotenv_dict(self.env_file)
-            # env file overrides any existing docker environment variables
-            self.docker_config["environment"].update(env_dict)
-
-
         # Set environment variable if N150_WH_ARCH_YAML or N300x4_WH_ARCH_YAML is in the device configurations
         if (
             DeviceConfigurations.N150_WH_ARCH_YAML in self.device_configurations
@@ -81,12 +84,16 @@
                 "wormhole_b0_80_arch_eth_dispatch.yaml"
             )
 
-        if self.env_file:
-            logger.info(f"Using env file: {self.env_file}")
-            # env file should be in persistent volume mounted
-            env_dict = load_dotenv_dict(self.env_file)
-            # env file overrides any existing docker environment variables
-            self.docker_config["environment"].update(env_dict)
+        # model env file must be interpreted here
+        if not self.env_file:
+            _env_file = self.get_model_env_file()
+        else:
+            _env_file = self.env_file
+
+        # env file should be in persistent volume mounted
+        env_dict = load_dotenv_dict(_env_file)
+        # env file overrides any existing docker environment variables
+        self.docker_config["environment"].update(env_dict)
 
         # Set environment variable if N150_WH_ARCH_YAML or N300x4_WH_ARCH_YAML is in the device configurations
         if (
@@ -97,13 +104,6 @@
                 "wormhole_b0_80_arch_eth_dispatch.yaml"
             )
 
-        if self.env_file:
-            logger.info(f"Using env file: {self.env_file}")
-            # env file should be in persistent volume mounted
-            env_dict = load_dotenv_dict(self.env_file)
-            # env file overrides any existing docker environment variables
-            self.docker_config["environment"].update(env_dict)
-
     @property
     def image_version(self) -> str:
         return f"{self.image_name}:{self.image_tag}"
@@ -143,6 +143,36 @@ def model_container_weights_dir(self) -> Path:
     def backend_hf_home(self) -> Path:
         return self.backend_weights_dir.joinpath("huggingface")
 
+    def _init_model_name(self):
+        # Note: ONLY run this in __post_init__
+        # need to use __setattr__ because instance is frozen
+        assert self.hf_model_id or self.model_name, "either hf_model_id or model_name must be set."
+        if not self.model_name:
+            # use basename of HF model ID to use same format as tt-transformers
+            object.__setattr__(self, 'model_name', Path(self.hf_model_id).name)
+        if not self.model_id:
+            object.__setattr__(self, 'model_id', self.get_default_model_id())
+        if not self.hf_model_id:
+            logger.info(f"model_name:={self.model_name} does not have a hf_model_id set")
+
+    def get_default_model_id(self):
+        return f"id_{self.impl_id}-{self.model_name}-v{self.version}"
+
+    def get_model_env_file(self):
+        ret_env_file = None
+        model_env_dir_name = "model_envs"
+        model_env_dir = Path(backend_config.persistent_storage_volume).joinpath(model_env_dir_name)
+        if model_env_dir.exists():
+            env_fname = f"{self.model_name}.env"
+            model_env_fpath = model_env_dir.joinpath(env_fname)
+            if model_env_fpath.exists():
+                ret_env_file = model_env_fpath
+            else:
+                logger.warning(f"for model {self.model_name} env file: {model_env_fpath} does not exist, have you run tt-inference-server setup.sh for the model?")
+        else:
+            logger.warning(f"{model_env_dir} does not exist, have you run tt-inference-server setup.sh?")
+        return ret_env_file
+
     def get_volume_mounts(self):
         # use type=volume for persistent storage with a Docker managed named volume
         # target: this should be set to same location as the CACHE_ROOT environment var
@@ -156,14 +186,24 @@ def get_volume_mounts(self):
         }
         return volume_mounts
 
+    def setup(self):
+        # verify model setup and runtime setup
+        self.init_volumes()
+
     def init_volumes(self):
-        # need to make directory in app backend container to allow for correct perimission to be set
-        self.volume_path.mkdir(parents=True, exist_ok=True)
-        os.chown(self.volume_path, uid=self.user_uid, gid=self.user_gid)
-        self.backend_weights_dir.mkdir(parents=True, exist_ok=True)
-        os.chown(self.backend_weights_dir, uid=self.user_uid, gid=self.user_gid)
-        # self.backend_hf_home.mkdir(parents=True, exist_ok=True)
-        # os.chown(self.backend_hf_home, uid=self.user_uid, gid=self.user_gid)
+        # check volumes
+        if self.setup_type == SetupTypes.TT_INFERENCE_SERVER:
+            if self.volume_path.exists():
+                logger.info(f"Found {self.volume_path}")
+            else:
+                logger.info(f"Model volume does not exist: {self.volume_path}")
+                logger.error(f"Initialize this model by running the tt-inference-server setup.sh script")
+        elif self.setup_type == SetupTypes.MAKE_VOLUMES:
+            if not self.volume_path.exists():
+                # if no setup is required for the model, the backend can make the volume
+                self.volume_path.mkdir(parents=True, exist_ok=True)
+        elif self.setup_type == SetupTypes.NO_SETUP:
+            logger.info(f"Model {self.model_id} does not require a volume")
 
     def asdict(self):
         return asdict(self)
@@ -172,14 +212,12 @@ def asdict(self):
 
 def base_docker_config():
     return {
         # Note: mounts and devices are determined in `docker_utils.py`
-        "user": "user",
         "auto_remove": True,
         "cap_add": "ALL",  # TODO: add minimal permissions
         "detach": True,
         "environment": {
             "JWT_SECRET": backend_config.jwt_secret,
             "CACHE_ROOT": backend_config.model_container_cache_root,
-            "HF_TOKEN": backend_config.hf_token,
         },
     }
@@ -194,71 +232,91 @@
         image_tag="v0.0.1-tt-metal-65d246482b3f",
         device_configurations={DeviceConfigurations.N150},
         docker_config=base_docker_config(),
-        user_uid=1000,
-        user_gid=1000,
         shm_size="32G",
         service_port=7000,
         service_route="/objdetection_v2",
+        setup_type=SetupTypes.NO_SETUP,
     ),
     ModelImpl(
+        hf_model_id="meta-llama/Llama-3.1-70B-Instruct",
         model_name="Mock-Llama-3.1-70B-Instruct",
         model_id="id_mock_vllm_modelv0.0.1",
         image_name="ghcr.io/tenstorrent/tt-inference-server/mock.vllm.openai.api",
         image_tag="v0.0.1-tt-metal-385904186f81-384f1790c3be",
-        hf_model_path="meta-llama/Llama-3.1-70B-Instruct",
         device_configurations={DeviceConfigurations.CPU},
         docker_config=base_docker_config(),
-        user_uid=1000,
-        user_gid=1000,
         shm_size="1G",
         service_port=7000,
         service_route="/v1/chat/completions",
+        setup_type=SetupTypes.MAKE_VOLUMES,
     ),
     ModelImpl(
-        model_name="Falcon-7B-Instruct",
-        model_id="id_tt-metal-falcon-7bv0.0.13",
-        image_name="tt-metal-falcon-7b",
-        image_tag="v0.0.13",
-        device_configurations={DeviceConfigurations.N150_WH_ARCH_YAML},
-        hf_model_path="tiiuae/falcon-7b-instruct",
-        docker_config=base_docker_config(),
-        user_uid=1000,
-        user_gid=1000,
-        shm_size="32G",
-        service_port=7000,
-        service_route="/inference/falcon7b",
-    ),
-    ModelImpl(
-        model_name="Llama-3.1-70B-Instruct",
-        model_id="id_tt-metal-llama-3.1-70b-instructv0.0.1",
+        hf_model_id="meta-llama/Llama-3.1-70B-Instruct",
         image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm",
         image_tag="v0.0.3-tt-metal-385904186f81-384f1790c3be",
-        hf_model_path="meta-llama/Llama-3.1-70B-Instruct",
         device_configurations={DeviceConfigurations.N300x4_WH_ARCH_YAML},
         docker_config=base_docker_config(),
-        user_uid=1000,
-        user_gid=1000,
         shm_size="32G",
         service_port=7000,
         service_route="/v1/chat/completions",
         env_file=os.environ.get("VLLM_LLAMA31_ENV_FILE"),
+        setup_type=SetupTypes.TT_INFERENCE_SERVER,
+    ),
+    ModelImpl(
+        hf_model_id="meta-llama/Llama-3.2-1B-Instruct",
+        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64",
+        image_tag="v0.0.1-47fb1a2fb6e0-2f33504bad49",
+        device_configurations={DeviceConfigurations.N300x4},
+        docker_config=base_docker_config(),
+        service_route="/v1/chat/completions",
+        setup_type=SetupTypes.TT_INFERENCE_SERVER,
+    ),
+    ModelImpl(
+        hf_model_id="meta-llama/Llama-3.2-3B-Instruct",
+        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64",
+        image_tag="v0.0.1-47fb1a2fb6e0-2f33504bad49",
+        device_configurations={DeviceConfigurations.N300x4},
+        docker_config=base_docker_config(),
+        service_route="/v1/chat/completions",
+        setup_type=SetupTypes.TT_INFERENCE_SERVER,
+    ),
+    ModelImpl(
+        hf_model_id="meta-llama/Llama-3.1-8B-Instruct",
+        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64",
+        image_tag="v0.0.1-47fb1a2fb6e0-2f33504bad49",
+        device_configurations={DeviceConfigurations.N300x4},
+        docker_config=base_docker_config(),
+        service_route="/v1/chat/completions",
+        setup_type=SetupTypes.TT_INFERENCE_SERVER,
+    ),
+    ModelImpl(
+        hf_model_id="meta-llama/Llama-3.2-11B-Vision-Instruct",
+        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64",
+        image_tag="v0.0.1-47fb1a2fb6e0-2f33504bad49",
+        device_configurations={DeviceConfigurations.N300x4},
+        docker_config=base_docker_config(),
+        service_route="/v1/chat/completions",
+        setup_type=SetupTypes.TT_INFERENCE_SERVER,
+    ),
+    ModelImpl(
+        hf_model_id="meta-llama/Llama-3.1-70B-Instruct",
+        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64",
+        image_tag="v0.0.1-47fb1a2fb6e0-2f33504bad49",
+        device_configurations={DeviceConfigurations.N300x4},
+        docker_config=base_docker_config(),
+        service_route="/v1/chat/completions",
+        setup_type=SetupTypes.TT_INFERENCE_SERVER,
+    ),
+    ModelImpl(
+        hf_model_id="meta-llama/Llama-3.3-70B-Instruct",
+        image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-llama3-src-dev-ubuntu-20.04-amd64",
+        image_tag="v0.0.1-47fb1a2fb6e0-2f33504bad49",
+        device_configurations={DeviceConfigurations.N300x4},
+        docker_config=base_docker_config(),
+        service_route="/v1/chat/completions",
+        setup_type=SetupTypes.TT_INFERENCE_SERVER,
     ),
     #! Add new model vLLM model implementations here
-    # ModelImpl(
-    #     model_name="",  #? Add the model name for the vLLM model based on persistent storage
-    #     model_id="",  #? Add the model id for the vLLM model based on persistent storage
-    #     image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-llama3-70b-src-base-vllm",
-    #     image_tag="v0.0.1-tt-metal-685ef1303b5a-54b9157d852b",
-    #     hf_model_path="meta-llama/Llama-3.1-70B-Instruct",
-    #     device_configurations={DeviceConfigurations.N300x4},
-    #     docker_config=base_docker_config(),
-    #     user_uid=1000,
-    #     user_gid=1000,
-    #     shm_size="32G",
-    #     service_port=7000,
-    #     service_route="/inference/**",  #? Add the correct route for the vLLM model
-    #     env_file=os.environ.get("VLLM_LLAMA31_ENV_FILE"),
-    # )
 ]
 
 def validate_model_implemenation_config(impl):
diff --git a/app/api/shared_config/setup_config.py b/app/api/shared_config/setup_config.py
new file mode 100644
index 00000000..c89a966f
--- /dev/null
+++ b/app/api/shared_config/setup_config.py
@@ -0,0 +1,6 @@
+from enum import IntEnum, auto
+
+class SetupTypes(IntEnum):
+    NO_SETUP = auto()  # 1
+    MAKE_VOLUMES = auto()  # 2
+    TT_INFERENCE_SERVER = auto()  # 3
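
With the defaults added to ModelImpl in PATCH 6, the new Llama 3.x entries only need hf_model_id, an image, a device configuration, and a setup_type; model_name, model_id, and the env file location are derived. Below is a minimal, standalone Python sketch of that derivation for reviewers. It mirrors _init_model_name, get_default_model_id, and get_model_env_file from the patch, but it is not the actual module, and the persistent-storage path used in the example is a placeholder, not a value taken from this series.

from pathlib import Path

IMPL_ID = "tt-metal"  # ModelImpl.impl_id default
VERSION = "0.0.1"     # ModelImpl.version default

def derive_names(hf_model_id: str) -> tuple[str, str]:
    # model_name defaults to the basename of the HF model ID (same format as tt-transformers)
    model_name = Path(hf_model_id).name
    # model_id default mirrors get_default_model_id()
    model_id = f"id_{IMPL_ID}-{model_name}-v{VERSION}"
    return model_name, model_id

def model_env_file(persistent_storage_volume: str, model_name: str) -> Path:
    # env files are looked up at <persistent_storage_volume>/model_envs/<model_name>.env,
    # which the tt-inference-server setup.sh script is expected to have written
    return Path(persistent_storage_volume) / "model_envs" / f"{model_name}.env"

if __name__ == "__main__":
    name, model_id = derive_names("meta-llama/Llama-3.2-3B-Instruct")
    print(name)      # Llama-3.2-3B-Instruct
    print(model_id)  # id_tt-metal-Llama-3.2-3B-Instruct-v0.0.1
    print(model_env_file("/path/to/tt_studio_persistent_volume", name))

If the env file is missing, the backend only logs a warning and starts the container without it (JWT_SECRET is always excluded and taken from tt-studio's own configuration), which is why running the tt-inference-server setup.sh script beforehand matters for SetupTypes.TT_INFERENCE_SERVER models.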