Manifest updates (#81)
* squashed: modify filter for LoRA affinity

* update llm service and llm server pool yaml, readme

* remove unused method from metrics.go

* add flowchart image

* update flowchart image size

* remove image name

* update queueingThresholdLoRA to 50

* rollback filter related changes

* rollback filter related changes in docs

* addressing comments

* addressing comments
kaushikmitr authored Dec 10, 2024
1 parent ca47aa2 commit 83f701b
Showing 6 changed files with 158 additions and 45 deletions.
95 changes: 86 additions & 9 deletions examples/poc/manifests/llmservice.yaml
@@ -1,4 +1,14 @@
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: LLMServerPool
+metadata:
+  labels:
+  name: vllm-llama2-7b-pool
+spec:
+  targetPort: 8000
+  modelServerSelector:
+    "app": "vllm-llama2-7b-pool"
+---
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: LLMService
 metadata:
   labels:
@@ -7,17 +17,84 @@ metadata:
   name: llmservice-sample
 spec:
   models:
-  - name: sql-code-assist
-  - name: npc-bot
+  - name: sql-lora
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora
+      weight: 100
+  - name: sql-lora-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-0
+      weight: 100
+  - name: sql-lora-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-1
+      weight: 100
+  - name: sql-lora-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-2
+      weight: 100
+  - name: sql-lora-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-3
+      weight: 100
+  - name: sql-lora-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: sql-lora-4
+      weight: 100
+  - name: tweet-summary
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary
+      weight: 100
+  - name: tweet-summary-0
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-0
+      weight: 100
+  - name: tweet-summary-1
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-1
+      weight: 100
+  - name: tweet-summary-2
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-2
+      weight: 100
+  - name: tweet-summary-3
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-3
+      weight: 100
+  - name: tweet-summary-4
+    objective:
+      desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
+    targetModels:
+    - name: tweet-summary-4
+      weight: 100
+  - name: meta-llama/Llama-2-7b-hf
     objective:
       desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
     targetModels:
-    - name: npc-bot-v1
-      weight: 50
-    - name: npc-bot-v2
-      weight: 50
+    - name: meta-llama/Llama-2-7b-hf
+      weight: 100
   poolRef:
   - kind: LLMServerPool
-    name: test-pool
-  - name: gemini-pool
-    kind: LLMServerPool
+    name: vllm-llama2-7b-pool
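
Every model entry above routes 100% of its traffic to a single target, but the targetModels schema also supports weighted splits, as the removed npc-bot entry's 50/50 weighting shows. A minimal sketch of a weighted canary rollout under this schema; the 90/10 weights are hypothetical and not part of this commit:

# Hypothetical weighted split between two adapters registered above.
- name: tweet-summary
  objective:
    desiredAveragePerOutputTokenLatencyAtP95OverMultipleRequests: 50
  targetModels:
  - name: tweet-summary-0
    weight: 90   # stable adapter takes most traffic
  - name: tweet-summary-1
    weight: 10   # canary adapter takes the remainder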
23 changes: 19 additions & 4 deletions examples/poc/manifests/vllm/vllm-lora-deployment.yaml
@@ -1,17 +1,32 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama2-7b-pool
+spec:
+  selector:
+    app: vllm-llama2-7b-pool
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
+
+---
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: vllm
+  name: vllm-llama2-7b-pool
   namespace: default
 spec:
-  replicas: 6
+  replicas: 3
   selector:
     matchLabels:
-      app: vllm
+      app: vllm-llama2-7b-pool
   template:
     metadata:
       labels:
-        app: vllm
+        app: vllm-llama2-7b-pool
     spec:
       containers:
       - name: lora
12 changes: 0 additions & 12 deletions examples/poc/manifests/vllm/vllm-lora-service.yaml

This file was deleted.
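
The standalone Service manifest is deleted because the commit folds the Service into vllm-lora-deployment.yaml above. For routing to work, names and labels must line up across the manifests in this commit; a condensed sketch of that contract, with values taken from the diffs and comments that are editorial:

# The LLMServerPool's modelServerSelector, the Service's selector, and the
# Deployment's pod labels all use app=vllm-llama2-7b-pool; the ext proc's
# -serverPoolName and -serviceName flags reference the same name.
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: LLMServerPool
metadata:
  name: vllm-llama2-7b-pool
spec:
  targetPort: 8000                 # must match the Service/container port
  modelServerSelector:
    "app": "vllm-llama2-7b-pool"   # must match the Deployment's pod labels
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-llama2-7b-pool        # referenced by the ext proc -serviceName flag
spec:
  selector:
    app: vllm-llama2-7b-pool       # same label again
  ports:
  - port: 8000
    targetPort: 8000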

61 changes: 47 additions & 14 deletions pkg/manifests/ext_proc.yaml
@@ -1,44 +1,77 @@
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read
+rules:
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["llmservices"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["inference.networking.x-k8s.io"]
+  resources: ["llmserverpools"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["discovery.k8s.io"]
+  resources: ["endpointslices"]
+  verbs: ["get", "watch", "list"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: pod-read-binding
+subjects:
+- kind: ServiceAccount
+  name: default
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: pod-read
+---
+
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: instance-gateway-ext-proc
+  name: inference-gateway-ext-proc
   namespace: default
   labels:
-    app: instance-gateway-ext-proc
+    app: inference-gateway-ext-proc
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: instance-gateway-ext-proc
+      app: inference-gateway-ext-proc
   template:
     metadata:
       labels:
-        app: instance-gateway-ext-proc
+        app: inference-gateway-ext-proc
     spec:
       containers:
-      - name: instance-gateway-ext-proc
+      - name: inference-gateway-ext-proc
         # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/34) Update the image and args.
         image: <BUILT-IMAGE>
         args:
         # TODO(https://github.com/kubernetes-sigs/llm-instance-gateway/issues/12) Remove this once ext proc can dynamically reconcile on LLMServerPool.
-        - -pods
-        - "vllm-78665f78c4-h4kx4,vllm-78665f78c4-hnz84"
-        - -podIPs
-        - "10.24.11.6:8000,10.24.5.7:8000"
+        - -serverPoolName
+        - "vllm-llama2-7b-pool"
+        - -v
+        - "3"
+        - -serviceName
+        - "vllm-llama2-7b-pool"
         ports:
         - containerPort: 9002
+      - name: curl
+        image: curlimages/curl
+        command: ["sleep", "3600"]
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: instance-gateway-ext-proc
+  name: inference-gateway-ext-proc
   namespace: default
 spec:
   selector:
-    app: instance-gateway-ext-proc
+    app: inference-gateway-ext-proc
   ports:
   - protocol: TCP
     port: 9002
@@ -55,12 +88,12 @@ spec:
   - backendRefs:
     - group: ""
       kind: Service
-      name: instance-gateway-ext-proc
+      name: inference-gateway-ext-proc
       port: 9002
     processingMode:
       request:
         body: Buffered
-      response:
+      response: {}
      # The timeouts are likely not needed here. We can experiment with removing/tuning them slowly.
      # The connection limits are more important and will cause the opaque: ext_proc_gRPC_error_14 error in Envoy GW if not configured correctly.
       messageTimeout: 1000s
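
The processingMode block is what lets the ext proc pick a backend per request: buffering the request body gives it the "model" field before a backend is chosen, and the response stanza turns on response processing. Reading response: {} as "enabled with default, headers-only mode" is an assumption about Envoy Gateway semantics, not something this commit states:

processingMode:
  request:
    body: Buffered   # full request body is sent to the ext proc, so it can read "model"
  response: {}       # assumed: response processing enabled with Envoy Gateway defaults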
8 changes: 4 additions & 4 deletions pkg/manifests/gateway.yaml
@@ -3,9 +3,9 @@
 apiVersion: gateway.networking.k8s.io/v1
 kind: Gateway
 metadata:
-  name: <GATEWAY-NAME>
+  name: inference-gateway
 spec:
-  gatewayClassName: <GATEWAY-NAME>
+  gatewayClassName: inference-gateway
   listeners:
   - name: http
     protocol: HTTP
@@ -17,7 +17,7 @@ spec:
 apiVersion: gateway.networking.k8s.io/v1
 kind: GatewayClass
 metadata:
-  name: <GATEWAY-NAME>
+  name: inference-gateway
 spec:
   controllerName: gateway.envoyproxy.io/gatewayclass-controller
 ---
@@ -38,7 +38,7 @@ metadata:
   name: llm-route
 spec:
   parentRefs:
-  - name: <GATEWAY-NAME>
+  - name: inference-gateway
     sectionName: llm-gw
   rules:
   - backendRefs:
4 changes: 2 additions & 2 deletions pkg/manifests/patch_policy.yaml
@@ -7,7 +7,7 @@ spec:
   targetRef:
     group: gateway.networking.k8s.io
     kind: Gateway
-    name: <GATEWAY-NAME>
+    name: inference-gateway
   type: JSONPatch
   jsonPatches:
   # Necessary to create a cluster of the type: ORIGINAL_DST to allow for
@@ -36,7 +36,7 @@ spec:
           max_requests: 40000
 
   - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration"
-    name: default/<GATEWAY-NAME>/llm-gw
+    name: default/inference-gateway/llm-gw
     operation:
       op: replace
       path: "/virtual_hosts/0/routes/0/route/cluster"
