diff --git a/.dockerignore b/.dockerignore index b0c8185..23e3147 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,7 +3,7 @@ .github .vscode .gitignore -Dockerfile +*Dockerfile README.md node_modules package-lock.json @@ -11,4 +11,5 @@ pnpm-lock.yaml eslint.config.mjs LICENSE volumes -docker-compose.yaml \ No newline at end of file +docker-compose* +Makefile \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index f96d875..bfeb2ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,8 +3,6 @@ WORKDIR /app COPY . . HEALTHCHECK --interval=300s --timeout=30s --start-period=5s --retries=3 CMD [ "node", "healthy-check.js" ] -# RUN npm install -g pnpm && pnpm install -RUN npm install -g pnpm nodemon && pnpm install +RUN npm install -g pnpm && pnpm install EXPOSE 8000 -# ENTRYPOINT [ "npm", "start" ] -ENTRYPOINT [ "npm", "run", "dev" ] \ No newline at end of file +ENTRYPOINT [ "npm", "start" ] \ No newline at end of file diff --git a/Makefile b/Makefile index 50a90b2..d9490d5 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,7 @@ model-prepare: @mkdir -p $(MODEL_SAVE_PATH) && [ -f $(MODEL_SAVE_PATH)/$(LANGUAGE_MODEL_NAME) ] || wget -O $(MODEL_SAVE_PATH)/$(LANGUAGE_MODEL_NAME) $(LANGUAGE_MODEL_URL) @mkdir -p $(MODEL_SAVE_PATH) && [ -f $(MODEL_SAVE_PATH)/$(EMBEDDING_MODEL_NAME) ] || wget -O $(MODEL_SAVE_PATH)/$(EMBEDDING_MODEL_NAME) $(EMBEDDING_MODEL_URL) +# normal build & up .PHONY: compose-build compose-build: env model-prepare @docker compose -f docker-compose.yaml build @@ -61,3 +62,17 @@ compose-build: env model-prepare .PHONY: up up: compose-build @docker compose -f docker-compose.yaml up -d + +# dev build & up +.PHONY: compose-build-dev +compose-build-dev: env model-prepare + @docker compose -f docker-compose-dev.yaml build + +.PHONY: dev +dev: env model-prepare + @docker compose -f docker-compose-dev.yaml up -d + +# stop +.PHONY: stop +stop: + docker compose stop \ No newline at end of file diff --git a/README.md b/README.md index 1bb243f..8497328 100644 
--- a/README.md +++ b/README.md @@ -32,7 +32,12 @@ make build # if you want to start only this project in docker, please run make start # PLEASE NOTE: make start will automatically run make build first + +# to run a container bound to your local machine volume, run +make dev +# this will do the same thing as `make up` but allows you to make changes and sync them with the container ``` +**NOTE:** `make dev` requires a Node.js environment installed, or at least the `node_modules` specified in `package.json` installed on your server. Please see the [Local Machine](#local-machine) section. ## Lint To start lint your code, simply run @@ -41,4 +46,5 @@ npm run lint ``` ## Monitor -This project got monitor build with swagger-stats, when you got this project running, just go to `:/stats` \ No newline at end of file +This project has monitoring built with swagger-stats; once you have this project running, just go to `:/stats`. +For example, [http://localhost:8000/stats](http://localhost:8000/stats) \ No newline at end of file diff --git a/actions/inference.js b/actions/inference.js index 3463936..32258b5 100644 --- a/actions/inference.js +++ b/actions/inference.js @@ -44,6 +44,8 @@ function generateResponseContent(id, object, model, system_fingerprint, stream, return resp; } +const default_stop_keywords = ['### user:'] + export async function chatCompletion(req, res) { const api_key = (req.headers.authorization || '').split('Bearer ').pop(); if(!api_key) { return; } - const system_fingerprint = generateFingerprint(); - let {messages, ...request_body} = req.body; + let {messages, max_tokens, ...request_body} = req.body; + + // format the OpenAI-style request into llama.cpp's input format request_body.prompt = formatOpenAIContext(messages); + if(max_tokens) request_body.n_predict = max_tokens; + if(!request_body.stop) request_body.stop = [...default_stop_keywords]; + + // extra response metadata + const system_fingerprint = generateFingerprint(); const model = 
request_body.model || process.env.LANGUAGE_MODEL_NAME if(request_body.stream) { diff --git a/devDockerfile b/devDockerfile new file mode 100644 index 0000000..ec3cc1d --- /dev/null +++ b/devDockerfile @@ -0,0 +1,8 @@ +FROM node:20.15.1-slim +WORKDIR /app +COPY . . + +HEALTHCHECK --interval=300s --timeout=30s --start-period=5s --retries=3 CMD [ "node", "healthy-check.js" ] +RUN npm install -g pnpm nodemon && pnpm install +EXPOSE 8000 +ENTRYPOINT [ "npm", "run", "dev" ] \ No newline at end of file diff --git a/docker-compose-dev.yaml b/docker-compose-dev.yaml new file mode 100644 index 0000000..d34616f --- /dev/null +++ b/docker-compose-dev.yaml @@ -0,0 +1,48 @@ +services: + llamacpp: + container_name: ${INFERENCE_ENG} + image: gclub/llama.cpp:${INFERENCE_ENG_VERSION} + restart: always + deploy: # https://github.com/compose-spec/compose-spec/blob/master/deploy.md + resources: + reservations: + cpus: "${NUM_CPU_CORES}" + volumes: + - "${DOCKER_VOLUME_DIRECTORY:-.}/${MODEL_SAVE_PATH}:/models" + expose: + - ${ENG_ACCESS_PORT} + ports: + - ${INFERENCE_ENG_PORT}:${ENG_ACCESS_PORT} + command: ["-m", "models/${LANGUAGE_MODEL_NAME}","-c","8192"] + + embedding_eng: + container_name: ${EMBEDDING_ENG} + image: gclub/llama.cpp:${INFERENCE_ENG_VERSION} + restart: always + deploy: # https://github.com/compose-spec/compose-spec/blob/master/deploy.md + resources: + reservations: + cpus: "${NUM_CPU_CORES_EMBEDDING}" + volumes: + - "${DOCKER_VOLUME_DIRECTORY:-.}/${MODEL_SAVE_PATH}:/models" + expose: + - ${ENG_ACCESS_PORT} + ports: + - ${EMBEDDING_ENG_PORT}:${ENG_ACCESS_PORT} + command: ["-m", "models/${EMBEDDING_MODEL_NAME}","--embeddings","--pooling","mean","-c","512"] + + voyager: + container_name: voyager + restart: always + build: + dockerfile: devDockerfile + context: . 
+ volumes: + - .:/app + expose: + - ${APP_PORT} + ports: + - ${APP_PORT}:${APP_PORT} + depends_on: + - llamacpp + - embedding_eng \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 72c67d6..d87728b 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -37,8 +37,6 @@ services: build: dockerfile: Dockerfile context: . - volumes: - - .:/app expose: - ${APP_PORT} ports: