feat: Hotfix for Nitro loading on CPU with hyper-threading support #931

Merged · 6 commits · Dec 12, 2023
Changes from 5 commits
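In substance, the change probes the host for its physical CPU core count (excluding hyper-threaded logical cores) via the `systeminformation` package and passes that number to Nitro as `cpu_threads` when a model is loaded. A minimal standalone sketch of the idea, assuming Node 18+ (global `fetch`) and the `systeminformation` package; the endpoint URL and payload shape below are illustrative placeholders, not the extension's exact wiring:

```ts
// Sketch only: derive cpu_threads from physical cores and send it with the load request.
const si = require("systeminformation");

async function loadWithPhysicalCores(modelPath: string): Promise<void> {
  const cpu = await si.cpu(); // `physicalCores` excludes hyper-threaded logical cores
  const settings = {
    llama_model_path: modelPath,
    cpu_threads: cpu.physicalCores, // avoid oversubscribing hyper-threaded CPUs
  };
  // Placeholder endpoint for illustration; the extension posts to its own Nitro URL.
  await fetch("http://127.0.0.1:3928/inferences/llamacpp/loadmodel", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(settings),
  });
}
```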
1 change: 1 addition & 0 deletions core/src/types/index.ts
@@ -275,6 +275,7 @@ export type ModelSettingParams = {
ngl?: number;
embedding?: boolean;
n_parallel?: number;
cpu_threads?: number;
system_prompt?: string;
user_prompt?: string;
ai_prompt?: string;
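For illustration, a model definition could now pin the thread count explicitly through this field. A hypothetical settings value using only fields shown in the type above; the import path is an assumption:

```ts
import { ModelSettingParams } from "@janhq/core"; // assumed import path

const settings: ModelSettingParams = {
  ngl: 100,
  n_parallel: 1,
  cpu_threads: 4, // e.g. a CPU with 4 physical cores and 8 logical threads
};
```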
4 changes: 3 additions & 1 deletion extensions/inference-nitro-extension/package.json
@@ -36,6 +36,7 @@
"kill-port": "^2.0.1",
"path-browserify": "^1.0.1",
"rxjs": "^7.8.1",
"systeminformation": "^5.21.20",
"tcp-port-used": "^1.0.2",
"ts-loader": "^9.5.0",
"ulid": "^2.3.0"
@@ -52,6 +53,7 @@
"tcp-port-used",
"kill-port",
"fetch-retry",
"electron-log"
"electron-log",
"systeminformation"
]
}
6 changes: 6 additions & 0 deletions extensions/inference-nitro-extension/src/@types/global.d.ts
@@ -12,6 +12,7 @@ declare const INFERENCE_URL: string;
interface EngineSettings {
ctx_len: number;
ngl: number;
cpu_threads: number;
cont_batching: boolean;
embedding: boolean;
}
@@ -24,3 +25,8 @@ interface ModelOperationResponse {
error?: any;
modelFile?: string;
}

interface ResourcesInfo {
numCpuPhysicalCore: number;
memAvailable: number;
}
2 changes: 1 addition & 1 deletion extensions/inference-nitro-extension/src/index.ts
@@ -12,7 +12,6 @@ import {
EventName,
MessageRequest,
MessageStatus,
ModelSettingParams,
ExtensionType,
ThreadContent,
ThreadMessage,
@@ -41,6 +40,7 @@ export default class JanInferenceNitroExtension implements InferenceExtension {
private static _engineSettings: EngineSettings = {
ctx_len: 2048,
ngl: 100,
cpu_threads: 1,
cont_batching: false,
embedding: false,
};
51 changes: 38 additions & 13 deletions extensions/inference-nitro-extension/src/module.ts
@@ -4,6 +4,7 @@ const path = require("path");
const { spawn } = require("child_process");
const tcpPortUsed = require("tcp-port-used");
const fetchRetry = require("fetch-retry")(global.fetch);
const si = require("systeminformation");

const log = require("electron-log");

@@ -38,23 +39,29 @@ function stopModel(): Promise<ModelOperationResponse> {
* TODO: Should pass the absolute path of the model file instead of just the name, so we can modularize module.ts into an npm package
* TODO: Should it be startModel instead?
*/
async function initModel(wrapper: any): Promise<ModelOperationResponse> {
currentModelFile = wrapper.modelFullPath;
if (wrapper.model.engine !== "nitro") {
return Promise.resolve({ error: "Not a nitro model" });
} else {
log.info("Started to load model " + wrapper.model.modelFullPath);
// Gather system information for CPU physical cores and memory
const nitroResourceProbe = await getResourcesInfo();
console.log(
"Nitro with physical core: " + nitroResourceProbe.numCpuPhysicalCore
);
const settings = {
llama_model_path: currentModelFile,
...wrapper.model.settings,
// This is critical and requires real system information
cpu_threads: nitroResourceProbe.numCpuPhysicalCore,
};
log.info(`Load model settings: ${JSON.stringify(settings, null, 2)}`);
return (
// 1. Check if the port is used, if used, attempt to unload model / kill nitro process
validateModelVersion()
.then(checkAndUnloadNitro)
// 2. Spawn the Nitro subprocess
.then(await spawnNitroProcess(nitroResourceProbe))
// 4. Load the model into the Nitro subprocess (HTTP POST request)
.then(() => loadLLMModel(settings))
// 5. Check if the model is loaded successfully
@@ -166,8 +173,8 @@ async function checkAndUnloadNitro() {
* Using child-process to spawn the process
* Should run exactly platform specified Nitro binary version
*/
async function spawnNitroProcess(nitroResourceProbe: any): Promise<any> {
return new Promise(async (resolve, reject) => {
let binaryFolder = path.join(__dirname, "bin"); // Current directory by default
let binaryName;

@@ -191,9 +198,13 @@ async function spawnNitroProcess(): Promise<void> {
const binaryPath = path.join(binaryFolder, binaryName);

// Execute the binary
subprocess = spawn(
binaryPath,
[nitroResourceProbe.numCpuPhysicalCore, "127.0.0.1", PORT],
{
cwd: binaryFolder,
}
);

// Handle subprocess output
subprocess.stdout.on("data", (data) => {
@@ -211,7 +222,7 @@ async function spawnNitroProcess(): Promise<void> {
reject(`Nitro process exited. ${code ?? ""}`);
});
tcpPortUsed.waitUntilUsed(PORT, 300, 30000).then(() => {
resolve(nitroResourceProbe);
});
});
}
@@ -263,17 +274,31 @@ function validateModelVersion(): Promise<void> {
});
}

/**
* Cleans up any registered resources.
* It is a module-specific function and should be called when the application is closed
*/

function dispose() {
// clean other registered resources here
killSubprocess();
}

/**
* Get the system resources information
*/
async function getResourcesInfo(): Promise<ResourcesInfo> {
return new Promise(async (resolve) => {
const cpu = await si.cpu();
const mem = await si.mem();

const response = {
numCpuPhysicalCore: cpu.physicalCores,
memAvailable: mem.available,
};
resolve(response);
});
}

module.exports = {
initModel,
stopModel,
killSubprocess,
dispose,
};
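As a side note on the design, `si.cpu()` and `si.mem()` already return promises, so the `new Promise` wrapper in `getResourcesInfo` is not strictly needed. An equivalent, slightly leaner sketch using the same `systeminformation` calls and the same return shape:

```ts
const si = require("systeminformation");

// Mirrors the ResourcesInfo interface declared in global.d.ts.
interface ResourcesInfo {
  numCpuPhysicalCore: number;
  memAvailable: number;
}

// Equivalent sketch without the extra Promise wrapper.
async function getResourcesInfo(): Promise<ResourcesInfo> {
  const cpu = await si.cpu(); // exposes `physicalCores`
  const mem = await si.mem(); // exposes `available` (bytes)
  return {
    numCpuPhysicalCore: cpu.physicalCores,
    memAvailable: mem.available,
  };
}
```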
1 change: 0 additions & 1 deletion extensions/inference-openai-extension/src/index.ts
@@ -12,7 +12,6 @@ import {
EventName,
MessageRequest,
MessageStatus,
ModelSettingParams,
ExtensionType,
ThreadContent,
ThreadMessage,