From 03911cc47986f40afaed9034b55d42b72e58006c Mon Sep 17 00:00:00 2001 From: Michael Hart Date: Fri, 29 Sep 2023 01:43:01 -0400 Subject: [PATCH] feat: Add Cloudflare Vectorize and Workers AI embeddings (#2740) * Add Cloudflare Vectorize and Workers AI embeddings * Add id options * Make params mandatory in CloudflareWorkersAIEmbeddings constructor * Add Vectorize/WorkersAI example to CF test exports * Small interface tweaks, add docs * Revert optional dep listing * Fix typo * Fix typo * Update docs, fix CI * Fix build --------- Co-authored-by: jacoblee93 --- .../integrations/cloudflare_ai.mdx | 43 ++++ .../integrations/cloudflare_vectorize.mdx | 57 +++++ .../integrations/vercel_postgres.mdx | 6 +- .../test-exports-cf/src/index.ts | 19 +- .../test-exports-cf/wrangler.toml | 9 +- .../cloudflare_vectorize/example.ts | 56 +++++ langchain/.gitignore | 6 + langchain/package.json | 23 +- langchain/scripts/create-entrypoints.js | 4 + .../src/embeddings/cloudflare_workersai.ts | 94 ++++++++ langchain/src/load/import_constants.ts | 2 + langchain/src/load/import_type.d.ts | 6 + .../src/vectorstores/cloudflare_vectorize.ts | 227 ++++++++++++++++++ langchain/tsconfig.json | 2 + yarn.lock | 219 ++++++++++++++++- 15 files changed, 744 insertions(+), 29 deletions(-) create mode 100644 docs/extras/modules/data_connection/text_embedding/integrations/cloudflare_ai.mdx create mode 100644 docs/extras/modules/data_connection/vectorstores/integrations/cloudflare_vectorize.mdx create mode 100644 examples/src/indexes/vector_stores/cloudflare_vectorize/example.ts create mode 100644 langchain/src/embeddings/cloudflare_workersai.ts create mode 100644 langchain/src/vectorstores/cloudflare_vectorize.ts diff --git a/docs/extras/modules/data_connection/text_embedding/integrations/cloudflare_ai.mdx b/docs/extras/modules/data_connection/text_embedding/integrations/cloudflare_ai.mdx new file mode 100644 index 000000000000..060b1fd412c9 --- /dev/null +++ 
b/docs/extras/modules/data_connection/text_embedding/integrations/cloudflare_ai.mdx @@ -0,0 +1,43 @@ +--- +hide_table_of_contents: true +--- + +# Cloudflare Workers AI + +If you're deploying your project in a Cloudflare worker, you can use Cloudflare's [built-in Workers AI embeddings](https://developers.cloudflare.com/workers-ai/) with LangChain.js. + +## Setup + +First, [follow the official docs](https://developers.cloudflare.com/workers-ai/get-started/workers-wrangler/) to set up your worker. + +You'll also need to install the official Cloudflare AI SDK: + +```bash npm2yarn +npm install @cloudflare/ai +``` + +## Usage + +Below is an example worker that uses Workers AI embeddings with a [Cloudflare Vectorize](/docs/modules/data_connection/vectorstores/integrations/cloudflare_vectorize) vectorstore. + +:::note +If running locally, be sure to run wrangler as `npx wrangler dev --remote`! +::: + +```toml +name = "langchain-test" +main = "worker.js" +compatibility_date = "2023-09-22" + +[[vectorize]] +binding = "VECTORIZE_INDEX" +index_name = "langchain-test" + +[ai] +binding = "AI" +``` + +import CodeBlock from "@theme/CodeBlock"; +import Example from "@examples/indexes/vector_stores/cloudflare_vectorize/example.ts"; + +<CodeBlock language="typescript">{Example}</CodeBlock> diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/cloudflare_vectorize.mdx b/docs/extras/modules/data_connection/vectorstores/integrations/cloudflare_vectorize.mdx new file mode 100644 index 000000000000..6b5953d71095 --- /dev/null +++ b/docs/extras/modules/data_connection/vectorstores/integrations/cloudflare_vectorize.mdx @@ -0,0 +1,57 @@ +--- +hide_table_of_contents: true +--- + +# Cloudflare Vectorize + +If you're deploying your project in a Cloudflare worker, you can use [Cloudflare Vectorize](https://developers.cloudflare.com/vectorize/) with LangChain.js. +It's a powerful and convenient option that's built directly into Cloudflare. 
+ +## Setup + +:::tip Compatibility +Cloudflare Vectorize is currently in open beta, and requires a Cloudflare account on a paid plan to use. +::: + +After [setting up your project](https://developers.cloudflare.com/vectorize/get-started/intro/#prerequisites), +create an index by running the following Wrangler command: + +```bash +$ npx wrangler vectorize create <index_name> --preset @cf/baai/bge-small-en-v1.5 +``` + +You can see a full list of options for the `vectorize` command [in the official documentation](https://developers.cloudflare.com/workers/wrangler/commands/#vectorize). + +You'll then need to update your `wrangler.toml` file to include an entry for `[[vectorize]]`: + +```toml +[[vectorize]] +binding = "VECTORIZE_INDEX" +index_name = "<index_name>" +``` + +## Usage + +Below is an example worker that adds documents to a vectorstore, queries it, or clears it depending on the path used. It also uses [Cloudflare Workers AI Embeddings](/docs/modules/data_connection/text_embedding/integrations/cloudflare_ai). + +:::note +If running locally, be sure to run wrangler as `npx wrangler dev --remote`! 
+::: + +```toml +name = "langchain-test" +main = "worker.js" +compatibility_date = "2023-09-22" + +[[vectorize]] +binding = "VECTORIZE_INDEX" +index_name = "langchain-test" + +[ai] +binding = "AI" +``` + +import CodeBlock from "@theme/CodeBlock"; +import Example from "@examples/indexes/vector_stores/cloudflare_vectorize/example.ts"; + +<CodeBlock language="typescript">{Example}</CodeBlock> diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/vercel_postgres.mdx b/docs/extras/modules/data_connection/vectorstores/integrations/vercel_postgres.mdx index b040142438fe..7e8166e7cd12 100644 --- a/docs/extras/modules/data_connection/vectorstores/integrations/vercel_postgres.mdx +++ b/docs/extras/modules/data_connection/vectorstores/integrations/vercel_postgres.mdx @@ -1,6 +1,6 @@ # Vercel Postgres -LangChain.js supports using the [`@vercel/postgres`](https://www.npmjs.com/package/@vercel/postgres) package to use to generic Postgres databases +LangChain.js supports using the [`@vercel/postgres`](https://www.npmjs.com/package/@vercel/postgres) package to use generic Postgres databases as vector stores, provided they support the [`pgvector`](https://github.com/pgvector/pgvector) Postgres extension. This integration is particularly useful from web environments like Edge functions. @@ -13,7 +13,7 @@ To work with Vercel Postgres, you need to install the `@vercel/postgres` package npm install @vercel/postgres ``` -This integration automatically connects using the connection string set under `process.env.POSTGRES_URL`. +This integration automatically connects using the connection string set under `process.env.POSTGRES_URL`. You can also pass a connection string manually like this: ```typescript @@ -29,7 +29,7 @@ const vectorstore = await VercelPostgres.initialize( ### Connecting to Vercel Postgres -A simple way to get started is to create a serverless [Vercel Postgres instance](https://vercel.com/docs/storage/vercel-postgres/quickstart). 
+A simple way to get started is to create a serverless [Vercel Postgres instance](https://vercel.com/docs/storage/vercel-postgres/quickstart). If you're deploying to a Vercel project with an associated Vercel Postgres instance, the required `POSTGRES_URL` environment variable will already be populated in hosted environments. diff --git a/environment_tests/test-exports-cf/src/index.ts b/environment_tests/test-exports-cf/src/index.ts index 906974a6cdb9..81e9fec2c3bf 100644 --- a/environment_tests/test-exports-cf/src/index.ts +++ b/environment_tests/test-exports-cf/src/index.ts @@ -36,17 +36,16 @@ export default { env: Env, ctx: ExecutionContext ): Promise { - - const constructorParameters - = env.AZURE_OPENAI_API_KEY ? { - azureOpenAIApiKey: env.AZURE_OPENAI_API_KEY, - azureOpenAIApiInstanceName: env.AZURE_OPENAI_API_INSTANCE_NAME, - azureOpenAIApiDeploymentName: env.AZURE_OPENAI_API_DEPLOYMENT_NAME, - azureOpenAIApiVersion: env.AZURE_OPENAI_API_VERSION, - } + const constructorParameters = env.AZURE_OPENAI_API_KEY + ? 
{ + azureOpenAIApiKey: env.AZURE_OPENAI_API_KEY, + azureOpenAIApiInstanceName: env.AZURE_OPENAI_API_INSTANCE_NAME, + azureOpenAIApiDeploymentName: env.AZURE_OPENAI_API_DEPLOYMENT_NAME, + azureOpenAIApiVersion: env.AZURE_OPENAI_API_VERSION, + } : { - openAIApiKey: env.OPENAI_API_KEY, - } + openAIApiKey: env.OPENAI_API_KEY, + }; // Intantiate a few things to test the exports new OpenAI(constructorParameters); diff --git a/environment_tests/test-exports-cf/wrangler.toml b/environment_tests/test-exports-cf/wrangler.toml index f1caece42e64..a30700992e7f 100644 --- a/environment_tests/test-exports-cf/wrangler.toml +++ b/environment_tests/test-exports-cf/wrangler.toml @@ -1,3 +1,10 @@ name = "test-exports-cf" main = "src/index.ts" -compatibility_date = "2023-04-05" +compatibility_date = "2023-09-22" + +[[vectorize]] +binding = "VECTORIZE_INDEX" +index_name = "langchain-test" + +[ai] +binding = "AI" diff --git a/examples/src/indexes/vector_stores/cloudflare_vectorize/example.ts b/examples/src/indexes/vector_stores/cloudflare_vectorize/example.ts new file mode 100644 index 000000000000..009744df3db6 --- /dev/null +++ b/examples/src/indexes/vector_stores/cloudflare_vectorize/example.ts @@ -0,0 +1,56 @@ +import type { + VectorizeIndex, + Fetcher, + Request, +} from "@cloudflare/workers-types"; + +import { CloudflareVectorizeStore } from "langchain/vectorstores/cloudflare_vectorize"; +import { CloudflareWorkersAIEmbeddings } from "langchain/embeddings/cloudflare_workersai"; + +export interface Env { + VECTORIZE_INDEX: VectorizeIndex; + AI: Fetcher; +} + +export default { + async fetch(request: Request, env: Env) { + const { pathname } = new URL(request.url); + const embeddings = new CloudflareWorkersAIEmbeddings({ + binding: env.AI, + modelName: "@cf/baai/bge-small-en-v1.5", + }); + const store = new CloudflareVectorizeStore(embeddings, { + index: env.VECTORIZE_INDEX, + }); + if (pathname === "/") { + const results = await store.similaritySearch("hello", 5); + return 
Response.json(results); + } else if (pathname === "/load") { + // Upsertion by id is supported + await store.addDocuments( + [ + { + pageContent: "hello", + metadata: {}, + }, + { + pageContent: "world", + metadata: {}, + }, + { + pageContent: "hi", + metadata: {}, + }, + ], + { ids: ["id1", "id2", "id3"] } + ); + + return Response.json({ success: true }); + } else if (pathname === "/clear") { + await store.delete({ ids: ["id1", "id2", "id3"] }); + return Response.json({ success: true }); + } + + return Response.json({ error: "Not Found" }, { status: 404 }); + }, +}; diff --git a/langchain/.gitignore b/langchain/.gitignore index 01e4fb1b533b..d3c53bdd107c 100644 --- a/langchain/.gitignore +++ b/langchain/.gitignore @@ -64,6 +64,9 @@ embeddings/base.d.ts embeddings/cache_backed.cjs embeddings/cache_backed.js embeddings/cache_backed.d.ts +embeddings/cloudflare_workersai.cjs +embeddings/cloudflare_workersai.js +embeddings/cloudflare_workersai.d.ts embeddings/fake.cjs embeddings/fake.js embeddings/fake.d.ts @@ -166,6 +169,9 @@ vectorstores/elasticsearch.d.ts vectorstores/memory.cjs vectorstores/memory.js vectorstores/memory.d.ts +vectorstores/cloudflare_vectorize.cjs +vectorstores/cloudflare_vectorize.js +vectorstores/cloudflare_vectorize.d.ts vectorstores/chroma.cjs vectorstores/chroma.js vectorstores/chroma.d.ts diff --git a/langchain/package.json b/langchain/package.json index 5ec4d0542582..19f911a33505 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -76,6 +76,9 @@ "embeddings/cache_backed.cjs", "embeddings/cache_backed.js", "embeddings/cache_backed.d.ts", + "embeddings/cloudflare_workersai.cjs", + "embeddings/cloudflare_workersai.js", + "embeddings/cloudflare_workersai.d.ts", "embeddings/fake.cjs", "embeddings/fake.js", "embeddings/fake.d.ts", @@ -178,6 +181,9 @@ "vectorstores/memory.cjs", "vectorstores/memory.js", "vectorstores/memory.d.ts", + "vectorstores/cloudflare_vectorize.cjs", + "vectorstores/cloudflare_vectorize.js", + 
"vectorstores/cloudflare_vectorize.d.ts", "vectorstores/chroma.cjs", "vectorstores/chroma.js", "vectorstores/chroma.d.ts", @@ -668,7 +674,8 @@ "@aws-sdk/types": "^3.357.0", "@azure/storage-blob": "^12.15.0", "@clickhouse/client": "^0.0.14", - "@cloudflare/workers-types": "^4.20230904.0", + "@cloudflare/ai": "^1.0.12", + "@cloudflare/workers-types": "^4.20230922.0", "@elastic/elasticsearch": "^8.4.0", "@faker-js/faker": "^7.6.0", "@getmetal/metal-sdk": "^4.0.0", @@ -789,7 +796,7 @@ "@aws-sdk/credential-provider-node": "^3.388.0", "@azure/storage-blob": "^12.15.0", "@clickhouse/client": "^0.0.14", - "@cloudflare/workers-types": "^4.20230904.0", + "@cloudflare/ai": "^1.0.12", "@elastic/elasticsearch": "^8.4.0", "@getmetal/metal-sdk": "*", "@getzep/zep-js": "^0.7.0", @@ -893,7 +900,7 @@ "@clickhouse/client": { "optional": true }, - "@cloudflare/workers-types": { + "@cloudflare/ai": { "optional": true }, "@elastic/elasticsearch": { @@ -1262,6 +1269,11 @@ "import": "./embeddings/cache_backed.js", "require": "./embeddings/cache_backed.cjs" }, + "./embeddings/cloudflare_workersai": { + "types": "./embeddings/cloudflare_workersai.d.ts", + "import": "./embeddings/cloudflare_workersai.js", + "require": "./embeddings/cloudflare_workersai.cjs" + }, "./embeddings/fake": { "types": "./embeddings/fake.d.ts", "import": "./embeddings/fake.js", @@ -1432,6 +1444,11 @@ "import": "./vectorstores/memory.js", "require": "./vectorstores/memory.cjs" }, + "./vectorstores/cloudflare_vectorize": { + "types": "./vectorstores/cloudflare_vectorize.d.ts", + "import": "./vectorstores/cloudflare_vectorize.js", + "require": "./vectorstores/cloudflare_vectorize.cjs" + }, "./vectorstores/chroma": { "types": "./vectorstores/chroma.d.ts", "import": "./vectorstores/chroma.js", diff --git a/langchain/scripts/create-entrypoints.js b/langchain/scripts/create-entrypoints.js index e66dee376395..bb0251045332 100644 --- a/langchain/scripts/create-entrypoints.js +++ b/langchain/scripts/create-entrypoints.js @@ 
-35,6 +35,7 @@ const entrypoints = { // embeddings "embeddings/base": "embeddings/base", "embeddings/cache_backed": "embeddings/cache_backed", + "embeddings/cloudflare_workersai": "embeddings/cloudflare_workersai", "embeddings/fake": "embeddings/fake", "embeddings/ollama": "embeddings/ollama", "embeddings/openai": "embeddings/openai", @@ -72,6 +73,7 @@ const entrypoints = { "vectorstores/base": "vectorstores/base", "vectorstores/elasticsearch": "vectorstores/elasticsearch", "vectorstores/memory": "vectorstores/memory", + "vectorstores/cloudflare_vectorize": "vectorstores/cloudflare_vectorize", "vectorstores/chroma": "vectorstores/chroma", "vectorstores/googlevertexai": "vectorstores/googlevertexai", "vectorstores/hnswlib": "vectorstores/hnswlib", @@ -280,6 +282,7 @@ const requiresOptionalDependency = [ "callbacks/handlers/llmonitor", "chains/load", "chains/sql_db", + "embeddings/cloudflare_workersai", "embeddings/cohere", "embeddings/googlevertexai", "embeddings/googlepalm", @@ -301,6 +304,7 @@ const requiresOptionalDependency = [ "prompts/load", "vectorstores/analyticdb", "vectorstores/chroma", + "vectorstores/cloudflare_vectorize", "vectorstores/googlevertexai", "vectorstores/elasticsearch", "vectorstores/hnswlib", diff --git a/langchain/src/embeddings/cloudflare_workersai.ts b/langchain/src/embeddings/cloudflare_workersai.ts new file mode 100644 index 000000000000..191213dfbf5f --- /dev/null +++ b/langchain/src/embeddings/cloudflare_workersai.ts @@ -0,0 +1,94 @@ +import { Ai } from "@cloudflare/ai"; +import { Fetcher } from "@cloudflare/workers-types"; +import { chunkArray } from "../util/chunk.js"; +import { Embeddings, EmbeddingsParams } from "./base.js"; + +type AiTextEmbeddingsInput = { + text: string | string[]; +}; + +type AiTextEmbeddingsOutput = { + shape: number[]; + data: number[][]; +}; + +export interface CloudflareWorkersAIEmbeddingsParams extends EmbeddingsParams { + /** Binding */ + binding: Fetcher; + + /** Model name to use */ + modelName?: 
string; + + /** + * The maximum number of documents to embed in a single request. + */ + batchSize?: number; + + /** + * Whether to strip new lines from the input text. This is recommended by + * OpenAI, but may not be suitable for all use cases. + */ + stripNewLines?: boolean; +} + +export class CloudflareWorkersAIEmbeddings extends Embeddings { + modelName = "@cf/baai/bge-base-en-v1.5"; + + batchSize = 50; + + stripNewLines = true; + + ai: Ai; + + constructor(fields: CloudflareWorkersAIEmbeddingsParams) { + super(fields); + + if (!fields.binding) { + throw new Error( + "Must supply a Workers AI binding, eg { binding: env.AI }" + ); + } + this.ai = new Ai(fields.binding); + this.modelName = fields.modelName ?? this.modelName; + this.stripNewLines = fields.stripNewLines ?? this.stripNewLines; + } + + async embedDocuments(texts: string[]): Promise { + const batches = chunkArray( + this.stripNewLines ? texts.map((t) => t.replace(/\n/g, " ")) : texts, + this.batchSize + ); + + const batchRequests = batches.map((batch) => this.runEmbedding(batch)); + const batchResponses = await Promise.all(batchRequests); + const embeddings: number[][] = []; + + for (let i = 0; i < batchResponses.length; i += 1) { + const batchResponse = batchResponses[i]; + for (let j = 0; j < batchResponse.length; j += 1) { + embeddings.push(batchResponse[j]); + } + } + + return embeddings; + } + + async embedQuery(text: string): Promise { + const data = await this.runEmbedding([ + this.stripNewLines ? 
text.replace(/\n/g, " ") : text, + ]); + return data[0]; + } + + private async runEmbedding(texts: string[]) { + return this.caller.call(async () => { + const response: AiTextEmbeddingsOutput = await this.ai.run( + this.modelName, + { + text: texts, + } as AiTextEmbeddingsInput + ); + return response.data; + }); + } +} diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts index 82cb0a4fab50..1139b3ccb149 100644 --- a/langchain/src/load/import_constants.ts +++ b/langchain/src/load/import_constants.ts @@ -13,6 +13,7 @@ export const optionalImportEntrypoints = [ "langchain/chains/query_constructor", "langchain/chains/query_constructor/ir", "langchain/chains/sql_db", + "langchain/embeddings/cloudflare_workersai", "langchain/embeddings/cohere", "langchain/embeddings/tensorflow", "langchain/embeddings/hf", @@ -34,6 +35,7 @@ export const optionalImportEntrypoints = [ "langchain/prompts/load", "langchain/vectorstores/analyticdb", "langchain/vectorstores/elasticsearch", + "langchain/vectorstores/cloudflare_vectorize", "langchain/vectorstores/chroma", "langchain/vectorstores/googlevertexai", "langchain/vectorstores/hnswlib", diff --git a/langchain/src/load/import_type.d.ts b/langchain/src/load/import_type.d.ts index 3d9b50cd9777..c7d4747ada31 100644 --- a/langchain/src/load/import_type.d.ts +++ b/langchain/src/load/import_type.d.ts @@ -37,6 +37,9 @@ export interface OptionalImportMap { "langchain/chains/sql_db"?: | typeof import("../chains/sql_db/index.js") | Promise; + "langchain/embeddings/cloudflare_workersai"?: + | typeof import("../embeddings/cloudflare_workersai.js") + | Promise; "langchain/embeddings/cohere"?: | typeof import("../embeddings/cohere.js") | Promise; @@ -100,6 +103,9 @@ export interface OptionalImportMap { "langchain/vectorstores/elasticsearch"?: | typeof import("../vectorstores/elasticsearch.js") | Promise; + "langchain/vectorstores/cloudflare_vectorize"?: + | typeof import("../vectorstores/cloudflare_vectorize.js") 
+ | Promise; "langchain/vectorstores/chroma"?: | typeof import("../vectorstores/chroma.js") | Promise; diff --git a/langchain/src/vectorstores/cloudflare_vectorize.ts b/langchain/src/vectorstores/cloudflare_vectorize.ts new file mode 100644 index 000000000000..8a5babf49b1f --- /dev/null +++ b/langchain/src/vectorstores/cloudflare_vectorize.ts @@ -0,0 +1,227 @@ +import * as uuid from "uuid"; + +import { + VectorizeIndex, + VectorizeVectorMetadata, +} from "@cloudflare/workers-types"; +import { VectorStore } from "./base.js"; +import { Embeddings } from "../embeddings/base.js"; +import { Document } from "../document.js"; +import { chunkArray } from "../util/chunk.js"; +import { AsyncCaller, type AsyncCallerParams } from "../util/async_caller.js"; + +export interface VectorizeLibArgs extends AsyncCallerParams { + index: VectorizeIndex; + textKey?: string; +} + +/** + * Type that defines the parameters for the delete operation in the + * CloudflareVectorizeStore class. It includes ids, deleteAll flag, and namespace. + */ +export type VectorizeDeleteParams = { + ids: string[]; +}; + +/** + * Class that extends the VectorStore class and provides methods to + * interact with the Cloudflare Vectorize vector database. + */ +export class CloudflareVectorizeStore extends VectorStore { + textKey: string; + + namespace?: string; + + index: VectorizeIndex; + + caller: AsyncCaller; + + _vectorstoreType(): string { + return "cloudflare_vectorize"; + } + + constructor(embeddings: Embeddings, args: VectorizeLibArgs) { + super(embeddings, args); + + this.embeddings = embeddings; + const { index, textKey, ...asyncCallerArgs } = args; + if (!index) { + throw new Error( + "Must supply a Vectorize index binding, eg { index: env.VECTORIZE }" + ); + } + this.index = index; + this.textKey = textKey ?? "text"; + this.caller = new AsyncCaller({ + maxConcurrency: 6, + maxRetries: 0, + ...asyncCallerArgs, + }); + } + + /** + * Method that adds documents to the Vectorize database. 
+ * @param documents Array of documents to add. + * @param options Optional ids for the documents. + * @returns Promise that resolves with the ids of the added documents. + */ + async addDocuments( + documents: Document[], + options?: { ids?: string[] } | string[] + ) { + const texts = documents.map(({ pageContent }) => pageContent); + return this.addVectors( + await this.embeddings.embedDocuments(texts), + documents, + options + ); + } + + /** + * Method that adds vectors to the Vectorize database. + * @param vectors Array of vectors to add. + * @param documents Array of documents associated with the vectors. + * @param options Optional ids for the vectors. + * @returns Promise that resolves with the ids of the added vectors. + */ + async addVectors( + vectors: number[][], + documents: Document[], + options?: { ids?: string[] } | string[] + ) { + const ids = Array.isArray(options) ? options : options?.ids; + const documentIds = ids == null ? documents.map(() => uuid.v4()) : ids; + const vectorizeVectors = vectors.map((values, idx) => { + const metadata: Record = { + ...documents[idx].metadata, + [this.textKey]: documents[idx].pageContent, + }; + return { + id: documentIds[idx], + metadata, + values, + }; + }); + + // Stick to a limit of 500 vectors per upsert request + const chunkSize = 500; + const chunkedVectors = chunkArray(vectorizeVectors, chunkSize); + const batchRequests = chunkedVectors.map((chunk) => + this.caller.call(async () => this.index.upsert(chunk)) + ); + + await Promise.all(batchRequests); + + return documentIds; + } + + /** + * Method that deletes vectors from the Vectorize database. + * @param params Parameters for the delete operation. + * @returns Promise that resolves when the delete operation is complete. 
+ */ + async delete(params: VectorizeDeleteParams): Promise { + const batchSize = 1000; + const batchedIds = chunkArray(params.ids, batchSize); + const batchRequests = batchedIds.map((batchIds) => + this.caller.call(async () => this.index.deleteByIds(batchIds)) + ); + await Promise.all(batchRequests); + } + + /** + * Method that performs a similarity search in the Vectorize database and + * returns the results along with their scores. + * @param query Query vector for the similarity search. + * @param k Number of top results to return. + * @returns Promise that resolves with an array of documents and their scores. + */ + async similaritySearchVectorWithScore( + query: number[], + k: number + ): Promise<[Document, number][]> { + const results = await this.index.query(query, { + returnVectors: true, + topK: k, + }); + + const result: [Document, number][] = []; + + if (results.matches) { + for (const res of results.matches) { + const { [this.textKey]: pageContent, ...metadata } = + res.vector?.metadata ?? {}; + result.push([ + new Document({ metadata, pageContent: pageContent as string }), + res.score, + ]); + } + } + + return result; + } + + /** + * Static method that creates a new instance of the CloudflareVectorizeStore class + * from texts. + * @param texts Array of texts to add to the Vectorize database. + * @param metadatas Metadata associated with the texts. + * @param embeddings Embeddings to use for the texts. + * @param dbConfig Configuration for the Vectorize database. + * @param options Optional ids for the vectors. + * @returns Promise that resolves with a new instance of the CloudflareVectorizeStore class. + */ + static async fromTexts( + texts: string[], + metadatas: + | Record[] + | Record, + embeddings: Embeddings, + dbConfig: VectorizeLibArgs + ): Promise { + const docs: Document[] = []; + for (let i = 0; i < texts.length; i += 1) { + const metadata = Array.isArray(metadatas) ? 
metadatas[i] : metadatas; + const newDoc = new Document({ + pageContent: texts[i], + metadata, + }); + docs.push(newDoc); + } + return CloudflareVectorizeStore.fromDocuments(docs, embeddings, dbConfig); + } + + /** + * Static method that creates a new instance of the CloudflareVectorizeStore class + * from documents. + * @param docs Array of documents to add to the Vectorize database. + * @param embeddings Embeddings to use for the documents. + * @param dbConfig Configuration for the Vectorize database. + * @param options Optional ids for the vectors. + * @returns Promise that resolves with a new instance of the CloudflareVectorizeStore class. + */ + static async fromDocuments( + docs: Document[], + embeddings: Embeddings, + dbConfig: VectorizeLibArgs + ): Promise { + const instance = new this(embeddings, dbConfig); + await instance.addDocuments(docs); + return instance; + } + + /** + * Static method that creates a new instance of the CloudflareVectorizeStore class + * from an existing index. + * @param embeddings Embeddings to use for the documents. + * @param dbConfig Configuration for the Vectorize database. + * @returns Promise that resolves with a new instance of the CloudflareVectorizeStore class. 
+ */ + static async fromExistingIndex( + embeddings: Embeddings, + dbConfig: VectorizeLibArgs + ): Promise { + const instance = new this(embeddings, dbConfig); + return instance; + } +} diff --git a/langchain/tsconfig.json b/langchain/tsconfig.json index 0577a55113b1..311636a62339 100644 --- a/langchain/tsconfig.json +++ b/langchain/tsconfig.json @@ -54,6 +54,7 @@ "src/chains/sql_db/index.ts", "src/embeddings/base.ts", "src/embeddings/cache_backed.ts", + "src/embeddings/cloudflare_workersai.ts", "src/embeddings/fake.ts", "src/embeddings/ollama.ts", "src/embeddings/openai.ts", @@ -88,6 +89,7 @@ "src/vectorstores/base.ts", "src/vectorstores/elasticsearch.ts", "src/vectorstores/memory.ts", + "src/vectorstores/cloudflare_vectorize.ts", "src/vectorstores/chroma.ts", "src/vectorstores/googlevertexai.ts", "src/vectorstores/hnswlib.ts", diff --git a/yarn.lock b/yarn.lock index 9686f205f884..fbd1d927646d 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3578,6 +3578,18 @@ __metadata: languageName: node linkType: hard +"@bcherny/json-schema-ref-parser@npm:10.0.5-fork": + version: 10.0.5-fork + resolution: "@bcherny/json-schema-ref-parser@npm:10.0.5-fork" + dependencies: + "@jsdevtools/ono": ^7.1.3 + "@types/json-schema": ^7.0.6 + call-me-maybe: ^1.0.1 + js-yaml: ^4.1.0 + checksum: e90eb3655c4e15f54ebc5138baac98471d159e3a253b484416c03c2d43f5c3bc80a4d6fe18acd71f77bf2f95f7fbc36730abb21cbd1f9d80a6af630c554e6d62 + languageName: node + linkType: hard + "@bcoe/v8-coverage@npm:^0.2.3": version: 0.2.3 resolution: "@bcoe/v8-coverage@npm:0.2.3" @@ -3595,10 +3607,19 @@ __metadata: languageName: node linkType: hard -"@cloudflare/workers-types@npm:^4.20230904.0": - version: 4.20230904.0 - resolution: "@cloudflare/workers-types@npm:4.20230904.0" - checksum: 4cfe7dd7419716233dad0937766d7b33ee709b12578d0def7cee8b005e79142f22f8675a40119957448073aff416e8376a768115af420b7f54d11635a6cb38a2 +"@cloudflare/ai@npm:^1.0.12": + version: 1.0.12 + resolution: "@cloudflare/ai@npm:1.0.12" + dependencies: + 
json-schema-to-typescript: ^13.1.1 + checksum: bf196acd46ec8a39973aa74346ec9404a018a6599c9c65037e252bebbd648d936dbff6b4848717f9539bfd024826803a857df88ed138fa7ab697a4fafe41d966 + languageName: node + linkType: hard + +"@cloudflare/workers-types@npm:^4.20230922.0": + version: 4.20230922.0 + resolution: "@cloudflare/workers-types@npm:4.20230922.0" + checksum: 629bab47cdbcb74e3c42fc9486f5186734b6dd734154cea7a0983ad83ee053b4fb1ae13ff618a7287612bc3b3d19ad72d6a34a84289a903623cb8a13af57596b languageName: node linkType: hard @@ -4640,6 +4661,13 @@ __metadata: languageName: node linkType: hard +"@jsdevtools/ono@npm:^7.1.3": + version: 7.1.3 + resolution: "@jsdevtools/ono@npm:7.1.3" + checksum: 2297fcd472ba810bffe8519d2249171132844c7174f3a16634f9260761c8c78bc0428a4190b5b6d72d45673c13918ab9844d706c3ed4ef8f62ab11a2627a08ad + languageName: node + linkType: hard + "@jsdoc/salty@npm:^0.2.1": version: 0.2.5 resolution: "@jsdoc/salty@npm:0.2.5" @@ -6408,6 +6436,16 @@ __metadata: languageName: node linkType: hard +"@types/glob@npm:^7.1.3": + version: 7.2.0 + resolution: "@types/glob@npm:7.2.0" + dependencies: + "@types/minimatch": "*" + "@types/node": "*" + checksum: 6ae717fedfdfdad25f3d5a568323926c64f52ef35897bcac8aca8e19bc50c0bd84630bbd063e5d52078b2137d8e7d3c26eabebd1a2f03ff350fff8a91e79fc19 + languageName: node + linkType: hard + "@types/google-protobuf@npm:3.15.6": version: 3.15.6 resolution: "@types/google-protobuf@npm:3.15.6" @@ -6498,6 +6536,13 @@ __metadata: languageName: node linkType: hard +"@types/json-schema@npm:^7.0.11, @types/json-schema@npm:^7.0.6": + version: 7.0.13 + resolution: "@types/json-schema@npm:7.0.13" + checksum: 345df21a678fa72fb389f35f33de77833d09d4a142bb2bcb27c18690efa4cf70fc2876e43843cefb3fbdb9fcb12cd3e970a90936df30f53bbee899865ff605ab + languageName: node + linkType: hard + "@types/json-schema@npm:^7.0.9": version: 7.0.11 resolution: "@types/json-schema@npm:7.0.11" @@ -6537,6 +6582,13 @@ __metadata: languageName: node linkType: hard 
+"@types/lodash@npm:^4.14.182": + version: 4.14.199 + resolution: "@types/lodash@npm:4.14.199" + checksum: e68d1fcbbfce953ed87b296a628573f62939227bcda0c934954e862b421e8a34c5e71cad6fea27b9980567909e6a4698f09025692958e36d64ea9ed99ec6fb2e + languageName: node + linkType: hard + "@types/long@npm:^4.0.0, @types/long@npm:^4.0.1": version: 4.0.2 resolution: "@types/long@npm:4.0.2" @@ -6568,7 +6620,7 @@ __metadata: languageName: node linkType: hard -"@types/minimatch@npm:^5.1.2": +"@types/minimatch@npm:*, @types/minimatch@npm:^5.1.2": version: 5.1.2 resolution: "@types/minimatch@npm:5.1.2" checksum: 0391a282860c7cb6fe262c12b99564732401bdaa5e395bee9ca323c312c1a0f45efbf34dce974682036e857db59a5c9b1da522f3d6055aeead7097264c8705a8 @@ -6708,6 +6760,13 @@ __metadata: languageName: node linkType: hard +"@types/prettier@npm:^2.6.1": + version: 2.7.3 + resolution: "@types/prettier@npm:2.7.3" + checksum: 705384209cea6d1433ff6c187c80dcc0b95d99d5c5ce21a46a9a58060c527973506822e428789d842761e0280d25e3359300f017fbe77b9755bc772ab3dc2f83 + languageName: node + linkType: hard + "@types/prop-types@npm:*": version: 15.7.5 resolution: "@types/prop-types@npm:15.7.5" @@ -8183,6 +8242,13 @@ __metadata: languageName: node linkType: hard +"call-me-maybe@npm:^1.0.1": + version: 1.0.2 + resolution: "call-me-maybe@npm:1.0.2" + checksum: 42ff2d0bed5b207e3f0122589162eaaa47ba618f79ad2382fe0ba14d9e49fbf901099a6227440acc5946f86a4953e8aa2d242b330b0a5de4d090bb18f8935cae + languageName: node + linkType: hard + "callsites@npm:^3.0.0, callsites@npm:^3.1.0": version: 3.1.0 resolution: "callsites@npm:3.1.0" @@ -8405,6 +8471,19 @@ __metadata: languageName: node linkType: hard +"cli-color@npm:^2.0.2": + version: 2.0.3 + resolution: "cli-color@npm:2.0.3" + dependencies: + d: ^1.0.1 + es5-ext: ^0.10.61 + es6-iterator: ^2.0.3 + memoizee: ^0.4.15 + timers-ext: ^0.1.7 + checksum: b1c5f3d0ec29cbe22be7a01d90bd0cfa080ffed6f1c321ea20ae3f10c6041f0e411e28ee2b98025945bee3548931deed1ae849b53c21b523ba74efef855cd73d + 
languageName: node + linkType: hard + "cli-cursor@npm:^3.1.0": version: 3.1.0 resolution: "cli-cursor@npm:3.1.0" @@ -9610,7 +9689,7 @@ __metadata: languageName: node linkType: hard -"es5-ext@npm:^0.10.35, es5-ext@npm:^0.10.50": +"es5-ext@npm:^0.10.35, es5-ext@npm:^0.10.46, es5-ext@npm:^0.10.50, es5-ext@npm:^0.10.53, es5-ext@npm:^0.10.61, es5-ext@npm:~0.10.14, es5-ext@npm:~0.10.2, es5-ext@npm:~0.10.46": version: 0.10.62 resolution: "es5-ext@npm:0.10.62" dependencies: @@ -9642,6 +9721,18 @@ __metadata: languageName: node linkType: hard +"es6-weak-map@npm:^2.0.3": + version: 2.0.3 + resolution: "es6-weak-map@npm:2.0.3" + dependencies: + d: 1 + es5-ext: ^0.10.46 + es6-iterator: ^2.0.3 + es6-symbol: ^3.1.1 + checksum: 19ca15f46d50948ce78c2da5f21fb5b1ef45addd4fe17b5df952ff1f2a3d6ce4781249bc73b90995257264be2a98b2ec749bb2aba0c14b5776a1154178f9c927 + languageName: node + linkType: hard + "esbuild@npm:~0.17.6": version: 0.17.11 resolution: "esbuild@npm:0.17.11" @@ -10068,6 +10159,16 @@ __metadata: languageName: node linkType: hard +"event-emitter@npm:^0.3.5": + version: 0.3.5 + resolution: "event-emitter@npm:0.3.5" + dependencies: + d: 1 + es5-ext: ~0.10.14 + checksum: 27c1399557d9cd7e0aa0b366c37c38a4c17293e3a10258e8b692a847dd5ba9fb90429c3a5a1eeff96f31f6fa03ccbd31d8ad15e00540b22b22f01557be706030 + languageName: node + linkType: hard + "event-target-shim@npm:^5.0.0": version: 5.0.1 resolution: "event-target-shim@npm:5.0.1" @@ -10922,6 +11023,13 @@ __metadata: languageName: node linkType: hard +"get-stdin@npm:^8.0.0": + version: 8.0.0 + resolution: "get-stdin@npm:8.0.0" + checksum: 40128b6cd25781ddbd233344f1a1e4006d4284906191ed0a7d55ec2c1a3e44d650f280b2c9eeab79c03ac3037da80257476c0e4e5af38ddfb902d6ff06282d77 + languageName: node + linkType: hard + "get-stream@npm:^5.1.0": version: 5.2.0 resolution: "get-stream@npm:5.2.0" @@ -11013,6 +11121,17 @@ __metadata: languageName: node linkType: hard +"glob-promise@npm:^4.2.2": + version: 4.2.2 + resolution: "glob-promise@npm:4.2.2" + 
dependencies: + "@types/glob": ^7.1.3 + peerDependencies: + glob: ^7.1.6 + checksum: c1a3d95f7c8393e4151d4899ec4e42bb2e8237160f840ad1eccbe9247407da8b6c13e28f463022e011708bc40862db87b9b77236d35afa3feb8aa86d518f2dfe + languageName: node + linkType: hard + "glob@npm:^10.2.2": version: 10.3.3 resolution: "glob@npm:10.3.3" @@ -11043,7 +11162,7 @@ __metadata: languageName: node linkType: hard -"glob@npm:^7.0.0, glob@npm:^7.1.3, glob@npm:^7.1.4": +"glob@npm:^7.0.0, glob@npm:^7.1.3, glob@npm:^7.1.4, glob@npm:^7.1.6": version: 7.2.3 resolution: "glob@npm:7.2.3" dependencies: @@ -12059,6 +12178,13 @@ __metadata: languageName: node linkType: hard +"is-promise@npm:^2.2.2": + version: 2.2.2 + resolution: "is-promise@npm:2.2.2" + checksum: 18bf7d1c59953e0ad82a1ed963fb3dc0d135c8f299a14f89a17af312fc918373136e56028e8831700e1933519630cc2fd4179a777030330fde20d34e96f40c78 + languageName: node + linkType: hard + "is-property@npm:^1.0.2": version: 1.0.2 resolution: "is-property@npm:1.0.2" @@ -13063,6 +13189,30 @@ __metadata: languageName: node linkType: hard +"json-schema-to-typescript@npm:^13.1.1": + version: 13.1.1 + resolution: "json-schema-to-typescript@npm:13.1.1" + dependencies: + "@bcherny/json-schema-ref-parser": 10.0.5-fork + "@types/json-schema": ^7.0.11 + "@types/lodash": ^4.14.182 + "@types/prettier": ^2.6.1 + cli-color: ^2.0.2 + get-stdin: ^8.0.0 + glob: ^7.1.6 + glob-promise: ^4.2.2 + is-glob: ^4.0.3 + lodash: ^4.17.21 + minimist: ^1.2.6 + mkdirp: ^1.0.4 + mz: ^2.7.0 + prettier: ^2.6.2 + bin: + json2ts: dist/src/cli.js + checksum: e6d894741703c27e0fae4a30b860fdd063439b78b5f98175203a6fe0c1152e2f6a5c4be34fbb2bc460ca50e14d8514cb3da3979d4f912bf2c6fada2f97b551b8 + languageName: node + linkType: hard + "json-schema-traverse@npm:^0.4.1": version: 0.4.1 resolution: "json-schema-traverse@npm:0.4.1" @@ -13275,7 +13425,8 @@ __metadata: "@aws-sdk/types": ^3.357.0 "@azure/storage-blob": ^12.15.0 "@clickhouse/client": ^0.0.14 - "@cloudflare/workers-types": ^4.20230904.0 + 
"@cloudflare/ai": ^1.0.12 + "@cloudflare/workers-types": ^4.20230922.0 "@elastic/elasticsearch": ^8.4.0 "@faker-js/faker": ^7.6.0 "@getmetal/metal-sdk": ^4.0.0 @@ -13416,7 +13567,7 @@ __metadata: "@aws-sdk/credential-provider-node": ^3.388.0 "@azure/storage-blob": ^12.15.0 "@clickhouse/client": ^0.0.14 - "@cloudflare/workers-types": ^4.20230904.0 + "@cloudflare/ai": ^1.0.12 "@elastic/elasticsearch": ^8.4.0 "@getmetal/metal-sdk": "*" "@getzep/zep-js": ^0.7.0 @@ -13509,7 +13660,7 @@ __metadata: optional: true "@clickhouse/client": optional: true - "@cloudflare/workers-types": + "@cloudflare/ai": optional: true "@elastic/elasticsearch": optional: true @@ -14125,6 +14276,15 @@ __metadata: languageName: node linkType: hard +"lru-queue@npm:^0.1.0": + version: 0.1.0 + resolution: "lru-queue@npm:0.1.0" + dependencies: + es5-ext: ~0.10.2 + checksum: 7f2c53c5e7f2de20efb6ebb3086b7aea88d6cf9ae91ac5618ece974122960c4e8ed04988e81d92c3e63d60b12c556b14d56ef7a9c5a4627b23859b813e39b1a2 + languageName: node + linkType: hard + "macos-release@npm:^3.1.0": version: 3.1.0 resolution: "macos-release@npm:3.1.0" @@ -14317,6 +14477,22 @@ __metadata: languageName: node linkType: hard +"memoizee@npm:^0.4.15": + version: 0.4.15 + resolution: "memoizee@npm:0.4.15" + dependencies: + d: ^1.0.1 + es5-ext: ^0.10.53 + es6-weak-map: ^2.0.3 + event-emitter: ^0.3.5 + is-promise: ^2.2.2 + lru-queue: ^0.1.0 + next-tick: ^1.1.0 + timers-ext: ^0.1.7 + checksum: 4065d94416dbadac56edf5947bf342beca0e9f051f33ad60d7c4baf3f6ca0f3c6fdb770c5caed5a89c0ceaf9121428582f396445d591785281383d60aa883418 + languageName: node + linkType: hard + "memory-pager@npm:^1.0.2": version: 1.5.0 resolution: "memory-pager@npm:1.5.0" @@ -14769,7 +14945,7 @@ __metadata: languageName: node linkType: hard -"mz@npm:^2.4.0": +"mz@npm:^2.4.0, mz@npm:^2.7.0": version: 2.7.0 resolution: "mz@npm:2.7.0" dependencies: @@ -14833,7 +15009,7 @@ __metadata: languageName: node linkType: hard -"next-tick@npm:^1.1.0": +"next-tick@npm:1, 
next-tick@npm:^1.1.0": version: 1.1.0 resolution: "next-tick@npm:1.1.0" checksum: 83b5cf36027a53ee6d8b7f9c0782f2ba87f4858d977342bfc3c20c21629290a2111f8374d13a81221179603ffc4364f38374b5655d17b6a8f8a8c77bdea4fe8b @@ -16206,6 +16382,15 @@ __metadata: languageName: node linkType: hard +"prettier@npm:^2.6.2": + version: 2.8.8 + resolution: "prettier@npm:2.8.8" + bin: + prettier: bin-prettier.js + checksum: b49e409431bf129dd89238d64299ba80717b57ff5a6d1c1a8b1a28b590d998a34e083fa13573bc732bb8d2305becb4c9a4407f8486c81fa7d55100eb08263cf8 + languageName: node + linkType: hard + "prettier@npm:^2.8.3": version: 2.8.4 resolution: "prettier@npm:2.8.4" @@ -18065,6 +18250,16 @@ __metadata: languageName: node linkType: hard +"timers-ext@npm:^0.1.7": + version: 0.1.7 + resolution: "timers-ext@npm:0.1.7" + dependencies: + es5-ext: ~0.10.46 + next-tick: 1 + checksum: ef3f27a0702a88d885bcbb0317c3e3ecd094ce644da52e7f7d362394a125d9e3578292a8f8966071a980d8abbc3395725333b1856f3ae93835b46589f700d938 + languageName: node + linkType: hard + "titleize@npm:^3.0.0": version: 3.0.0 resolution: "titleize@npm:3.0.0"