🖼️ fix: correct image extraction (#3538)

* fix: regex issue extracting text with images in markdown * fix: update addImages function to include only the first observed image path in the response message * ci: tests for addImages function: correct image extraction * fix(GoogleClient): linting --------- Co-authored-by: Dongwoo Jeong <dongwoo.jeong@lge.com> Co-authored-by: Dongwoo Jeong <dongwoo@duck.com>
danny-avila · Aug 5, 2024 · 5baa95b · 5baa95b
1 parent 389b2a6
commit 5baa95b
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 26 deletions.
diff --git a/api/app/clients/GoogleClient.js b/api/app/clients/GoogleClient.js
@@ -626,11 +626,11 @@ class GoogleClient extends BaseClient {
     const { onProgress, abortController } = options;
     const streamRate = this.options.streamRate ?? Constants.DEFAULT_STREAM_RATE;
     const { messages: _messages, context, examples: _examples } = instances?.[0] ?? {};
-  
+
     let examples;
-  
+
     let clientOptions = { ...parameters, maxRetries: 2 };
-  
+
     if (this.project_id) {
       clientOptions['authOptions'] = {
         credentials: {
@@ -639,16 +639,16 @@ class GoogleClient extends BaseClient {
         projectId: this.project_id,
       };
     }
-  
+
     if (!parameters) {
       clientOptions = { ...clientOptions, ...this.modelOptions };
     }
-  
+
     if (this.isGenerativeModel && !this.project_id) {
       clientOptions.modelName = clientOptions.model;
       delete clientOptions.model;
     }
-  
+
     if (_examples && _examples.length) {
       examples = _examples
         .map((ex) => {
@@ -662,26 +662,26 @@ class GoogleClient extends BaseClient {
           };
         })
         .filter((ex) => ex);
-  
+
       clientOptions.examples = examples;
     }
-  
+
     const model = this.createLLM(clientOptions);
-  
+
     let reply = '';
     const messages = this.isTextModel ? _payload.trim() : _messages;
-  
+
     if (!this.isVisionModel && context && messages?.length > 0) {
       messages.unshift(new SystemMessage(context));
     }
-  
+
     const modelName = clientOptions.modelName ?? clientOptions.model ?? '';
     if (modelName?.includes('1.5') && !this.project_id) {
       const client = model;
       const requestOptions = {
         contents: _payload,
       };
-  
+
       if (this.options?.promptPrefix?.length) {
         requestOptions.systemInstruction = {
           parts: [
@@ -691,9 +691,9 @@ class GoogleClient extends BaseClient {
           ],
         };
       }
-  
+
       requestOptions.safetySettings = _payload.safetySettings;
-  
+
       const delay = modelName.includes('flash') ? 8 : 14;
       const result = await client.generateContentStream(requestOptions);
       for await (const chunk of result.stream) {
@@ -706,15 +706,15 @@ class GoogleClient extends BaseClient {
       }
       return reply;
     }
-  
+
     const stream = await model.stream(messages, {
       signal: abortController.signal,
       timeout: 7000,
       safetySettings: _payload.safetySettings,
     });
-  
+
     let delay = this.options.streamRate || 8;
-  
+
     if (!this.options.streamRate) {
       if (this.isGenerativeModel) {
         delay = 12;
@@ -723,15 +723,15 @@ class GoogleClient extends BaseClient {
         delay = 5;
       }
     }
-  
+
     for await (const chunk of stream) {
       const chunkText = chunk?.content ?? chunk;
       await this.generateTextStream(chunkText, onProgress, {
         delay,
       });
       reply += chunkText;
     }
-  
+
     return reply;
   }
 
@@ -869,17 +869,18 @@ class GoogleClient extends BaseClient {
 
   async sendCompletion(payload, opts = {}) {
     payload.safetySettings = this.getSafetySettings();
-  
+
     let reply = '';
     reply = await this.getCompletion(payload, opts);
     return reply.trim();
   }
-  
+
   getSafetySettings() {
     return [
       {
         category: 'HARM_CATEGORY_SEXUALLY_EXPLICIT',
-        threshold: process.env.GOOGLE_SAFETY_SEXUALLY_EXPLICIT || 'HARM_BLOCK_THRESHOLD_UNSPECIFIED',
+        threshold:
+          process.env.GOOGLE_SAFETY_SEXUALLY_EXPLICIT || 'HARM_BLOCK_THRESHOLD_UNSPECIFIED',
       },
       {
         category: 'HARM_CATEGORY_HATE_SPEECH',
@@ -891,7 +892,8 @@ class GoogleClient extends BaseClient {
       },
       {
         category: 'HARM_CATEGORY_DANGEROUS_CONTENT',
-        threshold: process.env.GOOGLE_SAFETY_DANGEROUS_CONTENT || 'HARM_BLOCK_THRESHOLD_UNSPECIFIED',
+        threshold:
+          process.env.GOOGLE_SAFETY_DANGEROUS_CONTENT || 'HARM_BLOCK_THRESHOLD_UNSPECIFIED',
       },
     ];
   }

diff --git a/api/app/clients/output_parsers/addImages.js b/api/app/clients/output_parsers/addImages.js
@@ -60,10 +60,10 @@ function addImages(intermediateSteps, responseMessage) {
     if (!observation || !observation.includes('![')) {
       return;
     }
-    const observedImagePath = observation.match(/!\[.*\]\([^)]*\)/g);
+    const observedImagePath = observation.match(/!\[[^(]*\]\([^)]*\)/g);
     if (observedImagePath && !responseMessage.text.includes(observedImagePath[0])) {
-      responseMessage.text += '\n' + observation;
-      logger.debug('[addImages] added image from intermediateSteps:', observation);
+      responseMessage.text += '\n' + observedImagePath[0];
+      logger.debug('[addImages] added image from intermediateSteps:', observedImagePath[0]);
     }
   });
 }

diff --git a/api/app/clients/output_parsers/addImages.spec.js b/api/app/clients/output_parsers/addImages.spec.js
@@ -81,4 +81,62 @@ describe('addImages', () => {
     addImages(intermediateSteps, responseMessage);
     expect(responseMessage.text).toBe(`${originalText}\n${imageMarkdown}`);
   });
+
+  it('should extract only image markdowns when there is text between them', () => {
+    const markdownWithTextBetweenImages = `
+      ![image1](/images/image1.png)
+      Some text between images that should not be included.
+      ![image2](/images/image2.png)
+      More text that should be ignored.
+      ![image3](/images/image3.png)
+    `;
+    intermediateSteps.push({ observation: markdownWithTextBetweenImages });
+    addImages(intermediateSteps, responseMessage);
+    expect(responseMessage.text).toBe('\n![image1](/images/image1.png)');
+  });
+
+  it('should only return the first image when multiple images are present', () => {
+    const markdownWithMultipleImages = `
+      ![image1](/images/image1.png)
+      ![image2](/images/image2.png)
+      ![image3](/images/image3.png)
+    `;
+    intermediateSteps.push({ observation: markdownWithMultipleImages });
+    addImages(intermediateSteps, responseMessage);
+    expect(responseMessage.text).toBe('\n![image1](/images/image1.png)');
+  });
+
+  it('should not include any text or metadata surrounding the image markdown', () => {
+    const markdownWithMetadata = `
+      Title: Test Document
+      Author: John Doe
+      ![image1](/images/image1.png)
+      Some content after the image.
+      Vector values: [0.1, 0.2, 0.3]
+    `;
+    intermediateSteps.push({ observation: markdownWithMetadata });
+    addImages(intermediateSteps, responseMessage);
+    expect(responseMessage.text).toBe('\n![image1](/images/image1.png)');
+  });
+
+  it('should handle complex markdown with multiple images and only return the first one', () => {
+    const complexMarkdown = `
+      # Document Title
+      
+      ## Section 1
+      Here's some text with an embedded image:
+      ![image1](/images/image1.png)
+      
+      ## Section 2
+      More text here...
+      ![image2](/images/image2.png)
+      
+      ### Subsection
+      Even more content
+      ![image3](/images/image3.png)
+    `;
+    intermediateSteps.push({ observation: complexMarkdown });
+    addImages(intermediateSteps, responseMessage);
+    expect(responseMessage.text).toBe('\n![image1](/images/image1.png)');
+  });
 });