Fix pyscripts and generate a presigned URL for pdf_location

lumina-ai-inc · Oct 5, 2024 · 29ef489
1 parent 1a0e690
commit 29ef489
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 12 deletions.
diff --git a/chunkmydocs/src/utils/server/get_task.rs b/chunkmydocs/src/utils/server/get_task.rs
@@ -45,7 +45,13 @@ pub async fn create_task_from_row(
     let message = row.get::<_, Option<String>>("message").unwrap_or_default();
     let file_name = row.get::<_, Option<String>>("file_name");
     let page_count = row.get::<_, Option<i32>>("page_count");
-    let pdf_location = row.get::<_, Option<String>>("pdf_location");
+    let s3_pdf_location: Option<String> = row.get("pdf_location");
+    let pdf_location = match s3_pdf_location {
+        Some(location) => generate_presigned_url(s3_client, &location, None)
+            .await
+            .ok(),
+        None => None,
+    };
     let input_location: String = row.get("input_location");
     let input_file_url = generate_presigned_url(s3_client, &input_location, None)
         .await
@@ -77,7 +83,7 @@ pub async fn create_task_from_row(
         configuration,
         file_name,
         page_count,
-        pdf_location,
+        pdf_location: pdf_location.map(|s| s.to_string()),
     })
 }
 

diff --git a/chunkmydocs/src/utils/workers/process.rs b/chunkmydocs/src/utils/workers/process.rs
@@ -129,13 +129,13 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
         }
         let config = Config::from_env()?;
 
-        let pdf_location = format!(
+        let s3_pdf_location = format!(
             "s3://{}/{}/{}/{}",
             config.s3_bucket, user_id, task_id, 
             if file_name.ends_with(".pdf") {
                 file_name.to_string()
             } else {
-                format!("{}.pdf", file_name.rsplit('.').next().unwrap_or(&file_name))
+                format!("{}.pdf", file_name)
             }
         );
         let page_count = match Document::load(&final_output_path) {
@@ -144,8 +144,8 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
         };
 
         //upload to s3 the pdf file.
-
-        let _ = match upload_to_s3(&s3_client, &pdf_location, &final_output_path).await {
+        println!("Uploading PDF to S3: {}", s3_pdf_location);
+        let _ = match upload_to_s3(&s3_client, &s3_pdf_location, &final_output_path).await {
             Ok(url) => url,
             Err(e) => return Err(format!("Failed to upload PDF to S3: {}", e).into()),
         };
@@ -154,7 +154,7 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
         match client
             .execute(
                 "UPDATE tasks SET pdf_location = $1, page_count = $2, input_file_type = $3 WHERE task_id = $4",
-                &[&pdf_location, &page_count, &extension, &task_id],
+                &[&s3_pdf_location, &page_count, &extension, &task_id],
             )
             .await
         {

diff --git a/pyscripts/main.py b/pyscripts/main.py
@@ -9,10 +9,11 @@
 from enum import Enum
 import numpy as np
 from PyPDF2 import PdfReader, PdfWriter
-
+import requests
+import urllib.request
 from api import process_file
 from download import download_file
-from models import Model, TableOcr, OcrStrategy, UploadForm
+from models import Model, TableOcr, OcrStrategy, UploadForm, TaskResponse
 from annotate import draw_bounding_boxes
 
 import json
@@ -58,7 +59,7 @@ def extract_and_annotate_file(file_path: str, model: Model, target_chunk_length:
 
     print(f"Processing file: {file_path}")
     upload_form = UploadForm(file=file_path, model=model, target_chunk_length=target_chunk_length, ocr_strategy=ocr_strategy)
-    task = process_file(upload_form)
+    task: TaskResponse = process_file(upload_form)
     output = task.output
     print(f"File processed: {file_path}")
 
@@ -69,8 +70,14 @@ def extract_and_annotate_file(file_path: str, model: Model, target_chunk_length:
     output_json_path = save_to_json(output_json_path, output, file_name)
     print(f"Downloaded bounding boxes for {file_path}")
 
-    print(f"Annotating file: {file_path}")
-    draw_bounding_boxes(file_path, output, output_annotated_path)
+    if task.pdf_location:
+        temp_pdf_path = os.path.join(output_dir, f"{file_name}_temp.pdf")
+        urllib.request.urlretrieve(task.pdf_location, temp_pdf_path)
+        print(f"Annotating file: {temp_pdf_path}")
+        draw_bounding_boxes(temp_pdf_path, output, output_annotated_path)
+        os.remove(temp_pdf_path)
+    else:
+        draw_bounding_boxes(file_path, output, output_annotated_path)
     print(f"File annotated: {file_path}")
 
 import concurrent.futures

diff --git a/pyscripts/models.py b/pyscripts/models.py
@@ -42,6 +42,7 @@ class TaskResponse(BaseModel):
     configuration: Configuration
     file_name: Optional[str] = None
     page_count: Optional[int] = None
+    pdf_location: Optional[str] = None
 
     class Config:
         json_encoders = {datetime: lambda v: v.isoformat()}