Skip to content

Commit

Permalink
fixed pyscripts and gen signed url
Browse files: browse the repository at this point in the history
  • Loading branch information
ishaan99k committed Oct 5, 2024
1 parent 1a0e690 commit 29ef489
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 12 deletions.
10 changes: 8 additions & 2 deletions chunkmydocs/src/utils/server/get_task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,13 @@ pub async fn create_task_from_row(
let message = row.get::<_, Option<String>>("message").unwrap_or_default();
let file_name = row.get::<_, Option<String>>("file_name");
let page_count = row.get::<_, Option<i32>>("page_count");
let pdf_location = row.get::<_, Option<String>>("pdf_location");
let s3_pdf_location: Option<String> = row.get("pdf_location");
let pdf_location = match s3_pdf_location {
Some(location) => generate_presigned_url(s3_client, &location, None)
.await
.ok(),
None => None,
};
let input_location: String = row.get("input_location");
let input_file_url = generate_presigned_url(s3_client, &input_location, None)
.await
Expand Down Expand Up @@ -77,7 +83,7 @@ pub async fn create_task_from_row(
configuration,
file_name,
page_count,
pdf_location,
pdf_location: pdf_location.map(|s| s.to_string()),
})
}

Expand Down
10 changes: 5 additions & 5 deletions chunkmydocs/src/utils/workers/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,13 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
}
let config = Config::from_env()?;

let pdf_location = format!(
let s3_pdf_location = format!(
"s3://{}/{}/{}/{}",
config.s3_bucket, user_id, task_id,
if file_name.ends_with(".pdf") {
file_name.to_string()
} else {
format!("{}.pdf", file_name.rsplit('.').next().unwrap_or(&file_name))
format!("{}.pdf", file_name)
}
);
let page_count = match Document::load(&final_output_path) {
Expand All @@ -144,8 +144,8 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
};

//upload to s3 the pdf file.

let _ = match upload_to_s3(&s3_client, &pdf_location, &final_output_path).await {
println!("Uploading PDF to S3: {}", s3_pdf_location);
let _ = match upload_to_s3(&s3_client, &s3_pdf_location, &final_output_path).await {
Ok(url) => url,
Err(e) => return Err(format!("Failed to upload PDF to S3: {}", e).into()),
};
Expand All @@ -154,7 +154,7 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
match client
.execute(
"UPDATE tasks SET pdf_location = $1, page_count = $2, input_file_type = $3 WHERE task_id = $4",
&[&pdf_location, &page_count, &extension, &task_id],
&[&s3_pdf_location, &page_count, &extension, &task_id],
)
.await
{
Expand Down
17 changes: 12 additions & 5 deletions pyscripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
from enum import Enum
import numpy as np
from PyPDF2 import PdfReader, PdfWriter

import requests
import urllib.request
from api import process_file
from download import download_file
from models import Model, TableOcr, OcrStrategy, UploadForm
from models import Model, TableOcr, OcrStrategy, UploadForm, TaskResponse
from annotate import draw_bounding_boxes

import json
Expand Down Expand Up @@ -58,7 +59,7 @@ def extract_and_annotate_file(file_path: str, model: Model, target_chunk_length:

print(f"Processing file: {file_path}")
upload_form = UploadForm(file=file_path, model=model, target_chunk_length=target_chunk_length, ocr_strategy=ocr_strategy)
task = process_file(upload_form)
task: TaskResponse = process_file(upload_form)
output = task.output
print(f"File processed: {file_path}")

Expand All @@ -69,8 +70,14 @@ def extract_and_annotate_file(file_path: str, model: Model, target_chunk_length:
output_json_path = save_to_json(output_json_path, output, file_name)
print(f"Downloaded bounding boxes for {file_path}")

print(f"Annotating file: {file_path}")
draw_bounding_boxes(file_path, output, output_annotated_path)
if task.pdf_location:
temp_pdf_path = os.path.join(output_dir, f"{file_name}_temp.pdf")
urllib.request.urlretrieve(task.pdf_location, temp_pdf_path)
print(f"Annotating file: {temp_pdf_path}")
draw_bounding_boxes(temp_pdf_path, output, output_annotated_path)
os.remove(temp_pdf_path)
else:
draw_bounding_boxes(file_path, output, output_annotated_path)
print(f"File annotated: {file_path}")

import concurrent.futures
Expand Down
1 change: 1 addition & 0 deletions pyscripts/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class TaskResponse(BaseModel):
configuration: Configuration
file_name: Optional[str] = None
page_count: Optional[int] = None
pdf_location: Optional[str] = None

class Config:
json_encoders = {datetime: lambda v: v.isoformat()}
Expand Down

0 comments on commit 29ef489

Please sign in to comment.