diff --git a/chunkmydocs/src/utils/configs/extraction_config.rs b/chunkmydocs/src/utils/configs/extraction_config.rs index f2a0b9a73..a9da2f497 100644 --- a/chunkmydocs/src/utils/configs/extraction_config.rs +++ b/chunkmydocs/src/utils/configs/extraction_config.rs @@ -30,6 +30,8 @@ pub struct Config { pub page_image_density: f32, #[serde(default = "default_segment_bbox_offset")] pub segment_bbox_offset: f32, + #[serde(default = "default_page_limit")] + pub page_limit: i32, } fn default_ocr_concurrency() -> usize { @@ -52,6 +54,10 @@ fn default_segment_bbox_offset() -> f32 { 5.0 } +fn default_page_limit() -> i32 { + 500 +} + mod duration_seconds { use serde::{ Deserialize, Deserializer, Serializer }; use std::time::Duration; diff --git a/chunkmydocs/src/utils/workers/preprocess.rs b/chunkmydocs/src/utils/workers/preprocess.rs index 74b04a4e5..0a1c6da30 100644 --- a/chunkmydocs/src/utils/workers/preprocess.rs +++ b/chunkmydocs/src/utils/workers/preprocess.rs @@ -13,7 +13,6 @@ use crate::utils::storage::services::{ download_to_given_tempfile, upload_to_s3 use chrono::Utc; use std::{ error::Error, path::{ Path, PathBuf }, process::Command }; use tempfile::{ NamedTempFile, TempDir }; -use std::time::Instant; fn is_valid_file_type(file_path: &Path) -> Result<(bool, String), Box> { let output = Command::new("file") @@ -89,6 +88,7 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box> = (async { log_task( @@ -111,8 +111,6 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box Result<(), Box config.page_limit { + log_task( + task_id.clone(), + Status::Failed, + Some(format!("File must be less than {} pages", config.page_limit)), + Some(Utc::now()), + &pg_pool + ).await?; + return Ok(false); + } + + extraction_payload.page_count = Some(page_count); let update_page_count = client.execute( "UPDATE tasks SET page_count = $1, input_file_type = $2 WHERE task_id = $3", @@ -208,7 +212,6 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box { println!("Task succeeded"); if value { - let config = Config::from_env()?; let queue_name = match extraction_payload.model { SegmentationModel::PdlaFast => config.queue_fast, SegmentationModel::Pdla => config.queue_high_quality,