Skip to content

Commit

Permalink
page limits added
Browse files Browse the repository at this point in the history
  • Loading branch information
akhileshsharma99 committed Oct 25, 2024
1 parent a4fc9bf commit b2e74b9
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 13 deletions.
6 changes: 6 additions & 0 deletions chunkmydocs/src/utils/configs/extraction_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ pub struct Config {
pub page_image_density: f32,
#[serde(default = "default_segment_bbox_offset")]
pub segment_bbox_offset: f32,
#[serde(default = "default_page_limit")]
pub page_limit: i32,
}

fn default_ocr_concurrency() -> usize {
Expand All @@ -52,6 +54,10 @@ fn default_segment_bbox_offset() -> f32 {
5.0
}

// Serde fallback for `Config::page_limit`: the maximum number of pages a
// document may have before preprocessing rejects it (used when the env/config
// source does not supply a value).
fn default_page_limit() -> i32 {
500
}

mod duration_seconds {
use serde::{ Deserialize, Deserializer, Serializer };
use std::time::Duration;
Expand Down
29 changes: 16 additions & 13 deletions chunkmydocs/src/utils/workers/preprocess.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ use crate::utils::storage::services::{ download_to_given_tempfile, upload_to_s3
use chrono::Utc;
use std::{ error::Error, path::{ Path, PathBuf }, process::Command };
use tempfile::{ NamedTempFile, TempDir };
use std::time::Instant;

fn is_valid_file_type(file_path: &Path) -> Result<(bool, String), Box<dyn Error>> {
let output = Command::new("file")
Expand Down Expand Up @@ -89,6 +88,7 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
let pg_pool = create_pool();
let client: Client = pg_pool.get().await?;
let temp_dir = TempDir::new().unwrap();
let config = Config::from_env()?;

let result: Result<bool, Box<dyn std::error::Error>> = (async {
log_task(
Expand All @@ -111,8 +111,6 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
e
})?;

println!("Input file downloaded to: {:?}", input_file.path());

let (is_valid, detected_mime_type) = is_valid_file_type(&input_file.path()).map_err(|e| {
eprintln!("Failed to check file type: {:?}", e);
e
Expand Down Expand Up @@ -154,19 +152,25 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
&pg_pool
).await?;

let start_time = Instant::now();
let image_paths = pdf_2_images(&pdf_path, &temp_dir.path())?;
let end_time = Instant::now();

let page_count = image_paths.len() as i32;
extraction_payload.page_count = Some(page_count);

println!("Time taken: {:?}", end_time.duration_since(start_time));
println!("Page count: {}", page_count);
println!(
"Pages per second: {:?}",
(page_count as f32) / (end_time.duration_since(start_time).as_secs() as f32)
);
println!("Page count: {}", page_count.clone());
println!("Page limit: {}", config.page_limit.clone());

if page_count > config.page_limit {
log_task(
task_id.clone(),
Status::Failed,
Some(format!("File must be less than {} pages", config.page_limit)),
Some(Utc::now()),
&pg_pool
).await?;
return Ok(false);
}

extraction_payload.page_count = Some(page_count);

let update_page_count = client.execute(
"UPDATE tasks SET page_count = $1, input_file_type = $2 WHERE task_id = $3",
Expand Down Expand Up @@ -208,7 +212,6 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
Ok(value) => {
println!("Task succeeded");
if value {
let config = Config::from_env()?;
let queue_name = match extraction_payload.model {
SegmentationModel::PdlaFast => config.queue_fast,
SegmentationModel::Pdla => config.queue_high_quality,
Expand Down

0 comments on commit b2e74b9

Please sign in to comment.