Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ak/doctr #251

Merged
merged 18 commits into from
Nov 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ AWS__REGION=us-east-1
AWS__SECRET_KEY=minioadmin
LLM__KEY= # Add your OpenAI key here
PG__URL=postgresql://postgres:postgres@postgres:5432/chunkr
REDIS__URL=redis://redis:6379
RRQ__URL=http://rrq:8000
SEARCH__DENSE_VECTOR_URL=http://dense-vector:80
WORKER__GENERAL_OCR_URL=http://general-ocr:8000
WORKER__GENERAL_OCR_URL=http://doctr:8000
WORKER__PDLA_FAST_URL=http://pdla:8000
WORKER__PDLA_URL=http://pdla:8000
WORKER__TABLE_OCR_URL=http://table-ocr:8000
Expand Down
76 changes: 76 additions & 0 deletions chunkmydocs/src/models/workers/general_ocr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,79 @@ pub struct PaddleOCRResponse {
pub error_msg: String,
pub result: GeneralOcrResult,
}

/// Deserialized response of the docTR OCR endpoint: the recognized content of
/// a page together with the processing time reported alongside it.
#[derive(Debug, Serialize, Deserialize)]
pub struct DoctrResponse {
/// Recognized page content (blocks -> lines -> words).
pub page_content: PageContent,
/// Processing time reported in the response.
/// NOTE(review): unit is not visible here (presumably seconds) — confirm.
pub processing_time: f64,
}

/// Flattens a docTR page response into one [`OCRResult`] per recognized word.
///
/// Each word's `geometry` is read as two corner points,
/// `[[x0, y0], [x1, y1]]`. The x values are scaled by `dimensions[1]` and the
/// y values by `dimensions[0]`, and the resulting box is stored as
/// `left`/`top`/`width`/`height`.
///
/// Malformed payloads do not panic:
/// * if `dimensions` holds fewer than two entries, no word can be scaled and
///   an empty vector is returned;
/// * a word whose `geometry` lacks two points with two coordinates each is
///   skipped.
impl From<DoctrResponse> for Vec<OCRResult> {
    fn from(payload: DoctrResponse) -> Self {
        let page = payload.page_content;

        // Index 0 scales the vertical axis, index 1 the horizontal axis.
        // Read once instead of re-indexing for every word.
        let (page_height, page_width) = match page.dimensions.as_slice() {
            [height, width, ..] => (*height as f32, *width as f32),
            _ => return Vec::new(),
        };

        page.blocks
            .into_iter()
            .flat_map(|block| block.lines)
            .flat_map(|line| line.words)
            .filter_map(|word| {
                // Copy the four coordinates out so the borrow of
                // `word.geometry` ends before `word.value` is moved.
                let (x0, y0, x1, y1) = match word.geometry.as_slice() {
                    [first, second, ..] => match (first.as_slice(), second.as_slice()) {
                        ([x0, y0, ..], [x1, y1, ..]) => (*x0, *y0, *x1, *y1),
                        _ => return None,
                    },
                    _ => return None,
                };

                let left = x0 as f32 * page_width;
                let top = y0 as f32 * page_height;
                let right = x1 as f32 * page_width;
                let bottom = y1 as f32 * page_height;

                Some(OCRResult {
                    bbox: BoundingBox {
                        left,
                        top,
                        width: right - left,
                        height: bottom - top,
                    },
                    text: word.value,
                    confidence: Some(word.confidence as f32),
                })
            })
            .collect()
    }
}

/// Content recognized on a single page, as nested blocks, lines and words,
/// plus page-level metadata.
#[derive(Debug, Serialize, Deserialize)]
pub struct PageContent {
/// Index of the page.
/// NOTE(review): presumably zero-based — confirm against the OCR service.
pub page_idx: i32,
/// Page size. The `From<DoctrResponse>` conversion multiplies y coordinates
/// by index 0 and x coordinates by index 1, i.e. it treats this as
/// `[height, width]`.
pub dimensions: Vec<i32>,
/// Detected page orientation; the value may be absent.
pub orientation: Detection<Option<f64>>,
/// Detected page language; the value may be absent.
pub language: Detection<Option<String>>,
/// Text blocks found on the page.
pub blocks: Vec<Block>,
}

/// A detected value of type `T` paired with an optional confidence.
#[derive(Debug, Serialize, Deserialize)]
pub struct Detection<T> {
/// The detected value.
pub value: T,
/// Confidence attached to `value`, when one is provided.
pub confidence: Option<f64>,
}

/// A block of the page: a group of lines with its own geometry.
#[derive(Debug, Serialize, Deserialize)]
pub struct Block {
/// Geometry of the block as a list of points.
/// NOTE(review): presumably the same `[[x0, y0], [x1, y1]]` layout that
/// the conversion code reads from `Word::geometry` — confirm.
pub geometry: Vec<Vec<f64>>,
/// Objectness score reported for the block.
pub objectness_score: f64,
/// Lines contained in the block.
pub lines: Vec<Line>,
/// Artefacts attached to the block, deserialized as plain strings.
pub artefacts: Vec<String>,
}

/// A line inside a [`Block`]: a sequence of words with its own geometry.
#[derive(Debug, Serialize, Deserialize)]
pub struct Line {
/// Geometry of the line as a list of points.
/// NOTE(review): presumably the same layout as `Word::geometry` — confirm.
pub geometry: Vec<Vec<f64>>,
/// Objectness score reported for the line.
pub objectness_score: f64,
/// Words contained in the line.
pub words: Vec<Word>,
}

/// A single recognized word; the unit that the `From<DoctrResponse>`
/// conversion turns into one `OCRResult`.
#[derive(Debug, Serialize, Deserialize)]
pub struct Word {
/// Recognized text; becomes `OCRResult::text`.
pub value: String,
/// Recognition confidence; narrowed to `f32` for `OCRResult::confidence`.
pub confidence: f64,
/// Two corner points, read by the conversion as `[[x0, y0], [x1, y1]]` and
/// multiplied by the page dimensions.
/// NOTE(review): presumably normalized to the 0..1 range — confirm.
pub geometry: Vec<Vec<f64>>,
/// Objectness score reported for the word.
pub objectness_score: f64,
/// Detected orientation of the word crop, with its confidence.
pub crop_orientation: Detection<i32>,
}
2 changes: 1 addition & 1 deletion chunkmydocs/src/utils/configs/throttle_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub struct Config {
}

fn default_general_ocr_rate_limit() -> f32 {
6.0
20.0
}

fn default_llm_ocr_rate_limit() -> f32 {
Expand Down
14 changes: 13 additions & 1 deletion chunkmydocs/src/utils/configs/worker_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,23 @@ use serde::{Deserialize, Serialize};
use std::time::Duration;

#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum TableOcrModel {
/// Model choices for general OCR; the worker `Config` defaults to `Doctr`
/// through `default_general_ocr_model`.
pub enum GeneralOcrModel {
Doctr,
Paddle,
}

/// Model choices for table OCR.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub enum TableOcrModel {
LLM,
Paddle,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Config {
#[serde(default = "default_batch_size")]
pub batch_size: i32,
#[serde(default = "default_general_ocr_model")]
pub general_ocr_model: GeneralOcrModel,
#[serde(default = "default_general_ocr_url")]
pub general_ocr_url: Option<String>,
#[serde(default = "default_ocr_confidence_threshold")]
Expand Down Expand Up @@ -63,6 +71,10 @@ fn default_batch_size() -> i32 {
300
}

/// Serde default for `Config::general_ocr_model`: `GeneralOcrModel::Doctr`.
fn default_general_ocr_model() -> GeneralOcrModel {
GeneralOcrModel::Doctr
}

/// Serde default for `Config::general_ocr_url`: `http://localhost:8003`.
fn default_general_ocr_url() -> Option<String> {
Some("http://localhost:8003".to_string())
}
Expand Down
7 changes: 5 additions & 2 deletions chunkmydocs/src/utils/rate_limit.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::utils::configs::throttle_config::Config as ThrottleConfig;
use crate::utils::db::deadpool_redis::{Pool, RedisResult};
use crate::utils::db::deadpool_redis::{Pool, RedisError, RedisResult};
use std::time::Duration;

pub struct RateLimiter {
Expand Down Expand Up @@ -82,7 +82,10 @@ impl RateLimiter {
}
tokio::time::sleep(Duration::from_millis(50)).await;
}
Ok(false)
Err(RedisError::from((
redis::ErrorKind::BusyLoadingError,
"Rate limit timeout exceeded",
)))
}
}

Expand Down
2 changes: 1 addition & 1 deletion chunkmydocs/src/utils/services/llm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ mod tests {
use super::*;

use crate::utils::configs::llm_config::{get_prompt, Config as LlmConfig};
use crate::utils::services::segment_ocr::get_html_from_llm_table_ocr;
use crate::utils::services::ocr::get_html_from_llm_table_ocr;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
Expand Down
2 changes: 1 addition & 1 deletion chunkmydocs/src/utils/services/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ pub mod images;
pub mod llm;
pub mod log;
pub mod ocr;
pub mod ocr_url;
pub mod payload;
pub mod pdf;
pub mod pdla;
pub mod search;
pub mod segment_ocr;
pub mod structured_extraction;
Loading