Skip to content

Commit

Permalink
segments in alphabetical order
Browse files Browse the repository at this point in the history
  • Loading branch information
akhileshsharma99 committed Jan 21, 2025
1 parent 8bf37de commit 293ece8
Show file tree
Hide file tree
Showing 10 changed files with 21 additions and 49 deletions.
4 changes: 1 addition & 3 deletions apps/web/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,4 @@ VITE_KEYCLOAK_URL=
VITE_KEYCLOAK_REALM=
VITE_KEYCLOAK_CLIENT_ID=
VITE_KEYCLOAK_REDIRECT_URI=http://localhost:5173
VITE_KEYCLOAK_POST_LOGOUT_REDIRECT_URI=http://localhost:5173

VITE_FEATURE_FLAG_PIPELINE=false # true enables Azure Document Intelligence layout analysis, OCR and segment processing heuristics
VITE_KEYCLOAK_POST_LOGOUT_REDIRECT_URI=http://localhost:5173
21 changes: 11 additions & 10 deletions apps/web/src/components/Upload/ConfigControls.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -171,21 +171,23 @@ export function SegmentProcessingControls({
onChange,
showOnlyPage = false,
}: SegmentProcessingControlsProps) {
const [selectedType, setSelectedType] =
useState<keyof SegmentProcessing>("Text");
const [isDropdownOpen, setIsDropdownOpen] = useState(false);
const segmentTypes = showOnlyPage
? (["Page"] as (keyof SegmentProcessing)[])
: (Object.keys(value).filter(
(key) => key !== "Page"
) as (keyof SegmentProcessing)[]);
: (Object.keys(value)
.filter((key) => key !== "Page")
.sort() as (keyof SegmentProcessing)[]);

const defaultSegmentType = segmentTypes[0];
const [selectedType, setSelectedType] =
useState<keyof SegmentProcessing>(defaultSegmentType);
const [isDropdownOpen, setIsDropdownOpen] = useState(false);
const dropdownRef = useRef<HTMLDivElement>(null);

useEffect(() => {
if (showOnlyPage && selectedType !== "Page") {
setSelectedType("Page");
} else if (!showOnlyPage && selectedType === "Page") {
setSelectedType("Text"); // or any other default segment type
setSelectedType(defaultSegmentType);
}
}, [selectedType, showOnlyPage]);

Expand Down Expand Up @@ -270,9 +272,8 @@ export function SegmentProcessingControls({
{segmentTypes.map((type) => (
<button
key={type}
className={`segment-dropdown-item ${
selectedType === type ? "active" : ""
} ${isSegmentModified(type) ? "modified" : ""}`}
className={`segment-dropdown-item ${selectedType === type ? "active" : ""
} ${isSegmentModified(type) ? "modified" : ""}`}
onClick={() => handleTypeSelect(type)}
type="button"
>
Expand Down
1 change: 1 addition & 0 deletions apps/web/src/components/Upload/UploadMain.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ export default function UploadMain({
ocr_strategy: config.ocr_strategy,
segment_processing: getEffectiveSegmentProcessing(config),
segmentation_strategy: config.segmentation_strategy,
pipeline: config.pipeline,
};

const response = await uploadFile(uploadPayload);
Expand Down
4 changes: 2 additions & 2 deletions apps/web/src/models/taskConfig.model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,8 @@ const DEFAULT_FORMULA_CONFIG: SegmentProcessingConfig = {

const DEFAULT_PICTURE_CONFIG: SegmentProcessingConfig = {
crop_image: CroppingStrategy.All,
html: GenerationStrategy.LLM,
markdown: GenerationStrategy.LLM,
html: GenerationStrategy.Auto,
markdown: GenerationStrategy.Auto,
};

export const DEFAULT_SEGMENT_PROCESSING: SegmentProcessing = {
Expand Down
2 changes: 1 addition & 1 deletion compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ services:
env_file:
- .env
deploy:
replicas: 1
replicas: 0
restart: always
segmentation:
image: luminainc/segmentation:df6e5375
Expand Down
4 changes: 3 additions & 1 deletion core/src/models/chunkr/pipeline.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::models::chunkr::output::OutputResponse;
use crate::models::chunkr::task::{Status, Task, TaskPayload};
use crate::utils::services::file_operations::convert_to_pdf;
use crate::utils::services::pdf::count_pages;
use crate::utils::storage::services::download_to_tempfile;
use chrono::{DateTime, Utc};
use dashmap::DashMap;
Expand Down Expand Up @@ -73,11 +74,12 @@ impl Pipeline {
};
println!("Task initialized with input file");
}
let page_count = count_pages(self.pdf_file.as_ref().unwrap())?;
task.update(
Some(Status::Processing),
Some("Task started".to_string()),
None,
None,
Some(page_count),
Some(Utc::now()),
None,
None,
Expand Down
1 change: 0 additions & 1 deletion core/src/pipeline/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ pub mod crop;
pub mod segment_processing;
pub mod segmentation_and_ocr;
pub mod structured_extraction;
pub mod update_metadata;

#[cfg(feature = "azure")]
pub mod azure;
26 changes: 0 additions & 26 deletions core/src/pipeline/update_metadata.rs

This file was deleted.

5 changes: 1 addition & 4 deletions core/src/workers/task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ use core::pipeline::crop;
use core::pipeline::segment_processing;
use core::pipeline::segmentation_and_ocr;
use core::pipeline::structured_extraction;
use core::pipeline::update_metadata;
use core::utils::clients::initialize;
use core::utils::rrq::consumer::consumer;

Expand All @@ -33,7 +32,6 @@ async fn execute_step(
"segmentation_and_ocr" => segmentation_and_ocr::process(pipeline).await,
"segment_processing" => segment_processing::process(pipeline).await,
"structured_extraction" => structured_extraction::process(pipeline).await,
"update_metadata" => update_metadata::process(pipeline).await,
_ => Err(format!("Unknown function: {}", step).into()),
}?;
let duration = start.elapsed();
Expand All @@ -52,7 +50,7 @@ async fn execute_step(
fn orchestrate_task(
pipeline: &mut Pipeline,
) -> Result<Vec<&'static str>, Box<dyn std::error::Error>> {
let mut steps = vec!["update_metadata", "convert_to_images"];
let mut steps = vec!["convert_to_images"];
#[cfg(feature = "azure")]
{
match pipeline.get_task()?.configuration.pipeline.clone() {
Expand Down Expand Up @@ -89,7 +87,6 @@ pub async fn process(payload: QueuePayload) -> Result<(), Box<dyn std::error::Er
);
return Ok(());
}

let start_time = std::time::Instant::now();
for step in orchestrate_task(&mut pipeline)? {
execute_step(step, &mut pipeline).await?;
Expand Down
2 changes: 1 addition & 1 deletion services/doctr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
dotenv.load_dotenv(override=True)

batch_wait_time = float(os.getenv('BATCH_WAIT_TIME', 0.5))
max_batch_size = int(os.getenv('MAX_BATCH_SIZE', 240))
max_batch_size = int(os.getenv('MAX_BATCH_SIZE', 100))

app = FastAPI()

Expand Down

0 comments on commit 293ece8

Please sign in to comment.