
The processing is still very slow. Is the entire program executed in parallel or serially? #125

Closed

xuboot opened this issue May 15, 2024 · 2 comments

xuboot commented May 15, 2024

The processing is still very slow. Is the entire program executed in parallel or serially?

xuboot (Author) commented May 15, 2024

@VikParuchuri Hello, this is the configuration I changed. Can it be split into parallel processing so that it is faster, and can it also handle concurrency?

from typing import Optional, List, Dict, Literal

from dotenv import find_dotenv
from pydantic import Field, computed_field
from pydantic_settings import BaseSettings
import torch

class Settings(BaseSettings):
    # General
    TORCH_DEVICE: Optional[str] = Field(None, env="TORCH_DEVICE")  # Let PyTorch decide the best device
    IMAGE_DPI: int = 96  # DPI to render images pulled from pdf at
    EXTRACT_IMAGES: bool = True  # Extract images from pdfs and save them

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE
        return "cuda" if torch.cuda.is_available() else "cpu"

    INFERENCE_RAM: int = 15  # Reduced to avoid OOM errors on T4
    VRAM_PER_TASK: float = 3.75  # Adjusted based on T4 VRAM availability
    DEFAULT_LANG: str = "English"  # Default language we assume files to be in

    SUPPORTED_FILETYPES: Dict[str, str] = {
        "application/pdf": "pdf",
    }

    # Text line Detection
    DETECTOR_BATCH_SIZE: Optional[int] = Field(None, env="DETECTOR_BATCH_SIZE")
    SURYA_DETECTOR_DPI: int = 96
    DETECTOR_POSTPROCESSING_CPU_WORKERS: int = 4

    # OCR
    INVALID_CHARS: List[str] = [chr(0xfffd), ""]
    OCR_ENGINE: Optional[Literal["surya", "ocrmypdf"]] = Field("ocrmypdf", env="OCR_ENGINE")
    OCR_ALL_PAGES: bool = False  # Run OCR on every page even if text can be extracted

    # Surya
    SURYA_OCR_DPI: int = 96
    RECOGNITION_BATCH_SIZE: Optional[int] = Field(None, env="RECOGNITION_BATCH_SIZE")

    # Tesseract
    OCR_PARALLEL_WORKERS: int = 2  # How many CPU workers to use for OCR
    TESSERACT_TIMEOUT: int = 20  # When to give up on OCR
    TESSDATA_PREFIX: str = ""

    # Texify model
    TEXIFY_MODEL_MAX: int = 384  # Max inference length for texify
    TEXIFY_TOKEN_BUFFER: int = 256  # Number of tokens to buffer above max for texify
    TEXIFY_DPI: int = 96  # DPI to render images at
    TEXIFY_BATCH_SIZE: Optional[int] = Field(None, env="TEXIFY_BATCH_SIZE")
    TEXIFY_MODEL_NAME: str = "vikp/texify"

    # Layout model
    SURYA_LAYOUT_DPI: int = 96
    BAD_SPAN_TYPES: List[str] = [
        "Caption",
        "Footnote",
        "Page-footer",
        "Page-header",
        "Picture",
    ]
    LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
    BBOX_INTERSECTION_THRESH: float = 0.7  # How much the layout and pdf bboxes need to overlap to be the same
    LAYOUT_BATCH_SIZE: Optional[int] = Field(None, env="LAYOUT_BATCH_SIZE")

    # Ordering model
    SURYA_ORDER_DPI: int = 96
    ORDER_BATCH_SIZE: Optional[int] = Field(None, env="ORDER_BATCH_SIZE")
    ORDER_MAX_BBOXES: int = 255

    # Final editing model
    EDITOR_BATCH_SIZE: Optional[int] = Field(None, env="EDITOR_BATCH_SIZE")
    EDITOR_MAX_LENGTH: int = 1024
    EDITOR_MODEL_NAME: str = "vikp/pdf_postprocessor_t5"
    ENABLE_EDITOR_MODEL: bool = False  # The editor model can create false positives
    EDITOR_CUTOFF_THRESH: float = 0.9  # Ignore predictions below this probability

    # Ray
    RAY_CACHE_PATH: Optional[str] = Field(None, env="RAY_CACHE_PATH")
    RAY_CORES_PER_WORKER: int = 1  # How many cpu cores to allocate per worker

    # Debug
    DEBUG: bool = Field(False, env="DEBUG")
    DEBUG_DATA_FOLDER: Optional[str] = Field(None, env="DEBUG_DATA_FOLDER")
    DEBUG_LEVEL: int = 0  # 0 to 2, 2 means log everything

    @computed_field
    @property
    def CUDA(self) -> bool:
        return "cuda" in self.TORCH_DEVICE_MODEL

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        if self.CUDA:
            return torch.bfloat16
        else:
            return torch.float32

    @computed_field
    @property
    def TEXIFY_DTYPE(self) -> torch.dtype:
        return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"

# Create an instance of the settings

settings = Settings()
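
Since the class loads local.env through its Config, the parallelism-related fields above can be tried without editing the code. A minimal sketch, assuming a bash shell; the values are illustrative, not recommendations:

    # Override parallelism-related settings via environment variables;
    # pydantic's BaseSettings picks these up by field name at startup
    export OCR_PARALLEL_WORKERS=8      # CPU workers for tesseract OCR
    export DETECTOR_BATCH_SIZE=6       # larger detector batches if VRAM allows
    export RECOGNITION_BATCH_SIZE=32   # larger recognition batches if VRAM allows
    export TORCH_DEVICE=cuda           # skip device auto-detection

Batch sizes trade VRAM for throughput, so raise them only as far as the GPU allows.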

VikParuchuri (Owner) commented
You can run in parallel using the marker or chunk_convert scripts; see the README.
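
For reference, the parallel invocations described in the README look roughly like this; exact flags and script names can differ between marker versions, so treat this as a sketch:

    # Convert a folder of PDFs with multiple worker processes on one machine
    marker /path/to/input/folder /path/to/output/folder --workers 10 --max 10

    # Fan out across several GPUs with the chunk_convert wrapper
    MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 bash chunk_convert.sh ../pdf_in ../md_out

Each worker typically holds its own copy of the models, so worker count is bounded by VRAM (the VRAM_PER_TASK and INFERENCE_RAM settings above), not just CPU cores.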
