
The processing is still very slow. Is the entire program executed in parallel or serially? #125

Closed

xuboot opened this issue May 15, 2024 · 2 comments

xuboot commented May 15, 2024

The processing is still very slow. Is the entire program executed in parallel or serially?

xuboot (Author) commented May 15, 2024

@VikParuchuri Hello, this is the configuration I changed. Can it be split into parallel processing so that it is faster, and can it also handle concurrency?

from typing import Optional, List, Dict, Literal

from dotenv import find_dotenv
from pydantic import Field, computed_field
from pydantic_settings import BaseSettings
import torch

class Settings(BaseSettings):
    # General
    TORCH_DEVICE: Optional[str] = Field(None, env="TORCH_DEVICE")  # Let PyTorch decide the best device
    IMAGE_DPI: int = 96  # DPI to render images pulled from pdf at
    EXTRACT_IMAGES: bool = True  # Extract images from pdfs and save them

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE
        return "cuda" if torch.cuda.is_available() else "cpu"

    INFERENCE_RAM: int = 15  # Reduced to avoid OOM errors on T4
    VRAM_PER_TASK: float = 3.75  # Adjusted based on T4 VRAM availability
    DEFAULT_LANG: str = "English"  # Default language we assume files to be in

    SUPPORTED_FILETYPES: Dict[str, str] = {
        "application/pdf": "pdf",
    }

    # Text line Detection
    DETECTOR_BATCH_SIZE: Optional[int] = Field(None, env="DETECTOR_BATCH_SIZE")
    SURYA_DETECTOR_DPI: int = 96
    DETECTOR_POSTPROCESSING_CPU_WORKERS: int = 4

    # OCR
    INVALID_CHARS: List[str] = [chr(0xfffd), ""]
    OCR_ENGINE: Optional[Literal["surya", "ocrmypdf"]] = Field("ocrmypdf", env="OCR_ENGINE")
    OCR_ALL_PAGES: bool = False  # Run OCR on every page even if text can be extracted

    # Surya
    SURYA_OCR_DPI: int = 96
    RECOGNITION_BATCH_SIZE: Optional[int] = Field(None, env="RECOGNITION_BATCH_SIZE")

    # Tesseract
    OCR_PARALLEL_WORKERS: int = 2  # How many CPU workers to use for OCR
    TESSERACT_TIMEOUT: int = 20  # When to give up on OCR
    TESSDATA_PREFIX: str = ""

    # Texify model
    TEXIFY_MODEL_MAX: int = 384  # Max inference length for texify
    TEXIFY_TOKEN_BUFFER: int = 256  # Number of tokens to buffer above max for texify
    TEXIFY_DPI: int = 96  # DPI to render images at
    TEXIFY_BATCH_SIZE: Optional[int] = Field(None, env="TEXIFY_BATCH_SIZE")
    TEXIFY_MODEL_NAME: str = "vikp/texify"

    # Layout model
    SURYA_LAYOUT_DPI: int = 96
    BAD_SPAN_TYPES: List[str] = [
        "Caption",
        "Footnote",
        "Page-footer",
        "Page-header",
        "Picture",
    ]
    LAYOUT_MODEL_CHECKPOINT: str = "vikp/surya_layout2"
    BBOX_INTERSECTION_THRESH: float = 0.7  # How much the layout and pdf bboxes need to overlap to be the same
    LAYOUT_BATCH_SIZE: Optional[int] = Field(None, env="LAYOUT_BATCH_SIZE")

    # Ordering model
    SURYA_ORDER_DPI: int = 96
    ORDER_BATCH_SIZE: Optional[int] = Field(None, env="ORDER_BATCH_SIZE")
    ORDER_MAX_BBOXES: int = 255

    # Final editing model
    EDITOR_BATCH_SIZE: Optional[int] = Field(None, env="EDITOR_BATCH_SIZE")
    EDITOR_MAX_LENGTH: int = 1024
    EDITOR_MODEL_NAME: str = "vikp/pdf_postprocessor_t5"
    ENABLE_EDITOR_MODEL: bool = False  # The editor model can create false positives
    EDITOR_CUTOFF_THRESH: float = 0.9  # Ignore predictions below this probability

    # Ray
    RAY_CACHE_PATH: Optional[str] = Field(None, env="RAY_CACHE_PATH")
    RAY_CORES_PER_WORKER: int = 1  # How many cpu cores to allocate per worker

    # Debug
    DEBUG: bool = Field(False, env="DEBUG")
    DEBUG_DATA_FOLDER: Optional[str] = Field(None, env="DEBUG_DATA_FOLDER")
    DEBUG_LEVEL: int = 0  # 0 to 2, 2 means log everything

    @computed_field
    @property
    def CUDA(self) -> bool:
        return "cuda" in self.TORCH_DEVICE_MODEL

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        if self.CUDA:
            return torch.bfloat16
        else:
            return torch.float32

    @computed_field
    @property
    def TEXIFY_DTYPE(self) -> torch.dtype:
        return torch.float32 if self.TORCH_DEVICE_MODEL == "cpu" else torch.float16

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"

# Create an instance of the settings

settings = Settings()
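
Since the class loads local.env through its Config, the parallelism-related fields above can be tried without editing the code. A minimal sketch, assuming a bash shell; the values are illustrative, not recommendations:

    # Override parallelism-related settings via environment variables;
    # pydantic's BaseSettings picks these up by field name at startup
    export OCR_PARALLEL_WORKERS=8      # CPU workers for tesseract OCR
    export DETECTOR_BATCH_SIZE=6       # larger detector batches if VRAM allows
    export RECOGNITION_BATCH_SIZE=32   # larger recognition batches if VRAM allows
    export TORCH_DEVICE=cuda           # skip device auto-detection

Batch sizes trade VRAM for throughput, so raise them only as far as the GPU allows.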

VikParuchuri (Owner) commented
You can run in parallel using the marker or chunk_convert scripts; see the README.
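
For reference, the parallel invocations described in the README look roughly like this; exact flags and script names can differ between marker versions, so treat this as a sketch:

    # Convert a folder of PDFs with multiple worker processes on one machine
    marker /path/to/input/folder /path/to/output/folder --workers 10 --max 10

    # Fan out across several GPUs with the chunk_convert wrapper
    MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 bash chunk_convert.sh ../pdf_in ../md_out

Each worker typically holds its own copy of the models, so worker count is bounded by VRAM (the VRAM_PER_TASK and INFERENCE_RAM settings above), not just CPU cores.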
