Skip to content

Commit

Permalink
add DOCXProcessor for docx-Files
Browse files Browse the repository at this point in the history
  • Loading branch information
Elias authored and Elias committed Feb 21, 2024
1 parent 4d765da commit 3530b0a
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 5 deletions.
2 changes: 1 addition & 1 deletion configs/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ strategies:
# file_name_pattern: a regex pattern selecting the files to be processed
file_name_pattern: 'regex_pattern_identifying_your_file in /path/to/your_directory'
# file_format
# currently only 'pdf' and 'txt' are supported
# currently only 'pdf', 'docx', and 'txt' are supported
file_format: 'pdf'
# terms
terms:
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ classifiers = [
dependencies = [
"pandas>=2.2.0",
"pydantic>=2.6.1",
"python-docx>=1.1.0",
"python-pptx>=0.6.23",
"pyyaml>=6.0.1",
"pdfminer.six>=20231228",
"sqlalchemy>=2.0.27",
Expand Down
11 changes: 10 additions & 1 deletion src/rosinenpicker/processors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pdfminer.high_level import extract_text
from docx import Document
import re
from .patterns import Pattern

Expand Down Expand Up @@ -102,7 +103,15 @@ class TXTProcessor(DocumentProcessor):
def extract_text(self, file_path):
    """Read the whole file at ``file_path`` and store it in ``self.text``.

    The file is opened in text mode without an explicit encoding, so the
    interpreter's default text encoding is used for decoding.
    """
    with open(file_path) as source:
        contents = source.read()
    self.text = contents


class DOCXProcessor(DocumentProcessor):
    """Document processor for Word (.docx) files, loaded through ``docx.Document``."""

    def extract_text(self, file_path):
        """Load the document at ``file_path`` and store its paragraph text in ``self.text``.

        Each paragraph contributes a newline followed by the paragraph's text,
        so a non-empty document yields text that starts with a newline and an
        empty document yields the empty string.

        NOTE(review): only ``Document.paragraphs`` is read; text held in
        tables or headers/footers is presumably not included -- confirm
        whether that is intended.
        """
        document = Document(file_path)
        # A single join avoids re-copying the accumulated string for every
        # paragraph while producing exactly the same result as repeated
        # ``txt = txt + '\n' + p.text``.
        self.text = "".join(f"\n{paragraph.text}" for paragraph in document.paragraphs)


# Placeholder for future extensions
# class MarkdownProcessor(DocumentProcessor):
Expand Down
2 changes: 1 addition & 1 deletion src/rosinenpicker/pydantic_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def validate_export_format(cls, ef: str):
@field_validator('file_format')
@classmethod
def validate_file_format(cls, ff: str):
    """Validate the ``file_format`` field.

    Returns ``ff`` unchanged when it is one of the supported formats.

    Raises:
        ConfigError: if ``ff`` is not in the set of supported formats.
    """
    # Single source of truth for the accepted formats; the earlier,
    # immediately-overwritten assignment without "docx" has been dropped.
    valid_formats = {"txt", "pdf", "docx"}
    if ff not in valid_formats:
        raise ConfigError(msg=f"Concerning '{ff}': File format must conform to one of these options: {valid_formats}!")
    return ff
Expand Down
4 changes: 2 additions & 2 deletions src/rosinenpicker/start.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
from .database import Base, DbRun, DbStrategy, DbProcessedFile, DbMatch
from .utils import file_sha256
from .exporter import BaseExporter, CSVExporter, XLSXExporter, HTMLExporter, JSONExporter
from .processors import DocumentProcessor, PDFProcessor, TXTProcessor
from .processors import DocumentProcessor, PDFProcessor, TXTProcessor, DOCXProcessor
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker, Session
import argparse

# Lookup tables keyed by the configured `file_format` / `export_format`
# value; each maps to the processor / exporter class for that option.
file_format_options = {"pdf": PDFProcessor, "txt": TXTProcessor, "docx": DOCXProcessor}
export_format_options = {"csv": CSVExporter, "xlsx": XLSXExporter, "html": HTMLExporter, "json": JSONExporter}

def read_config_file(config_file: str) -> Config:
Expand Down

0 comments on commit 3530b0a

Please sign in to comment.