Skip to content

Commit

Permalink
fix: crashing on pdf with empty pages
Browse files Browse the repository at this point in the history
  • Loading branch information
4gac committed Nov 24, 2024
1 parent ebcf92a commit 359712f
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 9 deletions.
Binary file added example/empty_page.pdf
Binary file not shown.
24 changes: 16 additions & 8 deletions src/lang_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@ def detect_lang_pdf_2_pdf(
words: list[str] = []
get_text(container, words)

lang = detect_lang_for_text(" ".join(words))
lang_list.append(lang)
if words:
lang = detect_lang_for_text(" ".join(words))
lang_list.append(lang)

# Count the frequency of each string
string_counts = Counter(lang_list)
Expand Down Expand Up @@ -157,8 +158,11 @@ def detect_lang_pdf_2_txt(
words: list[str] = []
get_text(container, words)

lang = detect_lang_for_text(" ".join(words))
lang_list.append(lang)
print(words)
print(i)
if words:
lang = detect_lang_for_text(" ".join(words))
lang_list.append(lang)

# Count the frequency of each string
string_counts = Counter(lang_list)
Expand All @@ -179,12 +183,13 @@ def detect_lang_txt_2_txt(input: str, output: str) -> None:
with open(input, "r", encoding="utf-8") as infile:
text = infile.read()

detected_language = detect_lang_for_text(text)
if not text:
detected_language = detect_lang_for_text(text)

print("Detected language: " + detected_language)
print("Detected language: " + detected_language)

with open(output, "w", encoding="utf-8") as outfile:
outfile.write(detected_language)
with open(output, "w", encoding="utf-8") as outfile:
outfile.write(detected_language)

except Exception as e:
print(f"An error occurred: {str(e)}", file=sys.stderr)
Expand All @@ -193,6 +198,9 @@ def detect_lang_txt_2_txt(input: str, output: str) -> None:

def detect_lang_str_2_txt(input_text: str, output: str) -> None:
try:
if not input_text:
print("Input is an empty string")
sys.exit(1)
# Detect the language of the input text
detected_language = detect_lang_for_text(input_text)

Expand Down
2 changes: 1 addition & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def main() -> None: # noqa: D103
try:
detect_lang_pdf_2_pdf(input_file, output_file, args.name, args.key)
except Exception as e:
sys.exit("Failed to run OCR: {}".format(e))
sys.exit("Failed to run language detection: {}".format(e))

elif input_file.lower().endswith(".pdf") and output_file.lower().endswith(
".txt",
Expand Down
9 changes: 9 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ else
EXIT_STATUS=1
fi

# Test #05: lang-detect must not crash on a PDF that contains an empty page.
# The test passes when the expected output text file has been produced.
info "Test #05: Run lang-detect on pdf with empty page"
# Expansions are quoted so a working directory containing spaces does not
# get word-split into several docker arguments.
docker run -v "$(pwd):/data" -w /data "$img" lang-detect -i example/empty_page.pdf -o "$tmp_dir/empty_page.txt" > /dev/null
if [ -f "$(pwd)/$tmp_dir/empty_page.txt" ]; then
    success "passed"
else
    # This test writes a .txt file, so the failure message says so.
    error "lang-detect to txt failed on example/empty_page.pdf"
    EXIT_STATUS=1
fi

popd > /dev/null

if [ $EXIT_STATUS -eq 1 ]; then
Expand Down

0 comments on commit 359712f

Please sign in to comment.