fix: crashing on pdf with empty pages

pdfix · Nov 24, 2024 · 359712f
1 parent ebcf92a
commit 359712f
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 9 deletions.
diff --git a/example/empty_page.pdf b/example/empty_page.pdf
diff --git a/src/lang_detect.py b/src/lang_detect.py
@@ -81,8 +81,9 @@ def detect_lang_pdf_2_pdf(
         words: list[str] = []
         get_text(container, words)
 
-        lang = detect_lang_for_text(" ".join(words))
-        lang_list.append(lang)
+        if words:
+            lang = detect_lang_for_text(" ".join(words))
+            lang_list.append(lang)
 
     # Count the frequency of each string
     string_counts = Counter(lang_list)
@@ -157,8 +158,11 @@ def detect_lang_pdf_2_txt(
         words: list[str] = []
         get_text(container, words)
 
-        lang = detect_lang_for_text(" ".join(words))
-        lang_list.append(lang)
+        print(words)
+        print(i)
+        if words:
+            lang = detect_lang_for_text(" ".join(words))
+            lang_list.append(lang)
 
     # Count the frequency of each string
     string_counts = Counter(lang_list)
@@ -179,12 +183,13 @@ def detect_lang_txt_2_txt(input: str, output: str) -> None:
         with open(input, "r", encoding="utf-8") as infile:
             text = infile.read()
 
-        detected_language = detect_lang_for_text(text)
+        if not text:
+            detected_language = detect_lang_for_text(text)
 
-        print("Detected language: " + detected_language)
+            print("Detected language: " + detected_language)
 
-        with open(output, "w", encoding="utf-8") as outfile:
-            outfile.write(detected_language)
+            with open(output, "w", encoding="utf-8") as outfile:
+                outfile.write(detected_language)
 
     except Exception as e:
         print(f"An error occurred: {str(e)}", file=sys.stderr)
@@ -193,6 +198,9 @@ def detect_lang_txt_2_txt(input: str, output: str) -> None:
 
 def detect_lang_str_2_txt(input_text: str, output: str) -> None:
     try:
+        if not input_text:
+            print("Input is an empty string")
+            sys.exit(1)
         # Detect the language of the input text
         detected_language = detect_lang_for_text(input_text)
 

diff --git a/src/main.py b/src/main.py
@@ -95,7 +95,7 @@ def main() -> None:  # noqa: D103
             try:
                 detect_lang_pdf_2_pdf(input_file, output_file, args.name, args.key)
             except Exception as e:
-                sys.exit("Failed to run OCR: {}".format(e))
+                sys.exit("Failed to run language detection: {}".format(e))
 
         elif input_file.lower().endswith(".pdf") and output_file.lower().endswith(
             ".txt",

diff --git a/test.sh b/test.sh
@@ -80,6 +80,15 @@ else
   EXIT_STATUS=1
 fi
 
+info "Test #05: Run lang-detect on pdf with empty page"
+docker run -v $(pwd):/data -w /data $img lang-detect -i example/empty_page.pdf -o $tmp_dir/empty_page.txt > /dev/null
+if [ -f "$(pwd)/$tmp_dir/empty_page.txt" ]; then
+  success "passed"
+else
+  error "lang-detect to pdf failed on example/empty_page.pdf"
+  EXIT_STATUS=1
+fi
+
 popd > /dev/null
 
 if [ $EXIT_STATUS -eq 1 ]; then