feat: add CLI command to run OCR on previous images

openfoodfacts · Nov 21, 2024 · 6743d77 · 6743d77
1 parent b2e55be
commit 6743d77
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 6 deletions.
diff --git a/Makefile b/Makefile
@@ -156,6 +156,10 @@ migrate-db:
 	@echo "🥫 Migrating database …"
 	${DOCKER_COMPOSE} run --rm --no-deps api python3 manage.py migrate
 
+# Run a manage.py command in a one-off `api` container,
+# e.g. `make cli args='run_ocr --override'`.
+# NOTE(review): `guard-args` is defined elsewhere; presumably it aborts when
+# `args` is not set -- confirm.
+cli: guard-args
+	${DOCKER_COMPOSE} run --rm --no-deps api python3 manage.py ${args}
+
+
 # TODO: migrate to Django
 add-db-revision: guard-message
 	${DOCKER_COMPOSE} run --rm --no-deps api alembic revision --autogenerate -m "${message}"

diff --git a/docs/maintenance.md b/docs/maintenance.md
@@ -0,0 +1,15 @@
+# Maintenance
+
+## How to launch OCR on previously uploaded images
+
+OCR (through Google Cloud Vision) is launched on every new proof image. However, if you want to launch OCR on previously uploaded images, you can do so by running the following command:
+
+```bash
+make cli args='run_ocr'
+```
+
+By default, images that already have OCR results are skipped. To run OCR again and override the existing results, add the `--override` flag:
+
+```bash
+make cli args='run_ocr --override'
+```
diff --git a/open_prices/proofs/management/commands/run_ocr.py b/open_prices/proofs/management/commands/run_ocr.py
@@ -0,0 +1,31 @@
+import argparse
+import glob
+
+import tqdm
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+from open_prices.proofs.utils import fetch_and_save_ocr_data
+
+
+class Command(BaseCommand):
+    help = "Run OCR on images with missing OCR files."
+
+    def add_arguments(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--override", action="store_true", help="Override existing OCR data."
+        )
+
+    def handle(self, *args, **options) -> None:  # type: ignore
+        self.stdout.write("Starting OCR processing...")
+        override = options["override"]
+        processed = 0
+
+        for image_path_str in tqdm.tqdm(
+            glob.iglob("**/*", root_dir=settings.IMAGES_DIR), desc="images"
+        ):
+            image_path = settings.IMAGES_DIR / image_path_str
+            result = fetch_and_save_ocr_data(image_path, override=override)
+            processed += int(result)
+
+        self.stdout.write("%d OCR saved" % processed)
diff --git a/open_prices/proofs/tests.py b/open_prices/proofs/tests.py
@@ -323,7 +323,8 @@ def test_fetch_and_save_ocr_data_success(self):
                     image_path = Path(f"{tmpdirname}/test.jpg")
                     with image_path.open("w") as f:
                         f.write("test")
-                    fetch_and_save_ocr_data(image_path)
+                    output = fetch_and_save_ocr_data(image_path)
+                    self.assertTrue(output)
                     mock_run_ocr_on_image.assert_called_once_with(
                         image_path, "test_api_key"
                     )
@@ -339,3 +340,12 @@ def test_fetch_and_save_ocr_data_success(self):
                         self.assertEqual(
                             actual_data["responses"], response_data["responses"]
                         )
+
+    def test_fetch_and_save_ocr_data_invalid_extension(self):
+        with self.settings(GOOGLE_CLOUD_VISION_API_KEY="test_api_key"):
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                image_path = Path(f"{tmpdirname}/test.bin")
+                with image_path.open("w") as f:
+                    f.write("test")
+                output = fetch_and_save_ocr_data(image_path)
+                self.assertFalse(output)
diff --git a/open_prices/proofs/utils.py b/open_prices/proofs/utils.py
@@ -178,7 +178,7 @@ def run_ocr_on_image(image_path: Path | str, api_key: str) -> dict[str, Any] | N
     return r.json()
 
 
-def fetch_and_save_ocr_data(image_path: Path | str, override: bool = False) -> None:
+def fetch_and_save_ocr_data(image_path: Path | str, override: bool = False) -> bool:
     """Run OCR on the image stored at the given path and save the result to a
     JSON file.
 
@@ -187,28 +187,35 @@ def fetch_and_save_ocr_data(image_path: Path | str, override: bool = False) -> N
 
     :param image_path: the path to the image
     :param override: whether to override existing OCR data, default to False
+    :return: True if the OCR data was saved, False otherwise
     """
     image_path = Path(image_path)
+
+    if image_path.suffix not in (".jpg", ".jpeg", ".png", ".webp"):
+        logger.debug("Skipping %s, not a supported image type", image_path)
+        return False
+
     api_key = settings.GOOGLE_CLOUD_VISION_API_KEY
 
-    if api_key is None:
+    if not api_key:
         logger.error("No Google Cloud Vision API key found")
-        return
+        return False
 
     ocr_json_path = image_path.with_suffix(".json.gz")
 
     if ocr_json_path.exists() and not override:
         logger.info("OCR data already exists for %s", image_path)
-        return
+        return False
 
     data = run_ocr_on_image(image_path, api_key)
 
     if data is None:
-        return
+        return False
 
     data["created_at"] = int(time.time())
 
     with gzip.open(ocr_json_path, "wt") as f:
         f.write(json.dumps(data))
 
     logger.debug("OCR data saved to %s", ocr_json_path)
+    return True