
Commit e304a07

Merge branch 'master' into dependabot/pip/pytest-dependency-0.6.0

l0b0 authored May 27, 2024
2 parents 8e1c25d + eb9ec3f commit e304a07
Showing 13 changed files with 138 additions and 61 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md

@@ -1,5 +1,19 @@
 # Changelog
 
+## [4.7.0](https://github.com/linz/topo-imagery/compare/v4.6.0...v4.7.0) (2024-05-23)
+
+
+### Features
+
+* timestamps when created TDE-1147 ([#956](https://github.com/linz/topo-imagery/issues/956)) ([2828f14](https://github.com/linz/topo-imagery/commit/2828f14bc2bfb1bc03963a31a2c2b64ba24f6f75))
+
+
+### Bug Fixes
+
+* add draft lifecycle tag TDE-1161 ([#964](https://github.com/linz/topo-imagery/issues/964)) ([64040d8](https://github.com/linz/topo-imagery/commit/64040d8a1a0a74b1a42ffd99d6379f4995573c98))
+* Make build compatible with new GDAL container TDE-1179 ([#971](https://github.com/linz/topo-imagery/issues/971)) ([719d96e](https://github.com/linz/topo-imagery/commit/719d96e1ff1133c351bbee7a1d93fa359cc702e4))
+* should use unique file names when copying files to standardise TDE-1186 ([#974](https://github.com/linz/topo-imagery/issues/974)) ([8b783dc](https://github.com/linz/topo-imagery/commit/8b783dce9870ff2bb5331552ff730218419f247c))
+
 ## [4.6.0](https://github.com/linz/topo-imagery/compare/v4.5.0...v4.6.0) (2024-05-07)
 
 
35 changes: 25 additions & 10 deletions Dockerfile

@@ -1,20 +1,33 @@
-FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4@sha256:60d3bc2f8b09ca1a7ef2db0239699b2c03713aa02be6e525e731c0020bbb10a4
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.9.0@sha256:d1a38af532e5d9e3991c4a6bddc2f2cb52644dc30a4eb8242101e8e23c3f83f6 as builder
 
 # Avoid blocking `apt-get install` commands
 ARG DEBIAN_FRONTEND=noninteractive
 
 ENV TZ=Etc/UTC
 
 RUN apt-get update
-# Install pip
-RUN apt-get install python3-pip -y
-# Install Poetry
-RUN pip install poetry
+# Install pipx and build dependencies
+RUN apt-get install --assume-yes gcc libgeos-dev pipx python3-dev
+# Install Poetry with the bundle plugin
+RUN pipx install poetry
+RUN pipx inject poetry poetry-plugin-bundle
 
 # Define the working directory for the following commands
-WORKDIR /app
+WORKDIR /src
 
 # Add Poetry config
-COPY poetry.lock pyproject.toml /app/
+COPY poetry.lock pyproject.toml /src/
 
-# Install Python dependencies
-RUN poetry config virtualenvs.create false \
- && poetry install --only main --no-interaction --no-ansi
+# Bundle production dependencies into /venv
+RUN /root/.local/bin/poetry bundle venv --no-ansi --no-interaction --only=main -vvv /venv
+
+
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.9.0@sha256:d1a38af532e5d9e3991c4a6bddc2f2cb52644dc30a4eb8242101e8e23c3f83f6
+
+ENV TZ=Etc/UTC
+
+# Copy just the bundle from the first stage
+COPY --from=builder /venv /venv
 
 # Copy Python scripts
 COPY ./scripts/ /app/scripts/
@@ -23,3 +36,5 @@ ENV PYTHONPATH="/app"
 ENV GTIFF_SRS_SOURCE="EPSG"
 
 WORKDIR /app/scripts
+
+ENTRYPOINT ["./docker-entrypoint.sh"]
68 changes: 38 additions & 30 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -25,7 +25,7 @@ ignore_missing_imports = true
 
 [tool.poetry]
 name = "topo-imagery"
-version = "4.6.0"
+version = "4.7.0"
 description = "A collection of scripts for processing imagery"
 authors = [
     "Blayne Chard <bchard@linz.govt.nz>",
7 changes: 7 additions & 0 deletions scripts/docker-entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+set -o errexit
+
+. /venv/bin/activate
+
+exec "$@"
28 changes: 18 additions & 10 deletions scripts/files/fs.py
@@ -9,6 +9,7 @@
 
 from scripts.aws.aws_helper import is_s3
 from scripts.files import fs_local, fs_s3
+from scripts.stac.util.checksum import multihash_as_hex
 
 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -93,20 +94,22 @@ def modified(path: str, s3_client: Optional[S3Client] = None) -> datetime:
     return fs_local.modified(Path(path))
 
 
-def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> List[str]:
+def write_all(
+    inputs: List[str], target: str, concurrency: Optional[int] = 4, generate_name: Optional[bool] = True
+) -> List[str]:
     """Writes list of files to target destination using multithreading.
 
     Args:
         inputs: list of files to read
         target: target folder to write to
         concurrency: max thread pool workers
+        generate_name: create a target file name based on a multihash of the source file name
 
     Returns:
         list of written file paths
     """
     written_tiffs: List[str] = []
     with ThreadPoolExecutor(max_workers=concurrency) as executor:
-        futuress = {write_file(executor, input_, target): input_ for input_ in inputs}
+        futuress = {write_file(executor, input_, target, generate_name): input_ for input_ in inputs}
         for future in as_completed(futuress):
             if future.exception():
                 get_log().warn("Failed Read-Write", error=future.exception())
@@ -121,15 +124,13 @@ def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) ->
 
 
 def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> None:
-    """Writes list of files to target destination using multithreading.
+    """Writes list of files (if found) to target destination using multithreading.
+    The copies of the files have generated file names (@see `write_file`)
 
     Args:
         inputs: list of files to read
         target: target folder to write to
         concurrency: max thread pool workers
-
-    Returns:
     """
     with ThreadPoolExecutor(max_workers=concurrency) as executor:
         results = {write_file(executor, input_, target): input_ for input_ in inputs}
@@ -141,20 +142,27 @@ def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] =
                 get_log().info("wrote_sidecar_file", path=future.result())
 
 
-def write_file(executor: ThreadPoolExecutor, input_: str, target: str) -> Future[str]:
+def write_file(executor: ThreadPoolExecutor, input_: str, target: str, generate_name: Optional[bool] = True) -> Future[str]:
     """Read a file from a path and write it to a target path.
 
     Args:
         executor: A ThreadPoolExecutor instance.
         input_: A path to a file to read.
         target: A path to write the file to.
+        generate_name: create a target file name based on a multihash of the source file name
 
     Returns:
         Future[str]: The result of the execution.
     """
     get_log().info(f"Trying write from file: {input_}")
 
+    if generate_name:
+        file_name, file_extension = os.path.splitext(input_)
+        target_file_name = f"{multihash_as_hex(str.encode(file_name))}{file_extension}"
+    else:
+        target_file_name = os.path.basename(input_)
+
     try:
-        return executor.submit(copy, input_, os.path.join(target, f"{os.path.basename(input_)}"))
+        return executor.submit(copy, input_, os.path.join(target, target_file_name))
     except NoSuchFileError as nsfe:
         future: Future[str] = Future()
         future.set_exception(nsfe)
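For orientation, a minimal usage sketch of the new `generate_name` flag, assuming only the `write_all` signature shown in this diff (paths are made up):

```python
from scripts.files.fs import write_all

# Two sources with the same base name no longer collide at the target:
# each copy gets a name derived from a multihash of its source file name.
copied = write_all(
    inputs=["s3://bucket-a/image.tiff", "s3://bucket-b/image.tiff"],
    target="/tmp/standardise",
)

# Passing generate_name=False keeps the old behaviour of reusing the
# source base name.
copied = write_all(
    inputs=["s3://bucket-a/image.tiff"],
    target="/tmp/standardise",
    generate_name=False,
)
```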
6 changes: 4 additions & 2 deletions scripts/files/fs_s3.py
@@ -9,6 +9,7 @@
 
 from scripts.aws.aws_helper import get_session, parse_path
 from scripts.logging.time_helper import time_in_ms
+from scripts.stac.util import checksum
 
 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -32,13 +33,14 @@ def write(destination: str, source: bytes, content_type: Optional[str] = None) -
     s3_path = parse_path(destination)
     key = s3_path.key
     s3 = resource("s3")
+    multihash = checksum.multihash_as_hex(source)
 
     try:
         s3_object = s3.Object(s3_path.bucket, key)
         if content_type:
-            s3_object.put(Body=source, ContentType=content_type)
+            s3_object.put(Body=source, ContentType=content_type, Metadata={"multihash": multihash})
         else:
-            s3_object.put(Body=source)
+            s3_object.put(Body=source, Metadata={"multihash": multihash})
         get_log().debug("write_s3_success", path=destination, duration=time_in_ms() - start_time)
     except ClientError as ce:
         get_log().error("write_s3_error", path=destination, error=f"Unable to write the file: {ce}")
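A hedged sketch of how a consumer could read the new object metadata back with standard boto3 (bucket and key are illustrative, not from this commit):

```python
import boto3

# S3 returns user metadata with the x-amz-meta- prefix stripped, so the
# value written above is available under the "multihash" key.
s3_client = boto3.client("s3")
head = s3_client.head_object(Bucket="example-bucket", Key="example.tiff")
print(head["Metadata"]["multihash"])  # e.g. "1220..."
```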
13 changes: 13 additions & 0 deletions scripts/files/tests/fs_s3_test.py
@@ -47,6 +47,19 @@ def test_write_content_type(subtests: SubTests) -> None:
         assert resp["ContentType"] == ContentType.GEOTIFF.value
 
 
+@mock_aws
+def test_write_multihash_as_metadata(subtests: SubTests) -> None:
+    s3 = resource("s3", region_name=DEFAULT_REGION_NAME)
+    boto3_client = client("s3", region_name=DEFAULT_REGION_NAME)
+    s3.create_bucket(Bucket="testbucket")
+
+    write("s3://testbucket/test.tiff", b"test content", ContentType.GEOTIFF.value)
+    resp = boto3_client.get_object(Bucket="testbucket", Key="test.tiff")
+
+    with subtests.test():
+        assert resp["Metadata"]["multihash"] == "12206ae8a75555209fd6c44157c0aed8016e763ff435a19cf186f76863140143ff72"
+
+
 @mock_aws
 def test_read() -> None:
     s3 = resource("s3", region_name=DEFAULT_REGION_NAME)
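The asserted value has the shape of a sha2-256 multihash: the raw digest prefixed with `0x12` (the multihash code for sha2-256) and `0x20` (the 32-byte digest length). A minimal sketch, assuming `multihash_as_hex` (in `scripts/stac/util/checksum.py`, not shown in this diff) follows that encoding:

```python
import hashlib

def sha256_multihash_as_hex(data: bytes) -> str:
    # 0x12 = multihash code for sha2-256, 0x20 = digest length (32 bytes)
    return "1220" + hashlib.sha256(data).hexdigest()

# Reproduces the value asserted in test_write_multihash_as_metadata above.
assert sha256_multihash_as_hex(b"test content") == (
    "12206ae8a75555209fd6c44157c0aed8016e763ff435a19cf186f76863140143ff72"
)
```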
16 changes: 9 additions & 7 deletions scripts/stac/imagery/collection.py
@@ -265,18 +265,20 @@ def _title(self) -> str:
             imagery_name = region
             elevation_description = None
 
-        # determine if dataset is preview
+        # determine if the dataset title requires a lifecycle tag
         if self.metadata.get("lifecycle") == "preview":
-            preview = "- Preview"
+            lifecycle_tag = "- Preview"
+        elif self.metadata.get("lifecycle") == "ongoing":
+            lifecycle_tag = "- Draft"
         else:
-            preview = None
+            lifecycle_tag = None
 
         if self.metadata["category"] == SCANNED_AERIAL_PHOTOS:
             if not historic_survey_number:
                 raise MissingMetadataError("historic_survey_number")
             return " ".join(
                 value
-                for value in [imagery_name, self.metadata["gsd"], historic_survey_number, f"({date})", preview or None]
+                for value in [imagery_name, self.metadata["gsd"], historic_survey_number, f"({date})", lifecycle_tag]
                 if value is not None
             )
 
@@ -292,7 +294,7 @@ def _title(self) -> str:
                 self.metadata["gsd"],
                 DATA_CATEGORIES[self.metadata["category"]],
                 f"({date})",
-                preview or None,
+                lifecycle_tag,
             ]
             if value is not None
         )
@@ -301,12 +303,12 @@ def _title(self) -> str:
             value
             for value in [
                 region,
-                elevation_description or None,
+                elevation_description,
                 "LiDAR",
                 self.metadata["gsd"],
                 DATA_CATEGORIES[self.metadata["category"]],
                 f"({date})",
-                preview or None,
+                lifecycle_tag,
             ]
             if value is not None
         )
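A reduced sketch of the title-assembly pattern used in `_title`: optional parts are `None` and get filtered out of the join (the values below are made up, not from this commit):

```python
lifecycle_tag = "- Draft"  # e.g. when metadata["lifecycle"] == "ongoing"
parts = ["Wellington", "0.3m", "Rural Aerial Photos", "(2024)", lifecycle_tag]
title = " ".join(value for value in parts if value is not None)
# -> "Wellington 0.3m Rural Aerial Photos (2024) - Draft"
```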