
Commit e304a07

Merge branch 'master' into dependabot/pip/pytest-dependency-0.6.0

l0b0 authored May 27, 2024
2 parents 8e1c25d + eb9ec3f commit e304a07
Showing 13 changed files with 138 additions and 61 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md

@@ -1,5 +1,19 @@
 # Changelog
 
+## [4.7.0](https://github.com/linz/topo-imagery/compare/v4.6.0...v4.7.0) (2024-05-23)
+
+
+### Features
+
+* timestamps when created TDE-1147 ([#956](https://github.com/linz/topo-imagery/issues/956)) ([2828f14](https://github.com/linz/topo-imagery/commit/2828f14bc2bfb1bc03963a31a2c2b64ba24f6f75))
+
+
+### Bug Fixes
+
+* add draft lifecycle tag TDE-1161 ([#964](https://github.com/linz/topo-imagery/issues/964)) ([64040d8](https://github.com/linz/topo-imagery/commit/64040d8a1a0a74b1a42ffd99d6379f4995573c98))
+* Make build compatible with new GDAL container TDE-1179 ([#971](https://github.com/linz/topo-imagery/issues/971)) ([719d96e](https://github.com/linz/topo-imagery/commit/719d96e1ff1133c351bbee7a1d93fa359cc702e4))
+* should use unique file names when copying files to standardise TDE-1186 ([#974](https://github.com/linz/topo-imagery/issues/974)) ([8b783dc](https://github.com/linz/topo-imagery/commit/8b783dce9870ff2bb5331552ff730218419f247c))
+
 ## [4.6.0](https://github.com/linz/topo-imagery/compare/v4.5.0...v4.6.0) (2024-05-07)
 
 
35 changes: 25 additions & 10 deletions Dockerfile

@@ -1,20 +1,33 @@
-FROM ghcr.io/osgeo/gdal:ubuntu-small-3.8.4@sha256:60d3bc2f8b09ca1a7ef2db0239699b2c03713aa02be6e525e731c0020bbb10a4
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.9.0@sha256:d1a38af532e5d9e3991c4a6bddc2f2cb52644dc30a4eb8242101e8e23c3f83f6 as builder
 
 # Avoid blocking `apt-get install` commands
 ARG DEBIAN_FRONTEND=noninteractive
 
 ENV TZ=Etc/UTC
 
 RUN apt-get update
-# Install pip
-RUN apt-get install python3-pip -y
-# Install Poetry
-RUN pip install poetry
+# Install pipx and build dependencies
+RUN apt-get install --assume-yes gcc libgeos-dev pipx python3-dev
+# Install Poetry with the bundle plugin
+RUN pipx install poetry
+RUN pipx inject poetry poetry-plugin-bundle
 
 # Define the working directory for the following commands
-WORKDIR /app
+WORKDIR /src
 
 # Add Poetry config
-COPY poetry.lock pyproject.toml /app/
+COPY poetry.lock pyproject.toml /src/
 
-# Install Python dependencies
-RUN poetry config virtualenvs.create false \
- && poetry install --only main --no-interaction --no-ansi
+# Bundle production dependencies into /venv
+RUN /root/.local/bin/poetry bundle venv --no-ansi --no-interaction --only=main -vvv /venv
+
+
+FROM ghcr.io/osgeo/gdal:ubuntu-small-3.9.0@sha256:d1a38af532e5d9e3991c4a6bddc2f2cb52644dc30a4eb8242101e8e23c3f83f6
+
+ENV TZ=Etc/UTC
+
+# Copy just the bundle from the first stage
+COPY --from=builder /venv /venv
 
 # Copy Python scripts
 COPY ./scripts/ /app/scripts/
@@ -23,3 +36,5 @@ ENV PYTHONPATH="/app"
 ENV GTIFF_SRS_SOURCE="EPSG"
 
 WORKDIR /app/scripts
+
+ENTRYPOINT ["./docker-entrypoint.sh"]
68 changes: 38 additions & 30 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -25,7 +25,7 @@ ignore_missing_imports = true
 
 [tool.poetry]
 name = "topo-imagery"
-version = "4.6.0"
+version = "4.7.0"
 description = "A collection of scripts for processing imagery"
 authors = [
     "Blayne Chard <bchard@linz.govt.nz>",
7 changes: 7 additions & 0 deletions scripts/docker-entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+set -o errexit
+
+. /venv/bin/activate
+
+exec "$@"
28 changes: 18 additions & 10 deletions scripts/files/fs.py
@@ -9,6 +9,7 @@
 
 from scripts.aws.aws_helper import is_s3
 from scripts.files import fs_local, fs_s3
+from scripts.stac.util.checksum import multihash_as_hex
 
 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -93,20 +94,22 @@ def modified(path: str, s3_client: Optional[S3Client] = None) -> datetime:
     return fs_local.modified(Path(path))
 
 
-def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> List[str]:
+def write_all(
+    inputs: List[str], target: str, concurrency: Optional[int] = 4, generate_name: Optional[bool] = True
+) -> List[str]:
     """Writes list of files to target destination using multithreading.
 
     Args:
         inputs: list of files to read
         target: target folder to write to
         concurrency: max thread pool workers
+        generate_name: create a target file name based on a multihash of the source file name
 
     Returns:
         list of written file paths
     """
     written_tiffs: List[str] = []
     with ThreadPoolExecutor(max_workers=concurrency) as executor:
-        futuress = {write_file(executor, input_, target): input_ for input_ in inputs}
+        futuress = {write_file(executor, input_, target, generate_name): input_ for input_ in inputs}
         for future in as_completed(futuress):
             if future.exception():
                 get_log().warn("Failed Read-Write", error=future.exception())
@@ -121,15 +124,13 @@ def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) ->
 
 
 def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> None:
-    """Writes list of files to target destination using multithreading.
+    """Writes list of files (if found) to target destination using multithreading.
+    The copies of the files have generated file names (@see `write_file`)
 
     Args:
         inputs: list of files to read
         target: target folder to write to
         concurrency: max thread pool workers
-
-    Returns:
     """
     with ThreadPoolExecutor(max_workers=concurrency) as executor:
         results = {write_file(executor, input_, target): input_ for input_ in inputs}
@@ -141,20 +142,27 @@ def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] =
                 get_log().info("wrote_sidecar_file", path=future.result())
 
 
-def write_file(executor: ThreadPoolExecutor, input_: str, target: str) -> Future[str]:
+def write_file(executor: ThreadPoolExecutor, input_: str, target: str, generate_name: Optional[bool] = True) -> Future[str]:
     """Read a file from a path and write it to a target path.
 
     Args:
         executor: A ThreadPoolExecutor instance.
         input_: A path to a file to read.
         target: A path to write the file to.
+        generate_name: create a target file name based on a multihash of the source file name
 
     Returns:
         Future[str]: The result of the execution.
     """
     get_log().info(f"Trying write from file: {input_}")
 
+    if generate_name:
+        file_name, file_extension = os.path.splitext(input_)
+        target_file_name = f"{multihash_as_hex(str.encode(file_name))}{file_extension}"
+    else:
+        target_file_name = os.path.basename(input_)
+
     try:
-        return executor.submit(copy, input_, os.path.join(target, f"{os.path.basename(input_)}"))
+        return executor.submit(copy, input_, os.path.join(target, target_file_name))
     except NoSuchFileError as nsfe:
         future: Future[str] = Future()
         future.set_exception(nsfe)
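For orientation, a minimal usage sketch of the new `generate_name` flag, assuming only the `write_all` signature shown in this diff (paths are made up):

```python
from scripts.files.fs import write_all

# Two sources with the same base name no longer collide at the target:
# each copy gets a name derived from a multihash of its source file name.
copied = write_all(
    inputs=["s3://bucket-a/image.tiff", "s3://bucket-b/image.tiff"],
    target="/tmp/standardise",
)

# Passing generate_name=False keeps the old behaviour of reusing the
# source base name.
copied = write_all(
    inputs=["s3://bucket-a/image.tiff"],
    target="/tmp/standardise",
    generate_name=False,
)
```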
6 changes: 4 additions & 2 deletions scripts/files/fs_s3.py
@@ -9,6 +9,7 @@
 
 from scripts.aws.aws_helper import get_session, parse_path
 from scripts.logging.time_helper import time_in_ms
+from scripts.stac.util import checksum
 
 if TYPE_CHECKING:
     from mypy_boto3_s3 import S3Client
@@ -32,13 +33,14 @@ def write(destination: str, source: bytes, content_type: Optional[str] = None) -
     s3_path = parse_path(destination)
     key = s3_path.key
     s3 = resource("s3")
+    multihash = checksum.multihash_as_hex(source)
 
     try:
         s3_object = s3.Object(s3_path.bucket, key)
         if content_type:
-            s3_object.put(Body=source, ContentType=content_type)
+            s3_object.put(Body=source, ContentType=content_type, Metadata={"multihash": multihash})
         else:
-            s3_object.put(Body=source)
+            s3_object.put(Body=source, Metadata={"multihash": multihash})
         get_log().debug("write_s3_success", path=destination, duration=time_in_ms() - start_time)
     except ClientError as ce:
         get_log().error("write_s3_error", path=destination, error=f"Unable to write the file: {ce}")
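A hedged sketch of how a consumer could read the new object metadata back with standard boto3 (bucket and key are illustrative, not from this commit):

```python
import boto3

# S3 returns user metadata with the x-amz-meta- prefix stripped, so the
# value written above is available under the "multihash" key.
s3_client = boto3.client("s3")
head = s3_client.head_object(Bucket="example-bucket", Key="example.tiff")
print(head["Metadata"]["multihash"])  # e.g. "1220..."
```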
13 changes: 13 additions & 0 deletions scripts/files/tests/fs_s3_test.py
@@ -47,6 +47,19 @@ def test_write_content_type(subtests: SubTests) -> None:
         assert resp["ContentType"] == ContentType.GEOTIFF.value
 
 
+@mock_aws
+def test_write_multihash_as_metadata(subtests: SubTests) -> None:
+    s3 = resource("s3", region_name=DEFAULT_REGION_NAME)
+    boto3_client = client("s3", region_name=DEFAULT_REGION_NAME)
+    s3.create_bucket(Bucket="testbucket")
+
+    write("s3://testbucket/test.tiff", b"test content", ContentType.GEOTIFF.value)
+    resp = boto3_client.get_object(Bucket="testbucket", Key="test.tiff")
+
+    with subtests.test():
+        assert resp["Metadata"]["multihash"] == "12206ae8a75555209fd6c44157c0aed8016e763ff435a19cf186f76863140143ff72"
+
+
 @mock_aws
 def test_read() -> None:
     s3 = resource("s3", region_name=DEFAULT_REGION_NAME)
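The asserted value has the shape of a sha2-256 multihash: the raw digest prefixed with `0x12` (the multihash code for sha2-256) and `0x20` (the 32-byte digest length). A minimal sketch, assuming `multihash_as_hex` (in `scripts/stac/util/checksum.py`, not shown in this diff) follows that encoding:

```python
import hashlib

def sha256_multihash_as_hex(data: bytes) -> str:
    # 0x12 = multihash code for sha2-256, 0x20 = digest length (32 bytes)
    return "1220" + hashlib.sha256(data).hexdigest()

# Reproduces the value asserted in test_write_multihash_as_metadata above.
assert sha256_multihash_as_hex(b"test content") == (
    "12206ae8a75555209fd6c44157c0aed8016e763ff435a19cf186f76863140143ff72"
)
```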
16 changes: 9 additions & 7 deletions scripts/stac/imagery/collection.py
@@ -265,18 +265,20 @@ def _title(self) -> str:
             imagery_name = region
             elevation_description = None
 
-        # determine if dataset is preview
+        # determine if the dataset title requires a lifecycle tag
         if self.metadata.get("lifecycle") == "preview":
-            preview = "- Preview"
+            lifecycle_tag = "- Preview"
+        elif self.metadata.get("lifecycle") == "ongoing":
+            lifecycle_tag = "- Draft"
         else:
-            preview = None
+            lifecycle_tag = None
 
         if self.metadata["category"] == SCANNED_AERIAL_PHOTOS:
             if not historic_survey_number:
                 raise MissingMetadataError("historic_survey_number")
             return " ".join(
                 value
-                for value in [imagery_name, self.metadata["gsd"], historic_survey_number, f"({date})", preview or None]
+                for value in [imagery_name, self.metadata["gsd"], historic_survey_number, f"({date})", lifecycle_tag]
                 if value is not None
             )
 
@@ -292,7 +294,7 @@ def _title(self) -> str:
                 self.metadata["gsd"],
                 DATA_CATEGORIES[self.metadata["category"]],
                 f"({date})",
-                preview or None,
+                lifecycle_tag,
             ]
             if value is not None
         )
@@ -301,12 +303,12 @@ def _title(self) -> str:
             value
             for value in [
                 region,
-                elevation_description or None,
+                elevation_description,
                 "LiDAR",
                 self.metadata["gsd"],
                 DATA_CATEGORIES[self.metadata["category"]],
                 f"({date})",
-                preview or None,
+                lifecycle_tag,
             ]
             if value is not None
         )
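A reduced sketch of the title-assembly pattern used in `_title`: optional parts are `None` and get filtered out of the join (the values below are made up, not from this commit):

```python
lifecycle_tag = "- Draft"  # e.g. when metadata["lifecycle"] == "ongoing"
parts = ["Wellington", "0.3m", "Rural Aerial Photos", "(2024)", lifecycle_tag]
title = " ".join(value for value in parts if value is not None)
# -> "Wellington 0.3m Rural Aerial Photos (2024) - Draft"
```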