From 3c8f3214dcf0b432cfd8ec4cf4b81275a3e5a9a4 Mon Sep 17 00:00:00 2001
From: Kevin Meinhardt <kmeinhardt@mozilla.com>
Date: Mon, 6 Jan 2025 16:26:23 +0100
Subject: [PATCH] Add build_info.py including content hash verification for
 static build directories: - Removed the previous build information generation
 method and replaced it with a new Python script (scripts/build_info.py) that
 creates a build info file with content hashes for static assets and locales.
 - Updated Dockerfile to utilize the new build_info.py script for generating
 build information during the Docker build process. - Adjusted the static
 check in the Django app to validate the content hash of static files against
 the expected hash. - Cleaned up .dockerignore and .gitignore files by
 removing the exclusion of build*.py files. - Added unit tests for the new
 build_info.py script and its functionality.

---
 .dockerignore                       |   1 -
 .gitignore                          |   1 -
 Dockerfile                          |  65 ++++++++----
 scripts/build_info.py               |  80 ++++++++++++++
 src/olympia/core/apps.py            |  45 ++------
 src/olympia/core/tests/test_apps.py |  86 ++++-----------
 tests/make/test_build_info.py       | 158 ++++++++++++++++++++++++++++
 7 files changed, 313 insertions(+), 123 deletions(-)
 create mode 100755 scripts/build_info.py
 create mode 100644 tests/make/test_build_info.py

diff --git a/.dockerignore b/.dockerignore
index e7b5b4c3d5aa..7e1e1b89a565 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -26,7 +26,6 @@
 .tox/
 .vscode
 backups
-build*.py
 buildx-bake-metadata.json
 deps/*
 docker*.yml
diff --git a/.gitignore b/.gitignore
index 1303a8a5131d..8b069175fbda 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,6 @@
 .tox/
 .vscode
 backups
-build*.py
 buildx-bake-metadata.json
 deps/*
 docker*.yml
diff --git a/Dockerfile b/Dockerfile
index bbf5967e18d6..76bdf90a0d62 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,6 +6,12 @@ FROM python:3.12-slim-bookworm AS olympia
 
 ENV BUILD_INFO=/build-info.json
 
+# Set permissions to make the file readable by all but only writable by root
+RUN <<EOF
+echo "{}" > ${BUILD_INFO}
+chmod 644 ${BUILD_INFO}
+EOF
+
 # Set shell to bash with logs and errors for build
 SHELL ["/bin/bash", "-xue", "-c"]
 
@@ -20,29 +26,14 @@ ENV HOME=/data/olympia
 WORKDIR ${HOME}
 RUN chown -R olympia:olympia ${HOME}
 
-FROM olympia AS info
+FROM olympia AS build_info
 
-# Build args that represent static build information
-# These are passed to docker via the bake.hcl file and
-# should not be overridden in the container environment.
-ARG DOCKER_COMMIT
-ARG DOCKER_VERSION
-ARG DOCKER_BUILD
+# We only need the DOCKER_TARGET at this point
 ARG DOCKER_TARGET
-
 # Create the build file hard coding build variables to the image
-RUN <<EOF
-cat <<INNEREOF > ${BUILD_INFO}
-{
-  "commit": "${DOCKER_COMMIT}",
-  "version": "${DOCKER_VERSION}",
-  "build": "${DOCKER_BUILD}",
-  "target": "${DOCKER_TARGET}",
-  "source": "https://github.com/mozilla/addons-server"
-}
-INNEREOF
-# Set permissions to make the file readable by all but only writable by root
-chmod 644 ${BUILD_INFO}
+RUN --mount=type=bind,source=scripts/build_info.py,target=${HOME}/scripts/build_info.py \
+<<EOF
+${HOME}/scripts/build_info.py --output ${BUILD_INFO} --target ${DOCKER_TARGET}
 EOF
 
 FROM olympia AS base
@@ -147,7 +138,7 @@ EOF
 FROM base AS development
 
 # Copy build info from info
-COPY --from=info ${BUILD_INFO} ${BUILD_INFO}
+COPY --from=build_info ${BUILD_INFO} ${BUILD_INFO}
 
 FROM base AS locales
 ARG LOCALE_DIR=${HOME}/locale
@@ -178,12 +169,42 @@ COPY --chown=olympia:olympia static/ ${HOME}/static/
 RUN \
     --mount=type=bind,src=src,target=${HOME}/src \
     --mount=type=bind,src=Makefile-docker,target=${HOME}/Makefile-docker \
+    --mount=type=bind,src=scripts/build_info.py,target=${HOME}/scripts/build_info.py \
     --mount=type=bind,src=scripts/update_assets.py,target=${HOME}/scripts/update_assets.py \
     --mount=type=bind,src=manage.py,target=${HOME}/manage.py \
 <<EOF
 make -f Makefile-docker update_assets
 EOF
 
+# This stage extends from olympia so we can run as root
+# prod_info includes production build directories to hash content
+# ensuring built assets are not modified at runtime.
+FROM olympia AS prod_info
+
+# Copy compiled locales from builder
+COPY --from=locales --chown=olympia:olympia ${HOME}/locale ${HOME}/locale
+# Copy assets from assets
+COPY --from=assets --chown=olympia:olympia ${HOME}/site-static ${HOME}/site-static
+
+# Build args that represent static build information
+# These are passed to docker via the bake.hcl file and
+# should not be set in the container environment.
+ARG DOCKER_COMMIT
+ARG DOCKER_VERSION
+ARG DOCKER_BUILD
+ARG DOCKER_TARGET
+
+RUN \
+    --mount=type=bind,source=scripts/build_info.py,target=${HOME}/scripts/build_info.py \
+<<EOF
+${HOME}/scripts/build_info.py \
+  --output "${BUILD_INFO}" \
+  --commit "${DOCKER_COMMIT}" \
+  --version "${DOCKER_VERSION}" \
+  --build "${DOCKER_BUILD}" \
+  --target "${DOCKER_TARGET}"
+EOF
+
 FROM base AS production
 # Copy the rest of the source files from the host
 COPY --chown=olympia:olympia . ${HOME}
@@ -193,7 +214,7 @@ COPY --from=locales --chown=olympia:olympia ${HOME}/locale ${HOME}/locale
 COPY --from=assets --chown=olympia:olympia ${HOME}/site-static ${HOME}/site-static
 COPY --from=assets --chown=olympia:olympia ${HOME}/static-build ${HOME}/static-build
 # Copy build info from info
-COPY --from=info ${BUILD_INFO} ${BUILD_INFO}
+COPY --from=prod_info ${BUILD_INFO} ${BUILD_INFO}
 # Copy compiled locales from builder
 COPY --from=locales --chown=olympia:olympia ${HOME}/locale ${HOME}/locale
 # Copy dependencies from `pip_production`
diff --git a/scripts/build_info.py b/scripts/build_info.py
new file mode 100755
index 000000000000..100b57a8657a
--- /dev/null
+++ b/scripts/build_info.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+import argparse
+import hashlib
+import json
+from pathlib import Path
+
+
+root = Path(__file__).parent.parent
+
+
+def hash_directory(
+    directory: Path,
+    exclude_patterns: list[str] = None,
+    verbose: bool = False,
+) -> str:
+    if not directory.exists():
+        return None
+
+    hasher = hashlib.blake2b(digest_size=32)
+
+    exclude_patterns = exclude_patterns or []
+
+    files = sorted(f for f in directory.rglob('*'))
+
+    for f in files:
+        is_hidden = any(part.startswith('.') for part in f.parts)
+        is_excluded = any(f.match(pattern) for pattern in exclude_patterns)
+        if not is_hidden and not is_excluded:
+            if verbose:
+                print(f'Hashing {f}')
+            hasher.update(f.name.encode())
+            if f.is_file():
+                hasher.update(f.read_bytes())
+        elif verbose:
+            print(f'Skipping {f}')
+
+    return hasher.hexdigest()
+
+
+def build_info(
+    commit: str = None, version: str = None, build: str = None, target: str = None
+):
+    """
+    Create a build info file with the current build information from the environment.
+    """
+    return {
+        'commit': commit,
+        'version': version,
+        'build': build,
+        'target': target,
+        'source': 'https://github.com/mozilla/addons-server',
+        'content_hash': {
+            'site_static_hash': hash_directory(
+                (root / 'site-static'), exclude_patterns=['staticfiles.json']
+            ),
+            'locale_hash': hash_directory((root / 'locale')),
+        },
+    }
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output', type=str, required=False)
+
+    parser.add_argument('--commit', type=str, required=False)
+    parser.add_argument('--version', type=str, required=False)
+    parser.add_argument('--build', type=str, required=False)
+    # Docker target is used to determine build time and runtime behavior
+    parser.add_argument('--target', type=str, required=True)
+
+    args = parser.parse_args()
+
+    version = build_info(args.commit, args.version, args.build, args.target)
+
+    if args.output:
+        with open(args.output, 'w') as f:
+            json.dump(version, f)
+    else:
+        print(json.dumps(version))
diff --git a/src/olympia/core/apps.py b/src/olympia/core/apps.py
index 9d878f84781e..97198436e2bb 100644
--- a/src/olympia/core/apps.py
+++ b/src/olympia/core/apps.py
@@ -2,20 +2,18 @@
 import os
 import subprocess
 import warnings
-from io import StringIO
 from pwd import getpwnam
 
 from django.apps import AppConfig
 from django.conf import settings
 from django.core.checks import Error, Tags, register
-from django.core.management import call_command
-from django.core.management.base import CommandError
 from django.db import connection
 from django.utils.translation import gettext_lazy as _
 
 import requests
 
 from olympia.core.utils import REQUIRED_VERSION_KEYS, get_version_json
+from scripts.build_info import build_info
 
 
 log = logging.getLogger('z.startup')
@@ -85,46 +83,25 @@ def version_check(app_configs, **kwargs):
 
 @register(CustomTags.custom_setup)
 def static_check(app_configs, **kwargs):
-    errors = []
-    output = StringIO()
+    """
+    Check that the static files have not been modified since the original build.
+    """
     version = get_version_json()
 
     # We only run this check in production images.
     if version.get('target') != 'production':
         return []
 
-    try:
-        call_command('compress_assets', dry_run=True, stdout=output)
-        stripped_output = output.getvalue().strip()
-
-        if stripped_output:
-            file_paths = stripped_output.split('\n')
-            for file_path in file_paths:
-                if not os.path.exists(file_path):
-                    error = f'Compressed asset file does not exist: {file_path}'
-                    errors.append(
-                        Error(
-                            error,
-                            id='setup.E003',
-                        )
-                    )
-        else:
-            errors.append(
-                Error(
-                    'No compressed asset files were found.',
-                    id='setup.E003',
-                )
-            )
-
-    except CommandError as e:
-        errors.append(
+    site_static_hash = build_info().get('content_hash').get('site_static_hash')
+    if site_static_hash != version.get('content_hash').get('site_static_hash'):
+        return [
             Error(
-                f'Error running compress_assets command: {str(e)}',
-                id='setup.E004',
+                'Site static directory does not match expected content hash',
+                id='setup.E003',
             )
-        )
+        ]
 
-    return errors
+    return []
 
 
 @register(CustomTags.custom_setup)
diff --git a/src/olympia/core/tests/test_apps.py b/src/olympia/core/tests/test_apps.py
index 8c6823aeca15..d8368fe95d1c 100644
--- a/src/olympia/core/tests/test_apps.py
+++ b/src/olympia/core/tests/test_apps.py
@@ -2,7 +2,7 @@
 import tempfile
 from unittest import mock
 
-from django.core.management import CommandError, call_command
+from django.core.management import call_command
 from django.core.management.base import SystemCheckError
 from django.test import TestCase
 from django.test.utils import override_settings
@@ -21,6 +21,10 @@ def setUp(self):
             'version': '1.0',
             'build': 'http://example.com/build',
             'source': 'https://github.com/mozilla/addons-server',
+            'content_hash': {
+                'site_static_hash': 'abc',
+                'locale_hash': 'def',
+            },
         }
         patch = mock.patch(
             'olympia.core.apps.get_version_json',
@@ -34,12 +38,10 @@ def setUp(self):
         with open(self.fake_css_file, 'w') as f:
             f.write('body { background: red; }')
 
-        patch_command = mock.patch('olympia.core.apps.call_command')
-        self.mock_call_command = patch_command.start()
-        self.mock_call_command.side_effect = (
-            lambda command, dry_run, stdout: stdout.write(f'{self.fake_css_file}\n')
-        )
-        self.addCleanup(patch_command.stop)
+        mock_build_info = mock.patch('olympia.core.apps.build_info')
+        self.mock_build_info = mock_build_info.start()
+        self.mock_build_info.return_value = self.default_version_json
+        self.addCleanup(mock_build_info.stop)
 
         self.media_root = tempfile.mkdtemp(prefix='media-root')
 
@@ -104,64 +106,6 @@ def test_illegal_override_uid_check(self, mock_getpwnam):
         with override_settings(HOST_UID=1000):
             call_command('check')
 
-    def test_static_check_no_assets_found(self):
-        """
-        Test static_check fails if compress_assets reports no files.
-        """
-        self.mock_get_version_json.return_value['target'] = 'production'
-        # Simulate "compress_assets" returning no file paths.
-        self.mock_call_command.side_effect = (
-            lambda command, dry_run, stdout: stdout.write('')
-        )
-        with self.assertRaisesMessage(
-            SystemCheckError, 'No compressed asset files were found.'
-        ):
-            call_command('check')
-
-    @mock.patch('os.path.exists')
-    def test_static_check_missing_assets(self, mock_exists):
-        """
-        Test static_check fails if at least one specified compressed
-        asset file does not exist.
-        """
-        self.mock_get_version_json.return_value['target'] = 'production'
-        # Simulate "compress_assets" returning a couple of files.
-        self.mock_call_command.side_effect = (
-            lambda command, dry_run, stdout: stdout.write(
-                f'{self.fake_css_file}\nfoo.js\n'
-            )
-        )
-        # Pretend neither file exists on disk.
-        mock_exists.return_value = False
-
-        with self.assertRaisesMessage(
-            SystemCheckError,
-            # Only the first missing file triggers the AssertionError message check
-            'Compressed asset file does not exist: foo.js',
-        ):
-            call_command('check')
-
-    def test_static_check_command_error(self):
-        """
-        Test static_check fails if there's an error during compress_assets.
-        """
-        self.mock_get_version_json.return_value['target'] = 'production'
-        self.mock_call_command.side_effect = CommandError('Oops')
-        with self.assertRaisesMessage(
-            SystemCheckError, 'Error running compress_assets command: Oops'
-        ):
-            call_command('check')
-
-    def test_static_check_command_success(self):
-        """
-        Test static_check succeeds if compress_assets runs without errors.
-        """
-        self.mock_get_version_json.return_value['target'] = 'production'
-        self.mock_call_command.side_effect = (
-            lambda command, dry_run, stdout: stdout.write(f'{self.fake_css_file}\n')
-        )
-        call_command('check')
-
     def test_nginx_skips_check_on_production_target(self):
         fake_media_root = '/fake/not/real'
         with override_settings(MEDIA_ROOT=fake_media_root):
@@ -215,3 +159,15 @@ def test_nginx_raises_unexpected_content(self):
     def test_nginx_raises_unexpected_served_by(self):
         """Test that files are served by nginx and not redirected elsewhere."""
         self._test_nginx_response('http://nginx/user-media', served_by='wow')
+
+    def test_static_check(self):
+        self.mock_build_info.return_value = {
+            'content_hash': {
+                'site_static_hash': 'incorrect',
+            },
+        }
+        with self.assertRaisesMessage(
+            SystemCheckError,
+            'Site static directory does not match expected content hash',
+        ):
+            call_command('check')
diff --git a/tests/make/test_build_info.py b/tests/make/test_build_info.py
new file mode 100644
index 000000000000..6d2bcb43386e
--- /dev/null
+++ b/tests/make/test_build_info.py
@@ -0,0 +1,158 @@
+import json
+import subprocess
+import tempfile
+import unittest
+from pathlib import Path
+from unittest import mock
+
+from scripts.build_info import build_info, hash_directory
+
+
+class TestHashDirectory(unittest.TestCase):
+    def setUp(self):
+        self.dir = Path(tempfile.mkdtemp())
+        self.file_one = self.dir / 'file_one.txt'
+
+    def test_missing_directory(self):
+        self.assertEqual(hash_directory(Path('missing')), None)
+
+    def test_consistent_hash(self):
+        hash_directory(self.dir)
+        self.assertEqual(hash_directory(self.dir), hash_directory(self.dir))
+
+    def test_change_file_contents_updates_hash(self):
+        original_hash = hash_directory(self.dir)
+        self.file_one.write_text('file_one')
+        self.assertNotEqual(hash_directory(self.dir), original_hash)
+
+    def test_change_file_name_updates_hash(self):
+        self.file_one.write_text('initial content')
+        original_hash = hash_directory(self.dir)
+
+        self.file_one.rename(self.dir / 'file_two.txt')
+        self.assertNotEqual(hash_directory(self.dir), original_hash)
+
+    def test_hidden_files_are_not_hashed(self):
+        original_hash = hash_directory(self.dir)
+        (self.dir / '.hidden').touch()
+        self.assertEqual(hash_directory(self.dir), original_hash)
+
+    def test_excluded_files_are_not_hashed(self):
+        original_hash = hash_directory(self.dir)
+        (self.dir / 'file_one.txt').write_text('file_one')
+        self.assertEqual(
+            hash_directory(self.dir, exclude_patterns=['file_one.txt']), original_hash
+        )
+
+    def test_exclude_supports_wildcards(self):
+        original_hash = hash_directory(self.dir)
+        (self.dir / 'file_one.txt').write_text('file_one')
+        self.assertEqual(
+            hash_directory(self.dir, exclude_patterns=['*.txt']), original_hash
+        )
+        (self.dir / 'readmer.md').write_text('readme')
+        self.assertNotEqual(
+            hash_directory(self.dir, exclude_patterns=['*.txt']), original_hash
+        )
+
+    def test_subdirectories_are_hashed(self):
+        subdir = self.dir / 'subdir'
+        subdir.mkdir()
+        (subdir / 'nested_file.txt').write_text('nested content')
+        original_hash = hash_directory(self.dir)
+
+        (subdir / 'nested_file.txt').write_text('modified content')
+        self.assertNotEqual(hash_directory(self.dir), original_hash)
+
+    def test_empty_directory(self):
+        # Should return consistent hash for empty directory
+        self.assertEqual(hash_directory(self.dir), hash_directory(self.dir))
+
+    def test_renamed_directory_updates_hash(self):
+        sub_dir = self.dir / 'subdir'
+        sub_dir.mkdir()
+        original_hash = hash_directory(self.dir)
+
+        sub_dir.rename(self.dir / 'subdir_renamed')
+        self.assertNotEqual(hash_directory(self.dir), original_hash)
+
+    def test_symlinks(self):
+        # Create a file and a symlink to it
+        self.file_one.write_text('content')
+        symlink = self.dir / 'link.txt'
+        symlink.symlink_to(self.file_one)
+
+        original_hash = hash_directory(self.dir)
+        # Modify the target file
+        self.file_one.write_text('new content')
+        # Hash should change when target content changes
+        self.assertNotEqual(hash_directory(self.dir), original_hash)
+
+
+class TestBuildInfo(unittest.TestCase):
+    def _run_build_info(self, *args):
+        result = subprocess.run(
+            ['./scripts/build_info.py', *args],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+
+        # If we use an output file then there are no logs
+        if '--output' in args:
+            return result
+
+        return json.loads(result.stdout)
+
+    def test_missing_target_raises(self):
+        with self.assertRaises(subprocess.CalledProcessError):
+            self._run_build_info()
+
+    @mock.patch('scripts.build_info.hash_directory', return_value=None)
+    def test_default_build_info(self, mock_hash_directory):
+        self.assertEqual(
+            build_info(target='test'),
+            {
+                'commit': None,
+                'version': None,
+                'build': None,
+                'target': 'test',
+                'source': 'https://github.com/mozilla/addons-server',
+                'content_hash': {'site_static_hash': None, 'locale_hash': None},
+            },
+        )
+
+        site_static, locale = mock_hash_directory.call_args_list
+
+        # Check we are hashing the correct directories
+        assert site_static[0][0].name == 'site-static'
+        assert locale[0][0].name == 'locale'
+
+        # Asset we exclude staticfiles.json from site-static
+        assert 'staticfiles.json' in site_static[1]['exclude_patterns']
+
+    def test_build_info_with_args(self):
+        info = self._run_build_info(
+            '--commit',
+            '1234567890',
+            '--version',
+            '1.0.0',
+            '--build',
+            'www.build.com',
+            '--target',
+            'dev',
+        )
+
+        assert info['commit'] == '1234567890'
+        assert info['version'] == '1.0.0'
+        assert info['build'] == 'www.build.com'
+        assert info['target'] == 'dev'
+
+    def test_build_info_output(self):
+        out_file = Path(tempfile.mkdtemp()) / 'output.txt'
+
+        self._run_build_info('--target', 'test', '--output', out_file.as_posix())
+        assert out_file.exists()
+
+        json_output = json.loads(out_file.read_text())
+        assert json_output['target'] == 'test'