diff --git a/README.md b/README.md
index d4d2d57..fb1b693 100644
--- a/README.md
+++ b/README.md
@@ -169,6 +169,8 @@ Additionally, `HOMCC` provides sandboxed compiler execution for remote compilati
     HOMCCD_ADDRESS
     HOMCCD_LOG_LEVEL
     HOMCCD_VERBOSE
+    HOMCCD_MAX_DEPENDENCY_CACHE_SIZE
+
     [homcc]
@@ -188,6 +190,7 @@ Additionally, `HOMCC` provides sandboxed compiler execution for remote compilati
     address=0.0.0.0
     log_level=DEBUG
     verbose=True
+    max_dependency_cache_size=10G
     
     # Client configuration
@@ -207,6 +210,7 @@ Additionally, `HOMCC` provides sandboxed compiler execution for remote compilati
     IP address to listen on
     Detail level for log messages: {DEBUG, INFO, WARNING, ERROR, CRITICAL}
     Enable verbosity mode which implies detailed and colored logging
+    Maximum size of the dependency cache. The value requires either an 'M' (Mebibyte) or 'G' (Gibibyte) suffix, e.g. 100M or 10G.
     
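For reference, the `max_dependency_cache_size` option described in the hunk above can be set through any of the three configuration channels this patch wires up. A usage sketch (values are illustrative and assume the server is started via the `homccd` entry point):

    # CLI flag
    homccd --max-dependency-cache-size=10G
    # environment variable
    HOMCCD_MAX_DEPENDENCY_CACHE_SIZE=10G homccd
    # homcc.conf, [homccd] section
    max_dependency_cache_size=10G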
@@ -214,7 +218,6 @@ Additionally, `HOMCC` provides sandboxed compiler execution for remote compilati
 ## Deployment hints
 
 Things to keep in mind when deploying `homccd`:
 - `homcc` currently does not support any transport encryption such as TLS, so source files would get transmitted over the internet in plain text if not using a VPN.
-- `homccd` currently does not support cache eviction. The dependency cache is therefore growing until there is no space any more. We recommend to restart the `homccd` service every 24 hours (e.g. using a cronjob) so that the cache gets cleared regularly.
 - `homccd` does not limit simultaneous connections of a single client. A malicious client could therefore block the service by always opening up connections until no server slots are available any more.
 - `homccd` does not limit access to docker containers or chroot environments. A client can choose any docker container or chroot environment available on the server to execute the compilation in.
diff --git a/homcc/server/cache.py b/homcc/server/cache.py
index 5cc39ed..acb8308 100644
--- a/homcc/server/cache.py
+++ b/homcc/server/cache.py
@@ -4,9 +4,9 @@
 """Caching module of the homcc server."""
 
 import logging
+from collections import OrderedDict
 from pathlib import Path
 from threading import Lock
-from typing import Dict
 
 logger = logging.getLogger(__name__)
 
@@ -14,21 +14,60 @@ class Cache:
     """Represents the homcc server cache that is used to cache dependencies."""
 
-    cache: Dict[str, str]
+    cache: OrderedDict[str, str]
     """'Hash' -> 'File path' on server map for holding paths to cached files"""
     cache_mutex: Lock
    """Mutex for locking the cache."""
     cache_folder: Path
     """Path to the cache on the file system."""
+    max_size_bytes: int
+    """Maximum size of the cache in bytes."""
+    current_size_bytes: int
+    """Current size of the cache in bytes."""
+
+    def __init__(self, root_folder: Path, max_size_bytes: int):
+        if max_size_bytes <= 0:
+            raise RuntimeError("Maximum size of cache must be strictly positive.")
 
-    def __init__(self, root_folder: Path):
         self.cache_folder = self._create_cache_folder(root_folder)
-        self.cache: Dict[str, str] = {}
+        self.cache: OrderedDict[str, str] = OrderedDict()
         self.cache_mutex: Lock = Lock()
+        self.max_size_bytes = max_size_bytes
+        self.current_size_bytes = 0
+
+    def _get_cache_file_path(self, hash_value: str) -> Path:
+        return self.cache_folder / hash_value
+
+    def __contains__(self, key: str):
+        with self.cache_mutex:
+            contained: bool = key in self.cache
+            if contained:
+                self.cache.move_to_end(key)
 
-    def __contains__(self, key):
+            return contained
+
+    def __len__(self) -> int:
         with self.cache_mutex:
-            return key in self.cache
+            return len(self.cache)
+
+    def _evict_oldest(self):
+        """
+        Evicts the oldest entry from the cache.
+        Note: The caller of this method has to ensure that the cache is locked.
+        """
+        oldest_hash, oldest_path_str = self.cache.popitem(last=False)
+        oldest_path = Path(oldest_path_str)
+
+        try:
+            self.current_size_bytes -= oldest_path.stat().st_size
+            oldest_path.unlink(missing_ok=False)
+        except FileNotFoundError:
+            logger.error(
+                """Tried to evict cache entry with hash '%s', but corresponding cache file at '%s' did not exist.
+                This may lead to an invalid cache size calculation.""",
+                oldest_hash,
+                oldest_path,
+            )
 
     @staticmethod
     def _create_cache_folder(root_temp_folder: Path) -> Path:
@@ -42,12 +81,26 @@ def _create_cache_folder(root_temp_folder: Path) -> Path:
     def get(self, hash_value: str) -> str:
         """Gets an entry (path) from the cache given a hash."""
         with self.cache_mutex:
+            self.cache.move_to_end(hash_value)
             return self.cache[hash_value]
 
     def put(self, hash_value: str, content: bytearray):
         """Stores a dependency in the cache."""
-        cached_file_path = self.cache_folder / hash_value
-        Path.write_bytes(cached_file_path, content)
+        if len(content) > self.max_size_bytes:
+            logger.error(
+                """File with hash '%s' can not be added to cache as it is larger than the maximum cache size.
+                (size in bytes: %i, max. cache size in bytes: %i)""",
+                hash_value,
+                len(content),
+                self.max_size_bytes,
+            )
+            raise RuntimeError("Cache size insufficient")
 
+        cached_file_path = self._get_cache_file_path(hash_value)
         with self.cache_mutex:
+            while self.current_size_bytes + len(content) > self.max_size_bytes:
+                self._evict_oldest()
+
+            Path.write_bytes(cached_file_path, content)
+            self.current_size_bytes += len(content)
             self.cache[hash_value] = str(cached_file_path)
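The `cache.py` changes above turn the dependency cache into a size-bounded LRU store: `get` and `__contains__` move a hit to the back of the `OrderedDict`, while `put` evicts from the front until the new file fits. A minimal usage sketch of the resulting behaviour (hash names and sizes are made up for illustration; assumes `homcc` is importable):

    from pathlib import Path
    from tempfile import TemporaryDirectory

    from homcc.server.cache import Cache

    with TemporaryDirectory() as tmp_dir:
        cache = Cache(root_folder=Path(tmp_dir), max_size_bytes=8)  # cap the cache at 8 bytes

        cache.put("hash_a", bytearray(4))  # 4 of 8 bytes used
        cache.put("hash_b", bytearray(4))  # 8 of 8 bytes used, cache is full

        assert "hash_a" in cache  # the lookup marks "hash_a" as most recently used

        cache.put("hash_c", bytearray(4))  # over the limit: evicts "hash_b", the least recently used entry
        assert "hash_b" not in cache
        assert "hash_a" in cache and "hash_c" in cache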
diff --git a/homcc/server/main.py b/homcc/server/main.py
index 28fa1e0..63bd909 100755
--- a/homcc/server/main.py
+++ b/homcc/server/main.py
@@ -45,6 +45,7 @@
     ServerConfig,
     parse_cli_args,
     parse_config,
+    size_string_to_bytes,
 )
 from homcc.server.server import (  # pylint: disable=wrong-import-position
     start_server,
@@ -67,6 +68,9 @@ def main():
         level=LogLevel.INFO,
     )
 
+    # TODO(o.layer): The argument parsing code below should be moved to/abstracted in parsing.py,
+    # similar to how it is done for the client
+
     # LOG_LEVEL and VERBOSITY
     log_level: Optional[str] = homccd_args_dict["log_level"]
 
@@ -100,6 +104,10 @@ def main():
     if (address := homccd_args_dict["listen"]) is not None:
         homccd_config.address = address
 
+    # MAX_DEPENDENCY_CACHE_SIZE
+    if (max_dependency_cache_size := homccd_args_dict["max_dependency_cache_size"]) is not None:
+        homccd_config.max_dependency_cache_size_bytes = size_string_to_bytes(max_dependency_cache_size)
+
     # provide additional DEBUG information
     logger.debug(
         "%s - %s\n" "Caller:\t%s\n" "%s",  # homccd location and version; homccd caller; config info
diff --git a/homcc/server/parsing.py b/homcc/server/parsing.py
index d064e62..8ffc4e9 100644
--- a/homcc/server/parsing.py
+++ b/homcc/server/parsing.py
@@ -22,6 +22,28 @@
 
 logger = logging.getLogger(__name__)
 
+
+def mib_to_bytes(mb: int) -> int:
+    return mb * 1024**2
+
+
+def gib_to_bytes(gb: int) -> int:
+    return gb * 1024**3
+
+
+def size_string_to_bytes(size_string: str) -> int:
+    """Converts e.g. 100M or 1G to bytes. Only supports M (Mebibyte) and G (Gibibyte)"""
+    unit = size_string[-1]
+    amount = size_string[:-1]
+
+    if unit == "M":
+        return mib_to_bytes(int(amount))
+    elif unit == "G":
+        return gib_to_bytes(int(amount))
+
+    raise ArgumentTypeError(f"Invalid size string: '{size_string}'. Specify either M (Mebibyte) or G (Gibibyte).")
+
+
 HOMCC_SERVER_CONFIG_SECTION: str = "homccd"
 
 DEFAULT_ADDRESS: str = "0.0.0.0"
@@ -31,6 +53,7 @@
     or os.cpu_count()  # total number of physical CPUs on the machine
     or -1  # fallback error value
 )
+DEFAULT_MAX_CACHE_SIZE_BYTES: int = gib_to_bytes(10)
 
 
 class ShowVersion(Action):
@@ -74,6 +97,7 @@ class EnvironmentVariables:
         HOMCCD_ADDRESS_ENV_VAR: ClassVar[str] = "HOMCCD_ADDRESS"
         HOMCCD_LOG_LEVEL_ENV_VAR: ClassVar[str] = "HOMCCD_LOG_LEVEL"
         HOMCCD_VERBOSE_ENV_VAR: ClassVar[str] = "HOMCCD_VERBOSE"
+        HOMCCD_MAX_DEPENDENCY_CACHE_SIZE: ClassVar[str] = "HOMCCD_MAX_DEPENDENCY_CACHE_SIZE"
 
         @classmethod
         def __iter__(cls) -> Iterator[str]:
@@ -83,6 +107,7 @@ def __iter__(cls) -> Iterator[str]:
                 cls.HOMCCD_ADDRESS_ENV_VAR,
                 cls.HOMCCD_LOG_LEVEL_ENV_VAR,
                 cls.HOMCCD_VERBOSE_ENV_VAR,
+                cls.HOMCCD_MAX_DEPENDENCY_CACHE_SIZE,
             )
 
         @classmethod
@@ -112,12 +137,20 @@ def get_verbose(cls) -> Optional[bool]:
                 return re.match(r"^(1)|(yes)|(true)|(on)$", verbose, re.IGNORECASE) is not None
             return None
 
+        @classmethod
+        def get_max_dependency_cache_size(cls) -> Optional[int]:
+            if max_dependency_cache_size := os.getenv(cls.HOMCCD_MAX_DEPENDENCY_CACHE_SIZE):
+                return size_string_to_bytes(max_dependency_cache_size)
+
+            return None
+
     files: List[str]
     address: Optional[str]
     port: Optional[int]
     limit: Optional[int]
     log_level: Optional[LogLevel]
     verbose: bool
+    max_dependency_cache_size_bytes: Optional[int]
 
     def __init__(
         self,
@@ -128,6 +161,7 @@ def __init__(
         address: Optional[str] = None,
         log_level: Optional[str] = None,
         verbose: Optional[bool] = None,
+        max_dependency_cache_size_bytes: Optional[int] = None,
     ):
         self.files = files
 
@@ -140,6 +174,10 @@ def __init__(
         verbose = self.EnvironmentVariables.get_verbose() or verbose
         self.verbose = verbose is not None and verbose
 
+        self.max_dependency_cache_size_bytes = (
+            self.EnvironmentVariables.get_max_dependency_cache_size() or max_dependency_cache_size_bytes
+        )
+
     @classmethod
     def empty(cls):
         return cls(files=[])
@@ -151,8 +189,19 @@ def from_config_section(cls, files: List[str], homccd_config: SectionProxy) -> S
         address: Optional[str] = homccd_config.get("address")
         log_level: Optional[str] = homccd_config.get("log_level")
         verbose: Optional[bool] = homccd_config.getboolean("verbose")
-
-        return ServerConfig(files=files, limit=limit, port=port, address=address, log_level=log_level, verbose=verbose)
+        max_dependency_cache_size: Optional[str] = homccd_config.get("max_dependency_cache_size")
+
+        return ServerConfig(
+            files=files,
+            limit=limit,
+            port=port,
+            address=address,
+            log_level=log_level,
+            verbose=verbose,
+            max_dependency_cache_size_bytes=None
+            if max_dependency_cache_size is None
+            else size_string_to_bytes(max_dependency_cache_size),
+        )
 
     def __str__(self):
         return (
@@ -162,6 +211,7 @@ def __str__(self):
             f"\taddress:\t{self.address}\n"
             f"\tlog_level:\t{self.log_level}\n"
             f"\tverbose:\t{self.verbose}\n"
+            f"\tmax_dependency_cache_size_bytes:\t{self.max_dependency_cache_size_bytes}\n"
         )
 
 
@@ -181,6 +231,9 @@ def min_job_limit(value: Union[int, str], minimum: int = 0) -> int:
 
         raise ArgumentTypeError(f"LIMIT must be more than {minimum}")
 
+    def max_dependency_cache_size_bytes(value: str) -> int:
+        return size_string_to_bytes(value)
+
     general_options_group = parser.add_argument_group("Options")
     networking_group = parser.add_argument_group(" Networking")
     debug_group = parser.add_argument_group(" Debug")
@@ -206,6 +259,14 @@ def min_job_limit(value: Union[int, str], minimum: int = 0) -> int:
         action="store_true",
         help="enforce that only configurations provided via the CLI are used",
     )
+    general_options_group.add_argument(
+        "--max-dependency-cache-size",
+        required=False,
+        metavar="SIZE",
+        type=max_dependency_cache_size_bytes,
+        help=f"""The maximum cache size for the dependency cache. Expects a size string, e.g. 100M or 10G.
+        Default: {DEFAULT_MAX_CACHE_SIZE_BYTES} bytes""",
+    )
 
     # networking
     networking_group.add_argument(
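Since the helpers above use binary prefixes, the conversion is a plain multiplication; two worked examples, consistent with the tests further down (doctest-style, for illustration only):

    >>> size_string_to_bytes("100M")  # 100 * 1024**2
    104857600
    >>> size_string_to_bytes("10G")   # 10 * 1024**3, the DEFAULT_MAX_CACHE_SIZE_BYTES fallback
    10737418240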
diff --git a/homcc/server/server.py b/homcc/server/server.py
index bb5ee44..0e7bd93 100644
--- a/homcc/server/server.py
+++ b/homcc/server/server.py
@@ -41,6 +41,7 @@ from homcc.server.parsing import (
     DEFAULT_ADDRESS,
     DEFAULT_LIMIT,
+    DEFAULT_MAX_CACHE_SIZE_BYTES,
     DEFAULT_PORT,
     ServerConfig,
 )
@@ -56,7 +57,9 @@ class TCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
 
     """TCP Server instance, holding data relevant across compilations."""
 
-    def __init__(self, address: Optional[str], port: Optional[int], limit: Optional[int]):
+    def __init__(
+        self, address: Optional[str], port: Optional[int], limit: Optional[int], max_cache_size_bytes: Optional[int]
+    ):
         address = address or DEFAULT_ADDRESS
         port = port or DEFAULT_PORT
 
@@ -77,7 +80,8 @@ def __init__(self, address: Optional[str], port: Optional[int], limit: Optional[
         self.current_amount_connections: int = 0  # indicates the amount of clients that are currently connected
         self.current_amount_connections_mutex: Lock = Lock()
 
-        self.cache = Cache(Path(self.root_temp_folder.name))
+        max_cache_size_bytes = max_cache_size_bytes or DEFAULT_MAX_CACHE_SIZE_BYTES
+        self.cache = Cache(root_folder=Path(self.root_temp_folder.name), max_size_bytes=max_cache_size_bytes)
 
     @staticmethod
     def send_message(request, message: Message):
@@ -518,7 +522,7 @@ def handle(self):
 
 def start_server(config: ServerConfig) -> Tuple[TCPServer, threading.Thread]:
     try:
-        server: TCPServer = TCPServer(config.address, config.port, config.limit)
+        server: TCPServer = TCPServer(config.address, config.port, config.limit, config.max_dependency_cache_size_bytes)
     except OSError as err:
         logger.error("Could not start TCP server: %s", err)
         raise ServerInitializationError from err
diff --git a/tests/server/cache_test.py b/tests/server/cache_test.py
index 20d06eb..93c2059 100644
--- a/tests/server/cache_test.py
+++ b/tests/server/cache_test.py
@@ -5,7 +5,6 @@
 """Test module for the server cache."""
 from pathlib import Path
-from tempfile import TemporaryDirectory
 
 from homcc.server.cache import Cache
 
 
@@ -13,30 +12,100 @@ class TestCache:
     """Tests the server cache."""
 
-    def test(self):
-        with TemporaryDirectory() as tmp_dir:
-            cache_dir = Path(tmp_dir)
-            cache = Cache(cache_dir)
+    def test_simple(self, tmp_path: Path):
+        cache = Cache(tmp_path, 1000)
+        cache_dir = tmp_path / "cache"
 
-            file1 = bytearray([0x1, 0x2, 0x3, 0x9])
-            cache.put("hash1", file1)
+        file1 = bytearray([0x1, 0x2, 0x3, 0x9])
+        cache.put("hash1", file1)
 
-            assert cache.get("hash1") == str(cache_dir / "cache" / "hash1")
-            assert "hash1" in cache
-            assert Path.read_bytes(Path(cache.get("hash1"))) == file1
+        assert cache.get("hash1") == str(cache_dir / "hash1")
+        assert "hash1" in cache
+        assert Path.read_bytes(Path(cache.get("hash1"))) == file1
 
-            file2 = bytearray([0x3, 0x6, 0x3, 0x9])
-            cache.put("hash2", file2)
+        file2 = bytearray([0x3, 0x6, 0x3, 0x9])
+        cache.put("hash2", file2)
 
-            assert cache.get("hash2") == str(cache_dir / "cache" / "hash2")
-            assert "hash2" in cache
-            assert Path.read_bytes(Path(cache.get("hash2"))) == file2
+        assert cache.get("hash2") == str(cache_dir / "hash2")
+        assert "hash2" in cache
+        assert Path.read_bytes(Path(cache.get("hash2"))) == file2
 
-            file3 = bytearray([0x4, 0x2])
-            cache.put("hash3", file3)
+        file3 = bytearray([0x4, 0x2])
+        cache.put("hash3", file3)
 
-            assert cache.get("hash3") == str(cache_dir / "cache" / "hash3")
-            assert "hash3" in cache
-            assert Path.read_bytes(Path(cache.get("hash3"))) == file3
+        assert cache.get("hash3") == str(cache_dir / "hash3")
+        assert "hash3" in cache
+        assert Path.read_bytes(Path(cache.get("hash3"))) == file3
 
-            assert "other_hash" not in cache
+        assert "other_hash" not in cache
+
+    @staticmethod
+    def assert_hash_in_cache(cache: Cache, hash_value: str):
+        assert hash_value in cache
+        assert (cache.cache_folder / hash_value).exists()
+
+    @staticmethod
+    def assert_hash_not_in_cache(cache: Cache, hash_value: str):
+        assert hash_value not in cache
+        assert not (cache.cache_folder / hash_value).exists()
+
+    def test_eviction_size_limit(self, tmp_path: Path):
+        cache = Cache(tmp_path, max_size_bytes=10)
+
+        cache.put("hash1", bytearray([0x1, 0x2, 0x3, 0x9]))
+        cache.put("hash2", bytearray([0x1, 0x2, 0x3, 0xA]))
+        cache.put("hash3", bytearray([0xFF, 0xFF]))
+        assert len(cache) == 3
+        self.assert_hash_in_cache(cache, "hash1")
+        self.assert_hash_in_cache(cache, "hash2")
+        self.assert_hash_in_cache(cache, "hash3")
+
+        cache.put("hash4", bytearray([0x1]))
+        assert len(cache) == 3
+        self.assert_hash_not_in_cache(cache, "hash1")
+        self.assert_hash_in_cache(cache, "hash2")
+        self.assert_hash_in_cache(cache, "hash3")
+        self.assert_hash_in_cache(cache, "hash4")
+
+        cache.put("hash5", bytearray([0x1]))
+        assert len(cache) == 4
+        self.assert_hash_in_cache(cache, "hash2")
+        self.assert_hash_in_cache(cache, "hash3")
+        self.assert_hash_in_cache(cache, "hash4")
+        self.assert_hash_in_cache(cache, "hash5")
+
+        cache.put("hash6", bytearray([0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9]))
+        assert len(cache) == 2
+        self.assert_hash_not_in_cache(cache, "hash2")
+        self.assert_hash_not_in_cache(cache, "hash3")
+        self.assert_hash_not_in_cache(cache, "hash4")
+        self.assert_hash_in_cache(cache, "hash5")
+        self.assert_hash_in_cache(cache, "hash6")
+
+    def test_eviction_order_lru(self, tmp_path: Path):
+        cache = Cache(tmp_path, max_size_bytes=10)
+
+        cache.put("hash1", bytearray([0x1, 0x2, 0x3, 0x9]))
+        cache.put("hash2", bytearray([0x1, 0x2, 0x3, 0xA]))
+        cache.put("hash3", bytearray([0xFF, 0xFF]))
+        assert len(cache) == 3
+        self.assert_hash_in_cache(cache, "hash1")
+        self.assert_hash_in_cache(cache, "hash2")
+        self.assert_hash_in_cache(cache, "hash3")
+
+        cache.get("hash1")  # make "hash1" the latest used element
+        cache.put("hash4", bytearray([0xFF, 0xFF, 0x0, 0x0]))
+        assert len(cache) == 3
+        self.assert_hash_not_in_cache(cache, "hash2")
+        self.assert_hash_in_cache(cache, "hash1")
+        self.assert_hash_in_cache(cache, "hash3")
+        self.assert_hash_in_cache(cache, "hash4")
+
+        assert "hash3" in cache  # make "hash3" the latest used element
+        cache.put("hash5", bytearray([0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0]))
+        assert len(cache) == 2
+        self.assert_hash_in_cache(cache, "hash3")
+        self.assert_hash_in_cache(cache, "hash5")
+        self.assert_hash_not_in_cache(cache, "hash1")
+        self.assert_hash_not_in_cache(cache, "hash2")
+        self.assert_hash_not_in_cache(cache, "hash4")
diff --git a/tests/server/environment_test.py b/tests/server/environment_test.py
index a4f0b2c..6ac9c5d 100644
--- a/tests/server/environment_test.py
+++ b/tests/server/environment_test.py
@@ -135,9 +135,8 @@ def test_caching(self, mocker: MockerFixture):
         environment = create_mock_environment("", "")
         # pylint: disable=protected-access
         Cache._create_cache_folder = lambda *_: None  # type: ignore
-        cache = Cache(Path(""))
-        cache.cache = {"hash2": "some/path/to/be/linked"}
-
+        cache = Cache(Path(""), 1024)
+        cache.cache["hash2"] = "some/path/to/be/linked"
 
         needed_dependencies = environment.get_needed_dependencies(dependencies, cache)
         assert len(needed_dependencies) == 2
diff --git a/tests/server/parsing_test.py b/tests/server/parsing_test.py
index 9a1d452..5e17859 100644
--- a/tests/server/parsing_test.py
+++ b/tests/server/parsing_test.py
@@ -4,6 +4,7 @@
 """ Tests for client/compilation.py"""
 import os
+from argparse import ArgumentTypeError
 from pathlib import Path
 from typing import List
 
@@ -12,7 +13,13 @@
 from homcc import server
 from homcc.common.parsing import HOMCC_CONFIG_FILENAME
-from homcc.server.parsing import ServerConfig, parse_cli_args, parse_config
+from homcc.server.parsing import (
+    ServerConfig,
+    gib_to_bytes,
+    parse_cli_args,
+    parse_config,
+    size_string_to_bytes,
+)
 
 
 class TestParsingConfig:
@@ -28,17 +35,14 @@ class TestParsingConfig:
         "AdDrEsS=0.0.0.0",
         "LOG_LEVEL=DEBUG",
         "verbose=TRUE",
+        "max_dependency_cache_size=10G",
         # the following configs should be ignored
         "[homcc]",
         "LOG_LEVEL=INFO",
         "verbose=FALSE",
     ]
 
-    config_overwrite: List[str] = [
-        "[homccd]",
-        "LOG_LEVEL=INFO",
-        "verbose=FALSE",
-    ]
+    config_overwrite: List[str] = ["[homccd]", "LOG_LEVEL=INFO", "verbose=FALSE", "max_dependency_cache_size=1G"]
 
     def test_version(self, capfd: CaptureFixture):
         with pytest.raises(SystemExit) as sys_exit:
@@ -54,15 +58,19 @@ def test_parse_config_file(self, tmp_path: Path):
         tmp_config_file: Path = tmp_path / HOMCC_CONFIG_FILENAME
         tmp_config_file.write_text("\n".join(self.config))
 
-        assert parse_config([tmp_config_file]) == ServerConfig(
+        parsed_config = parse_config([tmp_config_file])
+        expected_config = ServerConfig(
             files=[str(tmp_config_file.absolute())],
             limit=42,
             port=3126,
             address="0.0.0.0",
+            max_dependency_cache_size_bytes=gib_to_bytes(10),
             log_level="DEBUG",
             verbose=True,
         )
 
+        assert parsed_config == expected_config
+
     def test_parse_multiple_config_files(self, tmp_path: Path):
         tmp_config_file: Path = tmp_path / HOMCC_CONFIG_FILENAME
         tmp_config_file.write_text("\n".join(self.config))
@@ -75,6 +83,19 @@ def test_parse_multiple_config_files(self, tmp_path: Path):
             limit=42,
             port=3126,
             address="0.0.0.0",
+            max_dependency_cache_size_bytes=gib_to_bytes(1),
             log_level="INFO",
             verbose=False,
         )
+
+    def test_size_string_conversions(self):
+        assert size_string_to_bytes("1M") == 1048576
+        assert size_string_to_bytes("100M") == 104857600
+        assert size_string_to_bytes("555M") == 581959680
+
+        assert size_string_to_bytes("1G") == 1073741824
+        assert size_string_to_bytes("100G") == 107374182400
+        assert size_string_to_bytes("123G") == 132070244352
+
+        with pytest.raises(ArgumentTypeError):
+            size_string_to_bytes("123")
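Taken together, the new tests cover both the byte-size conversion and the LRU eviction order; they should be runnable in isolation with the repository's usual pytest setup, e.g. `pytest tests/server/cache_test.py tests/server/parsing_test.py` (command shown as an assumption about the local setup).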