Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

speed up get_datasite_states #528

Merged
merged 12 commits into from
Feb 4, 2025
8 changes: 4 additions & 4 deletions .github/workflows/pr-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,13 @@ jobs:
# explicitly define which python version to use
# else we may end up picking system default which is not the same as the actions/setup-python
uv tool install tox --with tox-uv --python ${{ matrix.python-version }}
tox --version
uv run tox --version

- name: Run unit tests
env:
TOX_PYTHON: python${{ matrix.python-version }}
run: |
tox -e syft.test.unit
uv run tox -e syft.test.unit

integration:
strategy:
Expand Down Expand Up @@ -117,10 +117,10 @@ jobs:
# explicitly define which python version to use
# else we may end up picking system default which is not the same as the actions/setup-python
uv tool install tox --with tox-uv --python ${{ matrix.python-version }}
tox --version
uv run tox --version

- name: Run Integration tests
env:
TOX_PYTHON: python${{ matrix.python-version }}
run: |
tox -e syft.test.integration
uv run tox -e syft.test.integration
10 changes: 3 additions & 7 deletions syftbox/client/cli_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import json
import shutil
import time
from pathlib import Path

import httpx
Expand Down Expand Up @@ -125,6 +124,7 @@ def setup_config_interactive(
**SYFTBOX_HEADERS,
HEADER_SYFTBOX_USER: conf.email,
},
transport=httpx.HTTPTransport(retries=10),
)
if not skip_verify_install:
verify_installation(conf, login_client)
Expand Down Expand Up @@ -170,12 +170,8 @@ def prompt_email() -> str:

def verify_installation(conf: SyftClientConfig, client: httpx.Client) -> None:
try:
try:
response = client.get("/info?verify_installation=1")
except httpx.ConnectError:
# try one more time, server may be starting (dev mode)
time.sleep(2)
response = client.get("/info?verify_installation=1")
response = client.get("/info?verify_installation=1")

response.raise_for_status()

except (httpx.HTTPError, KeyError):
Expand Down
26 changes: 24 additions & 2 deletions syftbox/lib/profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import subprocess # nosec
import tempfile
import time
from typing import Iterator
from typing import Callable, Iterator


@contextlib.contextmanager
Expand All @@ -28,7 +28,7 @@ def pyspy() -> Iterator[subprocess.Popen]:
"py-spy",
"record",
"-r",
"100",
"1000",
"-o",
fname,
"--pid",
Expand All @@ -46,3 +46,25 @@ def pyspy() -> Iterator[subprocess.Popen]:
os.chmod(fname, 0o444)
except Exception as e:
print(f"Error: {e}")


class FakeThread:
"""Convenience class for profiling code that should be run in a thread.
Easy to swap with Thread when profiling is not needed and we want to run in the main thread.
"""

def __init__(self, target: Callable, args: tuple = (), daemon: bool = True) -> None:
self.target = target
self.args = args
self.daemon = daemon
self.is_alive_flag = False

def start(self) -> None:
self.is_alive_flag = True
self.target(*self.args)

def is_alive(self) -> bool:
return self.is_alive_flag

def join(self) -> None:
pass
23 changes: 9 additions & 14 deletions syftbox/server/api/v1/sync_router.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import base64
import hashlib
import sqlite3
import traceback
from collections import defaultdict
from typing import Iterator, List

import msgpack
Expand Down Expand Up @@ -69,22 +69,17 @@ def get_diff(

@router.post("/datasite_states", response_model=dict[str, list[FileMetadata]])
def get_datasite_states(
conn: sqlite3.Connection = Depends(get_db_connection),
file_store: FileStore = Depends(get_file_store),
server_settings: ServerSettings = Depends(get_server_settings),
email: str = Depends(get_current_user),
) -> dict[str, list[FileMetadata]]:
all_datasites = get_all_datasites(conn)
datasite_states: dict[str, list[FileMetadata]] = {}
for datasite in all_datasites:
try:
datasite_state = dir_state(RelativePath(datasite), file_store, server_settings, email)
except Exception as e:
logger.error(f"Failed to get dir state for {datasite}: {e} {traceback.format_exc()}")
continue
datasite_states[datasite] = datasite_state
file_metadata = file_store.list_for_user(email=email)

datasite_states = defaultdict(list)
for metadata in file_metadata:
user_email = metadata.path.parts[0]
datasite_states[user_email].append(metadata)

return datasite_states
return dict(datasite_states)


@router.post("/dir_state", response_model=list[FileMetadata])
Expand All @@ -94,7 +89,7 @@ def dir_state(
server_settings: ServerSettings = Depends(get_server_settings),
email: str = Depends(get_current_user),
) -> list[FileMetadata]:
return file_store.list_for_user(dir, email)
return file_store.list_for_user(email=email, path=dir)


@router.post("/get_metadata", response_model=FileMetadata)
Expand Down
107 changes: 62 additions & 45 deletions syftbox/server/db/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,62 +240,78 @@ def get_read_permissions_for_user(

These bits are combined with a final OR operation.
"""

cursor = connection.cursor()

params: tuple = (user, user, user)
like_clause = ""
params: list = []
path_condition = ""
if path_like:
if "%" in path_like:
raise ValueError("we don't support % in paths")
path_like = path_like + "%"
escaped_path = path_like.replace("_", "\\_")
like_clause += " WHERE path LIKE ? ESCAPE '\\' "
params = (user, user, user, escaped_path)
path_condition = "AND f.path LIKE ? ESCAPE '\\'"
params.append(escaped_path)

query = """
SELECT path, hash, signature, file_size, last_modified,
(
SELECT COALESCE(
max(
CASE
WHEN can_read AND NOT disallow THEN rule_prio
ELSE 0
END
) >
max(
CASE
WHEN can_read AND disallow THEN rule_prio
ELSE 0
END
), 0)
or
-- First get all rules that apply to this user, including wildcards and email matches
WITH
user_matching_rules AS (
SELECT r.*, rf.file_id, rf.match_for_email
FROM rules r
JOIN rule_files rf
ON r.permfile_path = rf.permfile_path
AND r.priority = rf.priority
WHERE r.user = ? -- Direct user match
OR r.user = '*' -- Wildcard match
OR rf.match_for_email = ? -- Email pattern match
),

-- Then calculate effective permissions by taking the highest priority rule
-- Higher depth * 1000 + priority means more specific rules take precedence
-- Caveat: using 1000 means we can't have more than 1000 rules in the same permission file
permission_priorities AS (
SELECT
file_id,
MAX(CASE WHEN can_read AND NOT disallow THEN permfile_depth * 1000 + priority ELSE 0 END) as read_allow_prio,
MAX(CASE WHEN can_read AND disallow THEN permfile_depth * 1000 + priority ELSE 0 END) as read_deny_prio,
MAX(CASE WHEN admin AND NOT disallow THEN permfile_depth * 1000 + priority ELSE 0 END) as admin_allow_prio,
MAX(CASE WHEN admin AND disallow THEN permfile_depth * 1000 + priority ELSE 0 END) as admin_deny_prio
FROM user_matching_rules
GROUP BY file_id
),

final_permissions AS (
SELECT
file_id,
(read_allow_prio > read_deny_prio) as can_read,
(admin_allow_prio > admin_deny_prio) as is_admin
FROM permission_priorities
)

-- User has access if any of the following are true:
-- 1. They have an allowing rule that overrides any denying rules `can_read`
-- 2. They have admin access that overrides admin denials `is_admin`
-- 3. They own the datasite `f.datasite = user`
SELECT
f.path,
f.hash,
f.signature,
f.file_size,
f.last_modified,
COALESCE(
max(
CASE
WHEN admin AND NOT disallow THEN rule_prio
ELSE 0
END
) >
max(
CASE
WHEN admin AND disallow THEN rule_prio
ELSE 0
END
), 0)
FROM (
SELECT can_read, admin, disallow,
row_number() OVER (ORDER BY rules.permfile_depth, rules.priority ASC) AS rule_prio
FROM rule_files
JOIN rules ON rule_files.permfile_path = rules.permfile_path and rule_files.priority = rules.priority
WHERE rule_files.file_id = f.id and (rules.user = ? or rules.user = "*" or rule_files.match_for_email = ?)
)
) OR datasite = ? AS read_permission
can_read OR is_admin,
FALSE
) OR f.datasite = ? AS read_permission
FROM file_metadata f
{}
""".format(like_clause)
res = cursor.execute(query, params)
LEFT JOIN final_permissions fp ON f.id = fp.file_id
WHERE 1=1 {path_condition}
""".format(path_condition=path_condition)

# Add parameters in order: 2 user checks + 1 datasite check + optional path
query_params = [user, user, user] + params

return res.fetchall()
return cursor.execute(query, query_params).fetchall()


def print_table(connection: sqlite3.Connection, table: str) -> None:
Expand All @@ -312,6 +328,7 @@ def print_table(connection: sqlite3.Connection, table: str) -> None:
def get_filemetadata_with_read_access(
connection: sqlite3.Connection, user: str, path: Optional[RelativePath] = None
) -> list[FileMetadata]:
rows = get_read_permissions_for_user(connection, user, str(path))
string_path = str(path) if path else None
rows = get_read_permissions_for_user(connection, user, string_path)
res = [FileMetadata.from_row(row) for row in rows if row["read_permission"]]
return res
7 changes: 6 additions & 1 deletion syftbox/server/db/file_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ def put(
conn.commit()
cursor.close()

def list_for_user(self, path: RelativePath, email: str) -> list[FileMetadata]:
def list_for_user(
self,
*,
email: str,
path: Optional[RelativePath] = None,
) -> list[FileMetadata]:
with get_db(self.db_path) as conn:
return db.get_filemetadata_with_read_access(conn, email, path)
2 changes: 2 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ requires =
[testenv]
runner = uv-venv-lock-runner
with_dev = True
allowlist_externals =
pytest
commands =
python --version
setenv =
Expand Down