[Serve] Deflake test_metrics #47750

Draft
wants to merge 36 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
a0d6bd0  right size tests (GeneDer, Sep 19, 2024)
20bb4b5  trigger another build (GeneDer, Sep 20, 2024)
e688c20  factor out test_metrics on it's own and use large sized test (GeneDer, Sep 20, 2024)
ecbc2f6  fix (GeneDer, Sep 20, 2024)
320d1ba  fix tag (GeneDer, Sep 20, 2024)
1870bdc  Merge branch 'master' into deflak-test-metrics (GeneDer, Sep 20, 2024)
70fbb9d  fix kwargs (GeneDer, Sep 20, 2024)
9165510  try again (GeneDer, Sep 20, 2024)
d15d0d0  test again (GeneDer, Sep 20, 2024)
2ffefce  test again (GeneDer, Sep 20, 2024)
8e22021  test again (GeneDer, Sep 20, 2024)
efceb03  revert change and add logics to clean up metrics between tests (GeneDer, Sep 24, 2024)
eb15873  lint (GeneDer, Sep 24, 2024)
9502dd4  check health for prometheus before cleanup (GeneDer, Sep 25, 2024)
71d1336  refactor clean up metrics as a fixture (GeneDer, Sep 25, 2024)
dab214c  test again (GeneDer, Sep 25, 2024)
22f9145  test again (GeneDer, Sep 25, 2024)
46ce3a0  test again (GeneDer, Sep 25, 2024)
3e63207  test again (GeneDer, Sep 25, 2024)
0a5cbf0  clean up serve and ray before and after the tests (GeneDer, Sep 28, 2024)
04ee77c  try again (GeneDer, Sep 28, 2024)
869594b  try again (GeneDer, Sep 28, 2024)
0838d2a  Merge branch 'master' into deflak-test-metrics (GeneDer, Sep 30, 2024)
25cc12a  Merge branch 'master' into deflak-test-metrics (GeneDer, Sep 30, 2024)
ff1a839  Merge branch 'master' into deflak-test-metrics (GeneDer, Oct 1, 2024)
1482c03  try again (GeneDer, Oct 1, 2024)
e9a88c6  try again (GeneDer, Oct 2, 2024)
747d479  only decrement num_scheduling_tasks_in_backoff if it's greater than 0 (GeneDer, Oct 2, 2024)
79fce0e  try again (GeneDer, Oct 2, 2024)
d924b7e  try again (GeneDer, Oct 3, 2024)
650452c  try again (GeneDer, Oct 3, 2024)
e0aa69c  wait for proxies to be healthy before starting any tests (GeneDer, Oct 3, 2024)
64d88a7  try again (GeneDer, Oct 3, 2024)
853662d  Merge branch 'master' into deflak-test-metrics (GeneDer, Oct 4, 2024)
709a0a9  try again (GeneDer, Oct 4, 2024)
00a45bf  try again (GeneDer, Oct 4, 2024)
1 change: 0 additions & 1 deletion python/ray/serve/tests/BUILD
@@ -443,4 +443,3 @@ py_test_module_list(
"//python/ray/serve:serve_lib",
],
)

67 changes: 47 additions & 20 deletions python/ray/serve/tests/test_metrics.py
@@ -32,6 +32,21 @@
TEST_METRICS_EXPORT_PORT = 9999


@pytest.fixture
def clean_up_metrics():
prometheus_base_url = f"http://127.0.0.1:{TEST_METRICS_EXPORT_PORT}"
delete_all_series_url = (
f"{prometheus_base_url}/api/v1/admin/tsdb"
'/delete_series?match[]={__name__=~".*"}'
)
clean_tombstones_url = f"{prometheus_base_url}/api/v1/admin/tsdb/clean_tombstones"
requests.post(delete_all_series_url)
requests.post(clean_tombstones_url)
yield
requests.post(delete_all_series_url)
requests.post(clean_tombstones_url)


@pytest.fixture
def serve_start_shutdown():
"""Fixture provides a fresh Ray cluster to prevent metrics state sharing."""
Contributor:
Hm, shouldn't this be sufficient on its own? The Prometheus endpoint is the raylet, so if Ray is shut down between runs there should be no state sharing.

What am I missing?

Contributor Author (GeneDer):
My expectation is that something isn't being cleaned up between those tests, and in fact adding those calls seems to have helped. Thinking it through again, maybe just adding some sleep in between would help in the same way, and maybe the issue is that Serve and/or Ray wasn't completely shut down before the next test starts? 🤔 Let me do some more experiments.

Contributor:
We should not add any sleeps -- if we need to wait for anything to clean up, then explicitly wait for the cleanup to happen.

Sleeps are what make things flaky in the first place.
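
A minimal sketch of that "explicitly wait for the cleanup" idea, not part of this PR's diff: it reuses the wait_for_condition helper and the retry_interval_ms/timeout arguments already used elsewhere in test_metrics.py, plus the Prometheus server the clean_up_metrics fixture talks to. The import path and the serve_ metric-name prefix are assumptions for illustration.

```python
# Illustrative only: poll Prometheus until no leftover Serve series remain,
# instead of sleeping for a fixed amount of time.
import requests
from ray._private.test_utils import wait_for_condition  # assumed import path

TEST_METRICS_EXPORT_PORT = 9999  # same port the clean_up_metrics fixture uses


def no_serve_series_left() -> bool:
    # Ask Prometheus for any remaining series whose name starts with "serve_"
    # (the prefix is an assumption for this sketch).
    resp = requests.get(
        f"http://127.0.0.1:{TEST_METRICS_EXPORT_PORT}/api/v1/series",
        params={"match[]": '{__name__=~"serve_.*"}'},
        timeout=1,
    )
    resp.raise_for_status()
    # Prometheus returns matching series under "data"; empty means cleanup took effect.
    return len(resp.json().get("data", [])) == 0


# Block until the delete_series/clean_tombstones calls have actually taken effect.
wait_for_condition(no_serve_series_left, timeout=10, retry_interval_ms=500)
```

Note that Prometheus only exposes the delete_series and clean_tombstones admin endpoints used by the fixture when it is started with --web.enable-admin-api.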

@@ -193,7 +208,9 @@ def metric_available() -> bool:
return metric_dicts


def test_serve_metrics_for_successful_connection(serve_start_shutdown):
def test_serve_metrics_for_successful_connection(
serve_start_shutdown, clean_up_metrics
):
@serve.deployment(name="metrics")
async def f(request):
return "hello"
@@ -258,7 +275,7 @@ def verify_metrics(do_assert=False):
verify_metrics(do_assert=True)


def test_http_replica_gauge_metrics(serve_start_shutdown):
def test_http_replica_gauge_metrics(serve_start_shutdown, clean_up_metrics):
"""Test http replica gauge metrics"""
signal = SignalActor.remote()

@@ -291,7 +308,7 @@ def ensure_request_processing():
wait_for_condition(ensure_request_processing, timeout=5)


def test_proxy_metrics_not_found(serve_start_shutdown):
def test_proxy_metrics_not_found(serve_start_shutdown, clean_up_metrics):
# NOTE: These metrics should be documented at
# https://docs.ray.io/en/latest/serve/monitoring.html#metrics
# Any updates here should be reflected there too.
@@ -333,7 +350,7 @@ def verify_metrics(_expected_metrics, do_assert=False):
verify_metrics,
retry_interval_ms=1000,
timeout=10,
expected_metrics=expected_metrics,
_expected_metrics=expected_metrics,
)
except RuntimeError:
verify_metrics(expected_metrics, True)
@@ -383,7 +400,7 @@ def verify_error_count(do_assert=False):
verify_error_count(do_assert=True)


def test_proxy_metrics_internal_error(serve_start_shutdown):
def test_proxy_metrics_internal_error(serve_start_shutdown, clean_up_metrics):
# NOTE: These metrics should be documented at
# https://docs.ray.io/en/latest/serve/monitoring.html#metrics
# Any updates here should be reflected there too.
@@ -434,7 +451,7 @@ async def __call__(self, *args):
verify_metrics,
retry_interval_ms=1000,
timeout=10,
expected_metrics=expected_metrics,
_expected_metrics=expected_metrics,
)
except RuntimeError:
verify_metrics(expected_metrics, True)
@@ -478,7 +495,7 @@ def verify_error_count(do_assert=False):
verify_error_count(do_assert=True)


def test_proxy_metrics_fields_not_found(serve_start_shutdown):
def test_proxy_metrics_fields_not_found(serve_start_shutdown, clean_up_metrics):
"""Tests the proxy metrics' fields' behavior for not found."""

# Should generate 404 responses
@@ -522,7 +539,7 @@ def test_proxy_metrics_fields_not_found(serve_start_shutdown):
print("serve_num_grpc_error_requests working as expected.")


def test_proxy_metrics_fields_internal_error(serve_start_shutdown):
def test_proxy_metrics_fields_internal_error(serve_start_shutdown, clean_up_metrics):
"""Tests the proxy metrics' fields' behavior for internal error."""

@serve.deployment()
@@ -583,7 +600,7 @@ def f(*args):
print("serve_grpc_request_latency_ms_sum working as expected.")


def test_replica_metrics_fields(serve_start_shutdown):
def test_replica_metrics_fields(serve_start_shutdown, clean_up_metrics):
"""Test replica metrics fields"""

@serve.deployment
@@ -722,7 +739,7 @@ def verify_metrics(self, metric, expected_output):
for key in expected_output:
assert metric[key] == expected_output[key]

def test_request_context_pass_for_http_proxy(self, serve_start_shutdown):
def test_request_context_pass_for_http_proxy(
self, serve_start_shutdown, clean_up_metrics
):
"""Test HTTP proxy passing request context"""

@serve.deployment(graceful_shutdown_timeout_s=0.001)
@@ -816,7 +835,9 @@ def check():
assert metrics_app_name["g"] == "app2", msg
assert metrics_app_name["h"] == "app3", msg

def test_request_context_pass_for_grpc_proxy(self, serve_start_shutdown):
def test_request_context_pass_for_grpc_proxy(
self, serve_start_shutdown, clean_up_metrics
):
"""Test gRPC proxy passing request context"""

@serve.deployment(graceful_shutdown_timeout_s=0.001)
@@ -970,7 +991,9 @@ async def app2(self):
assert requests_metrics_app_name["g1"] == "app"
assert requests_metrics_app_name["g2"] == "app"

def test_customer_metrics_with_context(self, serve_start_shutdown):
def test_customer_metrics_with_context(
self, serve_start_shutdown, clean_up_metrics
):
@serve.deployment
class Model:
def __init__(self):
@@ -1062,7 +1085,9 @@ def __call__(self):
self.verify_metrics(histogram_metrics[0], expected_metrics)

@pytest.mark.parametrize("use_actor", [False, True])
def test_serve_metrics_outside_serve(self, use_actor, serve_start_shutdown):
def test_serve_metrics_outside_serve(
self, use_actor, serve_start_shutdown, clean_up_metrics
):
"""Make sure ray.serve.metrics work in ray actor"""
if use_actor:

@@ -1186,7 +1211,7 @@ async def __call__(self):
self.verify_metrics(histogram_metrics[0], expected_metrics)


def test_multiplexed_metrics(serve_start_shutdown):
def test_multiplexed_metrics(serve_start_shutdown, clean_up_metrics):
"""Tests multiplexed API corresponding metrics."""

@serve.deployment
@@ -1261,7 +1286,7 @@ async def call(self, *args):


class TestHandleMetrics:
def test_queued_queries_basic(self, serve_start_shutdown):
def test_queued_queries_basic(self, serve_start_shutdown, clean_up_metrics):
signal = SignalActor.options(name="signal123").remote()
serve.run(WaitForSignal.options(max_ongoing_requests=1).bind(), name="app1")

@@ -1290,7 +1315,9 @@ def test_queued_queries_basic(self, serve_start_shutdown):
expected=0,
)

def test_queued_queries_multiple_handles(self, serve_start_shutdown):
def test_queued_queries_multiple_handles(
self, serve_start_shutdown, clean_up_metrics
):
signal = SignalActor.options(name="signal123").remote()
serve.run(WaitForSignal.options(max_ongoing_requests=1).bind(), name="app1")

@@ -1330,7 +1357,7 @@ def test_queued_queries_multiple_handles(self, serve_start_shutdown):
expected=0,
)

def test_queued_queries_disconnected(self, serve_start_shutdown):
def test_queued_queries_disconnected(self, serve_start_shutdown, clean_up_metrics):
"""Check that disconnected queued queries are tracked correctly."""

signal = SignalActor.remote()
@@ -1471,7 +1498,7 @@ def do_request():
# Unblock hanging request.
ray.get(signal.send.remote())

def test_running_requests_gauge(self, serve_start_shutdown):
def test_running_requests_gauge(self, serve_start_shutdown, clean_up_metrics):
signal = SignalActor.options(name="signal123").remote()
serve.run(
Router.options(num_replicas=2, ray_actor_options={"num_cpus": 0}).bind(
@@ -1531,7 +1558,7 @@ def test_running_requests_gauge(self, serve_start_shutdown):
)


def test_long_poll_host_sends_counted(serve_instance):
def test_long_poll_host_sends_counted(serve_instance, clean_up_metrics):
"""Check that the transmissions by the long_poll are counted."""

host = ray.remote(LongPollHost).remote(
@@ -1588,7 +1615,7 @@ def test_long_poll_host_sends_counted(serve_instance):
)


def test_actor_summary(serve_instance):
def test_actor_summary(serve_instance, clean_up_metrics):
@serve.deployment
def f():
pass