Skip to content

Commit

Permalink
Set the default timeout of rpyc connect to 30s and add pushgateway au…
Browse files Browse the repository at this point in the history
…th (#459)
  • Loading branch information
shihaobai authored Jul 9, 2024
1 parent b8155f0 commit 50f0e2e
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 4 deletions.
3 changes: 3 additions & 0 deletions lightllm/server/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,9 @@ def main():
parser.add_argument("--metric_gateway", type=str, default=None, help="address for collecting monitoring metrics")
parser.add_argument("--job_name", type=str, default="lightllm", help="job name for monitor")
parser.add_argument("--push_interval", type=int, default=10, help="interval of pushing monitoring metrics")
parser.add_argument(
"--enable_monitor_auth", action="store_true", help="Whether to open authentication for push_gateway"
)

args = parser.parse_args()

Expand Down
15 changes: 13 additions & 2 deletions lightllm/server/metrics/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,20 @@
from prometheus_client import generate_latest
import multiprocessing.shared_memory as shm
from concurrent.futures import ThreadPoolExecutor
import functools
from rpyc import async_, SocketStream

async_metric_server = None
from rpyc import async_

def connect_decorator(func):
    """Wrap a connect-style callable so that its timeout defaults to 30 seconds.

    Args:
        func: The callable to wrap. It must accept a ``timeout`` keyword
            argument.

    Returns:
        A wrapper with the same metadata as ``func`` (via ``functools.wraps``)
        that forwards all positional and keyword arguments unchanged, adding
        ``timeout=30`` only when the caller did not pass ``timeout`` as a
        keyword.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Raise the default timeout (3) to 30s. setdefault, rather than an
        # unconditional overwrite, keeps a timeout the caller chose explicitly.
        kwargs.setdefault("timeout", 30)
        return func(*args, **kwargs)

    return wrapper


# Monkeypatch applied at import time: replace rpyc's SocketStream._connect with
# the connect_decorator wrapper so that it is called with a 30-second timeout.
# NOTE(review): this affects every user of SocketStream in the process, not
# just this module -- confirm that is intended.
SocketStream._connect = connect_decorator(SocketStream._connect)


class MetricServer(rpyc.Service):
Expand Down
16 changes: 15 additions & 1 deletion lightllm/server/metrics/metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import time
from prometheus_client import CollectorRegistry, Histogram, Counter, Gauge
from prometheus_client import push_to_gateway
from prometheus_client.exposition import basic_auth_handler

MONITOR_INFO = {
"lightllm_request_count": "The total number of requests",
Expand All @@ -27,6 +29,14 @@
}


def my_auth_handler(url, method, timeout, headers, data):
    """Build a basic-auth handler using credentials from the environment.

    The credentials are read from the ``USERNAME`` and ``PASSWORD``
    environment variables on every call.

    Args:
        url, method, timeout, headers, data: Passed through unchanged to
            ``basic_auth_handler``.

    Returns:
        Whatever ``basic_auth_handler`` returns for the given arguments and
        the credentials read from the environment.

    Raises:
        ValueError: If either ``USERNAME`` or ``PASSWORD`` is not set.
    """
    credentials = (os.environ.get("USERNAME"), os.environ.get("PASSWORD"))
    if any(item is None for item in credentials):
        raise ValueError("USERNAME and PASSWORD must be set when the auth is opened.")
    user, secret = credentials
    return basic_auth_handler(url, method, timeout, headers, data, user, secret)


class Monitor:
def __init__(self, args):
duration_buckets = []
Expand All @@ -40,6 +50,7 @@ def __init__(self, args):
self.gateway_url = args.metric_gateway
self.registry = CollectorRegistry()
self.job_name = args.job_name
self.auth = args.enable_monitor_auth
self.init_metrics(args)

def init_metrics(self, args):
Expand Down Expand Up @@ -115,4 +126,7 @@ def gauge_set(self, name, value):

def push_metrices(self):
if self.gateway_url is not None:
push_to_gateway(self.gateway_url, job=self.job_name, registry=self.registry)
if self.auth:
push_to_gateway(self.gateway_url, job=self.job_name, registry=self.registry, handler=my_auth_handler)
else:
push_to_gateway(self.gateway_url, job=self.job_name, registry=self.registry)
2 changes: 1 addition & 1 deletion lightllm/server/router/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ async def loop_for_fwd(
else:
self.shared_token_load.set_dynamic_max_load(0.0)
self.shared_token_load.set_current_load(0.0)
if counter_count % 100 == 0:
if counter_count % 300 == 0:
self.metric_client.gauge_set("lightllm_batch_current_size", 0.0)
self.metric_client.gauge_set("lightllm_batch_pause_size", 0.0)
self.metric_client.gauge_set("lightllm_queue_size", 0.0)
Expand Down

0 comments on commit 50f0e2e

Please sign in to comment.