Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make ldmsd continue startup on producer's hostname resolution failure #1537

Merged
merged 2 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion ldms/man/ldmsd_controller.man
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,10 @@ data our ldmsd holds for the prdcr.
.br
The recv rate (bytes/sec) limit for this connection. The default is -1
(unlimited).

.TP
.BI [cache_ip " cache_ip"]
.br
Controls how \fBldmsd\fR handles hostname resolution for producer IP addresses. When set to \fBtrue\fR (default), \fBldmsd\fR resolves the hostname once during \fBprdcr_add\fR and caches the result. If the initial resolution fails and the producer is started (via \fBprdcr_start\fR or \fBprdcr_start_regex\fR), \fBldmsd\fR will retry resolution at connection time and at each reconnection attempt until successful. When set to \fBfalse\fR, \fBldmsd\fR performs hostname resolution at \fBprdcr_add\fR time and repeats the resolution at every connection and reconnection attempt if the producer is started.
.RE

.SS Delete a producer from the aggregator
Expand Down
10 changes: 8 additions & 2 deletions ldms/python/ldmsd/ldmsd_communicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2189,11 +2189,13 @@ def _prdcr_add_attr_prep(self, **kwargs):
attrs.append(LDMSD_Req_Attr(attr_id=LDMSD_Req_Attr.QUOTA, value=str(int(kwargs['quota']))))
if 'rx_rate' in kwargs.keys() and kwargs['rx_rate']:
attrs.append(LDMSD_Req_Attr(attr_id=LDMSD_Req_Attr.RX_RATE, value=str(int(kwargs['rx_rate']))))
if 'cache_ip' in kwargs.keys() and kwargs['cache_ip']:
attrs.append(LDMSD_Req_Attr(attr_id = LDMSD_Req_Attr.IP, value = str(kwargs['cache_ip'])))

return attrs

def prdcr_add(self, name, ptype, xprt, host, port, reconnect, auth=None, perm=None,
rail=None, quota=None, rx_rate=None):
rail=None, quota=None, rx_rate=None, cache_ip=None):
"""
Add a producer. A producer is a peer to the LDMSD being configured.
Once started, the LDMSD will attempt to connect to this peer
Expand All @@ -2211,6 +2213,7 @@ def prdcr_add(self, name, ptype, xprt, host, port, reconnect, auth=None, perm=No
- The reconnect interval in microseconds

Keyword Parameters:
auth - The authentication domain
perm - The configuration client permission required to
modify the producer configuration. Default is None.
rail - The number of endpoints in a rail. The default is 1.
Expand All @@ -2219,6 +2222,8 @@ def prdcr_add(self, name, ptype, xprt, host, port, reconnect, auth=None, perm=No
('--quota' ldmsd option).
rx_rate - The recv rate (bytes/second) limit for this connection. The
default is -1 (unlimited).
cache_ip - True: Cache hostname after first successful resolution;
False: Resolve hostname on every connection

Returns:
A tuple of status, data
Expand All @@ -2227,7 +2232,8 @@ def prdcr_add(self, name, ptype, xprt, host, port, reconnect, auth=None, perm=No
"""
args_d = {'name': name, 'ptype': ptype, 'xprt': xprt, 'host': host, 'port': port,
'reconnect': reconnect, 'auth': auth, 'perm': perm,
'rail': rail, 'quota': quota, 'rx_rate': rx_rate}
'rail': rail, 'quota': quota, 'rx_rate': rx_rate,
'cache_ip' : cache_ip}
attrs = self._prdcr_add_attr_prep(**args_d)
req = LDMSD_Request( command_id = LDMSD_Request.PRDCR_ADD, attrs = attrs)
try:
Expand Down
5 changes: 4 additions & 1 deletion ldms/python/ldmsd/ldmsd_controller
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,8 @@ class LdmsdCmdParser(cmd.Cmd):
holds for the prdcr.
[rx_rate=] The recv rate (bytes/sec) limit for this connection. The
default is -1 (unlimited).
[cache_ip=] True to cache the IP address after first successful resolution (default).
False to resolve the hostname at prdcr_add and at every connection attempt.
"""
arg = self.handle_args('prdcr_add', arg)
if arg is None:
Expand All @@ -404,7 +406,8 @@ class LdmsdCmdParser(cmd.Cmd):
arg['perm'],
arg['rail'],
arg['quota'],
arg['rx_rate'])
arg['rx_rate'],
arg['cache_ip'])
if rc:
print(f'Error adding prdcr {arg["name"]}: {msg}')

Expand Down
7 changes: 6 additions & 1 deletion ldms/src/ldmsd/ldmsd.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,11 @@ typedef struct ldmsd_prdcr_stream_s {
typedef struct ldmsd_prdcr {
struct ldmsd_cfgobj obj;

/* Controls hostname resolution caching behavior (user configurable)
* 1 = (default) Cache hostname after first successful resolution
* 0 = Resolve hostname on every connection
*/
uint8_t cache_ip;
struct sockaddr_storage ss; /* Host address */
socklen_t ss_len;
char *host_name; /* Host name */
Expand Down Expand Up @@ -1157,7 +1162,7 @@ ldmsd_prdcr_new_with_auth(const char *name, const char *xprt_name,
enum ldmsd_prdcr_type type,
int conn_intrvl_us,
const char *auth, uid_t uid, gid_t gid, int perm, int rail,
int64_t quota, int64_t rx_rate);
int64_t quota, int64_t rx_rate, int cache_ip);
int ldmsd_prdcr_del(const char *prdcr_name, ldmsd_sec_ctxt_t ctxt);
ldmsd_prdcr_t ldmsd_prdcr_first();
ldmsd_prdcr_t ldmsd_prdcr_next(struct ldmsd_prdcr *prdcr);
Expand Down
9 changes: 8 additions & 1 deletion ldms/src/ldmsd/ldmsd_failover.c
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,12 @@ int __failover_send_prdcr(ldmsd_failover_t f, ldms_t x, ldmsd_prdcr_t p)
if (rc)
goto cleanup;

/* Cache_ip */
snprintf(buff, sizeof(buff), "%d", p->cache_ip);
rc = ldmsd_req_cmd_attr_append_str(rcmd, LDMSD_ATTR_IP, buff);
if (rc)
goto cleanup;

/* Terminate the message */
rc = ldmsd_req_cmd_attr_term(rcmd);
if (rc)
Expand Down Expand Up @@ -2050,6 +2056,7 @@ int failover_cfgprdcr_handler(ldmsd_req_ctxt_t req)
char *rail_s = __req_attr_gets(req, LDMSD_ATTR_RAIL);
char *quota_s = __req_attr_gets(req, LDMSD_ATTR_QUOTA);
char *rx_rate_s = __req_attr_gets(req, LDMSD_ATTR_RX_RATE);
char *cache_ip = __req_attr_gets(req, LDMSD_ATTR_IP);

uid_t _uid;
gid_t _gid;
Expand Down Expand Up @@ -2116,7 +2123,7 @@ int failover_cfgprdcr_handler(ldmsd_req_ctxt_t req)

p = ldmsd_prdcr_new_with_auth(name, xprt, host, atoi(port), ptype,
atoi(interval), auth, _uid, _gid, _perm, rail, quota,
rx_rate);
rx_rate, atoi(cache_ip));
if (!p) {
rc = errno;
str_rbn_free(srbn);
Expand Down
25 changes: 18 additions & 7 deletions ldms/src/ldmsd/ldmsd_prdcr.c
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,17 @@ static void prdcr_connect(ldmsd_prdcr_t prdcr)
{
int ret;

if ((0 == prdcr->ss.ss_family) || (!prdcr->cache_ip)) {
if (prdcr_resolve(prdcr->host_name, prdcr->port_no,
&prdcr->ss, &prdcr->ss_len)) {
ovis_log(prdcr_log, OVIS_LERROR, "Producer '%s' connection failed. " \
"Hostname '%s:%u' not resolved.\n",
prdcr->obj.name, prdcr->host_name,
(unsigned) prdcr->port_no);
return;
}
}

switch (prdcr->type) {
case LDMSD_PRDCR_TYPE_ACTIVE:
case LDMSD_PRDCR_TYPE_BRIDGE:
Expand Down Expand Up @@ -995,7 +1006,7 @@ ldmsd_prdcr_new_with_auth(const char *name, const char *xprt_name,
const char *host_name, const unsigned short port_no,
enum ldmsd_prdcr_type type, int conn_intrvl_us,
const char *auth, uid_t uid, gid_t gid, int perm, int rail,
int64_t quota, int64_t rx_rate)
int64_t quota, int64_t rx_rate, int cache_ip)
{
extern struct rbt *cfgobj_trees[];
struct ldmsd_prdcr *prdcr;
Expand All @@ -1014,6 +1025,7 @@ ldmsd_prdcr_new_with_auth(const char *name, const char *xprt_name,
prdcr->conn_intrvl_us = conn_intrvl_us;
prdcr->port_no = port_no;
prdcr->conn_state = LDMSD_PRDCR_STATE_STOPPED;
prdcr->cache_ip = cache_ip;
rbt_init(&prdcr->set_tree, set_cmp);
rbt_init(&prdcr->hint_set_tree, ldmsd_updtr_schedule_cmp);
prdcr->rail = rail;
Expand All @@ -1032,11 +1044,9 @@ ldmsd_prdcr_new_with_auth(const char *name, const char *xprt_name,
}

prdcr->ss_len = sizeof(prdcr->ss);
if (prdcr_resolve(host_name, port_no, &prdcr->ss, &prdcr->ss_len)) {
errno = EAFNOSUPPORT;
ovis_log(prdcr_log, OVIS_LERROR, "ldmsd_prdcr_new: %s:%u not resolved.\n",
host_name,(unsigned) port_no);
goto out;
if (prdcr_resolve(prdcr->host_name, prdcr->port_no, &prdcr->ss, &prdcr->ss_len)) {
ovis_log(config_log, OVIS_LWARN, "Producer '%s': %s:%u not resolved.\n",
prdcr->obj.name, prdcr->host_name,(unsigned) prdcr->port_no);
}

if (!auth)
Expand Down Expand Up @@ -1076,7 +1086,8 @@ ldmsd_prdcr_new(const char *name, const char *xprt_name,
{
return ldmsd_prdcr_new_with_auth(name, xprt_name, host_name,
port_no, type, conn_intrvl_us,
DEFAULT_AUTH, getuid(), getgid(), 0777, rail, quota, rx_rate);
DEFAULT_AUTH, getuid(), getgid(), 0777, rail, quota,
rx_rate, 1);
}

extern struct rbt *cfgobj_trees[];
Expand Down
16 changes: 13 additions & 3 deletions ldms/src/ldmsd/ldmsd_request.c
Original file line number Diff line number Diff line change
Expand Up @@ -1647,7 +1647,9 @@ ldmsd_prdcr_t __prdcr_add_handler(ldmsd_req_ctxt_t reqc, char *verb, char *obj_n
int64_t quota = ldmsd_quota; /* use the global quota setting by default */
int64_t rx_rate = LDMS_UNLIMITED;
int rail = 1;
char *perm_s = NULL;
char *perm_s, *cache_ip_s;
int cache_ip = 1; /* Default is 1. */
perm_s = cache_ip_s = NULL;

name = host = xprt = type_s = port_s = interval_s = auth = rail_s = quota_s = NULL;

Expand Down Expand Up @@ -1773,9 +1775,17 @@ ldmsd_prdcr_t __prdcr_add_handler(ldmsd_req_ctxt_t reqc, char *verb, char *obj_n
goto out;
}
}

cache_ip_s = ldmsd_req_attr_str_value_get_by_id(reqc, LDMSD_ATTR_IP);
if (cache_ip_s) {
if (0 == strcasecmp(cache_ip_s, "false")) {
cache_ip = 0;
}
}

prdcr = ldmsd_prdcr_new_with_auth(name, xprt, host, port_no, type,
interval_us, auth, uid, gid, perm,
rail, quota, rx_rate);
rail, quota, rx_rate, cache_ip);
if (!prdcr) {
if (errno == EEXIST)
goto eexist;
Expand Down Expand Up @@ -9988,7 +9998,7 @@ static int __process_advertisement(ldmsd_req_ctxt_t reqc, ldmsd_prdcr_listen_t l
NULL, uid, gid, 0770,
ldms_xprt_rail_eps(x),
ldms_xprt_rail_recv_quota_get(x),
ldms_xprt_rail_recv_rate_limit_get(x));
ldms_xprt_rail_recv_rate_limit_get(x), 1);
if (!prdcr) {
reqc->errcode = ENOMEM;
reqc->line_off = snprintf(reqc->line_buf, reqc->line_len,
Expand Down
1 change: 1 addition & 0 deletions ldms/src/ldmsd/ldmsd_request_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ struct req_str_id attr_str_id_table[] = {
{ "auto_interval", LDMSD_ATTR_AUTO_INTERVAL },
{ "auto_switch", LDMSD_ATTR_AUTO_SWITCH },
{ "base", LDMSD_ATTR_BASE },
{ "cache_ip", LDMSD_ATTR_IP },
{ "container", LDMSD_ATTR_CONTAINER },
{ "decomposition", LDMSD_ATTR_DECOMP },
{ "disable_start", LDMSD_ATTR_AUTO_INTERVAL },
Expand Down