Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the thread_stats result #1316

Merged
merged 9 commits into from
Dec 27, 2023
38 changes: 32 additions & 6 deletions ldms/python/ldmsd/ldmsd_controller
Original file line number Diff line number Diff line change
Expand Up @@ -2161,14 +2161,26 @@ class LdmsdCmdParser(cmd.Cmd):
def complete_set_route(self, text, line, begidx, endidx):
    """Completion hook for the ``set_route`` command.

    Returns the result of ``__complete_attr_list`` for the text being
    completed. ``line``, ``begidx`` and ``endidx`` are accepted but not
    used by this method.
    """
    # NOTE(review): the attribute list requested here is 'set_info', not
    # 'set_route' -- confirm this is intentional.
    return self.__complete_attr_list('set_info', text)

def display_thread_stats(self, stats):
print(f"{'Name':16} {'Samples':12} {'Sample Rate':12} " \
def display_worker_thread_stats(self, worker_threads):
    """Print a table of LDMSD worker-thread statistics to stdout.

    ``worker_threads`` is an iterable of mappings, each providing the keys
    ``tid``, ``thread_id``, ``idle_pc``, ``active_pc``, ``total_us`` and
    ``ev_cnt``. ``total_us`` is divided by 1,000,000 and printed under the
    "Duration (sec)" heading; ``idle_pc`` and ``active_pc`` are printed
    with a trailing percent sign.
    """
    print("LDMSD Worker Thread Statistics")
    print("=" * 60)
    print(f"{'Thread ID':15} {'Linux Thread ID':20} {'Idle':15} {'Active':15} "
          f"{'Duration (sec)':15} {'Event Counts':15}")
    print(f"{'-'*15} {'-'*20} {'-'*15} {'-'*15} {'-'*15} {'-'*15}")
    for e in worker_threads:
        # Every field is separated by exactly one space so that each value
        # lines up with its header and ruler column (the duration and the
        # event count used to be emitted back to back).
        print(f"{e['tid']:^15} {e['thread_id']:20} {e['idle_pc']:14.2f}% "
              f"{e['active_pc']:14.2f}% {e['total_us'] / 1000000:15.2f} "
              f"{e['ev_cnt']:15}")

def display_io_thread_stats(self, io_threads):
print(f"IO Thread Statistics")
print(f"{'='*60}")
print(f"{'Thread ID':15} {'Linux Thread ID':20} {'Name':16} {'Samples':12} {'Sample Rate':12} " \
f"{'Utilization':12} {'Send Queue Size':16} " \
f"{'Num of EPs':12}")
print("---------------- ------------ ------------ ------------ "\
print(f"{'-'*15} {'-'*20} ---------------- ------------ ------------ ------------ "\
"---------------- ------------")
for e in stats['entries']:
print(f"{e['name']:16} {e['sample_count']:12.0f} " \
for e in io_threads:
print(f"{e['tid']:^15} {e['thread_id']:20} {e['name']:16} {e['sample_count']:12.0f} " \
f"{e['sample_rate']:12.2f} {e['utilization'] * 100:12.2f} " \
f"{e['sq_sz']:16} {e['n_eps']:12}")

Expand All @@ -2193,9 +2205,23 @@ class LdmsdCmdParser(cmd.Cmd):
if msg == "":
return
if rc != 0:
print(f"Error {msg['errcode']}: {resp['msg']}")
return
msg = fmt_status(msg)
self.display_thread_stats(msg)
self.display_worker_thread_stats(msg['worker_threads'])
print(f"{'='*60}")
self.display_io_thread_stats(msg['io_threads'])
print(f"{'='*60}")
print(f"IO Thread Usages of LDMS Operations")
print(f"{'='*60}")
for thr in msg['io_threads']:
thr['ldms_xprt'] = dict(sorted(thr['ldms_xprt'].items(), key = lambda item: item[1], reverse = True))
total = sum(v for v in thr['ldms_xprt'].values())
print(f"{thr['tid']} {thr['thread_id']} {thr['name']}")
print(" ", end="")
display = dict([(k, v) for k, v in thr['ldms_xprt'].items() if v != 0])
print('\n '.join(f"{key:20} {(value/total*100):12.6f}% {value}" for key, value in display.items()))
print(f"{'-'*60}")

def complete_thread_stats(self, text, line, begidx, endidx):
    """Completion hook for the ``thread_stats`` command.

    Returns the result of ``__complete_attr_list`` for the
    ``'thread_stats'`` attribute list and the text being completed.
    ``line``, ``begidx`` and ``endidx`` are accepted but not used by this
    method.
    """
    return self.__complete_attr_list('thread_stats', text)
Expand Down
81 changes: 81 additions & 0 deletions ldms/src/core/ldms.h
Original file line number Diff line number Diff line change
Expand Up @@ -1730,6 +1730,87 @@ struct ldms_xprt_rate_data {
double duration;
};

/*
 * Operation categories tracked by the LDMS thread statistics.
 *
 * LDMS_THRSTAT_OP_COUNT is the last enumerator and therefore equals the
 * number of categories; it sizes the `ops` arrays in struct ldms_thrstat and
 * struct ldms_thrstat_result_entry. New categories must be added before it.
 * NOTE(review): those arrays are presumably indexed by these values -- confirm.
 */
enum ldms_thrstat_op_e {
LDMS_THRSTAT_OP_OTHER, /* Ignore these operations, e.g., notify */
LDMS_THRSTAT_OP_CONNECT_SETUP,
LDMS_THRSTAT_OP_DIR_REQ,
LDMS_THRSTAT_OP_DIR_REPLY,
LDMS_THRSTAT_OP_LOOKUP_REQ,
LDMS_THRSTAT_OP_LOOKUP_REPLY,
LDMS_THRSTAT_OP_UPDATE_REQ,
LDMS_THRSTAT_OP_UPDATE_REPLY,
LDMS_THRSTAT_OP_STREAM_MSG,
LDMS_THRSTAT_OP_STREAM_CLIENT,
LDMS_THRSTAT_OP_PUSH_REQ,
LDMS_THRSTAT_OP_PUSH_REPLY,
LDMS_THRSTAT_OP_SET_DELETE_REQ,
LDMS_THRSTAT_OP_SET_DELETE_REPLY,
LDMS_THRSTAT_OP_SEND_MSG,
LDMS_THRSTAT_OP_RECV_MSG,
LDMS_THRSTAT_OP_AUTH,
LDMS_THRSTAT_OP_DISCONNECTED,
LDMS_THRSTAT_OP_COUNT /* Number of categories; keep last */
};

/*
* TODO: Revise the comment to explain the intended use of the thr stats structure
*
* ldms_xprt ensures that the reported thread statistics account for
* the time from when ldms_xprt receives an event from Zap until it returns
* from its Zap callback. This duration includes the time spent in the
* application callback. Applications are responsible for keeping
* track of the time used by their own operations. They may cache the data
* in the \c app_stats field.
*/

/* Accumulated statistics for a single operation category. */
struct ldms_thrstat_entry {
uint64_t total; /* Operation's Aggregated time in micro-seconds */
int count; /* NOTE(review): presumably the number of operations folded into total -- confirm */
};

/*
 * Thread statistics record: the most recent operation together with two
 * timestamps, plus one ldms_thrstat_entry per operation category.
 * NOTE(review): presumably one instance exists per I/O thread -- confirm.
 */
struct ldms_thrstat {
struct timespec last_op_start; /* presumably when last_op began -- TODO confirm */
struct timespec last_op_end; /* presumably when last_op finished -- TODO confirm */
enum ldms_thrstat_op_e last_op; /* category of the most recent operation */
struct ldms_thrstat_entry ops[LDMS_THRSTAT_OP_COUNT]; /* one entry per enum ldms_thrstat_op_e value */
};

/* One element of the entries array in struct ldms_thrstat_result. */
struct ldms_thrstat_result_entry {
struct zap_thrstat_result_entry *zap_res; /* associated Zap result entry (type declared elsewhere) */
uint64_t idle; /* NOTE(review): unit not stated here; presumably micro-seconds -- confirm */
uint64_t zap_time; /* NOTE(review): unit not stated here; presumably micro-seconds -- confirm */
uint64_t ops[LDMS_THRSTAT_OP_COUNT]; /* one value per enum ldms_thrstat_op_e category */
void *app_ctxt; /* opaque pointer; presumably application-owned context -- TODO confirm */
};

/*
 * Result object declared as the return type of ldms_thrstat_result_get().
 * The entries are stored inline after the header in a zero-length trailing
 * array, so the header and the entries share one allocation-sized object.
 */
struct ldms_thrstat_result {
int count; /* presumably the number of elements in entries -- TODO confirm */
struct zap_thrstat_result *_zres; /* underlying Zap result (type declared elsewhere) */
struct ldms_thrstat_result_entry entries[0]; /* trailing variable-length array */
};

/**
* \brief Convert \c enum ldms_thrstat_op_e to a string
*
* \param e The operation to convert
*
* \return A string of the operation name
*/
char *ldms_thrstat_op_str(enum ldms_thrstat_op_e e);

/**
* \brief Return thread usage information
*
* Return an ldms_thrstat_result structure or NULL on memory allocation failure.
* This result must be freed with the ldms_thrstat_result_free() function.
*
* \return A pointer to an ldms_thrstat_result structure, or NULL on memory
*         allocation failure
*/
struct ldms_thrstat_result *ldms_thrstat_result_get();

/**
* \brief Free an ldms_thrstat_result returned by \c ldms_thrstat_result_get
*
* \param res The result to free
*/
void ldms_thrstat_result_free(struct ldms_thrstat_result *res);

/**
* Query daemon telemetry data across transports
*
Expand Down
2 changes: 2 additions & 0 deletions ldms/src/core/ldms_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,9 @@ __stream_deliver(struct ldms_addr *src, uint64_t msg_gn,
ref_get(&c->ref, "callback");
pthread_rwlock_unlock(&s->rwlock);
_ev.recv.client = c;
/* TODO: Start: Get timing for application's stream handling time. */
rc = c->cb_fn(&_ev, c->cb_arg);
/* TODO: End: Get timing for application's stream handling time. */
if (__stream_stats_level > 0) {
pthread_rwlock_wrlock(&c->rwlock);
if (rc) {
Expand Down
Loading