Skip to content

Commit

Permalink
utils: use JFR events to track door interaction with other cells
Browse files Browse the repository at this point in the history
Motivation:
for slow transfers we want to be able to see how long cell
communications took time. This information can be collected with java
flight recorder.

Modification:
Update Transfer class to record JFR events when interaction with other
cells takes place. The mover stop implementation uses an addition event
holder, as mover kill uses notification, thus no callback.

Result:
message interaction information visible in JFR.

```
$ jfr print --categories dCache /tmp/dCacheDomain_12455560939331420208.jfr | head -20                                                            (git)-[transfer-jfr]
org.dcache.door.Messages {
  startTime = 09:45:35.672
  duration = 37.8 ms
  message = "PnfsGetFileAttributes"
  Source = "NFS-nairi@dCacheDomain"
  Destination = "PnfsManager"
  eventThread = "proc-9" (javaThreadId = 73)
  stackTrace = [
    org.dcache.util.Transfer.lambda$readNameSpaceEntryAsync$2(Transfer$MessageEvent, boolean, PnfsGetFileAttributes) line: 818
    org.dcache.cells.CellStub.lambda$transformAsync$3(AsyncFunction, Message) line: 559
    com.google.common.util.concurrent.AbstractTransformFuture$AsyncTransformFuture.doTransform(AsyncFunction, Object) line: 223
    com.google.common.util.concurrent.AbstractTransformFuture$AsyncTransformFuture.doTransform(Object, Object) line: 210
    com.google.common.util.concurrent.AbstractTransformFuture.run() line: 123
  ]
}

...
```

Acked-by: Lea Morschel
Target: master
Require-book: no
Require-notes: yes
  • Loading branch information
kofemann committed Jan 27, 2025
1 parent 693fffa commit 3b12aa7
Showing 1 changed file with 68 additions and 0 deletions.
68 changes: 68 additions & 0 deletions modules/dcache/src/main/java/org/dcache/util/Transfer.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,17 @@
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import javax.security.auth.Subject;

import jdk.jfr.Category;
import jdk.jfr.Enabled;
import jdk.jfr.Event;
import jdk.jfr.Label;
import jdk.jfr.Name;
import org.dcache.acl.enums.AccessMask;
import org.dcache.auth.attributes.Restriction;
import org.dcache.auth.attributes.Restrictions;
Expand Down Expand Up @@ -172,6 +179,27 @@ private synchronized EnumSet<RequestState> getAllowedRequestStates() {
return EnumSet.copyOf(_allowedRequestStates);
}

@Category({"dCache", "Transfer"})
@Name("org.dcache.door.Messages")
@Label("Messages")
@Enabled(true)
static class MessageEvent extends Event {
@Label("Message")
String message;

@Name("Source")
String source;

@Name("Destination")
String destination;
}

/**
* Optional close event. The close uses notifications, thus we can't use cellMessage reply callback.
* So, use a call field that will commit message on finish.
*/
private final AtomicReference<MessageEvent> killMoverEventHolder = new AtomicReference<>();

/**
* Constructs a new Transfer object.
*
Expand Down Expand Up @@ -543,6 +571,12 @@ public final synchronized void finished(int rc, String error) {
* Signals that the mover of this transfer finished.
*/
public final synchronized void finished(DoorTransferFinishedMessage msg) {

var killMoverEvent = killMoverEventHolder.getAndSet(null);
if (killMoverEvent != null) {
killMoverEvent.commit();
}

setFileAttributes(msg.getFileAttributes());
setProtocolInfo(msg.getProtocolInfo());
moverInfoMessage = msg.getMoverInfo();
Expand Down Expand Up @@ -774,12 +808,19 @@ private ListenableFuture<Void> readNameSpaceEntryAsync(boolean allowWrite, long
}
request.setAccessMask(mask);
request.setUpdateAtime(true);

MessageEvent nameSpaceReadEvent = new MessageEvent();
nameSpaceReadEvent.source = getCellName() + "@" + getDomainName();
nameSpaceReadEvent.destination = "PnfsManager";
nameSpaceReadEvent.message = PnfsGetFileAttributes.class.getSimpleName();
nameSpaceReadEvent.begin();
ListenableFuture<PnfsGetFileAttributes> reply = _pnfs.requestAsync(request, timeout);

setStatusUntil("PnfsManager: Fetching storage info", reply);

return CellStub.transformAsync(reply,
msg -> {
nameSpaceReadEvent.commit();
FileAttributes attributes = msg.getFileAttributes();
/* We can only transfer regular files.
*/
Expand Down Expand Up @@ -942,6 +983,13 @@ public ListenableFuture<Void> selectPoolAsync(long timeout) {
return immediateFuture(null);
}

// init JFR event
MessageEvent poolSelectEvent = new MessageEvent();
poolSelectEvent.source = getCellName() + "@" + getDomainName();
poolSelectEvent.destination = "PoolManager";
poolSelectEvent.message = isWrite() ? PoolMgrSelectReadPoolMsg.class.getSimpleName() : PoolMgrSelectWritePoolMsg.class.getSimpleName();
poolSelectEvent.begin();

ProtocolInfo protocolInfo = getProtocolInfoForPoolManager();
ListenableFuture<? extends PoolMgrSelectPoolMsg> reply;
if (isWrite()) {
Expand Down Expand Up @@ -998,6 +1046,7 @@ public ListenableFuture<Void> selectPoolAsync(long timeout) {
setStatusUntil("PoolManager: Selecting pool", reply);
return CellStub.transform(reply,
(PoolMgrSelectPoolMsg msg) -> {
poolSelectEvent.commit();
setPool(msg.getPool());
setFileAttributes(msg.getFileAttributes());
return null;
Expand Down Expand Up @@ -1038,6 +1087,13 @@ public ListenableFuture<Void> startMoverAsync(long timeout) {
message.setId(_id);
message.setSubject(_subject);

// init JFR event
MessageEvent moverStartEvent = new MessageEvent();
moverStartEvent.source = getCellName() + "@" + getDomainName();
moverStartEvent.destination = pool.getName();
moverStartEvent.message = message.getClass().getSimpleName();
moverStartEvent.begin();

/*
* SpaceManager needs to spy mover shutdown to adjust the space reservation. for this reason we have to
* proxy mover start messages through SpaceManager. However, reads can be sent directly to pools.
Expand All @@ -1056,6 +1112,7 @@ public ListenableFuture<Void> startMoverAsync(long timeout) {

setStatusUntil("Pool " + pool + ": Creating mover", reply);
return CellStub.transformAsync(reply, msg -> {
moverStartEvent.commit();
setMoverId(msg.getMoverId());
return immediateFuture(null);
});
Expand Down Expand Up @@ -1096,6 +1153,17 @@ public void killMover(long millis, String explanation) {
PoolMoverKillMessage message =
new PoolMoverKillMessage(pool.getName(), moverId, explanation);
message.setReplyRequired(false);

// init JFR event
var killMoverEvent = new MessageEvent();
killMoverEvent.source = getCellName() + "@" + getDomainName();
killMoverEvent.destination = pool.getName();
killMoverEvent.message = message.getClass().getSimpleName();

if (killMoverEventHolder.compareAndSet(null, killMoverEvent)) {
killMoverEvent.begin();
}

_poolStub.notify(new CellPath(pool.getAddress()), message);

/* To reduce the risk of orphans when using PNFS, we wait
Expand Down

0 comments on commit 3b12aa7

Please sign in to comment.