Skip to content

Commit

Permalink
likwid-bridge: a helper tool to run likwid in containers like Apptain…
Browse files Browse the repository at this point in the history
…er (RRZE-HPC#626)

* src/access_client: add support for likwid-bridge

* src/access_client: use LIKWID_BRIDGE_PATH

* src/bridge: add likwid-bridge

* Add container bridge to build system. Config option CONTAINER_HELPER

---------

Co-authored-by: Thomas Roehl <[email protected]>
  • Loading branch information
pradt2 and TomTheBear authored Sep 9, 2024
1 parent d4f6c4e commit 854160e
Show file tree
Hide file tree
Showing 6 changed files with 330 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ likwid-pin
likwid-powermeter
likwid-setFrequencies
likwid-topology
likwid-bridge
likwid.lua

# generated doc files
Expand Down
46 changes: 40 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,11 @@ else
# Select the prerequisites of the default goal 'all'.
# BUILDFREQ=false: $(FREQ_TARGET) is left out of the build.
# NOTE(review): this branch never lists $(CONTAINER_HELPER_TARGET), whatever
# CONTAINER_HELPER is set to -- confirm that this is intended.
ifeq ($(BUILDFREQ),false)
all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF) $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET)
else
# Any other BUILDFREQ value: $(FREQ_TARGET) is built as well. The container
# helper target is appended unless CONTAINER_HELPER is exactly 'false'.
ifeq ($(CONTAINER_HELPER),false)
all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF) $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET)
else
all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF) $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET) $(CONTAINER_HELPER_TARGET)
endif
endif
endif

Expand Down Expand Up @@ -275,6 +279,10 @@ $(APPDAEMON_TARGET): $(SRC_DIR)/access-daemon/appDaemon.c $(TARGET_GOTCHA_LIB)
@echo "===> BUILD application interface likwid-appDaemon.so"
$(Q)$(MAKE) -C $(SRC_DIR)/access-daemon likwid-appDaemon.so

# Compile the container helper likwid-bridge from its single C source file.
# $< is the bridge.c prerequisite, $@ the helper binary named by
# CONTAINER_HELPER_TARGET.
$(CONTAINER_HELPER_TARGET): $(SRC_DIR)/bridge/bridge.c
	@echo "===> BUILD container helper likwid-bridge"
	$(Q)$(CC) $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@

# Create the build directory. -p creates missing parent directories for a
# nested BUILD_DIR and does not fail if the directory already exists (for
# example when it was created concurrently).
$(BUILD_DIR):
	@mkdir -p $@

Expand Down Expand Up @@ -365,7 +373,7 @@ clean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TARGET
@rm -f $(DYNAMIC_TARGET_LIB)*
@rm -f $(PINLIB)*
@rm -f $(FORTRAN_IF_NAME)
@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET)
@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET) $(CONTAINER_HELPER_TARGET)
@rm -f likwid-config.cmake

distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TARGET)
Expand All @@ -378,7 +386,7 @@ distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TA
@rm -f $(DYNAMIC_TARGET_LIB)*
@rm -f $(PINLIB)*
@rm -f $(FORTRAN_IF_NAME)
@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET)
@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET) $(CONTAINER_HELPER_TARGET)
@rm -rf $(BUILD_DIR)
@if [ "$(LUA_INTERNAL)" = "true" ]; then rm -f $(TARGET_LUA_LIB).* $(shell basename $(TARGET_LUA_LIB)).*; fi
@if [ "$(USE_INTERNAL_HWLOC)" = "true" ]; then rm -f $(TARGET_HWLOC_LIB).* $(shell basename $(TARGET_HWLOC_LIB)).*; fi
Expand Down Expand Up @@ -488,7 +496,33 @@ uninstall_appdaemon_moved:
@echo "===> No UNINSTALL of the application interface appDaemon"
endif

install: install_daemon install_freq install_appdaemon
# Install, move and uninstall rules for the container helper likwid-bridge.
# Both branches must define the same four targets because the top-level
# install, move, uninstall and uninstall_moved goals list them as
# prerequisites. With CONTAINER_HELPER != true the targets only print a notice.
ifeq ($(CONTAINER_HELPER),true)
install_container_helper: $(CONTAINER_HELPER_TARGET)
	@echo "===> INSTALL container helper likwid-bridge to $(SBINPREFIX)/likwid-bridge"
	@mkdir -p $(SBINPREFIX)
	@install -m 755 $(CONTAINER_HELPER_TARGET) $(SBINPREFIX)/likwid-bridge
move_container_helper:
	@echo "===> MOVE container helper likwid-bridge from $(SBINPREFIX)/likwid-bridge to $(INSTALLED_SBINPREFIX)/likwid-bridge"
	@mkdir -p $(INSTALLED_SBINPREFIX)
	@install -m 755 $(SBINPREFIX)/$(CONTAINER_HELPER_TARGET) $(INSTALLED_SBINPREFIX)/$(CONTAINER_HELPER_TARGET)
uninstall_container_helper:
	@echo "===> REMOVING container helper likwid-bridge from $(SBINPREFIX)/$(CONTAINER_HELPER_TARGET)"
	@rm -f $(SBINPREFIX)/$(CONTAINER_HELPER_TARGET)
uninstall_container_helper_moved:
	@echo "===> REMOVING container helper likwid-bridge from $(INSTALLED_SBINPREFIX)/$(CONTAINER_HELPER_TARGET)"
	@rm -f $(INSTALLED_SBINPREFIX)/$(CONTAINER_HELPER_TARGET)
else
install_container_helper:
	@echo "===> No INSTALL of the container helper likwid-bridge"
# These three were previously named *_appdaemon, which left the
# *_container_helper prerequisites without a rule and redefined the
# application daemon targets.
move_container_helper:
	@echo "===> No MOVE of the container helper likwid-bridge"
uninstall_container_helper:
	@echo "===> No UNINSTALL of the container helper likwid-bridge"
uninstall_container_helper_moved:
	@echo "===> No UNINSTALL of the container helper likwid-bridge"
endif

install: install_daemon install_freq install_appdaemon install_container_helper
@echo "===> INSTALL applications to $(BINPREFIX)"
@mkdir -p $(BINPREFIX)
@chmod 755 $(BINPREFIX)
Expand Down Expand Up @@ -585,7 +619,7 @@ install: install_daemon install_freq install_appdaemon
@echo "===> INSTALL cmake to $(abspath $(PREFIX)/share/likwid)"
@install -m 644 $(PWD)/likwid-config.cmake $(PREFIX)/share/likwid

move: move_daemon move_freq move_appdaemon
move: move_daemon move_freq move_appdaemon move_container_helper
@echo "===> MOVE applications from $(BINPREFIX) to $(INSTALLED_BINPREFIX)"
@mkdir -p $(INSTALLED_BINPREFIX)
@chmod 755 $(INSTALLED_BINPREFIX)
Expand Down Expand Up @@ -656,7 +690,7 @@ move: move_daemon move_freq move_appdaemon
$(PREFIX)/share/likwid/likwid-config.cmake > $(INSTALLED_PREFIX)/share/likwid/likwid-config.cmake
@chmod 644 $(INSTALLED_PREFIX)/share/likwid/likwid-config.cmake

uninstall: uninstall_daemon uninstall_freq uninstall_appdaemon
uninstall: uninstall_daemon uninstall_freq uninstall_appdaemon uninstall_container_helper
@echo "===> REMOVING applications from $(PREFIX)/bin"
@rm -f $(addprefix $(BINPREFIX)/,$(addsuffix .lua,$(L_APPS)))
@for APP in $(L_APPS); do \
Expand Down Expand Up @@ -693,7 +727,7 @@ uninstall: uninstall_daemon uninstall_freq uninstall_appdaemon
@rm -rf $(PREFIX)/share/likwid/likwid-config.cmake
@rm -rf $(PREFIX)/share/likwid

uninstall_moved: uninstall_daemon_moved uninstall_freq_moved uninstall_appdaemon_moved
uninstall_moved: uninstall_daemon_moved uninstall_freq_moved uninstall_appdaemon_moved uninstall_container_helper_moved
@echo "===> REMOVING applications from $(INSTALLED_PREFIX)/bin"
@rm -f $(addprefix $(INSTALLED_BINPREFIX)/,$(addsuffix .lua,$(L_APPS)))
@for APP in $(L_APPS); do \
Expand Down
7 changes: 7 additions & 0 deletions config.mk
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ ROCM_INTERFACE = false#NO SPACE
# Build experimental sysfeatures interface and Lua CLI application
BUILD_SYSFEATURES = false#NO SPACE

# Build and install the container helper likwid-bridge (true/false)
CONTAINER_HELPER = true#NO SPACE

#################################################################
#################################################################
# Advanced configuration options #
Expand Down Expand Up @@ -87,6 +90,10 @@ BUILDAPPDAEMON=true
APPDAEMON = $(PREFIX)/lib/likwid-appDaemon.so#NO SPACE
INSTALLED_APPDAEMON = $(INSTALLED_PREFIX)/lib/likwid-appDaemon.so#NO SPACE

# Paths of the container helper likwid-bridge below PREFIX and INSTALLED_PREFIX.
TMP_CONTAINER_HELPER = $(PREFIX)/sbin/likwid-bridge
INSTALLED_CONTAINER_HELPER = $(INSTALLED_PREFIX)/sbin/likwid-bridge

# chown installed tools to this user/group
# if you change anything here, make sure that the user/group can access
# the MSR devices and (on Intel) the PCI devices.
Expand Down
4 changes: 4 additions & 0 deletions make/config_defines.mk
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,10 @@ DEFINES += -DLIKWID_WITH_ROCMON -D__HIP_PLATFORM_HCC__
BUILDAPPDAEMON = true
endif

# Register the container helper when CONTAINER_HELPER is exactly 'true':
# append likwid-bridge to C_APPS and set CONTAINER_HELPER_TARGET to the name
# of the helper binary. Nothing in this block assigns CONTAINER_HELPER_TARGET
# for any other value of CONTAINER_HELPER.
ifeq ($(CONTAINER_HELPER),true)
C_APPS += likwid-bridge
CONTAINER_HELPER_TARGET = likwid-bridge
endif
ifeq ($(strip $(BUILDDAEMON)),true)
ifneq ($(strip $(COMPILER)),MIC)
DAEMON_TARGET = likwid-accessD
Expand Down
161 changes: 141 additions & 20 deletions src/access_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,22 +123,15 @@ access_client_catch_signal()
}

static int
access_client_startDaemon(int cpu_id)
access_client_startDaemon_direct(int cpu_id, struct sockaddr_un *address)
{
/* Check the function of the daemon here */
int res = 0;
char* filepath;
char *newargv[] = { NULL };
char *newenv[] = { NULL };
char *safeexeprog = TOSTRING(ACCESSDAEMON);
char exeprog[1024];
struct sockaddr_un address;
size_t address_length;
int ret;
pid_t pid;
int timeout = 1000;
int socket_fd = -1;
int print_once = 0;

if (config.daemonPath != NULL)
{
Expand Down Expand Up @@ -183,34 +176,141 @@ access_client_startDaemon(int cpu_id)
return pid;
}

address->sun_family = AF_LOCAL;
snprintf(address->sun_path, sizeof(address->sun_path), TOSTRING(LIKWIDSOCKETBASE) "-%d", pid);

daemon_pids[cpu_id] = pid;
nr_daemons++;
return 0;
}

/*
 * Ask an already running likwid-bridge for an access daemon.
 *
 * Connects to the UNIX socket at bridge_path, sends one int request (value 1)
 * and reads one int reply, which is used as the PID of the daemon. On success
 * daemon_address is filled with the socket path LIKWIDSOCKETBASE-<pid>, the
 * PID is stored in daemon_pids[cpu_id] and nr_daemons is incremented.
 *
 * cpu_id          index into daemon_pids; also used in log messages
 * bridge_path     filesystem path of the bridge socket
 * daemon_address  output: address of the daemon socket to connect to
 *
 * Returns 0 on success and -1 if the socket cannot be created, the connection
 * times out, or the request/reply exchange is short or fails.
 *
 * NOTE(review): socket_fd is closed on the error paths but not on the success
 * path -- confirm whether the bridge connection is meant to stay open.
 */
static int
access_client_startDaemon_bridge(int cpu_id, const char *bridge_path, struct sockaddr_un *daemon_address) {
struct sockaddr_un bridge_address;
int socket_fd = -1;
int address_length;
char *filepath;
int timeout = 1000;
int res;
int io_buf;
long io_count;
int daemon_pid;

/* snprintf truncates bridge_path if it does not fit into sun_path. */
bridge_address.sun_family = AF_LOCAL;
snprintf(bridge_address.sun_path, sizeof(bridge_address.sun_path), "%s", bridge_path);

socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0);
if (socket_fd < 0)
{
ERROR_PRINT(socket() failed);
return -1;
}

address_length = sizeof(struct sockaddr_un);
/* Private copy of the path, used only for the messages below. */
filepath = strdup(bridge_address.sun_path);
DEBUG_PRINT(DEBUGLEV_DEVELOP, Waiting for bridge socket file %s, bridge_address.sun_path);
/* Poll for the socket file: at most 1000 rounds with a 2500 us sleep each.
 * The outcome is only logged; connect() is attempted even if the file never
 * showed up. */
while (access(bridge_address.sun_path, F_OK) && timeout > 0)
{
usleep(2500);
timeout--;
}
if (!access(bridge_address.sun_path, F_OK))
{
DEBUG_PRINT(DEBUGLEV_DEVELOP, Bridge socket file %s exists, bridge_address.sun_path);
}
/* Fresh retry budget for the connect loop. */
timeout = 1000;

res = connect(socket_fd, (struct sockaddr *) &bridge_address, address_length);
while (res && timeout > 0)
{
usleep(2500);
res = connect(socket_fd, (struct sockaddr *) &bridge_address, address_length);

/* Leave before decrementing so that a success in the last round is not
 * reported as a timeout below. */
if (res == 0)
{
break;
}

timeout--;
DEBUG_PRINT(DEBUGLEV_INFO, Still waiting for bridge socket %s for CPU %d..., filepath, cpu_id);
}

/* NOTE(review): the message below says 10 seconds, while each loop above
 * sleeps at most 1000 x 2500 us -- confirm the intended limit. */
if (timeout <= 0)
{
ERRNO_PRINT; /* should hopefully still work, as we make no syscalls in between. */
fprintf(stderr, "Exiting due to timeout: The bridge socket file at '%s' could not be\n", filepath);
fprintf(stderr, "opened within 10 seconds. Consult the error message above\n");
fprintf(stderr, "this to find out why. If the error is 'no such file or directoy',\n");
fprintf(stderr, "it usually means that the bridge socket filesystem isn't shared between\n");
fprintf(stderr, "the bridge and the client processes.\n");
free(filepath);
close(socket_fd);
return -1;
}
DEBUG_PRINT(DEBUGLEV_INFO, Successfully opened bridge socket %s to daemon for CPU %d, filepath, cpu_id);
free(filepath);

// request socket creation via the connected bridge
/* The request is a single int with value 1, sent in host byte order. */
io_buf = 1;
io_count = send(socket_fd, (char*) &io_buf, sizeof(io_buf), 0);

/* A short or failed send counts as an error. */
if (io_count != sizeof(io_buf)) {
ERROR_PRINT(Failed to send msg to the bridge socket)
close(socket_fd);
return -1;
}

/* The reply reuses io_buf: one int, read with a single recv() call. */
io_count = recv(socket_fd, (char*) &io_buf, sizeof(io_buf), 0);

if (io_count != sizeof(io_buf)) {
ERROR_PRINT(Failed to recv msg from the bridge socket)
close(socket_fd);
return -1;
}

/* The received int is taken as the daemon PID without further validation;
 * it determines the daemon socket path. */
daemon_pid = io_buf;
daemon_address->sun_family = AF_LOCAL;
snprintf(daemon_address->sun_path, sizeof(daemon_address->sun_path), TOSTRING(LIKWIDSOCKETBASE) "-%d", daemon_pid);

daemon_pids[cpu_id] = daemon_pid;
nr_daemons++;
return 0;
}

static int
access_client_daemon_connect(int cpu_id, struct sockaddr_un *address) {
int res = 0;
char* filepath;
size_t address_length;
int timeout = 1000;
int socket_fd = -1;

socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0);
if (socket_fd < 0)
{
ERROR_PRINT(socket() failed);
return -1;
}

address.sun_family = AF_LOCAL;
address_length = sizeof(address);
snprintf(address.sun_path, sizeof(address.sun_path), TOSTRING(LIKWIDSOCKETBASE) "-%d", pid);
filepath = strdup(address.sun_path);
DEBUG_PRINT(DEBUGLEV_DEVELOP, Waiting for socket file %s, address.sun_path);
while (access(address.sun_path, F_OK) && timeout > 0)
address_length = sizeof(struct sockaddr_un);
filepath = strdup(address->sun_path);
DEBUG_PRINT(DEBUGLEV_DEVELOP, Waiting for socket file %s, address->sun_path);
while (access(address->sun_path, F_OK) && timeout > 0)
{
usleep(2500);
timeout--;
}
if (!access(address.sun_path, F_OK))
if (!access(address->sun_path, F_OK))
{
DEBUG_PRINT(DEBUGLEV_DEVELOP, Socket file %s exists, address.sun_path);
DEBUG_PRINT(DEBUGLEV_DEVELOP, Socket file %s exists, address->sun_path);
}
timeout = 1000;

res = connect(socket_fd, (struct sockaddr *) &address, address_length);
res = connect(socket_fd, (struct sockaddr *) address, address_length);
while (res && timeout > 0)
{
usleep(2500);
res = connect(socket_fd, (struct sockaddr *) &address, address_length);
res = connect(socket_fd, (struct sockaddr *) address, address_length);

if (res == 0)
{
Expand All @@ -234,8 +334,29 @@ access_client_startDaemon(int cpu_id)
}
DEBUG_PRINT(DEBUGLEV_INFO, Successfully opened socket %s to daemon for CPU %d, filepath, cpu_id);
free(filepath);
daemon_pids[cpu_id] = pid;
nr_daemons++;
return socket_fd;
}

/*
 * Obtain an access daemon for cpu_id and connect to its socket.
 *
 * If the environment variable LIKWID_BRIDGE_PATH is set, the daemon is
 * requested through the bridge socket at that path; otherwise it is started
 * directly. Either helper fills in the daemon socket address.
 *
 * Returns the negative status of the start helper if it failed, otherwise
 * the result of access_client_daemon_connect() for the daemon address.
 */
static int
access_client_startDaemon(int cpu_id)
{
    struct sockaddr_un daemon_addr;
    const char *bridge = getenv("LIKWID_BRIDGE_PATH");
    int status = (bridge != NULL)
        ? access_client_startDaemon_bridge(cpu_id, bridge, &daemon_addr)
        : access_client_startDaemon_direct(cpu_id, &daemon_addr);

    if (status < 0)
    {
        return status;
    }

    return access_client_daemon_connect(cpu_id, &daemon_addr);
}

Expand Down
Loading

0 comments on commit 854160e

Please sign in to comment.