diff --git a/.gitignore b/.gitignore
index 898f7a87b..73729215d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,7 @@ likwid-pin
 likwid-powermeter
 likwid-setFrequencies
 likwid-topology
+likwid-bridge
 likwid.lua
 
 # generated doc files
diff --git a/Makefile b/Makefile
index 718b3118f..41363118b 100644
--- a/Makefile
+++ b/Makefile
@@ -193,7 +193,11 @@ else
 ifeq ($(BUILDFREQ),false)
 all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF) $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET)
 else
+ifeq ($(CONTAINER_HELPER),false)
 all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF) $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET)
+else
+all: $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ) $(TARGET_LIB) $(FORTRAN_IF) $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET) $(APPDAEMON_TARGET) $(CONTAINER_HELPER_TARGET)
+endif
 endif
 endif
 
@@ -275,6 +279,11 @@ $(APPDAEMON_TARGET): $(SRC_DIR)/access-daemon/appDaemon.c $(TARGET_GOTCHA_LIB)
 	@echo "===> BUILD application interface likwid-appDaemon.so"
 	$(Q)$(MAKE) -C $(SRC_DIR)/access-daemon likwid-appDaemon.so
 
+# The container helper is a standalone program built from a single C source file.
+$(CONTAINER_HELPER_TARGET): $(SRC_DIR)/bridge/bridge.c
+	@echo "===> BUILD container helper likwid-bridge"
+	$(Q)$(CC) $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
+
 $(BUILD_DIR):
 	@mkdir $(BUILD_DIR)
 
@@ -365,7 +374,7 @@ clean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TARGET
 	@rm -f $(DYNAMIC_TARGET_LIB)*
 	@rm -f $(PINLIB)*
 	@rm -f $(FORTRAN_IF_NAME)
-	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET)
+	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET) $(CONTAINER_HELPER_TARGET)
 	@rm -f likwid-config.cmake
 
 distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TARGET)
@@ -378,7 +387,7 @@ distclean: $(TARGET_LUA_LIB) $(TARGET_HWLOC_LIB) $(TARGET_GOTCHA_LIB) $(BENCH_TA
 	@rm -f $(DYNAMIC_TARGET_LIB)*
 	@rm -f $(PINLIB)*
 	@rm -f $(FORTRAN_IF_NAME)
-	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET)
+	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET) $(APPDAEMON_TARGET) $(CONTAINER_HELPER_TARGET)
 	@rm -rf $(BUILD_DIR)
 	@if [ "$(LUA_INTERNAL)" = "true" ]; then rm -f $(TARGET_LUA_LIB).* $(shell basename $(TARGET_LUA_LIB)).*; fi
 	@if [ "$(USE_INTERNAL_HWLOC)" = "true" ]; then rm -f $(TARGET_HWLOC_LIB).* $(shell basename $(TARGET_HWLOC_LIB)).*; fi
@@ -488,7 +497,34 @@ uninstall_appdaemon_moved:
 	@echo "===> No UNINSTALL of the application interface appDaemon"
 endif
 
-install: install_daemon install_freq install_appdaemon
+# Install/move/uninstall rules for the container helper; the else branch provides no-op stubs under the same target names.
+ifeq ($(CONTAINER_HELPER),true)
+install_container_helper: $(CONTAINER_HELPER_TARGET)
+	@echo "===> INSTALL container helper likwid-bridge to $(SBINPREFIX)/likwid-bridge"
+	@mkdir -p $(SBINPREFIX)
+	@install -m 755 $(CONTAINER_HELPER_TARGET) $(SBINPREFIX)/likwid-bridge
+move_container_helper:
+	@echo "===> MOVE container helper likwid-bridge from $(SBINPREFIX)/likwid-bridge to $(INSTALLED_SBINPREFIX)/likwid-bridge"
+	@mkdir -p $(INSTALLED_SBINPREFIX)
+	@install -m 755 $(SBINPREFIX)/$(CONTAINER_HELPER_TARGET) $(INSTALLED_SBINPREFIX)/$(CONTAINER_HELPER_TARGET)
+uninstall_container_helper:
+	@echo "===> REMOVING container helper likwid-bridge from $(SBINPREFIX)/$(CONTAINER_HELPER_TARGET)"
+	@rm -f $(SBINPREFIX)/$(CONTAINER_HELPER_TARGET)
+uninstall_container_helper_moved:
+	@echo "===> REMOVING container helper likwid-bridge from $(INSTALLED_SBINPREFIX)/$(CONTAINER_HELPER_TARGET)"
+	@rm -f $(INSTALLED_SBINPREFIX)/$(CONTAINER_HELPER_TARGET)
+else
+install_container_helper:
+	@echo "===> No INSTALL of the container helper likwid-bridge"
+move_container_helper:
+	@echo "===> No MOVE of the container helper likwid-bridge"
+uninstall_container_helper:
+	@echo "===> No UNINSTALL of the container helper likwid-bridge"
+uninstall_container_helper_moved:
+	@echo "===> No UNINSTALL of the container helper likwid-bridge"
+endif
+
+install: install_daemon install_freq install_appdaemon install_container_helper
 	@echo "===> INSTALL applications to $(BINPREFIX)"
 	@mkdir -p $(BINPREFIX)
 	@chmod 755 $(BINPREFIX)
@@ -585,7 +621,7 @@ install: install_daemon install_freq install_appdaemon
 	@echo "===> INSTALL cmake to $(abspath $(PREFIX)/share/likwid)"
 	@install -m 644 $(PWD)/likwid-config.cmake $(PREFIX)/share/likwid
 
-move: move_daemon move_freq move_appdaemon
+move: move_daemon move_freq move_appdaemon move_container_helper
 	@echo "===> MOVE applications from $(BINPREFIX) to $(INSTALLED_BINPREFIX)"
 	@mkdir -p $(INSTALLED_BINPREFIX)
 	@chmod 755 $(INSTALLED_BINPREFIX)
@@ -656,7 +692,7 @@ move: move_daemon move_freq move_appdaemon
 		$(PREFIX)/share/likwid/likwid-config.cmake > $(INSTALLED_PREFIX)/share/likwid/likwid-config.cmake
 	@chmod 644 $(INSTALLED_PREFIX)/share/likwid/likwid-config.cmake
 
-uninstall: uninstall_daemon uninstall_freq uninstall_appdaemon
+uninstall: uninstall_daemon uninstall_freq uninstall_appdaemon uninstall_container_helper
 	@echo "===> REMOVING applications from $(PREFIX)/bin"
 	@rm -f $(addprefix $(BINPREFIX)/,$(addsuffix .lua,$(L_APPS)))
 	@for APP in $(L_APPS); do \
@@ -693,7 +729,7 @@ uninstall: uninstall_daemon uninstall_freq uninstall_appdaemon
 	@rm -rf $(PREFIX)/share/likwid/likwid-config.cmake
 	@rm -rf $(PREFIX)/share/likwid
 
-uninstall_moved: uninstall_daemon_moved uninstall_freq_moved uninstall_appdaemon_moved
+uninstall_moved: uninstall_daemon_moved uninstall_freq_moved uninstall_appdaemon_moved uninstall_container_helper_moved
 	@echo "===> REMOVING applications from $(INSTALLED_PREFIX)/bin"
 	@rm -f $(addprefix $(INSTALLED_BINPREFIX)/,$(addsuffix .lua,$(L_APPS)))
 	@for APP in $(L_APPS); do \
diff --git a/config.mk b/config.mk
index a4baefd27..3927e6a62 100644
--- a/config.mk
+++ b/config.mk
@@ -37,6 +37,9 @@ ROCM_INTERFACE = false#NO SPACE
 # Build experimental sysfeatures interface and Lua CLI application
 BUILD_SYSFEATURES = false#NO SPACE
 
+# Build container helper
+CONTAINER_HELPER = true#NO SPACE
+
################################################################# ################################################################# # Advanced configuration options # @@ -87,6 +90,10 @@ BUILDAPPDAEMON=true APPDAEMON = $(PREFIX)/lib/likwid-appDaemon.so#NO SPACE INSTALLED_APPDAEMON = $(INSTALLED_PREFIX)/lib/likwid-appDaemon.so#NO SPACE +# Build the container helper. +TMP_CONTAINER_HELPER = $(PREFIX)/sbin/likwid-bridge +INSTALLED_CONTAINER_HELPER = $(INSTALLED_PREFIX)/sbin/likwid-bridge + # chown installed tools to this user/group # if you change anything here, make sure that the user/group can access # the MSR devices and (on Intel) the PCI devices. diff --git a/make/config_defines.mk b/make/config_defines.mk index d8f31de76..dfeb739ff 100644 --- a/make/config_defines.mk +++ b/make/config_defines.mk @@ -302,6 +302,10 @@ DEFINES += -DLIKWID_WITH_ROCMON -D__HIP_PLATFORM_HCC__ BUILDAPPDAEMON = true endif +ifeq ($(CONTAINER_HELPER),true) + C_APPS += likwid-bridge + CONTAINER_HELPER_TARGET = likwid-bridge +endif ifeq ($(strip $(BUILDDAEMON)),true) ifneq ($(strip $(COMPILER)),MIC) DAEMON_TARGET = likwid-accessD diff --git a/src/access_client.c b/src/access_client.c index abdd16dd6..9cb1b4715 100644 --- a/src/access_client.c +++ b/src/access_client.c @@ -123,22 +123,15 @@ access_client_catch_signal() } static int -access_client_startDaemon(int cpu_id) +access_client_startDaemon_direct(int cpu_id, struct sockaddr_un *address) { /* Check the function of the daemon here */ - int res = 0; - char* filepath; char *newargv[] = { NULL }; char *newenv[] = { NULL }; char *safeexeprog = TOSTRING(ACCESSDAEMON); char exeprog[1024]; - struct sockaddr_un address; - size_t address_length; int ret; pid_t pid; - int timeout = 1000; - int socket_fd = -1; - int print_once = 0; if (config.daemonPath != NULL) { @@ -183,6 +176,115 @@ access_client_startDaemon(int cpu_id) return pid; } + address->sun_family = AF_LOCAL; + snprintf(address->sun_path, sizeof(address->sun_path), 
TOSTRING(LIKWIDSOCKETBASE) "-%d", pid); + + daemon_pids[cpu_id] = pid; + nr_daemons++; + return 0; +} + +static int +access_client_startDaemon_bridge(int cpu_id, const char *bridge_path, struct sockaddr_un *daemon_address) { + struct sockaddr_un bridge_address; + int socket_fd = -1; + int address_length; + char *filepath; + int timeout = 1000; + int res; + int io_buf; + long io_count; + int daemon_pid; + + bridge_address.sun_family = AF_LOCAL; + snprintf(bridge_address.sun_path, sizeof(bridge_address.sun_path), "%s", bridge_path); + + socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0); + if (socket_fd < 0) + { + ERROR_PRINT(socket() failed); + return -1; + } + + address_length = sizeof(struct sockaddr_un); + filepath = strdup(bridge_address.sun_path); + DEBUG_PRINT(DEBUGLEV_DEVELOP, Waiting for bridge socket file %s, bridge_address.sun_path); + while (access(bridge_address.sun_path, F_OK) && timeout > 0) + { + usleep(2500); + timeout--; + } + if (!access(bridge_address.sun_path, F_OK)) + { + DEBUG_PRINT(DEBUGLEV_DEVELOP, Bridge socket file %s exists, bridge_address.sun_path); + } + timeout = 1000; + + res = connect(socket_fd, (struct sockaddr *) &bridge_address, address_length); + while (res && timeout > 0) + { + usleep(2500); + res = connect(socket_fd, (struct sockaddr *) &bridge_address, address_length); + + if (res == 0) + { + break; + } + + timeout--; + DEBUG_PRINT(DEBUGLEV_INFO, Still waiting for bridge socket %s for CPU %d..., filepath, cpu_id); + } + + if (timeout <= 0) + { + ERRNO_PRINT; /* should hopefully still work, as we make no syscalls in between. */ + fprintf(stderr, "Exiting due to timeout: The bridge socket file at '%s' could not be\n", filepath); + fprintf(stderr, "opened within 10 seconds. Consult the error message above\n"); + fprintf(stderr, "this to find out why. 
If the error is 'no such file or directoy',\n"); + fprintf(stderr, "it usually means that the bridge socket filesystem isn't shared between\n"); + fprintf(stderr, "the bridge and the client processes.\n"); + free(filepath); + close(socket_fd); + return -1; + } + DEBUG_PRINT(DEBUGLEV_INFO, Successfully opened bridge socket %s to daemon for CPU %d, filepath, cpu_id); + free(filepath); + + // request socket creation via the connected bridge + io_buf = 1; + io_count = send(socket_fd, (char*) &io_buf, sizeof(io_buf), 0); + + if (io_count != sizeof(io_buf)) { + ERROR_PRINT(Failed to send msg to the bridge socket) + close(socket_fd); + return -1; + } + + io_count = recv(socket_fd, (char*) &io_buf, sizeof(io_buf), 0); + + if (io_count != sizeof(io_buf)) { + ERROR_PRINT(Failed to recv msg from the bridge socket) + close(socket_fd); + return -1; + } + + daemon_pid = io_buf; + daemon_address->sun_family = AF_LOCAL; + snprintf(daemon_address->sun_path, sizeof(daemon_address->sun_path), TOSTRING(LIKWIDSOCKETBASE) "-%d", daemon_pid); + + daemon_pids[cpu_id] = daemon_pid; + nr_daemons++; + return 0; +} + +static int +access_client_daemon_connect(int cpu_id, struct sockaddr_un *address) { + int res = 0; + char* filepath; + size_t address_length; + int timeout = 1000; + int socket_fd = -1; + socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0); if (socket_fd < 0) { @@ -190,27 +292,25 @@ access_client_startDaemon(int cpu_id) return -1; } - address.sun_family = AF_LOCAL; - address_length = sizeof(address); - snprintf(address.sun_path, sizeof(address.sun_path), TOSTRING(LIKWIDSOCKETBASE) "-%d", pid); - filepath = strdup(address.sun_path); - DEBUG_PRINT(DEBUGLEV_DEVELOP, Waiting for socket file %s, address.sun_path); - while (access(address.sun_path, F_OK) && timeout > 0) + address_length = sizeof(struct sockaddr_un); + filepath = strdup(address->sun_path); + DEBUG_PRINT(DEBUGLEV_DEVELOP, Waiting for socket file %s, address->sun_path); + while (access(address->sun_path, F_OK) && timeout > 0) { 
usleep(2500); timeout--; } - if (!access(address.sun_path, F_OK)) + if (!access(address->sun_path, F_OK)) { - DEBUG_PRINT(DEBUGLEV_DEVELOP, Socket file %s exists, address.sun_path); + DEBUG_PRINT(DEBUGLEV_DEVELOP, Socket file %s exists, address->sun_path); } timeout = 1000; - res = connect(socket_fd, (struct sockaddr *) &address, address_length); + res = connect(socket_fd, (struct sockaddr *) address, address_length); while (res && timeout > 0) { usleep(2500); - res = connect(socket_fd, (struct sockaddr *) &address, address_length); + res = connect(socket_fd, (struct sockaddr *) address, address_length); if (res == 0) { @@ -234,8 +334,29 @@ access_client_startDaemon(int cpu_id) } DEBUG_PRINT(DEBUGLEV_INFO, Successfully opened socket %s to daemon for CPU %d, filepath, cpu_id); free(filepath); - daemon_pids[cpu_id] = pid; - nr_daemons++; + return socket_fd; +} + +static int +access_client_startDaemon(int cpu_id) { + const char *bridge_path; + struct sockaddr_un address; + int daemon_ret_code; + int socket_fd; + + bridge_path = getenv("LIKWID_BRIDGE_PATH"); + + if (!bridge_path) { + daemon_ret_code = access_client_startDaemon_direct(cpu_id, &address); + } else { + daemon_ret_code = access_client_startDaemon_bridge(cpu_id, bridge_path, &address); + } + + if (daemon_ret_code < 0) { + return daemon_ret_code; + } + + socket_fd = access_client_daemon_connect(cpu_id, &address); return socket_fd; } diff --git a/src/bridge/bridge.c b/src/bridge/bridge.c new file mode 100644 index 000000000..dd2c04837 --- /dev/null +++ b/src/bridge/bridge.c @@ -0,0 +1,137 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include + +#ifdef __GNUC__ +#define NORETURN_ATTR __attribute__((noreturn)) +#endif +#ifdef __GNUG__ +#define NORETURN_ATTR [[noreturn]] +#endif + +NORETURN_ATTR void access_daemon_main() { + char *argv[1] = {NULL}; + int ret = execvp("likwid-accessD", argv); + if (ret < 0) { + printf("Failed to start likwid-accessD 
daemon\n", ret); + } + exit(ret); +} + +int create_access_daemon() { + int pid = fork(); + if (pid == 0) access_daemon_main(); + else return pid; +} + +int create_bridge_socket(int id) { + struct sockaddr_un sock_addr; + + sock_addr.sun_family = AF_LOCAL; + snprintf(sock_addr.sun_path, sizeof(sock_addr.sun_path), "/tmp/likwid-bridge-%d", id); + + int socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0); + if (socket_fd < 0) { + printf("Failed to start the bridge socket\n"); + return -1; + } + + int ret = bind(socket_fd, (const struct sockaddr *) &sock_addr, sizeof(sock_addr)); + if (ret < 0) { + printf("Failed to bind() to the bridge socket\n"); + return -1; + } + + ret = listen(socket_fd, 128); + if (ret < 0) { + printf("Failed to listen() on the bridge socket\n"); + return -1; + } + + return socket_fd; +} + +NORETURN_ATTR void bridge_daemon_main(int socket_fd) { + int io_buf; + while (1) { + int conn_fd = accept(socket_fd, NULL, NULL); + + if (conn_fd < 0) { + printf("Failed to accept a bridge connection\n"); + exit(-1); + } + + long io_count = recv(conn_fd, (char *) &io_buf, sizeof(io_buf), 0); + if (io_count != sizeof(io_buf)) { + printf("Failed to recv from the bridge socket\n"); + exit(-1); + } + + switch (io_buf) { + case 1: { + int daemon_pid = create_access_daemon(); + + io_buf = daemon_pid; + io_count = send(conn_fd, (char *) &io_buf, sizeof(io_buf), 0); + if (io_count != sizeof(io_buf)) { + printf("Failed to send from the bridge socket\n"); + close(conn_fd); + close(socket_fd); + exit(-1); + } + + break; + } + default: { + printf("Unknown bridge command: %d. 
Ignoring.\n", io_buf); + close(conn_fd); + break; + } + } + } +} + +int create_bridge_daemon(int id) { + int socket_fd = create_bridge_socket(id); + if (socket_fd < 0) return socket_fd; + + int pid = fork(); + if (pid == 0) { + bridge_daemon_main(socket_fd); + } else return pid; +} + +int main(int argc, char *const *argv) { + int id = getpid(); + int bridge_pid = create_bridge_daemon(id); + if (bridge_pid < 0) return bridge_pid; + + char env_var[128]; + snprintf(env_var, sizeof(env_var), "LIKWID_BRIDGE_PATH=/tmp/likwid-bridge-%d", id); + char *envp[2] = {env_var, NULL}; + + int child_pid = fork(); + if (child_pid == 0) { + if (argc == 1) { + char *child_argv[2] = {"-i", NULL}; + int ret = execve("/bin/bash", child_argv, envp); + if (ret < 0) printf("Failed to invoke bash\n"); + } else { + const char *child_path = argv[1]; + char *const *child_argv = argv + 1; + int ret = execvpe(child_path, child_argv, envp); + if (ret < 0) printf("Failed to invoke %s\n", child_path); + } + } else { + waitpid(child_pid, NULL, 0); + } + + kill(bridge_pid, SIGKILL); +}