diff --git a/PILOTVERSION b/PILOTVERSION index bb737956..fe6e3d7f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.3.84 \ No newline at end of file +3.7.4.2 \ No newline at end of file diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 4f059612..4c84c373 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -178,6 +178,7 @@ class ErrorCodes: LEASETIME = 1375 LOGCREATIONTIMEOUT = 1376 CVMFSISNOTALIVE = 1377 + LSETUPTIMEDOUT = 1378 _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -317,7 +318,8 @@ class ErrorCodes: REMOTEFILEDICTDOESNOTEXIST: "Remote file open dictionary does not exist", LEASETIME: "Lease time is up", # internal use only LOGCREATIONTIMEOUT: "Log file creation timed out", - CVMFSISNOTALIVE: "CVMFS is not responding" + CVMFSISNOTALIVE: "CVMFS is not responding", + LSETUPTIMEDOUT: "Lsetup command timed out during remote file open" } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] @@ -435,34 +437,39 @@ def resolve_transform_error(self, exit_code: int, stderr: str) -> int: :param stderr: transform stderr (str) :return: pilot error code (int). """ - if exit_code and "Not mounting requested bind point" in stderr: - exit_code = self.SINGULARITYBINDPOINTFAILURE + error_map = { + "Not mounting requested bind point": self.SINGULARITYBINDPOINTFAILURE, + "No more available loop devices": self.SINGULARITYNOLOOPDEVICES, + "Failed to mount image": self.SINGULARITYIMAGEMOUNTFAILURE, + "error: while mounting": self.SINGULARITYIMAGEMOUNTFAILURE, + "Operation not permitted": self.SINGULARITYGENERALFAILURE, + "Failed to create user namespace": self.SINGULARITYFAILEDUSERNAMESPACE, + "Singularity is not installed": self.SINGULARITYNOTINSTALLED, + "Apptainer is not installed": self.APPTAINERNOTINSTALLED, + "cannot create directory": self.MKDIR, + "General payload setup verification error": self.SETUPFAILURE + } + + # Check if stderr contains any known error messages + for error_message, error_code in error_map.items(): + if error_message in stderr: + return error_code + + # Handle specific exit codes + if exit_code == 2: + return self.LSETUPTIMEDOUT + elif exit_code == 3: + return self.REMOTEFILEOPENTIMEDOUT elif exit_code == 251: - exit_code = self.UNKNOWNTRFFAILURE - elif exit_code and "No more available loop devices" in stderr: - exit_code = self.SINGULARITYNOLOOPDEVICES - elif exit_code and ("Failed to mount image" in stderr or "error: while mounting" in stderr): - exit_code = self.SINGULARITYIMAGEMOUNTFAILURE - elif exit_code and "Operation not permitted" in stderr: - exit_code = self.SINGULARITYGENERALFAILURE - elif exit_code and "Failed to create user namespace" in stderr: - exit_code = self.SINGULARITYFAILEDUSERNAMESPACE - elif "Singularity is not installed" in stderr: # exit code should be 64 but not always? - exit_code = self.SINGULARITYNOTINSTALLED - elif "Apptainer is not installed" in stderr: # exit code should be 64 but not always? - exit_code = self.APPTAINERNOTINSTALLED - elif exit_code == 64 and "cannot create directory" in stderr: - exit_code = self.MKDIR - elif exit_code and "General payload setup verification error" in stderr: - exit_code = self.SETUPFAILURE + return self.UNKNOWNTRFFAILURE elif exit_code == -1: - exit_code = self.UNKNOWNTRFFAILURE + return self.UNKNOWNTRFFAILURE elif exit_code == self.COMMANDTIMEDOUT: - pass + return exit_code elif exit_code != 0: - exit_code = self.PAYLOADEXECUTIONFAILURE + return self.PAYLOADEXECUTIONFAILURE - return exit_code + return exit_code # Return original exit code if no specific error is found def extract_stderr_error(self, stderr: str) -> str: """ diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 9216ba13..2bd8a56e 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -268,6 +268,8 @@ def open_remote_files(indata: list, workdir: str, nthreads: int) -> (int, str, l logger.warning(diagnostics) return 11, diagnostics, not_opened + # if execute_remote_file_open() returns exit code 1, it means general error. + # exit code 2 means that lsetup timed out, while 3 means that the python script (actual file open) timed out try: exitcode, stdout = execute_remote_file_open(path, timeout) except PilotException as exc: @@ -319,7 +321,7 @@ def get_timeout_for_remoteio(indata: list) -> int: """ remote_io = [fspec.status == 'remote_io' for fspec in indata] - return len(remote_io) * 30 + 600 + return len(remote_io) * 30 + 900 def parse_remotefileverification_dictionary(workdir: str) -> (int, str, list): diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index f4ff323b..e74b1ceb 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -833,6 +833,7 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str # Check for timeout (once per second) if time.time() - start_time > lsetup_timeout and not lsetup_completed: logger.warning("timeout for 'lsetup' exceeded - killing script") + exit_code = 2 # 'lsetup' timeout process.kill() break @@ -841,10 +842,11 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str output = process.stdout.readline() # Read bytes directly if output is not None: # Check if any output is available (not None) output = output.decode().strip() - logger.info(output) # Print output for monitoring + logger.info(f'remote file open: {output}') # Check for LSETUP_COMPLETED message if output == "LSETUP_COMPLETED": + logger.info('lsetup has completed (resetting start time)') lsetup_completed = True start_time = time.time() # Reset start time for 'python3' timeout @@ -861,18 +863,22 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str continue # Timeout for python script after LSETUP_COMPLETED - if lsetup_completed and time.time() - start_time > python_script_timeout: - logger.warning("timeout for 'python3' subscript exceeded - killing script") + if lsetup_completed and ((time.time() - start_time) > python_script_timeout): + logger.warning(f"timeout for 'python3' subscript exceeded - killing script " + f"({time.time()} - {start_time} > {python_script_timeout})") + exit_code = 3 # python script timeout process.kill() break # Check if script has completed normally return_code = process.poll() if return_code is not None: - logger.info("script execution completed with return code: {return_code}") + logger.info(f"script execution completed with return code: {return_code}") exit_code = return_code break + time.sleep(0.5) + # Ensure process is terminated if process.poll() is None: process.terminate() diff --git a/pilot/user/atlas/cvmfs.py b/pilot/user/atlas/cvmfs.py index 42212a33..03568b11 100644 --- a/pilot/user/atlas/cvmfs.py +++ b/pilot/user/atlas/cvmfs.py @@ -33,8 +33,6 @@ 'CVMFS_BASE/unpacked.cern.ch/logDir/lastUpdate', 'CVMFS_BASE/sft-nightlies.cern.ch/lcg/lastUpdate', ] -# when was the last cvmfs update? -last_update_file = '/cvmfs/sft.cern.ch/lcg/lastUpdate' def get_cvmfs_base_path() -> str: @@ -44,3 +42,12 @@ def get_cvmfs_base_path() -> str: :return: base path for CVMFS (str). """ return get_file_system_root_path() + + +def get_last_update_file() -> str: + """ + Return the last update file. + + :return: last update file (str). + """ + return f'{get_cvmfs_base_path()}/sft.cern.ch/lcg/lastUpdate' diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 19f8bf94..da65ded0 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -27,8 +27,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '84' # build number should be reset to '1' for every new development cycle +REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '2' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/cvmfs.py b/pilot/util/cvmfs.py index 23d9560b..a869d528 100644 --- a/pilot/util/cvmfs.py +++ b/pilot/util/cvmfs.py @@ -87,7 +87,11 @@ def get_last_update() -> int: """ pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__(f'pilot.user.{pilot_user}.cvmfs', globals(), locals(), [pilot_user], 0) - last_update_file = getattr(user, 'last_update_file', None) + try: + last_update_file = user.get_last_update_file() + except AttributeError: + last_update_file = None + timestamp = None if last_update_file: if os.path.exists(last_update_file):