Skip to content

Commit

Permalink
Merge pull request #122 from PalNilsson/next
Browse files Browse the repository at this point in the history
3.7.4.2
  • Loading branch information
PalNilsson authored May 7, 2024
2 parents 4b75cba + 612f379 commit 2dc71c3
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 35 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.7.3.84
3.7.4.2
55 changes: 31 additions & 24 deletions pilot/common/errorcodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ class ErrorCodes:
LEASETIME = 1375
LOGCREATIONTIMEOUT = 1376
CVMFSISNOTALIVE = 1377
LSETUPTIMEDOUT = 1378

_error_messages = {
GENERALERROR: "General pilot error, consult batch log",
Expand Down Expand Up @@ -317,7 +318,8 @@ class ErrorCodes:
REMOTEFILEDICTDOESNOTEXIST: "Remote file open dictionary does not exist",
LEASETIME: "Lease time is up", # internal use only
LOGCREATIONTIMEOUT: "Log file creation timed out",
CVMFSISNOTALIVE: "CVMFS is not responding"
CVMFSISNOTALIVE: "CVMFS is not responding",
LSETUPTIMEDOUT: "Lsetup command timed out during remote file open"
}

put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181]
Expand Down Expand Up @@ -435,34 +437,39 @@ def resolve_transform_error(self, exit_code: int, stderr: str) -> int:
:param stderr: transform stderr (str)
:return: pilot error code (int).
"""
if exit_code and "Not mounting requested bind point" in stderr:
exit_code = self.SINGULARITYBINDPOINTFAILURE
error_map = {
"Not mounting requested bind point": self.SINGULARITYBINDPOINTFAILURE,
"No more available loop devices": self.SINGULARITYNOLOOPDEVICES,
"Failed to mount image": self.SINGULARITYIMAGEMOUNTFAILURE,
"error: while mounting": self.SINGULARITYIMAGEMOUNTFAILURE,
"Operation not permitted": self.SINGULARITYGENERALFAILURE,
"Failed to create user namespace": self.SINGULARITYFAILEDUSERNAMESPACE,
"Singularity is not installed": self.SINGULARITYNOTINSTALLED,
"Apptainer is not installed": self.APPTAINERNOTINSTALLED,
"cannot create directory": self.MKDIR,
"General payload setup verification error": self.SETUPFAILURE
}

# Check if stderr contains any known error messages
for error_message, error_code in error_map.items():
if error_message in stderr:
return error_code

# Handle specific exit codes
if exit_code == 2:
return self.LSETUPTIMEDOUT
elif exit_code == 3:
return self.REMOTEFILEOPENTIMEDOUT
elif exit_code == 251:
exit_code = self.UNKNOWNTRFFAILURE
elif exit_code and "No more available loop devices" in stderr:
exit_code = self.SINGULARITYNOLOOPDEVICES
elif exit_code and ("Failed to mount image" in stderr or "error: while mounting" in stderr):
exit_code = self.SINGULARITYIMAGEMOUNTFAILURE
elif exit_code and "Operation not permitted" in stderr:
exit_code = self.SINGULARITYGENERALFAILURE
elif exit_code and "Failed to create user namespace" in stderr:
exit_code = self.SINGULARITYFAILEDUSERNAMESPACE
elif "Singularity is not installed" in stderr: # exit code should be 64 but not always?
exit_code = self.SINGULARITYNOTINSTALLED
elif "Apptainer is not installed" in stderr: # exit code should be 64 but not always?
exit_code = self.APPTAINERNOTINSTALLED
elif exit_code == 64 and "cannot create directory" in stderr:
exit_code = self.MKDIR
elif exit_code and "General payload setup verification error" in stderr:
exit_code = self.SETUPFAILURE
return self.UNKNOWNTRFFAILURE
elif exit_code == -1:
exit_code = self.UNKNOWNTRFFAILURE
return self.UNKNOWNTRFFAILURE
elif exit_code == self.COMMANDTIMEDOUT:
pass
return exit_code
elif exit_code != 0:
exit_code = self.PAYLOADEXECUTIONFAILURE
return self.PAYLOADEXECUTIONFAILURE

return exit_code
return exit_code # Return original exit code if no specific error is found

def extract_stderr_error(self, stderr: str) -> str:
"""
Expand Down
4 changes: 3 additions & 1 deletion pilot/user/atlas/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ def open_remote_files(indata: list, workdir: str, nthreads: int) -> (int, str, l
logger.warning(diagnostics)
return 11, diagnostics, not_opened

# if execute_remote_file_open() returns exit code 1, it means that a general error occurred;
# exit code 2 means that lsetup timed out, while 3 means that the python script (actual file open) timed out
try:
exitcode, stdout = execute_remote_file_open(path, timeout)
except PilotException as exc:
Expand Down Expand Up @@ -319,7 +321,7 @@ def get_timeout_for_remoteio(indata: list) -> int:
"""
remote_io = [fspec.status == 'remote_io' for fspec in indata]

return len(remote_io) * 30 + 600
return len(remote_io) * 30 + 900


def parse_remotefileverification_dictionary(workdir: str) -> (int, str, list):
Expand Down
14 changes: 10 additions & 4 deletions pilot/user/atlas/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,7 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str
# Check for timeout (once per second)
if time.time() - start_time > lsetup_timeout and not lsetup_completed:
logger.warning("timeout for 'lsetup' exceeded - killing script")
exit_code = 2 # 'lsetup' timeout
process.kill()
break

Expand All @@ -841,10 +842,11 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str
output = process.stdout.readline() # Read bytes directly
if output is not None: # Check if any output is available (not None)
output = output.decode().strip()
logger.info(output) # Print output for monitoring
logger.info(f'remote file open: {output}')

# Check for LSETUP_COMPLETED message
if output == "LSETUP_COMPLETED":
logger.info('lsetup has completed (resetting start time)')
lsetup_completed = True
start_time = time.time() # Reset start time for 'python3' timeout

Expand All @@ -861,18 +863,22 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str
continue

# Timeout for python script after LSETUP_COMPLETED
if lsetup_completed and time.time() - start_time > python_script_timeout:
logger.warning("timeout for 'python3' subscript exceeded - killing script")
if lsetup_completed and ((time.time() - start_time) > python_script_timeout):
logger.warning(f"timeout for 'python3' subscript exceeded - killing script "
f"({time.time()} - {start_time} > {python_script_timeout})")
exit_code = 3 # python script timeout
process.kill()
break

# Check if script has completed normally
return_code = process.poll()
if return_code is not None:
logger.info("script execution completed with return code: {return_code}")
logger.info(f"script execution completed with return code: {return_code}")
exit_code = return_code
break

time.sleep(0.5)

# Ensure process is terminated
if process.poll() is None:
process.terminate()
Expand Down
11 changes: 9 additions & 2 deletions pilot/user/atlas/cvmfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@
'CVMFS_BASE/unpacked.cern.ch/logDir/lastUpdate',
'CVMFS_BASE/sft-nightlies.cern.ch/lcg/lastUpdate',
]
# when was the last cvmfs update?
last_update_file = '/cvmfs/sft.cern.ch/lcg/lastUpdate'


def get_cvmfs_base_path() -> str:
Expand All @@ -44,3 +42,12 @@ def get_cvmfs_base_path() -> str:
:return: base path for CVMFS (str).
"""
return get_file_system_root_path()


def get_last_update_file() -> str:
    """
    Return the path to the 'lastUpdate' file below the CVMFS base path.

    The base path is taken from get_cvmfs_base_path(), and the fixed relative
    location 'sft.cern.ch/lcg/lastUpdate' is appended to it.

    :return: path to the last update file (str).
    """
    cvmfs_base = get_cvmfs_base_path()

    return f'{cvmfs_base}/sft.cern.ch/lcg/lastUpdate'
4 changes: 2 additions & 2 deletions pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
# Pilot version
RELEASE = '3' # released number should be fixed at 3 for Pilot 3
VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '84' # build number should be reset to '1' for every new development cycle
REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '2' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down
6 changes: 5 additions & 1 deletion pilot/util/cvmfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ def get_last_update() -> int:
"""
pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
user = __import__(f'pilot.user.{pilot_user}.cvmfs', globals(), locals(), [pilot_user], 0)
last_update_file = getattr(user, 'last_update_file', None)
try:
last_update_file = user.get_last_update_file()
except AttributeError:
last_update_file = None

timestamp = None
if last_update_file:
if os.path.exists(last_update_file):
Expand Down

0 comments on commit 2dc71c3

Please sign in to comment.