Skip to content

Commit

Permalink
Reverted cpu monitoring, removed threshold which is useless
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Nilsson committed Nov 25, 2024
1 parent 5113a06 commit 6503d2e
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 11 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.9.2.29
3.9.2.30
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
RELEASE = '3' # release number should be fixed at 3 for Pilot 3
VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '29' # build number should be reset to '1' for every new development cycle
BUILD = '30' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down
4 changes: 3 additions & 1 deletion pilot/util/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,14 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
else:
_cpuconsumptiontime = int(round(cpuconsumptiontime))
if _cpuconsumptiontime > 0:
# make sure there are no sudden jumps in the cpuconsumptiontime
#factor = _cpuconsumptiontime / job.cpuconsumptiontime
job.cpuconsumptiontime = int(round(cpuconsumptiontime))
job.cpuconversionfactor = 1.0
logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})')
elif _cpuconsumptiontime == -1:
logger.warning('could not get CPU consumption time')
elif _cpuconsumptiontime == 0.0:
elif _cpuconsumptiontime == 0:
logger.warning(f'process {job.pid} can no longer be monitored (due to stat problems) - aborting')
return 0, ""
else:
Expand Down
10 changes: 2 additions & 8 deletions pilot/util/processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ def get_cpu_consumption_time(t0: tuple) -> float:
def get_instant_cpu_consumption_time(pid: int) -> float:
"""
Return the CPU consumption time (system+user time) for a given process, by parsing /proc/pid/stat.
Note 1: the function returns 0.0 if the pid is not set.
Note 2: the function must sum up all the user+system times for both the main process (pid) and the child
processes, since the main process is most likely spawning new processes.
Expand All @@ -574,21 +575,14 @@ def get_instant_cpu_consumption_time(pid: int) -> float:
if os.path.exists(path):
try:
with open(path, "r", encoding="utf-8") as fp:
_read = fp.read()
fields = _read.split(' ')[13:17]
fields = fp.read().split(' ')[13:17]
utime, stime, cutime, cstime = [(float(f) / hz) for f in fields]
except IOError as exc:
logger.warning(f'exception caught: {exc} (ignored)')

if utime and stime and cutime and cstime:
# sum up all the user+system times for both the main process (pid) and the child processes
cpu_consumption_time = utime + stime + cutime + cstime
max_threshold = 1e6
if cpu_consumption_time > max_threshold:
logger.warning(f'CPU consumption time={cpu_consumption_time} for pid={pid} exceeds sanity threshold={max_threshold} (reset to 1.0)')
logger.warning(f"utime={utime} stime={stime} cutime={cutime} cstime={cstime} hz={hz}")
logger.warning(f"fp.read()={_read}")
cpu_consumption_time = 1.0
else:
cpu_consumption_time = 0.0

Expand Down

0 comments on commit 6503d2e

Please sign in to comment.