Skip to content

Commit

Permalink
do not use the JEL for completions, as the statistics are incomplete (#405)
Browse files Browse the repository at this point in the history

* do not use the JEL for completions, as the statistics are incomplete

* <bot> update requirements-docs.txt

* <bot> update requirements-tests.txt

* <bot> update requirements.txt

* fix tests

* remove some ads from the JEL, as they aren't used

* make flake8 happy

* <bot> update requirements-docs.txt

* <bot> update requirements-tests.txt

* <bot> update requirements.txt

---------

Co-authored-by: github-actions <[email protected]>
  • Loading branch information
dsschult and github-actions authored Nov 20, 2024
1 parent 02f426f commit 0122974
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 41 deletions.
11 changes: 10 additions & 1 deletion iceprod/server/plugins/condor.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ class CondorSubmit:

_GENERIC_ADS = ['Iwd', 'IceProdDatasetId', 'IceProdTaskId', 'IceProdTaskInstanceId', 'MATCH_EXP_JOBGLIDEIN_ResourceName']
AD_INFO = [
'RemotePool', 'RemoteHost', 'RemoteWallClockTime', 'ResidentSetSize_RAW', 'DiskUsage_RAW',
'RemotePool', 'RemoteHost',
'HoldReason', 'RemoveReason', 'Reason', 'MachineAttrGLIDEIN_Site0',
] + _GENERIC_ADS
AD_PROJECTION_QUEUE = ['JobStatus', 'RemotePool', 'RemoteHost'] + _GENERIC_ADS
Expand Down Expand Up @@ -681,6 +681,10 @@ async def wait(self, timeout):
if type_ == htcondor.JobEventType.JOB_TERMINATED:
logger.info("job %s %s.%s exited on its own", job_id, job.dataset_id, job.task_id)

# there's a bug where not all the classads are updated before the event fires
# so ignore this and let the cross-check take care of it
continue

# get stats
cpu = event.get('CpusUsage', None)
gpu = event.get('GpusUsage', None)
Expand Down Expand Up @@ -734,6 +738,11 @@ async def wait(self, timeout):
job.status = JobStatus.FAILED
reason = event.get('Reason', None)
logger.info("job %s %s.%s removed: %r", job_id, job.dataset_id, job.task_id, reason)

# there's a bug where not all the classads are updated before the event fires
# so ignore this and let the cross-check take care of it
continue

await self.finish(job_id, success=False, reason=reason)

else:
Expand Down
14 changes: 7 additions & 7 deletions requirements-docs.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ attrs==24.2.0
# referencing
babel==2.16.0
# via sphinx
boto3==1.35.61
boto3==1.35.66
# via iceprod (setup.py)
botocore==1.35.61
botocore==1.35.66
# via
# boto3
# s3transfer
Expand Down Expand Up @@ -52,7 +52,7 @@ h11==0.14.0
# via httpcore
htcondor==24.1.1
# via iceprod (setup.py)
httpcore==1.0.6
httpcore==1.0.7
# via httpx
httpx==0.27.2
# via iceprod (setup.py)
Expand Down Expand Up @@ -89,7 +89,7 @@ pycparser==2.22
# via cffi
pygments==2.18.0
# via sphinx
pyjwt[crypto]==2.9.0
pyjwt[crypto]==2.10.0
# via wipac-rest-tools
pymongo==4.9.2
# via
Expand Down Expand Up @@ -117,7 +117,7 @@ requests==2.32.3
# sphinx
# wipac-dev-tools
# wipac-rest-tools
requests-futures==1.0.1
requests-futures==1.0.2
# via
# iceprod (setup.py)
# wipac-rest-tools
Expand All @@ -127,9 +127,9 @@ rpds-py==0.21.0
# via
# jsonschema
# referencing
s3transfer==0.10.3
s3transfer==0.10.4
# via boto3
setproctitle==1.3.3
setproctitle==1.3.4
# via iceprod (setup.py)
six==1.16.0
# via python-dateutil
Expand Down
18 changes: 9 additions & 9 deletions requirements-tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ attrs==24.2.0
# referencing
beautifulsoup4==4.12.3
# via iceprod (setup.py)
boto3==1.35.61
boto3==1.35.66
# via
# iceprod (setup.py)
# moto
botocore==1.35.61
botocore==1.35.66
# via
# boto3
# moto
Expand All @@ -38,7 +38,7 @@ cffi==1.17.1
# via cryptography
charset-normalizer==3.4.0
# via requests
coverage[toml]==7.6.5
coverage[toml]==7.6.7
# via
# iceprod (setup.py)
# pytest-cov
Expand All @@ -62,7 +62,7 @@ h11==0.14.0
# via httpcore
htcondor==24.1.1
# via iceprod (setup.py)
httpcore==1.0.6
httpcore==1.0.7
# via httpx
httpx==0.27.2
# via
Expand Down Expand Up @@ -95,7 +95,7 @@ mccabe==0.7.0
# via flake8
mock==5.1.0
# via iceprod (setup.py)
moto[s3]==5.0.20
moto[s3]==5.0.21
# via iceprod (setup.py)
motor==3.6.0
# via iceprod (setup.py)
Expand All @@ -115,7 +115,7 @@ pycparser==2.22
# via cffi
pyflakes==3.2.0
# via flake8
pyjwt[crypto]==2.9.0
pyjwt[crypto]==2.10.0
# via wipac-rest-tools
pymongo==4.9.2
# via
Expand Down Expand Up @@ -161,7 +161,7 @@ requests==2.32.3
# responses
# wipac-dev-tools
# wipac-rest-tools
requests-futures==1.0.1
requests-futures==1.0.2
# via
# iceprod (setup.py)
# wipac-rest-tools
Expand All @@ -177,9 +177,9 @@ rpds-py==0.21.0
# via
# jsonschema
# referencing
s3transfer==0.10.3
s3transfer==0.10.4
# via boto3
setproctitle==1.3.3
setproctitle==1.3.4
# via iceprod (setup.py)
six==1.16.0
# via python-dateutil
Expand Down
14 changes: 7 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ attrs==24.2.0
# via
# jsonschema
# referencing
boto3==1.35.61
boto3==1.35.66
# via iceprod (setup.py)
botocore==1.35.61
botocore==1.35.66
# via
# boto3
# s3transfer
Expand Down Expand Up @@ -46,7 +46,7 @@ h11==0.14.0
# via httpcore
htcondor==24.1.1
# via iceprod (setup.py)
httpcore==1.0.6
httpcore==1.0.7
# via httpx
httpx==0.27.2
# via iceprod (setup.py)
Expand All @@ -73,7 +73,7 @@ pyasn1==0.6.1
# via ldap3
pycparser==2.22
# via cffi
pyjwt[crypto]==2.9.0
pyjwt[crypto]==2.10.0
# via wipac-rest-tools
pymongo==4.9.2
# via
Expand All @@ -100,7 +100,7 @@ requests==2.32.3
# requests-toolbelt
# wipac-dev-tools
# wipac-rest-tools
requests-futures==1.0.1
requests-futures==1.0.2
# via
# iceprod (setup.py)
# wipac-rest-tools
Expand All @@ -110,9 +110,9 @@ rpds-py==0.21.0
# via
# jsonschema
# referencing
s3transfer==0.10.3
s3transfer==0.10.4
# via boto3
setproctitle==1.3.3
setproctitle==1.3.4
# via iceprod (setup.py)
six==1.16.0
# via python-dateutil
Expand Down
34 changes: 17 additions & 17 deletions tests/server/plugins/condor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,24 +488,24 @@ async def test_Grid_wait_JEL(schedd, i3prod_path, set_time):

await g.wait(timeout=0)

assert len(g.jobs) == 7
#assert len(g.jobs) == 7

assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=0)].dataset_id == '4ksd8'
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=0)].task_id == 'lnk3f'
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=0)].submit_dir == Path('/scratch/dschultz')
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=0)].status == JobStatus.COMPLETED
#assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=0)].status == JobStatus.COMPLETED

assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=1)].status == JobStatus.COMPLETED
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=2)].status == JobStatus.COMPLETED
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=3)].status == JobStatus.FAILED
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=4)].status == JobStatus.FAILED
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=5)].status == JobStatus.COMPLETED
assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=6)].status == JobStatus.FAILED
#assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=1)].status == JobStatus.COMPLETED
#assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=2)].status == JobStatus.COMPLETED
#assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=3)].status == JobStatus.FAILED
#assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=4)].status == JobStatus.FAILED
#assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=5)].status == JobStatus.COMPLETED
#assert g.jobs[CondorJobId(cluster_id=110828038, proc_id=6)].status == JobStatus.FAILED

assert g.task_idle.call_count == 1
assert g.task_processing.call_count == 7
assert g.task_reset.call_count == 0
assert g.finish.call_count == 6
#assert g.finish.call_count == 6


async def test_Grid_wait_JEL_finish(schedd, i3prod_path, set_time):
Expand All @@ -527,14 +527,14 @@ async def test_Grid_wait_JEL_finish(schedd, i3prod_path, set_time):

await g.wait(timeout=0)

assert len(g.jobs) == 1
assert list(g.jobs.keys()) == [CondorJobId(cluster_id=110828038, proc_id=4)]
#assert len(g.jobs) == 1
assert CondorJobId(cluster_id=110828038, proc_id=4) in g.jobs

assert g.task_idle.call_count == 1
assert g.task_processing.call_count == 7
assert g.task_reset.call_count == 0
assert g.task_failure.call_count == 2
assert g.task_success.call_count == 4
#assert g.task_failure.call_count == 2
#assert g.task_success.call_count == 4


async def test_Grid_wait_JEL_exception(schedd, i3prod_path, set_time):
Expand All @@ -556,14 +556,14 @@ async def test_Grid_wait_JEL_exception(schedd, i3prod_path, set_time):

await g.wait(timeout=0)

assert len(g.jobs) == 1
assert list(g.jobs.keys()) == [CondorJobId(cluster_id=110828038, proc_id=4)]
#assert len(g.jobs) == 1
assert CondorJobId(cluster_id=110828038, proc_id=4) in g.jobs

assert g.task_idle.call_count == 1
assert g.task_processing.call_count == 7
assert g.task_reset.call_count == 0
assert g.task_failure.call_count == 2
assert g.task_success.call_count == 4
#assert g.task_failure.call_count == 2
#assert g.task_success.call_count == 4


async def test_Grid_wait_JEL_reprocess(schedd, i3prod_path, set_time):
Expand Down

0 comments on commit 0122974

Please sign in to comment.