Skip to content

Commit 757acbe

Browse files
committed
Retrieve exec_hosts from bjobs
Propagate exec_hosts to scheduler
1 parent 8ff7299 commit 757acbe

File tree

9 files changed

+38
-4
lines changed

9 files changed

+38
-4
lines changed

src/_ert/events.py

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ class RealizationBaseEvent(BaseEvent):
110110
real: str
111111
ensemble: Union[str, None] = None
112112
queue_event_type: Union[str, None] = None
113+
exec_hosts: Union[str, None] = None
113114

114115

115116
class RealizationPending(RealizationBaseEvent):

src/ert/ensemble_evaluator/snapshot.py

+7
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ def update_realization(
252252
status: str,
253253
start_time: Optional[datetime] = None,
254254
end_time: Optional[datetime] = None,
255+
exec_hosts: Optional[str] = None,
255256
callback_status_message: Optional[str] = None,
256257
) -> "EnsembleSnapshot":
257258
self._realization_snapshots[real_id].update(
@@ -260,6 +261,7 @@ def update_realization(
260261
status=status,
261262
start_time=start_time,
262263
end_time=end_time,
264+
exec_hosts=exec_hosts,
263265
callback_status_message=callback_status_message,
264266
)
265267
)
@@ -279,10 +281,12 @@ def update_from_event(
279281
status = _FM_TYPE_EVENT_TO_STATUS[type(event)]
280282
start_time = None
281283
end_time = None
284+
exec_hosts = None
282285
callback_status_message = None
283286

284287
if e_type is RealizationRunning:
285288
start_time = convert_iso8601_to_datetime(timestamp)
289+
exec_hosts = event.exec_hosts
286290
elif e_type in {
287291
RealizationSuccess,
288292
RealizationFailed,
@@ -296,6 +300,7 @@ def update_from_event(
296300
status,
297301
start_time,
298302
end_time,
303+
exec_hosts,
299304
callback_status_message,
300305
)
301306

@@ -397,6 +402,7 @@ class RealizationSnapshot(TypedDict, total=False):
397402
active: Optional[bool]
398403
start_time: Optional[datetime]
399404
end_time: Optional[datetime]
405+
exec_hosts: Optional[str]
400406
fm_steps: Dict[str, FMStepSnapshot]
401407
callback_status_message: Optional[str]
402408

@@ -411,6 +417,7 @@ def _realization_dict_to_realization_snapshot(
411417
end_time=source.get("end_time"),
412418
callback_status_message=source.get("callback_status_message"),
413419
fm_steps=source.get("fm_steps", {}),
420+
exec_hosts=source.get("exec_hosts"),
414421
)
415422
return _filter_nones(realization)
416423

src/ert/gui/model/node.py

+1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ class RealNodeData:
7474
real_status_color: Optional[QColor] = None
7575
current_memory_usage: Optional[int] = None
7676
max_memory_usage: Optional[int] = None
77+
exec_hosts: Optional[str] = None
7778
stderr: Optional[str] = None
7879
callback_status_message: Optional[str] = None
7980

src/ert/gui/model/snapshot.py

+2
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ def _update_snapshot(self, snapshot: EnsembleSnapshot, iter_: str) -> None:
168168
data = real_node.data
169169
if real_status := real.get("status"):
170170
data.status = real_status
171+
if real_exec_hosts := real.get("exec_hosts"):
172+
data.exec_hosts = real_exec_hosts
171173
for real_fm_step_id, color in (
172174
metadata["aggr_fm_step_status_colors"].get(real_id, {}).items()
173175
):

src/ert/gui/simulation/run_dialog.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -335,10 +335,21 @@ def on_snapshot_new_iteration(
335335
def _select_real(self, index: QModelIndex) -> None:
336336
real = index.row()
337337
iter_ = index.model().get_iter() # type: ignore
338+
exec_hosts = None
339+
340+
iter_node = self._snapshot_model.root.children.get(str(iter_), None)
341+
if iter_node:
342+
real_node = iter_node.children.get(str(real), None)
343+
if real_node:
344+
exec_hosts = real_node.data.exec_hosts
345+
338346
self._fm_step_overview.set_realization(iter_, real)
339-
self._fm_step_label.setText(
347+
text = (
340348
f"Realization id {index.data(RealIens)} in iteration {index.data(IterNum)}"
341349
)
350+
if exec_hosts and exec_hosts != "-":
351+
text += f", assigned to host: [{exec_hosts}]"
352+
self._fm_step_label.setText(text)
342353

343354
def closeEvent(self, a0: Optional[QCloseEvent]) -> None:
344355
if not self._notifier.is_simulation_running:

src/ert/scheduler/event.py

+2
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
@dataclass
88
class StartedEvent:
99
iens: int
10+
exec_hosts: str = ""
1011

1112

1213
@dataclass
1314
class FinishedEvent:
1415
iens: int
1516
returncode: int
17+
exec_hosts: str = ""
1618

1719

1820
Event = Union[StartedEvent, FinishedEvent]

src/ert/scheduler/job.py

+2
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def __init__(self, scheduler: Scheduler, real: Realization) -> None:
6262
self.real = real
6363
self.state = JobState.WAITING
6464
self.started = asyncio.Event()
65+
self.exec_hosts: str = "-"
6566
self.returncode: asyncio.Future[int] = asyncio.Future()
6667
self._aborted = False
6768
self._scheduler: Scheduler = scheduler
@@ -263,6 +264,7 @@ async def _send(self, state: JobState) -> None:
263264
"event_type": _queue_jobstate_event_type[state],
264265
"queue_event_type": state,
265266
"real": str(self.iens),
267+
"exec_hosts": self.exec_hosts,
266268
}
267269
self.state = state
268270
if state == JobState.FAILED:

src/ert/scheduler/lsf_driver.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@ async def submit(
373373
iens=iens,
374374
job_state=QueuedJob(job_state="PEND"),
375375
submitted_timestamp=time.time(),
376+
exec_hosts="-",
376377
)
377378
self._iens2jobid[iens] = job_id
378379

@@ -500,11 +501,15 @@ async def _process_job_update(self, job_id: str, new_state: AnyJob) -> None:
500501
event: Optional[Event] = None
501502
if isinstance(new_state, RunningJob):
502503
logger.debug(f"Realization {iens} is running")
503-
event = StartedEvent(iens=iens)
504+
event = StartedEvent(iens=iens, exec_hosts=self._jobs[job_id].exec_hosts)
504505
elif isinstance(new_state, FinishedJobFailure):
505506
logger.info(f"Realization {iens} (LSF-id: {self._iens2jobid[iens]}) failed")
506507
exit_code = await self._get_exit_code(job_id)
507-
event = FinishedEvent(iens=iens, returncode=exit_code)
508+
event = FinishedEvent(
509+
iens=iens,
510+
returncode=exit_code,
511+
exec_hosts=self._jobs[job_id].exec_hosts,
512+
)
508513
elif isinstance(new_state, FinishedJobSuccess):
509514
logger.info(
510515
f"Realization {iens} (LSF-id: {self._iens2jobid[iens]}) succeeded"

src/ert/scheduler/scheduler.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from ert.constant_filenames import CERT_FILE
2929

3030
from .driver import Driver
31-
from .event import FinishedEvent
31+
from .event import FinishedEvent, StartedEvent
3232
from .job import Job, JobState
3333

3434
if TYPE_CHECKING:
@@ -308,6 +308,9 @@ async def _process_event_queue(self) -> None:
308308
# Any event implies the job has at least started
309309
job.started.set()
310310

311+
if isinstance(event, (StartedEvent, FinishedEvent)) and event.exec_hosts:
312+
self._jobs[event.iens].exec_hosts = event.exec_hosts
313+
311314
if (
312315
isinstance(event, FinishedEvent)
313316
and not self._cancelled

0 commit comments

Comments
 (0)